Initial commit
This commit is contained in:
40
src/html_cleaner.py
Normal file
40
src/html_cleaner.py
Normal file
@@ -0,0 +1,40 @@
|
||||
import re
|
||||
from bs4 import BeautifulSoup, Comment
|
||||
|
||||
def clean_html(html_content: str, config: dict) -> str:
    """Clean HTML content by removing unwanted tags and text patterns.

    The settings are read from ``config['html_cleaner']['value']``:

    * ``remove_tags``: tag names whose elements are removed from the
      document together with everything inside them.
    * ``remove_patterns``: regular expressions that are combined into one
      alternation and deleted, case-insensitively, from the remaining text
      nodes.

    Text inside ``<script>`` and ``<style>`` elements is left untouched, as
    are comments, doctypes, CDATA sections, declarations and processing
    instructions.

    Args:
        html_content: The raw HTML content as a string.
        config: The configuration object. Missing or empty sections are
            treated as "nothing to remove".

    Returns:
        The cleaned HTML content as a string, or ``""`` when
        ``html_content`` is empty.

    Raises:
        re.error: If the configured patterns do not form a valid regular
            expression.
    """
    if not html_content:
        return ""

    # Local import: the module header only brings in BeautifulSoup and
    # Comment, and this keeps the function self-contained.
    from bs4.element import PreformattedString

    # `or {}` / `or []` also cover keys that exist but are set to None.
    cleaner_config = (config.get('html_cleaner') or {}).get('value') or {}
    remove_tags = cleaner_config.get('remove_tags') or []
    remove_patterns = cleaner_config.get('remove_patterns') or []

    soup = BeautifulSoup(html_content, 'html.parser')

    # 1. Remove unwanted tags (including their contents).
    for tag_name in remove_tags:
        for tag in soup.find_all(tag_name):
            tag.decompose()

    # 2. Remove unwanted text patterns from the remaining content.
    if remove_patterns:
        # Compile once instead of handing the pattern string to re.sub for
        # every text node.
        pattern = re.compile("|".join(remove_patterns), flags=re.IGNORECASE)

        for element in soup.find_all(string=True):
            # find_all(string=True) also yields Comment, Doctype, CData,
            # Declaration and ProcessingInstruction nodes. Replacing one of
            # those with a plain string would drop its markup wrapper
            # (e.g. "<!DOCTYPE html>" would turn into the text "html"),
            # so every PreformattedString is skipped, not just Comment.
            if isinstance(element, PreformattedString):
                continue
            if element.parent.name in ('style', 'script'):
                continue

            original_text = str(element)
            cleaned_text = pattern.sub('', original_text)
            # Only touch the tree when the text actually changed.
            if cleaned_text != original_text:
                element.replace_with(cleaned_text)

    return str(soup)
|
||||
Reference in New Issue
Block a user