Initial commit

This commit is contained in:
2025-11-13 13:01:48 +01:00
commit c347078f27
29 changed files with 2185 additions and 0 deletions

40
src/html_cleaner.py Normal file
View File

@@ -0,0 +1,40 @@
import re

from bs4 import BeautifulSoup, Comment
from bs4.element import PreformattedString
def clean_html(html_content: str, config: dict) -> str:
    """Strip configured tags and text patterns from an HTML document.

    The settings are read from ``config['html_cleaner']['value']``:

    * ``remove_tags``: tag names; every matching element is removed
      together with its content.
    * ``remove_patterns``: regular expressions; every match
      (case-insensitive) is deleted from the remaining text nodes.

    Text inside ``<script>`` and ``<style>`` elements is left untouched, and
    so are special nodes (comments, doctype, CDATA, declarations, processing
    instructions), so their markup survives the cleaning pass.

    Args:
        html_content: The raw HTML content as a string.
        config: The configuration object (a nested dict).

    Returns:
        The cleaned HTML content as a string; ``""`` for empty input.

    Raises:
        re.error: If the joined ``remove_patterns`` do not form a valid
            regular expression.
    """
    # Nothing to parse: answer before touching the configuration at all.
    if not html_content:
        return ""

    # ``or {}`` / ``or []`` also cover keys that exist but hold ``None``
    # (e.g. an empty section in the config file), not only missing keys.
    cleaner_config = ((config or {}).get('html_cleaner') or {}).get('value') or {}
    # Skip blank entries: an empty name does not identify a tag (and may
    # match every tag, depending on the bs4 version).
    tag_names = [name for name in (cleaner_config.get('remove_tags') or []) if name]
    # Skip blank entries: an empty alternative matches the empty string at
    # every position and would keep later alternatives from ever matching.
    patterns = [pat for pat in (cleaner_config.get('remove_patterns') or []) if pat]

    soup = BeautifulSoup(html_content, 'html.parser')

    # 1. Remove unwanted tags, including everything nested inside them.
    for tag_name in tag_names:
        for tag in soup.find_all(tag_name):
            tag.decompose()

    # 2. Remove unwanted text patterns from the remaining content.
    if patterns:
        # Compile once instead of re-resolving the pattern per text node.
        combined_pattern = re.compile("|".join(patterns), flags=re.IGNORECASE)
        for node in soup.find_all(string=True):
            # find_all(string=True) also yields comments, doctype, CDATA etc.
            # Replacing those with a plain string would drop their markup
            # (``<!DOCTYPE html>`` would become the text ``html``).
            if isinstance(node, PreformattedString):
                continue
            if node.parent.name in ('style', 'script'):
                continue
            original_text = str(node)
            cleaned_text = combined_pattern.sub('', original_text)
            # Only touch the tree when a pattern actually matched.
            if cleaned_text != original_text:
                node.replace_with(cleaned_text)

    return str(soup)