40 lines
1.3 KiB
Python
40 lines
1.3 KiB
Python
import re
|
|
from bs4 import BeautifulSoup, Comment
|
|
|
|
def clean_html(html_content: str, config: dict) -> str:
    """
    Clean HTML content by removing unwanted tags and text patterns.

    Args:
        html_content: The raw HTML content as a string.
        config: The configuration object. Settings are read from
            ``config['html_cleaner']['value']`` using two optional keys:
            ``remove_tags`` (tag names whose elements are removed together
            with their content) and ``remove_patterns`` (regular
            expressions whose matches are deleted from text nodes,
            case-insensitively). Missing or ``None`` entries mean
            "nothing to remove".

    Returns:
        The cleaned HTML content as a string, or an empty string when
        ``html_content`` is empty.

    Raises:
        re.error: If the combined ``remove_patterns`` do not form a valid
            regular expression.
    """
    # Nothing to parse: return before touching the configuration.
    if not html_content:
        return ""

    # `or {}` / `or []` also covers keys that are present but set to None
    # (e.g. an empty entry in a config file).
    cleaner_config = (config.get('html_cleaner') or {}).get('value') or {}
    remove_tags = cleaner_config.get('remove_tags') or []
    remove_patterns = cleaner_config.get('remove_patterns') or []

    soup = BeautifulSoup(html_content, 'html.parser')

    # 1. Remove unwanted tags (including everything nested inside them).
    for tag_name in remove_tags:
        for tag in soup.find_all(tag_name):
            tag.decompose()

    # 2. Remove unwanted text patterns from the remaining content.
    if remove_patterns:
        # All patterns are applied in a single pass as one alternation;
        # compile once here instead of once per text node.
        combined_pattern = re.compile(
            "|".join(remove_patterns), flags=re.IGNORECASE
        )

        for element in soup.find_all(string=True):
            # Leave CSS/JS source and HTML comments untouched.
            if isinstance(element, Comment):
                continue
            if element.parent.name in ('style', 'script'):
                continue

            original_text = str(element)
            new_text = combined_pattern.sub('', original_text)
            if new_text == original_text:
                # No match: keep the node exactly as it is.
                continue

            # Rebuild the node with its own class so that special string
            # nodes (doctype, CDATA, ...) keep their markup wrapper instead
            # of being re-inserted as ordinary text.
            element.replace_with(type(element)(new_text))

    return str(soup)