Initial commit
src/__pycache__/crawler_core.cpython-312.pyc  (new binary file, not shown)
src/__pycache__/db_logger.cpython-312.pyc  (new binary file, not shown)
src/__pycache__/duplicate_detector.cpython-312.pyc  (new binary file, not shown)
src/__pycache__/hash_manager.cpython-312.pyc  (new binary file, not shown)
src/__pycache__/html_cleaner.cpython-312.pyc  (new binary file, not shown)
src/__pycache__/logger_setup.cpython-312.pyc  (new binary file, not shown)
src/__pycache__/state_manager.cpython-312.pyc  (new binary file, not shown)
src/__pycache__/stats_manager.cpython-312.pyc  (new binary file, not shown)
src/__pycache__/storage.cpython-312.pyc  (new binary file, not shown)
src/__pycache__/url_utils.cpython-312.pyc  (new binary file, not shown)
src/crawler_core.py  (new file, 277 lines)
@@ -0,0 +1,277 @@
import trafilatura, asyncio
from asyncio import PriorityQueue
import aiohttp
import logging
from urllib.parse import urljoin, urlparse
from urllib import robotparser
from bs4 import BeautifulSoup

from .url_utils import normalize_url
from .storage import save
from .html_cleaner import clean_html
from .duplicate_detector import DuplicateDetector
from .hash_manager import HashManager
from .state_manager import StateManager
from .stats_manager import CrawlStats

try:
    from simhash import Simhash
except ImportError:
    Simhash = None


class RobotsTxtChecker:
    def __init__(self, user_agent):
        self.user_agent = user_agent
        self.parsers = {}  # Cache: netloc -> robotparser.RobotFileParser

    async def _get_parser(self, session, url):
        parsed_url = urlparse(url)
        netloc = parsed_url.netloc

        if netloc not in self.parsers:
            robots_url = urljoin(url, "/robots.txt")
            rp = robotparser.RobotFileParser()
            rp.set_url(robots_url)

            logging.info(f"Lade robots.txt von: {robots_url}")

            try:
                # Fetch robots.txt asynchronously with aiohttp
                async with session.get(robots_url, timeout=aiohttp.ClientTimeout(total=10)) as response:
                    if response.status == 200:
                        text = await response.text()
                        rp.parse(text.splitlines())
                    elif response.status == 404:
                        # No robots.txt found, everything is allowed
                        pass
                    else:
                        logging.warning(f"Fehler beim Abruf von {robots_url}: Status {response.status}")
            except aiohttp.ClientError as e:
                logging.error(f"Fehler beim Abruf von {robots_url}: {e}")
                # On error or timeout the parser is cached without rules (implicitly everything is allowed)

            self.parsers[netloc] = rp

        return self.parsers[netloc]

    async def is_allowed(self, session, url):
        rp = await self._get_parser(session, url)
        return rp.can_fetch(self.user_agent, url)


# The checker is instantiated lazily in crawl_source
robots_checker = None


def extract_text(html):
    text = trafilatura.extract(html)

    # Fallback for statutes / technical documentation
    if not text or len(text) < 400:
        soup = BeautifulSoup(html, "lxml")
        text = soup.get_text(" ", strip=True)

    if not text or len(text) < 100:
        return None

    return text


def extract_title(html):
    """Extracts the title from the HTML content."""
    soup = BeautifulSoup(html, "lxml")
    title_tag = soup.find("title")
    if title_tag:
        return title_tag.string.strip()
    return "Kein Titel gefunden"


def allowed_link(base_url_normalized, base_path, next_url, crawl_mode, blocked_patterns, path_strict):
    # Normalize the URL for deduplication and fragment/tracking removal
    normalized_url = normalize_url(next_url)

    if any(pat in normalized_url for pat in blocked_patterns):
        return False

    if crawl_mode == "single_page":
        return False

    # Domain limit: base_url_normalized is the normalized start URL of the source
    if urlparse(normalized_url).netloc != urlparse(base_url_normalized).netloc:
        return False

    # Path restriction
    if crawl_mode == "path_limited":
        # Strict path restriction: the path of the discovered URL must start with the base path.
        next_path = urlparse(normalized_url).path
        if not next_path.startswith(base_path):
            return False

    # Legacy path_strict (if still in use, although path_limited should replace it)
    if path_strict and not normalized_url.startswith(base_url_normalized):
        return False

    return normalized_url  # Returns the normalized URL if the link is allowed


async def crawl_source(session, source, config):
    stats = CrawlStats()
    global robots_checker
    if not robots_checker:
        user_agent = config.get("USER_AGENT", {}).get("value")
        robots_checker = RobotsTxtChecker(user_agent)

    start_urls = source["start_urls"]
    crawl_mode = source.get("crawl_mode", "domain_wide")
    blocked_patterns = source.get("blocked_patterns", [])
    path_strict = source.get("path_strict", False)

    # Load configuration values
    page_limit = config.get("PAGE_LIMIT", {}).get("value", 200)
    crawl_delay = config.get("CRAWL_DELAY", {}).get("value", 1)
    min_content_length = config.get("MIN_CONTENT_LENGTH", {}).get("value", 500)
    priority_patterns = config.get("priority_patterns", {}).get("value", [])
    duplicate_detection_config = config.get("duplicate_detection", {}).get("value", {})
    incremental_crawling_config = config.get("incremental_crawling", {}).get("value", {})
    state_management_config = config.get("state_management", {}).get("value", {})
    output_dir = config.get("OUTPUT_DIR", {}).get("value")

    # Initialize the DuplicateDetector if enabled
    duplicate_detector = None
    if duplicate_detection_config.get("enable"):
        duplicate_detector = DuplicateDetector(
            similarity_threshold=duplicate_detection_config.get("similarity_threshold", 95)
        )

    # Initialize the HashManager if enabled
    hash_manager = None
    if incremental_crawling_config.get("enable"):
        db_file = incremental_crawling_config.get("db_file", "crawled_hashes.db")
        hash_manager = HashManager(db_file)

    # --- State management ---
    state_manager = None
    queue = PriorityQueue()
    visited = set()
    normalized_start_urls = [normalize_url(u) for u in start_urls]
    base_url_normalized = normalized_start_urls[0]

    if state_management_config.get("enable"):
        state_file_template = state_management_config.get("state_file", "crawler_state.json")
        source_netloc = urlparse(base_url_normalized).netloc
        state_file = state_file_template.replace(".json", f"_{source_netloc}.json")
        state_manager = StateManager(state_file)

        loaded_state = state_manager.load_state()
        if loaded_state:
            logging.info(f"Lade Zustand aus {state_file}")
            queue, visited = loaded_state
        else:
            for u in normalized_start_urls:
                await queue.put((0, u))
    else:
        for u in normalized_start_urls:
            await queue.put((0, u))
    # -------------------------

    # Extract the base path for path_limited mode
    base_path = urlparse(base_url_normalized).path

    logging.info(f"Starte Quelle: {start_urls[0]} (Modus: {crawl_mode})")

    try:
        while not queue.empty() and len(visited) < page_limit:
            priority, url = await queue.get()  # Unpack priority and URL

            if url in visited:
                continue
            visited.add(url)
            stats.total_visited += 1

            logging.info(f"Crawle: {url}")

            # Asynchronous robots.txt check
            if not await robots_checker.is_allowed(session, url):
                logging.warning(f"robots.txt verbietet: {url}")
                await asyncio.sleep(crawl_delay)  # Respect the delay even when skipping
                continue

            try:
                # Asynchronous fetch with aiohttp
                async with session.get(url, timeout=aiohttp.ClientTimeout(total=20)) as response:
                    response.raise_for_status()  # Raises aiohttp.ClientResponseError for 4xx/5xx
                    html = await response.text()

                    cleaned_html = clean_html(html, config)
                    text = extract_text(cleaned_html)

                    if text and len(text) >= min_content_length:
                        # Simhash calculation, if needed for incremental crawling
                        new_simhash_value = None
                        if hash_manager and Simhash:
                            new_simhash_value = Simhash(text).value

                        # Incremental crawling: check whether the content is unchanged
                        if hash_manager and new_simhash_value is not None:
                            if hash_manager.is_unchanged(url, new_simhash_value):
                                logging.info(f"Inhalt unverändert, übersprungen: {url}")
                                continue

                        # Duplicate detection
                        if duplicate_detector and duplicate_detector.is_duplicate(text):
                            logging.info(f"Duplikat übersprungen: {url}")
                        else:
                            # Save the page and update the hashes
                            title = extract_title(cleaned_html)
                            save(url, title, text, output_dir)
                            stats.total_saved += 1
                            stats.total_data_volume += len(text.encode('utf-8'))
                            if duplicate_detector:
                                duplicate_detector.add_hash(text)
                            if hash_manager and new_simhash_value is not None:
                                hash_manager.update_hash(url, new_simhash_value)
                    else:
                        logging.info(f"Inhalt verworfen (Länge: {len(text) if text else 0} < {min_content_length}): {url}")

                    if crawl_mode != "single_page":
                        soup = BeautifulSoup(html, "lxml")
                        for link in soup.find_all("a", href=True):
                            next_url = urljoin(url, link["href"])

                            # allowed_link returns False or the normalized URL
                            normalized_next_url = allowed_link(base_url_normalized, base_path, next_url, crawl_mode, blocked_patterns, path_strict)

                            if normalized_next_url and normalized_next_url not in visited:
                                # Determine the priority based on the configured patterns
                                prio = 0 if any(pat in normalized_next_url for pat in priority_patterns) else 1
                                await queue.put((prio, normalized_next_url))

            except aiohttp.ClientError as e:
                logging.error(f"aiohttp Fehler beim Abruf von {url}: {e}")
                stats.errors += 1
            except Exception as e:
                logging.error(f"Allgemeiner Fehler beim Crawlen von {url}: {e}")
                stats.errors += 1

            await asyncio.sleep(crawl_delay)

    except (KeyboardInterrupt, Exception):
        if state_manager:
            logging.warning("Prozess unterbrochen. Speichere aktuellen Zustand...")
            state_manager.save_state(queue, visited)
            logging.info("Zustand gespeichert.")
        raise
    else:
        # This block executes when the try block completes without an exception.
        logging.info(f"Quelle {base_url_normalized} erfolgreich gecrawlt.")
        if state_manager:
            logging.info("Lösche Zustandsdatei.")
            state_manager.delete_state()

    # Close the HashManager's database connection
    if hash_manager:
        hash_manager.close()

    stats.finish()
    return stats
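A minimal driver sketch for crawl_source (not part of the commit), assuming the package layout above and the nested {"value": ...} config shape the function reads; the URLs, user agent, and output directory are placeholder values:

import asyncio
import aiohttp

from src.crawler_core import crawl_source

# Config keys mirror the look-ups in crawl_source; the concrete values are placeholders.
config = {
    "USER_AGENT": {"value": "ExampleCrawler/0.1"},
    "PAGE_LIMIT": {"value": 50},
    "CRAWL_DELAY": {"value": 1},
    "MIN_CONTENT_LENGTH": {"value": 500},
    "OUTPUT_DIR": {"value": "output"},
}

# A hypothetical source entry; start_urls is the only required key.
source = {
    "start_urls": ["https://example.org/docs/"],
    "crawl_mode": "path_limited",
    "blocked_patterns": ["/login"],
}

async def main():
    headers = {"User-Agent": config["USER_AGENT"]["value"]}
    async with aiohttp.ClientSession(headers=headers) as session:
        stats = await crawl_source(session, source, config)
        print(f"visited={stats.total_visited} saved={stats.total_saved} errors={stats.errors}")

if __name__ == "__main__":
    asyncio.run(main())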
src/db_logger.py  (new file, 169 lines)
@@ -0,0 +1,169 @@
import logging
import sqlite3
import warnings

# Guard the mysql.connector import so the module is usable without it
try:
    import mysql.connector
    MYSQL_AVAILABLE = True
except ImportError:
    MYSQL_AVAILABLE = False
    warnings.warn("Das 'mysql-connector-python' Paket ist nicht installiert. Der MySQLHandler ist nicht verfügbar.")


class SQLiteHandler(logging.Handler):
    """A logging.Handler that writes log records to a SQLite database."""

    def __init__(self, db_file):
        super().__init__()
        self.db_file = db_file
        self._conn = None
        self._cursor = None
        self._db_initialized = False

    def _init_db(self):
        """Initializes the database connection and creates the table if it does not exist."""
        try:
            self._conn = sqlite3.connect(self.db_file)
            self._cursor = self._conn.cursor()
            self._cursor.execute('''
                CREATE TABLE IF NOT EXISTS logs (
                    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
                    level VARCHAR(10),
                    message TEXT
                )
            ''')
            self._conn.commit()
            self._db_initialized = True
        except sqlite3.Error as e:
            warnings.warn(f"Fehler beim Initialisieren der SQLite-Datenbank: {e}")
            self._conn = None

    def emit(self, record):
        """Writes a log record to the database."""
        if not self._db_initialized:
            self._init_db()

        if not self._conn:
            return

        try:
            log_entry = (record.levelname, self.format(record))
            self._cursor.execute('INSERT INTO logs (level, message) VALUES (?, ?)', log_entry)
            self._conn.commit()
        except sqlite3.Error as e:
            warnings.warn(f"Fehler beim Schreiben des Logs in die SQLite-Datenbank: {e}")

    def close(self):
        """Closes the database connection."""
        if self._conn:
            self._conn.close()
        super().close()


class MySQLHandler(logging.Handler):
    """A logging.Handler that writes log records to a MySQL database."""

    def __init__(self, host, user, password, database):
        super().__init__()
        if not MYSQL_AVAILABLE:
            self._conn = None
            return

        self.db_config = {
            'host': host,
            'user': user,
            'password': password,
            'database': database
        }
        self._conn = None
        self._cursor = None
        self._db_initialized = False

    def _init_db(self):
        """Initializes the database connection and creates the table if it does not exist."""
        try:
            self._conn = mysql.connector.connect(**self.db_config)
            self._cursor = self._conn.cursor()
            self._cursor.execute('''
                CREATE TABLE IF NOT EXISTS logs (
                    id INT AUTO_INCREMENT PRIMARY KEY,
                    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
                    level VARCHAR(10),
                    message TEXT
                )
            ''')
            self._conn.commit()
            self._db_initialized = True
        except mysql.connector.Error as e:
            warnings.warn(f"Fehler beim Initialisieren der MySQL-Datenbank: {e}")
            self._conn = None

    def emit(self, record):
        """Writes a log record to the database."""
        # Without the MySQL connector there is nothing to do; otherwise connect lazily.
        if not MYSQL_AVAILABLE:
            return
        if not self._db_initialized:
            self._init_db()

        if not self._conn:
            return

        try:
            log_entry = (record.levelname, self.format(record))
            self._cursor.execute('INSERT INTO logs (level, message) VALUES (%s, %s)', log_entry)
            self._conn.commit()
        except mysql.connector.Error as e:
            warnings.warn(f"Fehler beim Schreiben des Logs in die MySQL-Datenbank: {e}")

    def close(self):
        """Closes the database connection."""
        if self._conn:
            self._conn.close()
        super().close()


def log_stats_to_mysql(stats, source_url, db_config):
    """Writes crawl statistics to a separate MySQL table."""
    if not MYSQL_AVAILABLE:
        warnings.warn("MySQL-Connector nicht verfügbar. Statistiken können nicht geloggt werden.")
        return

    conn = None
    try:
        conn = mysql.connector.connect(**db_config)
        cursor = conn.cursor()

        # Create the table if it does not exist
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS crawl_stats (
                id INT AUTO_INCREMENT PRIMARY KEY,
                source_url VARCHAR(255),
                duration_seconds FLOAT,
                total_visited INT,
                total_saved INT,
                total_data_volume_bytes BIGINT,
                errors INT,
                timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
            )
        ''')

        # Insert the statistics
        query = (
            "INSERT INTO crawl_stats (source_url, duration_seconds, total_visited, "
            "total_saved, total_data_volume_bytes, errors) "
            "VALUES (%s, %s, %s, %s, %s, %s)"
        )
        values = (
            source_url,
            stats.duration,
            stats.total_visited,
            stats.total_saved,
            stats.total_data_volume,
            stats.errors
        )
        cursor.execute(query, values)
        conn.commit()
        logging.info(f"Statistiken für {source_url} erfolgreich in MySQL geloggt.")

    except mysql.connector.Error as e:
        warnings.warn(f"Fehler beim Schreiben der Statistiken in die MySQL-Datenbank: {e}")
    finally:
        if conn and conn.is_connected():
            cursor.close()
            conn.close()
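A short sketch of attaching the SQLiteHandler to the root logger; the database file name is a placeholder:

import logging

from src.db_logger import SQLiteHandler

# Route log records into SQLite; "crawler_logs.db" is an arbitrary file name.
handler = SQLiteHandler(db_file="crawler_logs.db")
handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))

root = logging.getLogger()
root.setLevel(logging.INFO)
root.addHandler(handler)

root.info("hello")  # stored as a row in the 'logs' table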
src/duplicate_detector.py  (new file, 62 lines)
@@ -0,0 +1,62 @@
import logging

try:
    from simhash import Simhash
except ImportError:
    Simhash = None
    logging.warning("Die 'simhash'-Bibliothek wurde nicht gefunden. Die Duplikat-Erkennung ist deaktiviert.")


class DuplicateDetector:
    """
    Detects and manages text duplicates using SimHash.
    """
    def __init__(self, similarity_threshold=95):
        """
        Initializes the DuplicateDetector.

        Args:
            similarity_threshold (int): The percentage similarity threshold above
                which two texts are considered duplicates.
        """
        if Simhash is None:
            self.enabled = False
            return

        self.enabled = True
        self.hashes = set()
        self.similarity_threshold = similarity_threshold

    def is_duplicate(self, text):
        """
        Checks whether a text is a duplicate of a previously seen text.

        Args:
            text (str): The text to check.

        Returns:
            bool: True if the text is a duplicate, otherwise False.
        """
        if not self.enabled:
            return False

        new_hash = Simhash(text)

        for existing_hash in self.hashes:
            similarity = (1 - (new_hash.distance(existing_hash) / 64.0)) * 100
            if similarity >= self.similarity_threshold:
                logging.info(f"Duplikat gefunden mit einer Ähnlichkeit von {similarity:.2f}%.")
                return True
        return False

    def add_hash(self, text):
        """
        Adds the SimHash of the text to the set of known hashes.

        Args:
            text (str): The text whose hash should be added.
        """
        if not self.enabled:
            return

        new_hash = Simhash(text)
        self.hashes.add(new_hash)
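A small usage sketch; it assumes the optional 'simhash' package is installed (without it the detector silently disables itself):

from src.duplicate_detector import DuplicateDetector

detector = DuplicateDetector(similarity_threshold=95)

text_a = "Section 1: terms and definitions. " * 40
text_b = "Section 1: terms and definitions. " * 39 + "Section 2: scope. "

if not detector.is_duplicate(text_a):
    detector.add_hash(text_a)

# For near-identical texts the Hamming distance of the 64-bit SimHashes is small,
# so a duplicate is expected to be reported at a 95% threshold.
print(detector.is_duplicate(text_b))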
src/hash_manager.py  (new file, 88 lines)
@@ -0,0 +1,88 @@
import sqlite3
import logging
from datetime import datetime

logger = logging.getLogger(__name__)


class HashManager:
    """Manages storing and loading hashes for incremental crawling via SQLite."""

    def __init__(self, db_path):
        """
        Initializes the HashManager and the database connection.

        Args:
            db_path (str): The path to the SQLite database file.
        """
        self.db_path = db_path
        self.conn = None
        try:
            self.conn = sqlite3.connect(self.db_path)
            self._init_db()
        except sqlite3.Error as e:
            logger.error(f"Datenbankfehler beim Verbinden mit {self.db_path}: {e}")
            raise

    def _init_db(self):
        """Creates the 'hashes' table if it does not exist."""
        try:
            with self.conn:
                self.conn.execute("""
                    CREATE TABLE IF NOT EXISTS hashes (
                        url TEXT PRIMARY KEY,
                        simhash TEXT NOT NULL,
                        timestamp TEXT NOT NULL
                    )
                """)
        except sqlite3.Error as e:
            logger.error(f"Fehler beim Initialisieren der Datenbanktabelle: {e}")

    def is_unchanged(self, url, new_simhash):
        """
        Checks whether the content of a URL has changed by comparing against the SimHash stored in the DB.

        Args:
            url (str): The URL to check.
            new_simhash (int): The new SimHash of the content.

        Returns:
            bool: True if the URL is known and its SimHash is unchanged, otherwise False.
        """
        if not self.conn:
            return False
        try:
            cursor = self.conn.cursor()
            cursor.execute("SELECT simhash FROM hashes WHERE url = ?", (url,))
            result = cursor.fetchone()
            if result and result[0] == str(new_simhash):
                return True
        except sqlite3.Error as e:
            logger.error(f"Fehler beim Prüfen des Hashes für URL {url}: {e}")
        return False

    def update_hash(self, url, simhash):
        """
        Updates the hash and timestamp for a given URL in the database.

        Args:
            url (str): The URL whose hash is updated.
            simhash (int): The new SimHash of the content.
        """
        if not self.conn:
            return
        timestamp = datetime.utcnow().isoformat()
        try:
            with self.conn:
                self.conn.execute("""
                    INSERT OR REPLACE INTO hashes (url, simhash, timestamp)
                    VALUES (?, ?, ?)
                """, (url, str(simhash), timestamp))
            logger.debug(f"Hash für URL {url} in der Datenbank aktualisiert.")
        except sqlite3.Error as e:
            logger.error(f"Fehler beim Aktualisieren des Hashes für URL {url}: {e}")

    def close(self):
        """Closes the database connection."""
        if self.conn:
            self.conn.close()
            logger.info("Datenbankverbindung für Hashes geschlossen.")
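A sketch of the incremental-crawl check; the database file, URL, and text are placeholders, and the optional 'simhash' package is assumed to be installed:

from simhash import Simhash

from src.hash_manager import HashManager

manager = HashManager("crawled_hashes.db")

url = "https://example.org/docs/page1"
value = Simhash("some extracted page text").value

if manager.is_unchanged(url, value):
    print("content unchanged, skip")   # same hash already stored for this URL
else:
    manager.update_hash(url, value)    # store/refresh the hash for the next run

manager.close()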
src/html_cleaner.py  (new file, 40 lines)
@@ -0,0 +1,40 @@
import re
from bs4 import BeautifulSoup, Comment


def clean_html(html_content: str, config: dict) -> str:
    """
    Cleans the HTML content by removing unwanted tags and text patterns.

    Args:
        html_content: The raw HTML content as a string.
        config: The configuration object.

    Returns:
        The cleaned HTML content as a string.
    """
    cleaner_config = config.get('html_cleaner', {}).get('value', {})
    remove_tags = cleaner_config.get('remove_tags', [])
    remove_patterns = cleaner_config.get('remove_patterns', [])

    if not html_content:
        return ""

    soup = BeautifulSoup(html_content, 'html.parser')

    # 1. Remove unwanted tags
    for tag_name in remove_tags:
        for tag in soup.find_all(tag_name):
            tag.decompose()

    # 2. Remove unwanted text patterns from the remaining content
    if remove_patterns:
        combined_pattern = "|".join(remove_patterns)

        for element in soup.find_all(string=True):
            if element.parent.name in ['style', 'script'] or isinstance(element, Comment):
                continue

            new_string = re.sub(combined_pattern, '', str(element), flags=re.IGNORECASE)
            element.replace_with(new_string)

    return str(soup)
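A quick sketch of calling clean_html; the tag and pattern lists are illustrative, only the nested {'value': ...} config shape is taken from the code above:

from src.html_cleaner import clean_html

config = {
    "html_cleaner": {
        "value": {
            "remove_tags": ["script", "style", "nav", "footer"],
            "remove_patterns": [r"Cookie settings", r"Skip to navigation"],
        }
    }
}

html = "<html><body><nav>Menu</nav><p>Content. Cookie settings</p><script>x()</script></body></html>"
print(clean_html(html, config))  # nav/script are dropped, matching text patterns are stripped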
src/logger_setup.py  (new file, 73 lines)
@@ -0,0 +1,73 @@
import logging
import sys
from logging.handlers import RotatingFileHandler
from src.db_logger import SQLiteHandler, MySQLHandler


def setup_logging(config):
    """
    Configures the logging system based on the configuration file.
    """
    log_config = config.get('log', {}).get('value', {})

    # Disable logging if no handlers are configured
    if not log_config or not log_config.get('handlers'):
        logging.disable(logging.CRITICAL)
        return

    # Configure the root logger
    logger = logging.getLogger()
    if logger.hasHandlers():
        logger.handlers.clear()
    logger.setLevel(logging.INFO)

    # Create the formatter
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    handlers_to_add = log_config.get('handlers', [])

    # Create the file handler dynamically
    if 'file' in handlers_to_add:
        file_conf = log_config.get('file', {})
        log_file = file_conf.get('log_file', 'crawler.log')
        max_size_mb = file_conf.get('max_size_mb', 10)
        keep_last = file_conf.get('keep_last', 5)
        max_bytes = max_size_mb * 1024 * 1024

        file_handler = RotatingFileHandler(
            log_file, maxBytes=max_bytes, backupCount=keep_last
        )
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Create the console handler dynamically
    if 'console' in handlers_to_add:
        stream_handler = logging.StreamHandler(sys.stdout)
        stream_handler.setFormatter(formatter)
        logger.addHandler(stream_handler)

    # Create the SQLite handler dynamically
    if 'sqlite' in handlers_to_add:
        sqlite_conf = log_config.get('sqlite', {})
        db_file = sqlite_conf.get('db_file')
        if db_file:
            sqlite_handler = SQLiteHandler(db_file=db_file)
            sqlite_handler.setFormatter(formatter)
            logger.addHandler(sqlite_handler)

    # Create the MySQL handler dynamically
    if 'mysql' in handlers_to_add:
        mysql_conf = log_config.get('mysql', {})
        # Only initialize if all configuration keys are present
        if all(k in mysql_conf for k in ['host', 'user', 'password', 'database']):
            mysql_handler = MySQLHandler(
                host=mysql_conf['host'],
                user=mysql_conf['user'],
                password=mysql_conf['password'],
                database=mysql_conf['database']
            )
            mysql_handler.setFormatter(formatter)
            logger.addHandler(mysql_handler)

    if logger.hasHandlers():
        logging.info("Logging-System erfolgreich initialisiert.")
    else:
        logging.disable(logging.CRITICAL)
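A config fragment that setup_logging can consume; the handler names and the nested {'value': ...} wrapper come from the look-ups above, the concrete values are placeholders:

from src.logger_setup import setup_logging

config = {
    "log": {
        "value": {
            "handlers": ["console", "file"],
            "file": {"log_file": "crawler.log", "max_size_mb": 10, "keep_last": 5},
        }
    }
}

setup_logging(config)  # attaches a console handler and a rotating file handler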
src/state_manager.py  (new file, 59 lines)
@@ -0,0 +1,59 @@
import json
import os
from asyncio import PriorityQueue
from typing import Set, Tuple, Optional


class StateManager:
    """Manages the state of the crawler for pausing and resuming."""

    def __init__(self, state_file: str):
        """
        Initializes the StateManager.

        Args:
            state_file: The path to the file where the state will be saved.
        """
        self.state_file = state_file

    def save_state(self, queue: PriorityQueue, visited: Set[str]):
        """
        Saves the current state of the queue and visited set to a file.

        Args:
            queue: The PriorityQueue to save.
            visited: The set of visited URLs to save.
        """
        state = {
            # asyncio.PriorityQueue (as used by crawler_core) keeps its items in ._queue
            "queue": list(queue._queue),
            "visited": list(visited)
        }
        with open(self.state_file, 'w') as f:
            json.dump(state, f, indent=4)

    def load_state(self) -> Optional[Tuple[PriorityQueue, Set[str]]]:
        """
        Loads the state from the state file if it exists.

        Returns:
            A tuple containing the PriorityQueue and the set of visited URLs,
            or None if no state file is found.
        """
        if not os.path.exists(self.state_file):
            return None

        with open(self.state_file, 'r') as f:
            state = json.load(f)

        queue = PriorityQueue()
        for item in state["queue"]:
            # The items are stored as lists in JSON; convert them back to tuples
            queue.put_nowait(tuple(item))

        visited = set(state["visited"])

        return queue, visited

    def delete_state(self):
        """Deletes the state file if it exists."""
        if os.path.exists(self.state_file):
            os.remove(self.state_file)
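A round-trip sketch (the file name is a placeholder) mirroring how crawl_source saves and restores its queue and visited set:

import asyncio

from src.state_manager import StateManager

async def demo():
    queue = asyncio.PriorityQueue()
    await queue.put((0, "https://example.org/"))
    visited = {"https://example.org/old-page"}

    manager = StateManager("crawler_state_example.json")
    manager.save_state(queue, visited)   # writes queue items and visited URLs as JSON

    restored = manager.load_state()
    if restored:
        restored_queue, restored_visited = restored
        print(restored_queue.qsize(), len(restored_visited))

    manager.delete_state()               # remove the state file again

asyncio.run(demo())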
src/stats_manager.py  (new file, 19 lines)
@@ -0,0 +1,19 @@
import time


class CrawlStats:
    def __init__(self):
        self.start_time = time.time()
        self.end_time = None
        self.total_visited = 0
        self.total_saved = 0
        self.total_data_volume = 0
        self.errors = 0

    def finish(self):
        self.end_time = time.time()

    @property
    def duration(self):
        if self.end_time:
            return self.end_time - self.start_time
        return time.time() - self.start_time
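A tiny usage sketch: the counters are incremented by the crawler, and duration reports the elapsed time so far until finish() fixes the end time:

from src.stats_manager import CrawlStats

stats = CrawlStats()
stats.total_visited += 3
stats.total_saved += 2
stats.finish()
print(f"{stats.duration:.2f}s, saved {stats.total_saved}/{stats.total_visited}")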
src/storage.py  (new file, 70 lines)
@@ -0,0 +1,70 @@
import hashlib
import json
import logging
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse


def get_safe_path(url):
    """
    Generates a safe, hierarchical path based on the URL.
    Replaces invalid file-system characters and uses the domain/path.
    """
    parsed = urlparse(url)
    netloc = parsed.netloc
    path = parsed.path.strip('/')

    # Replace invalid characters in the path (e.g. query separators) with underscores.
    # The URL is already normalized, so the query should be empty, but play it safe.
    safe_path = path.replace(':', '_').replace('*', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_').replace('?', '_').replace('&', '_')

    # Prepend the domain
    full_path = Path(netloc) / safe_path

    # If the path is empty (e.g. for the root URL), fall back to 'index'
    if not safe_path:
        full_path = full_path / "index"

    # Add the file extension (.json, since the content is stored as JSON)
    return full_path.with_suffix(".json")


def save(url, title, content, output_dir):
    """
    Stores the extracted data (URL, title, timestamp, content) as a JSON object.
    """

    # 1. Generate the timestamp in ISO 8601 format
    timestamp = datetime.now().isoformat()

    # 2. Build the JSON data object
    data = {
        "url": url,
        "title": title,
        "timestamp": timestamp,
        "content": content
    }

    # 3. Generate the hierarchical path (for browsability)
    relative_path = get_safe_path(url)

    # 4. Combine the output dir and the relative path
    output_path = Path(output_dir)
    fname = output_path / relative_path

    # Make sure the directory exists
    fname.parent.mkdir(parents=True, exist_ok=True)

    # Only save if the file does not exist yet
    if not fname.exists():
        # Write the JSON object
        try:
            json_content = json.dumps(data, ensure_ascii=False, indent=4)
            fname.write_text(json_content, encoding="utf-8")
            logging.info(f"Gespeichert als {fname.relative_to(output_path)}")
        except Exception as e:
            logging.error(f"Fehler beim Speichern von JSON für {url}: {e}")
    else:
        # If the file already exists, skip saving.
        logging.info(f"Datei existiert bereits: {fname.relative_to(output_path)}")

    return fname
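A quick sketch of the resulting layout; the URLs and output directory are placeholders, and the commented paths assume the 'index' fallback above:

from src.storage import get_safe_path, save

print(get_safe_path("https://example.org/docs/intro"))  # example.org/docs/intro.json
print(get_safe_path("https://example.org/"))            # example.org/index.json

# Writes output/example.org/docs/intro.json unless the file already exists.
save("https://example.org/docs/intro", "Intro", "Some extracted text", "output")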
src/url_utils.py  (new file, 30 lines)
@@ -0,0 +1,30 @@
from urllib.parse import urlparse, parse_qs, urlunparse, urlencode

# Common tracking parameters that should be removed
TRACKING_PARAMS = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content', 'session', 'ref', 'gclid', 'fbclid']


def normalize_url(url):
    """
    Normalizes a URL by removing the fragment and stripping common tracking
    parameters from the query component.
    """
    parsed = urlparse(url)

    # 1. Drop the fragment
    # 2. Clean the query parameters

    query_params = parse_qs(parsed.query)

    # Remove tracking parameters
    cleaned_params = {k: v for k, v in query_params.items() if k.lower() not in TRACKING_PARAMS}

    # Rebuild the query. parse_qs returns lists of values, so doseq=True restores
    # the standard form (e.g. k=v1&k=v2) for urlencode/urlunparse.
    cleaned_query = urlencode(cleaned_params, doseq=True)

    # urlunparse expects six components: scheme, netloc, path, params, query, fragment
    normalized_url = urlunparse(parsed._replace(query=cleaned_query, fragment=''))

    return normalized_url
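For example (a hypothetical URL), the fragment and the listed tracking parameters are stripped while other query parameters survive:

from src.url_utils import normalize_url

url = "https://example.org/page?id=42&utm_source=newsletter&fbclid=abc#section-3"
print(normalize_url(url))  # https://example.org/page?id=42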
src/web_server.py  (new file, 32 lines)
@@ -0,0 +1,32 @@
import http.server
import socketserver
import os


class ReadmeHandler(http.server.SimpleHTTPRequestHandler):
    def do_GET(self):
        readme_path = os.path.join(os.path.dirname(__file__), '..', 'README.md')
        try:
            with open(readme_path, 'r', encoding='utf-8') as f:
                content = f.read()
            self.send_response(200)
            self.send_header('Content-type', 'text/plain; charset=utf-8')
            self.end_headers()
            self.wfile.write(content.encode('utf-8'))
        except FileNotFoundError:
            self.send_error(404, 'File Not Found: README.md')


def start_web_server():
    PORT = 8000
    httpd = None
    try:
        handler = ReadmeHandler
        httpd = socketserver.TCPServer(("", PORT), handler)
        print(f"Serving on port {PORT}")
        httpd.serve_forever()
    finally:
        if httpd:
            httpd.server_close()
            print("Server closed.")


if __name__ == "__main__":
    start_web_server()