Initial commit
This commit is contained in:
96
config.json
Normal file
96
config.json
Normal file
@@ -0,0 +1,96 @@
{
  "_documentation": "Allgemeine Konfigurationseinstellungen für den Web-Crawler.",
  "PAGE_LIMIT": {
    "value": 400,
    "description": "Die maximale Anzahl von Seiten, die pro Domain gecrawlt werden."
  },
  "CRAWL_DELAY": {
    "value": 2,
    "description": "Die Verzögerung in Sekunden zwischen aufeinanderfolgenden Anfragen an dieselbe Domain."
  },
  "USER_AGENT": {
    "value": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0",
    "description": "Der User-Agent-String, der für HTTP-Anfragen verwendet wird."
  },
  "PATH_STRICT": {
    "value": false,
    "description": "Wenn true, werden nur URLs gecrawlt, die mit einem der start_urls beginnen."
  },
  "BLOCKED_PATTERNS": {
    "value": [
      "/logout",
      "/auth",
      "?session="
    ],
    "description": "URL-Muster, die vom Crawler blockiert werden sollen."
  },
  "MAX_RETRIES": {
    "value": 3,
    "description": "Die maximale Anzahl von Wiederholungen für fehlgeschlagene HTTP-Anfragen."
  },
  "RETRY_DELAY_BASE": {
    "value": 5,
    "description": "Die Basisverzögerung in Sekunden für exponentielles Backoff bei Wiederholungen."
  },
  "MIN_CONTENT_LENGTH": {
    "value": 500,
    "description": "Die minimale Länge des Inhalts (in Zeichen), die eine Seite haben muss, um gespeichert zu werden."
  },
  "OUTPUT_DIR": {
    "value": "/app/output",
    "description": "Das Verzeichnis, in dem die gecrawlten Daten gespeichert werden."
  },
  "log": {
    "value": {
      "handlers": ["file", "console", "sqlite"],
      "file": {
        "log_file": "crawler.log",
        "max_size_mb": 12,
        "keep_last": 7
      },
      "sqlite": {
        "db_file": "crawler_logs.db"
      },
      "mysql": {
        "host": "localhost",
        "user": "user",
        "password": "password",
        "database": "crawler_logs",
        "log_stats": true
      }
    },
    "description": "Konfiguration für das Logging-System."
  },
  "html_cleaner": {
    "value": {
      "remove_tags": ["header", "nav", "aside", "footer"],
      "remove_patterns": ["Datenschutz", "Kontakt", "Newsletter", "Zuletzt aktualisiert"]
    },
    "description": "Konfiguration zur Bereinigung von HTML-Inhalten."
  },
  "priority_patterns": {
    "value": ["/docs/", "/wiki/", "/handbuch/", "/kapitel/"],
    "description": "URL-Pfade, die beim Crawling priorisiert werden sollen."
  },
  "duplicate_detection": {
    "value": {
      "enable": true,
      "similarity_threshold": 95
    },
    "description": "Konfiguration für die Duplikat-Erkennung mittels SimHash."
  },
  "incremental_crawling": {
    "value": {
      "enable": true,
      "db_file": "crawled_hashes.db"
    },
    "description": "Konfiguration für das inkrementelle Crawling."
  },
  "state_management": {
    "value": {
      "enable": true,
      "state_file": "crawler_state.json"
    },
    "description": "Konfiguration für die Speicherung des Crawler-Zustands zum Fortsetzen."
  }
}
Reference in New Issue
Block a user