From c347078f271517e79e76a7f0c92be882c17695b1 Mon Sep 17 00:00:00 2001 From: "madgerm@msn.com" Date: Thu, 13 Nov 2025 13:01:48 +0100 Subject: [PATCH] Initial commit --- README.md | 45 ++ config.json | 96 +++ crawled_hashes.db | Bin 0 -> 12288 bytes crawler.log | 683 ++++++++++++++++++ crawler_logs.db | Bin 0 -> 114688 bytes git_sync.sh | 72 ++ src/__pycache__/crawler_core.cpython-312.pyc | Bin 0 -> 13445 bytes src/__pycache__/db_logger.cpython-312.pyc | Bin 0 -> 8693 bytes .../duplicate_detector.cpython-312.pyc | Bin 0 -> 2626 bytes src/__pycache__/hash_manager.cpython-312.pyc | Bin 0 -> 5105 bytes src/__pycache__/html_cleaner.cpython-312.pyc | Bin 0 -> 1820 bytes src/__pycache__/logger_setup.cpython-312.pyc | Bin 0 -> 3320 bytes src/__pycache__/state_manager.cpython-312.pyc | Bin 0 -> 3024 bytes src/__pycache__/stats_manager.cpython-312.pyc | Bin 0 -> 1239 bytes src/__pycache__/storage.cpython-312.pyc | Bin 0 -> 2915 bytes src/__pycache__/url_utils.cpython-312.pyc | Bin 0 -> 1199 bytes src/crawler_core.py | 277 +++++++ src/db_logger.py | 169 +++++ src/duplicate_detector.py | 62 ++ src/hash_manager.py | 88 +++ src/html_cleaner.py | 40 + src/logger_setup.py | 73 ++ src/state_manager.py | 59 ++ src/stats_manager.py | 19 + src/storage.py | 70 ++ src/url_utils.py | 30 + src/web_server.py | 32 + start.py | 62 ++ url_list.json | 308 ++++++++ 29 files changed, 2185 insertions(+) create mode 100644 README.md create mode 100644 config.json create mode 100644 crawled_hashes.db create mode 100644 crawler.log create mode 100644 crawler_logs.db create mode 100755 git_sync.sh create mode 100644 src/__pycache__/crawler_core.cpython-312.pyc create mode 100644 src/__pycache__/db_logger.cpython-312.pyc create mode 100644 src/__pycache__/duplicate_detector.cpython-312.pyc create mode 100644 src/__pycache__/hash_manager.cpython-312.pyc create mode 100644 src/__pycache__/html_cleaner.cpython-312.pyc create mode 100644 src/__pycache__/logger_setup.cpython-312.pyc create mode 100644 
src/__pycache__/state_manager.cpython-312.pyc create mode 100644 src/__pycache__/stats_manager.cpython-312.pyc create mode 100644 src/__pycache__/storage.cpython-312.pyc create mode 100644 src/__pycache__/url_utils.cpython-312.pyc create mode 100644 src/crawler_core.py create mode 100644 src/db_logger.py create mode 100644 src/duplicate_detector.py create mode 100644 src/hash_manager.py create mode 100644 src/html_cleaner.py create mode 100644 src/logger_setup.py create mode 100644 src/state_manager.py create mode 100644 src/stats_manager.py create mode 100644 src/storage.py create mode 100644 src/url_utils.py create mode 100644 src/web_server.py create mode 100644 start.py create mode 100644 url_list.json diff --git a/README.md b/README.md new file mode 100644 index 0000000..b1d57d4 --- /dev/null +++ b/README.md @@ -0,0 +1,45 @@ +# RAG-Crawler + +### Projekt-Zusammenfassung + +Der RAG-Crawler ist ein asynchroner, konfigurierbarer Web-Crawler, der entwickelt wurde, um Inhalte von Webseiten systematisch zu extrahieren und für die spätere Verwendung in RAG-Systemen (Retrieval-Augmented Generation) zu speichern. Er ist modular aufgebaut und bietet erweiterte Funktionen wie Zustandsverwaltung zum Pausieren und Fortsetzen, Duplikaterkennung zur Vermeidung redundanter Daten und inkrementelles Crawling, um nur geänderte Inhalte erneut zu verarbeiten. Die Konfiguration ermöglicht eine feingranulare Steuerung des Crawling-Verhaltens, des Loggings und der Datenbereinigung. + +### Feature-Liste + +* **Asynchrones Crawling:** Nutzt `aiohttp` für performante, nicht-blockierende HTTP-Anfragen. +* **Zustandsverwaltung:** Speichert den Fortschritt (URL-Warteschlange und besuchte Links) in einer Datei, um den Crawling-Prozess unterbrechen und später fortsetzen zu können. +* **Duplikaterkennung:** Verwendet SimHash, um semantisch ähnliche Inhalte zu identifizieren und das Speichern von Duplikaten zu verhindern. 
+* **Inkrementelles Crawling:** Führt eine SQLite-Datenbank mit den Hashes der Seiteninhalte, um nur Seiten zu verarbeiten, deren Inhalt sich seit dem letzten Crawl geändert hat. +* **Konfigurierbare Crawling-Modi:** Unterstützt verschiedene Modi pro Start-URL: + * `single_page`: Crawlt nur die angegebene Start-URL. + * `domain_wide`: Crawlt die gesamte Domain, ausgehend von der Start-URL. + * `path_limited`: Beschränkt das Crawling auf einen bestimmten URL-Pfad. +* **HTML-Bereinigung:** Entfernt vor der Inhaltsextraktion unerwünschte HTML-Tags (z.B. Header, Footer) und Textmuster (z.B. "Datenschutz"), um die Datenqualität zu verbessern. +* **`robots.txt`-Unterstützung:** Respektiert die in der `robots.txt` einer Webseite definierten Crawling-Regeln. +* **Flexibles Logging:** Unterstützt das Logging in die Konsole, rotierende Log-Dateien sowie in SQLite- und MySQL-Datenbanken. +* **Statistik-Tracking:** Erfasst detaillierte Statistiken für jeden Crawling-Vorgang (Dauer, besuchte Seiten, gespeicherte Datenmenge, Fehler). +* **URL-Normalisierung:** Bereinigt URLs, indem Fragmente und gängige Tracking-Parameter entfernt werden, um die Effizienz der Duplikaterkennung zu erhöhen. + +### Konfigurations-Tabelle (`config.json`) + +| Parameter | Beschreibung | Standardwert | +| :--- | :--- | :--- | +| `PAGE_LIMIT` | Die maximale Anzahl von Seiten, die pro Domain gecrawlt werden. | `400` | +| `CRAWL_DELAY` | Die Verzögerung in Sekunden zwischen Anfragen an dieselbe Domain. | `2` | +| `USER_AGENT` | Der User-Agent-String, der für HTTP-Anfragen verwendet wird. | `"Mozilla/5.0 ..."` | +| `PATH_STRICT` | (Legacy) Wenn `true`, werden nur URLs gecrawlt, die mit einer der `start_urls` beginnen. | `false` | +| `BLOCKED_PATTERNS` | URL-Muster, die vom Crawler ignoriert werden sollen. | `["/logout", "/auth", ...]` | +| `MAX_RETRIES` | Maximale Anzahl von Wiederholungen für fehlgeschlagene HTTP-Anfragen. 
| `3` | +| `RETRY_DELAY_BASE` | Basisverzögerung in Sekunden für exponentielles Backoff bei Wiederholungen. | `5` | +| `MIN_CONTENT_LENGTH` | Minimale Länge des extrahierten Textes (in Zeichen), damit eine Seite gespeichert wird. | `500` | +| `OUTPUT_DIR` | Das Verzeichnis, in dem die gecrawlten Daten als JSON-Dateien gespeichert werden. | `"/app/output"` | +| `log` | Konfiguration für das Logging-System (Handler, Dateipfade, DB-Zugangsdaten). | `{...}` | +| `html_cleaner` | Konfiguration zur Bereinigung von HTML (zu entfernende Tags und Textmuster). | `{...}` | +| `priority_patterns` | URL-Pfade, die in der Crawling-Warteschlange priorisiert werden. | `["/docs/", "/wiki/", ...]` | +| `duplicate_detection` | Aktiviert/deaktiviert die Duplikaterkennung und setzt den Ähnlichkeitsschwellenwert. | `{ "enable": true, ... }` | +| `incremental_crawling` | Aktiviert/deaktiviert das inkrementelle Crawling und gibt den Pfad zur Hash-Datenbank an. | `{ "enable": true, ... }` | +| `state_management` | Aktiviert/deaktiviert die Zustandsverwaltung und gibt den Namen der Zustandsdatei an. | `{ "enable": true, ... }` | + +### Zweck der `url_list.json` + +Die Datei `url_list.json` dient als Eingabe für den Crawler und definiert die zu verarbeitenden "Quellen". Sie ist eine Liste von Objekten, wobei jedes Objekt eine Quelle mit einer oder mehreren `start_urls` und einem spezifischen `crawl_mode` (z.B. `single_page` oder `domain_wide`) repräsentiert. Dies ermöglicht es, mehrere Webseiten mit unterschiedlichen Crawling-Strategien in einem einzigen Durchlauf zu verarbeiten. diff --git a/config.json b/config.json new file mode 100644 index 0000000..5ee1eb0 --- /dev/null +++ b/config.json @@ -0,0 +1,96 @@ +{ + "_documentation": "Allgemeine Konfigurationseinstellungen für den Web-Crawler.", + "PAGE_LIMIT": { + "value": 400, + "description": "Die maximale Anzahl von Seiten, die pro Domain gecrawlt werden." 
+ }, + "CRAWL_DELAY": { + "value": 2, + "description": "Die Verzögerung in Sekunden zwischen aufeinanderfolgenden Anfragen an dieselbe Domain." + }, + "USER_AGENT": { + "value": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0", + "description": "Der User-Agent-String, der für HTTP-Anfragen verwendet wird." + }, + "PATH_STRICT": { + "value": false, + "description": "Wenn true, werden nur URLs gecrawlt, die mit einem der start_urls beginnen." + }, + "BLOCKED_PATTERNS": { + "value": [ + "/logout", + "/auth", + "?session=" + ], + "description": "URL-Muster, die vom Crawler blockiert werden sollen." + }, + "MAX_RETRIES": { + "value": 3, + "description": "Die maximale Anzahl von Wiederholungen für fehlgeschlagene HTTP-Anfragen." + }, + "RETRY_DELAY_BASE": { + "value": 5, + "description": "Die Basisverzögerung in Sekunden für exponentielles Backoff bei Wiederholungen." + }, + "MIN_CONTENT_LENGTH": { + "value": 500, + "description": "Die minimale Länge des Inhalts (in Zeichen), die eine Seite haben muss, um gespeichert zu werden." + }, + "OUTPUT_DIR": { + "value": "/app/output", + "description": "Das Verzeichnis, in dem die gecrawlten Daten gespeichert werden." + }, + "log": { + "value": { + "handlers": ["file", "console", "sqlite"], + "file": { + "log_file": "crawler.log", + "max_size_mb": 12, + "keep_last": 7 + }, + "sqlite": { + "db_file": "crawler_logs.db" + }, + "mysql": { + "host": "localhost", + "user": "user", + "password": "password", + "database": "crawler_logs", + "log_stats": true + } + }, + "description": "Konfiguration für das Logging-System." + }, + "html_cleaner": { + "value": { + "remove_tags": ["header", "nav", "aside", "footer"], + "remove_patterns": ["Datenschutz", "Kontakt", "Newsletter", "Zuletzt aktualisiert"] + }, + "description": "Konfiguration zur Bereinigung von HTML-Inhalten." 
+ }, + "priority_patterns": { + "value": ["/docs/", "/wiki/", "/handbuch/", "/kapitel/"], + "description": "URL-Pfade, die beim Crawling priorisiert werden sollen." + }, + "duplicate_detection": { + "value": { + "enable": true, + "similarity_threshold": 95 + }, + "description": "Konfiguration für die Duplikat-Erkennung mittels SimHash." + }, + "incremental_crawling": { + "value": { + "enable": true, + "db_file": "crawled_hashes.db" + }, + "description": "Konfiguration für das inkrementelle Crawling." + }, + "state_management": { + "value": { + "enable": true, + "state_file": "crawler_state.json" + }, + "description": "Konfiguration für die Speicherung des Crawler-Zustands zum Fortsetzen." + } +} \ No newline at end of file diff --git a/crawled_hashes.db b/crawled_hashes.db new file mode 100644 index 0000000000000000000000000000000000000000..2f6520897d1bd9137d19255a0d24714dab574b56 GIT binary patch literal 12288 zcmeI#ze~eF6u|MjC@O{i*h;rIDvBWf0U9NP7^{sD>=YtK8c3`*S8;LB&A-h*sE+0u zItYd?PUZXH-tm&VgZtd{t_OZ<5_uR;^4Q3s>L{(%v4~Pimt(CQi?H?jq9|uuFAJ^q zUp|}dZ{@URsx@nWE!)6<1Q0*~0R#|0009ILKmY**{!!q4pf|g{p8j|<@m-dT;$oB( zRd!AT&ka2Z-4oxF>YHpgq$Z}5OhWHAl&j$E!VQLU?hW^9R~Kn+Ppo$IFGK0y_yI$M?J05$NS{9c+SecK8mMiJg=kbBGExL+rECwdNyt9*hCP2e_?A-^jfUL+69}*>yq5zP3nOZG?3*eHAU2u0n zkW?gk?p?TcB9Xoqzs2^UeSKX8P$#t87LV3dQAYIg$w77HVn=y)_aE zg_=Je3WW~9|8IUZ!CzW${s;Vt|GmW*e8$j~clE))Y6_2q;D0+`I~v&0z>WrXG_a$A z9S!VgU`GQx8radmjs|u#u%m%*FAcEcO&x;+15Ng&a&|Un<_e1?{D1SYsS^`3Cn7Tw zk4~P5;FHJ$-4WueY%QCma&~zoavc8s%&F5SBF9ghoH#c*6FGKnYU;$9nG8Oho|!m( zHcEa%&b(sgBIhTjjy*mx^?(sOLjN-urj%VYBQqzSosmB0NE>KaTr! zy|x``cQml0fgKI(XkbSJI~v&0z>WrXG_a$A9S!VgU`GT0AR5@ez6-|LSGf+~_f{6* z`yG`j_&!*97{0%$@&J4vsO*97dz>5aeYbNFzTfVghwr8k_rUiK)&<|&(bI36fp4v8_||d|bMps7pALa% z|K|RW^*`SCn|;H*pXi@wtxc9pQG?_UCOgt)FlC{g%g? 
z|GfE~O@G&9!LR<~f3ZUc+Y$rzU_2I29W{)jMm!QrjK`AWvGl1kC!Z1@!!vvwHInJb z(Fpz&{+KRji%B9VKX*^Q0@{2h$vjX9lBkZ;VAR}89 zh1pWZcumH$#ljp!a^MpqrNU})ZX4ueUHeg_ML!}X1ht*C@>XfdocANp;_~d$BG*|g zmK+_8rN`o2XVyLx<$hnH7&Yx2&z8;nY&L)Cidmes^7E^priC{?S&TfM1qn+Lc#U&Q zxk9OA=7*;Cwk8H%_A^pkcn_0ryapd-MxI}V=v{ukG!G+NLu}vdgGPc=qs@>iFh zmzAedV8kpg6mpA2)0$g?>CP3iYq|0eD-5cEHD>8cxc4h3UnT%NZ$V#DPl`R-85l@c8gDcsbH4nd)FI)N5EdL4qdlq^VzKL_< z%&PlrgvNrAXe=eX-|DdwL${+a1PrMcj|dLFU}zT&L+X)? zMC~OgU}*3*TzM^iUTtio_Nm@sNUcHz5M2x-5D}V9Du5^%jl~)OL~6mkK+(e7CMX(9 zrfN_G=RvW|G3!P@uA*uAEhK0NN<>FSg~Mzz3)ojI%ofU}q4L!-LaZk|s{IOU?n4Ql z^CQ8Cu&I22pjw~o&R$%MyL<^DAhCZKRLepGVP6kwso+Pzl(1WZ0ijyRi`^*B5kKN2 z1#y%}Sh}2p8sDKpadCLQFjuMqT^Ei;Y)ADHO~z_K;S18tATcp!gGh zR3#o7E}Wwjq=2z|+X2SxgYv+J0>+H!NIWQvQF4t(UN})K<_i%caUpWnEG}E%6o6GU zZ{{s?9x!Y;yRveBp$+wukj0GA2o#GZQ)9K)t3@C|sv6>CTX8_b1JevBtX((Y2xHt@ zvaht@5SzTYs$r)YLRd%&IH5)&(IVH3!yomdQqjlMfCVMOLKBX5$j@jA0l+5ZhDwzid$ItQ<$xL5BgR=+m`|X>@FVyB*%nBtJtBwq??_7JUgP z7-gFM>svt9n?PBX{m7CQ6s0`rHIl5ZEke91mgL*UNE*o)d?l;4Kpdr!Z2=>xS~tm2 z;jr*!3mBv=pc`pxr{|6sF1t2A+8halIvO2rg4kO4<gW+WkU zrhuh=5nP0Xd9MD{n6MYTduLiLDsXA}F2p6my-w4(1UcMz4ALx$4G6kH`k)#XVExl5lMEypnc~964FV)wqLpH1;mSswA_>kHh*!05 zrcfRkNObZD@7|Yv^E3%0x{-u)M+GEjL4E>AG@oh&vTw9nQ~}9TG?2s<5!$!`VUPwy zd$h6#0FwBq0!Yr_Vv>O*ZjcrZOgEVpUWN}K(awk+I*p4$jsWZA5KL+RCXQAd+QzwU z5_Kil;AmtV6{6I^Y#gneY~V>0>4uD#>en|aU_nr9f*O|WYH7BBm|3l1G7?4_A!?%@ z=Lr;xfGCYv0&w_(s8cjVsYeX;6q~)30)p5Pq<}UGcpgPT zWZcrIC?%MBzHn=T#4X)OTl;9c--_}R88r1P7ULD(eKcBq3ZzP9)IuHuK(Eo+LKkPf!`}N*R&!6>Vx__=a(RHlzqn%S7U+FmC{+ae8Z69s@ zxz@uizuq$6{9DZ@n|=g->7V!)Yt7@km4nA|8jRy((X?or`+$SR%q7f*j8(%I(43Gm zV(?!)TEHU9AO;aU)`=n5;{HKFCk+h^;T&p84jyYn91)DDV9=lwlXYD}iDa%W9)?B#R+YpD$=GaKJAH=$dqR0@T5k>SxeAy%x2@$%Hw)Uk4mQa4eMOAMD?xvvq z3^sswn?$8Rg;lb&h*rsA={$-ntdjU>G@ZnEyvVf$Xf z#V4n(G;cQ{*u~x(qC#4z{=#+m3(P>zgE}=jt=teToHg5zvDTNOJPw_O_e8t+&<8sOp6Q>;JQ%@UH&V{zHAQ_1)L|zMentS?<2k^~YV` z({*p>_jR^(WZHkT?GM@>h3r3T`FcyX`Ss?vHk}E*7Rq`E08Ftia?2&?*8D^|ni9=M z#g^N{I`JjHBI#E5v4W@Leb1H3S8n=rNhpLHi&aY$Rhx}LV-mI75{iE7#0adu2IO=U 
zVtd(U^9wGQL@4%^+O2OeXo#YE9TRj^3g(;SKG=qH=wr7jro{F$Y<42>S0jQLg7d%6 z6q1k|jCC{K8fD2#Rjr}=D`FHPvVO+9sASJXr!2ve4s!Z~EZ2oZb8KXE42yF%63vYF zRmrEnMj){`$Jgo4nrJLizlpdIrBYzgJ{H3DCp#y)83=Z$FDz0kQh`Y28wf~pk~yJh zeo7>vD8B*6zjhG>f+fK@kzk3ny^63T`zQ&zrv+vW1TJ9o1cD{?OJRpz!L=d#C>l9< zRpW*nfv`y}4vkQ^eJAQl_EFR_#>Ry88_Y-1t_RlpGKwVE2xvqSi<*L<)PjkGKE_5r z0yHLI^dwfT*Sn6=5D=wYOtDd+n8X)EU8507J<`aQIUaV$Ra}H}gOo;_Vph>RKxuTS zfTv4q2v0}k!%@*7m52jfOCw?@>bLI&oLsyR@iblf(sBhClH4Gbplh8HMszG7H;v{^ zu)9`qImiuC8Y#r!q<>7(2ty-Me;M^e7%6H!B?ap~7$Rw;WPK$RNN$kQsZ2Odg5Z%Z zyJZzo90HKk`VqUseF4&C8jv(23Fe~0WZJueE3iY}_3BTJ3*|!I0ZFGs1tR4Nh)8mS zR9sP&2U*`S9?se3c!^H*Adp1&Y*zjMawz=XaA$w1@7JIL;OU-!-*ch+o82#VeYWdx z=MQxJn~u)*|Iq$e+fTRM)q1Vv>&<`P{9>rbWM8yN&1*e zLPNeG9CN|;H&`46$dBZ&WHu^ZW9g9*Jmpi>G<%rwej_W&l%lvq#H_rT*XG;&v?&~92?brtYBMu-|2cyD)xGw@vb8| zIGR+LAIUgSLm(1u^JdpeA7HyDVuE{Ay2VVz+8OUVlB1(Zy5kil4DNbR$7o}}QEQvs z%{XbN93It4A{+w#%@DnkX-i}mRix^|j1zYfNTqp+!mg?hsZ!OB zrjd98k!piNf+_erHEI$oC;cH6P2i~;O2}m^kqQ|+i72AxV@1lOAV@`NQi)WY$df=S zMc;NZ23gm}#BnxuH6?ujq>85nI?qi=#fdx#q@vqZal&F?&|hR(GbcbQ&gV%Wl|~Zb z7_JAYC?%_qiqm=$NTrcdFxBftDoRooW}MxVKq|FRV$XDaNVQQ-3H+X-O{JjtOll+& zUZy{!qB%ZwLkTdgL@G}5N!V2C83gO&t{s?^CY4CVnLcHtN+>$HA>}s`!-d!|n~IV? 
z08+s+B8&6d8Rz?ykxG+{6!|^^Ar&sytn0n=Jz-KZPR4v zKZYI&U187}L=kbO=txe=h{i?*t!arS&DG-3oOyK!{w%Cy;g8`O-z2+@@m?J{YoQxi z09W^>lLyP0i`m>!Of8xD^6>E!!!ys$3_m_|dUE)Y)uQ$Au@jRgp1n6Qk$|MZ$P^|u z*_#<}-;tx)8kxj0c*XV$CfaySFnSyAW!TR{MBy~jigkRNZCJO&tdH>qA35Tx5iTh# ztbp5*r>PTF2PXlf%#Z>~R8v4q(dVO+M?h9Tdvc1=jXs*8#DW2Z*|3)(Ij6tZ9W3aI zW&@_uC_O5Sh7#i(oe)TM)s!X`QP<>HG{!k48elZ2WcOk+_$r=Zt&CH4${j{Y8tIei zXj)9YxP&&s>x8(>svR)Qz6a`#En1vVXw^L@&^0GfPbTzJ!gz>0h) z>@ee;pZf{zqn|E5<~MJl^7%~>^_2c?90{aEBwV5Y9|=7d3YYqSrT=co|7UxDvG=V# zKh`tc{mHJs?^^DKv z0H2f6{1$_*IJrh8H6EjSZSZjvZ2wxK(ckudktkBp4<-0!^!C2au)U0(5?>VDH=+Gh z(Y_&e_v~t^v}oq$FJ;TwTw#&l$fDT0M;J7Y4N9~`Fgn?`gU{=9h$!K_T{iNajCYis zmNiACBSBQiDT{}THVA~aIVD;x6l5~qKX#X|;f0k(?RjcM*2JE}74eL(frNFYu|KHs zZtd6k!gAKiXV$EF^T-OrW;2%ZBZJtCzlC){$x&d?K_*`&gYS_5vg+&gbT1LU+KZ(^ zp6>(v%Tc(U5iiiR7;oWn2SD{B3MlRmJI>MAp_!a;hSS*bGUF_$Px^UFVu2ok9i*aY z@VL%^?{q&qCXXI_W@75hsWXp>KY_ys6bs2y-Y~uK4*-X28!W;v)ildydGj@7q)3+; zCpR4xhp56Pm7U@vZ>lEW5)5vQS`%=AJ&h~$ke@NRPquN?QV4Ks82ro4g~^&3QlCl} ztKl^0y%38+m09gn(?9?@;4KUguadpYfZ}w3`BHIO*dAM804WFyG0ro3&X118!Xdo} zkzeH?P%12x*Rn-(cowd09bV4D#hy9-_YGJ;iczWTRjYiFzSAEpOwzERe$OKH4-E^f zmvOdFcVB~c2I9SZ1THLkQXLHx>>&m;pM$pB#z~H+6uH_|IvV4x0f9XPPvUyBJ66~! 
zTo3NE5TvKMrc;S%d=uBimiH`f2wOv%HHN&R613`v+``uI_6FH*26UkmHK1;`!migq zk2eWOECNL2be_+-a6zvTAqKhy71l6Wk9v zMu3)nD8W8efY!YXIW=v!eXadiLpbwXpcPD`n-Z-GG`ojQfZanh+ocTJ?tv*7Q6P$k z^VzH2vklU-E(Ro@`w4~uy__*jxYXnnFXNt=Ev_zXx{6ztvj$@0c;S6{wG@Hfg8u(K zp@mTRyTf<)f29A>zR&jE-TSfL^F3edxzRJw{ekX>x<1`C)%km!@9I3*@iQG~+P~C( zy}hID{cZQPe!kUg4M9D?VDoFu$DkkJ{?HfTgUBz-=3U3KL2uB(03!u?E&(=uo!oXa zu?bS+x}gM^q&Gpy=JzqW+i10)Dq5JXJ+PcDTG{#8p}E2`zmj!Wv7+y`p|yBWTG249 zA_`z6qG{d;8RUGHr1E+ULyNI|45N^wO$;(ZJzmWw)>{0cOtlsKBIE77yIgP&L(*Ow zT&p%F-Zbbo_{CB1zR}_94EuJ8;E!$qp~zTq*h=HvI#Wd--(tKWm)I?u*$F!)bf&a8P)(%g7`CCZ4tWo2h7`^OVh`U^>MG+6q>uReY~c`4Z{J%(kUMvt z@jlUme#Z6K*d{8`WW4{=ZINl;mKd5=ZrajP7WV9tnX|4|_w8kTMFbG3Uy*pxi@$xl zLLb|jp|;$&ml-s3QjrqPvKH@xbnRQ0dg!gY!hkh+KXX?IfRCT1tx)_)G|elZy>4BX zASH$j!A!PJVpPopAFq0m#76u}djh-0s%_x6F;4pHPJiOZlb8XsOo|-b?hWi3=qlq3 zz9*!;r2F`-k6_m*m3W{^qoV(RIP?>t@b`u9>;KXIxAuLiZ(r|+d!Ow2{hrG`9o_HlKHBw(uHBvg zy7P&S-|M*2v8(-q?GtUk(Pp&1-n!h{)bgR0RP)a@UugOtO;?&);A0y6HD%vN;L7ND zl42sq#~Zh4-#q8ucnxl{F(c2fmde@ud}%&gHm#u*8w)k0FtYYj#X5Kwu9#O>ie{;F zvBXiA#*~5$1PJ@tQP-Y+M$&!b^Qe)=br-}O zsEOmv49MHO5#N1)fD{z93O3`n zN516M=s#JA8Yzzb+5qIzk&2)rQXT0p|Aj^&c-p1eF_p(;dJ)UtW2HF3JDM-%stKLF^V0YEd9 zVC>NVux9gKXu`--f2v4U=sMj=B`ILwKD!HHKnCdq*^@Xro&Y8|PD2>vpeH5s1_sw` zPKN%NYe_*knD$U1yE?{`$zINUQmz^#%-TSBE+3crv4un04GH9MuiBhs{Aphk6qc3( z7|5|mcv!aEaF%jRSEpfN(`w*>6k^eC#qr#5q|<78Oz6i{f&^p{HYc9Z|Gzu*b>#m$ z+W&h0$-Xc56?%WU_grtN=e<1-cK>|$cXa(uSGMzOokht1|GSP8?O$x4YWtJ6Yrq9q zZuy&*kF=zlKiho1>GMq%e8{){*P2b%Z;g(psFodk418~b=cYY3F~+h|D3urC0KJ^! z1&y_hV%ofXOGXpj$eN8p42I#PVb)~xf-HBHx_*knQKr}|s`1GEvSMR777O;)gP zHTy*+N6zMTR7*K5NPB^%T5`34JNkdu`b&D+uXjQvP! 
zV*d@-!Gi*+;&dn~H$ZA}x_PTKmo3g)`NhaQbpDJ-&J^i@?MKtP^ZQ$D2FLh18ONuM^GLX$D@IBS`a&0$J zB{hUFYc^7A5K(H9%>|?}fGCz~|yMGQMqTU^i&y}#KZ8|WbhrF*g= zhNJM;Aw5`Sst96L@C70W<;|Wb+ngN2UBiGfl7}N^Fw>&Gb>VrpNHr+9U~~G1u(VG# z!|DQe?vbw9oXf!-fzVD%q=D1nfS_X%1eWY0xLe#MH2Sdx4B83=gz@L?2XH8Nd7pMD zk+@Ka0GqRZmJPfO)c)7Jt&xEHAN~JBp_fD9>HaVEv;Kj;kMxc9{%r3vzybJ3_h-6G z-5p&&*cIvg-p)fEA8Y^1_I&%^wtw4}X#HesqUD!brka1KInwkiO_|Ww;3NEgxRjku6X2;ufv)(rUH5ej#h+;<;zf*7T&WH!@+`Y;z0KonUem12<`q#H*J+|3OfW1JXs;WSME2I$ zsa_7@Fx3EvPL?&BGn=?m653(K7@`tLK+UD0{Gye=%FRIjx8dykvXwu&xC-PCfF>_0 z&_w-~#UdVmXmWu@6U|UUc!)+5)?x##iF^Z=PIn2ha>fNGTWU;!DRCPZO9pLsS{cM3 zGragbni#|B1;LaprSt|YC7bh{xU0NDILcPX$I*ni?zF(-lCg#?d7G1-xPvMBQH5j0 zjV!K#bMh0nWvidQH;ma*pe38rptvnt-PmzY zS)eM$6pD+(5XnX{09no{kVUEU_?Vz`f5U(EjqW712pYQo<&tgwg_YZX&Q2+my&aZU7+}YdlfsQw||782aZLhbTZ2e~Idh0;T z54Rj_{+Z^7n%>j2EA+|G*-*%G$bzHvxIsJh#wZw8!u#~KbR4SSNIR6+x~y0_4lg)z zQ{w19Rd8;IrE|q9mVw9!?09b1^DU-YIu5TnaxWbiUE%CPEl+|62!k#iN;q#zXU*Yd zN3tJe&@O_J1i#$tp@OU0C0gxtF&$oXt30{!m zkWr0C9yKctuRU@niuE#hJ8D#NEIYjX$X)KF9nw=*V!kkMLSqbCo?O8>Xuv5!nX=PD zoj3aNgtOcqXi)SZ`k@4E(x7q8X~GG}UV>&p#)S#EKtrJ>8Z)jsAhalh?eaLZE`pnLiTZfd*WEH~?W*@^KCECMvB*DvVj%`SUg>{%}X&^&<+3c6+13 zAmV~#A)E7mobxp^;TThFe8q6&zAxCE1;meXW1>r@GcoCHn6o(-h#YBEzs^n6jkKM) zR4%WS#)pR&i`j*2K3h8iEt_No(Y?!a z9ufZIg3W0{+^I|aTpD9=nj&CR8B^i&=WI?D;x3BPe!iHi-h?Pht=+IWF^Ib;Nb6dX)t4b0`oV ztA1X>(LiI#6&qMX|)ELPD3eiX401e#cKndo}-6+^O z)R250YSP7~=&2q^sS|)fYYuVBY_f<~i=gq69C#KI&a~@)E>vhfHbt$rY9H(NgT=)_3 zDkk^p@h%@5cc4yX(3ahiI2nL~u}6Xa8-o~)P@GS_U2%Bzi<`&^ zNyTDEYP9p3O>o2E3>UJA6{nricsx3q5Q0KlXH+w8?YvlWINgODywgb_M2md91J>$r z@{0?y+BLHk%wom4p_wOne3Tr{gdyjoHB%NZl~h@kaD-_@DLb4NL-u}jqYBnE?Wn?- zHOPn1*wK|@;i5TLK3XoU9ECezAxuS5Wx$$N>s&-8UkF0Rkt) z>Q*hX)C8;n$t8z#YPjp#^#f{vq&5b%tXYS{i8khBEYZwQSQQE^(T_^P$rXo_ZxCm{ zS`yJ3*1(BgnwK3;$l=Z=Yc(#`qAC#uD0a*obnq01mJ8RdTrSHEN;7}-+;lYr{){;b zz(Ue==tmb^N`JsY(R65s5(`olz|!Gxo(^A2NX!_!fW?#66!| zTKBBq;fx=bCFIeKDgd1Vc{~kH0?;cC=K+!3akW4RvE)hvK%RukJAG(P%ZH?X7pW 
zd{^^NHScSBEBr!Zzu3Lbqo4?hd?QUa>f__+7;n|v*bMIVsIex69}Q2emX``e>pHLA z&&)t_61sLX@D#3K!-D?IF^m#qrZG&wy)$;+fj;U%`-ZfH>0V}#G#MS^CU6kA5oC1- zDTE)nrs0;^8yvCQ9q7^?VAqJ5k&Za;d=@t6w!8wj?Ccrm5zx2OBuk|z-Aw^G6HiVz zSl`g|Va?Ht#G}G@JG>?QWnyZy!Xb?-^5fN^aE*;`$C7@cESpefZVre}Gsg<~abxV5mxP-$_8sWMmE#V{` z0fbql#OOp&ZL0-`vk_YTsk1RjUS9*6PDE3@Evb>3eq!R;%(OYTTC~d7f-3uV2gn4d zvoT3WJz-*To|&lAJkiR|o^ya}V9;rkmw>?)Dn5t8Q_weDf5meS+pO=|)%<+U9J+3; zc)8mfZsxJ_5nxWT(8P)u#yI@Y^$) zzjm%x9YWI9OglzvVo-*3ojx{k;hW1V%0&~x{78ng<^bctAd}-h+K z_DwafWQ!$po_}N{XXY2nOXCqE8d)nAHduEeS%ib2(Rp%(G zOiq&}=;~XLJ|?!O`%sx%K=zme{0W1$oNY=dtp5a1xbQsJ&4n_N%PpGArj<8~5wQ}f zDph{dC3=Ff-sjka~g+%SPkJ`#4U#o?R`FNmPbizgFw$5O(Grc*)&$FCY` zVbTBJ6Z)-Ccz3_k_mh2ty$|*Li|!xk`hQ)6op0~>(e}S;Z);m>{bwy-X=!PmYq|md z@Z02z9d@vJ&ff1*@7Q>pQZGKWUSXLGA1j*k=uBmEnd$QC!UBd5&K4mE49}?W z!AlM_-pS6@*tkK~IN)=Tv%$vbu|naJWoBlU%!P91G@Qzdb@{aioEdx-Zs!W-LLJ@s zI65lwaRq%99%lg#F7ytP?|?kyK|o-Ib@hK+7L~Y=A;B6M{f&(3jgATDq$TJZHMA`(gtaO`zebYMtjbFM5aR%JQ$4#hC z2=1Qkb+9AS?vU_^Y!Tq|Gjt3Mf}HNNN%+z5F)veKc?X&)2OW3A2S`C4tN1)>7>!1u zr2@DFpn15eC3BAd?UvgH`639-7V?1uK5urgfzx(Zr^3wWh(qFg=h25lc;goe29@HbRgF#$kk{(I{GLCh~_ghg+gv_DQo33j{~o$na52MQ2h}H zJ7p!(G@K8uerQz{Z>$V@F8G)UyXas)t#iKwoiVa54`hepHlqRMjKbO)ezaIvgF?~F ziK}o-m(aF5znTN5B0Fp4z+y53Bh!7(7n$+M;0E0}CDy6KjjfN#YAeAu(jWB6WL zWJwjL2C%3YjZO?aYG&b8pnqSN&AP|oEZVQgbED4(AX^y~8w!HHg)O9KXB}YH9<=2s zFU>ar=|&(q4170Qz3kF8%1&SqpHgZ&f~lC*C8TDS z1Ki$&j$Fq{_wR#1fc?p!6(rtu>@PFz?fk9!|DO+q2l^NLR(pS~=MQ_1b!WT&bLW5S zOm|GT|7_duv`w_mwS2nykD8xsDuh1&E${xbeGYb1NcVG%(Kle>V_*Rl2zLYS+~XIs z`NcwJ>csR+=43IuY(hEnCFxWK%(Q6Cr0*62@t{n3q``K@aT8k}05do09s=4DdmPTN zC}U=ls*mMUOG5VXRsmB`&2J$+d(ikYOWxb%+| zvnxxP=|`VBEv*-Cn;6#of?~~6b(4I8hRCTMSo2=yz@B%2LvqlOUEc&*kvE398-o|~ zG|+oF4rVh1=?V*QWEHPiP(u)K)xXIBe#!ycU62B^pe^VGWVuBW>cG0+N`BBgFj10v zSJ=%7NObeUAuxX^)U|&_VJ|TL#qxl>z5-G{l>VgD!io(_pg> z&~y$u?(R*HgiMCx^Qf^YRoQ@u;&H_ITw!(=&RczEz~gJU(?Lp5dzZY{Xyi}CqUmjV z4c_Jfr|$)P5As5ZE6_il1`!(_TP$8%DHni}Vr9vigG^1Km|YBf0_<~;)ReF|<8*!l 
zCYTh7ECOzYW1i-~mK@|dbq-4_B(BJA#Nqn-bc3TO$BNM5n3=Z9a1P8BtrcEotI3mj z+zt0QNRcY_vO{){&T_z1jks`cTHrU~Z#EEb2U1lk-X2$EJK_-J;bFH1o)G+<9Odt1 z%DjvL_RL9OYBG!0w&A#VvxB6rM7*7B6NMKB1yf@8em!r-_ng?G1FWxujvRxFE6xtc z9>+OhW<%ePuhg=)Erb65Kow^##T(&IBT zXETqRz%@|}90A_taFVMvaRQ0z1mt*FbzZ4$h+t?%hZGbrVo50|9GyBw`15FrU+~;e zZJ(Y@)$K{_b~w|PD+R@;KVJtt>2oIrNpyQH4y(kuSNbu)C>lU<8 zR**~=>gBPDrze9tZM_cX{BnhQQFg*>;E#dPqyFHWN@@tlUBgxFN@3u@6wveMAW(zD zad_$$%9*p-QVH)N2&(Hn4yPw0@R3MiRHrr~_K*iWa<`J6%{ahaHfX!HTWk!%{RD3F z=g|?P(K@$sX6D>Vu8<9^@7o+;MjK$On_-*FFT+T6T8UAd7XB3e2&!+5Kbaq>&RU1b~texv5aV2A3FrMp<8MT;X&iT^hRo<!8# z&=1(?r08)dgRVc~w%g$pa)id|%rrIxjtSJr0T=id!UKi_(VT?gs98lC8ao2{N@$>E za=cJJV+z%K0fWnX9L{U!!f>1$f#Ljdbd=X+)rTotNe?)lPEB?m`F0Uv27f^b{i7gtr&ON1?{rLaIQ24q2 zeBUqj{&w%fJ#Xv&MAu(*z0mn;$1k^kq5VW#x%G=J-)MQh`Rz?#ZR!sd>vjLx9hE&8 z3UYO+fdNYOkMPF;9T*R;57VApg`j)h1j2F69($&;8{fApnTJA9X9<4}XGw$i*pue9 z%xR!b#=aG(2oJc$c2|HGWq`R(Z$1UtJgv?wxeW!#TL{n2SAg|I>Ji29spLF=9!(mU z$7|pYdm?)!t6nAEb{4z6!YNf;ea0{YiU5*72Gd(0qzi~?dfC}a6`)*^njLU>>J;J6 zL97OQUr$UwbtVH$NKn~&$;@XKz&+s82)NV_RFK8R-Ee|)AzTCg7&dYJ2;+4Y>~sZK zUIuMB3POheQ9!Vyfeo>-Ykp#4Iw-buReBLy7B{=TTyPRwU_J?vt3K#br$lDuN)N8^ zhh$+C7!FL3iAHPp6ts>&%ek33ZWV!3F!R(bQVV9LpsAz?ML^~@q*!8=Zd~=QR#too zMEMne428qEsOmdc>zNsQu7da7xI5NSI0gCxx1PaQP6N^ZRDNZ(ybU$P0~LNjj*A)y zCT9?Jt^9E`y$!o!OYzww6}&&kc2T9G1cbHCA46S`5UEnLi5LK*J~&(FYMjf%r}DTX zu#jHJ14D}gc@VHe6}*|pc2#$1r!X>F|KaLS$O{#?x<_hiLm{aGEdD$?!WG@n)lKL* z;6b~$f)@q3o7(t9NZiNUs(4{#y^!N!3T&wYw+cyx4=5Z}hQ*%)pHc(b>^HCSg<`f; zF0Rg%@ifRnk`OLqsoU1Pr_zis;-GZq8FXI702bzLNEx2W`y>C#3?>8&YD6z7&Q#|v{o9f>zrfP!Bt6T%r2MZNqnBz5Xf z#Rr=c%T$1Lf6#FiU!quw6DE~_Tu*~D^qc2$>&*l_;rCaN-rw$#4we*su)qvQw;^ie zaT4s=3XuLwF>!nYit{7|eR_?K9-W57eP*ImvKI681nl-y_;muVm^hysXkCj{=U;AIf@USg&szQ9D2 zvHFXp{mli|Jiji&)$ojZS48umVY1p)*hsX=7PG+X9~5GGEBrzUV$IW$I+#){xec0E zNDZDc+&4ja%4CBi>K^HD8=QlgoIDMYpaReZX2_^ZK)ku9f|p$oo^pZ?$|ZnrgqN|_ z7qaj>8#Y^kt1zVYCD46Bk*9&a!c?lkw&mpUiL=}KHtww8y&LZKB|b|MgJ6=0m=K8! 
zXybUF2YXuu?&cVDWCJ*%h~Yt>VZ)xq`PHNW2U$4yU$^7VM|5bx();_lqP0rYzuuEnzmYWDGY zbd)=I4Mg)>g|>C}feloUM4Y+f{Wu9?G4r{lIOl48(6XiY?D-0?hf9sXpk0z=kUs}* z=mzI}&pv(@slykn#nob6R4s3+AkDbD5tz?{R68&zCd5mv0%FM4@-wr7OyiC_Dhqli zF&v)*yfqlZpMorJVR;!z)oSNzo#b|hD#$%9Wn*wc^ece!E#2)gLd0CxjNof3?5PT8 zBX?akgX@Gua(oU_Hh332g*T4GgBp@O6;4BbNrX-46avwSid_CUI=T%kb_$^(8pHX= z-L5l;L)z;29OP-RhR^WJ?(5F5?5S}6ad$KZ=RhxU;JONUovX0=hqsa*>ZcNbG4A_k z@LiJ`bbJniHQ07Xsh^m}Krj~M2DH>~uf*{V!FFR z5QbM^IuHY<8;$8_3e(V&Rd1|!cLiA3n~UHGzE690&MScmn{fK z<~Zhy<|;r6Kj^q)av+of&112R zuBHb1N2gbFWecuZ0+tNC3T+9w%aGgh>^@-cmh6@ol^e&|M@Pi#i~^eMJ|@EE zD{y_gL;wqsiIk$dJe7tn#Rj8}pX{+8|No<*@Jjy=_WfmFckiX1cXofHdr#M^ogeA! z??|?PsO?j2hg-j+s*_{P43Yj;+=h3l7cUo@9KXEE^8Z5G0=Bbsyqy%?V zIDfeuxk=Kl2GS~SL!QRV3?L)P>C4?2Nzk#v??6(;MiX{|kT5=!3t88 zyQ$7#l4QyNA4kWwAxQ0c8sGyWSvlbY(zybV5PT9&qTa~2Fmbwmm*n;e(vuTDAVsc- zZv8mRB7)!AzqsGgq6TDia|e|rVV$0>c& zt_V!b;JioZsKBlcsBgXOpl>7^r}VAxL4Z)vXt=(S>6af*_1q&^;WXoNj3YriA#j~l zmBOe_eQRZhI3ALWyE~0>dnr&ISq1+=^bJ*2aOG%yRaKm9T#n-zo%_ zBR$j-a?)|x!X;@E;avcWVD%!524V_avx4kzP&cp)jkCAz#SVCm?yvAWy=4=b;z_K| zFdp#YI5EEDaS~9QjyHL`N#9TovbqYNN8`K+y@784Dj>=Qyt*(0_dn%U-Tg5E-^Aex zUhVC^9F0$vj5xteW4yMYKG^X&8L0iiyT09*9iW&Bp9=a4W1Qo+k#EGx*K^@YZ-rm- z?XLZSIZ*|*_!tu4^^>TaRY-lS{(oQpjlK`}HADXYYWKUlez)^$ou@kH+dtd(wYFoe z&$fJ~`7fHEZ~EbXLiPXmRXAn3i{jCU)9o4f7)qY%uQZ=ovVhSG5Ebqo#?H19XFt_B zi7Z(G`t$+&VOOv$4t$2hX2r*F`;Aa7rta25wzz=ZdNY%dMz&U#kZ%P#XNU`k*qbU8 zF*<0NOef&uZQ*+JAlcap5~EAF0wR$1J3a@|8k{qmzP7wj$OkT4?yVplI&#Lt|}W~b}k1qUmf1>ME- z4#pbUP{-$(+=jjhyuHKIG+3d+iO>z#HU&u(_C0jdB^!N-VpV)#LWd^c?PuM7u)_J! 
zA0(zs915rjpc${fyf)yZpbnkWq92yURoMH$FlaQIRjztEpn(I%T6P{yo$2y5=;54R zGBLg%&;`4v!U@sk+CZ|g4qi1N_d0y;$K?=)|@rPYn|p}XfiaEoAUDMbh^DCYLcT>_ z^4GxV!%e}r&PaPpZdLvN-wlOt2mZfz^!~5j+j~~J|9RI}y6)>-@Az>0U$(cmEwtJ# zUv24azSQ)N(C-G#;Qyoi9=o@KbmO+WnJ*4>e#Gj+$54-2zfE}aCRjKmglpb3CN5Zr zoQK?PA%a9?#2uf5qz%5L<>oZ-LfdOVTveLO6-vlj67a}7SV2N^DUXViq2C5v!N8uScvFEH*m+Fw~FOexT)@JwhZ+rM4-jnMrQ*R zPEYRcPQgi#+K0IrWAz6JwiF*22{}i(yBG`vBkgj04w5yvhTt49T!4LHUBSLvj9YJT zQw3c52P#NbE-@0qT_*H2gFbr$SWrGzX0h`7R`kpSc)GpzU=1Z?y>$X}`6 z;|!P9O_}qYIIon!#ud0@0Kz!)fsOC`E68MSyB!Uj7QGknf_Tx!c|b5lMR+pM#7I?jGV3fYN7-QDW_Lq246;ofTv{mpYzdHWc3lK8DhYZCdJ{ z<^k0oob%l6l7T2>6b7G1)2RkqK{HE5Gh42fvDRS(?Wy49;LKh9fpeg@0g!uaL-og2 z(nGa5zZP6hA<&(3AaA_!)gNN6@@)1Ruey0GTQ=9S*Py<#Y{K=KxG#p5%F8*AMrfph zSA^U0X-0O@@o8=cQLvwxu!|LbUHJVn)+>B5NDU{_4UX1}Gw_x-_Vz~5{~rv!D-_<{ zf3@!idjGC>Kk)y(3;6%{1OML-b==neP}>JvKhb)q<+0{3G=E3aFEm-9zlVRm{a=^Y zfd_tojY?WiQO##@xWEWP)q;C+(#l(1ViLG_c z&@R7e-I(~IH_X7}uo?2R+59EQK+IbC`PKYlWZ{iZ79)>mL6lNt(Jaj^fzMnr^FveX z;nu`}n|&YWK>`n-o&fukZ@dPdWk#M~EtSCnF3rQx*3c{KNOaC*vqbZ$F`<|QN~3Z) z(~GJiW^tjATP&K^+!D-du9#iRm511E>%e3#-FXFm1IgEikE5})U`VU&q9ibK`>=~Z z`{q@SM8$Q^MLs4kBpTxJ{-uBrUmKL2Uq{k$sYDfKOtv84b11=XaJ$gr)Y)To=k;!1 zM_O^IBp7EvuUuqP7dB?V!UDbQ?4@;194-e9QVNhx{R4pX^JcLC46?9A_02-wpqU+5 zNB(dkXpo}gWiZ7U@2%K?r+}LGIt%vnI?#v@+N;t|ODe);K$RD3bb$QX>CB{=y%ab! 
z(6^4n;;bM~iUw*3`j3c+7EtL*vDpjjoJ8E60E)v^Lv&CQt{|#Xoi{K5@a*(-=A@N_ zn&8X{C}KnJC}3o4aGg_#%iadrR|c~NxaOBy)wjq!On{wRM*?x_f?<$Bf$7kAHO9UE zMrwW=x@vZ9$0(i_@TW51zw_;5h``10E*8tO{Pd?c6J`dqpw?+RYk% z>EF#PrstfpHzO1M|9!206$-yMJlOwQ|FOQ$^*!6y()<42qdlMK8Seg6_hVh}?;7lU zz4Ls>fA4rl$35+zX}{Rs-S%4B3-C)jUppGu(ZG%dzWp?iT?d|>0p`v%C7|Y=IDPQ( zW^S>g9=@JkN0J-nX3WrzA@XZKW|p$cWeoJ?uM~2t5Plzz#D-$A$mvI)UOxiCM0eM) z{*#4jf|KS_&MZc(dc_3GHn7=N?sfx z_QE<`W-`Fs-Jkl;7o3hq&C=@J68YK2$jEc+cyS2paGlw@PahR8zq_eyXm}luq>`~A zC@`5eFRemYF`rq7Yd!{;JJzKCY(d?yRwC8UH>(Zukia@RjhtbnvUMo}KYnfxUAUfmDHYyaYlDtZP42EF5$N z)-|e91lFVL{Hh0c|4p%k|Fxg($46eKUj2@3KY}UGt zmnN_aekLmRu*0@Te5|gIt)|F_j649`O{=Bx$b4ZLyVcg9Rpdx%9d1P!VD39kad{Bu z7e0ndd<8ItI*d7NTmrao=gTy+UtH(+7RY5&no-3TD42OxJ40HPmCz+M_aO6x%1@ACzTT$(NbkrYJ3MsXeJ^!KxbygEmpnV33r z>da%}PpY^9F%8bsweb%iyII>=5uVrt>IX0pZtg}&l$CX0$RA)S@uRDIpN^tnYBS+= z?~9|=LH=m`BviA9Xt?6;hXOII@|+4zz96k3?L*{728}IhuqxO SI=`u%Sp1s7y=`tufd3D4_E*mU literal 0 HcmV?d00001 diff --git a/git_sync.sh b/git_sync.sh new file mode 100755 index 0000000..2d59fe7 --- /dev/null +++ b/git_sync.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# ===================================================== +# Git Projekt Management Script +# ===================================================== +# Funktionen: +# 1) Erstmaliges Hochladen (Repo initialisieren + push) +# 2) Änderungen hochladen (commit + push) +# 3) Projekt vom Server herunterladen (clone) +# 4) Benutzername für Git-Server speichern +# ===================================================== + +# === Konfiguration === +REMOTE_URL="http://192.168.19.10:3020/madgerm/web-crawler.git" +# Wenn dein Login eine E-Mail ist, benutze %40 statt @ im REMOTE_URL: +# Beispiel: http://madgerm%40msn.com@192.168.19.10:3020/madgerm/MiniMal.git + +echo "---------------------------------------------" +echo " GIT PROJECT MANAGER" +echo "---------------------------------------------" +echo "1) Projekt erstmalig hochladen" +echo "2) Änderungen hochladen (Standard)" +echo "3) Projekt vom Server 
herunterladen" +echo "4) Git-Login (Benutzername) speichern" +echo +read -p "Wähle eine Option [1-4, Standard=2]: " choice + +# Wenn der Benutzer einfach Enter drückt, wird 2 gesetzt +choice=${choice:-2} + +case $choice in + 1) + echo ">>> Projekt wird initialisiert und hochgeladen..." + git init + git branch -M main + git add . + git commit -m "Initial commit" + git remote add origin "$REMOTE_URL" + git push -u origin main + ;; + 2) + echo ">>> Änderungen werden zum Server hochgeladen..." + git add . + git commit -m "Auto sync: $(date '+%Y-%m-%d %H:%M:%S')" + git push origin main + ;; + 3) + echo ">>> Projekt wird vom Server heruntergeladen..." + read -p "Zielordner (Standard: aktuelles Verzeichnis): " TARGET_DIR + TARGET_DIR=${TARGET_DIR:-.} + git clone "$REMOTE_URL" "$TARGET_DIR" + ;; + 4) + echo ">>> Git-Benutzernamen speichern..." + read -p "Gib deinen Git-Login (z.B. madgerm@msn.com) ein: " GITUSER + if [ -n "$GITUSER" ]; then + git config --global user.name "$GITUSER" + git config --global credential.username "$GITUSER" + echo "✅ Benutzername gespeichert:" + echo " user.name = $(git config --global user.name)" + echo " credential.username = $(git config --global credential.username)" + echo + echo "💡 Hinweis: Wenn du dein Passwort speichern möchtest:" + echo " git config --global credential.helper store" + else + echo "⚠️ Kein Benutzername eingegeben. Vorgang abgebrochen." + fi + ;; + *) + echo "Ungültige Eingabe. Abbruch." + ;; +esac diff --git a/src/__pycache__/crawler_core.cpython-312.pyc b/src/__pycache__/crawler_core.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7cd1024dafda758c07d8f4896a93a15af0949133 GIT binary patch literal 13445 zcmb_@ZBQH8m1wur`ao(vBmt6;5JCup_z+;Q8H2Hn4Ysi{_88j}+b2b1uIrEYuM2jC^VE^)(U1|G^*9LuG+y z6iE<^1Vb=nfEdZV$q^F1WdSlM87K{{# zup*!dYDcsHD;ZUQ4i=6Sim*CR6f7Po7U6CZ?dLwaqD?mNUt6{CmL z&KMxAW=bJ-+$3uV*6JjX0;rNBmW#wK$93X{{33Ckbd`OENesEjbm1A~3nL$Q^UY~? 
znsv$2s%aGPhkc>68h|M;im-sCMd8sfCw?JZ=`ib^=6vJRf#LA<)Mp@70_TRpDCiCN zW>^nYg$yd7uA zOOY-)i#rrKcY`}J$&OuR(O6cfA`J}7F)ceJiv&;dga@F^<0IfpYK^!_{G1#Dv1z6w zEb;*RAug@T2r2^ERD=zTqcW%&!9?x$7p|7caFA^YdKu_yuw@K)j72RGG$z5Gu`puY zQ#aFUkH;7CaURc%c{{sqsmO#$mq0R0q$-?q)UD!_(L6`}C;A!W5o5R)(xU?4UHCma z0l6m$5_-nv`oYUS$ih)MFZXYiMz(&G;whf+ON&XO3h!&=0*T5vOkaF>kh(NUqYAD- z%J+*)RQk%iKgfPYmAukl_;MNk;uR8KB*7I+<$m1bZ4V>k6+a?=lIsUvKs@O;Y(s-j zUcuoJ5g&iK1m)k1{6k7=^+Sc!lMdotm{H#V+5J$yDF1VXf{3bkl^+klwuez(ttSZd z29#R16~U*}<9A_si`1wrH8n_}@~il~+F$k6T#1UrCrO0%yn-8*$PF2)-6B8xS1B@T zEI+1TXDT4soJ>HWi2o$XyeoTIE+oM#cb+LcwI3RORhb#ehKWw%)_;NDJp6Xm5N(7= z9cJARMLBg63L-*Y2UUF$W-v+I+M7{nH3U~G^~vv$p!^#YVQsWviv*ncG6~8`K*JZj z+}n%L+i1Zz3vLB6@9U(bNdHC-%2YcvqiqotWW>$g;AYAOy$oy5XV|ZWLtXaI@Z|qF znO1ON&Km$g#rcA4c$)j{J{0(V0IP4N{3tsK%D_I#`hxb}Q8Yb{>)K}uy6jl@O-Jl6 zWL!fP$9^#I?VjEbGHik;n}$K6{>t$-fED(?RhU(OO6WtB@~-@iPk zN$D&}og=1m%qbshbxWmpt#_;sw6z;%`?6s<7&mv!o!HP>le(sut|_i_&mB(DMem)M zKd~^ApdC+W!+V4CgMy`XwPKYP%smPEz$Qf(o**Av9Cy3#bT7~RqBGfjCf0pMcR&^{HAgps^ZnVKQXnZ%++_R?o_QDd0=jNY_dKvRoyMVQ!F$bTCZ5A?+*wUt|Tw8u?wtl zeq5-ZNSG$S(Ba0iO;!-jU2}by<)!i%ZGT*Bc#ogwmyX7YYXo)8W)(*L@^MwimzxR^ z%v8&yzZd!a!{>UU)RHvpiWzo2B}nM$Mu~B;@OGhK-@7`q8d!I%zqx)yupLj948%$X z{;(;>IR75`9!US&y{$)U$X~WM_SX~tU?ckrmH$vB$FQTO-yr`;LG~9bKT`IZ;p10T zS}6Ec1KF>me$_}}xWxvQKQ1Hz{@C!nQeCs#qt^ zjJ~_4@0^D0;x<9uwkaoycE5>R>s|K|j%z!`WIBfXShMG?_K@*U!-Os1%KsHUZ}XhRPcUe}uYHq)y%hV`P|csfK(D(3>fFR!uvT`-(-gL;Guo1hfGHlqg2~MeI$K%9 zMuSjv7=Dp+kbpYWmM68gnAWz$#kJ1azTYbey5tA#3*7hjEk*wF;6|}^>D=<(cyYrV zl>(&^osTXJuTXQ*cyaR_^)K4uJSr4F))vp66vu1lp}Gc5ZXGHfjwCq&!@%scj9>_| z!;XglRlqRdq2KoSb5>`*K(dX7iJNj4HRI~XGMx0Wh_f?n$bQbpu>rdmc2kB8+K-<* zHP}25n)C)ZH^TCP9UAsHcE)g$^-0JR?D%vD>cXNF$AB95g&4d?BG>^@AJl?Xd;(tR zB&JXxQt)uziI)eYFgGB$1#}j0&%iIz3CS$+L}Of{md4|n+Sv{)@|*KFmyGe^>e+n{ zmBpzdW3tE=E3zevoUtNjyl9u8+Lh8&WrU75e(AwN64yr)i0`|qm&BeE>0hY0oaUNS zjgxRpQu9<_&N!7skI0)cUe3rfCge?7e*Mg=Bpm~$ z#ARRxFZX%bHR6ZlA_+FAhJtfoPNAbifDemP=59^Pr@UOcD0eUxji2nCm=zV_i-P@d 
zW@K+np#w8jSyk=<3+t+TKiGEOK;(cshj1C_MzFQ_rKrXqXnK~1=h_$C39UV*>3RMP zkNmquI`U{*8-aNV3V8|+q-h#>m;{lJiLf$D%e|qSq8XVn#VCve_8sR%T;#fs zn{=tuhEZ=sJj#3W{HF`VOc7fRY-bEhjd+U1y8xRm83n$tvJAA%aV!c&(ps^7yjDPu z6^NB^4tfiB2ahZ$Ec_hG>}m8?DWGvEzlaeRAptAyNtt8$jd)qp?9daHCa&obn)W9& zJqcA$s<`yAO1E(Nfyx2k=W0S-vQQLPRV7usVyayq)(h=N;|<4>4MVYpp$DoVz}Zw1 zYW>2M2dbLSl>``vSK_5j$>us@Rk)t^p%yc_2~jP8m#~9@U1W14&Cq%+ew3 zIi9c#;1;!Jfih>-U~WADqgcf6!tdD=II@1XY5QSYQczJj2Rl#Zi8O(N%k555F6=X; zkC0U5j^(*E9i_JHV7bGZ7p5vB$2)OU0g85~7APin$c0t4Gmdg1#|0dFcg9gMYKhaR z`W3w?cwKK8O^!yvD`;q6OM}kQ(7b|1;cICWV zmmhm-w>?Z1Z++Q`$5*}$Ix<=oEf4kcras~_)CVb*`Kz~8g-@n}H;(7@Va~ypJ+Hw= zT}2Tg$De}Aj#71em80nE`fKMEnehe*SMy3p9Y5LP@&p%VnZNd{bfx*VAdej2>ZEeM zTpA0$;3`OML5^CW{th}{s-Kfl)}<-Rx+-et?R@z)u*_~9;>-OmiGKdg+ohJk*(bk2 z`Wy0P62#CmFyF^yo}9DwHDh1H+g~30{O;&)mZ;|6JONx}_s{M9i0s|cmOEcZf*9vl z+G>-^{P@Vd?cp_2f48>;v+4ni60wt%c{_&>-QwfN$MnkCn8 zHDB%Dx2-aKN}~v4Q0?!2b$O0wMb>lSc!dKu+a9>udn9V#a^yWA(Hiccl+V{loJXC! z69NFmTjrrtD&w_R--dZeBJ4t~<@LzI9g-+O$e>q(`CJYEVJR>F7Hhis+COXJ_5S{R zz68N+t(ByQDz=RLP8z0u`^YblxBiaTp$1;(KPFMkzw?|S^J^%(wo?wRasv{jE%MsM zoy_Ih`CYt|FZlr5?(k$+6F#eNdvdD@pGL$-Tz5m#SN^?q9Q<$Q^CgIBkVZ#(XV+<7 zkLZvNq%kbj$iMT_08aJj5RAzI7?bmP3<=sgCJ>xxe2wfHc_-3t--m`lKh)#W!gU6En&f~0<$z)&s~x* zfctu;9dB|HPSnNW^OyMWX5PlTAjqNO)gNR}zkks*-xMKl)y_A8<|F}X;Bqm;xh6Pa zcfbjo-rnwc%`j^Fz;ejCa$v+tQSEDLb#2A=`y{F7-&^#a9%icEe>GnwL3td|&T{`dQhxrO z$Kd6|uuqZjiyE&(0BV7{Wr9?G>q{}cC8mv#%5FUu)Bh=^e=Vjr)6-bx2(Jzb--fH-xIXqg&U@XueE>m_2wUFnjsNmqS&ePGBUg{5z*Ta(kN? 
z)N1@@BWOQ6=w^VflN>&6i_d?Eht5AC52F$fQfLiiOWryQvI*L@7!Ie;952kU5Ba|Y^p1_w?JoI}{Kg{a!u-nRxl zef@*I7ZG-nr1hr;hCD}351j-0p27a1W9N?hK8IzM)=g!vaO7Ps21dGL;bFu}7B~o3 z>I;n_HV8pJZ@?pZY;gFl1PA(9Pf)yjfeX``)925fIe*U6H*hwsWJBK30Glq2fcq%m z#TP|9;JRTWli>jK8CGyE5=~?FRLrOc+}7-jE_0=U@!%^OY{bosA}n4}7SNE0qt9M@ z@Q97O{gH5JMj^h^q-RDa22e+)p)kaVj@p^xVeFY=?Qlya5Ma9?BxAjxqQh1@@-;}PNA7+ruQZOD7>Jd(HWCHEux8lcToiNoQ6 z2m%^ocW^v@2}luqHq1I68=3pTKE5l2>LD`|!NQR-ggpTWyZ_R12(^p;)h(`76=J6X-Z2&xVGK}4|3 z9$`xkVWSJ&#M2zY<^=@MgHzm1bX?5!GEBNqyf5In=8ItWtr$o{Ex1(Wjob{4`N9xB z3b5=H!lyKZk74N&zCDBRzM7^XMmQZBoAkm|WyG(X z<`BaZNQ1Go86NO;E+(zZe1NpQo@?R2bdbebVhsC!)xfa0?9cGQLnb&_ znYpTfcNzpn7lB%Ff(;lTgD($6(gpoD#@Hzi>nxgK1F(WK0!u5#0^tao)}3T;j)uL6 z8Nefqrl&w>VcxP>N@&R!nRGRau~UfBh|@Z)91DlWeG@p^2G_{M7%TGQwy|Q)>tcLO zVS>%L%hTG-!T=_`H`DqcC~hFafs*tL5Z6*-yZwJk=(tyy+bx>BK0RDWA^ zuT-Fcdm%6-Jd6*e_1me#H47T+)@(h5u45(&x6*=(81LZwd}&p7-lWOtblDbV0LV!6 z-A52s;%P&sHdf9MEeyhh&N++cScrV|6rc+$T__DZOd2 z;daB)<(2-pe)sH22sc1Zrj8@7YfkFgV!F1~y0!MW?jR7zp&yRxTaxvi|rRh}~0?rQF6gqpo;vbd=`X*w7) z9b8x4*T+qR0#%l(u6w`YUWL%qyS_VKeKc8pB36Ckey4EmLcIDdfwH9>_3t;_YY>`` zte=fLjwc<1F~{Kjw}cB{i#sj~R8`96yz9T?7hK(IXXCa*Nn2mc*0=uF{n5DXO@XrH zMX|M8ab4H!3Fvc$4Kt{!PO55Ss@mnrgsNqu(w(aAPF5d`RUZ@%T@b!D60g4Wxm;$` z2E9ZablnaIuZjfAQ-rEgB^+hY2*RhQ6nCa!;TPPJilEO*9@jdL_KQf8h# zwLzP|cY3yeqe%BXcJ^4Rv-f{n53g4L`0`5ICzZ{D^)Li#AWc=Ytdg;c-2!ENtgvif9MYpkqw z)wtHX7F_e(?-#nx#><9bl9-&!BXLvfT;XG*Gihvz8C&AUwmJHt*0>?v8ERS?Ss4_} z`x10ls?s$-G}nK-^Kn@NzExAdK&Hwqi@w{wW!-Y`vNB#?mn^A^mDK$nuG7@xxl;#d z16#c_1GjDpoS$n5omnt9#Y>u#CA(uKyJrWUP}=Vf z{@LKd!R4+M@4cSomR0jw*ILuMSI`|#Py?IP2&t&XBpYux{#EnOj;yGDI=I>uclM+l zb?>|GxqjN1v^LM5nmfKQ_PAKTFfsqlREa4C9XdIGa!H<`ttncUq^lm#Rm*KL+W84x zmAxUg^v3d$7(gmx?-ychM48%GHgfnLo9mDv7)wwSS znXYXUC&Jgho+`B_OPgb*&56>MRJH5<%6pYRtx8&27y7$`5VzfY*|Pg&|Rfx*>bq3dMAGPq%_xjT4gP-r|YoEg49Ex67ntQQ_yow!be;69Xe zpNhFp-DeW+;SIRYMc<)?x`XTTwP~TY{}abxke?dJr6B)LK8$y2U6Dhq_rbv?GSnX#5(w-)Ur5udr+w9`5zs>7+ZJ#!Y7=+ zAoRQ?yfq?R3JN13A@q)briFK|3DwsVr8gcMDwiuC7~C6{dSLEa+_HC}FJ*VWUwE%@ 
zWo(UFE{xlIo)QIR1LP95QPZ%}zDliL_**(&)3c<84z_d)-9y5ei^8Qbfemkhq9+f_ z;MFHPEqg`~EoWqcsqtSeZm=@%PdsQp|D?V#-pB}N#}bW9qMo_eCit%|^<`)2$~iz2 znvP-ZWj!9V9^W+JhCg=_hKj}J+s(_`xV|Y#Hwkpp=CF(~nDBJ}L2kMyoIRh9%ZP%K z_vm?g;kr<@Td=hY?E^yDiG=#(L#;_#!REFEy*ovllk~0_y(>Z2|BKd`qD@KK7Nc#S z!1X$+0I#<145hRsvxCo{hUG-%5pvT;=<1)t8ZLA`-z+66-8e1nC7(YoX??z_fhqa? zu>~MJIiHKOQ%s)X>HK2zZIY-QAfG=mR7iL5WQC606}$s$ku`RG_qum|Sg;KwN=~Fo z$`;J#qq$ zsqY>qJgww?x&D%Yx?gs<5k7uHke5u#-;fRr*N~Si%HKG<0RCT9?N@BXKkCU#HtHV@ zy*2RhPkYHr)zm+Ab^!d(jpQXK_0LTT4DW7yTTKX+rDiHt-rEHq@e&zSj_doJ@R97KaJt_OX{v#ITd7PnDlyz<0;Gop|e^*2y1fl|bi_j)t5^m919{^4}KXoZlK0kn>v;g<)H7v8P%7yJk5w?5aUW z!HPk1IKjt`jOmY06KNV;4FTV%J99l2JC(C{g){k?8^ZF@NJqvfFMwu|P%Oj%tQ=$m||Bh&QMA#k?mH$RK9}(t9IBk1G)IK8WAl;L=P?h VvRv7_hxp|lRqsLhFAtEA{y$c74C(*? literal 0 HcmV?d00001 diff --git a/src/__pycache__/db_logger.cpython-312.pyc b/src/__pycache__/db_logger.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1147b6116c8e3bd6bae220daac27636253cc4e59 GIT binary patch literal 8693 zcmd5>TWlQ1m93uco(I{(;X|VM5XrVFS(=z4?O3*BIV=uXBRk!QWRdw&FQ@8%$_g4`J-}qBQ?0*^w`7g}mgJ>u2Y(U~V zQHjdM$&~fvrZ|>&O}TuQgr5>r*OcfY&kH4d3Vxdn~R-1@iK=?LwPVrTi_vJ-c{0*UXyWNHK_aLT1}Q~U~< z;Z)b_UYI~01}dro#BL@PLF!o{uW(b|2@-Uhe&{u3Xm-b`A!uzdmXPDg+1XfPw%e9> z$trx+aB}tpyfo;opU!G90$GbCG}&t4#6ZN*64Q~yWm%72qFQX)=#J(;62BE!3IFd* zLG%OeIx*mBg;m!gc>{Fgrk&@=BDY8sr?yHUXRk5=-GH2sud8Coyk3zs3A#qlz_3I; zbw#75S5ab#n4u`9N1aw?VsXtB6V+;$ zpQ1$L5nWdl9X~*pm+w8&b16Bm^~^`qS&h#3L}}zIELo3Eqdm|PPMk)&ucWC5%4Zr& zrnfu)IjF|^qY&L)Cbzx*RnMlkY0KM^JJh}+T$6GKo8eKFJJiZ@9YV_7NTZO`rX-I2b@(f^tE<LjU++C_E^KLw&=8 z^3WN1bSx|nUL2YTPoVwk^6@=2umK}BuL1VVUy%o(c6exHP#zdO6FNT}miy0-j}MN9 z6%hyF zlDsL&JH-8P?JjZq>vn30zi#d9>e+3nZBuI7k~(so$5sYDlH|L$1C47-t4p5}7xW?> zSQ}X#$sQj0Ao9UPw&mQGG@fg2TN%KrpY4cHk@3Q(hSY0d}OUI|>FMg@3TbO2(;wFJig zz`){KbltQekU>j@ubI0qv>-Py9Ja9!m^{41e=eBtrCKM3xbgjpC6o%kN0-Ty}`^^ zijBODH;%pG;M{~I<68zlx1*&28NocFWo^=7R0ju3eht8bD&{6 z(D}zeCmIo$!L>1vtYyBN$w~D)g6Ny#a?Ksv&B4v);Ej{Ncy|5SUoL$3=;>@z@5)e# 
z*<~LZ&xT*xlD>gb$L2ngItsSNL1T`wg|p``WWz6JTP|)(lbh1yXFEK;w=C^us3Fu# z-s^4XJIQaQ_8fwokZ-{1ZWN8eY0AC+>eG`H?b zJq-PT0s#NK_0GQgRQKpQZ>_c%jVE=DA}O%Zk>Yl4;o~w}E_xgUnTFEUV%3nL-syx0 z$j+au18b+gd&-&R!<*9KEvcPxpjH1H0g6Vs?nB~+x&_3oN>S#MYATLtNl{)-MdJ1w zkD{o_sG=~YWvS*7sLeja^xMQO3j3;}taH})B0u_P>vM62C`N1`G9>Q4PW~iJ>R0694nKxQ z9fV(oOc@Us%XOuoN4mF^g{P6DyB7IHLFIme1j*63(sC^4^v?*JLoI==^P@q*6fPxo z!xU1WVW#&AuIjwzuFp`l?UJ~kWy^#%$s6PvYQ z2)Fm3o1qPT64LZ3jGo2_)zQ)h{1Aj2+16mj*bf#jgG1OuP*gt)Q7K{n&pBV5!yTzC?8Ez=`MMA9cT*feq#jnSgw>*I{e2ugrsAe9ht7w` zl%Y{*V+3fq{KELqNN9Xg{@UPVWybygliEL4%sMJi`!d`DX1`8?ep6(w6w_xj`y{0e zFw685MyL-I7rLBup!$}P4%z~;VkXbFicFcE+Z_HJmVFl@1eC>V6{DHOCR$to=9N7#UDW&woADsK3Kil%cmUON%rSJ3g zNn}H6>#yfGf*t*S{+3^abY)8aGlnV!w+n3oE3RNc`eBeedJl792S)o5Bg;a;R`;ow zr~MUdA8O!|m4!XD|3R_lL1_Pd(T34}$8%5#ODg#c0AKJZfE1U7IaIl_0Qm9+jsmnD zs*h$Ke{hC>RPU0H>cPy@#n!ez$KPXg!5aDu^l9;YW?zi#cc4th2+z?)^!}ePdm5us zj!z!~St*mJM?jRRE7@3)$E#TKU5J*+e+Sy%HMWmE`^RI?{&wK^XE)B~;OzU->Py#@ zQa(TVz#nt;-NGqCCDU)BsVI>wqXf;uk0P5M`_q#renwBoCOs zQv-3a|5tA`oN|?brF(x4?U#Y*nTJdiC3i$X;z z!{5YtfdI5bSQ61oUFk*wqz;5wTiz}j?IKUV40G05Rur=NR_Q~p33EsHe-B<5{{HXN zF8l+pi$gLiuB^Pb4+g)U&(R^zqGyjM#m)pIQg8U@dSC#Dm=^?!f7R|)>vH%Y$C zDaZHIk2eY@CX9%IzTeB>paI_xyK2$3D-jC!EY@Yw3EOMjo%$Ngi3RBW`HLspe_V8Z zBp0`M`PQ%DhALr~n$_^y!jP|ORMisQqmj2k&r7K6eP}GZ08zlBY*g|%{~b(LPo`*8 zQ&P0xWB&4!PgZa;t0|a_STdpL;7&@Yx_oAMEEL{Ljg2H+wJ8fR9bCnXSnpRvk1X83 z03Jz|g=9Phu8HZiq3Lqp&~w1l_co}ZXIV!UzxUl2q3I>7P;i=OzF*7?N|vB7eQ;}6 zHbY%<$z0^KY~1}=nc~YSjiycaLL{EjbjuZrTp7=g7%X>aJvvKSV~UYf*x`td zL8&zS>my*Wz@n;KZhLFj{Hy+KV|2@_=DaoFa9bYOaRtTZ+X0*SJhB;p-xD{QAzBe~ zbqBAvzSa8uwiP!xFnocvn$?;eQr#r&l4_rSr`_f6-r$?@#Li7jvVlPV(BzB#ac;OORoqgw}#fma1vy9PZnx9l6kJ0dxFeC6cbT@Qht zdRKe5tL4pV`BTD)|H9=5-*h4S;>GNR$!z1)w)g9s-mfE(gZH0(;(oyP@9D^j-tBvO zKq9}B+6Jrn-vyfn-TeEUI9Nm8cZ(o-znaH%%~NLt9vtM)K?Td{jY?<+*a~$QPOnTM zkz$+GL-SXX)L@rSOj%8jeb3Iki;QfsrDdvZQ8T=Py663>P9^0+z;hNrj?)unUW4i>ByVKk=Ab9$E?_0gQ1d?|axW~BW zyE_q{cpKpO33YRgt;=IMp>Fy4?~HtFBwKI%hmeAcBzSXAM-%^^JLDqoxtc@m{CjO2 G#D4>K%M$be literal 0 
HcmV?d00001 diff --git a/src/__pycache__/duplicate_detector.cpython-312.pyc b/src/__pycache__/duplicate_detector.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e78f8f612a10a80098aec46ad19ddc4f33996e43 GIT binary patch literal 2626 zcmb_eT})g>6rTIDKfv;1p+%s`v|{N-SQ^nr8*R0rU`M#G4xbBKhN z3fnvtGFROv>nRf01~Sw<91M9XHYU!p8a!Ae3`(894inpUo$yc7>_h@VH|k{7vb zWHY*xao-#?DDH8BRZrhpW6U&co}rUCpJRk#%SdH7PSf$6MW~g)LVYkdWg3$@?>omP zsb%G?G&UTL3DqaLaowPdCsNgD5Zomg+uiV8grJiWK}i{|ppY<;=cJ@clsS|Pi3=yg zGog6Id(3x|(m5TZoTj+V+|#ufM&OfUhzDP%EU%lKay&|>IVpujan|6Rnhu6bz6zI2 z_y<$vFhnpyR^V?t3DY9tFb88cO5_64O8G-*Mk312H4l|FmrDZd3QPVdNU$o`7tY4j z+!k#3{zdTTErT1nX*eQDkT5IA<<4s+YAf2Wu(WfkvJ&T^PKK#7gU@I@!>E(7O@fab z7LT98gOuSZX1mnlIo+iAaw?Ojrb(^5@HM`=!Z3m2t0>4(p5jpMg$3%U}v{*0Za{aKx)Da-b!n4UK&>vveH->R_NleXnQB5--&zk18 zKLVmed9Pl0cp9cz^hXnlHP4?ZcHsHJg$u>lfd{P}^XfwE{-L(p6ok5B_w)1W578sr zppJIIC=5UNFnR#OA}XK>`0gE!G$ZjH5LqsP>jnUAq5_ucsX_GiFhVyJ0HQnr&13H< zROn3JiZNxq>SCf=Yt$E-cP8MAHk6dq+c+KcnhvH;9%pI9N7 zUaW&rW{vA60DK5gPOGRM+7P8n2MK%q^ZLY!~71b=$(Ol6Yp^ zm7hwC#v@GB*XIeV0m+LJh(Whf)Te+KG3|8Pu+pAtSmQP@N|eu2MN+(|Me~|qlUGlt z;S{I^%i1cn9MP==A2Ac<_F{&kRY>DL@hClZ?svio$AxJYZMLAMmRp673dQEfe`{`E zdiM59^U(*LN4~iF$<-~S)*PO{SZwcDn!eX{V!i9cSDkme&V19o-Ziw|K740jy=`Rv zwPIu2(&)X;lk1%)?>3&i-_*Jo`#84Nb>U8O?X9b87p|>cduOdv`@ZSjpISRMYrx`W zEowT*8o>Q7#t9zzj1x@zzZkbK%j7ao0sm55&Mkw(3igLqSJFWPFTF89z69A*342;-;3NVV%*sZ#o zb+;OCHZDb>Z~9e)|2j4_I0J7xG-ylDQcd$}H7#qCoGIj(ro9IbS7|4rX~a%x8WWA0 zwTbC4OkX13n2%;A!0k_hI*z!^9Qs|BL(%PUL+FJq+!8vt-CGwr8XOVY@i1$Kj~4-o z2g& z$&!9iyZ)?wPCB}UVESvb21T2?429&R*0p%a*d5uE= literal 0 HcmV?d00001 diff --git a/src/__pycache__/hash_manager.cpython-312.pyc b/src/__pycache__/hash_manager.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54fe2f21d9295d6e26d4a2d2583e206c7c307b37 GIT binary patch literal 5105 zcma)AO>7&-6`tkplA^V;{>WIe6J#94w%96E497^~rZy;riiOaWOwn$nLkWsI5?3ad z%I?yN2+9b89IPO%>PzJ`P18dTPJJkfUeY3MBKsns7r9CxWt##4dTLP=s%^l5ed&9< 
z+~rE5osOjW*_k(g@4fGRqrbGY>>-f;yxFN7lL+|-Hk>5XDtBIk$}*9O%oWIlU2+o~ zw0#9%(Kq4ah>tLC#_tH$OD0_B?AC2Sz8Z4&cojw3C^%gs$7+1tPad%Y&mBm zL1KtbSEGBZk{PGlQ#yC+oe4$;Jws~M>$Q?2^IQ)x9Lys~cM61bvxl zbaW~!(@I&Z>FODGN;Eqq>WU{TJ#N<&%NIa&k|{+BmV+z928)V8@B2 z4DA1Yq(5IOvi@RLhQY=D92y(b`gJYWp9izd6m6T1&d$RU_8OT=$a8g3=PRmMS0H-%7} zpFI~7$%@!H!1F6zhhr!Cm6IaWbFR(97(@)qL;eGmW$=Q=R&|_k59ygf=^aVtuGiM! z0HN^5!5grCUk^D-bZ!nXYaC|9FSbo?^NI#Y243;h0%xF_tTfZ?)aoVJJ_ok%tFJ!} zJ6PZ@ar3YXEVucqgEImf;$!TvxIg~GJ1)7e(mZqbr;WM$+ zIDIuf-uKYax>Cd$Z3$c^c`lK7_;3Vu-N+Vaw+l3!KCQ=mlB}W_vM1OpExmHmtWprXz4us&(K8guWfVRCD6}IJD+V z!k%D?+iVXncVFrLFA21SH`_wX!7IUywu86Y4sH>DPv|c3w}6p5s?Kx?=tOO1}v8oYx=R`167te;URRotx(#=68*}%ZVfle3hSZV;1bOmLM{4 zQ(WESU=;3SM{;Kos)E*0g&5uqBMelTm8z}}UFZdcaPxu{x7=to<3=Y(f)Mb~LkpsT z>b24g>UBqwMf6bFUlq5)(d0)4 zHS|#@paCjW%5)ZY!(e(`wjRRJp;b1Qaon_5QH@#@R9P8lPi-4Aup4{xx@X}OGs>E} zBcM)}N`(_Nt(5_o=9mimhaKQqC)tIpsv35rhVwgfmsL2La}5AoeN->0x?eTRpnkv(}y5kVe*|k_`@(37!?vaCamZ(S%)y#w`e!05^esL+L@ZK>1%ViHm5l3r2Yt z|E{^SN++0N081&(vV#2(LX-o5n`V;?D(GYmm^l>P10}nO`3?*N`96d~@LvlUv);|v zfM~mwojVxue*u~!n5?|z9@hDW<#2K=o=Ve^6ivlP6R|*}cY{Lyk0WV}%4QPraWFVbw` zorbPv`pZU6EzOxhMK4X2v|`q<@K3{lYjGh{l-Xo?+Jk@=ITCGhgp3ECHTE@_LH8PL z*h2z<$G3@-*<7B*$5bW{a97#q1wBv4DUi|s2pM1=P$3)hq>isck1vm08Cee<+6eXE z3iYptj$VcbN>lKBon#3u$9Iq{z9$3-w{QP1k6oR6|D|i)YvE&e1AP1OdpwN2yN9%N z;)~_#$!k5=!fVkp0CHbSBlnu2bBjRs;I8BW#`ASx9A52N4X;kFolUKcj;(d4H>7j7 zq;nQ2I}f;sius*15F&qy3^eg8hYv#aMw2+u9=OpGzpeyF!3$ravP}HeXgtIJ5M}Vvm?!!>6w0rHv z5=z5oi~qh!zG-e))UBN7PDW2!?54io`nv)ILiXziNJAxBYWZZ@8pSZg8MJ; f+X(Kz72JRIh4tXmZ~E^EXF2XPcbiy+HQ)aLOtyae literal 0 HcmV?d00001 diff --git a/src/__pycache__/html_cleaner.cpython-312.pyc b/src/__pycache__/html_cleaner.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f257829050c0e1a5913081799db090f232466e27 GIT binary patch literal 1820 zcmaJBOH3O_bk_Uv7fcNp5>T~5l;DU@s-z;I(kkF6O;bP(98yb>wRp$&y1Q#@b_}Ij zM;v+}Dpg7^B&Yb8iX4#A1BV 
z`?;wp04Pj6Yt=eM0RCc(3sQqPxIl?@paPZCL4skA&KaJB$KVq@2UH#kssKe*ghE7qWHFBXV*GH)2r#3Yo(hMr92P0Ton5@{_%Bm{90 znJG^rIzvAC@$@^0Q|UUexg5vDCqd;ju+cbms@`*?FL`XwsWyD(&_{{?4;$f_jbH!lAA?IwD~x=-48%Xvc(*|-`ByGHX4;sy9l-O+TNV^bt?ci8X^v?wh3VM zyZ~$geWQAF{(sxrg6p&7J3XrEd+c{@Xk4Af_1pfO(+Ma2zzoQ{0Gk(;1p$;aGrqx3d2a;|`(@ z6M$FHNsEspvZKt1y)}3V90>A(z)oB2l+&CkUouKoZa_ z&9ty$rXXUkPdo?`-YU-1A4Vi_{o3@*&8w4FX0MVEf(2bk!Q`Tbv#}szBZt@?Bh9IL zd2HX%HG_go!@8RY7TZcbT4&M;$5V>5(1hkS6ROEZ$;1nF=&Y@P2^W*~N-gGvJ#GVD z#fGza0}dOCnt{j|P9bGchiKSBsp0zj*OM43ERledP?Cw!IqHU0W`$`}E`wSWM7!4$ zYl(96rl-<=aj$)3r+sA0`+cO+9$%iWM&!Ln?@pw*e0y`E5*b;(QH}QQMf-Q6{oApz zt%*wX{pFcIgB{iI8_$EC_q)rbO7NZKcufNB@;>m0-7CIoYsdQd+IYFM(t3U$aQ^O< z$!aKkZ|d&US2Hho%GY>m_aXp%9o4QkA6|ZNxja+py0q=<{G+35y|h+(6#u$Z>3Dk| zi2lKqYt?Y{-krO5_QE|o;hycj(Vg(<)5MS7-H&hWj(<{Z>3n{s>tXakbX&f#+4Eg$ z>)aFlm!4;XKMn4lxwZ0PHPpHnl6OLK`RuQufm)DyeAx^_;T8Pl=(f;yK<@>_yyz@N zgrB!AIo;zl0_tWmLDf>&xx#M&qPrHLKKgs*d)LUTMIF9}1}Mw6to0WC)p(BMegppB aL0?Vi<=XGh)&ND1dL6WR@g?INH~#?e0J%H> literal 0 HcmV?d00001 diff --git a/src/__pycache__/logger_setup.cpython-312.pyc b/src/__pycache__/logger_setup.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a00f20dc7663a4e1e69410fe8e7fd0c417f7b093 GIT binary patch literal 3320 zcmbtW-ESMm5x?W-kvtwriKIo@v?wcMRE|C(y9EjXF>&IZsFRPR zymzug;fShefY3#NNI+YvK!EU3fm~GoM_+s;1F{zv4NyPSZ!YAA)K8r~-SQ~FdT|%H zo!Ob$+28KW?%ZF);UEQ#ZS~7l;QoaNq`7S7;1V!(N~A#?OzqjoU#oXH`YNr{D5le|wR7*; z-qp7>ysp(&J%n?pM$U;^^*K_)5ic}V@$~p%h~A-lHd}S+uGNquCmhkELC~{A>d@7Q z!xN6op(&UXBT|RAdeHRLq#vKth$do%|bUSRgdV?Jr5|fd5o>ECqiD` z``YOEMTSJV>O00JAO^cO{$p%HBHOiLbiWwhfb2T#s{!o<*`w~$13L4V+p{yF8q|ZU zDWY|#-pv~8#7{aJ!s$VBR&Yxq);WC8bGoO9okO+LgkQ}N&Y{FxHKd2$5K)Nu07Ov> z(Vb%=6YYr%tFy0-OpJJ|hV?MYY9j=z9)8@{)p0uahG;qB1JL$cX!FOQ?RU`rE~=d+ zIBL;lT8w|jh>7qwT6mJtOtGgB@U;4wdc zr=n_7F|#D7ppr^rMp$3Yh!X0~fH%KfQbj?N3tw5>ypW=999B#Zi7EN zE~t4;E=s(bIiEQ@u1F6gr6hF6MM+hK0%%RoGSm>00Y5p+ZG`ejcnI|*i#>;m!lyY^ z-jH&|rRTW1nSpzfw3br@ReKJ%o02Iw6DU0l7+$Yvl4^#Qg#5krwHxL2l4knzfVT|( 
zed>J$N|Wgqm#k%Y8sW#!^cE|iIZ~YPv+!g14o0E-1N{+2L&4OkPvDuyMdp#~0rf|^ zPSeyEK@U}-{_K88^Yq*|6z?`&*$XCp&kR~lbFlWZioInro3HrEm2y#w0A#S?8JsJoXL|1DycxJzMnyr> zB!uuunk?wMA~Pt6qU{ELJ~Si=MKW}&6&0eciRE<-;bSn}f}$XtGp0u_EtmP2>BA9{ z3uX{NThRjEhc^HS1IuJ#GeYt@r{lC z7^-ieKDa%&^Fd>(85?cKCXCp`?&baK&DiD5TmSJ$`utyL+pZzWkJld>MHZ z`O4oO7&ivSTMXY}xR*YPOSHqMjqvGaIJ0@P;}5s}DZ`&?`A>oxPiQk~gGslTp^hi` z`TTF@fB#{9W_zaP8HTyBfp&D*hz>tFGuw>LZBBom%+yQUrN-iZUo-iW&G{ETis80~ z41cB*KV6^Oo@-p%oodFjwLqu;RQ>Aq)yClNC(Zu1YfL9OR4;B98?*aU&E%z8s1r}u zXSZh?Kia+Dj9&!%_)z05BR;wtH{$1OfoCTsI&6HWvM;}My9R#X@gDglv?b0>@c!5Y50dbiJ|)4?YoT+cW*Zn7iz(0A+E_SJQ@GE z#Vxc#3r|mtgGIwwTJ@ HuqOTwH{#t( literal 0 HcmV?d00001 diff --git a/src/__pycache__/state_manager.cpython-312.pyc b/src/__pycache__/state_manager.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a055c364dd2f93782b6ad9aa86c5aca8134f89fa GIT binary patch literal 3024 zcmb_e&2JM&6rcUF9mmcWgb<)%LkPj7#6k#Bp%y8Ss*RMUg+L`NRo2!!hIMVPnO$QN zIg|s3q@oH@P-CP@8B)U))sa&`jjD&<#sqf8t{gFmJbyj}!{pQV^ z-|s#D5)KCm6y^RF<8L1!e__Wfai7uo`%&mo(0DUnnW~7)T;Fy$_GfGU! z_+q|{KjzN_VgZ4OS z^;5YQ)+J@#2crSqPg`h^`fB4M&hiza9)K0CddoEu3rz@*QBb`>igiC6ds6JBD1+T^sGZE2|Et*d9j)hlH)wc(t7t9Qe zW;7jqm5Eke617+&>Ykr*-8hn)b^`IZ0hfu#XS$m%;SINdN~;ybRZ?yvI|t|FZ`&)- zC600t!Q%KY8J1yFJ%PUS3gUwM@>ggq6I<{mibAT<=hflW>U_gG(Hql+@5NckR%#kZ z%0rzu1hp=D&^|(DWgD&LCB=#5_1wBB6=hxeQC_WD48&3J^E3!#S{WdRh$R$6ft(`( zeACKh=^QBlX=Z+|J;D^ciTnPEJYzI#H$##8hl8)`BGsZcKDMUfQ~xJU_+A$xP29xo z2!%Ql5LH$E?s_}~$l#rM_VfhUxN%oRWF6oQ)(OH93NJ9qKp^lLSsm^%8Sy!C8fecK zIkK6fS&lGAPFZHwk@b8g7g1OURn+w;99@o(wD7giFqMVzfFPP2=d&|CO)#rZAnUBN zAg+?vU1Zyi(%|jkTf^(!`=0jfEPb>z`%;lQgLBI30BP&Gv3EJT7+pEM9v&(aq2<7x zLr+5;3*+a$QucU-{9O!6mL0%*Lp{HNcVOqqXT}&$EclW*+dPQaB>Z` zpTj7i7KYri8D>*8M0%?TDQ=2Q56PA=E!2^@`PwEN3pNsumlV2sLn#WL=2?ii_|*|? 
zw|k3MtBqOdYly}eUB}25y9ltPD5dcEHxzq|H&B#4{a!7Q>wPk@yK5^%okw(UepUf) zDM@jPY~;`N@hX3mncspO86YN1O>Ye0NoHmmLLZNZMp89wm0kugS`Caj&0lXra2-`` z%vfr|%-R}6H33mIF}6EC*L2Pv)&q`X+d%-k;O-0qkR#>tkZZZ+js)b5_*`D?#Kk*MAj2FP zf^3HK)k2)2g*AGivQ6n#pXi4@)=dyVuMX1DwNRKZEL@qtQqpgmx6Fs_BagTAJP&tn zU{*gcYwPyq@x}3ywlp#KA=0Yf0ge*{QtCMYCY|w0&v8JvN=&b$U|xXTEm}%y<;>@?P@+~L`x%pCn0F6 zpr7G$ustX+L18q?n3pIdb@YUVL8%%#LpoPmecj%s;Mq9!9_||kag{u84KEy>Ke}*y z{&;Ec?dYxO!`1^&Lv1`h>dWS$`6#rnERoPK!yqvk3FDuDtd^nixD$-WGp3$TV>=X& zU(9Ri>Wn`g*UbbBOOO^=8|>upWtce_#1q2&C@?Cr{V33w9|%8#X90#unG3|PH38xp z`CT3>%YJ2W<#2^SRqhHa2Ufym0@a$dcJgkZg3WSYx6)Voc;)P!57q|O5_k9CA3pJ5 z@Z^JCV-*53l@Uo9au32YB0yXai00?=^TUv*I;fY+TmKwO*@2LbO|^2ZQy;^ q6UW(M7{e#A`aqN=K@gsh)+eOp53>7Da&Vm-d?OzbhF%d6eE&ay#-M-z literal 0 HcmV?d00001 diff --git a/src/__pycache__/stats_manager.cpython-312.pyc b/src/__pycache__/stats_manager.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eae8f4f6a89f516bd95dd94eb88fadeb8a635b38 GIT binary patch literal 1239 zcmbVL%}*0S6o32Cvb5BKHBbQw8V{}ljYK_JIe75a3!ZwJY&wgS?6x>F#Vm5*&;x!F zPnB>$PloWP=m8QA&C0>VlQ*OX6HdO_?Ji;(4^GnGzTf-mdvD%+&SX-6haTptUlf4v zl0--E3Y|$5T0ns+4wONKM?lpkP;E~QJ<5uwnEDS%N|jI*V%H9x8^~K=Ls_9vR;kj2 zvLi%1KW*6MpI;vz|Hg!VLui4BZA-DsnGoe%0}}gD zKo(Vn{UDS1f)lW)=g^ioXG=!R15+(Ana1q%by0O~KgET`1PNrruF+7+h`O*GvQp($ zVN*XHQf{pv843xtghf{BP6Lx9ZN}=1?<&5|ZD)bYvL{b(78dKSU2rY>#Aa@xB1dDh z0%w&1{;A-^wQ3fcW|lqQAf#HW3PM(MXCs`6l(IQ~7hxTaMkjX6Epz+UhWM!tRZn*2_*hetN_H|Y)rLhtLihnD}j4i9#flBqH|S!PJcAcTFh zEQ)?|0sG;Lt3!XZ6RHNJr}V*Q^5Z-GTm9QT=EQkK47WUqUayW{=r(nFj3lWwD(bX%*iUKiv5=_i^qZH@iPDdz8&>dM~_J&)+}ZAzS1iJGF021+yyQ664;~ zEY~K)PZ8qQX~U8703lBsmJ?dope*kaf^RId`m)W0$A*#aLXal7FDB;@fP7Yb5}^qv zYBG^O8B`M2kCTwR@Ke9a2PmTLwRS(!W8sx*^7ZHv%MNz&& O`jTQ5C^dTOyqIt}aJmj$teejSy4BLQ7kO3Lj1{koXzPXKo4c;F19c{|4 z5y031y!-C^-TnC8@w?ygukm;kLHYjQJI%jE5c((Y_(j|{UZ%i!fDB{^7Mk8_g=xX- zi_;>sq7^6vrUL@U0+v*er)AHUtY9HD9Rgea1)2^UL8v45gk&)F_j5&V4qo{|sk zaX{A)+H3(gkcU0lTMW?%JlTHBD{*NbT;2x{?t_Q+!NdFDk$v#!4qlaL%wNAN`a9!F 
z74TMNSI#Rwy3;z~H)8D(-`hca%TRo#DU^(tAA&!sLAio$OiWCuhD{sWnqy{im|%NA z%Yj!XnVji>$JTCUbwiuc9q?k?(DZUvGceIUm`n|NM3*3WXoYHJd-nMk7By$FHU@cS zI`Pemr#dI7W8 zyh(Fur<~223&|i0KEKYk|k!;zrJktpR#J5e;HjfYGiUmAW z(2ZG43PTy9&s&%bIV3aW&?3=i@nGp5i_c=3Zc~xNFh2~B^IxcHXd{X`y6)r=jjjje}KqU(Y3TV;9b(Hl+mL`xlPsDqE4XYV!D zw_EBv4fUOs&Xr74z1mVI8|q|z>cjevZ#C81E!AkKMjhws`9f2*Tk2dxovY7R>h8~8 zB9wSn{1?>Y;x}Hsj?h(cLWJgqI4N>`%ENDY_$?0H7Sp`ZMZ?3fSR3C6LTvXx{6ty4 z*HBN@#@B;L=~^27)#%#6!;eCZgC{?Zc&=7VYs9oiJ@v<7GC+v)q38}e8G5PjN~9lnaG{TSo_KjRyS_WN%> zj>w5>bXU}``^ZjfL|ya{88J5s(Mf)!0{RFUN;Ue^D35Z>niuBLLf|7bFC;rEcRXpD zDtYPy8pj0(FHoX$aibtPT%v646sXvM_P$B6HK0vnlXA%_@y@uN(=D3Rbj#7MPTjbE z=Eh7O-=$kBm6QnIU?x+OT9`%no$64%0J4)Q+CWU-Pl_xA(gLQ#!g#@SXu5d!^4C1? z0aCE-lnGAjPR2A@kgJ+aD{`4;&%DhqWo&1PhH1}M)GOF&FM`KXJ`N0*aSjwmo55y5 z^Veu=b4A+=Rm(pAf@l?SSi4^7^`sb<4gq*#KLaWb%N*8{D>H0(NnpMNFrdEs}yx zJ)znXO)iZP7LyBi4U@2-N8&JvYcG?Ha-rm~=sbbVOH)0}2A8PGXJSH%BpLIzkZ2GCImQEt7R$6$rfGN=Im#Up{I2LM-lp*+Ikr7& zOY5_qt}8sE^!Cm(0XuMEB2TNgD#A04NsbN|&;>A$z~dZ;{}|9=1tog9zJAitQcr=5 zHPzFgWMQOst|hwHx?W%FJoHUOR)V#$4Hb1A0}(!WqBa52*V%Kw(mZgic5O}RSw7QH zPS!4g+R4#hrhcAkN!<;pdzn5q>QeWr^cI)F#L~#($Ztm1B{?!7eARd4v*F(jKQcb= zTKUOp-}uL=H6_tf`Ws6B^6h6=>Pr8rGV+zuxs+N=)%)Ije5o~Xp)qjb^SjM~tMxap z)q7K`%EWpHu+~x=GK#26%A!*5Is2@ub?##0+{Kmi&2u;EXK&WKezY2%{6|c!rT)DU zM{4i4NZ_vr=846LdOyEBGZ;G{qe1L54?94 zJwF>6I}&*QgTAptfj=FR!A?e5D5pC)%bX#DFp!+%m6uqM{?zmHoM#GLwlHYWyKcRE zdn?oU_xJ-M{N#I|X)`=hC>mu8kCKbv<3F1Y&xZAYAP8TgjxSLR>KEwn3-sCxbo5)P OOE~><@T8FNDEFS=T z?#{5%gL4SvkU%0HvcZF!Ac$-ZIS48qM7#)I5+@L-i-IRl24^qkivFI z_1>$W3x#n6^yk+l?^6w--=#7nWgzrUfN%>DM6iz-u3@Is6sFcxrqwizTtt++xTX{J z3aS}9nx+3C6D>{T6Ye)`9#N6cwt6!X<6ujIzI6;_8aN@L))+WqR*%AjJG3t|O1y-L zNojX%*04P{07%ec%9)tR>|qeoAQriZMv)f=LgjQz=&s{?L>OC}a(D3H9da{Zsjzx8 zV7-OnkuOF21G$Z38Ie(nx1lP}(kN0%mF?WKUWE1srqk$#dP8$m4>`((ml65^D%w`t zTB>f3JwGeXIwJj-afGkGl&Y!riE@2bLv3AV$Aw{TsuO%q*&p<&ZN$Zq2X7x5%Kz@E zLDYNNFn;ukT$(!%H^j&R^tReUmY!UNtZ@^l8ggd)UgS|8o74-a`Tp9<>!ueF%FJbM zyA0}2O<-G;2eFw1#B_iBJa9din(N$lE_s1_a@FQGqtLeu(@r9jP;Q<}DBn4GCS;8; 
z_&-XEazcaoEPCC-LMLHFn*v#i(3>(#t7Mt}UKD&*cbkT?XdsoHl&hp;lg~FpI*-=8}lX?2I~SHUQyE0$&zWj!*4? zlA*OQ)cg0D3i^fm5F!97%EJM&ev+OZuUrfntuULoFuE0o+by5+O2nNC#MYD8^P~C3 zjwp;kBnE#4Fyw8cE5JJFU>r?N-*m2B=q!VH_{H7j&sJ`)bW2ue<%xk_JN0;O;o;mu zckYeH<&E3%ZgX$;o96eeZ(HB(JX)>)@^<~Ey?f|m@}RuYsXi=k9899}vEAVQ=|_hb zI%f|IG;Qt~Uli{Y@3Wsa_ljNfT(`W|ss1)`^nrHt@7@tKZR2PHF7nl}CByikqyw*7 zIryL4_r1;etaSfrg{*MD)<>iy-|{8u0Y`im*wg%*EF_6f-{QwXm47dq0ro`282^fv Wx@hSy1C>6)onlX$!7uPv!TAr|%s({% literal 0 HcmV?d00001 diff --git a/src/crawler_core.py b/src/crawler_core.py new file mode 100644 index 0000000..0737739 --- /dev/null +++ b/src/crawler_core.py @@ -0,0 +1,277 @@ +import trafilatura, asyncio +from asyncio import PriorityQueue +import aiohttp +import logging +from urllib.parse import urljoin, urlparse +from urllib import robotparser +from bs4 import BeautifulSoup + +from .url_utils import normalize_url +from .storage import save +from .html_cleaner import clean_html +from .duplicate_detector import DuplicateDetector +from .hash_manager import HashManager +from .state_manager import StateManager +from .stats_manager import CrawlStats + +try: + from simhash import Simhash +except ImportError: + Simhash = None + + +class RobotsTxtChecker: + def __init__(self, user_agent): + self.user_agent = user_agent + self.parsers = {} # Cache: netloc -> robotparser.RobotFileParser + + async def _get_parser(self, session, url): + parsed_url = urlparse(url) + netloc = parsed_url.netloc + + if netloc not in self.parsers: + robots_url = urljoin(url, "/robots.txt") + rp = robotparser.RobotFileParser() + rp.set_url(robots_url) + + logging.info(f"Lade robots.txt von: {robots_url}") + + try: + # Asynchroner Abruf der robots.txt mit aiohttp + async with session.get(robots_url, timeout=aiohttp.ClientTimeout(total=10)) as response: + if response.status == 200: + text = await response.text() + rp.parse(text.splitlines()) + elif response.status == 404: + # Keine robots.txt gefunden, alles erlaubt + pass + else: + 
logging.warning(f"Fehler beim Abruf von {robots_url}: Status {response.status}") + except aiohttp.ClientError as e: + logging.error(f"Fehler beim Abruf von {robots_url}: {e}") + # Bei Fehler oder Timeout wird der Parser ohne Regeln gecacht (implizit alles erlaubt) + + self.parsers[netloc] = rp + + return self.parsers[netloc] + + async def is_allowed(self, session, url): + rp = await self._get_parser(session, url) + return rp.can_fetch(self.user_agent, url) + +# Instanziierung des Checkers +robots_checker = None + + +def extract_text(html): + text = trafilatura.extract(html) + + # fallback bei Gesetzen / technischen Dokus + if not text or len(text) < 400: + soup = BeautifulSoup(html, "lxml") + text = soup.get_text(" ", strip=True) + + if not text or len(text) < 100: + return None + + return text + + +def extract_title(html): + """Extrahiert den Titel aus dem HTML-Inhalt.""" + soup = BeautifulSoup(html, "lxml") + title_tag = soup.find("title") + if title_tag: + return title_tag.string.strip() + return "Kein Titel gefunden" + + +def allowed_link(base_url_normalized, base_path, next_url, crawl_mode, blocked_patterns, path_strict): + # Normalisiere die URL für Deduplizierung und Fragment-/Tracking-Entfernung + normalized_url = normalize_url(next_url) + + if any(pat in normalized_url for pat in blocked_patterns): + return False + + if crawl_mode == "single_page": + return False + + # Domain-Limit + # base_url_normalized ist die normalisierte Start-URL der Quelle + if urlparse(normalized_url).netloc != urlparse(base_url_normalized).netloc: + return False + + # Pfadbegrenzung + if crawl_mode == "path_limited": + # Strikte Pfadbegrenzung: Der Pfad der gefundenen URL muss mit dem Basis-Pfad beginnen. + # Wir verwenden urlparse, um den Pfad der gefundenen URL zu erhalten. 
+ next_path = urlparse(normalized_url).path + if not next_path.startswith(base_path): + return False + + # Legacy path_strict (falls noch verwendet, obwohl path_limited dies ersetzen sollte) + if path_strict and not normalized_url.startswith(base_url_normalized): + return False + + return normalized_url # Gibt die normalisierte URL zurück, wenn erlaubt + + +async def crawl_source(session, source, config): + stats = CrawlStats() + global robots_checker + if not robots_checker: + user_agent = config.get("USER_AGENT", {}).get("value") + robots_checker = RobotsTxtChecker(user_agent) + + start_urls = source["start_urls"] + crawl_mode = source.get("crawl_mode", "domain_wide") + blocked_patterns = source.get("blocked_patterns", []) + path_strict = source.get("path_strict", False) + + # Lade Konfigurationswerte + page_limit = config.get("PAGE_LIMIT", {}).get("value", 200) + crawl_delay = config.get("CRAWL_DELAY", {}).get("value", 1) + min_content_length = config.get("MIN_CONTENT_LENGTH", {}).get("value", 500) + priority_patterns = config.get("priority_patterns", {}).get("value", []) + duplicate_detection_config = config.get("duplicate_detection", {}).get("value", {}) + incremental_crawling_config = config.get("incremental_crawling", {}).get("value", {}) + state_management_config = config.get("state_management", {}).get("value", {}) + output_dir = config.get("OUTPUT_DIR", {}).get("value") + + # Initialisiere DuplicateDetector, falls aktiviert + duplicate_detector = None + if duplicate_detection_config.get("enable"): + duplicate_detector = DuplicateDetector( + similarity_threshold=duplicate_detection_config.get("similarity_threshold", 95) + ) + + # Initialisiere HashManager, falls aktiviert + hash_manager = None + if incremental_crawling_config.get("enable"): + db_file = incremental_crawling_config.get("db_file", "crawled_hashes.db") + hash_manager = HashManager(db_file) + + # --- Zustandsverwaltung --- + state_manager = None + queue = PriorityQueue() + visited = set() + 
normalized_start_urls = [normalize_url(u) for u in start_urls] + base_url_normalized = normalized_start_urls[0] + + if state_management_config.get("enable"): + state_file_template = state_management_config.get("state_file", "crawler_state.json") + source_netloc = urlparse(base_url_normalized).netloc + state_file = state_file_template.replace(".json", f"_{source_netloc}.json") + state_manager = StateManager(state_file) + + loaded_state = state_manager.load_state() + if loaded_state: + logging.info(f"Lade Zustand aus {state_file}") + queue, visited = loaded_state + else: + for u in normalized_start_urls: + await queue.put((0, u)) + else: + for u in normalized_start_urls: + await queue.put((0, u)) + # ------------------------- + + # Extrahiere den Basis-Pfad für path_limited Modus + base_path = urlparse(base_url_normalized).path + + logging.info(f"Starte Quelle: {start_urls[0]} (Modus: {crawl_mode})") + + try: + while not queue.empty() and len(visited) < page_limit: + priority, url = await queue.get() # Entpacke Priorität und URL + + if url in visited: + continue + visited.add(url) + stats.total_visited += 1 + + logging.info(f"Crawle: {url}") + + # Asynchrone robots.txt Prüfung + if not await robots_checker.is_allowed(session, url): + logging.warning(f"robots.txt verbietet: {url}") + await asyncio.sleep(crawl_delay) # Respektiere den Delay, auch wenn übersprungen wird + continue + + try: + # Asynchroner Abruf mit aiohttp + async with session.get(url, timeout=aiohttp.ClientTimeout(total=20)) as response: + response.raise_for_status() # Wirft HTTPStatusError für 4xx/5xx + html = await response.text() + + cleaned_html = clean_html(html, config) + text = extract_text(cleaned_html) + + if text and len(text) >= min_content_length: + # Simhash-Berechnung, falls für inkrementelles Crawling benötigt + new_simhash_value = None + if hash_manager and Simhash: + new_simhash_value = Simhash(text).value + + # Inkrementelles Crawling: Prüfen, ob der Inhalt unverändert ist + if 
hash_manager and new_simhash_value is not None: + if hash_manager.is_unchanged(url, new_simhash_value): + logging.info(f"Inhalt unverändert, übersprungen: {url}") + continue + + # Duplikat-Erkennung + if duplicate_detector and duplicate_detector.is_duplicate(text): + logging.info(f"Duplikat übersprungen: {url}") + else: + # Speichern und Hashes aktualisieren + title = extract_title(cleaned_html) + save(url, title, text, output_dir) + stats.total_saved += 1 + stats.total_data_volume += len(text.encode('utf-8')) + if duplicate_detector: + duplicate_detector.add_hash(text) + if hash_manager and new_simhash_value is not None: + hash_manager.update_hash(url, new_simhash_value) + else: + logging.info(f"Inhalt verworfen (Länge: {len(text) if text else 0} < {min_content_length}): {url}") + + if crawl_mode != "single_page": + soup = BeautifulSoup(html, "lxml") + for link in soup.find_all("a", href=True): + next_url = urljoin(url, link["href"]) + + # allowed_link gibt False oder die normalisierte URL zurück + normalized_next_url = allowed_link(base_url_normalized, base_path, next_url, crawl_mode, blocked_patterns, path_strict) + + if normalized_next_url and normalized_next_url not in visited: + # Priorität basierend auf Mustern bestimmen + prio = 0 if any(pat in normalized_next_url for pat in priority_patterns) else 1 + await queue.put((prio, normalized_next_url)) + + except aiohttp.ClientError as e: + logging.error(f"aiohttp Fehler beim Abruf von {url}: {e}") + stats.errors += 1 + except Exception as e: + logging.error(f"Allgemeiner Fehler beim Crawlen von {url}: {e}") + stats.errors += 1 + + await asyncio.sleep(crawl_delay) + + except (KeyboardInterrupt, Exception): + if state_manager: + logging.warning("Prozess unterbrochen. Speichere aktuellen Zustand...") + state_manager.save_state(queue, visited) + logging.info("Zustand gespeichert.") + raise + else: + # This block executes when the try block completes without an exception. 
+ logging.info(f"Quelle {base_url_normalized} erfolgreich gecrawlt.") + if state_manager: + logging.info("Lösche Zustandsdatei.") + state_manager.delete_state() + # Schließe die Datenbankverbindung des HashManagers + if hash_manager: + hash_manager.close() + + stats.finish() + return stats \ No newline at end of file diff --git a/src/db_logger.py b/src/db_logger.py new file mode 100644 index 0000000..4c174cd --- /dev/null +++ b/src/db_logger.py @@ -0,0 +1,169 @@ +import logging +import sqlite3 +import warnings + +# Platzhalter für mysql.connector, um Importfehler zu vermeiden, wenn es nicht installiert ist +try: + import mysql.connector + MYSQL_AVAILABLE = True +except ImportError: + MYSQL_AVAILABLE = False + warnings.warn("Das 'mysql-connector-python' Paket ist nicht installiert. Der MySQLHandler ist nicht verfügbar.") + +class SQLiteHandler(logging.Handler): + """Ein logging.Handler, der Log-Einträge in eine SQLite-Datenbank schreibt.""" + + def __init__(self, db_file): + super().__init__() + self.db_file = db_file + self._conn = None + self._cursor = None + self._db_initialized = False + + def _init_db(self): + """Initialisiert die Datenbankverbindung und erstellt die Tabelle, falls sie nicht existiert.""" + try: + self._conn = sqlite3.connect(self.db_file) + self._cursor = self._conn.cursor() + self._cursor.execute(''' + CREATE TABLE IF NOT EXISTS logs ( + timestamp DATETIME DEFAULT CURRENT_TIMESTAMP, + level VARCHAR(10), + message TEXT + ) + ''') + self._conn.commit() + self._db_initialized = True + except sqlite3.Error as e: + warnings.warn(f"Fehler beim Initialisieren der SQLite-Datenbank: {e}") + self._conn = None + + def emit(self, record): + """Schreibt einen Log-Eintrag in die Datenbank.""" + if not self._db_initialized: + self._init_db() + + if not self._conn: + return + + try: + log_entry = (record.levelname, self.format(record)) + self._cursor.execute('INSERT INTO logs (level, message) VALUES (?, ?)', log_entry) + self._conn.commit() + except 
sqlite3.Error as e: + warnings.warn(f"Fehler beim Schreiben des Logs in die SQLite-Datenbank: {e}") + + def close(self): + """Schließt die Datenbankverbindung.""" + if self._conn: + self._conn.close() + super().close() + +class MySQLHandler(logging.Handler): + """Ein logging.Handler, der Log-Einträge in eine MySQL-Datenbank schreibt.""" + + def __init__(self, host, user, password, database): + super().__init__() + if not MYSQL_AVAILABLE: + self._conn = None + return + + self.db_config = { + 'host': host, + 'user': user, + 'password': password, + 'database': database + } + self._conn = None + self._cursor = None + self._db_initialized = False + + def _init_db(self): + """Initialisiert die Datenbankverbindung und erstellt die Tabelle, falls sie nicht existiert.""" + try: + self._conn = mysql.connector.connect(**self.db_config) + self._cursor = self._conn.cursor() + self._cursor.execute(''' + CREATE TABLE IF NOT EXISTS logs ( + id INT AUTO_INCREMENT PRIMARY KEY, + timestamp DATETIME DEFAULT CURRENT_TIMESTAMP, + level VARCHAR(10), + message TEXT + ) + ''') + self._conn.commit() + self._db_initialized = True + except mysql.connector.Error as e: + warnings.warn(f"Fehler beim Initialisieren der MySQL-Datenbank: {e}") + self._conn = None + + def emit(self, record): + """Schreibt einen Log-Eintrag in die Datenbank.""" + if not MYSQL_AVAILABLE or (not self._db_initialized and not self._conn): + self._init_db() + + if not self._conn: + return + + try: + log_entry = (record.levelname, self.format(record)) + self._cursor.execute('INSERT INTO logs (level, message) VALUES (%s, %s)', log_entry) + self._conn.commit() + except mysql.connector.Error as e: + warnings.warn(f"Fehler beim Schreiben des Logs in die MySQL-Datenbank: {e}") + + def close(self): + """Schließt die Datenbankverbindung.""" + if self._conn: + self._conn.close() + super().close() + +def log_stats_to_mysql(stats, source_url, db_config): + """Schreibt Crawl-Statistiken in eine separate MySQL-Tabelle.""" + if not 
MYSQL_AVAILABLE: + warnings.warn("MySQL-Connector nicht verfügbar. Statistiken können nicht geloggt werden.") + return + + conn = None + try: + conn = mysql.connector.connect(**db_config) + cursor = conn.cursor() + + # Tabelle erstellen, falls sie nicht existiert + cursor.execute(''' + CREATE TABLE IF NOT EXISTS crawl_stats ( + id INT AUTO_INCREMENT PRIMARY KEY, + source_url VARCHAR(255), + duration_seconds FLOAT, + total_visited INT, + total_saved INT, + total_data_volume_bytes BIGINT, + errors INT, + timestamp DATETIME DEFAULT CURRENT_TIMESTAMP + ) + ''') + + # Statistiken einfügen + query = ( + "INSERT INTO crawl_stats (source_url, duration_seconds, total_visited, " + "total_saved, total_data_volume_bytes, errors) " + "VALUES (%s, %s, %s, %s, %s, %s)" + ) + values = ( + source_url, + stats.duration, + stats.total_visited, + stats.total_saved, + stats.total_data_volume, + stats.errors + ) + cursor.execute(query, values) + conn.commit() + logging.info(f"Statistiken für {source_url} erfolgreich in MySQL geloggt.") + + except mysql.connector.Error as e: + warnings.warn(f"Fehler beim Schreiben der Statistiken in die MySQL-Datenbank: {e}") + finally: + if conn and conn.is_connected(): + cursor.close() + conn.close() \ No newline at end of file diff --git a/src/duplicate_detector.py b/src/duplicate_detector.py new file mode 100644 index 0000000..dbfcad1 --- /dev/null +++ b/src/duplicate_detector.py @@ -0,0 +1,62 @@ +import logging + +try: + from simhash import Simhash +except ImportError: + Simhash = None + logging.warning("Die 'simhash'-Bibliothek wurde nicht gefunden. Die Duplikat-Erkennung ist deaktiviert.") + +class DuplicateDetector: + """ + Erkennt und verwaltet Textduplikate mittels SimHash. + """ + def __init__(self, similarity_threshold=95): + """ + Initialisiert den DuplicateDetector. + + Args: + similarity_threshold (int): Der prozentuale Schwellenwert für die Ähnlichkeit, + ab dem zwei Texte als Duplikate gelten. 
logger = logging.getLogger(__name__)


class HashManager:
    """Persist per-URL SimHash values in SQLite for incremental crawling.

    Each row of the ``hashes`` table stores a URL, the SimHash of its content
    (as text) and the UTC time of the last update.
    """

    def __init__(self, db_path):
        """Open the database and make sure the ``hashes`` table exists.

        Args:
            db_path (str): Path to the SQLite database file.

        Raises:
            sqlite3.Error: If the database cannot be opened.
        """
        self.db_path = db_path
        self.conn = None
        try:
            self.conn = sqlite3.connect(self.db_path)
            self._init_db()
        except sqlite3.Error as e:
            logger.error(f"Datenbankfehler beim Verbinden mit {self.db_path}: {e}")
            raise

    def _init_db(self):
        """Create the ``hashes`` table if it does not exist (errors are logged)."""
        try:
            with self.conn:
                self.conn.execute("""
                    CREATE TABLE IF NOT EXISTS hashes (
                        url TEXT PRIMARY KEY,
                        simhash TEXT NOT NULL,
                        timestamp TEXT NOT NULL
                    )
                """)
        except sqlite3.Error as e:
            logger.error(f"Fehler beim Initialisieren der Datenbanktabelle: {e}")

    def is_unchanged(self, url, new_simhash):
        """Report whether the stored SimHash for ``url`` equals ``new_simhash``.

        Args:
            url (str): URL to look up.
            new_simhash (int): SimHash of the freshly fetched content; it is
                compared by its ``str()`` form.

        Returns:
            bool: True if the URL is known and its hash is unchanged. False
            for unknown URLs, after ``close()``, or on a database error.
        """
        if not self.conn:
            return False
        try:
            row = self.conn.execute(
                "SELECT simhash FROM hashes WHERE url = ?", (url,)
            ).fetchone()
            if row and row[0] == str(new_simhash):
                return True
        except sqlite3.Error as e:
            logger.error(f"Fehler beim Prüfen des Hashes für URL {url}: {e}")
        return False

    def update_hash(self, url, simhash):
        """Insert or replace the hash and timestamp stored for ``url``.

        Args:
            url (str): URL whose hash is stored.
            simhash (int): New SimHash of the content (stored as text).
        """
        if not self.conn:
            return
        # datetime.utcnow() is deprecated since Python 3.12. Take an aware
        # UTC time and drop the tzinfo so the stored string keeps the same
        # naive ISO-8601 format as existing rows.
        from datetime import timezone
        timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
        try:
            with self.conn:
                self.conn.execute("""
                    INSERT OR REPLACE INTO hashes (url, simhash, timestamp)
                    VALUES (?, ?, ?)
                """, (url, str(simhash), timestamp))
            logger.debug(f"Hash für URL {url} in der Datenbank aktualisiert.")
        except sqlite3.Error as e:
            logger.error(f"Fehler beim Aktualisieren des Hashes für URL {url}: {e}")

    def close(self):
        """Close the database connection; later calls become no-ops."""
        if self.conn:
            self.conn.close()
            # Drop the reference so the ``if not self.conn`` guards take
            # effect instead of operating on a closed connection.
            self.conn = None
            logger.info("Datenbankverbindung für Hashes geschlossen.")
def setup_logging(config):
    """Configure the root logger from the ``log`` section of the configuration.

    Settings are read from ``config['log']['value']``:

    * ``handlers``: list naming the handlers to attach; recognised names are
      ``'file'``, ``'console'``, ``'sqlite'`` and ``'mysql'``.
    * ``file``: ``log_file`` (default ``'crawler.log'``), ``max_size_mb``
      (default 10) and ``keep_last`` (default 5) for the rotating file handler.
    * ``sqlite``: ``db_file``; the handler is skipped when it is missing.
    * ``mysql``: ``host``, ``user``, ``password`` and ``database``; the handler
      is skipped unless all four keys are present.

    If no handler is configured, or none could be created, logging is
    disabled for the whole process.

    Args:
        config (dict): The loaded configuration object.
    """
    log_config = config.get('log', {}).get('value', {})

    # Disable logging when no handlers are configured.
    if not log_config or not log_config.get('handlers'):
        logging.disable(logging.CRITICAL)
        return

    # logging.disable() is process-global and sticky: undo a disable left
    # behind by an earlier call, otherwise the new handlers stay silent.
    logging.disable(logging.NOTSET)

    # Reset the root logger. Replaced handlers are closed as well so their
    # files and database connections are released.
    logger = logging.getLogger()
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()
    logger.setLevel(logging.INFO)

    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    handlers_to_add = log_config.get('handlers', [])

    # Rotating file handler.
    if 'file' in handlers_to_add:
        file_conf = log_config.get('file', {})
        log_file = file_conf.get('log_file', 'crawler.log')
        max_size_mb = file_conf.get('max_size_mb', 10)
        keep_last = file_conf.get('keep_last', 5)
        max_bytes = max_size_mb * 1024 * 1024

        file_handler = RotatingFileHandler(
            log_file, maxBytes=max_bytes, backupCount=keep_last
        )
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Console handler (stdout).
    if 'console' in handlers_to_add:
        stream_handler = logging.StreamHandler(sys.stdout)
        stream_handler.setFormatter(formatter)
        logger.addHandler(stream_handler)

    # SQLite handler.
    if 'sqlite' in handlers_to_add:
        sqlite_conf = log_config.get('sqlite', {})
        db_file = sqlite_conf.get('db_file')
        if db_file:
            sqlite_handler = SQLiteHandler(db_file=db_file)
            sqlite_handler.setFormatter(formatter)
            logger.addHandler(sqlite_handler)

    # MySQL handler.
    if 'mysql' in handlers_to_add:
        mysql_conf = log_config.get('mysql', {})
        # Only initialise when every connection setting is present.
        if all(k in mysql_conf for k in ['host', 'user', 'password', 'database']):
            mysql_handler = MySQLHandler(
                host=mysql_conf['host'],
                user=mysql_conf['user'],
                password=mysql_conf['password'],
                database=mysql_conf['database']
            )
            mysql_handler.setFormatter(formatter)
            logger.addHandler(mysql_handler)

    if logger.handlers:
        logging.info("Logging-System erfolgreich initialisiert.")
    else:
        # Handler names were configured but none could be created.
        logging.disable(logging.CRITICAL)
# Characters that are invalid or risky in file names on common file systems.
_UNSAFE_PATH_CHARS = str.maketrans({char: "_" for char in ':*"<>|?&\\'})


def get_safe_path(url):
    """Map a URL to a relative, file-system-safe ``.json`` path.

    The result has the form ``<host>/<path segments>.json``:

    * Unsafe characters in host and path segments are replaced by ``_``.
    * Empty, ``.`` and ``..`` segments are dropped, so the result can never
      point outside the directory it is later joined to.
    * A URL without a path maps to ``<host>/index.json``.
    * ``.json`` is appended to the last segment rather than replacing an
      existing suffix, so ``a.html``, ``a.php`` and ``v1.2`` stay distinct.
    * If the URL has a query string, a short hash of it is appended to the
      file name so that pages differing only in their query do not collide.

    Args:
        url (str): The (normalized) URL of the page.

    Returns:
        pathlib.Path: Relative path for the JSON file of this URL.
    """
    parsed = urlparse(url)

    host = parsed.netloc.translate(_UNSAFE_PATH_CHARS)
    if host in (".", ".."):
        host = "_"

    segments = [
        segment.translate(_UNSAFE_PATH_CHARS)
        for segment in parsed.path.split("/")
        if segment not in ("", ".", "..")
    ]
    # Root URL: use 'index' as the file name.
    if not segments:
        segments = ["index"]

    if parsed.query:
        # A fixed-length digest keeps the name short however long the query is.
        digest = hashlib.sha256(
            parsed.query.encode("utf-8", errors="replace")
        ).hexdigest()[:12]
        segments[-1] = f"{segments[-1]}_{digest}"

    segments[-1] += ".json"
    return Path(host, *segments)
# Common tracking parameters that are removed from query strings.
TRACKING_PARAMS = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content', 'session', 'ref', 'gclid', 'fbclid']


def normalize_url(url):
    """Return ``url`` without its fragment and without tracking parameters.

    Parameter names are matched case-insensitively against
    ``TRACKING_PARAMS``. All other parameters are kept, including those with
    an empty value (``?q=``), because dropping them could merge URLs that
    address different resources. The query string is re-encoded with
    ``urlencode``; repeated keys are emitted next to each other.

    Args:
        url (str): The URL to normalize.

    Returns:
        str: The normalized URL.
    """
    parsed = urlparse(url)

    # keep_blank_values=True: parse_qs would otherwise silently discard
    # parameters such as 'q=' or a bare 'flag'.
    query_params = parse_qs(parsed.query, keep_blank_values=True)

    cleaned_params = {
        key: values
        for key, values in query_params.items()
        if key.lower() not in TRACKING_PARAMS
    }

    # parse_qs yields a list of values per key; doseq=True expands each list
    # back into repeated 'k=v1&k=v2' pairs.
    cleaned_query = urlencode(cleaned_params, doseq=True)

    # Rebuild the URL with the cleaned query and an empty fragment.
    return urlunparse(parsed._replace(query=cleaned_query, fragment=''))
async def main():
    """Load the configuration and URL list, then crawl every source in turn.

    ``config.json`` and ``url_list.json`` are read from the current working
    directory. Each entry of the URL list is passed to ``crawl_source`` with
    one shared ``aiohttp`` session. A failure in one source is logged and the
    remaining sources are still processed. The aggregated statistics of all
    completed sources are displayed at the end in every case.
    """
    # JSON files are UTF-8; do not depend on the platform default encoding.
    with open('config.json', 'r', encoding='utf-8') as f:
        config = json.load(f)

    with open('url_list.json', 'r', encoding='utf-8') as f:
        url_list = json.load(f)

    setup_logging(config)
    logger = logging.getLogger(__name__)

    all_stats = []

    try:
        async with aiohttp.ClientSession() as session:
            for source in url_list:
                # Isolate each source: a malformed entry or a crash inside
                # one crawl must not abort all remaining sources.
                try:
                    first_url = source['start_urls'][0]
                    logger.info(f"Processing source: {first_url}")
                    stats = await crawl_source(session, source, config)
                    all_stats.append(stats)
                    logger.info(f"Finished source: {first_url}")
                except Exception as e:
                    logger.error(f"Source {source!r} failed: {e}", exc_info=True)

    except Exception as e:
        logger.critical(f"A critical error occurred in the main loop: {e}", exc_info=True)
    finally:
        logger.info("All crawling tasks finished.")
        display_stats(all_stats)
"crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/pangv_2022/BJNR492110021.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/ustg_1980/BJNR119530979.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/uwg_2004/BJNR141400004.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/hgb/BJNR002190897.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/bgb/BJNR001950896.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/fahrschausbo_2012/BJNR131800012.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/fahrlpr_fv/BJNR004200018.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/fahrlg_2018/BJNR216210017.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/fahrlausbv/BJNR001500018.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/pbefg/BJNR002410961.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/g_kg_1998/BJNR148510998.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/fev_2010/BJNR198000010.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/stvzo_2012/BJNR067910012.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/stvg/BJNR004370909.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/stvoausnv_8/BJNR113000998.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + 
"https://www.gesetze-im-internet.de/stvoausnv_5/BJNR062300994.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/stvoausnv_9/BJNR317100998.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/stvoausnv_12/BJNR086600005.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/stvoausnv_4/BJNR011240992.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gesetze-im-internet.de/stvo_2013/BJNR036710013.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://eur-lex.europa.eu/legal-content/DE/TXT/HTML/?uri=CELEX:32016R0679" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.gnu.org/software/bash/manual/bash.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.json.org/json-en.html" + ], + "crawl_mode": "single_page" + }, + { + "start_urls": [ + "https://www.bussgeldkatalog.org/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://docs.budibase.com/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://shadowhelix.de/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://markdown.de/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://mariadb.com/docs/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://html.spec.whatwg.org/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://restfulapi.net/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://react.dev/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://expressjs.com/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://docs.docker.com/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://docs.openwebui.com/" + ], + "crawl_mode": "domain_wide" + }, + 
{ + "start_urls": [ + "https://www.php.net/manual/en/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://www.postgresql.org/docs/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://nginx.org/en/docs/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://docs.nginx.com/nginx/admin-guide/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://wordpress.org/documentation/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://github.com/fail2ban/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://grafana.com/docs/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://nodejs.org/docs/latest/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://www.virtualmin.com/docs/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://developer.mozilla.org/en-US/docs/Web/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://owasp.org/www-project-top-ten/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://cheatsheetseries.owasp.org/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://nodered.org/docs/" + ], + "crawl_mode": "domain_wide" + }, + { + "start_urls": [ + "https://www.home-assistant.io/docs/" + ], + "crawl_mode": "domain_wide" + } +] \ No newline at end of file