Initial commit

This commit is contained in:
2025-11-13 13:01:48 +01:00
commit c347078f27
29 changed files with 2185 additions and 0 deletions

62
start.py Normal file
View File

@@ -0,0 +1,62 @@
import json
import os
import logging
import asyncio
import aiohttp
from src.logger_setup import setup_logging
from src.crawler_core import crawl_source
from src.stats_manager import CrawlStats
def display_stats(all_stats):
    """Log aggregated statistics from all crawl runs.

    Args:
        all_stats: Iterable of per-source statistics objects. Each object
            must expose the numeric attributes ``total_visited``,
            ``total_saved``, ``errors``, ``duration`` and
            ``total_data_volume``. Any iterable is accepted, including
            one-shot iterators such as generators.

    The totals are written to the root logger at INFO level; nothing is
    returned.
    """
    # Materialize once: the totals below need several passes over the data,
    # and a generator would be exhausted after the first pass.
    stats_seq = tuple(all_stats)

    total_visited = sum(s.total_visited for s in stats_seq)
    total_saved = sum(s.total_saved for s in stats_seq)
    total_errors = sum(s.errors for s in stats_seq)
    total_duration = sum(s.duration for s in stats_seq)
    # The division by 1024 * 1024 converts the summed volume to the "MB"
    # figure that is logged below.
    total_data_mb = sum(s.total_data_volume for s in stats_seq) / (1024 * 1024)

    logging.info("--- Gesamte Crawling-Statistik ---")
    logging.info(f"Gesamtdauer: {total_duration:.2f} Sekunden")
    logging.info(f"Besuchte Seiten insgesamt: {total_visited}")
    logging.info(f"Gespeicherte Seiten insgesamt: {total_saved}")
    logging.info(f"Fehler insgesamt: {total_errors}")
    logging.info(f"Gesamtdatenvolumen: {total_data_mb:.2f} MB")
    logging.info("------------------------------------")
async def main():
    """
    Initialize and run the web crawler for all sources.

    Reads ``config.json`` and ``url_list.json`` from the current working
    directory, configures logging with ``setup_logging(config)``, then awaits
    ``crawl_source`` for each entry of the URL list in order, sharing a
    single ``aiohttp.ClientSession``. The collected per-source statistics are
    passed to ``display_stats`` at the end, also when the loop was aborted
    by an exception.

    Each URL-list entry is indexed as ``source['start_urls'][0]`` for the
    progress log, so it must provide a non-empty ``start_urls`` sequence.

    Raises:
        OSError: If one of the two JSON files cannot be opened.
        ValueError: If one of the two files is not valid UTF-8 encoded JSON.
    """
    # Load configuration. The encoding is given explicitly so that decoding
    # does not depend on the platform's locale default.
    with open('config.json', 'r', encoding='utf-8') as f:
        config = json.load(f)

    # Load URL list
    with open('url_list.json', 'r', encoding='utf-8') as f:
        url_list = json.load(f)

    # Setup logging
    setup_logging(config)
    logger = logging.getLogger(__name__)

    all_stats = []

    try:
        async with aiohttp.ClientSession() as session:
            for source in url_list:
                # Read the label once so the "Finished" message reports the
                # same URL as the "Processing" message.
                label = source['start_urls'][0]
                logger.info(f"Processing source: {label}")
                stats = await crawl_source(session, source, config)
                all_stats.append(stats)
                logger.info(f"Finished source: {label}")
    except Exception as e:
        # Top-level boundary: log with traceback and fall through to the
        # summary; sources not yet processed are skipped.
        logger.critical(f"A critical error occurred in the main loop: {e}", exc_info=True)
    finally:
        logger.info("All crawling tasks finished.")
        display_stats(all_stats)
if __name__ == "__main__":
    # Run from the directory that contains this script, so the relative
    # paths opened by main() are resolved next to it.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(script_dir)
    asyncio.run(main())