import json
import os
import logging
import asyncio

import aiohttp

from src.logger_setup import setup_logging
from src.crawler_core import crawl_source
from src.stats_manager import CrawlStats


def display_stats(all_stats):
    """Displays aggregated statistics from all crawl runs."""
    total_visited = sum(s.total_visited for s in all_stats)
    total_saved = sum(s.total_saved for s in all_stats)
    total_errors = sum(s.errors for s in all_stats)
    total_duration = sum(s.duration for s in all_stats)
    total_data_mb = sum(s.total_data_volume for s in all_stats) / (1024 * 1024)

    logging.info("--- Overall crawling statistics ---")
    logging.info(f"Total duration: {total_duration:.2f} seconds")
    logging.info(f"Total pages visited: {total_visited}")
    logging.info(f"Total pages saved: {total_saved}")
    logging.info(f"Total errors: {total_errors}")
    logging.info(f"Total data volume: {total_data_mb:.2f} MB")
    logging.info("------------------------------------")


async def main():
    """
    Main function to initialize and run the web crawler for all sources.
    """
    # Load configuration
    with open('config.json', 'r') as f:
        config = json.load(f)

    # Load URL list
    with open('url_list.json', 'r') as f:
        url_list = json.load(f)

    # Setup logging
    setup_logging(config)
    logger = logging.getLogger(__name__)

    all_stats = []
    try:
        # Reuse one HTTP session for all sources; each source is crawled
        # sequentially and contributes one CrawlStats object to the summary.
        async with aiohttp.ClientSession() as session:
            for source in url_list:
                logger.info(f"Processing source: {source['start_urls'][0]}")
                stats = await crawl_source(session, source, config)
                all_stats.append(stats)
                logger.info(f"Finished source: {source['start_urls'][0]}")
    except Exception as e:
        logger.critical(f"A critical error occurred in the main loop: {e}", exc_info=True)
    finally:
        logger.info("All crawling tasks finished.")
        display_stats(all_stats)


if __name__ == "__main__":
    # Change the current working directory to the script's directory so that
    # config.json and url_list.json are resolved relative to it.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    asyncio.run(main())
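
# Note on input files: the shapes sketched below are assumptions inferred only
# from how this script accesses the data (each entry in url_list.json must be an
# object whose 'start_urls' value is a non-empty list). The actual keys expected
# in config.json are defined by src.logger_setup and src.crawler_core and may
# differ; the 'log_level' key shown here is purely hypothetical.
#
# url_list.json (assumed shape):
#   [
#     {"start_urls": ["https://example.com/"]}
#   ]
#
# config.json (assumed shape, hypothetical keys):
#   {"log_level": "INFO"}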