62 lines
2.1 KiB
Python
62 lines
2.1 KiB
Python
import json
|
|
import os
|
|
import logging
|
|
import asyncio
|
|
import aiohttp
|
|
from src.logger_setup import setup_logging
|
|
from src.crawler_core import crawl_source
|
|
from src.stats_manager import CrawlStats
|
|
|
|
def display_stats(all_stats):
    """Log the aggregated statistics of all crawl runs.

    Args:
        all_stats: Iterable of per-source statistics objects. Each item must
            expose the numeric attributes ``total_visited``, ``total_saved``,
            ``errors``, ``duration`` and ``total_data_volume``. Any iterable
            is accepted, including one-shot generators; an empty iterable
            yields all-zero totals.

    Returns:
        None. The summary is emitted through the root logger at INFO level.
    """
    # Materialise once: every total below needs its own full pass, and a
    # one-shot iterable would be exhausted after the first sum.
    runs = list(all_stats)

    total_visited = sum(run.total_visited for run in runs)
    total_saved = sum(run.total_saved for run in runs)
    total_errors = sum(run.errors for run in runs)
    total_duration = sum(run.duration for run in runs)
    # Reported as MB after dividing by 1024 * 1024 -- presumably the volume
    # is tracked in bytes; confirm against CrawlStats.
    total_data_mb = sum(run.total_data_volume for run in runs) / (1024 * 1024)

    logging.info("--- Gesamte Crawling-Statistik ---")
    logging.info(f"Gesamtdauer: {total_duration:.2f} Sekunden")
    logging.info(f"Besuchte Seiten insgesamt: {total_visited}")
    logging.info(f"Gespeicherte Seiten insgesamt: {total_saved}")
    logging.info(f"Fehler insgesamt: {total_errors}")
    logging.info(f"Gesamtdatenvolumen: {total_data_mb:.2f} MB")
    logging.info("------------------------------------")
|
|
|
|
async def main():
    """
    Initialize and run the web crawler for all sources.

    Reads ``config.json`` and ``url_list.json`` from the current working
    directory, configures logging with ``setup_logging(config)``, then crawls
    the sources one after another over a single shared ``aiohttp`` session.
    The aggregated statistics are always reported through ``display_stats``
    at the end, even when crawling was cut short.

    A source whose crawl raises, or that has no usable ``start_urls`` entry,
    is logged and skipped so the remaining sources are still processed.

    Raises:
        OSError: If one of the two JSON files cannot be opened.
        json.JSONDecodeError: If one of the two files is not valid JSON.
    """
    # Load configuration (JSON is UTF-8; do not depend on the locale default)
    with open('config.json', 'r', encoding='utf-8') as f:
        config = json.load(f)

    # Load URL list
    with open('url_list.json', 'r', encoding='utf-8') as f:
        url_list = json.load(f)

    # Setup logging
    setup_logging(config)
    logger = logging.getLogger(__name__)

    all_stats = []

    try:
        async with aiohttp.ClientSession() as session:
            for source in url_list:
                # The first start URL is only used as a label in the log
                # lines; a malformed entry must not abort the whole run.
                try:
                    label = source['start_urls'][0]
                except (KeyError, IndexError, TypeError):
                    logger.error(f"Skipping source without a usable 'start_urls' entry: {source!r}")
                    continue

                logger.info(f"Processing source: {label}")
                try:
                    stats = await crawl_source(session, source, config)
                except Exception:
                    # Isolate failures per source so the others still run.
                    logger.exception(f"Crawling failed for source: {label}")
                    continue

                all_stats.append(stats)
                logger.info(f"Finished source: {label}")

    except Exception as e:
        # Anything left is a failure outside a single source, e.g. the session.
        logger.critical(f"A critical error occurred in the main loop: {e}", exc_info=True)
    finally:
        logger.info("All crawling tasks finished.")
        display_stats(all_stats)
|
|
|
|
|
|
if __name__ == "__main__":
    # Run from the script's own directory so that the relative paths used by
    # main() (config.json, url_list.json) resolve next to this file.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(script_dir)
    asyncio.run(main())