Initial commit
This commit is contained in:
62
start.py
Normal file
62
start.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import json
|
||||
import os
|
||||
import logging
|
||||
import asyncio
|
||||
import aiohttp
|
||||
from src.logger_setup import setup_logging
|
||||
from src.crawler_core import crawl_source
|
||||
from src.stats_manager import CrawlStats
|
||||
|
||||
def display_stats(all_stats):
    """Log an aggregated summary of the statistics from all crawl runs.

    Args:
        all_stats: iterable of stats objects exposing ``total_visited``,
            ``total_saved``, ``errors``, ``duration`` (seconds) and
            ``total_data_volume`` (bytes).
    """
    # Accumulate every counter in a single pass over the stats objects.
    visited = saved = errors = 0
    duration = 0
    data_bytes = 0
    for stats in all_stats:
        visited += stats.total_visited
        saved += stats.total_saved
        errors += stats.errors
        duration += stats.duration
        data_bytes += stats.total_data_volume
    data_mb = data_bytes / (1024 * 1024)

    logging.info("--- Gesamte Crawling-Statistik ---")
    logging.info(f"Gesamtdauer: {duration:.2f} Sekunden")
    logging.info(f"Besuchte Seiten insgesamt: {visited}")
    logging.info(f"Gespeicherte Seiten insgesamt: {saved}")
    logging.info(f"Fehler insgesamt: {errors}")
    logging.info(f"Gesamtdatenvolumen: {data_mb:.2f} MB")
    logging.info("------------------------------------")

async def main():
    """
    Main function to initialize and run the web crawler for all sources.

    Loads ``config.json`` and ``url_list.json`` from the current working
    directory, configures logging, then crawls each configured source
    sequentially over one shared HTTP session, collecting per-source stats.
    An aggregated summary is logged even if a source raises.
    """
    # Load configuration.  Explicit UTF-8 avoids depending on the platform
    # default codec (JSON files are UTF-8 by convention).
    with open('config.json', 'r', encoding='utf-8') as f:
        config = json.load(f)

    # Load URL list (one entry per crawl source).
    with open('url_list.json', 'r', encoding='utf-8') as f:
        url_list = json.load(f)

    # Setup logging before any logger is used.
    setup_logging(config)
    logger = logging.getLogger(__name__)

    all_stats = []

    try:
        # One shared session for all sources so connections are pooled.
        async with aiohttp.ClientSession() as session:
            for source in url_list:
                logger.info(f"Processing source: {source['start_urls'][0]}")
                stats = await crawl_source(session, source, config)
                all_stats.append(stats)
                logger.info(f"Finished source: {source['start_urls'][0]}")

    except Exception as e:
        # Top-level boundary: record the traceback instead of crashing silently.
        logger.critical(f"A critical error occurred in the main loop: {e}", exc_info=True)
    finally:
        # Always report whatever stats were gathered before the failure.
        logger.info("All crawling tasks finished.")
        display_stats(all_stats)

if __name__ == "__main__":
    # Run from the script's own directory so the relative paths used by
    # main() (config.json, url_list.json) resolve regardless of the
    # caller's working directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(script_dir)
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user