#!/usr/bin/env bash
#
# Installer for the RAG crawler.
#
# Flow: call the framework's package helpers for python3, python3-venv and
# curl; ask whether to install; prompt for the install base path, the corpus
# directory and the memory API URL; copy ./crawler and ./requirements.txt
# (relative to this script) into the base path; create a virtualenv there and
# install the requirements into it; create crawler/sources.yml if it is missing
# and substitute the chosen corpus path and memory URL into it.
#
# Expected from the base framework (not defined in this file):
#   functions: ensure_root, detect_pkg_manager, pkg_install, ask_to_install, log
#   variables: SUDO (privilege prefix, may be empty), YELLOW, NC
set -euo pipefail

#######################################
# Run a command behind the framework's privilege prefix.
# Globals:   SUDO (read) - treated as empty when unset, so `set -u` does not
#            abort the script.
# Arguments: the command and its arguments
# Returns:   exit status of the command
#######################################
rag_as_root() {
  # SUDO is intentionally unquoted: empty must vanish, "sudo -E" must split.
  # shellcheck disable=SC2086
  ${SUDO:-} "$@"
}

#######################################
# Escape a string for use as the replacement part of a sed `s|…|…|` command.
# Arguments: $1 - raw replacement text
# Outputs:   the text with `\`, `&` and `|` backslash-escaped, on stdout
#######################################
rag_sed_escape() {
  printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g'
}

# Helper functions from the base framework are expected to be in scope here.
ensure_root
detect_pkg_manager
pkg_install python3
# Failure is deliberately ignored for this package only — presumably because
# not every distribution ships venv as a separate package (TODO confirm).
pkg_install python3-venv || true
pkg_install curl

if ask_to_install "RAG Crawler"; then
  echo ""
  read -rp "Basis-Pfad für den Crawler [default: /srv/ai/rag-crawler]: " BASE
  BASE=${BASE:-/srv/ai/rag-crawler}
  rag_as_root mkdir -p "${BASE}"
else
  log "${YELLOW}⏭ RAG Crawler übersprungen.${NC}"
  exit 0
fi

echo ""
read -rp "Zielverzeichnis für den Corpus [default: /srv/ai/corpus]: " CORPUS_DIR
CORPUS_DIR=${CORPUS_DIR:-/srv/ai/corpus}
rag_as_root mkdir -p "${CORPUS_DIR}"

echo ""
read -rp "Memory-API URL (z.B. http://127.0.0.1:8085) [default: http://127.0.0.1:8085]: " MEMORY_URL
MEMORY_URL=${MEMORY_URL:-http://127.0.0.1:8085}

# Copy the crawler sources that live next to this script into BASE.
SRC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
rag_as_root mkdir -p "${BASE}/crawler"
rag_as_root cp -r "${SRC_DIR}/crawler"/* "${BASE}/crawler/"
rag_as_root cp "${SRC_DIR}/requirements.txt" "${BASE}/requirements.txt"

# Virtualenv.
# The venv's own interpreter is invoked directly instead of activating the
# environment: `source`/`deactivate` are shell builtins/functions that cannot
# be run through sudo, and a sudo'ed plain `pip` would not see the activation
# and would install into the system Python instead.
rag_as_root python3 -m venv "${BASE}/venv"
rag_as_root "${BASE}/venv/bin/python" -m pip install -U pip
rag_as_root "${BASE}/venv/bin/python" -m pip install -r "${BASE}/requirements.txt"

# Create sources.yml with defaults, but never overwrite an existing one.
if [[ ! -f "${BASE}/crawler/sources.yml" ]]; then
  rag_as_root tee "${BASE}/crawler/sources.yml" >/dev/null <<'EOF'
# Quellen-Definitionen
seeds:
  - url: "https://www.gesetze-im-internet.de/stvo_2013/"
    include: ["**"]
    exclude: ["**/impressum*", "**/kontakt*"]
    revisit: "30d"
  - url: "https://www.gesetze-im-internet.de/bgb/"
    include: ["**"]
    exclude: []
    revisit: "30d"
  - url: "https://www.php.net/manual/en/"
    include: ["**"]
    exclude: ["**/search.php*", "**/my.php*"]
    revisit: "14d"
policy:
  concurrency_total: 4
  concurrency_per_domain: 1
  delay_min_seconds: 10
  delay_max_seconds: 120
  user_agent: "Mozilla/5.0 (compatible; polite-crawler/1.0)"
  obey_robots_txt: true
  store_html: false
  store_text: true
  store_pdf: true
storage:
  root: "/srv/ai/corpus" # wird ersetzt
  text_subdir: "text"
  pdf_subdir: "pdf"
memory:
  url: "http://127.0.0.1:8085" # wird ersetzt
  collection: "chat-memory"
EOF
fi

# Substitute the default corpus path and memory URL in sources.yml.
# The user-supplied values are escaped so that `&`, `\` or `|` in them cannot
# corrupt the sed command; the dots in the URL pattern are escaped so they
# match literally.
# NOTE(review): this is a plain placeholder substitution — on a re-run against
# a file that was already substituted, the defaults may no longer be present
# (or may match as a prefix of the earlier value). Confirm whether re-runs
# need to be supported.
corpus_repl=$(rag_sed_escape "${CORPUS_DIR}")
memory_repl=$(rag_sed_escape "${MEMORY_URL}")
rag_as_root sed -i \
  -e "s|/srv/ai/corpus|${corpus_repl}|g" \
  -e "s|http://127\.0\.0\.1:8085|${memory_repl}|g" \
  "${BASE}/crawler/sources.yml"

echo "✅ Installiert unter: ${BASE}"
echo " Corpus: ${CORPUS_DIR}"
echo " Memory-API: ${MEMORY_URL}"
echo "➡️ Aktivieren: source ${BASE}/venv/bin/activate && python3 ${BASE}/crawler/main.py --help"