#!/usr/bin/env bash
#
# Interactive installer for the "RAG Crawler" component.
#
# This file does not define ensure_root, detect_pkg_manager, pkg_install,
# ask_to_install or log, nor the variables SUDO, YELLOW and NC. They have to
# be provided by the surrounding base framework before this script runs.
set -euo pipefail

# Helper functions from the base framework (see beispiel.zip) are expected:
ensure_root
detect_pkg_manager
pkg_install python3
# Best effort: a failure to install python3-venv is deliberately ignored
# (presumably because not every distribution ships it as a separate
# package -- TODO confirm).
pkg_install python3-venv || true
pkg_install curl

# Ask whether the crawler should be installed at all. If the user declines,
# log a notice and leave the script successfully.
if ! ask_to_install "RAG Crawler"; then
  log "${YELLOW}⏭ RAG Crawler übersprungen.${NC}"
  exit 0
fi

# Ask for the installation directory and create it.
echo ""
# `read` returns non-zero on EOF (e.g. closed stdin); under `set -e` that
# would abort the script without any message. Ignore the status so the
# default advertised in the prompt is used instead.
read -rp "Basis-Pfad für den Crawler [default: /srv/ai/rag-crawler]: " BASE || true
BASE=${BASE:-/srv/ai/rag-crawler}
# $SUDO is left unquoted on purpose: an empty value must expand to nothing.
$SUDO mkdir -p "${BASE}"

# Ask for the corpus target directory and create it.
echo ""
# `read` returns non-zero on EOF; ignore that so the documented default
# applies instead of `set -e` silently aborting the script.
read -rp "Zielverzeichnis für den Corpus [default: /srv/ai/corpus]: " CORPUS_DIR || true
CORPUS_DIR=${CORPUS_DIR:-/srv/ai/corpus}
# $SUDO is left unquoted on purpose: an empty value must expand to nothing.
$SUDO mkdir -p "${CORPUS_DIR}"

# Ask for the Memory-API URL (only stored in a variable here, not validated).
echo ""
read -rp "Memory-API URL (z.B. http://127.0.0.1:8085) [default: http://127.0.0.1:8085]: " MEMORY_URL || true
MEMORY_URL=${MEMORY_URL:-http://127.0.0.1:8085}

# Copy the crawler sources and requirements.txt into BASE.
# SRC_DIR is the absolute directory that contains this script.
SRC_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
$SUDO mkdir -p "${BASE}/crawler"
# The glob stays outside the quotes so it expands; note that it does not
# match hidden files (dotfiles) inside crawler/.
$SUDO cp -r "${SRC_DIR}/crawler"/* "${BASE}/crawler/"
$SUDO cp "${SRC_DIR}/requirements.txt" "${BASE}/requirements.txt"

# Virtualenv
#
# The environment is never "activated" here: `source` is a shell builtin and
# `deactivate` a shell function, so neither can be run through sudo, and an
# activation done in this shell would not carry over into a sudo child
# process anyway. Calling the environment's own interpreter by its full path
# guarantees that pip installs into ${BASE}/venv, with or without $SUDO.
$SUDO python3 -m venv "${BASE}/venv"
$SUDO "${BASE}/venv/bin/python" -m pip install -U pip
$SUDO "${BASE}/venv/bin/python" -m pip install -r "${BASE}/requirements.txt"

# Initialise sources.yml (an already existing file is left untouched).

#######################################
# Print the default sources.yml template.
# The storage root and the memory URL in the template are placeholders
# holding the installer defaults.
# Arguments: none
# Outputs:   the YAML document on stdout
#######################################
_rag_default_sources_yml() {
  # Quoted delimiter: the template is emitted literally, without expansion.
  cat <<'EOF'
# Quellen-Definitionen
seeds:
  - url: "https://www.gesetze-im-internet.de/stvo_2013/"
    include: ["**"]
    exclude: ["**/impressum*", "**/kontakt*"]
    revisit: "30d"
  - url: "https://www.gesetze-im-internet.de/bgb/"
    include: ["**"]
    exclude: []
    revisit: "30d"
  - url: "https://www.php.net/manual/en/"
    include: ["**"]
    exclude: ["**/search.php*", "**/my.php*"]
    revisit: "14d"

policy:
  concurrency_total: 4
  concurrency_per_domain: 1
  delay_min_seconds: 10
  delay_max_seconds: 120
  user_agent: "Mozilla/5.0 (compatible; polite-crawler/1.0)"
  obey_robots_txt: true
  store_html: false
  store_text: true
  store_pdf: true

storage:
  root: "/srv/ai/corpus" # wird ersetzt
  text_subdir: "text"
  pdf_subdir: "pdf"

memory:
  url: "http://127.0.0.1:8085" # wird ersetzt
  collection: "chat-memory"
EOF
}

if [[ ! -f "${BASE}/crawler/sources.yml" ]]; then
  # tee (optionally under $SUDO) performs the write so that it also works
  # when BASE is not writable by the invoking user.
  _rag_default_sources_yml | $SUDO tee "${BASE}/crawler/sources.yml" >/dev/null
fi

# Replace the default corpus path and Memory-API URL in sources.yml with the
# values chosen above.
# NOTE: only the literal default values are matched, so a sources.yml that
# already carries other values is left unchanged by these substitutions.

#######################################
# Escape a string for use as the replacement part of `s|...|...|`.
# Backslash, `&` (whole match) and the `|` delimiter are special there and
# get a backslash prefix.
# Arguments: $1 - raw replacement text
# Outputs:   escaped text on stdout
#######################################
_rag_sed_escape_replacement() {
  printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g'
}

# Assigned separately so a failing substitution is not masked under `set -e`.
corpus_replacement="$(_rag_sed_escape_replacement "${CORPUS_DIR}")"
memory_replacement="$(_rag_sed_escape_replacement "${MEMORY_URL}")"

# The dots of the default URL are escaped so they only match literal dots.
$SUDO sed -i \
  -e "s|/srv/ai/corpus|${corpus_replacement}|g" \
  -e "s|http://127\.0\.0\.1:8085|${memory_replacement}|g" \
  "${BASE}/crawler/sources.yml"

# Final summary: where things were installed and how to start the crawler.
printf '%s\n' "✅ Installiert unter: ${BASE}"
printf '%s\n' " Corpus: ${CORPUS_DIR}"
printf '%s\n' " Memory-API: ${MEMORY_URL}"
printf '%s\n' "➡️ Aktivieren: source ${BASE}/venv/bin/activate && python3 ${BASE}/crawler/main.py --help"