Initial upload

This commit is contained in:
2025-11-11 11:47:15 +01:00
commit 7c24dab288
48 changed files with 2761 additions and 0 deletions

View File

@@ -0,0 +1,90 @@
#!/usr/bin/env bash
#
# Installer for the "RAG Crawler" component.
#
# Interactively asks for the installation base path, the corpus directory
# and the memory API URL, copies the crawler sources next to this script
# into the base path, creates a Python virtualenv with the requirements,
# and writes/patches crawler/sources.yml with the chosen values.
#
# Expected to be provided by the surrounding base framework (see
# beispiel.zip), not defined in this file:
#   ensure_root, detect_pkg_manager, pkg_install, ask_to_install, log,
#   the variables YELLOW / NC, and SUDO.
# NOTE(review): SUDO is presumably set by the framework (empty when already
# running as root) -- confirm. It is expanded unquoted on purpose so that an
# empty value disappears instead of becoming an empty command word.
# shellcheck disable=SC2086
set -euo pipefail

#######################################
# Escape a string so it can be used literally as the replacement part of a
# sed 's|pattern|replacement|' command (delimiter '|').
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout
#######################################
sed_escape_replacement() {
  printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g'
}

ensure_root
detect_pkg_manager

# Ask first: declining must not leave any packages installed behind.
if ! ask_to_install "RAG Crawler"; then
  log "${YELLOW}⏭ RAG Crawler übersprungen.${NC}"
  exit 0
fi

pkg_install python3
# Best effort: a failure to install this package is tolerated here.
pkg_install python3-venv || true
pkg_install curl

# `read` returns non-zero on EOF (e.g. non-interactive stdin); `|| true`
# keeps `set -e` from aborting so the documented defaults apply instead.
echo ""
read -rp "Basis-Pfad für den Crawler [default: /srv/ai/rag-crawler]: " BASE || true
BASE=${BASE:-/srv/ai/rag-crawler}
$SUDO mkdir -p "${BASE}"

echo ""
read -rp "Zielverzeichnis für den Corpus [default: /srv/ai/corpus]: " CORPUS_DIR || true
CORPUS_DIR=${CORPUS_DIR:-/srv/ai/corpus}
$SUDO mkdir -p "${CORPUS_DIR}"

echo ""
read -rp "Memory-API URL (z.B. http://127.0.0.1:8085) [default: http://127.0.0.1:8085]: " MEMORY_URL || true
MEMORY_URL=${MEMORY_URL:-http://127.0.0.1:8085}

# Copy the crawler sources and requirements that live next to this script.
SRC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
$SUDO mkdir -p "${BASE}/crawler"
$SUDO cp -r "${SRC_DIR}/crawler"/* "${BASE}/crawler/"
$SUDO cp "${SRC_DIR}/requirements.txt" "${BASE}/requirements.txt"

# Virtualenv. `source` and `deactivate` are a shell builtin / shell function
# and cannot be executed through sudo; activating the venv in this shell
# would also not affect a pip started via sudo. Calling the venv's own pip
# binary installs into the venv without any activation.
$SUDO python3 -m venv "${BASE}/venv"
$SUDO "${BASE}/venv/bin/pip" install -U pip
$SUDO "${BASE}/venv/bin/pip" install -r "${BASE}/requirements.txt"

# Create a default sources.yml only when the copied sources did not ship one.
if [[ ! -f "${BASE}/crawler/sources.yml" ]]; then
  $SUDO tee "${BASE}/crawler/sources.yml" >/dev/null <<'EOF'
# Quellen-Definitionen
seeds:
  - url: "https://www.gesetze-im-internet.de/stvo_2013/"
    include: ["**"]
    exclude: ["**/impressum*", "**/kontakt*"]
    revisit: "30d"
  - url: "https://www.gesetze-im-internet.de/bgb/"
    include: ["**"]
    exclude: []
    revisit: "30d"
  - url: "https://www.php.net/manual/en/"
    include: ["**"]
    exclude: ["**/search.php*", "**/my.php*"]
    revisit: "14d"
policy:
  concurrency_total: 4
  concurrency_per_domain: 1
  delay_min_seconds: 10
  delay_max_seconds: 120
  user_agent: "Mozilla/5.0 (compatible; polite-crawler/1.0)"
  obey_robots_txt: true
  store_html: false
  store_text: true
  store_pdf: true
storage:
  root: "/srv/ai/corpus" # wird ersetzt
  text_subdir: "text"
  pdf_subdir: "pdf"
memory:
  url: "http://127.0.0.1:8085" # wird ersetzt
  collection: "chat-memory"
EOF
fi

# Replace the default corpus path and memory URL in sources.yml with the
# chosen values. The values are escaped so that '|', '&' or '\' in user
# input cannot corrupt the sed command; the dots in the URL pattern are
# escaped so they only match literal dots.
# NOTE(review): this is a plain substring replacement of the default values.
# Re-running with a value that itself contains the default (for example
# /srv/ai/corpus2) will rewrite it again -- consider keying on the YAML
# fields with a real YAML tool if re-runs must be idempotent.
corpus_repl=$(sed_escape_replacement "${CORPUS_DIR}")
memory_repl=$(sed_escape_replacement "${MEMORY_URL}")
$SUDO sed -i \
  -e "s|/srv/ai/corpus|${corpus_repl}|g" \
  -e "s|http://127\.0\.0\.1:8085|${memory_repl}|g" \
  "${BASE}/crawler/sources.yml"

echo "✅ Installiert unter: ${BASE}"
echo " Corpus: ${CORPUS_DIR}"
echo " Memory-API: ${MEMORY_URL}"
echo "➡️ Aktivieren: source ${BASE}/venv/bin/activate && python3 ${BASE}/crawler/main.py --help"