Initial upload

recipes/ai/rag-crawler/install.sh (new file, 90 lines added)
@@ -0,0 +1,90 @@
#!/usr/bin/env bash
set -euo pipefail

# Helper functions from your base framework (see beispiel.zip) are expected;
# see the contract sketch below the calls:
ensure_root
detect_pkg_manager
pkg_install python3
pkg_install python3-venv || true
pkg_install curl

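# For reference, the contract assumed for the framework helpers used above
# (behavior inferred from how they are called in this script, not from their
# actual implementations):
#   ensure_root            - abort or re-exec via sudo unless running as root
#   detect_pkg_manager     - detect apt/dnf/etc.; expected to set $SUDO
#   pkg_install <package>  - install <package> with the detected package manager
#   ask_to_install <name>  - interactive yes/no prompt; returns 0 on "yes"
#   log <message>          - formatted echo; YELLOW and NC are color variables
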
if ask_to_install "RAG Crawler"; then
  echo ""
  read -rp "Base path for the crawler [default: /srv/ai/rag-crawler]: " BASE
  BASE=${BASE:-/srv/ai/rag-crawler}
  $SUDO mkdir -p "${BASE}"
else
  log "${YELLOW}⏭ RAG Crawler skipped.${NC}"
  exit 0
fi

echo ""
read -rp "Target directory for the corpus [default: /srv/ai/corpus]: " CORPUS_DIR
CORPUS_DIR=${CORPUS_DIR:-/srv/ai/corpus}
$SUDO mkdir -p "${CORPUS_DIR}"

echo ""
read -rp "Memory API URL (e.g. http://127.0.0.1:8085) [default: http://127.0.0.1:8085]: " MEMORY_URL
MEMORY_URL=${MEMORY_URL:-http://127.0.0.1:8085}

# Copy the crawler files into BASE
SRC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
$SUDO mkdir -p "${BASE}/crawler"
$SUDO cp -r "${SRC_DIR}/crawler"/* "${BASE}/crawler/"
$SUDO cp "${SRC_DIR}/requirements.txt" "${BASE}/requirements.txt"

# Virtualenv: install via the venv's own pip binary; "source activate" and
# "deactivate" are shell builtins and cannot be invoked through $SUDO
$SUDO python3 -m venv "${BASE}/venv"
$SUDO "${BASE}/venv/bin/pip" install -U pip
$SUDO "${BASE}/venv/bin/pip" install -r "${BASE}/requirements.txt"
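# Optional sanity check: "pip check" reports missing or incompatible
# dependencies in the fresh venv; "|| true" keeps "set -e" from aborting
$SUDO "${BASE}/venv/bin/pip" check || true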

# Initialize/replace sources.yml
if [ ! -f "${BASE}/crawler/sources.yml" ]; then
  $SUDO tee "${BASE}/crawler/sources.yml" >/dev/null <<'EOF'
# Source definitions
seeds:
  - url: "https://www.gesetze-im-internet.de/stvo_2013/"
    include: ["**"]
    exclude: ["**/impressum*", "**/kontakt*"]
    revisit: "30d"
  - url: "https://www.gesetze-im-internet.de/bgb/"
    include: ["**"]
    exclude: []
    revisit: "30d"
  - url: "https://www.php.net/manual/en/"
    include: ["**"]
    exclude: ["**/search.php*", "**/my.php*"]
    revisit: "14d"

policy:
  concurrency_total: 4
  concurrency_per_domain: 1
  delay_min_seconds: 10
  delay_max_seconds: 120
  user_agent: "Mozilla/5.0 (compatible; polite-crawler/1.0)"
  obey_robots_txt: true
  store_html: false
  store_text: true
  store_pdf: true

storage:
  root: "/srv/ai/corpus"   # replaced below via sed
  text_subdir: "text"
  pdf_subdir: "pdf"

memory:
  url: "http://127.0.0.1:8085"   # replaced below via sed
  collection: "chat-memory"
EOF
fi

# Deterministically replace paths/URLs in sources.yml;
# sed uses "|" as the delimiter because the values contain "/"
$SUDO sed -i "s|/srv/ai/corpus|${CORPUS_DIR}|g" "${BASE}/crawler/sources.yml"
$SUDO sed -i "s|http://127.0.0.1:8085|${MEMORY_URL}|g" "${BASE}/crawler/sources.yml"

echo "✅ Installiert unter: ${BASE}"
|
||||
echo " Corpus: ${CORPUS_DIR}"
|
||||
echo " Memory-API: ${MEMORY_URL}"
|
||||
echo "➡️ Aktivieren: source ${BASE}/venv/bin/activate && python3 ${BASE}/crawler/main.py --help"
|
||||
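# Next steps (the crawler's CLI is defined by the files copied into
# ${BASE}/crawler, not by this installer, so check --help for the real options):
#   1. Review/adjust ${BASE}/crawler/sources.yml (seeds, delays, user agent).
#   2. source ${BASE}/venv/bin/activate
#   3. python3 ${BASE}/crawler/main.py --help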