Initial upload
This commit is contained in:
64
recipes/ai/agent-config/install.sh
Normal file
64
recipes/ai/agent-config/install.sh
Normal file
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env bash
# Installs the agent configuration (/srv/ai/agents/agents.yml) describing the
# Strategie/Denker/Gedächtnis agents and their model routing.
# Framework helpers (sourced by the recipe runner): ensure_root,
# detect_pkg_manager, pkg_install, ask_to_install, log, $SUDO, $YELLOW/$NC.
set -euo pipefail

ensure_root
detect_pkg_manager
pkg_install curl

if ask_to_install "Agent-Konfiguration"; then
  echo ""
  read -rp "Ollama Router Base-URL (z.B. http://192.168.3.21:11437): " ROUTER_URL
  ROUTER_URL=${ROUTER_URL:-http://localhost:11437}

  BASE="/srv/ai/agents"
  $SUDO mkdir -p "${BASE}"

  # fix: use an UNQUOTED heredoc so ${ROUTER_URL} is expanded directly by the
  # shell. The previous quoted-heredoc + `sed` placeholder substitution broke
  # whenever the URL contained sed-special characters such as '&' or '|'.
  # (The YAML body contains no other '$', so direct expansion is safe.)
  $SUDO tee "${BASE}/agents.yml" >/dev/null <<EOF
language: de
autonomy: soft
scope: global
agents:
  - name: Strategie-Agent
    purpose: "Lange Planungsdialoge, Roadmaps, Tabellen, UI/UX-Brainstorming."
    default_models:
      primary: "llama3.1:8b-instruct"
      secondary: "mistral-nemo:12b"
      cpu_fallback: "phi3:mini"
    endpoint: "${ROUTER_URL}"
    prompt_preset: |
      Du bist ein strategischer Planer. Arbeite iterativ, strukturiert und deutschsprachig.
      Liefere Tabellen (Markdown), klare Meilensteine, Risiken, Abhängigkeiten.
      Frage NUR nach, wenn kritische Annahmen fehlen; sonst entscheide pragmatisch.
      Modus: soft – Vorschläge machen, aber Details selbstständig ausarbeiten.
  - name: Denker-Agent
    purpose: "Tiefes Reasoning (CoT), Architektur- und Lösungsentwürfe, Mathe/Logik."
    default_models:
      primary: "huihui_ai/deepseek-r1-abliterated:14b"
      secondary: "phi3:medium-128k"
      cpu_fallback: "phi3:mini"
    endpoint: "${ROUTER_URL}"
    prompt_preset: |
      Denke in überprüfbaren Schritten. Erkläre Annahmen, bevor du entscheidest.
      Bevorzuge Beweise, Gegenbeispiele und Tests. Schließe mit TL;DR.
  - name: Gedächtnis-Agent
    purpose: "RAG, Wissensquellen, Zitationen, Abruf & Zusammenführung von Fakten."
    default_models:
      retriever_llm: "phi3:mini"
      embed_model: "mxbai-embed-large"
      cpu_fallback: "gemma2:2b-instruct-q6_K"
    endpoint: "${ROUTER_URL}"
    prompt_preset: |
      Orchestriere Nachschlagen in Wissenssammlungen (RAG). Zitiere Fundstellen (Datei/Seite/Abschnitt).
      Antworte nüchtern, fasse Unsicherheit transparent zusammen.
    sources:
      - name: "Gesetze"
        type: "pdf"
        location: "/srv/ai/corpus/law"
      - name: "Shadowrun-Regeln"
        type: "pdf"
        location: "/srv/ai/corpus/shadowrun"
      - name: "Tech-Docs"
        type: "mixed"
        location: "/srv/ai/corpus/tech"
EOF

  echo "✅ Agenten-Profile: ${BASE}/agents.yml"
else
  log "${YELLOW}⏭ Agent-Konfiguration übersprungen.${NC}"
fi
|
||||
16
recipes/ai/budibase-server/docker-compose.yml
Normal file
16
recipes/ai/budibase-server/docker-compose.yml
Normal file
@@ -0,0 +1,16 @@
|
||||
# Standalone Budibase stack (all-in-one image) exposed on host port 10000.
services:
  budibase:
    image: budibase/budibase:latest
    container_name: budibase
    restart: unless-stopped
    ports:
      - "10000:80"
    environment:
      # NOTE(review): placeholder credentials — change JWT_SECRET and the
      # MinIO keys before exposing this service beyond a trusted network.
      - JWT_SECRET=changeme
      - MINIO_ACCESS_KEY=budibase
      - MINIO_SECRET_KEY=budibase_secret
    volumes:
      - budibase_data:/data

volumes:
  budibase_data:
|
||||
56
recipes/ai/budibase-server/install.sh
Normal file
56
recipes/ai/budibase-server/install.sh
Normal file
@@ -0,0 +1,56 @@
|
||||
#!/usr/bin/env bash
# Budibase server recipe: writes a docker-compose stack under
# /srv/docker/budibase on the first free TCP port >= 10000 and starts it.
# Framework helpers: ask_to_install, ensure_root, detect_pkg_manager,
# install_docker, log, $SUDO, $YELLOW/$NC.
set -euo pipefail

if ask_to_install "Budibase Server"; then
  echo "=== BUDIBASE INSTALLATION ==="

  ensure_root
  detect_pkg_manager
  install_docker

  echo "[+] Erstelle Verzeichnis: /srv/docker/budibase"
  $SUDO mkdir -p /srv/docker/budibase
  cd /srv/docker/budibase

  # Scan upwards from 10000 until a port with no TCP listener is found.
  next_free_port() {
    local candidate=10000
    while ss -lnt | awk '{print $4}' | grep -q ":${candidate}$"; do
      candidate=$((candidate + 1))
    done
    echo "$candidate"
  }

  FREE_PORT="$(next_free_port)"
  echo "✅ Freier Port gefunden: $FREE_PORT"

  echo "[+] Schreibe docker-compose.yml"
  $SUDO tee docker-compose.yml >/dev/null <<EOF
services:
  budibase:
    image: budibase/budibase:latest
    container_name: budibase-$FREE_PORT
    restart: unless-stopped
    ports:
      - "$FREE_PORT:80"
    environment:
      - JWT_SECRET=changeme
      - MINIO_ACCESS_KEY=budibase
      - MINIO_SECRET_KEY=budibase_secret
    volumes:
      - budibase_data:/data

volumes:
  budibase_data:
EOF

  echo "[+] Starte Budibase..."
  $SUDO docker compose up -d

  echo ""
  echo "✅ Budibase ist installiert!"
  echo "→ Öffne im Browser: http://<IP>:$FREE_PORT"
else
  log "${YELLOW}⏭ Budibase Server übersprungen.${NC}"
fi
|
||||
|
||||
80
recipes/ai/builder-agent/install.sh
Normal file
80
recipes/ai/builder-agent/install.sh
Normal file
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env bash
# Builder-Agent recipe: writes /srv/ai/builder/builder.yml (model routing and
# system prompt) plus a generic run_tests.sh helper for the agent's workspace.
# Framework helpers: ensure_root, detect_pkg_manager, pkg_install,
# ask_to_install, log, $SUDO, $YELLOW/$NC.
set -euo pipefail
ensure_root
detect_pkg_manager
pkg_install curl
pkg_install git || true

if ask_to_install "Builder-Agent"; then
  echo ""
  read -rp "Ollama Router Base-URL (z.B. http://192.168.3.21:11437): " ROUTER_URL
  ROUTER_URL=${ROUTER_URL:-http://localhost:11437}
  echo ""
  read -rp "Projektverzeichnis (leer = auto-detect): " PROJECT_DIR
  if [ -z "${PROJECT_DIR}" ]; then
    # Prefer the enclosing git repository; fall back to the current directory.
    if git rev-parse --show-toplevel >/dev/null 2>&1; then
      PROJECT_DIR="$(git rev-parse --show-toplevel)"
    else
      PROJECT_DIR="$(pwd)"
    fi
  fi
  PROJECT_DIR="$(readlink -f "${PROJECT_DIR}")"
  BASE="/srv/ai/builder"
  $SUDO mkdir -p "${BASE}"

  # fix: UNQUOTED heredoc expands ROUTER_URL/PROJECT_DIR directly. The previous
  # quoted-heredoc + `sed` placeholder substitution broke on sed-special
  # characters ('&', '|') in paths or URLs.
  $SUDO tee "${BASE}/builder.yml" >/dev/null <<EOF
name: Builder-Agent
language: de
autonomy: soft
endpoint: "${ROUTER_URL}"
models:
  planner: "llama3.1:8b-instruct"
  reasoner: "huihui_ai/deepseek-r1-abliterated:14b"
  coder_primary: "qwen2.5-coder:14b"
  coder_secondary: "deepseek-coder-v2:16b"
  cpu_fallback: "qwen2.5-coder:7b"
workspace:
  project_dir: "${PROJECT_DIR}"
tests:
  enabled: true
  force_languages: []
prompts:
  system: |
    Du bist ein Builder-Agent (soft). Ziel: Probleme lösen mit minimaler Rückfrage.
    Strategie:
    1) Plane kurz (ToDo-Liste), dann implementiere iterativ im Workspace.
    2) Führe nach jedem Schritt Tests/Lints aus (falls verfügbar). Repariere Fehler selbstständig.
    3) Schreibe klare Commits; dokumentiere Änderungen kompakt in CHANGELOG.md.
    4) Nur bei sicherheitsrelevanten/zerstörerischen Aktionen Rückfrage.
    Liefere am Ende: TL;DR + nächste Schritte.
EOF

  # run_tests.sh is written verbatim (quoted heredoc): its $-expressions must
  # survive into the generated script.
  $SUDO tee "${BASE}/run_tests.sh" >/dev/null <<'EOF'
#!/usr/bin/env bash
set -euo pipefail
ROOT="${1:-.}"
cd "${ROOT}"
# Python: run pytest when any requirements.txt exists in the tree.
# (fix: the original used `ls -1 **/requirements.txt`, which needs bash's
# globstar option and therefore never matched nested files)
if [ -f "requirements.txt" ] || find . -mindepth 2 -name requirements.txt -print -quit 2>/dev/null | grep -q .; then
  command -v pytest >/dev/null 2>&1 && pytest -q || true
fi
# Node: only run scripts that are actually defined. Anchored match on npm's
# two-space-indented script names (fix: a bare substring grep also matched
# e.g. "pretest" or "lint:fix").
if [ -f "package.json" ]; then
  if npm run | grep -q "^  test$"; then npm test --silent || true; fi
  if npm run | grep -q "^  lint$"; then npm run lint --silent || true; fi
  if npm run | grep -q "^  typecheck$"; then npm run typecheck --silent || true; fi
fi
# PHP: prefer Pest over PHPUnit when both are vendored.
if [ -f "composer.json" ]; then
  if [ -f "vendor/bin/pest" ]; then vendor/bin/pest || true
  elif [ -f "vendor/bin/phpunit" ]; then vendor/bin/phpunit || true
  fi
fi
# Docker: a quiet build acts as a smoke test for the Dockerfile.
if [ -f "Dockerfile" ]; then
  docker build -q -t tmp-builder-test . || true
fi
# Shell: lint all scripts when shellcheck is available.
if command -v shellcheck >/dev/null 2>&1; then
  find . -type f -name "*.sh" -print0 | xargs -0 -r shellcheck || true
fi
EOF
  $SUDO chmod +x "${BASE}/run_tests.sh"
  echo "✅ Builder-Agent konfiguriert unter ${BASE} (Workspace: ${PROJECT_DIR})"
else
  log "${YELLOW}⏭ Builder-Agent übersprungen.${NC}"
fi
|
||||
2
recipes/ai/diagram-agent/install.sh
Normal file
2
recipes/ai/diagram-agent/install.sh
Normal file
@@ -0,0 +1,2 @@
|
||||
#!/usr/bin/env bash
# Placeholder recipe: the Diagram-Agent has no real installation steps yet.
printf '%s\n' "Diagram-Agent placeholder install script"
|
||||
12
recipes/ai/memory/README.md
Normal file
12
recipes/ai/memory/README.md
Normal file
@@ -0,0 +1,12 @@
|
||||
|
||||
# Memory Stack (External Ollama)
|
||||
|
||||
## Deploy
|
||||
```
|
||||
bash deploy.sh http://<OLLAMA-IP>:<PORT>
|
||||
```
|
||||
|
||||
## Test
|
||||
```
|
||||
curl http://localhost:8085/health
|
||||
```
|
||||
25
recipes/ai/memory/compose.yaml
Normal file
25
recipes/ai/memory/compose.yaml
Normal file
@@ -0,0 +1,25 @@
|
||||
|
||||
# RAG memory stack: Qdrant vector store + the FastAPI memory-api service.
# {{OLLAMA_API}} is substituted by deploy.sh before `docker compose up`.
# (fix: the top-level `version:` key was removed — it is obsolete and only
# produces a warning under Docker Compose v2.)
services:
  qdrant:
    image: qdrant/qdrant:latest
    container_name: memory-qdrant
    volumes:
      - /srv/docker/services/memory/qdrant:/qdrant/storage
    ports:
      - "127.0.0.1:6333:6333"   # bound to localhost only
    restart: unless-stopped

  memory-api:
    build:
      context: ./memory-api
    container_name: memory-api
    environment:
      - QDRANT_URL=http://qdrant:6333
      - OLLAMA_API={{OLLAMA_API}}
      - COLLECTION_NAME=chat-memory
    ports:
      - "127.0.0.1:8085:8085"   # bound to localhost only
    depends_on:
      - qdrant
    restart: unless-stopped
|
||||
32
recipes/ai/memory/deploy.sh
Normal file
32
recipes/ai/memory/deploy.sh
Normal file
@@ -0,0 +1,32 @@
|
||||
|
||||
#!/usr/bin/env bash
# Deploys the RAG memory stack (Qdrant + memory-api) to
# /srv/docker/services/memory and starts it via docker compose.
# Usage: deploy.sh [OLLAMA_API_URL]   (prompts interactively when omitted)
# Framework helpers: ensure_root, detect_pkg_manager, install_docker,
# ask_to_install, log, $SUDO, $YELLOW/$NC.
set -euo pipefail

ensure_root
detect_pkg_manager
install_docker

if ask_to_install "RAG Memory Stack (Qdrant + Memory API)"; then
  log "=== RAG Memory Stack Installation ==="

  # fix: accept the Ollama URL as $1 — the README documents
  # `bash deploy.sh http://<OLLAMA-IP>:<PORT>` but the argument was ignored.
  # The interactive prompt remains the fallback, so behavior without an
  # argument is unchanged.
  OLLAMA_API_URL="${1:-}"
  if [ -z "$OLLAMA_API_URL" ]; then
    read -rp "Ollama API URL (z.B. http://127.0.0.1:11434): " OLLAMA_API_URL
    OLLAMA_API_URL=${OLLAMA_API_URL:-http://127.0.0.1:11434}
  fi

  BASE="/srv/docker/services/memory"
  $SUDO mkdir -p "$BASE/qdrant"
  $SUDO cp -r "$(dirname "${BASH_SOURCE[0]}")/memory-api" "$BASE/"
  $SUDO cp "$(dirname "${BASH_SOURCE[0]}")/compose.yaml" "$BASE/docker-compose.yml"
  cd "$BASE"

  # Substitute the {{OLLAMA_API}} placeholder written into compose.yaml.
  $SUDO sed -i "s|{{OLLAMA_API}}|$OLLAMA_API_URL|g" docker-compose.yml

  log "🚀 Starte RAG Memory Stack..."
  $SUDO docker compose up -d --build

  # Best-effort: pre-pull the embedding model used by memory-api.
  log "Attempting to pull embedding model from remote Ollama..."
  $SUDO curl -s -X POST "$OLLAMA_API_URL/api/pull" -H 'Content-Type: application/json' -d '{"name": "nomic-embed-text"}' || log "Notice: Model pull failed (possibly using a gateway). Continuing."

  log "✅ RAG Memory Stack läuft unter: http://<server-ip>:8085"
else
  log "${YELLOW}⏭ RAG Memory Stack übersprungen.${NC}"
fi
|
||||
8
recipes/ai/memory/memory-api/Dockerfile
Normal file
8
recipes/ai/memory/memory-api/Dockerfile
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
# Minimal image for the memory-api FastAPI service (served on port 8085).
FROM python:3.11-slim
WORKDIR /app
# Install dependencies first so code changes don't bust the pip layer cache.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY app.py .
EXPOSE 8085
# fix: run through uvicorn — app.py only defines `app` and has no __main__
# entry point, so the previous `CMD ["python", "app.py"]` exited immediately
# and the container died. uvicorn is already listed in requirements.txt.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8085"]
|
||||
40
recipes/ai/memory/memory-api/app.py
Normal file
40
recipes/ai/memory/memory-api/app.py
Normal file
@@ -0,0 +1,40 @@
|
||||
|
||||
from fastapi import FastAPI
import requests, os
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams
import hashlib
import json
import uuid

app = FastAPI()

# Service configuration comes from the container environment (see compose.yaml).
QDRANT_URL = os.getenv("QDRANT_URL")
OLLAMA_API = os.getenv("OLLAMA_API")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "chat-memory")

client = QdrantClient(url=QDRANT_URL)


@app.get("/health")
def health():
    """Liveness probe: reports the configured backend endpoints."""
    return {"status": "ok", "qdrant": QDRANT_URL, "ollama": OLLAMA_API}


def embed(text):
    """Embed `text` via the remote Ollama embeddings endpoint.

    Raises requests.HTTPError on backend failure (fix: previously a failed
    call surfaced as an opaque KeyError on the response body) and uses a
    timeout so a dead Ollama node cannot hang the request forever.
    """
    r = requests.post(
        f"{OLLAMA_API}/api/embeddings",
        json={"model": "nomic-embed-text", "prompt": text},
        timeout=60,
    )
    r.raise_for_status()
    return r.json()["embedding"]


def _point_id(text):
    """Deterministic Qdrant point id for `text`.

    fix: Qdrant only accepts unsigned integers or UUIDs as point ids, so the
    raw sha256 hexdigest used previously was rejected on upsert; fold the
    first 128 bits of the digest into a UUID instead (still deterministic,
    so re-storing the same text overwrites the same point).
    """
    digest = hashlib.sha256(text.encode()).hexdigest()
    return str(uuid.UUID(digest[:32]))


def _ensure_collection(vector_size):
    """Create the collection on first use.

    fix: upsert fails if the collection does not exist, and nothing ever
    created it. Vector size is taken from the first embedding seen.
    """
    try:
        client.get_collection(COLLECTION_NAME)
    except Exception:
        client.create_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
        )


@app.post("/store")
def store(item: dict):
    """Embed `item["text"]` and upsert it (plus optional metadata) as a point."""
    text = item["text"]
    metadata = item.get("metadata", {})
    vec = embed(text)
    _ensure_collection(len(vec))
    client.upsert(
        collection_name=COLLECTION_NAME,
        points=[PointStruct(id=_point_id(text), vector=vec, payload={"text": text, **metadata})],
    )
    return {"stored": True}


@app.post("/search")
def search(query: dict):
    """Return the `top_k` (default 5) stored texts most similar to `query["text"]`."""
    q = query["text"]
    top_k = query.get("top_k", 5)
    vec = embed(q)
    result = client.search(collection_name=COLLECTION_NAME, query_vector=vec, limit=top_k)
    return [{"score": r.score, "text": r.payload["text"]} for r in result]


if __name__ == "__main__":
    # Allow `python app.py` (the original Dockerfile CMD) to actually serve.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8085)
|
||||
4
recipes/ai/memory/memory-api/requirements.txt
Normal file
4
recipes/ai/memory/memory-api/requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
fastapi
|
||||
uvicorn
|
||||
requests
|
||||
qdrant-client
|
||||
14
recipes/ai/ollama-router/README.md
Normal file
14
recipes/ai/ollama-router/README.md
Normal file
@@ -0,0 +1,14 @@
|
||||
# Ollama Router (new schema)
|
||||
|
||||
Dieses Paket folgt dem Beispiel-Schema (beispiel.zip). Es enthält:
|
||||
- `recipes/ai/ollama-router/install.sh` – interaktive IP/Port-Abfrage (ohne ENV)
- `recipes/ai/ollama-router/docker-compose.yml` – nutzt externes Netzwerk `ai`
- `recipes/ai/ollama-router/config.yml` – wird vom Install-Skript erzeugt
|
||||
|
||||
## Install
|
||||
```bash
|
||||
bash recipes/ai/ollama-router/install.sh
|
||||
cd /srv/docker/services/ollama-router
|
||||
docker compose up -d
|
||||
```
|
||||
CPU-Fallback-Modelle werden automatisch auf dem CPU-Node gepullt, damit **Strategie-/Denker-/Gedächtnis-Agenten** immer laufen.
|
||||
102
recipes/ai/ollama-router/install.sh
Normal file
102
recipes/ai/ollama-router/install.sh
Normal file
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env bash
# Ollama Router recipe: asks for the three backend nodes (NVIDIA / AMD / CPU),
# writes config.yml + docker-compose.yml under /srv/docker/services/ollama-router
# and pre-pulls the CPU fallback models so the agents always have a backend.
# Framework helpers: ensure_root, detect_pkg_manager, pkg_install,
# install_docker, ask_to_install, log, $SUDO, $YELLOW/$NC.
set -euo pipefail
ensure_root
detect_pkg_manager
pkg_install curl
install_docker

if ask_to_install "Ollama Router"; then
  echo ""
  read -rp "Listen-Port des Router (Default 11437): " ROUTER_PORT
  ROUTER_PORT=${ROUTER_PORT:-11437}
  echo ""
  read -rp "NVIDIA Node IP: " NVIDIA_IP
  read -rp "NVIDIA Node Port (Default 11436): " NVIDIA_PORT
  NVIDIA_PORT=${NVIDIA_PORT:-11436}
  echo ""
  read -rp "AMD (ROCm) Node IP: " AMD_IP
  read -rp "AMD Node Port (Default 11435): " AMD_PORT
  AMD_PORT=${AMD_PORT:-11435}
  echo ""
  read -rp "CPU-only Node IP: " CPU_IP
  read -rp "CPU Node Port (Default 11434): " CPU_PORT
  CPU_PORT=${CPU_PORT:-11434}

  BASE="/srv/docker/services/ollama-router"
  $SUDO mkdir -p "${BASE}"
  cd "${BASE}"

  # fix: UNQUOTED heredoc — node IPs/ports are expanded directly by the
  # shell, removing the fragile placeholder + sed post-processing step.
  # Route order per model = preference order (first reachable node wins).
  $SUDO tee config.yml >/dev/null <<EOF
routes:
  llama3.1:8b-instruct:
    - url: http://${NVIDIA_IP}:${NVIDIA_PORT}
    - url: http://${AMD_IP}:${AMD_PORT}
    - url: http://${CPU_IP}:${CPU_PORT}
  mistral-nemo:12b:
    - url: http://${AMD_IP}:${AMD_PORT}
    - url: http://${NVIDIA_IP}:${NVIDIA_PORT}
    - url: http://${CPU_IP}:${CPU_PORT}
  huihui_ai/deepseek-r1-abliterated:14b:
    - url: http://${AMD_IP}:${AMD_PORT}
    - url: http://${NVIDIA_IP}:${NVIDIA_PORT}
    - url: http://${CPU_IP}:${CPU_PORT}
  phi3:medium-128k:
    - url: http://${AMD_IP}:${AMD_PORT}
    - url: http://${NVIDIA_IP}:${NVIDIA_PORT}
    - url: http://${CPU_IP}:${CPU_PORT}
  mxbai-embed-large:
    - url: http://${CPU_IP}:${CPU_PORT}
    - url: http://${NVIDIA_IP}:${NVIDIA_PORT}
    - url: http://${AMD_IP}:${AMD_PORT}
  phi3:mini:
    - url: http://${CPU_IP}:${CPU_PORT}
    - url: http://${NVIDIA_IP}:${NVIDIA_PORT}
    - url: http://${AMD_IP}:${AMD_PORT}
  gemma2:2b-instruct-q6_K:
    - url: http://${CPU_IP}:${CPU_PORT}
    - url: http://${NVIDIA_IP}:${NVIDIA_PORT}
    - url: http://${AMD_IP}:${AMD_PORT}
  qwen2.5-coder:14b:
    - url: http://${NVIDIA_IP}:${NVIDIA_PORT}
    - url: http://${AMD_IP}:${AMD_PORT}
    - url: http://${CPU_IP}:${CPU_PORT}
  deepseek-coder-v2:16b:
    - url: http://${AMD_IP}:${AMD_PORT}
    - url: http://${NVIDIA_IP}:${NVIDIA_PORT}
    - url: http://${CPU_IP}:${CPU_PORT}
  qwen2.5-coder:7b:
    - url: http://${CPU_IP}:${CPU_PORT}
    - url: http://${NVIDIA_IP}:${NVIDIA_PORT}
    - url: http://${AMD_IP}:${AMD_PORT}
EOF

  # (the obsolete `version:` key was dropped — ignored by Compose v2)
  $SUDO tee docker-compose.yml >/dev/null <<EOF
services:
  ollama-router:
    image: ghcr.io/ollama/ollama-router:latest
    container_name: ollama-router
    restart: unless-stopped
    networks: [ai]
    volumes:
      - ./config.yml:/app/config.yml:ro
    ports:
      - "${ROUTER_PORT}:11437"
networks:
  ai:
    external: true
EOF

  # Shared docker network for the AI services (create only if missing).
  $SUDO docker network inspect ai >/dev/null 2>&1 || $SUDO docker network create ai

  # Pre-pull the small CPU fallback models so Strategie-/Denker-/Gedächtnis-
  # agents always have a working backend.
  CPU_MODELS=(
    "phi3:mini"
    "gemma2:2b-instruct-q6_K"
    "mxbai-embed-large"
    "qwen2.5-coder:7b"
  )
  for m in "${CPU_MODELS[@]}"; do
    echo "→ Pull ${m} on CPU node ${CPU_IP}:${CPU_PORT}"
    # fix: the JSON body needs escaped quotes — the original
    #   -d "{"name":"${m}"}"   sent invalid JSON ({name:model}).
    $SUDO curl -fsSL -X POST "http://${CPU_IP}:${CPU_PORT}/api/pull" -d "{\"name\":\"${m}\"}" || true
  done
  log "✅ Router konfiguriert in ${BASE}"
  log "ℹ️ Start: cd ${BASE} && docker compose up -d"
else
  log "${YELLOW}⏭ Ollama Router übersprungen.${NC}"
fi
|
||||
11
recipes/ai/ollama-server/docker-compose.yml
Normal file
11
recipes/ai/ollama-server/docker-compose.yml
Normal file
@@ -0,0 +1,11 @@
|
||||
# Single Ollama server exposed on the default API port; model weights are
# kept in the named volume so container recreation keeps downloaded models.
services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    restart: unless-stopped
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
volumes:
  ollama_data:
|
||||
53
recipes/ai/ollama-server/install.sh
Normal file
53
recipes/ai/ollama-server/install.sh
Normal file
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env bash
# Ollama server recipe: runs the official Ollama image via docker compose on
# the first free TCP port >= 11434 and optionally pulls an initial model.
# Framework helpers: ask_to_install, ensure_root, detect_pkg_manager,
# install_docker, log, $SUDO, $YELLOW/$NC.
set -euo pipefail
if ask_to_install "Ollama Server"; then
  echo "=== OLLAMA SERVER INSTALLATION ==="

  ensure_root
  detect_pkg_manager
  install_docker

  $SUDO mkdir -p /srv/docker/ollama
  cd /srv/docker/ollama

  # Find the next port with no TCP listener, starting at the Ollama default.
  find_free_port() {
    PORT=11434
    while ss -lnt | awk '{print $4}' | grep -q ":$PORT$"; do
      PORT=$((PORT + 1))
    done
    echo "$PORT"
  }

  FREE_PORT=$(find_free_port)
  echo "✅ Freier Port gefunden: $FREE_PORT"

  $SUDO tee docker-compose.yml >/dev/null <<EOF
services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama-$FREE_PORT
    restart: unless-stopped
    ports:
      - "$FREE_PORT:11434"
    volumes:
      - ollama_data:/root/.ollama
volumes:
  ollama_data:
EOF

  $SUDO docker compose up -d
  echo "Ollama Server läuft auf Port $FREE_PORT"

  # fix: -r keeps backslashes in the answer intact (was a bare `read -p`).
  read -rp "Modell jetzt herunterladen? (z.B. llama3 / Enter = nein): " MODEL
  if [ -n "$MODEL" ]; then
    # -N streams the pull progress; best-effort (|| true) so a failed pull
    # does not abort the install under `set -e`.
    $SUDO curl -N -X POST http://127.0.0.1:$FREE_PORT/api/pull \
      -H "Content-Type: application/json" \
      -d "{\"name\":\"$MODEL\"}" || true
  fi

  echo "✅ Fertig! URL: http://<server-ip>:$FREE_PORT"
else
  log "${YELLOW}⏭ Ollama Server übersprungen.${NC}"
fi
|
||||
|
||||
32
recipes/ai/rag-crawler/EXTRAS.md
Normal file
32
recipes/ai/rag-crawler/EXTRAS.md
Normal file
@@ -0,0 +1,32 @@
|
||||
# EXTRAS: systemd Timer (optional)
|
||||
|
||||
## /etc/systemd/system/rag-crawler.service
|
||||
```
|
||||
[Unit]
|
||||
Description=RAG Crawler Update (drip)
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
User=root
|
||||
ExecStart=/bin/bash -lc 'source /srv/ai/rag-crawler/venv/bin/activate && python3 /srv/ai/rag-crawler/crawler/main.py --mode=drip --budget 1'
|
||||
```
|
||||
|
||||
## /etc/systemd/system/rag-crawler.timer
|
||||
```
|
||||
[Unit]
|
||||
Description=Run RAG Crawler drip hourly
|
||||
|
||||
[Timer]
|
||||
OnCalendar=hourly
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
```
|
||||
|
||||
## Enable
|
||||
```
|
||||
systemctl daemon-reload
|
||||
systemctl enable --now rag-crawler.timer
|
||||
```
|
||||
40
recipes/ai/rag-crawler/README.md
Normal file
40
recipes/ai/rag-crawler/README.md
Normal file
@@ -0,0 +1,40 @@
|
||||
# RAG Crawler – Vollversion (freundlich & getrennt vom RAG-Speicher)
|
||||
|
||||
Dieser Crawler läuft **separat** vom RAG/Memory-Stack. Er:
|
||||
- respektiert `robots.txt`
|
||||
- nutzt zufällige Delays (min/max), per-Domain-Quoten & Limitierung der Parallelität
|
||||
- unterstützt zwei Modi: `update` (normal) und `drip` (sehr langsam/menschlich)
|
||||
- speichert Texte/PDFs im Dateisystem (Corpus), optional „drippt“ er nur wenige Seiten je Lauf
|
||||
- hat einen separaten **Ingest** nach deiner Memory-API (`/store`), kompatibel zu deiner `memory-api`
|
||||
|
||||
## Schnellstart
|
||||
```bash
|
||||
# 1) installieren
|
||||
bash recipes/ai/rag-crawler/install.sh
|
||||
|
||||
# 2) Quellen bearbeiten
|
||||
nano /srv/ai/rag-crawler/crawler/sources.yml
|
||||
|
||||
# 3) Crawl (vollständig/regelmäßig)
|
||||
source /srv/ai/rag-crawler/venv/bin/activate
|
||||
python3 /srv/ai/rag-crawler/crawler/main.py --mode=update
|
||||
|
||||
# 4) „Drip“-Modus (z.B. stündlich je Domain nur 1 URL)
|
||||
python3 /srv/ai/rag-crawler/crawler/main.py --mode=drip --budget 1
|
||||
|
||||
# 5) Ingest aller neuen/aktualisierten Texte in die Memory-API
|
||||
python3 /srv/ai/rag-crawler/crawler/ingest.py --root /srv/ai/corpus --memory http://127.0.0.1:8085
|
||||
```
|
||||
|
||||
## Scheduling (Beispiele)
|
||||
- Crontab:
|
||||
`@hourly source /srv/ai/rag-crawler/venv/bin/activate && python3 /srv/ai/rag-crawler/crawler/main.py --mode=drip --budget 1`
|
||||
`*/10 * * * * source /srv/ai/rag-crawler/venv/bin/activate && python3 /srv/ai/rag-crawler/crawler/ingest.py --root /srv/ai/corpus --memory http://127.0.0.1:8085`
|
||||
- systemd Timer (optional): siehe `EXTRAS.md`
|
||||
|
||||
## Ordner
|
||||
- `/srv/ai/rag-crawler` – Crawler + venv
|
||||
- `/srv/ai/corpus` – Rohdaten (Text/PDF) + `.crawler_state.json`
|
||||
|
||||
## Hinweis
|
||||
- **Keine ENV notwendig** – alle Werte werden interaktiv abgefragt oder in `sources.yml` gepflegt.
|
||||
43
recipes/ai/rag-crawler/crawler/ingest.py
Normal file
43
recipes/ai/rag-crawler/crawler/ingest.py
Normal file
@@ -0,0 +1,43 @@
|
||||
#!/usr/bin/env python3
"""Push crawled corpus texts into the memory API's /store endpoint."""
import os, sys, json, pathlib, argparse, requests


def iter_texts(root):
    """Yield every *.txt file below `root` (recursively)."""
    yield from pathlib.Path(root).rglob("*.txt")


def store(memory_url, collection, text, meta):
    """POST one document to the memory API and return the parsed response.

    Note: `collection` is accepted for symmetry with the CLI flag but is not
    transmitted — the current /store endpoint has no collection field.
    """
    body = {
        "text": text,
        "metadata": {"source": meta.get("source"), "path": meta.get("path")},
    }
    resp = requests.post(f"{memory_url}/store", json=body, timeout=30)
    resp.raise_for_status()
    return resp.json()


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--root", required=True, help="Corpus-Root (z.B. /srv/ai/corpus)")
    parser.add_argument("--memory", required=False, default=None, help="Memory-API URL (z.B. http://127.0.0.1:8085)")
    parser.add_argument("--collection", default="chat-memory")
    args = parser.parse_args()

    # Fall back to the memory URL configured in sources.yml, if present.
    if not args.memory:
        config_file = pathlib.Path(__file__).with_name("sources.yml")
        if config_file.exists():
            import yaml
            settings = yaml.safe_load(config_file.read_text())
            args.memory = settings.get("memory", {}).get("url")

    if not args.memory:
        print("Bitte --memory <URL> angeben oder in sources.yml hinterlegen.", file=sys.stderr)
        sys.exit(1)

    # Best effort per file: a single bad document must not stop the run.
    for text_file in iter_texts(args.root):
        try:
            content = text_file.read_text(errors="ignore")
            store(args.memory, args.collection, content, {"path": str(text_file), "source": "crawler"})
            print("✔ stored", text_file)
        except Exception as e:
            print("✖", text_file, e, file=sys.stderr)


if __name__ == "__main__":
    main()
|
||||
254
recipes/ai/rag-crawler/crawler/main.py
Normal file
254
recipes/ai/rag-crawler/crawler/main.py
Normal file
@@ -0,0 +1,254 @@
|
||||
#!/usr/bin/env python3
# Polite RAG crawler: fetches seed URLs listed in crawler/sources.yml,
# honours robots.txt and randomized per-domain delays, and stores extracted
# text/PDFs under the corpus root for later ingestion (see ingest.py).
import asyncio, aiohttp, aiohttp.client_exceptions as aiox
import os, time, random, hashlib, json, re, pathlib
from urllib.parse import urljoin, urldefrag, urlparse
from bs4 import BeautifulSoup
from dateutil.parser import parse as dtparse
import yaml, tldextract, ssl

# uvloop is an optional speed-up; fall back to the default loop if missing.
try:
    import uvloop
    uvloop.install()
except Exception:
    pass

# ---- Load configuration ----
# BASE defaults to the current working directory; the install script is
# expected to run this from /srv/ai/rag-crawler.
BASE = os.environ.get("RAG_CRAWLER_BASE", os.getcwd())
CONF_PATH = os.path.join(BASE, "crawler", "sources.yml")
with open(CONF_PATH, "r") as f:
    CFG = yaml.safe_load(f)

POLICY = CFG.get("policy", {})    # politeness knobs: delays, robots, concurrency
STORAGE = CFG.get("storage", {})  # corpus directory layout
MEMORY = CFG.get("memory", {})    # memory-API endpoint (consumed by ingest.py)
SEEDS = CFG.get("seeds", [])      # list of {url, include, exclude, revisit}

ROOT = pathlib.Path(STORAGE.get("root", "/srv/ai/corpus")).resolve()
TEXT_DIR = ROOT / STORAGE.get("text_subdir", "text")
PDF_DIR = ROOT / STORAGE.get("pdf_subdir", "pdf")
TEXT_DIR.mkdir(parents=True, exist_ok=True)
PDF_DIR.mkdir(parents=True, exist_ok=True)
STATE_PATH = ROOT / ".crawler_state.json"

STATE = {"visited": {}}  # url -> {etag, last_modified, ts}
if STATE_PATH.exists():
    try:
        STATE = json.loads(STATE_PATH.read_text())
    except Exception:
        pass  # corrupt state file: start fresh rather than crash

def save_state():
    # Best-effort persistence; losing state only causes re-crawling later.
    try:
        STATE_PATH.write_text(json.dumps(STATE))
    except Exception:
        pass

# ---- Robots & per-domain quotas ----
ROBOTS_CACHE = {}          # registrable domain -> parsed robots.txt rules
DOMAIN_NEXT_ALLOWED = {}   # registrable domain -> unix time of next allowed request
|
||||
|
||||
def domain_key(url):
    """Registrable domain of `url` (e.g. 'example.co.uk'), used as the
    politeness/caching key."""
    ext = tldextract.extract(url)
    return f"{ext.domain}.{ext.suffix}"

async def fetch_robots(session, base_url):
    """Fetch and cache robots.txt for the domain of `base_url`.

    Any fetch failure or non-200 status is treated as an empty robots file,
    i.e. everything is allowed.
    """
    dom = domain_key(base_url)
    if dom in ROBOTS_CACHE:
        return ROBOTS_CACHE[dom]
    robots_url = urljoin(f"{urlparse(base_url).scheme}://{urlparse(base_url).netloc}", "/robots.txt")
    # NOTE(review): imported lazily inside the function — presumably so the
    # module loads without the package until the first robots lookup; confirm.
    from robotexclusionrulesparser import RobotExclusionRulesParser as Robots
    rp = Robots()
    try:
        async with session.get(robots_url, timeout=10) as r:
            if r.status == 200:
                rp.parse(await r.text())
            else:
                rp.parse("")
    except Exception:
        rp.parse("")
    ROBOTS_CACHE[dom] = rp
    return rp
|
||||
|
||||
def polite_delay_for(url):
    """Return the number of seconds the caller must sleep before hitting `url`.

    Reserves the next request slot for the URL's domain in DOMAIN_NEXT_ALLOWED.
    fix: previously, a caller that was told to wait did NOT advance the
    domain's reservation, so several queued requests could all wake up and hit
    the domain at the same moment; now every call pushes the next-allowed
    timestamp forward by a random delay in [delay_min, delay_max].
    """
    dmin = int(POLICY.get("delay_min_seconds", 5))
    dmax = int(POLICY.get("delay_max_seconds", 60))
    d = domain_key(url)
    t = DOMAIN_NEXT_ALLOWED.get(d, 0)
    now = time.time()
    if now < t:
        # Reserve the slot after the pending one, then tell the caller to wait.
        DOMAIN_NEXT_ALLOWED[d] = t + random.uniform(dmin, dmax)
        return t - now
    DOMAIN_NEXT_ALLOWED[d] = now + random.uniform(dmin, dmax)
    return 0
|
||||
|
||||
def norm_url(base, link):
    """Resolve `link` against `base` and strip any '#fragment' suffix."""
    absolute = urljoin(base, link)
    cleaned, _fragment = urldefrag(absolute)
    return cleaned
|
||||
|
||||
def fnmatch(text, pat):
    """Glob-style match of `text` against `pat`: '**' matches anything
    (including '/'), '*' matches any run of non-'/' characters.

    fix: the old implementation did pat.replace("**", ".*").replace("*", ...),
    so the '*' inside the freshly produced '.*' was replaced AGAIN, turning
    '**' into '.[^/]*' — '**' could never span path segments. It also left
    regex metacharacters (e.g. '.') unescaped, so 'foo.bar' matched 'fooXbar'.
    Escape the pattern first, then translate the glob tokens.
    """
    escaped = re.escape(pat)
    # '**' first; its replacement '.*' contains no escaped '*' so the second
    # substitution cannot corrupt it.
    escaped = escaped.replace(r"\*\*", ".*")
    escaped = escaped.replace(r"\*", "[^/]*")
    return re.fullmatch(escaped, text) is not None
|
||||
|
||||
def allowed_by_patterns(url, inc, exc):
    """True when `url` matches the include patterns (or none were given)
    and matches no exclude pattern."""
    included = any(fnmatch(url, pattern) for pattern in inc) if inc else True
    excluded = bool(exc) and any(fnmatch(url, pattern) for pattern in exc)
    return included and not excluded
|
||||
|
||||
def should_revisit(url, revisit_str):
    """True when `url` was never crawled, or its last crawl is older than the
    revisit interval (e.g. "30d"; unparsable values fall back to 30 days)."""
    info = STATE["visited"].get(url, {})
    if not info:
        return True
    try:
        days = int(revisit_str.rstrip("d"))
    except Exception:
        days = 30
    age_seconds = time.time() - info.get("ts", 0)
    return age_seconds > days * 86400
|
||||
|
||||
async def fetch(session, url, etag=None, lastmod=None):
    """GET `url` with conditional headers, after the polite per-domain delay.

    Returns (body, meta): body is bytes or None; meta carries 'status'
    (HTTP status code, 304, or the string 'error') plus response 'headers'.
    Never raises — transport errors are reported via meta['status'].
    """
    headers = {"User-Agent": POLICY.get("user_agent", "polite-crawler/1.0")}
    if etag:
        headers["If-None-Match"] = etag
    if lastmod:
        headers["If-Modified-Since"] = lastmod
    # NOTE(review): main() builds the connector with ssl=False while this
    # per-request context re-enables default certificate verification —
    # confirm which behaviour is intended.
    ssl_ctx = ssl.create_default_context()
    try:
        # Sleep out the politeness delay reserved for this URL's domain.
        delay = polite_delay_for(url)
        if delay > 0:
            await asyncio.sleep(delay)
        async with session.get(url, headers=headers, ssl=ssl_ctx, timeout=30) as r:
            if r.status == 304:
                # Not modified since the last crawl; caller refreshes timestamps.
                return None, {"status": 304, "headers": {}}
            body = await r.read()
            return body, {"status": r.status, "headers": dict(r.headers)}
    except Exception as e:
        return None, {"status": "error", "error": str(e)}
|
||||
|
||||
def _prepare_parent(path: pathlib.Path):
    """Ensure the directory containing `path` exists."""
    path.parent.mkdir(parents=True, exist_ok=True)

def save_binary(path: pathlib.Path, content: bytes):
    """Write raw bytes to `path`, creating parent directories as needed."""
    _prepare_parent(path)
    path.write_bytes(content)

def save_text(path: pathlib.Path, text: str):
    """Write text to `path`, creating parent directories as needed."""
    _prepare_parent(path)
    path.write_text(text)
|
||||
|
||||
def is_pdf(headers):
    """True when the response Content-Type header denotes a PDF document."""
    content_type = headers.get("Content-Type", "").lower()
    if "application/pdf" in content_type:
        return True
    return content_type.endswith("/pdf")
|
||||
|
||||
def extract_text_html(body: bytes) -> str:
    """Extract the visible text of an HTML document, one non-empty stripped
    line per output line."""
    soup = BeautifulSoup(body, "lxml")
    # Drop scripts/styles and common navigation chrome before extraction.
    for tag in soup(["script","style","noscript","nav","footer","header","aside"]):
        tag.decompose()
    text = soup.get_text("\n")
    return "\n".join(line.strip() for line in text.splitlines() if line.strip())
|
||||
|
||||
def path_for(url, typ="text"):
    """Deterministic corpus path for `url`: a 16-hex-char sha256 prefix under
    TEXT_DIR (*.txt) or PDF_DIR (*.pdf)."""
    stem = hashlib.sha256(url.encode()).hexdigest()[:16]
    if typ == "text":
        return TEXT_DIR / f"{stem}.txt"
    return PDF_DIR / f"{stem}.pdf"
|
||||
|
||||
async def crawl_seed(session, seed, budget=0):
    """Breadth-first crawl of one seed entry from sources.yml.

    seed: dict with 'url' plus optional 'include'/'exclude' glob patterns and
    a 'revisit' interval (default "30d"). budget > 0 caps the number of
    successfully processed URLs this run (drip mode); 0 means unlimited.
    Updates the module-level STATE and persists it after every page.
    """
    base = seed["url"]
    include = seed.get("include", [])
    exclude = seed.get("exclude", [])
    revisit = seed.get("revisit", "30d")

    # robots: skip the whole seed if its start URL is disallowed
    if POLICY.get("obey_robots_txt", True):
        rp = await fetch_robots(session, base)
        if not rp.is_allowed("*", base):
            return

    queue = [base]
    seen = set()
    processed = 0

    while queue:
        url = queue.pop(0)
        if url in seen:
            continue
        seen.add(url)

        # Per-URL robots check (cached per domain by fetch_robots).
        if POLICY.get("obey_robots_txt", True):
            rp = await fetch_robots(session, url)
            if not rp.is_allowed("*", url):
                continue

        if not allowed_by_patterns(url, include, exclude):
            continue

        # Conditional-request headers from the previous crawl, if any.
        info = STATE["visited"].get(url, {})
        etag = info.get("etag")
        lastmod = info.get("last_modified")
        if not should_revisit(url, revisit):
            continue

        body, meta = await fetch(session, url, etag, lastmod)
        status = meta.get("status")
        headers = meta.get("headers", {})

        if status == 304:
            # Unchanged: just refresh the timestamp so revisit timing works.
            STATE["visited"][url] = {"etag": etag, "last_modified": lastmod, "ts": time.time()}
            save_state()
            continue
        if status != 200 or body is None:
            continue

        if is_pdf(headers):
            out_pdf = path_for(url, "pdf")
            save_binary(out_pdf, body)
            # Best-effort text extraction from the stored PDF.
            try:
                from pdfminer.high_level import extract_text as pdf_extract
                txt = pdf_extract(str(out_pdf))
                save_text(path_for(url, "text"), txt)
            except Exception:
                pass
        else:
            txt = extract_text_html(body)
            save_text(path_for(url, "text"), txt)
            # Collect links; depth is bounded implicitly via revisit timing
            # and the per-run budget rather than an explicit depth limit.
            soup = BeautifulSoup(body, "lxml")
            for a in soup.find_all("a", href=True):
                href = urljoin(url, a["href"])
                href, _ = urldefrag(href)
                if href.startswith("http"):
                    queue.append(href)

        STATE["visited"][url] = {
            "etag": headers.get("ETag"),
            "last_modified": headers.get("Last-Modified"),
            "ts": time.time(),
        }
        save_state()

        processed += 1
        if budget and processed >= budget:
            break
|
||||
|
||||
async def main(mode="update", budget=0):
    """Crawl every configured seed concurrently with one shared session.

    Mode "drip" processes a small per-seed budget (at least 1 URL) per
    invocation; mode "update" crawls without a per-seed limit.
    """
    total_conn = int(POLICY.get("concurrency_total", 4))
    timeout = aiohttp.ClientTimeout(total=120)
    # NOTE(review): ssl=False disables certificate verification for all
    # requests — confirm this is intentional before crawling sensitive hosts.
    connector = aiohttp.TCPConnector(limit=total_conn, ssl=False)
    async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session:
        if mode == "drip":
            effective_budget = budget or 1
        else:
            effective_budget = 0  # unbegrenzt im update-Modus
        jobs = [crawl_seed(session, seed, budget=effective_budget) for seed in SEEDS]
        await asyncio.gather(*jobs, return_exceptions=True)
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # CLI: choose crawl mode and the per-seed URL budget (drip mode only).
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--mode",
        choices=["update", "drip"],
        default="update",
        help="update=vollständig, drip=sehr langsam mit Budget je Seed",
    )
    cli.add_argument("--budget", type=int, default=1, help="URLs pro Seed (nur drip)")
    opts = cli.parse_args()
    asyncio.run(main(opts.mode, opts.budget))
|
||||
90
recipes/ai/rag-crawler/install.sh
Normal file
90
recipes/ai/rag-crawler/install.sh
Normal file
@@ -0,0 +1,90 @@
|
||||
#!/usr/bin/env bash
set -euo pipefail

# Helper functions from the base framework (see beispiel.zip) are expected:
# ensure_root, detect_pkg_manager, pkg_install, ask_to_install, log, $SUDO.
ensure_root
detect_pkg_manager
pkg_install python3
pkg_install python3-venv || true
pkg_install curl

if ask_to_install "RAG Crawler"; then
  echo ""
  read -rp "Basis-Pfad für den Crawler [default: /srv/ai/rag-crawler]: " BASE
  BASE=${BASE:-/srv/ai/rag-crawler}
  $SUDO mkdir -p "${BASE}"
else
  log "${YELLOW}⏭ RAG Crawler übersprungen.${NC}"
  exit 0
fi

echo ""
read -rp "Zielverzeichnis für den Corpus [default: /srv/ai/corpus]: " CORPUS_DIR
CORPUS_DIR=${CORPUS_DIR:-/srv/ai/corpus}
$SUDO mkdir -p "${CORPUS_DIR}"

echo ""
read -rp "Memory-API URL (z.B. http://127.0.0.1:8085) [default: http://127.0.0.1:8085]: " MEMORY_URL
MEMORY_URL=${MEMORY_URL:-http://127.0.0.1:8085}

# Copy crawler sources into BASE.
SRC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
$SUDO mkdir -p "${BASE}/crawler"
$SUDO cp -r "${SRC_DIR}/crawler"/* "${BASE}/crawler/"
$SUDO cp "${SRC_DIR}/requirements.txt" "${BASE}/requirements.txt"

# Virtualenv.
# BUGFIX: the previous '$SUDO source …/activate' cannot work — 'source' is a
# shell builtin, not an executable, so sudo fails and the following pip calls
# would run outside the venv. Invoke the venv's own pip binary directly.
$SUDO python3 -m venv "${BASE}/venv"
$SUDO "${BASE}/venv/bin/pip" install -U pip
$SUDO "${BASE}/venv/bin/pip" install -r "${BASE}/requirements.txt"

# Initialize sources.yml only if it does not exist yet (keep user edits).
if [ ! -f "${BASE}/crawler/sources.yml" ]; then
  $SUDO tee "${BASE}/crawler/sources.yml" >/dev/null <<'EOF'
# Quellen-Definitionen
seeds:
  - url: "https://www.gesetze-im-internet.de/stvo_2013/"
    include: ["**"]
    exclude: ["**/impressum*", "**/kontakt*"]
    revisit: "30d"
  - url: "https://www.gesetze-im-internet.de/bgb/"
    include: ["**"]
    exclude: []
    revisit: "30d"
  - url: "https://www.php.net/manual/en/"
    include: ["**"]
    exclude: ["**/search.php*", "**/my.php*"]
    revisit: "14d"

policy:
  concurrency_total: 4
  concurrency_per_domain: 1
  delay_min_seconds: 10
  delay_max_seconds: 120
  user_agent: "Mozilla/5.0 (compatible; polite-crawler/1.0)"
  obey_robots_txt: true
  store_html: false
  store_text: true
  store_pdf: true

storage:
  root: "/srv/ai/corpus" # wird ersetzt
  text_subdir: "text"
  pdf_subdir: "pdf"

memory:
  url: "http://127.0.0.1:8085" # wird ersetzt
  collection: "chat-memory"
EOF
fi

# Deterministically substitute paths/URLs in sources.yml.
$SUDO sed -i "s|/srv/ai/corpus|${CORPUS_DIR}|g" "${BASE}/crawler/sources.yml"
$SUDO sed -i "s|http://127.0.0.1:8085|${MEMORY_URL}|g" "${BASE}/crawler/sources.yml"

echo "✅ Installiert unter: ${BASE}"
echo "   Corpus: ${CORPUS_DIR}"
echo "   Memory-API: ${MEMORY_URL}"
echo "➡️ Aktivieren: source ${BASE}/venv/bin/activate && python3 ${BASE}/crawler/main.py --help"
12
recipes/ai/rag-crawler/requirements.txt
Normal file
12
recipes/ai/rag-crawler/requirements.txt
Normal file
@@ -0,0 +1,12 @@
|
||||
aiohttp
|
||||
aiodns
|
||||
beautifulsoup4
|
||||
tldextract
|
||||
urllib3
|
||||
pdfminer.six
|
||||
python-dateutil
|
||||
pydantic
|
||||
pyyaml
|
||||
robotexclusionrulesparser
|
||||
uvloop; sys_platform != 'win32'
|
||||
readability-lxml
|
||||
Reference in New Issue
Block a user