Initial upload

2025-11-11 11:47:15 +01:00
commit 7c24dab288
48 changed files with 2761 additions and 0 deletions

View File

@@ -0,0 +1,64 @@
#!/usr/bin/env bash
set -euo pipefail
ensure_root
detect_pkg_manager
pkg_install curl
if ask_to_install "Agent-Konfiguration"; then
echo ""
read -rp "Ollama Router Base-URL (z.B. http://192.168.3.21:11437): " ROUTER_URL
ROUTER_URL=${ROUTER_URL:-http://localhost:11437}
BASE="/srv/ai/agents"
$SUDO mkdir -p "${BASE}"
$SUDO tee "${BASE}/agents.yml" >/dev/null <<'EOF'
language: de
autonomy: soft
scope: global
agents:
- name: Strategie-Agent
purpose: "Lange Planungsdialoge, Roadmaps, Tabellen, UI/UX-Brainstorming."
default_models:
primary: "llama3.1:8b-instruct"
secondary: "mistral-nemo:12b"
cpu_fallback: "phi3:mini"
endpoint: "${ROUTER_URL}"
prompt_preset: |
Du bist ein strategischer Planer. Arbeite iterativ, strukturiert und deutschsprachig.
Liefere Tabellen (Markdown), klare Meilensteine, Risiken, Abhängigkeiten.
Frage NUR nach, wenn kritische Annahmen fehlen; sonst entscheide pragmatisch.
Modus 'soft': Vorschläge machen, aber Details selbstständig ausarbeiten.
- name: Denker-Agent
purpose: "Tiefes Reasoning (CoT), Architektur- und Lösungsentwürfe, Mathe/Logik."
default_models:
primary: "huihui_ai/deepseek-r1-abliterated:14b"
secondary: "phi3:medium-128k"
cpu_fallback: "phi3:mini"
endpoint: "${ROUTER_URL}"
prompt_preset: |
Denke in überprüfbaren Schritten. Erkläre Annahmen, bevor du entscheidest.
Bevorzuge Beweise, Gegenbeispiele und Tests. Schließe mit TL;DR.
- name: Gedächtnis-Agent
purpose: "RAG, Wissensquellen, Zitationen, Abruf & Zusammenführung von Fakten."
default_models:
retriever_llm: "phi3:mini"
embed_model: "mxbai-embed-large"
cpu_fallback: "gemma2:2b-instruct-q6_K"
endpoint: "${ROUTER_URL}"
prompt_preset: |
Orchestriere Nachschlagen in Wissenssammlungen (RAG). Zitiere Fundstellen (Datei/Seite/Abschnitt).
Antworte nüchtern, fasse Unsicherheit transparent zusammen.
sources:
- name: "Gesetze"
type: "pdf"
location: "/srv/ai/corpus/law"
- name: "Shadowrun-Regeln"
type: "pdf"
location: "/srv/ai/corpus/shadowrun"
- name: "Tech-Docs"
type: "mixed"
location: "/srv/ai/corpus/tech"
EOF
$SUDO sed -i "s|\${ROUTER_URL}|${ROUTER_URL}|g" "${BASE}/agents.yml"
echo "✅ Agenten-Profile: ${BASE}/agents.yml"
else
log "${YELLOW}⏭ Agent-Konfiguration übersprungen.${NC}"
fi

View File

@@ -0,0 +1,16 @@
services:
budibase:
image: budibase/budibase:latest
container_name: budibase
restart: unless-stopped
ports:
- "10000:80"
environment:
- JWT_SECRET=changeme
- MINIO_ACCESS_KEY=budibase
- MINIO_SECRET_KEY=budibase_secret
volumes:
- budibase_data:/data
volumes:
budibase_data:

View File

@@ -0,0 +1,56 @@
#!/usr/bin/env bash
set -euo pipefail
if ask_to_install "Budibase Server"; then
echo "=== BUDIBASE INSTALLATION ==="
ensure_root
detect_pkg_manager
install_docker
echo "[+] Erstelle Verzeichnis: /srv/docker/budibase"
$SUDO mkdir -p /srv/docker/budibase
cd /srv/docker/budibase
# Funktion für automatisches Finden des nächsten freien Ports
find_free_port() {
PORT=10000
while ss -lnt | awk '{print $4}' | grep -q ":$PORT$"; do
PORT=$((PORT + 1))
done
echo "$PORT"
}
FREE_PORT=$(find_free_port)
echo "✅ Freier Port gefunden: $FREE_PORT"
echo "[+] Schreibe docker-compose.yml"
$SUDO tee docker-compose.yml >/dev/null <<EOF
services:
budibase:
image: budibase/budibase:latest
container_name: budibase-$FREE_PORT
restart: unless-stopped
ports:
- "$FREE_PORT:80"
environment:
- JWT_SECRET=changeme
- MINIO_ACCESS_KEY=budibase
- MINIO_SECRET_KEY=budibase_secret
volumes:
- budibase_data:/data
volumes:
budibase_data:
EOF
echo "[+] Starte Budibase..."
$SUDO docker compose up -d
echo ""
echo "✅ Budibase ist installiert!"
echo "→ Öffne im Browser: http://<IP>:$FREE_PORT"
else
log "${YELLOW}⏭ Budibase Server übersprungen.${NC}"
fi

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env bash
set -euo pipefail
ensure_root
detect_pkg_manager
pkg_install curl
pkg_install git || true
if ask_to_install "Builder-Agent"; then
echo ""
read -rp "Ollama Router Base-URL (z.B. http://192.168.3.21:11437): " ROUTER_URL
ROUTER_URL=${ROUTER_URL:-http://localhost:11437}
echo ""
read -rp "Projektverzeichnis (leer = auto-detect): " PROJECT_DIR
if [ -z "${PROJECT_DIR}" ]; then
if git rev-parse --show-toplevel >/dev/null 2>&1; then
PROJECT_DIR="$(git rev-parse --show-toplevel)"
else
PROJECT_DIR="$(pwd)"
fi
fi
PROJECT_DIR="$(readlink -f "${PROJECT_DIR}")"
BASE="/srv/ai/builder"
$SUDO mkdir -p "${BASE}"
$SUDO tee "${BASE}/builder.yml" >/dev/null <<'EOF'
name: Builder-Agent
language: de
autonomy: soft
endpoint: "${ROUTER_URL}"
models:
planner: "llama3.1:8b-instruct"
reasoner: "huihui_ai/deepseek-r1-abliterated:14b"
coder_primary: "qwen2.5-coder:14b"
coder_secondary: "deepseek-coder-v2:16b"
cpu_fallback: "qwen2.5-coder:7b"
workspace:
project_dir: "${PROJECT_DIR}"
tests:
enabled: true
force_languages: []
prompts:
system: |
Du bist ein Builder-Agent (soft). Ziel: Probleme lösen mit minimaler Rückfrage.
Strategie:
1) Plane kurz (ToDo-Liste), dann implementiere iterativ im Workspace.
2) Führe nach jedem Schritt Tests/Lints aus (falls verfügbar). Repariere Fehler selbstständig.
3) Schreibe klare Commits; dokumentiere Änderungen kompakt in CHANGELOG.md.
4) Nur bei sicherheitsrelevanten/zerstörerischen Aktionen Rückfrage.
Liefere am Ende: TL;DR + nächste Schritte.
EOF
$SUDO sed -i "s|\${ROUTER_URL}|${ROUTER_URL}|g; s|\${PROJECT_DIR}|${PROJECT_DIR}|g" "${BASE}/builder.yml"
$SUDO tee "${BASE}/run_tests.sh" >/dev/null <<'EOF'
#!/usr/bin/env bash
set -euo pipefail
ROOT="${1:-.}"
cd "${ROOT}"
if [ -f "requirements.txt" ] || ls -1 **/requirements.txt >/dev/null 2>&1; then
command -v pytest >/dev/null 2>&1 && pytest -q || true
fi
if [ -f "package.json" ]; then
if npm run | grep -q "test"; then npm test --silent || true; fi
if npm run | grep -q "lint"; then npm run lint --silent || true; fi
if npm run | grep -q "typecheck"; then npm run typecheck --silent || true; fi
fi
if [ -f "composer.json" ]; then
if [ -f "vendor/bin/pest" ]; then vendor/bin/pest || true
elif [ -f "vendor/bin/phpunit" ]; then vendor/bin/phpunit || true
fi
fi
if [ -f "Dockerfile" ]; then
docker build -q -t tmp-builder-test . || true
fi
if command -v shellcheck >/dev/null 2>&1; then
find . -type f -name "*.sh" -print0 | xargs -0 -r shellcheck || true
fi
EOF
$SUDO chmod +x "${BASE}/run_tests.sh"
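# Smoke-run the generated test harness against the workspace; every check inside is best-effort and skipped when its tooling is missing
"${BASE}/run_tests.sh" "${PROJECT_DIR}" || true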
echo "✅ Builder-Agent konfiguriert unter ${BASE} (Workspace: ${PROJECT_DIR})"
else
log "${YELLOW}⏭ Builder-Agent übersprungen.${NC}"
fi

View File

@@ -0,0 +1,2 @@
#!/usr/bin/env bash
echo "Diagram-Agent placeholder install script"

View File

@@ -0,0 +1,12 @@
# Memory Stack (External Ollama)
## Deploy
```
bash deploy.sh http://<OLLAMA-IP>:<PORT>
```
## Test
```
curl http://localhost:8085/health
```
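## Usage
Once the containers are up, the `/store` and `/search` endpoints implemented in `memory-api/app.py` can be exercised directly:
```
curl -X POST http://localhost:8085/store \
  -H 'Content-Type: application/json' \
  -d '{"text": "Qdrant keeps the chat memory.", "metadata": {"source": "readme"}}'

curl -X POST http://localhost:8085/search \
  -H 'Content-Type: application/json' \
  -d '{"text": "where is the chat memory kept?", "top_k": 3}'
```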

View File

@@ -0,0 +1,25 @@
version: "3.8"
services:
qdrant:
image: qdrant/qdrant:latest
container_name: memory-qdrant
volumes:
- /srv/docker/services/memory/qdrant:/qdrant/storage
ports:
- "127.0.0.1:6333:6333"
restart: unless-stopped
memory-api:
build:
context: ./memory-api
container_name: memory-api
environment:
- QDRANT_URL=http://qdrant:6333
- OLLAMA_API={{OLLAMA_API}}
- COLLECTION_NAME=chat-memory
ports:
- "127.0.0.1:8085:8085"
depends_on:
- qdrant
restart: unless-stopped

View File

@@ -0,0 +1,32 @@
#!/usr/bin/env bash
set -euo pipefail
ensure_root
detect_pkg_manager
install_docker
if ask_to_install "RAG Memory Stack (Qdrant + Memory API)"; then
log "=== RAG Memory Stack Installation ==="
read -rp "Ollama API URL (z.B. http://127.0.0.1:11434): " OLLAMA_API_URL
OLLAMA_API_URL=${OLLAMA_API_URL:-http://127.0.0.1:11434}
BASE="/srv/docker/services/memory"
$SUDO mkdir -p "$BASE/qdrant"
$SUDO cp -r "$(dirname "${BASH_SOURCE[0]}")/memory-api" "$BASE/"
$SUDO cp "$(dirname "${BASH_SOURCE[0]}")/compose.yaml" "$BASE/docker-compose.yml"
cd "$BASE"
$SUDO sed -i "s|{{OLLAMA_API}}|$OLLAMA_API_URL|g" docker-compose.yml
log "🚀 Starte RAG Memory Stack..."
$SUDO docker compose up -d --build
log "Attempting to pull embedding model from remote Ollama..."
$SUDO curl -s -X POST "$OLLAMA_API_URL/api/pull" -H 'Content-Type: application/json' -d '{"name": "nomic-embed-text"}' || log "Notice: Model pull failed (possibly using a gateway). Continuing."
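# Quick smoke test: the memory API from compose.yaml is bound to 127.0.0.1:8085 and exposes /health (see memory-api/app.py)
sleep 5
curl -fsS http://127.0.0.1:8085/health || log "Notice: memory API not answering yet; check 'docker compose logs memory-api'."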
log "✅ RAG Memory Stack läuft unter: http://<server-ip>:8085"
else
log "${YELLOW}⏭ RAG Memory Stack übersprungen.${NC}"
fi

View File

@@ -0,0 +1,8 @@
FROM python:3.11-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY app.py .
EXPOSE 8085
CMD ["python", "app.py"]

View File

@@ -0,0 +1,40 @@
from fastapi import FastAPI
import requests, os
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
import hashlib
import json
app = FastAPI()
QDRANT_URL = os.getenv("QDRANT_URL")
OLLAMA_API = os.getenv("OLLAMA_API")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "chat-memory")
client = QdrantClient(url=QDRANT_URL)
# Make sure the collection exists; nomic-embed-text returns 768-dimensional embeddings
try:
    client.get_collection(COLLECTION_NAME)
except Exception:
    client.create_collection(collection_name=COLLECTION_NAME, vectors_config=VectorParams(size=768, distance=Distance.COSINE))
@app.get("/health")
def health():
return {"status": "ok", "qdrant": QDRANT_URL, "ollama": OLLAMA_API}
def embed(text):
r = requests.post(f"{OLLAMA_API}/api/embeddings", json={"model":"nomic-embed-text","prompt":text})
return r.json()["embedding"]
@app.post("/store")
def store(item: dict):
text = item["text"]
metadata = item.get("metadata", {})
vec = embed(text)
pid = int(hashlib.sha256(text.encode()).hexdigest()[:16], 16)  # Qdrant point IDs must be unsigned integers or UUIDs
client.upsert(collection_name=COLLECTION_NAME, points=[PointStruct(id=pid, vector=vec, payload={"text": text, **metadata})])
return {"stored": True}
@app.post("/search")
def search(query: dict):
q = query["text"]
top_k = query.get("top_k", 5)
vec = embed(q)
result = client.search(collection_name=COLLECTION_NAME, query_vector=vec, limit=top_k)
return [{"score": r.score, "text": r.payload["text"]} for r in result]

View File

@@ -0,0 +1,4 @@
fastapi
uvicorn
requests
qdrant-client

View File

@@ -0,0 +1,14 @@
# Ollama Router (new schema)
This package follows the example schema (beispiel.zip). It contains:
- `recipes/services/ollama-router/install.sh`: interactive IP/port prompts (no ENV variables)
- `recipes/services/ollama-router/docker-compose.yml`: uses the external `ai` network
- `recipes/services/ollama-router/config.yml`: generated by the install script
## Install
```bash
bash recipes/services/ollama-router/install.sh
cd /srv/docker/services/ollama-router
docker compose up -d
```
CPU fallback models are pulled automatically on the CPU node so that the **Strategie/Denker/Gedächtnis agents** always have something to run on.
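## Smoke test
Assuming the router forwards the standard Ollama HTTP API on its listen port (11437 by default), a quick end-to-end check could look like this; adjust host and model to your setup:
```bash
# Ask the router for a short completion; it should dispatch to one of the configured nodes
curl -s http://127.0.0.1:11437/api/generate \
  -H 'Content-Type: application/json' \
  -d '{"model": "phi3:mini", "prompt": "Say hello", "stream": false}'
```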

View File

@@ -0,0 +1,102 @@
#!/usr/bin/env bash
set -euo pipefail
ensure_root
detect_pkg_manager
pkg_install curl
install_docker
if ask_to_install "Ollama Router"; then
echo ""
read -rp "Listen-Port des Router (Default 11437): " ROUTER_PORT
ROUTER_PORT=${ROUTER_PORT:-11437}
echo ""
read -rp "NVIDIA Node IP: " NVIDIA_IP
read -rp "NVIDIA Node Port (Default 11436): " NVIDIA_PORT
NVIDIA_PORT=${NVIDIA_PORT:-11436}
echo ""
read -rp "AMD (ROCm) Node IP: " AMD_IP
read -rp "AMD Node Port (Default 11435): " AMD_PORT
AMD_PORT=${AMD_PORT:-11435}
echo ""
read -rp "CPU-only Node IP: " CPU_IP
read -rp "CPU Node Port (Default 11434): " CPU_PORT
CPU_PORT=${CPU_PORT:-11434}
BASE="/srv/docker/services/ollama-router"
$SUDO mkdir -p "${BASE}"
cd "${BASE}"
$SUDO tee config.yml >/dev/null <<'EOF'
routes:
llama3.1:8b-instruct:
- url: http://${NVIDIA_IP}:${NVIDIA_PORT}
- url: http://${AMD_IP}:${AMD_PORT}
- url: http://${CPU_IP}:${CPU_PORT}
mistral-nemo:12b:
- url: http://${AMD_IP}:${AMD_PORT}
- url: http://${NVIDIA_IP}:${NVIDIA_PORT}
- url: http://${CPU_IP}:${CPU_PORT}
huihui_ai/deepseek-r1-abliterated:14b:
- url: http://${AMD_IP}:${AMD_PORT}
- url: http://${NVIDIA_IP}:${NVIDIA_PORT}
- url: http://${CPU_IP}:${CPU_PORT}
phi3:medium-128k:
- url: http://${AMD_IP}:${AMD_PORT}
- url: http://${NVIDIA_IP}:${NVIDIA_PORT}
- url: http://${CPU_IP}:${CPU_PORT}
mxbai-embed-large:
- url: http://${CPU_IP}:${CPU_PORT}
- url: http://${NVIDIA_IP}:${NVIDIA_PORT}
- url: http://${AMD_IP}:${AMD_PORT}
phi3:mini:
- url: http://${CPU_IP}:${CPU_PORT}
- url: http://${NVIDIA_IP}:${NVIDIA_PORT}
- url: http://${AMD_IP}:${AMD_PORT}
gemma2:2b-instruct-q6_K:
- url: http://${CPU_IP}:${CPU_PORT}
- url: http://${NVIDIA_IP}:${NVIDIA_PORT}
- url: http://${AMD_IP}:${AMD_PORT}
qwen2.5-coder:14b:
- url: http://${NVIDIA_IP}:${NVIDIA_PORT}
- url: http://${AMD_IP}:${AMD_PORT}
- url: http://${CPU_IP}:${CPU_PORT}
deepseek-coder-v2:16b:
- url: http://${AMD_IP}:${AMD_PORT}
- url: http://${NVIDIA_IP}:${NVIDIA_PORT}
- url: http://${CPU_IP}:${CPU_PORT}
qwen2.5-coder:7b:
- url: http://${CPU_IP}:${CPU_PORT}
- url: http://${NVIDIA_IP}:${NVIDIA_PORT}
- url: http://${AMD_IP}:${AMD_PORT}
EOF
$SUDO sed -i "s|\${NVIDIA_IP}|${NVIDIA_IP}|g; s|\${NVIDIA_PORT}|${NVIDIA_PORT}|g; s|\${AMD_IP}|${AMD_IP}|g; s|\${AMD_PORT}|${AMD_PORT}|g; s|\${CPU_IP}|${CPU_IP}|g; s|\${CPU_PORT}|${CPU_PORT}|g" config.yml
$SUDO tee docker-compose.yml >/dev/null <<EOF
version: "3.9"
services:
ollama-router:
image: ghcr.io/ollama/ollama-router:latest
container_name: ollama-router
restart: unless-stopped
networks: [ai]
volumes:
- ./config.yml:/app/config.yml:ro
ports:
- "${ROUTER_PORT}:11437"
networks:
ai:
external: true
EOF
$SUDO docker network inspect ai >/dev/null 2>&1 || $SUDO docker network create ai
CPU_MODELS=(
"phi3:mini"
"gemma2:2b-instruct-q6_K"
"mxbai-embed-large"
"qwen2.5-coder:7b"
)
for m in "${CPU_MODELS[@]}"; do
echo "→ Pull ${m} on CPU node ${CPU_IP}:${CPU_PORT}"
$SUDO curl -fsSL -X POST "http://${CPU_IP}:${CPU_PORT}/api/pull" -d "{\"name\":\"${m}\"}" || true
done
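# Optional check: /api/tags on the CPU node should now list the pulled fallback models
$SUDO curl -fsS "http://${CPU_IP}:${CPU_PORT}/api/tags" || echo "⚠ CPU node not reachable; pull the fallback models manually later."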
log "✅ Router konfiguriert in ${BASE}"
log " Start: cd ${BASE} && docker compose up -d"
else
log "${YELLOW}⏭ Ollama Router übersprungen.${NC}"
fi

View File

@@ -0,0 +1,11 @@
services:
ollama:
image: ollama/ollama:latest
container_name: ollama
restart: unless-stopped
ports:
- "11434:11434"
volumes:
- ollama_data:/root/.ollama
volumes:
ollama_data:

View File

@@ -0,0 +1,53 @@
#!/usr/bin/env bash
set -euo pipefail
if ask_to_install "Ollama Server"; then
echo "=== OLLAMA SERVER INSTALLATION ==="
ensure_root
detect_pkg_manager
install_docker
$SUDO mkdir -p /srv/docker/ollama
cd /srv/docker/ollama
# Helper: find the next free TCP port
find_free_port() {
PORT=11434
while ss -lnt | awk '{print $4}' | grep -q ":$PORT$"; do
PORT=$((PORT + 1))
done
echo "$PORT"
}
FREE_PORT=$(find_free_port)
echo "✅ Freier Port gefunden: $FREE_PORT"
$SUDO tee docker-compose.yml >/dev/null <<EOF
services:
ollama:
image: ollama/ollama:latest
container_name: ollama-$FREE_PORT
restart: unless-stopped
ports:
- "$FREE_PORT:11434"
volumes:
- ollama_data:/root/.ollama
volumes:
ollama_data:
EOF
$SUDO docker compose up -d
echo "Ollama Server läuft auf Port $FREE_PORT"
read -p "Modell jetzt herunterladen? (z.B. llama3 / Enter = nein): " MODEL
if [ ! -z "$MODEL" ]; then
$SUDO curl -N -X POST http://127.0.0.1:$FREE_PORT/api/pull \
-H "Content-Type: application/json" \
-d "{\"name\":\"$MODEL\"}" || true
fi
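# Optional check: the standard Ollama API lists installed models under /api/tags
$SUDO curl -fsS "http://127.0.0.1:$FREE_PORT/api/tags" || true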
echo "✅ Fertig! URL: http://<server-ip>:$FREE_PORT"
else
log "${YELLOW}⏭ Ollama Server übersprungen.${NC}"
fi

View File

@@ -0,0 +1,32 @@
# EXTRAS: systemd Timer (optional)
## /etc/systemd/system/rag-crawler.service
```
[Unit]
Description=RAG Crawler Update (drip)
After=network.target
[Service]
Type=oneshot
User=root
ExecStart=/bin/bash -lc 'source /srv/ai/rag-crawler/venv/bin/activate && python3 /srv/ai/rag-crawler/crawler/main.py --mode=drip --budget 1'
```
## /etc/systemd/system/rag-crawler.timer
```
[Unit]
Description=Run RAG Crawler drip hourly
[Timer]
OnCalendar=hourly
Persistent=true
[Install]
WantedBy=timers.target
```
## Enable
```
systemctl daemon-reload
systemctl enable --now rag-crawler.timer
```
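## Verify
Check that the timer is scheduled and inspect the last run:
```
systemctl list-timers rag-crawler.timer
journalctl -u rag-crawler.service -n 50 --no-pager
```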

View File

@@ -0,0 +1,40 @@
# RAG Crawler, full version (polite & separate from the RAG store)
This crawler runs **separately** from the RAG/memory stack. It:
- respects `robots.txt`
- uses randomized delays (min/max), per-domain quotas & a concurrency limit
- supports two modes: `update` (normal) and `drip` (very slow / human-like)
- stores text/PDFs on the filesystem (corpus); in drip mode it only fetches a few pages per run
- has a separate **ingest** step into your memory API (`/store`), compatible with the `memory-api` service
## Quick start
```bash
# 1) install
bash recipes/services/rag-crawler/install.sh
# 2) edit the sources
nano /srv/ai/rag-crawler/crawler/sources.yml
# 3) crawl (full / regular)
source /srv/ai/rag-crawler/venv/bin/activate
python3 /srv/ai/rag-crawler/crawler/main.py --mode=update
# 4) drip mode (e.g. hourly, only 1 URL per seed)
python3 /srv/ai/rag-crawler/crawler/main.py --mode=drip --budget 1
# 5) ingest all new/updated texts into the memory API
python3 /srv/ai/rag-crawler/crawler/ingest.py --root /srv/ai/corpus --memory http://127.0.0.1:8085
```
## Scheduling (examples)
- Crontab:
  `@hourly source /srv/ai/rag-crawler/venv/bin/activate && python3 /srv/ai/rag-crawler/crawler/main.py --mode=drip --budget 1`
  `*/10 * * * * source /srv/ai/rag-crawler/venv/bin/activate && python3 /srv/ai/rag-crawler/crawler/ingest.py --root /srv/ai/corpus --memory http://127.0.0.1:8085`
- systemd timer (optional): see `EXTRAS.md`
## Directories
- `/srv/ai/rag-crawler` crawler + venv
- `/srv/ai/corpus` raw data (text/PDF) + `.crawler_state.json`
## Note
- **No ENV variables needed**: all values are asked for interactively or maintained in `sources.yml`.
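## Inspect the corpus
To see what a run actually fetched, count the stored files and peek at the visited-URL state kept by `main.py` (paths as configured in `sources.yml`):
```bash
find /srv/ai/corpus/text -name '*.txt' | wc -l
find /srv/ai/corpus/pdf -name '*.pdf' | wc -l
python3 -c "import json; s = json.load(open('/srv/ai/corpus/.crawler_state.json')); print(len(s.get('visited', {})), 'URLs visited')"
```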

View File

@@ -0,0 +1,43 @@
#!/usr/bin/env python3
import os, sys, json, pathlib, argparse, requests
def iter_texts(root):
for p in pathlib.Path(root).rglob("*.txt"):
yield p
def store(memory_url, collection, text, meta):
payload = {"text": text, "metadata": {"source": meta.get("source"), "path": meta.get("path")}}
r = requests.post(f"{memory_url}/store", json=payload, timeout=30)
r.raise_for_status()
return r.json()
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--root", required=True, help="Corpus-Root (z.B. /srv/ai/corpus)")
ap.add_argument("--memory", required=False, default=None, help="Memory-API URL (z.B. http://127.0.0.1:8085)")
ap.add_argument("--collection", default="chat-memory")
args = ap.parse_args()
# Optional: memory-URL aus sources.yml lesen
if not args.memory:
conf = pathlib.Path(__file__).with_name("sources.yml")
if conf.exists():
import yaml
cfg = yaml.safe_load(conf.read_text())
args.memory = cfg.get("memory", {}).get("url")
if not args.memory:
print("Bitte --memory <URL> angeben oder in sources.yml hinterlegen.", file=sys.stderr)
sys.exit(1)
for p in iter_texts(args.root):
try:
text = p.read_text(errors="ignore")
meta = {"path": str(p), "source": "crawler"}
store(args.memory, args.collection, text, meta)
print("✔ stored", p)
except Exception as e:
print("", p, e, file=sys.stderr)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,254 @@
#!/usr/bin/env python3
import asyncio, aiohttp, aiohttp.client_exceptions as aiox
import os, time, random, hashlib, json, re, pathlib
from urllib.parse import urljoin, urldefrag, urlparse
from bs4 import BeautifulSoup
from dateutil.parser import parse as dtparse
import yaml, tldextract, ssl
try:
import uvloop
uvloop.install()
except Exception:
pass
# ---- Load config ----
BASE = os.environ.get("RAG_CRAWLER_BASE", os.getcwd())
CONF_PATH = os.path.join(BASE, "crawler", "sources.yml")
with open(CONF_PATH, "r") as f:
CFG = yaml.safe_load(f)
POLICY = CFG.get("policy", {})
STORAGE = CFG.get("storage", {})
MEMORY = CFG.get("memory", {})
SEEDS = CFG.get("seeds", [])
ROOT = pathlib.Path(STORAGE.get("root", "/srv/ai/corpus")).resolve()
TEXT_DIR = ROOT / STORAGE.get("text_subdir", "text")
PDF_DIR = ROOT / STORAGE.get("pdf_subdir", "pdf")
TEXT_DIR.mkdir(parents=True, exist_ok=True)
PDF_DIR.mkdir(parents=True, exist_ok=True)
STATE_PATH = ROOT / ".crawler_state.json"
STATE = {"visited": {}} # url -> {etag, last_modified, ts}
if STATE_PATH.exists():
try:
STATE = json.loads(STATE_PATH.read_text())
except Exception:
pass
def save_state():
try:
STATE_PATH.write_text(json.dumps(STATE))
except Exception:
pass
# ---- Robots & quotas ----
ROBOTS_CACHE = {}
DOMAIN_NEXT_ALLOWED = {}
def domain_key(url):
ext = tldextract.extract(url)
return f"{ext.domain}.{ext.suffix}"
async def fetch_robots(session, base_url):
dom = domain_key(base_url)
if dom in ROBOTS_CACHE:
return ROBOTS_CACHE[dom]
robots_url = urljoin(f"{urlparse(base_url).scheme}://{urlparse(base_url).netloc}", "/robots.txt")
from robotexclusionrulesparser import RobotExclusionRulesParser as Robots
rp = Robots()
try:
async with session.get(robots_url, timeout=10) as r:
if r.status == 200:
rp.parse(await r.text())
else:
rp.parse("")
except Exception:
rp.parse("")
ROBOTS_CACHE[dom] = rp
return rp
def polite_delay_for(url):
dmin = int(POLICY.get("delay_min_seconds", 5))
dmax = int(POLICY.get("delay_max_seconds", 60))
d = domain_key(url)
t = DOMAIN_NEXT_ALLOWED.get(d, 0)
now = time.time()
if now < t:
return max(0, t - now)
# Set the next allowed time (random delay); the actual sleep happens in fetch()
DOMAIN_NEXT_ALLOWED[d] = now + random.uniform(dmin, dmax)
return 0
def norm_url(base, link):
href = urljoin(base, link)
href, _ = urldefrag(href)
return href
def fnmatch(text, pat):
    # Glob-to-regex translation: '**' matches across '/', '*' only within one path segment
    token = "\0"
    pat = re.escape(pat).replace(r"\*\*", token).replace(r"\*", "[^/]*").replace(token, ".*")
    return re.fullmatch(pat, text) is not None
def allowed_by_patterns(url, inc, exc):
ok_inc = True if not inc else any(fnmatch(url, pat) for pat in inc)
ok_exc = any(fnmatch(url, pat) for pat in exc) if exc else False
return ok_inc and not ok_exc
def should_revisit(url, revisit_str):
info = STATE["visited"].get(url, {})
if not info:
return True
try:
days = int(revisit_str.rstrip("d"))
except Exception:
days = 30
last_ts = info.get("ts", 0)
return (time.time() - last_ts) > days * 86400
async def fetch(session, url, etag=None, lastmod=None):
headers = {"User-Agent": POLICY.get("user_agent", "polite-crawler/1.0")}
if etag:
headers["If-None-Match"] = etag
if lastmod:
headers["If-Modified-Since"] = lastmod
ssl_ctx = ssl.create_default_context()
try:
delay = polite_delay_for(url)
if delay > 0:
await asyncio.sleep(delay)
async with session.get(url, headers=headers, ssl=ssl_ctx, timeout=30) as r:
if r.status == 304:
return None, {"status": 304, "headers": {}}
body = await r.read()
return body, {"status": r.status, "headers": dict(r.headers)}
except Exception as e:
return None, {"status": "error", "error": str(e)}
def save_binary(path: pathlib.Path, content: bytes):
path.parent.mkdir(parents=True, exist_ok=True)
path.write_bytes(content)
def save_text(path: pathlib.Path, text: str):
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(text)
def is_pdf(headers):
ct = headers.get("Content-Type", "").lower()
return "application/pdf" in ct or ct.endswith("/pdf")
def extract_text_html(body: bytes) -> str:
soup = BeautifulSoup(body, "lxml")
for tag in soup(["script","style","noscript","nav","footer","header","aside"]):
tag.decompose()
text = soup.get_text("\n")
return "\n".join(line.strip() for line in text.splitlines() if line.strip())
def path_for(url, typ="text"):
h = hashlib.sha256(url.encode()).hexdigest()[:16]
if typ == "text":
return TEXT_DIR / f"{h}.txt"
return PDF_DIR / f"{h}.pdf"
async def crawl_seed(session, seed, budget=0):
base = seed["url"]
include = seed.get("include", [])
exclude = seed.get("exclude", [])
revisit = seed.get("revisit", "30d")
# robots
if POLICY.get("obey_robots_txt", True):
rp = await fetch_robots(session, base)
if not rp.is_allowed("*", base):
return
queue = [base]
seen = set()
processed = 0
while queue:
url = queue.pop(0)
if url in seen:
continue
seen.add(url)
if POLICY.get("obey_robots_txt", True):
rp = await fetch_robots(session, url)
if not rp.is_allowed("*", url):
continue
if not allowed_by_patterns(url, include, exclude):
continue
info = STATE["visited"].get(url, {})
etag = info.get("etag")
lastmod = info.get("last_modified")
if not should_revisit(url, revisit):
continue
body, meta = await fetch(session, url, etag, lastmod)
status = meta.get("status")
headers = meta.get("headers", {})
if status == 304:
STATE["visited"][url] = {"etag": etag, "last_modified": lastmod, "ts": time.time()}
save_state()
continue
if status != 200 or body is None:
continue
if is_pdf(headers):
out_pdf = path_for(url, "pdf")
save_binary(out_pdf, body)
# Rough text extraction (best effort)
try:
from pdfminer.high_level import extract_text as pdf_extract
txt = pdf_extract(str(out_pdf))
save_text(path_for(url, "text"), txt)
except Exception:
pass
else:
txt = extract_text_html(body)
save_text(path_for(url, "text"), txt)
# Collect links (could easily be restricted to the same domain)
soup = BeautifulSoup(body, "lxml")
for a in soup.find_all("a", href=True):
href = urljoin(url, a["href"])
href, _ = urldefrag(href)
if href.startswith("http"):
# Depth is limited implicitly via revisit/budget
queue.append(href)
STATE["visited"][url] = {
"etag": headers.get("ETag"),
"last_modified": headers.get("Last-Modified"),
"ts": time.time(),
}
save_state()
processed += 1
if budget and processed >= budget:
break
async def main(mode="update", budget=0):
con_total = int(POLICY.get("concurrency_total", 4))
timeout = aiohttp.ClientTimeout(total=120)
connector = aiohttp.TCPConnector(limit=con_total, ssl=False)
async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session:
tasks = []
if mode == "drip":
budget = budget or 1
else:
budget = 0  # unlimited in update mode
for seed in SEEDS:
tasks.append(crawl_seed(session, seed, budget=budget))
await asyncio.gather(*tasks, return_exceptions=True)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--mode", choices=["update","drip"], default="update",
help="update=vollständig, drip=sehr langsam mit Budget je Seed")
parser.add_argument("--budget", type=int, default=1, help="URLs pro Seed (nur drip)")
args = parser.parse_args()
asyncio.run(main(args.mode, args.budget))

View File

@@ -0,0 +1,90 @@
#!/usr/bin/env bash
set -euo pipefail
# Helper functions from the base framework (see beispiel.zip) are expected:
ensure_root
detect_pkg_manager
pkg_install python3
pkg_install python3-venv || true
pkg_install curl
if ask_to_install "RAG Crawler"; then
echo ""
read -rp "Basis-Pfad für den Crawler [default: /srv/ai/rag-crawler]: " BASE
BASE=${BASE:-/srv/ai/rag-crawler}
$SUDO mkdir -p "${BASE}"
else
log "${YELLOW}⏭ RAG Crawler übersprungen.${NC}"
exit 0
fi
echo ""
read -rp "Zielverzeichnis für den Corpus [default: /srv/ai/corpus]: " CORPUS_DIR
CORPUS_DIR=${CORPUS_DIR:-/srv/ai/corpus}
$SUDO mkdir -p "${CORPUS_DIR}"
echo ""
read -rp "Memory-API URL (z.B. http://127.0.0.1:8085) [default: http://127.0.0.1:8085]: " MEMORY_URL
MEMORY_URL=${MEMORY_URL:-http://127.0.0.1:8085}
# Dateien in BASE kopieren
SRC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
$SUDO mkdir -p "${BASE}/crawler"
$SUDO cp -r "${SRC_DIR}/crawler"/* "${BASE}/crawler/"
$SUDO cp "${SRC_DIR}/requirements.txt" "${BASE}/requirements.txt"
# Virtualenv: call the venv's pip directly ("$SUDO source" / "$SUDO deactivate" cannot work, since those are shell builtins)
$SUDO python3 -m venv "${BASE}/venv"
$SUDO "${BASE}/venv/bin/pip" install -U pip
$SUDO "${BASE}/venv/bin/pip" install -r "${BASE}/requirements.txt"
# Initialize sources.yml only if it does not exist yet
if [ ! -f "${BASE}/crawler/sources.yml" ]; then
$SUDO tee "${BASE}/crawler/sources.yml" >/dev/null <<'EOF'
# Source definitions
seeds:
- url: "https://www.gesetze-im-internet.de/stvo_2013/"
include: ["**"]
exclude: ["**/impressum*", "**/kontakt*"]
revisit: "30d"
- url: "https://www.gesetze-im-internet.de/bgb/"
include: ["**"]
exclude: []
revisit: "30d"
- url: "https://www.php.net/manual/en/"
include: ["**"]
exclude: ["**/search.php*", "**/my.php*"]
revisit: "14d"
policy:
concurrency_total: 4
concurrency_per_domain: 1
delay_min_seconds: 10
delay_max_seconds: 120
user_agent: "Mozilla/5.0 (compatible; polite-crawler/1.0)"
obey_robots_txt: true
store_html: false
store_text: true
store_pdf: true
storage:
root: "/srv/ai/corpus" # wird ersetzt
text_subdir: "text"
pdf_subdir: "pdf"
memory:
url: "http://127.0.0.1:8085" # wird ersetzt
collection: "chat-memory"
EOF
fi
# Deterministically replace paths/URLs in sources.yml
$SUDO sed -i "s|/srv/ai/corpus|${CORPUS_DIR}|g" "${BASE}/crawler/sources.yml"
$SUDO sed -i "s|http://127.0.0.1:8085|${MEMORY_URL}|g" "${BASE}/crawler/sources.yml"
echo "✅ Installiert unter: ${BASE}"
echo " Corpus: ${CORPUS_DIR}"
echo " Memory-API: ${MEMORY_URL}"
echo "➡️ Aktivieren: source ${BASE}/venv/bin/activate && python3 ${BASE}/crawler/main.py --help"

View File

@@ -0,0 +1,12 @@
aiohttp
aiodns
beautifulsoup4
tldextract
urllib3
pdfminer.six
python-dateutil
pydantic
pyyaml
robotexclusionrulesparser
uvloop; sys_platform != 'win32'
readability-lxml
lxml
requests