Files
2025-11-11 11:47:15 +01:00

44 lines
1.5 KiB
Python

#!/usr/bin/env python3
import os, sys, json, pathlib, argparse, requests
def iter_texts(root):
    """Yield every ``*.txt`` file found recursively below *root*.

    Args:
        root: Directory (string or path-like) to search.

    Yields:
        pathlib.Path: One path per regular file whose name ends in ``.txt``.
        Nothing is yielded when *root* contains no such file.
    """
    for candidate in pathlib.Path(root).rglob("*.txt"):
        # rglob matches on the name only, so a directory that happens to be
        # called "something.txt" would be yielded too; callers read the
        # result as a file, so hand out regular files only.
        if candidate.is_file():
            yield candidate
def store(memory_url, collection, text, meta):
    """POST one text document to the memory API and return its JSON reply.

    Args:
        memory_url: Base URL of the memory API; a trailing slash is tolerated.
        collection: Target collection name. NOTE(review): accepted for
            interface compatibility but not part of the request body --
            confirm whether the API expects a collection field.
        text: Document body to store.
        meta: Mapping from which ``source`` and ``path`` are forwarded as
            metadata; missing keys (or ``meta=None``) are sent as ``None``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    meta = meta or {}
    # Strip a trailing slash so "http://host:8085/" does not become "//store".
    endpoint = f"{memory_url.rstrip('/')}/store"
    payload = {
        "text": text,
        "metadata": {
            "source": meta.get("source"),
            "path": meta.get("path"),
        },
    }
    response = requests.post(endpoint, json=payload, timeout=30)
    response.raise_for_status()
    return response.json()
def main():
    """Upload every ``*.txt`` file below ``--root`` to the memory API.

    The API URL is taken from ``--memory`` or, when that is absent, from the
    ``memory.url`` entry of a ``sources.yml`` located next to this script.
    Files are processed best-effort: a failure for one file is reported on
    stderr and the run continues with the next file.

    Raises:
        SystemExit: With status 1 when no memory URL could be determined
            (argparse also exits on invalid command-line arguments).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--root", required=True, help="Corpus-Root (z.B. /srv/ai/corpus)")
    ap.add_argument("--memory", required=False, default=None, help="Memory-API URL (z.B. http://127.0.0.1:8085)")
    ap.add_argument("--collection", default="chat-memory")
    args = ap.parse_args()

    # Optional: read the memory URL from sources.yml
    if not args.memory:
        conf = pathlib.Path(__file__).with_name("sources.yml")
        if conf.exists():
            import yaml  # only needed for this fallback path

            cfg = yaml.safe_load(conf.read_text(encoding="utf-8"))
            # safe_load returns None for an empty file, and "memory" may be
            # null or a non-mapping; only descend into real mappings.
            memory_cfg = cfg.get("memory") if isinstance(cfg, dict) else None
            if isinstance(memory_cfg, dict):
                args.memory = memory_cfg.get("url")

    if not args.memory:
        print("Bitte --memory <URL> angeben oder in sources.yml hinterlegen.", file=sys.stderr)
        sys.exit(1)

    for p in iter_texts(args.root):
        try:
            # Explicit UTF-8 so the result does not depend on the locale;
            # undecodable bytes are dropped rather than aborting the file.
            text = p.read_text(encoding="utf-8", errors="ignore")
            meta = {"path": str(p), "source": "crawler"}
            store(args.memory, args.collection, text, meta)
            print("✔ stored", p)
        except Exception as e:
            # Deliberate per-file boundary: report and keep going.
            print("✖ failed", p, e, file=sys.stderr)
# Run the importer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()