44 lines
1.5 KiB
Python
44 lines
1.5 KiB
Python
#!/usr/bin/env python3
|
|
import os, sys, json, pathlib, argparse, requests
|
|
|
|
def iter_texts(root):
    """Lazily yield every regular ``*.txt`` file found recursively below *root*.

    Args:
        root: Directory to search (anything ``pathlib.Path`` accepts).

    Yields:
        ``pathlib.Path`` objects for the matching files, in the order the
        filesystem walk produces them.
    """
    for candidate in pathlib.Path(root).rglob("*.txt"):
        # rglob matches on the name only, so a directory called "notes.txt"
        # would match too; skip anything that is not a file.
        if candidate.is_file():
            yield candidate
|
|
|
|
def store(memory_url, collection, text, meta):
    """POST one document to the memory API and return the decoded JSON reply.

    Args:
        memory_url: Base URL of the memory API. A trailing slash is tolerated
            so that "http://host:8085/" does not produce ".../​/store".
        collection: Accepted for interface compatibility; it is not part of
            the request that is sent.
            NOTE(review): confirm whether the API expects a collection field.
        text: Document text to store.
        meta: Mapping from which the "source" and "path" entries are copied
            into the request metadata; missing keys are sent as None.

    Returns:
        The response body parsed as JSON.

    Raises:
        Whatever ``requests`` raises for transport failures or timeouts, and
        the error raised by ``raise_for_status()`` for an HTTP error status.
    """
    payload = {
        "text": text,
        "metadata": {"source": meta.get("source"), "path": meta.get("path")},
    }
    # Normalize the base so both "http://host" and "http://host/" work.
    base_url = memory_url.rstrip("/")
    response = requests.post(f"{base_url}/store", json=payload, timeout=30)
    response.raise_for_status()
    return response.json()
|
|
|
|
def main():
    """Command-line entry point: push every ``*.txt`` file under --root to the memory API.

    The memory URL is taken from --memory; when that is not given, it is read
    from the ``memory.url`` entry of a ``sources.yml`` file located next to
    this script. If no URL can be determined, a message is printed to stderr
    and the process exits with status 1.

    Failures while reading or storing a single file are reported on stderr
    and do not stop the run.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--root", required=True, help="Corpus-Root (z.B. /srv/ai/corpus)")
    ap.add_argument("--memory", required=False, default=None, help="Memory-API URL (z.B. http://127.0.0.1:8085)")
    ap.add_argument("--collection", default="chat-memory")
    args = ap.parse_args()

    # Optional: read the memory URL from sources.yml
    if not args.memory:
        conf = pathlib.Path(__file__).with_name("sources.yml")
        if conf.exists():
            # Imported lazily so PyYAML is only needed for the config fallback.
            import yaml

            cfg = yaml.safe_load(conf.read_text(encoding="utf-8"))
            # safe_load returns None for an empty file, and "memory:" may be
            # null or a scalar; only descend into real mappings.
            memory_cfg = cfg.get("memory") if isinstance(cfg, dict) else None
            if isinstance(memory_cfg, dict):
                args.memory = memory_cfg.get("url")

    if not args.memory:
        print("Bitte --memory <URL> angeben oder in sources.yml hinterlegen.", file=sys.stderr)
        sys.exit(1)

    for p in iter_texts(args.root):
        # Best effort per file: one unreadable file or failed request must
        # not abort the whole import, so report it and continue.
        try:
            # Decode explicitly as UTF-8 instead of the platform default;
            # undecodable bytes are dropped.
            text = p.read_text(encoding="utf-8", errors="ignore")
            meta = {"path": str(p), "source": "crawler"}
            store(args.memory, args.collection, text, meta)
            print("✔ stored", p)
        except Exception as e:
            print("✖", p, e, file=sys.stderr)
|
|
|
|
# Run the importer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|