Initial upload
This commit is contained in:
43
recipes/ai/rag-crawler/crawler/ingest.py
Normal file
43
recipes/ai/rag-crawler/crawler/ingest.py
Normal file
@@ -0,0 +1,43 @@
|
||||
#!/usr/bin/env python3
|
||||
import os, sys, json, pathlib, argparse, requests
|
||||
|
||||
def iter_texts(root):
    """Yield every regular ``*.txt`` file found recursively below *root*.

    Args:
        root: Directory to search (string or path-like object).

    Yields:
        pathlib.Path: One path per matching file, in the order reported
        by ``Path.rglob``.
    """
    for p in pathlib.Path(root).rglob("*.txt"):
        # rglob matches on the name only, so a directory called "x.txt"
        # would be yielded as well and fail later when read as text.
        if p.is_file():
            yield p
|
||||
|
||||
def store(memory_url, collection, text, meta):
    """POST one document to the Memory API ``/store`` endpoint.

    Args:
        memory_url: Base URL of the Memory API. A trailing slash is
            tolerated and removed before the endpoint path is appended.
        collection: Target collection name.
            NOTE(review): this value is currently not part of the request
            body, which carries only the text and metadata. Confirm with
            the Memory API whether a collection field is expected.
        text: Document text to store.
        meta: Mapping from which only the "source" and "path" entries are
            forwarded; missing keys are sent as ``None``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection errors or when the
            30-second timeout expires.
    """
    payload = {"text": text, "metadata": {"source": meta.get("source"), "path": meta.get("path")}}
    # Avoid "http://host//store" when the configured URL ends with "/".
    base_url = memory_url.rstrip("/")
    r = requests.post(f"{base_url}/store", json=payload, timeout=30)
    r.raise_for_status()
    return r.json()
|
||||
|
||||
def _memory_url_from_config(conf):
    """Return the ``memory.url`` value from the YAML file *conf*, or ``None``.

    ``None`` is returned when the file does not exist, is empty, or does not
    contain a ``memory`` mapping.
    """
    if not conf.exists():
        return None

    import yaml  # imported lazily: only needed when --memory is omitted

    cfg = yaml.safe_load(conf.read_text(encoding="utf-8"))
    # An empty file parses to None and "memory:" without children is None
    # too; neither has a .get(), so check the types before descending.
    if not isinstance(cfg, dict):
        return None
    memory = cfg.get("memory")
    if not isinstance(memory, dict):
        return None
    return memory.get("url")


def main():
    """Command-line entry point: push every text file of a corpus to the Memory API.

    Command-line options:
        --root        Corpus root directory (required).
        --memory      Memory API base URL. When omitted, the URL is read from
                      the ``memory.url`` entry of ``sources.yml`` located next
                      to this script.
        --collection  Collection name passed to ``store`` (default
                      "chat-memory").

    Exits with status 1 when no Memory API URL can be determined. Failures on
    individual files are reported on stderr and do not stop the run.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--root", required=True, help="Corpus-Root (z.B. /srv/ai/corpus)")
    ap.add_argument("--memory", required=False, default=None, help="Memory-API URL (z.B. http://127.0.0.1:8085)")
    ap.add_argument("--collection", default="chat-memory")
    args = ap.parse_args()

    # Optional: read the memory URL from sources.yml
    if not args.memory:
        args.memory = _memory_url_from_config(pathlib.Path(__file__).with_name("sources.yml"))

    if not args.memory:
        print("Bitte --memory <URL> angeben oder in sources.yml hinterlegen.", file=sys.stderr)
        sys.exit(1)

    for p in iter_texts(args.root):
        try:
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform locale; undecodable bytes are dropped.
            text = p.read_text(encoding="utf-8", errors="ignore")
            meta = {"path": str(p), "source": "crawler"}
            store(args.memory, args.collection, text, meta)
            print("✔ stored", p)
        except Exception as e:
            # Deliberately broad: one unreadable file or failed request must
            # not abort the ingest of the remaining files.
            print("✖", p, e, file=sys.stderr)
|
||||
|
||||
# Run the ingest only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user