Initial upload

2025-11-11 11:47:15 +01:00
commit 7c24dab288
48 changed files with 2761 additions and 0 deletions

View File

@@ -0,0 +1,43 @@
#!/usr/bin/env python3
import sys, pathlib, argparse, requests

def iter_texts(root):
    for p in pathlib.Path(root).rglob("*.txt"):
        yield p

def store(memory_url, collection, text, meta):
    # NOTE: collection is accepted but not yet included in the payload.
    payload = {"text": text, "metadata": {"source": meta.get("source"), "path": meta.get("path")}}
    r = requests.post(f"{memory_url}/store", json=payload, timeout=30)
    r.raise_for_status()
    return r.json()

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--root", required=True, help="Corpus root (e.g. /srv/ai/corpus)")
    ap.add_argument("--memory", required=False, default=None, help="Memory API URL (e.g. http://127.0.0.1:8085)")
    ap.add_argument("--collection", default="chat-memory")
    args = ap.parse_args()
    # Optional: read the memory URL from sources.yml
    if not args.memory:
        conf = pathlib.Path(__file__).with_name("sources.yml")
        if conf.exists():
            import yaml
            cfg = yaml.safe_load(conf.read_text()) or {}
            args.memory = cfg.get("memory", {}).get("url")
    if not args.memory:
        print("Please pass --memory <URL> or configure it in sources.yml.", file=sys.stderr)
        sys.exit(1)
    for p in iter_texts(args.root):
        try:
            text = p.read_text(errors="ignore")
            meta = {"path": str(p), "source": "crawler"}
            store(args.memory, args.collection, text, meta)
            print("✔ stored", p)
        except Exception as e:
            print("✖ failed", p, e, file=sys.stderr)

if __name__ == "__main__":
    main()
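
The memory service itself is not part of this commit; the ingester only assumes an HTTP endpoint at POST {memory}/store that accepts a JSON body with "text" and "metadata" and returns JSON. The following is a minimal stand-in for local testing under that assumption; the real memory API may behave differently.

#!/usr/bin/env python3
# Minimal stand-in for the /store endpoint used by the ingest script above.
# It only mirrors what the client sends and returns a small JSON acknowledgement
# so that r.json() on the client side works; it does not persist anything.
import json
from http.server import BaseHTTPRequestHandler, HTTPServer

class StoreHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        if self.path != "/store":
            self.send_error(404)
            return
        length = int(self.headers.get("Content-Length", 0))
        payload = json.loads(self.rfile.read(length) or b"{}")
        body = json.dumps({"ok": True, "chars": len(payload.get("text", ""))}).encode()
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

if __name__ == "__main__":
    HTTPServer(("127.0.0.1", 8085), StoreHandler).serve_forever()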

View File

@@ -0,0 +1,254 @@
#!/usr/bin/env python3
import asyncio, aiohttp
import os, time, random, hashlib, json, re, pathlib
from urllib.parse import urljoin, urldefrag, urlparse
from bs4 import BeautifulSoup
import yaml, tldextract, ssl

# uvloop is optional; fall back to the default event loop if it is not installed.
try:
    import uvloop
    uvloop.install()
except Exception:
    pass
# ---- Load config ----
BASE = os.environ.get("RAG_CRAWLER_BASE", os.getcwd())
CONF_PATH = os.path.join(BASE, "crawler", "sources.yml")
with open(CONF_PATH, "r") as f:
    CFG = yaml.safe_load(f)

POLICY = CFG.get("policy", {})
STORAGE = CFG.get("storage", {})
MEMORY = CFG.get("memory", {})
SEEDS = CFG.get("seeds", [])

ROOT = pathlib.Path(STORAGE.get("root", "/srv/ai/corpus")).resolve()
TEXT_DIR = ROOT / STORAGE.get("text_subdir", "text")
PDF_DIR = ROOT / STORAGE.get("pdf_subdir", "pdf")
TEXT_DIR.mkdir(parents=True, exist_ok=True)
PDF_DIR.mkdir(parents=True, exist_ok=True)

STATE_PATH = ROOT / ".crawler_state.json"
STATE = {"visited": {}}  # url -> {etag, last_modified, ts}
if STATE_PATH.exists():
    try:
        STATE = json.loads(STATE_PATH.read_text())
    except Exception:
        pass

def save_state():
    try:
        STATE_PATH.write_text(json.dumps(STATE))
    except Exception:
        pass
# ---- Robots & per-domain quotas ----
ROBOTS_CACHE = {}
DOMAIN_NEXT_ALLOWED = {}

def domain_key(url):
    ext = tldextract.extract(url)
    return f"{ext.domain}.{ext.suffix}"

async def fetch_robots(session, base_url):
    dom = domain_key(base_url)
    if dom in ROBOTS_CACHE:
        return ROBOTS_CACHE[dom]
    robots_url = urljoin(f"{urlparse(base_url).scheme}://{urlparse(base_url).netloc}", "/robots.txt")
    from robotexclusionrulesparser import RobotExclusionRulesParser as Robots
    rp = Robots()
    try:
        async with session.get(robots_url, timeout=10) as r:
            if r.status == 200:
                rp.parse(await r.text())
            else:
                rp.parse("")
    except Exception:
        rp.parse("")
    ROBOTS_CACHE[dom] = rp
    return rp

def polite_delay_for(url):
    dmin = int(POLICY.get("delay_min_seconds", 5))
    dmax = int(POLICY.get("delay_max_seconds", 60))
    d = domain_key(url)
    t = DOMAIN_NEXT_ALLOWED.get(d, 0)
    now = time.time()
    if now < t:
        return max(0, t - now)
    # Reserve the next allowed slot (random delay); the actual sleep happens in fetch().
    DOMAIN_NEXT_ALLOWED[d] = now + random.uniform(dmin, dmax)
    return 0
def norm_url(base, link):
    href = urljoin(base, link)
    href, _ = urldefrag(href)
    return href

def fnmatch(text, pat):
    # Glob-style URL matching: "**" spans path separators, "*" stays within one segment.
    # Escape regex metacharacters first so literal dots etc. in patterns stay literal,
    # and so the "**" expansion is not mangled by the "*" expansion.
    pat = re.escape(pat).replace(r"\*\*", ".*").replace(r"\*", "[^/]*")
    return re.fullmatch(pat, text) is not None

def allowed_by_patterns(url, inc, exc):
    ok_inc = True if not inc else any(fnmatch(url, pat) for pat in inc)
    ok_exc = any(fnmatch(url, pat) for pat in exc) if exc else False
    return ok_inc and not ok_exc
def should_revisit(url, revisit_str):
    info = STATE["visited"].get(url, {})
    if not info:
        return True
    try:
        days = int(revisit_str.rstrip("d"))
    except Exception:
        days = 30
    last_ts = info.get("ts", 0)
    return (time.time() - last_ts) > days * 86400

async def fetch(session, url, etag=None, lastmod=None):
    headers = {"User-Agent": POLICY.get("user_agent", "polite-crawler/1.0")}
    if etag:
        headers["If-None-Match"] = etag
    if lastmod:
        headers["If-Modified-Since"] = lastmod
    ssl_ctx = ssl.create_default_context()
    try:
        delay = polite_delay_for(url)
        if delay > 0:
            await asyncio.sleep(delay)
        async with session.get(url, headers=headers, ssl=ssl_ctx, timeout=30) as r:
            if r.status == 304:
                return None, {"status": 304, "headers": {}}
            body = await r.read()
            return body, {"status": r.status, "headers": dict(r.headers)}
    except Exception as e:
        return None, {"status": "error", "error": str(e)}

def save_binary(path: pathlib.Path, content: bytes):
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(content)

def save_text(path: pathlib.Path, text: str):
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(text)

def is_pdf(headers):
    ct = headers.get("Content-Type", "").lower()
    return "application/pdf" in ct or ct.endswith("/pdf")

def extract_text_html(body: bytes) -> str:
    soup = BeautifulSoup(body, "lxml")
    for tag in soup(["script", "style", "noscript", "nav", "footer", "header", "aside"]):
        tag.decompose()
    text = soup.get_text("\n")
    return "\n".join(line.strip() for line in text.splitlines() if line.strip())

def path_for(url, typ="text"):
    h = hashlib.sha256(url.encode()).hexdigest()[:16]
    if typ == "text":
        return TEXT_DIR / f"{h}.txt"
    return PDF_DIR / f"{h}.pdf"
async def crawl_seed(session, seed, budget=0):
    base = seed["url"]
    include = seed.get("include", [])
    exclude = seed.get("exclude", [])
    revisit = seed.get("revisit", "30d")
    # robots.txt check for the seed itself
    if POLICY.get("obey_robots_txt", True):
        rp = await fetch_robots(session, base)
        if not rp.is_allowed("*", base):
            return
    queue = [base]
    seen = set()
    processed = 0
    while queue:
        url = queue.pop(0)
        if url in seen:
            continue
        seen.add(url)
        if POLICY.get("obey_robots_txt", True):
            rp = await fetch_robots(session, url)
            if not rp.is_allowed("*", url):
                continue
        if not allowed_by_patterns(url, include, exclude):
            continue
        info = STATE["visited"].get(url, {})
        etag = info.get("etag")
        lastmod = info.get("last_modified")
        if not should_revisit(url, revisit):
            continue
        body, meta = await fetch(session, url, etag, lastmod)
        status = meta.get("status")
        headers = meta.get("headers", {})
        if status == 304:
            STATE["visited"][url] = {"etag": etag, "last_modified": lastmod, "ts": time.time()}
            save_state()
            continue
        if status != 200 or body is None:
            continue
        if is_pdf(headers):
            out_pdf = path_for(url, "pdf")
            save_binary(out_pdf, body)
            # Rough text extraction (best effort)
            try:
                from pdfminer.high_level import extract_text as pdf_extract
                txt = pdf_extract(str(out_pdf))
                save_text(path_for(url, "text"), txt)
            except Exception:
                pass
        else:
            txt = extract_text_html(body)
            save_text(path_for(url, "text"), txt)
            # Collect links from HTML pages, restricted to the seed's domain;
            # depth is limited implicitly via revisit/budget.
            soup = BeautifulSoup(body, "lxml")
            for a in soup.find_all("a", href=True):
                href = norm_url(url, a["href"])
                if href.startswith("http") and domain_key(href) == domain_key(base):
                    queue.append(href)
        STATE["visited"][url] = {
            "etag": headers.get("ETag"),
            "last_modified": headers.get("Last-Modified"),
            "ts": time.time(),
        }
        save_state()
        processed += 1
        if budget and processed >= budget:
            break
async def main(mode="update", budget=0):
    con_total = int(POLICY.get("concurrency_total", 4))
    timeout = aiohttp.ClientTimeout(total=120)
    # Certificate verification is handled per request in fetch().
    connector = aiohttp.TCPConnector(limit=con_total)
    async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session:
        if mode == "drip":
            budget = budget or 1
        else:
            budget = 0  # unlimited in update mode
        tasks = [crawl_seed(session, seed, budget=budget) for seed in SEEDS]
        await asyncio.gather(*tasks, return_exceptions=True)

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", choices=["update", "drip"], default="update",
                        help="update=full crawl, drip=very slow with a per-seed budget")
    parser.add_argument("--budget", type=int, default=1, help="URLs per seed (drip mode only)")
    args = parser.parse_args()
    asyncio.run(main(args.mode, args.budget))
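
Both scripts read their settings from crawler/sources.yml, but the file itself is not shown in this excerpt. The sketch below reconstructs the minimal structure from the keys the code actually accesses (policy, storage, memory, seeds); example.org and the concrete values are illustrative placeholders, not the project's real configuration.

# Minimal sources.yml sketch, inferred from the keys read by the crawler and the
# ingest script. The real file in the repo may contain additional options.
import yaml

EXAMPLE_SOURCES_YML = """
policy:
  user_agent: polite-crawler/1.0
  obey_robots_txt: true
  delay_min_seconds: 5
  delay_max_seconds: 60
  concurrency_total: 4
storage:
  root: /srv/ai/corpus
  text_subdir: text
  pdf_subdir: pdf
memory:
  url: http://127.0.0.1:8085
seeds:
  - url: https://example.org/docs/
    include: ["https://example.org/docs/**"]
    exclude: ["https://example.org/docs/private/**"]
    revisit: 30d
"""

cfg = yaml.safe_load(EXAMPLE_SOURCES_YML)
assert cfg["policy"]["obey_robots_txt"] is True
assert cfg["seeds"][0]["revisit"] == "30d"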