feat(agent-core): 增加智能编排与模型工具基础

2026-05-30 00:08:27 +08:00
parent 35b80929b0
commit 7a6c110103
16 changed files with 806 additions and 0 deletions
--- a/agent_core/rag/__init__.py
+++ b/agent_core/rag/__init__.py
@@ -0,0 +1 @@
+
--- a/agent_core/rag/chroma_store.py
+++ b/agent_core/rag/chroma_store.py
@@ -0,0 +1,96 @@
+from pathlib import Path
+
+from django.conf import settings
+
+from agent_core.llm_provider import create_embedding_provider
+
+
+def _client(path: str | Path | None = None):
+    """Return a persistent Chroma client for the given storage location.
+
+    Args:
+        path: Directory of the Chroma store. Any falsy value (``None`` or an
+            empty string) selects ``settings.CHROMA_PATH``.
+
+    Returns:
+        A ``chromadb.PersistentClient`` opened on the resolved path.
+    """
+    # Imported inside the function, so importing this module does not
+    # require chromadb to be installed.
+    import chromadb
+
+    location = path if path else settings.CHROMA_PATH
+    return chromadb.PersistentClient(path=str(location))
+
+
+def _embedding_provider():
+    """Create the embedding provider configured by the Django settings.
+
+    Copies ``EMBEDDING_API_KEY``, ``EMBEDDING_BASE_URL`` and
+    ``EMBEDDING_MODEL`` from ``settings`` into a dict and passes that dict to
+    ``create_embedding_provider``.
+    """
+    setting_names = ("EMBEDDING_API_KEY", "EMBEDDING_BASE_URL", "EMBEDDING_MODEL")
+    config = {name: getattr(settings, name) for name in setting_names}
+    return create_embedding_provider(config)
+
+
+def upsert_chunks(
+    collection: str,
+    chunks: list[dict],
+    store_path: str | Path | None = None,
+) -> None:
+    """Embed *chunks* and write them to a Chroma collection.
+
+    Rows already stored for any ``document_id`` found in *chunks* are deleted
+    before the new rows are written, so re-ingesting a document replaces its
+    previous chunks.
+
+    Args:
+        collection: Name of the Chroma collection (created when missing).
+        chunks: Chunk dicts. Each must provide ``chunk_id``, ``content``,
+            ``scenario_id``, ``document_id``, ``source`` and ``created_at``.
+        store_path: Chroma storage directory; falsy values use the default
+            chosen by ``_client``.
+
+    Raises:
+        KeyError: If a chunk lacks one of the required keys. This is raised
+            before anything is deleted or written.
+    """
+    if not chunks:
+        # Nothing to embed or store; do not hand empty id lists to Chroma.
+        return
+    metadata_keys = ("scenario_id", "document_id", "source", "chunk_id", "created_at")
+    ids = [chunk["chunk_id"] for chunk in chunks]
+    texts = [chunk["content"] for chunk in chunks]
+    metadatas = [
+        # Leave out None values (e.g. a chunk without document_id): Chroma
+        # releases differ on whether None is an accepted metadata value, and
+        # readers use metadata.get(), which yields None for absent keys anyway.
+        {key: chunk[key] for key in metadata_keys if chunk[key] is not None}
+        for chunk in chunks
+    ]
+    # Embed before touching the collection: if the provider fails, the
+    # document's existing chunks must not already have been deleted.
+    embeddings = _embedding_provider().embed_texts(texts)
+    chroma_collection = _client(store_path).get_or_create_collection(collection)
+    document_ids = {
+        chunk["document_id"] for chunk in chunks if chunk.get("document_id") is not None
+    }
+    for document_id in document_ids:
+        chroma_collection.delete(where={"document_id": document_id})
+    chroma_collection.upsert(
+        ids=ids,
+        documents=texts,
+        embeddings=embeddings,
+        metadatas=metadatas,
+    )
+
+
+def query_chunks(
+    scenario_id: str,
+    query: str,
+    collection: str,
+    top_k: int = 5,
+    document_ids: list[int] | None = None,
+    store_path: str | Path | None = None,
+) -> list[dict]:
+    """Return the chunks in a Chroma collection that are nearest to *query*.
+
+    Args:
+        scenario_id: Only chunks whose metadata carries this scenario match.
+        query: Text that is embedded and used as the search vector.
+        collection: Name of the Chroma collection (created when missing).
+        top_k: Maximum number of chunks to return; values <= 0 return ``[]``.
+        document_ids: Optional restriction to these document ids. ``None`` or
+            an empty list means no restriction.
+        store_path: Chroma storage directory; falsy values use the default
+            chosen by ``_client``.
+
+    Returns:
+        One dict per hit, in the order Chroma returned them, with the keys
+        ``scenario_id``, ``document_id``, ``collection``, ``source``,
+        ``chunk_id``, ``content``, ``created_at`` and ``score``.
+    """
+    if top_k <= 0:
+        # Same outcome as the JSON fallback for top_k=0, and avoids passing a
+        # non-positive n_results to Chroma.
+        return []
+    chroma_collection = _client(store_path).get_or_create_collection(collection)
+    where: dict = {"scenario_id": scenario_id}
+    if document_ids:
+        where = {
+            "$and": [
+                {"scenario_id": scenario_id},
+                {"document_id": {"$in": list(document_ids)}},
+            ]
+        }
+    embedding = _embedding_provider().embed_texts([query])[0]
+    result = chroma_collection.query(
+        query_embeddings=[embedding],
+        n_results=top_k,
+        where=where,
+        include=["documents", "metadatas", "distances"],
+    )
+    # A key may be present but set to None; treat that like "no rows".
+    documents = (result.get("documents") or [[]])[0]
+    metadatas = (result.get("metadatas") or [[]])[0]
+    distances = (result.get("distances") or [[]])[0]
+    chunks = []
+    for content, metadata, distance in zip(documents, metadatas, distances):
+        metadata = metadata or {}
+        chunks.append(
+            {
+                "scenario_id": metadata.get("scenario_id"),
+                "document_id": metadata.get("document_id"),
+                "collection": collection,
+                "source": metadata.get("source"),
+                "chunk_id": metadata.get("chunk_id"),
+                "content": content,
+                "created_at": metadata.get("created_at"),
+                # Smaller distance -> larger score; distance 0 maps to 1.0.
+                "score": round(1 / (1 + float(distance)), 4),
+            }
+        )
+    return chunks
--- a/agent_core/rag/ingest.py
+++ b/agent_core/rag/ingest.py
@@ -0,0 +1,116 @@
+import json
+import re
+import importlib.util
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+
+from django.conf import settings
+
+from .chroma_store import upsert_chunks
+
+
+@dataclass
+class IngestResult:
+    """Outcome of one document ingestion attempt."""
+
+    # True when the document was chunked and stored.
+    success: bool
+    # Number of chunks written; left at 0 when ingestion fails.
+    chunks_count: int = 0
+    # Failure description; empty string on success.
+    error: str = ""
+
+
+def _default_store_path() -> Path:
+    """Return the JSON store path: ``rag_store.json`` under ``settings.CHROMA_PATH``."""
+    base_dir = Path(settings.CHROMA_PATH)
+    return base_dir.joinpath("rag_store.json")
+
+
+def _load_store(store_path: Path) -> list[dict]:
+    """Read the JSON chunk store.
+
+    Args:
+        store_path: Location of the JSON file.
+
+    Returns:
+        The parsed JSON content, or an empty list when the file is missing.
+    """
+    if store_path.exists():
+        with store_path.open("r", encoding="utf-8") as handle:
+            return json.load(handle)
+    return []
+
+
+def _save_store(store_path: Path, chunks: list[dict]) -> None:
+    """Write *chunks* to the JSON store, replacing any previous content.
+
+    Missing parent directories are created. The file is UTF-8 with
+    two-space indentation, and non-ASCII characters are written unescaped.
+    """
+    directory = store_path.parent
+    directory.mkdir(parents=True, exist_ok=True)
+    with store_path.open("w", encoding="utf-8") as handle:
+        json.dump(chunks, handle, ensure_ascii=False, indent=2)
+
+
+def _split_text(text: str, chunk_size: int = 800, overlap: int = 120) -> list[str]:
+    """Split *text* into overlapping, whitespace-normalised chunks.
+
+    Every run of whitespace is collapsed to a single space and the result is
+    stripped before slicing.
+
+    Args:
+        text: Raw document text.
+        chunk_size: Maximum number of characters per chunk; must be positive.
+        overlap: Number of characters a chunk shares with its predecessor;
+            must not be negative. When it is >= ``chunk_size`` the window
+            still advances by one character per chunk.
+
+    Returns:
+        The chunks in document order; empty when *text* holds only whitespace.
+
+    Raises:
+        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
+            negative. Such values used to yield empty chunks or silently
+            skip text between chunks.
+    """
+    if chunk_size <= 0:
+        raise ValueError("chunk_size must be positive")
+    if overlap < 0:
+        raise ValueError("overlap must not be negative")
+    normalized = re.sub(r"\s+", " ", text).strip()
+    if not normalized:
+        return []
+    # Always advance by at least one character so the loop terminates even
+    # when overlap >= chunk_size.
+    step = max(chunk_size - overlap, 1)
+    total = len(normalized)
+    chunks = []
+    start = 0
+    while True:
+        end = start + chunk_size
+        chunks.append(normalized[start:end])
+        if end >= total:
+            return chunks
+        start += step
+
+
+def ingest_document(
+    scenario_id: str,
+    source_file: str,
+    text: str,
+    collection: str,
+    document_id: int | None = None,
+    store_path: str | Path | None = None,
+) -> IngestResult:
+    """Chunk a document and store the chunks for later retrieval.
+
+    When ``store_path`` is ``None`` and ``chromadb`` is importable, the work
+    is delegated to ``_ingest_chroma_document``. Otherwise the chunks go to a
+    JSON file, replacing chunks previously stored for the same
+    ``document_id``, ``scenario_id`` and ``collection``.
+
+    Args:
+        scenario_id: Scenario the document belongs to.
+        source_file: Source name recorded on every chunk.
+        text: Document text.
+        collection: Logical collection name.
+        document_id: Optional document identifier.
+        store_path: JSON store location. A falsy non-``None`` value selects
+            the default JSON path.
+
+    Returns:
+        An ``IngestResult``; it reports failure when *text* is blank.
+    """
+    if not text.strip():
+        return IngestResult(success=False, error="文档内容为空")
+
+    use_chroma = store_path is None and importlib.util.find_spec("chromadb") is not None
+    if use_chroma:
+        return _ingest_chroma_document(
+            document_id=document_id,
+            scenario_id=scenario_id,
+            source_file=source_file,
+            text=text,
+            collection=collection,
+        )
+
+    json_path = _default_store_path() if not store_path else Path(store_path)
+    # Chunks matching all three fields belong to an earlier ingestion of this
+    # document and are dropped; everything else is kept.
+    replaced_key = (document_id, scenario_id, collection)
+    kept_chunks = [
+        stored
+        for stored in _load_store(json_path)
+        if (
+            stored.get("document_id"),
+            stored.get("scenario_id"),
+            stored.get("collection"),
+        )
+        != replaced_key
+    ]
+    timestamp = datetime.now(timezone.utc).isoformat()
+    fresh_chunks = [
+        {
+            "scenario_id": scenario_id,
+            "document_id": document_id,
+            "collection": collection,
+            "source": source_file,
+            "chunk_id": f"{scenario_id}:{source_file}:{position}",
+            "content": piece,
+            "created_at": timestamp,
+        }
+        for position, piece in enumerate(_split_text(text), start=1)
+    ]
+    _save_store(json_path, kept_chunks + fresh_chunks)
+    return IngestResult(success=True, chunks_count=len(fresh_chunks))
+
+
+def _ingest_chroma_document(
+    document_id: int | None,
+    scenario_id: str,
+    source_file: str,
+    text: str,
+    collection: str,
+) -> IngestResult:
+    """Chunk *text* and store the chunks in Chroma via ``upsert_chunks``.
+
+    Args:
+        document_id: Optional document identifier; used in the chunk ids when
+            it is not ``None``, otherwise ``source_file`` is used.
+        scenario_id: Scenario the document belongs to.
+        source_file: Source name recorded on every chunk.
+        text: Document text.
+        collection: Name of the Chroma collection.
+
+    Returns:
+        An ``IngestResult``. Any exception raised while storing is reported
+        as a failed result carrying the exception text instead of propagating.
+    """
+    created_at = datetime.now(timezone.utc).isoformat()
+    # Compare against None rather than truthiness so that a document_id of 0
+    # still namespaces the chunk ids instead of falling back to the filename.
+    id_namespace = document_id if document_id is not None else source_file
+    chunks = [
+        {
+            "scenario_id": scenario_id,
+            "document_id": document_id,
+            "collection": collection,
+            "source": source_file,
+            "chunk_id": f"{scenario_id}:{id_namespace}:{index}",
+            "content": chunk_text,
+            "created_at": created_at,
+        }
+        for index, chunk_text in enumerate(_split_text(text), start=1)
+    ]
+    try:
+        upsert_chunks(collection=collection, chunks=chunks)
+    except Exception as exc:
+        # Boundary of the ingestion API: callers receive a result object.
+        return IngestResult(success=False, error=str(exc))
+    return IngestResult(success=True, chunks_count=len(chunks))
--- a/agent_core/rag/retriever.py
+++ b/agent_core/rag/retriever.py
@@ -0,0 +1,69 @@
+import json
+import re
+import importlib.util
+from pathlib import Path
+
+from django.conf import settings
+
+from .chroma_store import query_chunks
+
+
+def _default_store_path() -> Path:
+    """Return the default JSON store file located under ``settings.CHROMA_PATH``."""
+    chroma_dir = Path(settings.CHROMA_PATH)
+    return chroma_dir.joinpath("rag_store.json")
+
+
+def _load_store(store_path: Path) -> list[dict]:
+    """Load the stored chunks from the JSON file at *store_path*.
+
+    Returns an empty list when the file does not exist; otherwise returns the
+    parsed JSON content.
+    """
+    if store_path.exists():
+        with store_path.open("r", encoding="utf-8") as handle:
+            return json.load(handle)
+    return []
+
+
+def _tokens(text: str) -> set[str]:
+    """Extract the set of match tokens from *text*.
+
+    The text is lower-cased, then three kinds of tokens are collected:
+    runs of ``a-z``, ``0-9`` and ``_``; runs of two or more characters in the
+    range U+4E00..U+9FFF; and every single character in that range.
+    """
+    normalized = text.lower()
+    words = re.findall(r"[a-z0-9_]+", normalized)
+    cjk_runs = re.findall(r"[\u4e00-\u9fff]{2,}", normalized)
+    cjk_chars = re.findall(r"[\u4e00-\u9fff]", normalized)
+    return {*words, *cjk_runs, *cjk_chars}
+
+
+def _score(query_tokens: set[str], content: str) -> float:
+    """Return the fraction of *query_tokens* that also occur in *content*.
+
+    The result is rounded to four decimals. It is ``0.0`` when either the
+    query or the content yields no tokens.
+    """
+    candidate_tokens = _tokens(content)
+    if query_tokens and candidate_tokens:
+        shared = query_tokens.intersection(candidate_tokens)
+        return round(len(shared) / len(query_tokens), 4)
+    return 0.0
+
+
+def retrieve(
+    scenario_id: str,
+    query: str,
+    collection: str,
+    top_k: int = 5,
+    document_ids: list[int] | None = None,
+    store_path: str | Path | None = None,
+) -> list[dict]:
+    """Return the stored chunks that best match *query*.
+
+    When ``store_path`` is ``None`` and ``chromadb`` is importable, the
+    lookup is delegated to ``query_chunks``. Otherwise the JSON store is
+    scanned and chunks are ranked by token overlap with the query.
+
+    Args:
+        scenario_id: Only chunks of this scenario are considered.
+        query: Search text.
+        collection: Only chunks of this collection are considered.
+        top_k: Maximum number of chunks to return; values <= 0 return ``[]``.
+        document_ids: Optional restriction to these document ids. ``None`` or
+            an empty list means no restriction.
+        store_path: JSON store location. A falsy non-``None`` value selects
+            the default JSON path.
+
+    Returns:
+        Chunk dicts with an added ``score`` key, best match first.
+    """
+    if top_k <= 0:
+        # A negative top_k would otherwise slice from the end of the ranking
+        # (dropping the weakest hits instead of returning nothing), and a
+        # non-positive value would be forwarded to the Chroma query.
+        return []
+    if store_path is None and importlib.util.find_spec("chromadb") is not None:
+        return query_chunks(
+            scenario_id=scenario_id,
+            query=query,
+            collection=collection,
+            top_k=top_k,
+            document_ids=document_ids,
+        )
+    resolved_store_path = Path(store_path) if store_path else _default_store_path()
+    query_tokens = _tokens(query)
+    allowed_document_ids = set(document_ids or [])
+    scored_chunks = []
+    for chunk in _load_store(resolved_store_path):
+        if chunk.get("scenario_id") != scenario_id:
+            continue
+        if chunk.get("collection") != collection:
+            continue
+        if allowed_document_ids and chunk.get("document_id") not in allowed_document_ids:
+            continue
+        score = _score(query_tokens, chunk.get("content", ""))
+        if score <= 0:
+            # Chunks sharing no token with the query are not returned.
+            continue
+        scored_chunks.append({**chunk, "score": score})
+    # The sort is stable, so equally scored chunks keep their stored order.
+    scored_chunks.sort(key=lambda item: item["score"], reverse=True)
+    return scored_chunks[:top_k]