feat(agent-core): 增加智能编排与模型工具基础
This commit is contained in:
1
agent_core/rag/__init__.py
Normal file
1
agent_core/rag/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
96
agent_core/rag/chroma_store.py
Normal file
96
agent_core/rag/chroma_store.py
Normal file
@@ -0,0 +1,96 @@
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from agent_core.llm_provider import create_embedding_provider
|
||||
|
||||
|
||||
def _client(path: str | Path | None = None):
    """Open a persistent Chroma client rooted at a directory.

    Args:
        path: Directory of the Chroma store. Any falsy value (``None`` or an
            empty string) falls back to ``settings.CHROMA_PATH``.

    Returns:
        The object returned by ``chromadb.PersistentClient``.
    """
    # Imported inside the function so that importing this module does not
    # itself require chromadb to be installed.
    import chromadb

    store_dir = path if path else settings.CHROMA_PATH
    return chromadb.PersistentClient(path=str(store_dir))
|
||||
|
||||
|
||||
def _embedding_provider():
    """Build the embedding provider from the ``EMBEDDING_*`` Django settings.

    Returns:
        Whatever ``create_embedding_provider`` returns for a config dict
        holding the API key, base URL and model name read from ``settings``.
    """
    setting_names = ("EMBEDDING_API_KEY", "EMBEDDING_BASE_URL", "EMBEDDING_MODEL")
    provider_config = {name: getattr(settings, name) for name in setting_names}
    return create_embedding_provider(provider_config)
|
||||
|
||||
|
||||
def upsert_chunks(
|
||||
collection: str,
|
||||
chunks: list[dict],
|
||||
store_path: str | Path | None = None,
|
||||
) -> None:
|
||||
client = _client(store_path)
|
||||
chroma_collection = client.get_or_create_collection(collection)
|
||||
document_ids = {chunk["document_id"] for chunk in chunks if chunk.get("document_id") is not None}
|
||||
for document_id in document_ids:
|
||||
chroma_collection.delete(where={"document_id": document_id})
|
||||
texts = [chunk["content"] for chunk in chunks]
|
||||
embeddings = _embedding_provider().embed_texts(texts)
|
||||
chroma_collection.upsert(
|
||||
ids=[chunk["chunk_id"] for chunk in chunks],
|
||||
documents=texts,
|
||||
embeddings=embeddings,
|
||||
metadatas=[
|
||||
{
|
||||
"scenario_id": chunk["scenario_id"],
|
||||
"document_id": chunk["document_id"],
|
||||
"source": chunk["source"],
|
||||
"chunk_id": chunk["chunk_id"],
|
||||
"created_at": chunk["created_at"],
|
||||
}
|
||||
for chunk in chunks
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
def query_chunks(
|
||||
scenario_id: str,
|
||||
query: str,
|
||||
collection: str,
|
||||
top_k: int = 5,
|
||||
document_ids: list[int] | None = None,
|
||||
store_path: str | Path | None = None,
|
||||
) -> list[dict]:
|
||||
client = _client(store_path)
|
||||
chroma_collection = client.get_or_create_collection(collection)
|
||||
where: dict = {"scenario_id": scenario_id}
|
||||
if document_ids:
|
||||
where = {
|
||||
"$and": [
|
||||
{"scenario_id": scenario_id},
|
||||
{"document_id": {"$in": document_ids}},
|
||||
]
|
||||
}
|
||||
embedding = _embedding_provider().embed_texts([query])[0]
|
||||
result = chroma_collection.query(
|
||||
query_embeddings=[embedding],
|
||||
n_results=top_k,
|
||||
where=where,
|
||||
include=["documents", "metadatas", "distances"],
|
||||
)
|
||||
chunks = []
|
||||
documents = result.get("documents", [[]])[0]
|
||||
metadatas = result.get("metadatas", [[]])[0]
|
||||
distances = result.get("distances", [[]])[0]
|
||||
for content, metadata, distance in zip(documents, metadatas, distances):
|
||||
chunks.append(
|
||||
{
|
||||
"scenario_id": metadata.get("scenario_id"),
|
||||
"document_id": metadata.get("document_id"),
|
||||
"collection": collection,
|
||||
"source": metadata.get("source"),
|
||||
"chunk_id": metadata.get("chunk_id"),
|
||||
"content": content,
|
||||
"created_at": metadata.get("created_at"),
|
||||
"score": round(1 / (1 + float(distance)), 4),
|
||||
}
|
||||
)
|
||||
return chunks
|
||||
116
agent_core/rag/ingest.py
Normal file
116
agent_core/rag/ingest.py
Normal file
@@ -0,0 +1,116 @@
|
||||
import json
|
||||
import re
|
||||
import importlib.util
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from .chroma_store import upsert_chunks
|
||||
|
||||
|
||||
@dataclass
class IngestResult:
    """Outcome of a document ingestion attempt."""

    # True when the chunks were stored, False when ingestion was rejected or
    # the store raised.
    success: bool
    # Number of chunks written; left at 0 for failed ingestions.
    chunks_count: int = 0
    # Error description for failed ingestions; empty on success.
    error: str = ""
|
||||
|
||||
|
||||
def _default_store_path() -> Path:
    """Return the fallback JSON store location under ``settings.CHROMA_PATH``."""
    base_dir = Path(settings.CHROMA_PATH)
    return base_dir.joinpath("rag_store.json")
|
||||
|
||||
|
||||
def _load_store(store_path: Path) -> list[dict]:
    """Read every chunk from the JSON store file.

    Args:
        store_path: Location of the JSON store.

    Returns:
        The decoded JSON content, or an empty list when the file is missing.

    Raises:
        json.JSONDecodeError: If the file exists but does not hold valid JSON.
    """
    try:
        # Open directly instead of testing exists() first: the file could
        # vanish between the check and the open.
        with store_path.open("r", encoding="utf-8") as file:
            return json.load(file)
    except (FileNotFoundError, NotADirectoryError):
        # A missing file (or a missing/non-directory parent) means the store
        # has not been created yet.
        return []
|
||||
|
||||
|
||||
def _save_store(store_path: Path, chunks: list[dict]) -> None:
    """Write all chunks to the JSON store, replacing its previous content.

    The data is serialized before any file is touched and is then written to
    a sibling temporary file that replaces the store in one step. A chunk
    that cannot be serialized, or an interrupted write, therefore leaves the
    existing store intact instead of truncated.

    Args:
        store_path: Destination of the JSON store; missing parent directories
            are created.
        chunks: Complete list of chunk dicts to persist.

    Raises:
        TypeError: If a chunk contains a value that JSON cannot represent.
    """
    payload = json.dumps(chunks, ensure_ascii=False, indent=2)
    store_path.parent.mkdir(parents=True, exist_ok=True)
    temp_path = store_path.with_name(f"{store_path.name}.tmp")
    temp_path.write_text(payload, encoding="utf-8")
    # Path.replace overwrites the destination in a single rename.
    temp_path.replace(store_path)
|
||||
|
||||
|
||||
def _split_text(text: str, chunk_size: int = 800, overlap: int = 120) -> list[str]:
    """Split text into overlapping fixed-size character windows.

    Whitespace runs are collapsed to single spaces and the text is stripped
    before splitting.

    Args:
        text: Raw text to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        The chunks in order; an empty list when the text is blank.
    """
    cleaned = re.sub(r"\s+", " ", text).strip()
    if not cleaned:
        return []

    total = len(cleaned)
    # Each window starts chunk_size - overlap characters after the previous
    # one, but always advances by at least one character.
    stride = max(chunk_size - overlap, 1)
    pieces = []
    for offset in range(0, total, stride):
        pieces.append(cleaned[offset:offset + chunk_size])
        if offset + chunk_size >= total:
            # This window already reaches the end of the text.
            break
    return pieces
|
||||
|
||||
|
||||
def ingest_document(
    scenario_id: str,
    source_file: str,
    text: str,
    collection: str,
    document_id: int | None = None,
    store_path: str | Path | None = None,
) -> IngestResult:
    """Split a document into chunks and store them for retrieval.

    When no ``store_path`` is given and chromadb is importable, the work is
    delegated to ``_ingest_chroma_document``. Otherwise the chunks go into a
    JSON file, replacing the chunks stored earlier for the same document.

    Args:
        scenario_id: Scenario the document belongs to.
        source_file: Source name recorded on every chunk and used in chunk ids.
        text: Document text; blank text is rejected.
        collection: Collection name recorded on every chunk.
        document_id: Optional document identifier.
        store_path: JSON store file to use. A falsy non-None value selects the
            JSON store at its default location.

    Returns:
        An ``IngestResult`` with the number of stored chunks, or a failed
        result when the text is blank.
    """
    if not text.strip():
        return IngestResult(success=False, error="文档内容为空")
    if store_path is None and importlib.util.find_spec("chromadb") is not None:
        return _ingest_chroma_document(document_id, scenario_id, source_file, text, collection)

    resolved_store_path = Path(store_path) if store_path else _default_store_path()

    def _is_replaced(chunk: dict) -> bool:
        """Tell whether a stored chunk belongs to the document being ingested."""
        if chunk.get("scenario_id") != scenario_id:
            return False
        if chunk.get("collection") != collection:
            return False
        if chunk.get("document_id") != document_id:
            return False
        # Without a document id, the source file is the only thing that
        # distinguishes documents; matching on the id alone would wipe every
        # other id-less document of this scenario and collection.
        return document_id is not None or chunk.get("source") == source_file

    existing_chunks = [
        chunk for chunk in _load_store(resolved_store_path) if not _is_replaced(chunk)
    ]

    created_at = datetime.now(timezone.utc).isoformat()
    new_chunks = [
        {
            "scenario_id": scenario_id,
            "document_id": document_id,
            "collection": collection,
            "source": source_file,
            "chunk_id": f"{scenario_id}:{source_file}:{index}",
            "content": chunk_text,
            "created_at": created_at,
        }
        for index, chunk_text in enumerate(_split_text(text), start=1)
    ]
    _save_store(resolved_store_path, [*existing_chunks, *new_chunks])
    return IngestResult(success=True, chunks_count=len(new_chunks))
|
||||
|
||||
|
||||
def _ingest_chroma_document(
    document_id: int | None,
    scenario_id: str,
    source_file: str,
    text: str,
    collection: str,
) -> IngestResult:
    """Chunk a document and store it through ``upsert_chunks``.

    Chunk ids are built from the scenario id, the document id (or the source
    file name when the document id is falsy) and the 1-based chunk position.

    Returns:
        A successful ``IngestResult`` with the chunk count, or a failed one
        carrying the message of whatever exception ``upsert_chunks`` raised.
    """
    created_at = datetime.now(timezone.utc).isoformat()

    records = []
    for index, chunk_text in enumerate(_split_text(text), start=1):
        records.append(
            {
                "scenario_id": scenario_id,
                "document_id": document_id,
                "collection": collection,
                "source": source_file,
                "chunk_id": f"{scenario_id}:{document_id or source_file}:{index}",
                "content": chunk_text,
                "created_at": created_at,
            }
        )

    try:
        upsert_chunks(collection=collection, chunks=records)
    except Exception as exc:
        # Store failures are reported through the result instead of raised.
        return IngestResult(success=False, error=str(exc))
    else:
        return IngestResult(success=True, chunks_count=len(records))
|
||||
69
agent_core/rag/retriever.py
Normal file
69
agent_core/rag/retriever.py
Normal file
@@ -0,0 +1,69 @@
|
||||
import json
|
||||
import re
|
||||
import importlib.util
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from .chroma_store import query_chunks
|
||||
|
||||
|
||||
def _default_store_path() -> Path:
    """Return the path of ``rag_store.json`` inside ``settings.CHROMA_PATH``."""
    store_dir = Path(settings.CHROMA_PATH)
    return store_dir.joinpath("rag_store.json")
|
||||
|
||||
|
||||
def _load_store(store_path: Path) -> list[dict]:
    """Load all stored chunks from the JSON store file.

    Args:
        store_path: Location of the JSON store.

    Returns:
        The decoded JSON content, or an empty list when the file is missing.

    Raises:
        json.JSONDecodeError: If the file exists but does not hold valid JSON.
    """
    try:
        # Attempt the open rather than checking exists() beforehand, which
        # would leave a window in which the file can disappear.
        with store_path.open("r", encoding="utf-8") as file:
            return json.load(file)
    except (FileNotFoundError, NotADirectoryError):
        # No store file (or no usable parent directory) yet: nothing stored.
        return []
|
||||
|
||||
|
||||
def _tokens(text: str) -> set[str]:
    """Extract the set of lexical-matching tokens from text.

    The text is lower-cased, then three kinds of tokens are collected: runs
    of ``a-z``, digits and underscores; runs of two or more characters in the
    U+4E00..U+9FFF range; and every single character in that range.
    """
    lowered = text.lower()
    found: set[str] = set()
    found.update(re.findall(r"[a-z0-9_]+", lowered))
    found.update(re.findall(r"[\u4e00-\u9fff]{2,}", lowered))
    # Single characters let a query match content that shares only part of
    # a longer run.
    found.update(char for char in lowered if "\u4e00" <= char <= "\u9fff")
    return found
|
||||
|
||||
|
||||
def _score(query_tokens: set[str], content: str) -> float:
    """Return the fraction of query tokens that occur in the content.

    Args:
        query_tokens: Tokens of the query.
        content: Chunk text, tokenized with ``_tokens``.

    Returns:
        A value rounded to four decimals; ``0.0`` when either token set is
        empty.
    """
    content_tokens = _tokens(content)
    if query_tokens and content_tokens:
        shared_count = len(query_tokens & content_tokens)
        return round(shared_count / len(query_tokens), 4)
    return 0.0
|
||||
|
||||
|
||||
def retrieve(
|
||||
scenario_id: str,
|
||||
query: str,
|
||||
collection: str,
|
||||
top_k: int = 5,
|
||||
document_ids: list[int] | None = None,
|
||||
store_path: str | Path | None = None,
|
||||
) -> list[dict]:
|
||||
if store_path is None and importlib.util.find_spec("chromadb") is not None:
|
||||
return query_chunks(
|
||||
scenario_id=scenario_id,
|
||||
query=query,
|
||||
collection=collection,
|
||||
top_k=top_k,
|
||||
document_ids=document_ids,
|
||||
)
|
||||
resolved_store_path = Path(store_path) if store_path else _default_store_path()
|
||||
query_tokens = _tokens(query)
|
||||
allowed_document_ids = set(document_ids or [])
|
||||
scored_chunks = []
|
||||
for chunk in _load_store(resolved_store_path):
|
||||
if chunk.get("scenario_id") != scenario_id:
|
||||
continue
|
||||
if chunk.get("collection") != collection:
|
||||
continue
|
||||
if allowed_document_ids and chunk.get("document_id") not in allowed_document_ids:
|
||||
continue
|
||||
score = _score(query_tokens, chunk.get("content", ""))
|
||||
if score <= 0:
|
||||
continue
|
||||
scored_chunks.append({**chunk, "score": score})
|
||||
return sorted(scored_chunks, key=lambda item: item["score"], reverse=True)[:top_k]
|
||||
Reference in New Issue
Block a user