feat(rag): add document parsing structures

This commit is contained in:
2026-05-21 23:20:09 +08:00
parent 2ab02fb574
commit 1de773405f
38 changed files with 1240 additions and 9 deletions

53
script/sql/rag_chunk.sql Normal file
View File

@@ -0,0 +1,53 @@
-- RAG knowledge chunk table (rag_chunk).
-- Destructive: any existing rag_chunk table is dropped before being re-created.
-- Prerequisite: rag_store and rag_document must already exist, because the
-- foreign keys at the bottom of the definition reference them.
-- NOTE(review): rag_chunk_embedding declares a foreign key to rag_chunk (id).
-- PostgreSQL rejects a DROP TABLE without CASCADE while such a constraint
-- exists, so drop rag_chunk_embedding first when re-running this script.
DROP TABLE IF EXISTS rag_chunk;

CREATE TABLE rag_chunk (
    -- Identity and ownership
    id             BIGSERIAL PRIMARY KEY,
    store_id       BIGINT NOT NULL,                     -- knowledge base ID
    document_id    BIGINT NOT NULL,                     -- document ID
    chunk_index    INTEGER NOT NULL,                    -- chunk sequence number within the document

    -- Chunk payload and its position inside the document
    chunk_content  TEXT NOT NULL,
    chunk_summary  VARCHAR(1000) DEFAULT '',
    token_count    INTEGER,
    page_number    INTEGER,
    section_title  VARCHAR(255) DEFAULT '',
    heading_path   VARCHAR(1000) DEFAULT '',
    vector_id      VARCHAR(128),
    metadata_json  JSONB NOT NULL DEFAULT '{}'::jsonb,  -- chunk-level extended metadata

    -- Status and audit columns (the timestamps have no default; writers supply them)
    enabled        BOOLEAN NOT NULL DEFAULT TRUE,
    version        INTEGER NOT NULL DEFAULT 1,
    create_time    TIMESTAMP,
    update_time    TIMESTAMP,
    remark         VARCHAR(500) DEFAULT '',
    create_by      VARCHAR(64),
    update_by      VARCHAR(64),

    -- A document cannot hold two chunks with the same sequence number.
    CONSTRAINT uk_rag_chunk_document_index
        UNIQUE (document_id, chunk_index),
    CONSTRAINT fk_rag_chunk_store_id
        FOREIGN KEY (store_id) REFERENCES rag_store (id),
    CONSTRAINT fk_rag_chunk_document_id
        FOREIGN KEY (document_id) REFERENCES rag_document (id)
);
-- Secondary indexes for rag_chunk.
-- There is deliberately no standalone index on document_id: the constraint
-- uk_rag_chunk_document_index UNIQUE (document_id, chunk_index) is backed by a
-- B-tree index whose leading column is document_id, so it already serves
-- lookups by document_id. A second index on the same column would only add
-- write and storage overhead.
CREATE INDEX idx_rag_chunk_store_id
    ON rag_chunk (store_id);
CREATE INDEX idx_rag_chunk_enabled
    ON rag_chunk (enabled);
CREATE INDEX idx_rag_chunk_vector_id
    ON rag_chunk (vector_id);
-- GIN index over the JSONB metadata column.
CREATE INDEX idx_rag_chunk_metadata_json
    ON rag_chunk USING GIN (metadata_json);
-- Catalog comments for rag_chunk. The quoted text is what gets stored in the
-- database; the trailing remarks are English glosses for maintainers.
COMMENT ON TABLE  rag_chunk               IS 'RAG知识切片表';      -- RAG knowledge chunk table

COMMENT ON COLUMN rag_chunk.id            IS 'ID';
COMMENT ON COLUMN rag_chunk.store_id      IS '知识库ID';           -- knowledge base ID
COMMENT ON COLUMN rag_chunk.document_id   IS '文档ID';             -- document ID
COMMENT ON COLUMN rag_chunk.chunk_index   IS '文档内切片序号';     -- chunk sequence number within the document
COMMENT ON COLUMN rag_chunk.chunk_content IS '切片内容';           -- chunk content
COMMENT ON COLUMN rag_chunk.chunk_summary IS '切片摘要';           -- chunk summary
COMMENT ON COLUMN rag_chunk.token_count   IS 'Token数量';          -- token count
COMMENT ON COLUMN rag_chunk.page_number   IS '页码';               -- page number
COMMENT ON COLUMN rag_chunk.section_title IS '章节标题';           -- section title
COMMENT ON COLUMN rag_chunk.heading_path  IS '标题路径';           -- heading path
COMMENT ON COLUMN rag_chunk.vector_id     IS '向量ID';             -- vector ID
COMMENT ON COLUMN rag_chunk.metadata_json IS '切片级扩展元数据';   -- chunk-level extended metadata
COMMENT ON COLUMN rag_chunk.enabled       IS '是否启用';           -- whether enabled
COMMENT ON COLUMN rag_chunk.version       IS '版本';               -- version
COMMENT ON COLUMN rag_chunk.create_time   IS '创建时间';           -- creation time
COMMENT ON COLUMN rag_chunk.update_time   IS '更新时间';           -- update time
COMMENT ON COLUMN rag_chunk.remark        IS '备注';               -- remark
COMMENT ON COLUMN rag_chunk.create_by     IS '创建者';             -- creator
COMMENT ON COLUMN rag_chunk.update_by     IS '更新者';             -- updater

View File

@@ -0,0 +1,50 @@
-- RAG chunk vector table (rag_chunk_embedding).
-- The "vector" extension is installed first because the VECTOR column type
-- used below is provided by it.
CREATE EXTENSION IF NOT EXISTS vector;

-- Destructive: any existing rag_chunk_embedding table is dropped before being
-- re-created.
-- Prerequisite: rag_store, rag_document and rag_chunk must already exist,
-- because the foreign keys at the bottom of the definition reference them.
DROP TABLE IF EXISTS rag_chunk_embedding;

CREATE TABLE rag_chunk_embedding (
    -- Identity and ownership
    id                   BIGSERIAL PRIMARY KEY,
    store_id             BIGINT NOT NULL,                 -- knowledge base ID
    document_id          BIGINT NOT NULL,                 -- document ID
    chunk_id             BIGINT NOT NULL,                 -- chunk ID

    -- Embedding payload
    -- NOTE(review): the column type fixes embedding at 1024 dimensions, while
    -- embedding_dimension is a plain integer; no constraint here ties the two
    -- together, so keeping them consistent is left to the writer.
    embedding_model      VARCHAR(100) NOT NULL,
    embedding_dimension  INTEGER NOT NULL DEFAULT 1024,
    embedding            VECTOR(1024) NOT NULL,
    content_hash         VARCHAR(64),                     -- hash of the content the vector was generated from

    -- Status and audit columns (the timestamps have no default; writers supply them)
    enabled              BOOLEAN NOT NULL DEFAULT TRUE,
    version              INTEGER NOT NULL DEFAULT 1,
    create_time          TIMESTAMP,
    update_time          TIMESTAMP,
    remark               VARCHAR(500) DEFAULT '',
    create_by            VARCHAR(64),
    update_by            VARCHAR(64),

    -- At most one embedding per chunk for a given embedding model.
    CONSTRAINT uk_rag_chunk_embedding_chunk_model
        UNIQUE (chunk_id, embedding_model),
    CONSTRAINT fk_rag_chunk_embedding_store_id
        FOREIGN KEY (store_id) REFERENCES rag_store (id),
    CONSTRAINT fk_rag_chunk_embedding_document_id
        FOREIGN KEY (document_id) REFERENCES rag_document (id),
    CONSTRAINT fk_rag_chunk_embedding_chunk_id
        FOREIGN KEY (chunk_id) REFERENCES rag_chunk (id)
);
-- Secondary indexes for rag_chunk_embedding.
-- There is deliberately no standalone index on chunk_id: the constraint
-- uk_rag_chunk_embedding_chunk_model UNIQUE (chunk_id, embedding_model) is
-- backed by a B-tree index whose leading column is chunk_id, so it already
-- serves lookups by chunk_id. A second index on the same column would only
-- add write and storage overhead.
CREATE INDEX idx_rag_chunk_embedding_store_id
    ON rag_chunk_embedding (store_id);
CREATE INDEX idx_rag_chunk_embedding_document_id
    ON rag_chunk_embedding (document_id);
CREATE INDEX idx_rag_chunk_embedding_model
    ON rag_chunk_embedding (embedding_model);
CREATE INDEX idx_rag_chunk_embedding_enabled
    ON rag_chunk_embedding (enabled);
-- HNSW index on the embedding column using the cosine operator class
-- (vector_cosine_ops).
CREATE INDEX idx_rag_chunk_embedding_vector_hnsw
    ON rag_chunk_embedding USING hnsw (embedding vector_cosine_ops);
-- Catalog comments for rag_chunk_embedding. The quoted text is what gets
-- stored in the database; the trailing remarks are English glosses for
-- maintainers.
COMMENT ON TABLE  rag_chunk_embedding                     IS 'RAG切片向量表';      -- RAG chunk vector table

COMMENT ON COLUMN rag_chunk_embedding.id                  IS 'ID';
COMMENT ON COLUMN rag_chunk_embedding.store_id            IS '知识库ID';           -- knowledge base ID
COMMENT ON COLUMN rag_chunk_embedding.document_id         IS '文档ID';             -- document ID
COMMENT ON COLUMN rag_chunk_embedding.chunk_id            IS '切片ID';             -- chunk ID
COMMENT ON COLUMN rag_chunk_embedding.embedding_model     IS '向量模型';           -- embedding model
COMMENT ON COLUMN rag_chunk_embedding.embedding_dimension IS '向量维度';           -- vector dimension
COMMENT ON COLUMN rag_chunk_embedding.embedding           IS '向量内容';           -- vector content
COMMENT ON COLUMN rag_chunk_embedding.content_hash        IS '向量生成内容哈希';   -- hash of the content the vector was generated from
COMMENT ON COLUMN rag_chunk_embedding.enabled             IS '是否启用';           -- whether enabled
COMMENT ON COLUMN rag_chunk_embedding.version             IS '版本';               -- version
COMMENT ON COLUMN rag_chunk_embedding.create_time         IS '创建时间';           -- creation time
COMMENT ON COLUMN rag_chunk_embedding.update_time         IS '更新时间';           -- update time
COMMENT ON COLUMN rag_chunk_embedding.remark              IS '备注';               -- remark
COMMENT ON COLUMN rag_chunk_embedding.create_by           IS '创建者';             -- creator
COMMENT ON COLUMN rag_chunk_embedding.update_by           IS '更新者';             -- updater

View File

@@ -0,0 +1,15 @@
-- Seeds the six 'chunk_strategy' entries of catalog 'rag' in sys_enum
-- (remark text: "RAG document chunking method").
-- The ON CONFLICT target needs a unique index or constraint on
-- (catalog, type, name); sys_enum is defined elsewhere, so confirm it has one.
-- When a row with the same (catalog, type, name) already exists, its value,
-- strvalue, sort and remark are overwritten and update_time is stamped;
-- version is left as it was.
INSERT INTO sys_enum (
    catalog,
    type,
    name,
    value,
    strvalue,
    sort,
    version,
    remark
)
VALUES
    ('rag', 'chunk_strategy', '固定长度切片', 1, 'FIXED_LENGTH', 1, 1, 'RAG文档切片方式'),  -- fixed-length chunking
    ('rag', 'chunk_strategy', '按段落切片', 2, 'PARAGRAPH', 2, 1, 'RAG文档切片方式'),       -- chunk by paragraph
    ('rag', 'chunk_strategy', '按标题层级切片', 3, 'HEADING', 3, 1, 'RAG文档切片方式'),     -- chunk by heading hierarchy
    ('rag', 'chunk_strategy', '按表格行切片', 4, 'TABLE_ROW', 4, 1, 'RAG文档切片方式'),     -- chunk by table row
    ('rag', 'chunk_strategy', '按分隔符切片', 5, 'DELIMITER', 5, 1, 'RAG文档切片方式'),     -- chunk by delimiter
    ('rag', 'chunk_strategy', '语义切片', 6, 'SEMANTIC', 6, 1, 'RAG文档切片方式')           -- semantic chunking
ON CONFLICT (catalog, type, name)
DO UPDATE SET
    value       = EXCLUDED.value,
    strvalue    = EXCLUDED.strvalue,
    sort        = EXCLUDED.sort,
    remark      = EXCLUDED.remark,
    update_time = CURRENT_TIMESTAMP;