feat(rag): add document parsing structures
This commit is contained in:
53
script/sql/rag_chunk.sql
Normal file
53
script/sql/rag_chunk.sql
Normal file
@@ -0,0 +1,53 @@
|
||||
-- RAG knowledge chunk table: stores the chunks produced from a document.
-- Prerequisites: rag_store and rag_document must already exist, because the
-- foreign keys at the bottom of the table reference them.
-- WARNING: destructive. An existing rag_chunk table is dropped with its rows.
-- NOTE(review): rag_chunk_embedding declares a foreign key to rag_chunk (id).
-- While that table exists PostgreSQL rejects this DROP, so the dependent
-- table has to be dropped first when this script is re-run.
DROP TABLE IF EXISTS rag_chunk;

CREATE TABLE rag_chunk (
    -- Identity and ownership
    id             BIGSERIAL      PRIMARY KEY,
    store_id       BIGINT         NOT NULL,                 -- knowledge base ID
    document_id    BIGINT         NOT NULL,                 -- document ID
    chunk_index    INTEGER        NOT NULL,                 -- chunk sequence number within the document

    -- Chunk payload
    chunk_content  TEXT           NOT NULL,                 -- chunk content
    chunk_summary  VARCHAR(1000)  DEFAULT '',               -- chunk summary
    token_count    INTEGER,                                 -- token count (nullable)

    -- Position inside the source document
    page_number    INTEGER,                                 -- page number (nullable)
    section_title  VARCHAR(255)   DEFAULT '',               -- section title
    heading_path   VARCHAR(1000)  DEFAULT '',               -- heading path

    -- Vector linkage and free-form metadata
    vector_id      VARCHAR(128),                            -- vector ID (nullable)
    metadata_json  JSONB          NOT NULL DEFAULT '{}'::jsonb,  -- chunk-level extension metadata

    -- Row state
    enabled        BOOLEAN        NOT NULL DEFAULT TRUE,    -- whether the chunk is enabled
    version        INTEGER        NOT NULL DEFAULT 1,       -- version

    -- Audit columns (no database-side defaults; writers must supply values)
    create_time    TIMESTAMP,
    update_time    TIMESTAMP,
    remark         VARCHAR(500)   DEFAULT '',
    create_by      VARCHAR(64),
    update_by      VARCHAR(64),

    -- A chunk index may appear only once per document.
    CONSTRAINT uk_rag_chunk_document_index
        UNIQUE (document_id, chunk_index),

    -- No ON DELETE clause is given, so PostgreSQL's default (NO ACTION) applies.
    CONSTRAINT fk_rag_chunk_store_id
        FOREIGN KEY (store_id) REFERENCES rag_store (id),
    CONSTRAINT fk_rag_chunk_document_id
        FOREIGN KEY (document_id) REFERENCES rag_document (id)
);
|
||||
|
||||
-- Secondary indexes for rag_chunk.
--
-- There is deliberately no standalone index on document_id: the UNIQUE
-- constraint uk_rag_chunk_document_index (document_id, chunk_index) is backed
-- by a B-tree whose leading column is document_id, so lookups and foreign-key
-- checks on document_id alone already use that index. A second single-column
-- index would only add write and storage cost.

-- Supports filtering chunks by knowledge base (also the FK column to rag_store).
CREATE INDEX idx_rag_chunk_store_id ON rag_chunk (store_id);

-- Supports filtering on the enabled flag.
CREATE INDEX idx_rag_chunk_enabled ON rag_chunk (enabled);

-- Supports looking a chunk up by its vector ID.
CREATE INDEX idx_rag_chunk_vector_id ON rag_chunk (vector_id);

-- GIN index (default jsonb operator class) over the chunk-level metadata.
CREATE INDEX idx_rag_chunk_metadata_json ON rag_chunk USING GIN (metadata_json);
|
||||
|
||||
-- Catalog descriptions for rag_chunk.
-- The stored comment text is intentionally Chinese; the trailing "--" notes
-- are English glosses for readers of this script and are not stored.
COMMENT ON TABLE  rag_chunk                IS 'RAG知识切片表';      -- RAG knowledge chunk table

COMMENT ON COLUMN rag_chunk.id             IS 'ID';                 -- ID
COMMENT ON COLUMN rag_chunk.store_id       IS '知识库ID';           -- knowledge base ID
COMMENT ON COLUMN rag_chunk.document_id    IS '文档ID';             -- document ID
COMMENT ON COLUMN rag_chunk.chunk_index    IS '文档内切片序号';     -- chunk sequence number within the document
COMMENT ON COLUMN rag_chunk.chunk_content  IS '切片内容';           -- chunk content
COMMENT ON COLUMN rag_chunk.chunk_summary  IS '切片摘要';           -- chunk summary
COMMENT ON COLUMN rag_chunk.token_count    IS 'Token数量';          -- token count
COMMENT ON COLUMN rag_chunk.page_number    IS '页码';               -- page number
COMMENT ON COLUMN rag_chunk.section_title  IS '章节标题';           -- section title
COMMENT ON COLUMN rag_chunk.heading_path   IS '标题路径';           -- heading path
COMMENT ON COLUMN rag_chunk.vector_id      IS '向量ID';             -- vector ID
COMMENT ON COLUMN rag_chunk.metadata_json  IS '切片级扩展元数据';   -- chunk-level extension metadata
COMMENT ON COLUMN rag_chunk.enabled        IS '是否启用';           -- whether enabled
COMMENT ON COLUMN rag_chunk.version        IS '版本';               -- version
COMMENT ON COLUMN rag_chunk.create_time    IS '创建时间';           -- creation time
COMMENT ON COLUMN rag_chunk.update_time    IS '更新时间';           -- update time
COMMENT ON COLUMN rag_chunk.remark         IS '备注';               -- remark
COMMENT ON COLUMN rag_chunk.create_by      IS '创建者';             -- created by
COMMENT ON COLUMN rag_chunk.update_by      IS '更新者';             -- updated by
|
||||
50
script/sql/rag_chunk_embedding.sql
Normal file
50
script/sql/rag_chunk_embedding.sql
Normal file
@@ -0,0 +1,50 @@
|
||||
-- RAG chunk embedding table: stores one embedding vector per (chunk, model).
-- Prerequisites: rag_store, rag_document and rag_chunk must already exist,
-- because the foreign keys at the bottom of the table reference them.

-- pgvector supplies the VECTOR type, vector_dims() and the hnsw access method.
CREATE EXTENSION IF NOT EXISTS vector;

-- WARNING: destructive. An existing rag_chunk_embedding table is dropped with
-- all stored vectors.
DROP TABLE IF EXISTS rag_chunk_embedding;

CREATE TABLE rag_chunk_embedding (
    -- Identity and ownership
    id                   BIGSERIAL     PRIMARY KEY,
    store_id             BIGINT        NOT NULL,               -- knowledge base ID
    document_id          BIGINT        NOT NULL,               -- document ID
    chunk_id             BIGINT        NOT NULL,               -- chunk ID

    -- Embedding payload
    embedding_model      VARCHAR(100)  NOT NULL,               -- embedding model
    embedding_dimension  INTEGER       NOT NULL DEFAULT 1024,  -- vector dimension
    embedding            VECTOR(1024)  NOT NULL,               -- vector content, fixed at 1024 dimensions
    content_hash         VARCHAR(64),                          -- hash of the content the vector was generated from

    -- Row state
    enabled              BOOLEAN       NOT NULL DEFAULT TRUE,  -- whether the embedding is enabled
    version              INTEGER       NOT NULL DEFAULT 1,     -- version

    -- Audit columns (no database-side defaults; writers must supply values)
    create_time          TIMESTAMP,
    update_time          TIMESTAMP,
    remark               VARCHAR(500)  DEFAULT '',
    create_by            VARCHAR(64),
    update_by            VARCHAR(64),

    -- A chunk may have at most one embedding per model.
    CONSTRAINT uk_rag_chunk_embedding_chunk_model
        UNIQUE (chunk_id, embedding_model),

    -- embedding_dimension is written independently of the vector itself, so
    -- without this check it could claim a size the stored vector does not
    -- have. Comparing against vector_dims() instead of a literal keeps the
    -- check valid if the column's declared dimension is changed later.
    CONSTRAINT ck_rag_chunk_embedding_dimension
        CHECK (embedding_dimension = vector_dims(embedding)),

    -- No ON DELETE clause is given, so PostgreSQL's default (NO ACTION) applies.
    CONSTRAINT fk_rag_chunk_embedding_store_id
        FOREIGN KEY (store_id) REFERENCES rag_store (id),
    CONSTRAINT fk_rag_chunk_embedding_document_id
        FOREIGN KEY (document_id) REFERENCES rag_document (id),
    CONSTRAINT fk_rag_chunk_embedding_chunk_id
        FOREIGN KEY (chunk_id) REFERENCES rag_chunk (id)
);
|
||||
|
||||
-- Secondary indexes for rag_chunk_embedding.
--
-- There is deliberately no standalone index on chunk_id: the UNIQUE
-- constraint uk_rag_chunk_embedding_chunk_model (chunk_id, embedding_model)
-- is backed by a B-tree whose leading column is chunk_id, so lookups and
-- foreign-key checks on chunk_id alone already use that index. A second
-- single-column index would only add write and storage cost.

-- Support filtering by knowledge base and by document (both are FK columns).
CREATE INDEX idx_rag_chunk_embedding_store_id ON rag_chunk_embedding (store_id);
CREATE INDEX idx_rag_chunk_embedding_document_id ON rag_chunk_embedding (document_id);

-- Support filtering by embedding model and by the enabled flag.
CREATE INDEX idx_rag_chunk_embedding_model ON rag_chunk_embedding (embedding_model);
CREATE INDEX idx_rag_chunk_embedding_enabled ON rag_chunk_embedding (enabled);

-- Approximate nearest-neighbour index (pgvector HNSW). The vector_cosine_ops
-- operator class serves cosine-distance queries (the <=> operator).
CREATE INDEX idx_rag_chunk_embedding_vector_hnsw
    ON rag_chunk_embedding USING hnsw (embedding vector_cosine_ops);
|
||||
|
||||
-- Catalog descriptions for rag_chunk_embedding.
-- The stored comment text is intentionally Chinese; the trailing "--" notes
-- are English glosses for readers of this script and are not stored.
COMMENT ON TABLE  rag_chunk_embedding                      IS 'RAG切片向量表';      -- RAG chunk vector table

COMMENT ON COLUMN rag_chunk_embedding.id                   IS 'ID';                 -- ID
COMMENT ON COLUMN rag_chunk_embedding.store_id             IS '知识库ID';           -- knowledge base ID
COMMENT ON COLUMN rag_chunk_embedding.document_id          IS '文档ID';             -- document ID
COMMENT ON COLUMN rag_chunk_embedding.chunk_id             IS '切片ID';             -- chunk ID
COMMENT ON COLUMN rag_chunk_embedding.embedding_model      IS '向量模型';           -- embedding model
COMMENT ON COLUMN rag_chunk_embedding.embedding_dimension  IS '向量维度';           -- vector dimension
COMMENT ON COLUMN rag_chunk_embedding.embedding            IS '向量内容';           -- vector content
COMMENT ON COLUMN rag_chunk_embedding.content_hash         IS '向量生成内容哈希';   -- hash of the content the vector was generated from
COMMENT ON COLUMN rag_chunk_embedding.enabled              IS '是否启用';           -- whether enabled
COMMENT ON COLUMN rag_chunk_embedding.version              IS '版本';               -- version
COMMENT ON COLUMN rag_chunk_embedding.create_time          IS '创建时间';           -- creation time
COMMENT ON COLUMN rag_chunk_embedding.update_time          IS '更新时间';           -- update time
COMMENT ON COLUMN rag_chunk_embedding.remark               IS '备注';               -- remark
COMMENT ON COLUMN rag_chunk_embedding.create_by            IS '创建者';             -- created by
COMMENT ON COLUMN rag_chunk_embedding.update_by            IS '更新者';             -- updated by
|
||||
15
script/sql/rag_chunk_strategy_enum.sql
Normal file
15
script/sql/rag_chunk_strategy_enum.sql
Normal file
@@ -0,0 +1,15 @@
|
||||
-- Seed / refresh the RAG chunk-strategy entries in sys_enum.
-- Re-runnable: a row that already exists for the same (catalog, type, name)
-- is updated in place instead of being inserted again.
-- NOTE(review): the ON CONFLICT target needs a unique index or constraint on
-- sys_enum (catalog, type, name); that definition is not part of this script,
-- so confirm it exists.
INSERT INTO sys_enum (
    catalog,
    type,
    name,
    value,
    strvalue,
    sort,
    version,
    remark
)
VALUES
    -- name glosses: fixed-length / by paragraph / by heading level /
    -- by table row / by delimiter / semantic chunking
    ('rag', 'chunk_strategy', '固定长度切片',   1, 'FIXED_LENGTH', 1, 1, 'RAG文档切片方式'),
    ('rag', 'chunk_strategy', '按段落切片',     2, 'PARAGRAPH',    2, 1, 'RAG文档切片方式'),
    ('rag', 'chunk_strategy', '按标题层级切片', 3, 'HEADING',      3, 1, 'RAG文档切片方式'),
    ('rag', 'chunk_strategy', '按表格行切片',   4, 'TABLE_ROW',    4, 1, 'RAG文档切片方式'),
    ('rag', 'chunk_strategy', '按分隔符切片',   5, 'DELIMITER',    5, 1, 'RAG文档切片方式'),
    ('rag', 'chunk_strategy', '语义切片',       6, 'SEMANTIC',     6, 1, 'RAG文档切片方式')
ON CONFLICT (catalog, type, name)
DO UPDATE SET
    -- version is not in this list, so an existing row keeps its stored version.
    value       = EXCLUDED.value,
    strvalue    = EXCLUDED.strvalue,
    sort        = EXCLUDED.sort,
    remark      = EXCLUDED.remark,
    update_time = CURRENT_TIMESTAMP;
|
||||
Reference in New Issue
Block a user