From 1de773405f8b5a013ae635c3986b04d20d89e856 Mon Sep 17 00:00:00 2001 From: bruce Date: Thu, 21 May 2026 23:20:09 +0800 Subject: [PATCH] feat(rag): add document parsing structures --- pom.xml | 13 ++ script/sql/rag_chunk.sql | 53 ++++++ script/sql/rag_chunk_embedding.sql | 50 ++++++ script/sql/rag_chunk_strategy_enum.sql | 15 ++ .../common/config/MybatisPlusConfig.java | 17 ++ .../document/parse/DocumentParseContext.java | 21 +++ .../parse/DocumentParseException.java | 12 ++ .../document/parse/DocumentParseResult.java | 20 +++ .../common/document/parse/DocumentParser.java | 8 + .../document/parse/DocumentParserFactory.java | 37 ++++ .../impl/AbstractTikaDocumentParser.java | 64 +++++++ .../parse/impl/ExcelDocumentParser.java | 26 +++ .../parse/impl/PdfDocumentParser.java | 24 +++ .../parse/impl/TxtDocumentParser.java | 24 +++ .../parse/impl/WordDocumentParser.java | 26 +++ .../rag/controller/RagDocumentController.java | 15 ++ .../dto/request/RagDocumentParseRequest.java | 26 +++ .../response/RagDocumentParseResponse.java | 33 ++++ .../java/com/bruce/rag/entity/RagChunk.java | 67 ++++++++ .../bruce/rag/entity/RagChunkEmbedding.java | 50 ++++++ .../bruce/rag/enums/RagChunkStrategyEnum.java | 20 +++ .../rag/mapper/RagChunkEmbeddingMapper.java | 9 + .../com/bruce/rag/mapper/RagChunkMapper.java | 9 + .../service/IRagChunkEmbeddingService.java | 7 + .../bruce/rag/service/IRagChunkService.java | 7 + .../rag/service/IRagDocumentParseService.java | 13 ++ .../impl/RagChunkEmbeddingServiceImpl.java | 12 ++ .../rag/service/impl/RagChunkServiceImpl.java | 11 ++ .../impl/RagDocumentParseServiceImpl.java | 151 +++++++++++++++++ .../service/impl/RagDocumentServiceImpl.java | 26 ++- .../common/config/MybatisPlusConfigTests.java | 19 +++ .../parse/DocumentParserFactoryTests.java | 50 ++++++ .../parse/TxtDocumentParserTests.java | 47 +++++ .../enumconfig/EnumDefinitionTests.java | 7 + .../enumconfig/SysEnumDataInitTests.java | 8 + .../bruce/rag/RagComponentStructureTests.java | 52 
++++++ .../rag/RagDocumentParseServiceImplTests.java | 160 ++++++++++++++++++ .../rag/RagDocumentServiceImplTests.java | 40 ++++- 38 files changed, 1240 insertions(+), 9 deletions(-) create mode 100644 script/sql/rag_chunk.sql create mode 100644 script/sql/rag_chunk_embedding.sql create mode 100644 script/sql/rag_chunk_strategy_enum.sql create mode 100644 src/main/java/com/bruce/common/config/MybatisPlusConfig.java create mode 100644 src/main/java/com/bruce/common/document/parse/DocumentParseContext.java create mode 100644 src/main/java/com/bruce/common/document/parse/DocumentParseException.java create mode 100644 src/main/java/com/bruce/common/document/parse/DocumentParseResult.java create mode 100644 src/main/java/com/bruce/common/document/parse/DocumentParser.java create mode 100644 src/main/java/com/bruce/common/document/parse/DocumentParserFactory.java create mode 100644 src/main/java/com/bruce/common/document/parse/impl/AbstractTikaDocumentParser.java create mode 100644 src/main/java/com/bruce/common/document/parse/impl/ExcelDocumentParser.java create mode 100644 src/main/java/com/bruce/common/document/parse/impl/PdfDocumentParser.java create mode 100644 src/main/java/com/bruce/common/document/parse/impl/TxtDocumentParser.java create mode 100644 src/main/java/com/bruce/common/document/parse/impl/WordDocumentParser.java create mode 100644 src/main/java/com/bruce/rag/dto/request/RagDocumentParseRequest.java create mode 100644 src/main/java/com/bruce/rag/dto/response/RagDocumentParseResponse.java create mode 100644 src/main/java/com/bruce/rag/entity/RagChunk.java create mode 100644 src/main/java/com/bruce/rag/entity/RagChunkEmbedding.java create mode 100644 src/main/java/com/bruce/rag/enums/RagChunkStrategyEnum.java create mode 100644 src/main/java/com/bruce/rag/mapper/RagChunkEmbeddingMapper.java create mode 100644 src/main/java/com/bruce/rag/mapper/RagChunkMapper.java create mode 100644 src/main/java/com/bruce/rag/service/IRagChunkEmbeddingService.java create 
-- RAG chunk table: one row per slice of a parsed document.
--
-- WARNING: this script is destructive. It drops and recreates rag_chunk, so every
-- existing row is lost.
--
-- CASCADE makes the script re-runnable once rag_chunk_embedding exists. That table
-- declares fk_rag_chunk_embedding_chunk_id REFERENCES rag_chunk (id), and a plain
-- DROP TABLE is rejected while a dependent foreign key is present. For a foreign-key
-- dependency CASCADE removes only the constraint, not the referencing table, so
-- rag_chunk_embedding.sql must be re-run afterwards to restore the constraint and
-- clear rows that now point at chunks which no longer exist.
DROP TABLE IF EXISTS rag_chunk CASCADE;

CREATE TABLE rag_chunk (
    id BIGSERIAL PRIMARY KEY,
    store_id BIGINT NOT NULL,
    document_id BIGINT NOT NULL,
    chunk_index INTEGER NOT NULL,
    chunk_content TEXT NOT NULL,
    chunk_summary VARCHAR(1000) DEFAULT '',
    token_count INTEGER,
    page_number INTEGER,
    section_title VARCHAR(255) DEFAULT '',
    heading_path VARCHAR(1000) DEFAULT '',
    vector_id VARCHAR(128),
    metadata_json JSONB NOT NULL DEFAULT '{}'::jsonb,
    enabled BOOLEAN NOT NULL DEFAULT TRUE,
    version INTEGER NOT NULL DEFAULT 1,
    create_time TIMESTAMP,
    update_time TIMESTAMP,
    remark VARCHAR(500) DEFAULT '',
    create_by VARCHAR(64),
    update_by VARCHAR(64),
    -- A chunk position may appear only once per document.
    CONSTRAINT uk_rag_chunk_document_index UNIQUE (document_id, chunk_index),
    CONSTRAINT fk_rag_chunk_store_id FOREIGN KEY (store_id) REFERENCES rag_store (id),
    CONSTRAINT fk_rag_chunk_document_id FOREIGN KEY (document_id) REFERENCES rag_document (id)
);

CREATE INDEX idx_rag_chunk_store_id ON rag_chunk (store_id);
-- NOTE(review): the unique constraint above already indexes document_id as its leading
-- column, so this single-column index may be redundant; confirm before removing.
CREATE INDEX idx_rag_chunk_document_id ON rag_chunk (document_id);
CREATE INDEX idx_rag_chunk_enabled ON rag_chunk (enabled);
CREATE INDEX idx_rag_chunk_vector_id ON rag_chunk (vector_id);
-- GIN index so the JSONB metadata can be searched by key or containment.
CREATE INDEX idx_rag_chunk_metadata_json ON rag_chunk USING GIN (metadata_json);

COMMENT ON TABLE rag_chunk IS 'RAG知识切片表';
COMMENT ON COLUMN rag_chunk.id IS 'ID';
COMMENT ON COLUMN rag_chunk.store_id IS '知识库ID';
COMMENT ON COLUMN rag_chunk.document_id IS '文档ID';
COMMENT ON COLUMN rag_chunk.chunk_index IS '文档内切片序号';
COMMENT ON COLUMN rag_chunk.chunk_content IS '切片内容';
COMMENT ON COLUMN rag_chunk.chunk_summary IS '切片摘要';
COMMENT ON COLUMN rag_chunk.token_count IS 'Token数量';
COMMENT ON COLUMN rag_chunk.page_number IS '页码';
COMMENT ON COLUMN rag_chunk.section_title IS '章节标题';
COMMENT ON COLUMN rag_chunk.heading_path IS '标题路径';
COMMENT ON COLUMN rag_chunk.vector_id IS '向量ID';
COMMENT ON COLUMN rag_chunk.metadata_json IS '切片级扩展元数据';
COMMENT ON COLUMN rag_chunk.enabled IS '是否启用';
COMMENT ON COLUMN rag_chunk.version IS '版本';
COMMENT ON COLUMN rag_chunk.create_time IS '创建时间';
COMMENT ON COLUMN rag_chunk.update_time IS '更新时间';
COMMENT ON COLUMN rag_chunk.remark IS '备注';
COMMENT ON COLUMN rag_chunk.create_by IS '创建者';
COMMENT ON COLUMN rag_chunk.update_by IS '更新者';
NULL, + chunk_id BIGINT NOT NULL, + embedding_model VARCHAR(100) NOT NULL, + embedding_dimension INTEGER NOT NULL DEFAULT 1024, + embedding VECTOR(1024) NOT NULL, + content_hash VARCHAR(64), + enabled BOOLEAN NOT NULL DEFAULT TRUE, + version INTEGER NOT NULL DEFAULT 1, + create_time TIMESTAMP, + update_time TIMESTAMP, + remark VARCHAR(500) DEFAULT '', + create_by VARCHAR(64), + update_by VARCHAR(64), + CONSTRAINT uk_rag_chunk_embedding_chunk_model UNIQUE (chunk_id, embedding_model), + CONSTRAINT fk_rag_chunk_embedding_store_id FOREIGN KEY (store_id) REFERENCES rag_store (id), + CONSTRAINT fk_rag_chunk_embedding_document_id FOREIGN KEY (document_id) REFERENCES rag_document (id), + CONSTRAINT fk_rag_chunk_embedding_chunk_id FOREIGN KEY (chunk_id) REFERENCES rag_chunk (id) +); + +CREATE INDEX idx_rag_chunk_embedding_store_id ON rag_chunk_embedding (store_id); +CREATE INDEX idx_rag_chunk_embedding_document_id ON rag_chunk_embedding (document_id); +CREATE INDEX idx_rag_chunk_embedding_chunk_id ON rag_chunk_embedding (chunk_id); +CREATE INDEX idx_rag_chunk_embedding_model ON rag_chunk_embedding (embedding_model); +CREATE INDEX idx_rag_chunk_embedding_enabled ON rag_chunk_embedding (enabled); +CREATE INDEX idx_rag_chunk_embedding_vector_hnsw + ON rag_chunk_embedding USING hnsw (embedding vector_cosine_ops); + +COMMENT ON TABLE rag_chunk_embedding IS 'RAG切片向量表'; +COMMENT ON COLUMN rag_chunk_embedding.id IS 'ID'; +COMMENT ON COLUMN rag_chunk_embedding.store_id IS '知识库ID'; +COMMENT ON COLUMN rag_chunk_embedding.document_id IS '文档ID'; +COMMENT ON COLUMN rag_chunk_embedding.chunk_id IS '切片ID'; +COMMENT ON COLUMN rag_chunk_embedding.embedding_model IS '向量模型'; +COMMENT ON COLUMN rag_chunk_embedding.embedding_dimension IS '向量维度'; +COMMENT ON COLUMN rag_chunk_embedding.embedding IS '向量内容'; +COMMENT ON COLUMN rag_chunk_embedding.content_hash IS '向量生成内容哈希'; +COMMENT ON COLUMN rag_chunk_embedding.enabled IS '是否启用'; +COMMENT ON COLUMN rag_chunk_embedding.version IS '版本'; +COMMENT ON 
-- Seeds the six chunking strategies under catalog 'rag', type 'chunk_strategy' in sys_enum.
-- The statement is an upsert: when a row with the same (catalog, type, name) already exists,
-- its value, strvalue, sort and remark are overwritten and update_time is refreshed.
-- NOTE(review): ON CONFLICT (catalog, type, name) requires a unique index or constraint on
-- exactly these sys_enum columns; its definition is not part of this script, so confirm it exists.
-- NOTE(review): version is inserted as 1 and is not modified by DO UPDATE; confirm that is intended.
INSERT INTO sys_enum (catalog, type, name, value, strvalue, sort, version, remark)
VALUES
    ('rag', 'chunk_strategy', '固定长度切片', 1, 'FIXED_LENGTH', 1, 1, 'RAG文档切片方式'),
    ('rag', 'chunk_strategy', '按段落切片', 2, 'PARAGRAPH', 2, 1, 'RAG文档切片方式'),
    ('rag', 'chunk_strategy', '按标题层级切片', 3, 'HEADING', 3, 1, 'RAG文档切片方式'),
    ('rag', 'chunk_strategy', '按表格行切片', 4, 'TABLE_ROW', 4, 1, 'RAG文档切片方式'),
    ('rag', 'chunk_strategy', '按分隔符切片', 5, 'DELIMITER', 5, 1, 'RAG文档切片方式'),
    ('rag', 'chunk_strategy', '语义切片', 6, 'SEMANTIC', 6, 1, 'RAG文档切片方式')
ON CONFLICT (catalog, type, name)
DO UPDATE SET
    value = EXCLUDED.value,
    strvalue = EXCLUDED.strvalue,
    sort = EXCLUDED.sort,
    remark = EXCLUDED.remark,
    update_time = CURRENT_TIMESTAMP;
/**
 * Unchecked exception raised when a document cannot be parsed.
 *
 * <p>Within this package it is thrown when no parser supports a document, when the
 * parse input is missing, and when the underlying extraction fails. In the last case
 * the original failure is kept as the {@linkplain #getCause() cause}.
 */
public class DocumentParseException extends RuntimeException {

    /**
     * Explicit serial version. {@link RuntimeException} is {@link java.io.Serializable},
     * so without this field the stream identifier would be derived from the class layout
     * and could change on recompilation.
     */
    private static final long serialVersionUID = 1L;

    /**
     * Creates an exception that carries only a description of the failure.
     *
     * @param message human-readable description of what went wrong
     */
    public DocumentParseException(String message) {
        super(message);
    }

    /**
     * Creates an exception that wraps a lower-level failure.
     *
     * @param message human-readable description of what went wrong
     * @param cause   the failure that triggered this exception; preserved for stack traces
     */
    public DocumentParseException(String message, Throwable cause) {
        super(message, cause);
    }
}
LinkedHashMap<>(); +} diff --git a/src/main/java/com/bruce/common/document/parse/DocumentParser.java b/src/main/java/com/bruce/common/document/parse/DocumentParser.java new file mode 100644 index 0000000..8de2ef7 --- /dev/null +++ b/src/main/java/com/bruce/common/document/parse/DocumentParser.java @@ -0,0 +1,8 @@ +package com.bruce.common.document.parse; + +public interface DocumentParser { + + boolean supports(DocumentParseContext context); + + DocumentParseResult parse(DocumentParseContext context); +} diff --git a/src/main/java/com/bruce/common/document/parse/DocumentParserFactory.java b/src/main/java/com/bruce/common/document/parse/DocumentParserFactory.java new file mode 100644 index 0000000..f1fb6e9 --- /dev/null +++ b/src/main/java/com/bruce/common/document/parse/DocumentParserFactory.java @@ -0,0 +1,37 @@ +package com.bruce.common.document.parse; + +import org.springframework.stereotype.Component; +import org.springframework.util.StringUtils; + +import java.util.List; +import java.util.Locale; + +@Component +public class DocumentParserFactory { + + private final List parsers; + + public DocumentParserFactory(List parsers) { + this.parsers = parsers; + } + + public DocumentParser resolve(DocumentParseContext context) { + return parsers.stream() + .filter(parser -> parser.supports(context)) + .findFirst() + .orElseThrow(() -> new DocumentParseException("不支持的文档类型: " + resolveType(context))); + } + + private String resolveType(DocumentParseContext context) { + if (context == null) { + return "unknown"; + } + if (StringUtils.hasText(context.getSuffix())) { + return context.getSuffix().trim().toLowerCase(Locale.ROOT); + } + if (StringUtils.hasText(context.getContentType())) { + return context.getContentType().trim(); + } + return "unknown"; + } +} diff --git a/src/main/java/com/bruce/common/document/parse/impl/AbstractTikaDocumentParser.java b/src/main/java/com/bruce/common/document/parse/impl/AbstractTikaDocumentParser.java new file mode 100644 index 
0000000..65e3e3b --- /dev/null +++ b/src/main/java/com/bruce/common/document/parse/impl/AbstractTikaDocumentParser.java @@ -0,0 +1,64 @@ +package com.bruce.common.document.parse.impl; + +import com.bruce.common.document.parse.DocumentParseContext; +import com.bruce.common.document.parse.DocumentParseException; +import com.bruce.common.document.parse.DocumentParseResult; +import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.springframework.util.StringUtils; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.util.Locale; +import java.util.Set; + +abstract class AbstractTikaDocumentParser { + + private static final int MAX_TEXT_LENGTH = -1; + + private final Tika tika = new Tika(); + + boolean supportsSuffix(DocumentParseContext context, Set suffixes) { + return context != null + && StringUtils.hasText(context.getSuffix()) + && suffixes.contains(context.getSuffix().trim().toLowerCase(Locale.ROOT)); + } + + boolean supportsContentType(DocumentParseContext context, String prefix) { + return context != null + && StringUtils.hasText(context.getContentType()) + && context.getContentType().trim().toLowerCase(Locale.ROOT).startsWith(prefix); + } + + DocumentParseResult parseWithTika(DocumentParseContext context) { + if (context == null || context.getFilePath() == null) { + throw new DocumentParseException("解析文件不能为空"); + } + try { + Metadata metadata = new Metadata(); + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, context.getOriginalName()); + if (StringUtils.hasText(context.getContentType())) { + metadata.set(Metadata.CONTENT_TYPE, context.getContentType()); + } + String text; + try (InputStream inputStream = Files.newInputStream(context.getFilePath())) { + text = tika.parseToString(inputStream, metadata, MAX_TEXT_LENGTH); + } + DocumentParseResult result = new DocumentParseResult(); + 
result.setText(text == null ? "" : text.trim()); + result.setTextLength(result.getText().length()); + result.getMetadata().put("contentType", firstNonBlank(metadata.get(Metadata.CONTENT_TYPE), context.getContentType())); + result.getMetadata().put("resourceName", firstNonBlank(metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY), context.getOriginalName())); + return result; + } catch (IOException | TikaException e) { + throw new DocumentParseException("文档解析失败: " + e.getMessage(), e); + } + } + + private String firstNonBlank(String first, String fallback) { + return StringUtils.hasText(first) ? first : fallback; + } +} diff --git a/src/main/java/com/bruce/common/document/parse/impl/ExcelDocumentParser.java b/src/main/java/com/bruce/common/document/parse/impl/ExcelDocumentParser.java new file mode 100644 index 0000000..abe7a9b --- /dev/null +++ b/src/main/java/com/bruce/common/document/parse/impl/ExcelDocumentParser.java @@ -0,0 +1,26 @@ +package com.bruce.common.document.parse.impl; + +import com.bruce.common.document.parse.DocumentParseContext; +import com.bruce.common.document.parse.DocumentParser; +import com.bruce.common.document.parse.DocumentParseResult; +import org.springframework.stereotype.Component; + +import java.util.Set; + +@Component +public class ExcelDocumentParser extends AbstractTikaDocumentParser implements DocumentParser { + + private static final Set SUFFIXES = Set.of("xls", "xlsx"); + + @Override + public boolean supports(DocumentParseContext context) { + return supportsSuffix(context, SUFFIXES) + || supportsContentType(context, "application/vnd.ms-excel") + || supportsContentType(context, "application/vnd.openxmlformats-officedocument.spreadsheetml"); + } + + @Override + public DocumentParseResult parse(DocumentParseContext context) { + return parseWithTika(context); + } +} diff --git a/src/main/java/com/bruce/common/document/parse/impl/PdfDocumentParser.java b/src/main/java/com/bruce/common/document/parse/impl/PdfDocumentParser.java new file 
mode 100644 index 0000000..37cf8f0 --- /dev/null +++ b/src/main/java/com/bruce/common/document/parse/impl/PdfDocumentParser.java @@ -0,0 +1,24 @@ +package com.bruce.common.document.parse.impl; + +import com.bruce.common.document.parse.DocumentParseContext; +import com.bruce.common.document.parse.DocumentParser; +import com.bruce.common.document.parse.DocumentParseResult; +import org.springframework.stereotype.Component; + +import java.util.Set; + +@Component +public class PdfDocumentParser extends AbstractTikaDocumentParser implements DocumentParser { + + private static final Set SUFFIXES = Set.of("pdf"); + + @Override + public boolean supports(DocumentParseContext context) { + return supportsSuffix(context, SUFFIXES) || supportsContentType(context, "application/pdf"); + } + + @Override + public DocumentParseResult parse(DocumentParseContext context) { + return parseWithTika(context); + } +} diff --git a/src/main/java/com/bruce/common/document/parse/impl/TxtDocumentParser.java b/src/main/java/com/bruce/common/document/parse/impl/TxtDocumentParser.java new file mode 100644 index 0000000..98646eb --- /dev/null +++ b/src/main/java/com/bruce/common/document/parse/impl/TxtDocumentParser.java @@ -0,0 +1,24 @@ +package com.bruce.common.document.parse.impl; + +import com.bruce.common.document.parse.DocumentParseContext; +import com.bruce.common.document.parse.DocumentParser; +import com.bruce.common.document.parse.DocumentParseResult; +import org.springframework.stereotype.Component; + +import java.util.Set; + +@Component +public class TxtDocumentParser extends AbstractTikaDocumentParser implements DocumentParser { + + private static final Set SUFFIXES = Set.of("txt", "md", "log"); + + @Override + public boolean supports(DocumentParseContext context) { + return supportsSuffix(context, SUFFIXES) || supportsContentType(context, "text/"); + } + + @Override + public DocumentParseResult parse(DocumentParseContext context) { + return parseWithTika(context); + } +} diff --git 
a/src/main/java/com/bruce/common/document/parse/impl/WordDocumentParser.java b/src/main/java/com/bruce/common/document/parse/impl/WordDocumentParser.java new file mode 100644 index 0000000..c34e752 --- /dev/null +++ b/src/main/java/com/bruce/common/document/parse/impl/WordDocumentParser.java @@ -0,0 +1,26 @@ +package com.bruce.common.document.parse.impl; + +import com.bruce.common.document.parse.DocumentParseContext; +import com.bruce.common.document.parse.DocumentParser; +import com.bruce.common.document.parse.DocumentParseResult; +import org.springframework.stereotype.Component; + +import java.util.Set; + +@Component +public class WordDocumentParser extends AbstractTikaDocumentParser implements DocumentParser { + + private static final Set SUFFIXES = Set.of("doc", "docx"); + + @Override + public boolean supports(DocumentParseContext context) { + return supportsSuffix(context, SUFFIXES) + || supportsContentType(context, "application/msword") + || supportsContentType(context, "application/vnd.openxmlformats-officedocument.wordprocessingml"); + } + + @Override + public DocumentParseResult parse(DocumentParseContext context) { + return parseWithTika(context); + } +} diff --git a/src/main/java/com/bruce/rag/controller/RagDocumentController.java b/src/main/java/com/bruce/rag/controller/RagDocumentController.java index 777797c..e5cdbb8 100644 --- a/src/main/java/com/bruce/rag/controller/RagDocumentController.java +++ b/src/main/java/com/bruce/rag/controller/RagDocumentController.java @@ -2,9 +2,12 @@ package com.bruce.rag.controller; import com.bruce.common.domain.model.RequestResult; import com.bruce.rag.dto.request.RagDocumentBatchUploadRequest; +import com.bruce.rag.dto.request.RagDocumentParseRequest; import com.bruce.rag.dto.request.RagDocumentQueryRequest; import com.bruce.rag.dto.request.RagDocumentSaveRequest; +import com.bruce.rag.dto.response.RagDocumentParseResponse; import com.bruce.rag.dto.response.RagDocumentResponse; +import 
com.bruce.rag.service.IRagDocumentParseService; import com.bruce.rag.service.IRagDocumentService; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; @@ -29,6 +32,9 @@ public class RagDocumentController { @Autowired private IRagDocumentService ragDocumentService; + @Autowired + private IRagDocumentParseService ragDocumentParseService; + @Operation(summary = "查询全部知识库文档") @PostMapping("/list") public RequestResult> list() { @@ -85,4 +91,13 @@ public class RagDocumentController { request.getStoreId(), responses.size()); return RequestResult.success(responses); } + + @Operation(summary = "解析知识库文档") + @PostMapping("/parse") + public RequestResult> parse(@RequestBody RagDocumentParseRequest request) { + log.info("RagDocumentController.parse start, request={}", request); + List responses = ragDocumentParseService.parse(request); + log.info("RagDocumentController.parse success, count={}", responses.size()); + return RequestResult.success(responses); + } } diff --git a/src/main/java/com/bruce/rag/dto/request/RagDocumentParseRequest.java b/src/main/java/com/bruce/rag/dto/request/RagDocumentParseRequest.java new file mode 100644 index 0000000..8fc93c0 --- /dev/null +++ b/src/main/java/com/bruce/rag/dto/request/RagDocumentParseRequest.java @@ -0,0 +1,26 @@ +package com.bruce.rag.dto.request; + +import io.swagger.v3.oas.annotations.media.Schema; +import lombok.Data; + +import java.util.List; + +@Data +@Schema(description = "RAG知识库文档解析请求") +public class RagDocumentParseRequest { + + @Schema(description = "文档ID列表") + private List documentIds; + + @Schema(description = "切片方式") + private String chunkStrategy; + + @Schema(description = "切片长度") + private Integer chunkSize; + + @Schema(description = "重叠长度") + private Integer chunkOverlap; + + @Schema(description = "分隔符") + private String delimiter; +} diff --git a/src/main/java/com/bruce/rag/dto/response/RagDocumentParseResponse.java 
/**
 * Response DTO describing the outcome of parsing one knowledge-base document.
 * Accessors, equals/hashCode and toString are generated by Lombok's {@code @Data}.
 */
@Data
@Schema(description = "RAG知识库文档解析响应")
public class RagDocumentParseResponse {

    /**
     * Id of the parsed document. Written to JSON as a string via {@code ToStringSerializer},
     * presumably so that large ids survive clients with limited integer precision.
     */
    @Schema(description = "文档ID")
    @JsonSerialize(using = ToStringSerializer.class)
    private Long documentId;

    /** Parse status of the document. */
    @Schema(description = "解析状态")
    private String parseStatus;

    /** Length of the extracted text. */
    @Schema(description = "文本长度")
    private Integer textLength;

    /** Number of pages, when known. */
    @Schema(description = "页数")
    private Integer pageCount;

    /** Number of worksheets, when known. */
    @Schema(description = "工作表数量")
    private Integer sheetCount;

    /**
     * Parse metadata; defaults to an empty, insertion-ordered map so it is never null
     * on a freshly created response.
     * NOTE(review): the map is declared as a raw type here; its type arguments look
     * to have been lost and should be confirmed against the original source.
     */
    @Schema(description = "解析元数据")
    private Map metadata = new LinkedHashMap<>();
}
/**
 * MyBatis-Plus entity mapped to the {@code rag_chunk_embedding} table: the embedding
 * vector stored for one chunk under one embedding model. Extends {@code BaseEntity};
 * accessors are generated by Lombok's {@code @Data}.
 */
@Data
@NoArgsConstructor
@EqualsAndHashCode(callSuper = true)
@TableName("rag_chunk_embedding")
@Schema(description = "RAG切片向量")
public class RagChunkEmbedding extends BaseEntity {

    /** Id of the knowledge base (column {@code store_id}). */
    @Schema(description = "知识库ID")
    @TableField("store_id")
    private Long storeId;

    /** Id of the source document (column {@code document_id}). */
    @Schema(description = "文档ID")
    @TableField("document_id")
    private Long documentId;

    /** Id of the chunk this embedding belongs to (column {@code chunk_id}). */
    @Schema(description = "切片ID")
    @TableField("chunk_id")
    private Long chunkId;

    /** Name of the embedding model (column {@code embedding_model}). */
    @Schema(description = "向量模型")
    @TableField("embedding_model")
    private String embeddingModel;

    /** Dimension of the embedding vector (column {@code embedding_dimension}). */
    @Schema(description = "向量维度")
    @TableField("embedding_dimension")
    private Integer embeddingDimension;

    /**
     * The embedding vector, held as a string.
     * NOTE(review): the accompanying DDL declares this column as VECTOR(1024). Binding a
     * plain Java String to a pgvector column usually needs a type handler or an explicit
     * cast; confirm how this field is written and read.
     */
    @Schema(description = "向量内容")
    private String embedding;

    /** Hash of the content the embedding was generated from (column {@code content_hash}). */
    @Schema(description = "向量生成内容哈希")
    @TableField("content_hash")
    private String contentHash;

    /** Whether this embedding is enabled. */
    @Schema(description = "是否启用")
    private Boolean enabled;

    /** Free-form remark. */
    @Schema(description = "备注")
    private String remark;
}
+1,9 @@ +package com.bruce.rag.mapper; + +import com.baomidou.mybatisplus.core.mapper.BaseMapper; +import com.bruce.rag.entity.RagChunk; +import org.apache.ibatis.annotations.Mapper; + +@Mapper +public interface RagChunkMapper extends BaseMapper { +} diff --git a/src/main/java/com/bruce/rag/service/IRagChunkEmbeddingService.java b/src/main/java/com/bruce/rag/service/IRagChunkEmbeddingService.java new file mode 100644 index 0000000..44c713f --- /dev/null +++ b/src/main/java/com/bruce/rag/service/IRagChunkEmbeddingService.java @@ -0,0 +1,7 @@ +package com.bruce.rag.service; + +import com.baomidou.mybatisplus.extension.service.IService; +import com.bruce.rag.entity.RagChunkEmbedding; + +public interface IRagChunkEmbeddingService extends IService { +} diff --git a/src/main/java/com/bruce/rag/service/IRagChunkService.java b/src/main/java/com/bruce/rag/service/IRagChunkService.java new file mode 100644 index 0000000..581ea6c --- /dev/null +++ b/src/main/java/com/bruce/rag/service/IRagChunkService.java @@ -0,0 +1,7 @@ +package com.bruce.rag.service; + +import com.baomidou.mybatisplus.extension.service.IService; +import com.bruce.rag.entity.RagChunk; + +public interface IRagChunkService extends IService { +} diff --git a/src/main/java/com/bruce/rag/service/IRagDocumentParseService.java b/src/main/java/com/bruce/rag/service/IRagDocumentParseService.java new file mode 100644 index 0000000..387c161 --- /dev/null +++ b/src/main/java/com/bruce/rag/service/IRagDocumentParseService.java @@ -0,0 +1,13 @@ +package com.bruce.rag.service; + +import com.bruce.rag.dto.response.RagDocumentParseResponse; +import com.bruce.rag.dto.request.RagDocumentParseRequest; + +import java.util.List; + +public interface IRagDocumentParseService { + + RagDocumentParseResponse parse(Long documentId); + + List parse(RagDocumentParseRequest request); +} diff --git a/src/main/java/com/bruce/rag/service/impl/RagChunkEmbeddingServiceImpl.java 
b/src/main/java/com/bruce/rag/service/impl/RagChunkEmbeddingServiceImpl.java new file mode 100644 index 0000000..e71ff6d --- /dev/null +++ b/src/main/java/com/bruce/rag/service/impl/RagChunkEmbeddingServiceImpl.java @@ -0,0 +1,12 @@ +package com.bruce.rag.service.impl; + +import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl; +import com.bruce.rag.entity.RagChunkEmbedding; +import com.bruce.rag.mapper.RagChunkEmbeddingMapper; +import com.bruce.rag.service.IRagChunkEmbeddingService; +import org.springframework.stereotype.Service; + +@Service +public class RagChunkEmbeddingServiceImpl extends ServiceImpl + implements IRagChunkEmbeddingService { +} diff --git a/src/main/java/com/bruce/rag/service/impl/RagChunkServiceImpl.java b/src/main/java/com/bruce/rag/service/impl/RagChunkServiceImpl.java new file mode 100644 index 0000000..8d9ba57 --- /dev/null +++ b/src/main/java/com/bruce/rag/service/impl/RagChunkServiceImpl.java @@ -0,0 +1,11 @@ +package com.bruce.rag.service.impl; + +import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl; +import com.bruce.rag.entity.RagChunk; +import com.bruce.rag.mapper.RagChunkMapper; +import com.bruce.rag.service.IRagChunkService; +import org.springframework.stereotype.Service; + +@Service +public class RagChunkServiceImpl extends ServiceImpl implements IRagChunkService { +} diff --git a/src/main/java/com/bruce/rag/service/impl/RagDocumentParseServiceImpl.java b/src/main/java/com/bruce/rag/service/impl/RagDocumentParseServiceImpl.java new file mode 100644 index 0000000..df60746 --- /dev/null +++ b/src/main/java/com/bruce/rag/service/impl/RagDocumentParseServiceImpl.java @@ -0,0 +1,151 @@ +package com.bruce.rag.service.impl; + +import com.bruce.common.config.AttachmentProperties; +import com.bruce.common.document.parse.DocumentParseContext; +import com.bruce.common.document.parse.DocumentParseException; +import com.bruce.common.document.parse.DocumentParseResult; +import 
com.bruce.common.document.parse.DocumentParser; +import com.bruce.common.document.parse.DocumentParserFactory; +import com.bruce.common.domain.entity.SysAttachment; +import com.bruce.common.service.ISysAttachmentService; +import com.bruce.rag.dto.request.RagDocumentParseRequest; +import com.bruce.rag.dto.response.RagDocumentParseResponse; +import com.bruce.rag.entity.RagDocument; +import com.bruce.rag.enums.RagChunkStrategyEnum; +import com.bruce.rag.enums.RagParseStatusEnum; +import com.bruce.rag.service.IRagDocumentParseService; +import com.bruce.rag.service.IRagDocumentService; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.stereotype.Service; +import org.springframework.util.StringUtils; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +@Slf4j +@Service +@RequiredArgsConstructor +public class RagDocumentParseServiceImpl implements IRagDocumentParseService { + + private final IRagDocumentService ragDocumentService; + + private final ISysAttachmentService sysAttachmentService; + + private final AttachmentProperties attachmentProperties; + + private final DocumentParserFactory documentParserFactory; + + @Override + public List parse(RagDocumentParseRequest request) { + log.info("RagDocumentParseServiceImpl.parse batch start, request={}", request); + validateParseRequest(request); + List responses = request.getDocumentIds().stream() + .map(this::parse) + .toList(); + log.info("RagDocumentParseServiceImpl.parse batch success, count={}", responses.size()); + return responses; + } + + @Override + public RagDocumentParseResponse parse(Long documentId) { + log.info("RagDocumentParseServiceImpl.parse start, documentId={}", documentId); + if (documentId == null) { + throw new IllegalArgumentException("文档ID不能为空"); + } + + RagDocument document = ragDocumentService.getById(documentId); + if (document 
== null) { + throw new IllegalArgumentException("文档不存在,ID: " + documentId); + } + if (document.getAttachmentId() == null) { + throw new IllegalArgumentException("文档附件ID不能为空"); + } + + SysAttachment attachment = sysAttachmentService.getById(document.getAttachmentId()); + if (attachment == null) { + throw new IllegalArgumentException("附件不存在,ID: " + document.getAttachmentId()); + } + + updateParseStatus(documentId, RagParseStatusEnum.PARSING, null); + try { + DocumentParseContext context = buildParseContext(document, attachment); + DocumentParser parser = documentParserFactory.resolve(context); + DocumentParseResult result = parser.parse(context); + updateParseStatus(documentId, RagParseStatusEnum.PARSED, null); + RagDocumentParseResponse response = toResponse(documentId, result); + log.info("RagDocumentParseServiceImpl.parse success, documentId={}, textLength={}", + documentId, response.getTextLength()); + return response; + } catch (RuntimeException e) { + updateParseStatus(documentId, RagParseStatusEnum.FAILED, e.getMessage()); + log.warn("RagDocumentParseServiceImpl.parse failed, documentId={}, message={}", documentId, e.getMessage()); + throw e; + } + } + + private void validateParseRequest(RagDocumentParseRequest request) { + if (request == null) { + throw new IllegalArgumentException("解析请求不能为空"); + } + if (request.getDocumentIds() == null || request.getDocumentIds().isEmpty()) { + throw new IllegalArgumentException("文档ID列表不能为空"); + } + Set strategies = Arrays.stream(RagChunkStrategyEnum.values()) + .map(Enum::name) + .collect(Collectors.toSet()); + if (request.getChunkStrategy() == null || !strategies.contains(request.getChunkStrategy())) { + throw new IllegalArgumentException("不支持的切片方式: " + request.getChunkStrategy()); + } + } + + private DocumentParseContext buildParseContext(RagDocument document, SysAttachment attachment) { + Path filePath = resolveFilePath(attachment); + if (!Files.isRegularFile(filePath)) { + throw new DocumentParseException("解析文件不存在: " + 
filePath); + } + + DocumentParseContext context = new DocumentParseContext(); + context.setDocumentId(document.getId()); + context.setAttachmentId(attachment.getId()); + context.setOriginalName(attachment.getOriginalName()); + context.setSuffix(attachment.getFileSuffix()); + context.setContentType(attachment.getContentType()); + context.setFilePath(filePath); + return context; + } + + private Path resolveFilePath(SysAttachment attachment) { + if (!StringUtils.hasText(attachment.getFilePath())) { + throw new DocumentParseException("附件文件路径不能为空"); + } + Path filePath = Path.of(attachment.getFilePath()); + if (filePath.isAbsolute()) { + return filePath.normalize(); + } + return Path.of(attachmentProperties.getBasePath()).resolve(filePath).normalize(); + } + + private void updateParseStatus(Long documentId, RagParseStatusEnum status, String errorMessage) { + RagDocument update = new RagDocument(); + update.setId(documentId); + update.setParseStatus(status.name()); + update.setErrorMessage(StringUtils.hasText(errorMessage) ? 
errorMessage : null); + ragDocumentService.updateById(update); + } + + private RagDocumentParseResponse toResponse(Long documentId, DocumentParseResult result) { + RagDocumentParseResponse response = new RagDocumentParseResponse(); + response.setDocumentId(documentId); + response.setParseStatus(RagParseStatusEnum.PARSED.name()); + response.setTextLength(result.getTextLength()); + response.setPageCount(result.getPageCount()); + response.setSheetCount(result.getSheetCount()); + response.setMetadata(result.getMetadata()); + return response; + } +} diff --git a/src/main/java/com/bruce/rag/service/impl/RagDocumentServiceImpl.java b/src/main/java/com/bruce/rag/service/impl/RagDocumentServiceImpl.java index 378d1a5..9c36673 100644 --- a/src/main/java/com/bruce/rag/service/impl/RagDocumentServiceImpl.java +++ b/src/main/java/com/bruce/rag/service/impl/RagDocumentServiceImpl.java @@ -82,10 +82,18 @@ public class RagDocumentServiceImpl extends ServiceImpl factory.resolve(context("zip")) + ); + + assertEquals("不支持的文档类型: zip", exception.getMessage()); + } + + private DocumentParseContext context(String suffix) { + DocumentParseContext context = new DocumentParseContext(); + context.setSuffix(suffix); + context.setFilePath(Path.of("sample." 
+ suffix)); + return context; + } +} diff --git a/src/test/java/com/bruce/common/document/parse/TxtDocumentParserTests.java b/src/test/java/com/bruce/common/document/parse/TxtDocumentParserTests.java new file mode 100644 index 0000000..b9c1d67 --- /dev/null +++ b/src/test/java/com/bruce/common/document/parse/TxtDocumentParserTests.java @@ -0,0 +1,47 @@ +package com.bruce.common.document.parse; + +import com.bruce.common.document.parse.impl.TxtDocumentParser; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class TxtDocumentParserTests { + + @TempDir + private Path tempDir; + + @Test + void parseShouldReadPlainTextContent() throws Exception { + Path file = tempDir.resolve("people.txt"); + Files.writeString(file, "张三 是 产品经理\n李四 是 后端工程师", StandardCharsets.UTF_8); + DocumentParseContext context = new DocumentParseContext(); + context.setOriginalName("people.txt"); + context.setSuffix("txt"); + context.setContentType("text/plain"); + context.setFilePath(file); + + DocumentParseResult result = new TxtDocumentParser().parse(context); + + assertEquals("张三 是 产品经理\n李四 是 后端工程师", result.getText()); + assertEquals(result.getText().length(), result.getTextLength()); + assertTrue(result.getMetadata().get("contentType").toString().startsWith("text/plain")); + } + + @Test + void supportsShouldAcceptTextSuffixAndContentType() { + TxtDocumentParser parser = new TxtDocumentParser(); + DocumentParseContext suffixContext = new DocumentParseContext(); + suffixContext.setSuffix("TXT"); + DocumentParseContext contentTypeContext = new DocumentParseContext(); + contentTypeContext.setContentType("text/plain"); + + assertTrue(parser.supports(suffixContext)); + assertTrue(parser.supports(contentTypeContext)); + } +} diff --git 
a/src/test/java/com/bruce/common/enumconfig/EnumDefinitionTests.java b/src/test/java/com/bruce/common/enumconfig/EnumDefinitionTests.java index b146fe6..6c119c3 100644 --- a/src/test/java/com/bruce/common/enumconfig/EnumDefinitionTests.java +++ b/src/test/java/com/bruce/common/enumconfig/EnumDefinitionTests.java @@ -3,6 +3,7 @@ package com.bruce.common.enumconfig; import com.bruce.common.enums.CommonStatusEnum; import com.bruce.common.enums.EnableStatusEnum; import com.bruce.rag.enums.RagIndexStatusEnum; +import com.bruce.rag.enums.RagChunkStrategyEnum; import com.bruce.rag.enums.RagParseStatusEnum; import org.junit.jupiter.api.Test; @@ -24,6 +25,9 @@ class EnumDefinitionTests { assertEquals(4, RagParseStatusEnum.FAILED.getValue()); assertEquals(1, RagIndexStatusEnum.PENDING.getValue()); assertEquals(3, RagIndexStatusEnum.INDEXED.getValue()); + assertEquals(1, RagChunkStrategyEnum.FIXED_LENGTH.getValue()); + assertEquals(5, RagChunkStrategyEnum.DELIMITER.getValue()); + assertEquals(6, RagChunkStrategyEnum.SEMANTIC.getValue()); } @Test @@ -38,5 +42,8 @@ class EnumDefinitionTests { assertEquals("解析失败", RagParseStatusEnum.FAILED.getLabel()); assertEquals("待索引", RagIndexStatusEnum.PENDING.getLabel()); assertEquals("已索引", RagIndexStatusEnum.INDEXED.getLabel()); + assertEquals("固定长度切片", RagChunkStrategyEnum.FIXED_LENGTH.getLabel()); + assertEquals("按分隔符切片", RagChunkStrategyEnum.DELIMITER.getLabel()); + assertEquals("语义切片", RagChunkStrategyEnum.SEMANTIC.getLabel()); } } diff --git a/src/test/java/com/bruce/common/enumconfig/SysEnumDataInitTests.java b/src/test/java/com/bruce/common/enumconfig/SysEnumDataInitTests.java index 8eb7e5d..72a0cf9 100644 --- a/src/test/java/com/bruce/common/enumconfig/SysEnumDataInitTests.java +++ b/src/test/java/com/bruce/common/enumconfig/SysEnumDataInitTests.java @@ -5,6 +5,7 @@ import com.bruce.common.domain.entity.SysEnum; import com.bruce.common.enums.CommonStatusEnum; import com.bruce.common.enums.EnableStatusEnum; import 
com.bruce.common.service.ISysEnumService; +import com.bruce.rag.enums.RagChunkStrategyEnum; import com.bruce.rag.enums.RagIndexStatusEnum; import com.bruce.rag.enums.RagParseStatusEnum; import org.junit.jupiter.api.Test; @@ -40,6 +41,13 @@ class SysEnumDataInitTests { saveOrUpdate("rag", "index_status", RagIndexStatusEnum.INDEXING.getLabel(), RagIndexStatusEnum.INDEXING.getValue(), 2, "RAG文档索引状态"); saveOrUpdate("rag", "index_status", RagIndexStatusEnum.INDEXED.getLabel(), RagIndexStatusEnum.INDEXED.getValue(), 3, "RAG文档索引状态"); saveOrUpdate("rag", "index_status", RagIndexStatusEnum.FAILED.getLabel(), RagIndexStatusEnum.FAILED.getValue(), 4, "RAG文档索引状态"); + + saveOrUpdate("rag", "chunk_strategy", RagChunkStrategyEnum.FIXED_LENGTH.getLabel(), RagChunkStrategyEnum.FIXED_LENGTH.getValue(), 1, "RAG文档切片方式"); + saveOrUpdate("rag", "chunk_strategy", RagChunkStrategyEnum.PARAGRAPH.getLabel(), RagChunkStrategyEnum.PARAGRAPH.getValue(), 2, "RAG文档切片方式"); + saveOrUpdate("rag", "chunk_strategy", RagChunkStrategyEnum.HEADING.getLabel(), RagChunkStrategyEnum.HEADING.getValue(), 3, "RAG文档切片方式"); + saveOrUpdate("rag", "chunk_strategy", RagChunkStrategyEnum.TABLE_ROW.getLabel(), RagChunkStrategyEnum.TABLE_ROW.getValue(), 4, "RAG文档切片方式"); + saveOrUpdate("rag", "chunk_strategy", RagChunkStrategyEnum.DELIMITER.getLabel(), RagChunkStrategyEnum.DELIMITER.getValue(), 5, "RAG文档切片方式"); + saveOrUpdate("rag", "chunk_strategy", RagChunkStrategyEnum.SEMANTIC.getLabel(), RagChunkStrategyEnum.SEMANTIC.getValue(), 6, "RAG文档切片方式"); } private void saveOrUpdate(String catalog, String type, String name, Integer value, Integer sort, String remark) { diff --git a/src/test/java/com/bruce/rag/RagComponentStructureTests.java b/src/test/java/com/bruce/rag/RagComponentStructureTests.java index d7c9bdc..c9f586a 100644 --- a/src/test/java/com/bruce/rag/RagComponentStructureTests.java +++ b/src/test/java/com/bruce/rag/RagComponentStructureTests.java @@ -8,18 +8,29 @@ import 
com.bruce.rag.constant.RagSystemConstants; import com.bruce.rag.controller.RagDocumentController; import com.bruce.rag.controller.RagStoreController; import com.bruce.rag.dto.request.RagDocumentQueryRequest; +import com.bruce.rag.dto.request.RagDocumentParseRequest; import com.bruce.rag.dto.request.RagStoreQueryRequest; import com.bruce.rag.dto.request.RagStoreSaveRequest; +import com.bruce.rag.dto.response.RagDocumentParseResponse; import com.bruce.rag.dto.response.RagStoreDocumentOverviewResponse; import com.bruce.rag.dto.response.RagStoreOverviewResponse; import com.bruce.rag.dto.response.RagDocumentResponse; import com.bruce.rag.dto.response.RagStoreResponse; +import com.bruce.rag.entity.RagChunk; +import com.bruce.rag.entity.RagChunkEmbedding; import com.bruce.rag.entity.RagDocument; import com.bruce.rag.entity.RagStore; +import com.bruce.rag.mapper.RagChunkEmbeddingMapper; +import com.bruce.rag.mapper.RagChunkMapper; import com.bruce.rag.mapper.RagDocumentMapper; import com.bruce.rag.mapper.RagStoreMapper; +import com.bruce.rag.service.IRagChunkEmbeddingService; +import com.bruce.rag.service.IRagChunkService; +import com.bruce.rag.service.IRagDocumentParseService; import com.bruce.rag.service.IRagDocumentService; import com.bruce.rag.service.IRagStoreService; +import com.bruce.rag.service.impl.RagChunkEmbeddingServiceImpl; +import com.bruce.rag.service.impl.RagChunkServiceImpl; import com.bruce.rag.service.impl.RagDocumentServiceImpl; import com.bruce.rag.service.impl.RagStoreServiceImpl; import org.junit.jupiter.api.Test; @@ -39,10 +50,16 @@ class RagComponentStructureTests { void ragComponentsShouldReuseMybatisPlusBaseTypes() { assertTrue(BaseMapper.class.isAssignableFrom(RagStoreMapper.class)); assertTrue(BaseMapper.class.isAssignableFrom(RagDocumentMapper.class)); + assertTrue(BaseMapper.class.isAssignableFrom(RagChunkMapper.class)); + assertTrue(BaseMapper.class.isAssignableFrom(RagChunkEmbeddingMapper.class)); 
assertTrue(IService.class.isAssignableFrom(IRagStoreService.class)); assertTrue(IService.class.isAssignableFrom(IRagDocumentService.class)); + assertTrue(IService.class.isAssignableFrom(IRagChunkService.class)); + assertTrue(IService.class.isAssignableFrom(IRagChunkEmbeddingService.class)); assertTrue(ServiceImpl.class.isAssignableFrom(RagStoreServiceImpl.class)); assertTrue(ServiceImpl.class.isAssignableFrom(RagDocumentServiceImpl.class)); + assertTrue(ServiceImpl.class.isAssignableFrom(RagChunkServiceImpl.class)); + assertTrue(ServiceImpl.class.isAssignableFrom(RagChunkEmbeddingServiceImpl.class)); } @Test @@ -63,8 +80,10 @@ class RagComponentStructureTests { Method documentListMethod = RagDocumentController.class.getMethod("list"); Method documentQueryMethod = RagDocumentController.class.getMethod("query", RagDocumentQueryRequest.class); + Method documentParseMethod = RagDocumentController.class.getMethod("parse", RagDocumentParseRequest.class); Method documentResponseListMethod = IRagDocumentService.class.getMethod("listResponses"); Method documentServiceQueryMethod = IRagDocumentService.class.getMethod("query", RagDocumentQueryRequest.class); + Method documentParseServiceMethod = IRagDocumentParseService.class.getMethod("parse", RagDocumentParseRequest.class); assertEquals(RequestResult.class, storeListMethod.getReturnType()); assertEquals(RequestResult.class, storeQueryMethod.getReturnType()); @@ -89,11 +108,14 @@ class RagComponentStructureTests { assertEquals(RequestResult.class, documentListMethod.getReturnType()); assertEquals(RequestResult.class, documentQueryMethod.getReturnType()); + assertEquals(RequestResult.class, documentParseMethod.getReturnType()); assertEquals(List.class, documentServiceQueryMethod.getReturnType()); + assertEquals(List.class, documentParseServiceMethod.getReturnType()); assertTrue(documentResponseListMethod.getGenericReturnType().getTypeName().contains("RagDocumentResponse")); 
assertTrue(documentServiceQueryMethod.getGenericReturnType().getTypeName().contains("RagDocumentResponse")); assertTrue(documentListMethod.getGenericReturnType().getTypeName().contains("RagDocumentResponse")); assertTrue(documentQueryMethod.getGenericReturnType().getTypeName().contains("RagDocumentResponse")); + assertTrue(documentParseMethod.getGenericReturnType().getTypeName().contains("RagDocumentParseResponse")); assertEquals(RagDocumentResponse.class, RagDocumentResponse.class.getMethod("fromEntity", RagDocument.class).getReturnType()); } @@ -121,4 +143,34 @@ class RagComponentStructureTests { assertTrue(RagStoreController.class.getSimpleName().contains("RagStoreController")); assertTrue(RagDocumentController.class.getSimpleName().contains("RagDocumentController")); } + + @Test + void ragChunkStructureShouldSupportChunkMetadata() throws NoSuchFieldException { + assertEquals(Long.class, RagChunk.class.getDeclaredField("storeId").getType()); + assertEquals(Long.class, RagChunk.class.getDeclaredField("documentId").getType()); + assertEquals(Integer.class, RagChunk.class.getDeclaredField("chunkIndex").getType()); + assertEquals(String.class, RagChunk.class.getDeclaredField("chunkContent").getType()); + assertEquals(String.class, RagChunk.class.getDeclaredField("chunkSummary").getType()); + assertEquals(Integer.class, RagChunk.class.getDeclaredField("tokenCount").getType()); + assertEquals(Integer.class, RagChunk.class.getDeclaredField("pageNumber").getType()); + assertEquals(String.class, RagChunk.class.getDeclaredField("sectionTitle").getType()); + assertEquals(String.class, RagChunk.class.getDeclaredField("headingPath").getType()); + assertEquals(String.class, RagChunk.class.getDeclaredField("vectorId").getType()); + assertEquals(String.class, RagChunk.class.getDeclaredField("metadataJson").getType()); + assertEquals(Boolean.class, RagChunk.class.getDeclaredField("enabled").getType()); + assertEquals(String.class, 
RagChunk.class.getDeclaredField("remark").getType()); + } + + @Test + void ragChunkEmbeddingStructureShouldSupportPgvectorMetadata() throws NoSuchFieldException { + assertEquals(Long.class, RagChunkEmbedding.class.getDeclaredField("storeId").getType()); + assertEquals(Long.class, RagChunkEmbedding.class.getDeclaredField("documentId").getType()); + assertEquals(Long.class, RagChunkEmbedding.class.getDeclaredField("chunkId").getType()); + assertEquals(String.class, RagChunkEmbedding.class.getDeclaredField("embeddingModel").getType()); + assertEquals(Integer.class, RagChunkEmbedding.class.getDeclaredField("embeddingDimension").getType()); + assertEquals(String.class, RagChunkEmbedding.class.getDeclaredField("embedding").getType()); + assertEquals(String.class, RagChunkEmbedding.class.getDeclaredField("contentHash").getType()); + assertEquals(Boolean.class, RagChunkEmbedding.class.getDeclaredField("enabled").getType()); + assertEquals(String.class, RagChunkEmbedding.class.getDeclaredField("remark").getType()); + } } diff --git a/src/test/java/com/bruce/rag/RagDocumentParseServiceImplTests.java b/src/test/java/com/bruce/rag/RagDocumentParseServiceImplTests.java new file mode 100644 index 0000000..f660c3f --- /dev/null +++ b/src/test/java/com/bruce/rag/RagDocumentParseServiceImplTests.java @@ -0,0 +1,160 @@ +package com.bruce.rag; + +import com.bruce.common.config.AttachmentProperties; +import com.bruce.common.document.parse.DocumentParseContext; +import com.bruce.common.document.parse.DocumentParseResult; +import com.bruce.common.document.parse.DocumentParser; +import com.bruce.common.document.parse.DocumentParserFactory; +import com.bruce.common.domain.entity.SysAttachment; +import com.bruce.common.service.ISysAttachmentService; +import com.bruce.rag.dto.request.RagDocumentParseRequest; +import com.bruce.rag.dto.response.RagDocumentParseResponse; +import com.bruce.rag.entity.RagDocument; +import com.bruce.rag.enums.RagParseStatusEnum; +import 
com.bruce.rag.service.IRagDocumentService; +import com.bruce.rag.service.impl.RagDocumentParseServiceImpl; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.ArgumentCaptor; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class RagDocumentParseServiceImplTests { + + @TempDir + private Path tempDir; + + @Mock + private IRagDocumentService ragDocumentService; + + @Mock + private ISysAttachmentService sysAttachmentService; + + @Test + void parseShouldUpdateStatusAndReturnParseResponse() throws Exception { + Path file = tempDir.resolve("rag").resolve("people.txt"); + Files.createDirectories(file.getParent()); + Files.writeString(file, "people profiles"); + + RagDocument document = new RagDocument(); + document.setId(1001L); + document.setStoreId(2002L); + document.setAttachmentId(3003L); + document.setParseStatus(RagParseStatusEnum.UPLOADED.name()); + + SysAttachment attachment = new SysAttachment(); + attachment.setId(3003L); + attachment.setOriginalName("people.txt"); + attachment.setFileSuffix("txt"); + attachment.setContentType("text/plain"); + attachment.setFilePath("rag/people.txt"); + + AttachmentProperties attachmentProperties = new AttachmentProperties(); + attachmentProperties.setBasePath(tempDir.toString()); + DocumentParser parser = new FixedDocumentParser("people profiles"); + RagDocumentParseServiceImpl service = new RagDocumentParseServiceImpl( + ragDocumentService, + sysAttachmentService, + 
attachmentProperties, + new DocumentParserFactory(List.of(parser)) + ); + + when(ragDocumentService.getById(1001L)).thenReturn(document); + when(sysAttachmentService.getById(3003L)).thenReturn(attachment); + when(ragDocumentService.updateById(any(RagDocument.class))).thenReturn(true); + + RagDocumentParseResponse response = service.parse(1001L); + + assertEquals(1001L, response.getDocumentId()); + assertEquals(RagParseStatusEnum.PARSED.name(), response.getParseStatus()); + assertEquals(15, response.getTextLength()); + assertEquals("fixed", response.getMetadata().get("parser")); + + ArgumentCaptor captor = ArgumentCaptor.forClass(RagDocument.class); + verify(ragDocumentService, times(2)).updateById(captor.capture()); + List updates = captor.getAllValues(); + assertEquals(RagParseStatusEnum.PARSING.name(), updates.get(0).getParseStatus()); + assertEquals(RagParseStatusEnum.PARSED.name(), updates.get(1).getParseStatus()); + assertTrue(parser.supports(new DocumentParseContext())); + } + + @Test + void parseShouldSupportBatchRequestAndChunkStrategyStructure() throws Exception { + Path file = tempDir.resolve("rag").resolve("batch.txt"); + Files.createDirectories(file.getParent()); + Files.writeString(file, "batch profiles"); + + RagDocument document = new RagDocument(); + document.setId(1002L); + document.setStoreId(2002L); + document.setAttachmentId(3004L); + document.setParseStatus(RagParseStatusEnum.UPLOADED.name()); + + SysAttachment attachment = new SysAttachment(); + attachment.setId(3004L); + attachment.setOriginalName("batch.txt"); + attachment.setFileSuffix("txt"); + attachment.setContentType("text/plain"); + attachment.setFilePath("rag/batch.txt"); + + AttachmentProperties attachmentProperties = new AttachmentProperties(); + attachmentProperties.setBasePath(tempDir.toString()); + RagDocumentParseServiceImpl service = new RagDocumentParseServiceImpl( + ragDocumentService, + sysAttachmentService, + attachmentProperties, + new DocumentParserFactory(List.of(new 
FixedDocumentParser("batch profiles"))) + ); + RagDocumentParseRequest request = new RagDocumentParseRequest(); + request.setDocumentIds(List.of(1002L)); + request.setChunkStrategy("DELIMITER"); + request.setDelimiter("。"); + + when(ragDocumentService.getById(1002L)).thenReturn(document); + when(sysAttachmentService.getById(3004L)).thenReturn(attachment); + when(ragDocumentService.updateById(any(RagDocument.class))).thenReturn(true); + + List responses = service.parse(request); + + assertEquals(1, responses.size()); + assertEquals(1002L, responses.getFirst().getDocumentId()); + assertEquals(RagParseStatusEnum.PARSED.name(), responses.getFirst().getParseStatus()); + } + + private static class FixedDocumentParser implements DocumentParser { + + private final String text; + + private FixedDocumentParser(String text) { + this.text = text; + } + + @Override + public boolean supports(DocumentParseContext context) { + return true; + } + + @Override + public DocumentParseResult parse(DocumentParseContext context) { + DocumentParseResult result = new DocumentParseResult(); + result.setText(text); + result.setTextLength(text.length()); + result.setMetadata(Map.of("parser", "fixed")); + return result; + } + } +} diff --git a/src/test/java/com/bruce/rag/RagDocumentServiceImplTests.java b/src/test/java/com/bruce/rag/RagDocumentServiceImplTests.java index 419f89c..4d21850 100644 --- a/src/test/java/com/bruce/rag/RagDocumentServiceImplTests.java +++ b/src/test/java/com/bruce/rag/RagDocumentServiceImplTests.java @@ -102,13 +102,13 @@ class RagDocumentServiceImplTests { request.setRemark(" 备注信息 "); doReturn(existingDocument).when(ragDocumentService).getById(3003L); - doReturn(true).when(ragDocumentService).saveOrUpdate(any(RagDocument.class)); + doReturn(true).when(ragDocumentService).updateById(any(RagDocument.class)); boolean result = ragDocumentService.saveOrUpdate(request); assertTrue(result); ArgumentCaptor documentCaptor = ArgumentCaptor.forClass(RagDocument.class); - 
verify(ragDocumentService).saveOrUpdate(documentCaptor.capture()); + verify(ragDocumentService).updateById(documentCaptor.capture()); RagDocument savedDocument = documentCaptor.getValue(); assertEquals(3003L, savedDocument.getId()); assertEquals(1001L, savedDocument.getStoreId()); @@ -121,4 +121,40 @@ class RagDocumentServiceImplTests { assertEquals("已修复", savedDocument.getErrorMessage()); assertEquals("备注信息", savedDocument.getRemark()); } + + @Test + void saveOrUpdateShouldPreserveExistingFieldsForPartialUpdate() { + RagDocument existingDocument = new RagDocument(); + existingDocument.setId(3003L); + existingDocument.setStoreId(1001L); + existingDocument.setAttachmentId(2002L); + existingDocument.setDocumentTitle("people_profiles.txt"); + existingDocument.setDocumentSummary("测试人员信息,有多条人员信息"); + existingDocument.setParseStatus(RagParseStatusEnum.UPLOADED.name()); + existingDocument.setIndexStatus(RagIndexStatusEnum.PENDING.name()); + existingDocument.setEnabled(true); + existingDocument.setRemark("测试人员信息"); + + RagDocumentSaveRequest request = new RagDocumentSaveRequest(); + request.setId(3003L); + request.setStoreId(1001L); + request.setDocumentTitle("people_profiles.txt"); + request.setEnabled(false); + + doReturn(existingDocument).when(ragDocumentService).getById(3003L); + doReturn(true).when(ragDocumentService).updateById(any(RagDocument.class)); + + boolean result = ragDocumentService.saveOrUpdate(request); + + assertTrue(result); + ArgumentCaptor documentCaptor = ArgumentCaptor.forClass(RagDocument.class); + verify(ragDocumentService).updateById(documentCaptor.capture()); + RagDocument savedDocument = documentCaptor.getValue(); + assertEquals(2002L, savedDocument.getAttachmentId()); + assertEquals("测试人员信息,有多条人员信息", savedDocument.getDocumentSummary()); + assertEquals(RagParseStatusEnum.UPLOADED.name(), savedDocument.getParseStatus()); + assertEquals(RagIndexStatusEnum.PENDING.name(), savedDocument.getIndexStatus()); + assertEquals(false, 
savedDocument.getEnabled()); + assertEquals("测试人员信息", savedDocument.getRemark()); + } }