diff --git a/pom.xml b/pom.xml
index b9eeb8d..59fc07c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -29,6 +29,7 @@
21
3.5.16
+ 3.2.3
@@ -64,6 +65,18 @@
jackson-annotations
+
+ org.apache.tika
+ tika-core
+ ${tika.version}
+
+
+
+ org.apache.tika
+ tika-parsers-standard-package
+ ${tika.version}
+
+
org.springdoc
springdoc-openapi-starter-webmvc-ui
diff --git a/script/sql/rag_chunk.sql b/script/sql/rag_chunk.sql
new file mode 100644
index 0000000..340d66d
--- /dev/null
+++ b/script/sql/rag_chunk.sql
@@ -0,0 +1,53 @@
+DROP TABLE IF EXISTS rag_chunk; -- destructive re-create; errors while rag_chunk_embedding still holds its FK to this table, so drop that table first
+-- One row per text chunk cut from a RAG document; (document_id, chunk_index) identifies a chunk.
+CREATE TABLE rag_chunk (
+ id BIGSERIAL PRIMARY KEY,
+ store_id BIGINT NOT NULL,
+ document_id BIGINT NOT NULL,
+ chunk_index INTEGER NOT NULL,
+ chunk_content TEXT NOT NULL,
+ chunk_summary VARCHAR(1000) DEFAULT '',
+ token_count INTEGER,
+ page_number INTEGER,
+ section_title VARCHAR(255) DEFAULT '',
+ heading_path VARCHAR(1000) DEFAULT '',
+ vector_id VARCHAR(128),
+ metadata_json JSONB NOT NULL DEFAULT '{}'::jsonb,
+ enabled BOOLEAN NOT NULL DEFAULT TRUE,
+ version INTEGER NOT NULL DEFAULT 1, -- NOTE(review): presumably the MyBatis-Plus optimistic-lock counter; confirm against BaseEntity
+ create_time TIMESTAMP, -- no default: the writer must supply audit timestamps
+ update_time TIMESTAMP,
+ remark VARCHAR(500) DEFAULT '',
+ create_by VARCHAR(64),
+ update_by VARCHAR(64),
+ CONSTRAINT uk_rag_chunk_document_index UNIQUE (document_id, chunk_index),
+ CONSTRAINT fk_rag_chunk_store_id FOREIGN KEY (store_id) REFERENCES rag_store (id),
+ CONSTRAINT fk_rag_chunk_document_id FOREIGN KEY (document_id) REFERENCES rag_document (id)
+);
+
+CREATE INDEX idx_rag_chunk_store_id ON rag_chunk (store_id);
+-- No standalone index on document_id: the index behind uk_rag_chunk_document_index (document_id, chunk_index) already serves lookups on its leading column.
+CREATE INDEX idx_rag_chunk_enabled ON rag_chunk (enabled);
+CREATE INDEX idx_rag_chunk_vector_id ON rag_chunk (vector_id);
+CREATE INDEX idx_rag_chunk_metadata_json ON rag_chunk USING GIN (metadata_json);
+
+COMMENT ON TABLE rag_chunk IS 'RAG知识切片表';
+COMMENT ON COLUMN rag_chunk.id IS 'ID';
+COMMENT ON COLUMN rag_chunk.store_id IS '知识库ID';
+COMMENT ON COLUMN rag_chunk.document_id IS '文档ID';
+COMMENT ON COLUMN rag_chunk.chunk_index IS '文档内切片序号';
+COMMENT ON COLUMN rag_chunk.chunk_content IS '切片内容';
+COMMENT ON COLUMN rag_chunk.chunk_summary IS '切片摘要';
+COMMENT ON COLUMN rag_chunk.token_count IS 'Token数量';
+COMMENT ON COLUMN rag_chunk.page_number IS '页码';
+COMMENT ON COLUMN rag_chunk.section_title IS '章节标题';
+COMMENT ON COLUMN rag_chunk.heading_path IS '标题路径';
+COMMENT ON COLUMN rag_chunk.vector_id IS '向量ID';
+COMMENT ON COLUMN rag_chunk.metadata_json IS '切片级扩展元数据';
+COMMENT ON COLUMN rag_chunk.enabled IS '是否启用';
+COMMENT ON COLUMN rag_chunk.version IS '版本';
+COMMENT ON COLUMN rag_chunk.create_time IS '创建时间';
+COMMENT ON COLUMN rag_chunk.update_time IS '更新时间';
+COMMENT ON COLUMN rag_chunk.remark IS '备注';
+COMMENT ON COLUMN rag_chunk.create_by IS '创建者';
+COMMENT ON COLUMN rag_chunk.update_by IS '更新者';
diff --git a/script/sql/rag_chunk_embedding.sql b/script/sql/rag_chunk_embedding.sql
new file mode 100644
index 0000000..0cbf208
--- /dev/null
+++ b/script/sql/rag_chunk_embedding.sql
@@ -0,0 +1,50 @@
+CREATE EXTENSION IF NOT EXISTS vector; -- pgvector: provides the VECTOR type and the hnsw index method used below
+
+DROP TABLE IF EXISTS rag_chunk_embedding; -- destructive re-create: all stored embeddings are lost
+-- One embedding per (chunk, embedding model); store_id and document_id are carried alongside chunk_id.
+CREATE TABLE rag_chunk_embedding (
+ id BIGSERIAL PRIMARY KEY,
+ store_id BIGINT NOT NULL,
+ document_id BIGINT NOT NULL,
+ chunk_id BIGINT NOT NULL,
+ embedding_model VARCHAR(100) NOT NULL,
+ embedding_dimension INTEGER NOT NULL DEFAULT 1024, -- keep in sync with the VECTOR(1024) column type below
+ embedding VECTOR(1024) NOT NULL, -- fixed width: vectors of any other dimension are rejected
+ content_hash VARCHAR(64),
+ enabled BOOLEAN NOT NULL DEFAULT TRUE,
+ version INTEGER NOT NULL DEFAULT 1,
+ create_time TIMESTAMP,
+ update_time TIMESTAMP,
+ remark VARCHAR(500) DEFAULT '',
+ create_by VARCHAR(64),
+ update_by VARCHAR(64),
+ CONSTRAINT uk_rag_chunk_embedding_chunk_model UNIQUE (chunk_id, embedding_model),
+ CONSTRAINT fk_rag_chunk_embedding_store_id FOREIGN KEY (store_id) REFERENCES rag_store (id),
+ CONSTRAINT fk_rag_chunk_embedding_document_id FOREIGN KEY (document_id) REFERENCES rag_document (id),
+ CONSTRAINT fk_rag_chunk_embedding_chunk_id FOREIGN KEY (chunk_id) REFERENCES rag_chunk (id)
+);
+
+CREATE INDEX idx_rag_chunk_embedding_store_id ON rag_chunk_embedding (store_id);
+CREATE INDEX idx_rag_chunk_embedding_document_id ON rag_chunk_embedding (document_id);
+-- No standalone index on chunk_id: the index behind uk_rag_chunk_embedding_chunk_model (chunk_id, embedding_model) already serves lookups on its leading column.
+CREATE INDEX idx_rag_chunk_embedding_model ON rag_chunk_embedding (embedding_model);
+CREATE INDEX idx_rag_chunk_embedding_enabled ON rag_chunk_embedding (enabled);
+CREATE INDEX idx_rag_chunk_embedding_vector_hnsw
+ ON rag_chunk_embedding USING hnsw (embedding vector_cosine_ops); -- approximate nearest-neighbour index for cosine distance
+
+COMMENT ON TABLE rag_chunk_embedding IS 'RAG切片向量表';
+COMMENT ON COLUMN rag_chunk_embedding.id IS 'ID';
+COMMENT ON COLUMN rag_chunk_embedding.store_id IS '知识库ID';
+COMMENT ON COLUMN rag_chunk_embedding.document_id IS '文档ID';
+COMMENT ON COLUMN rag_chunk_embedding.chunk_id IS '切片ID';
+COMMENT ON COLUMN rag_chunk_embedding.embedding_model IS '向量模型';
+COMMENT ON COLUMN rag_chunk_embedding.embedding_dimension IS '向量维度';
+COMMENT ON COLUMN rag_chunk_embedding.embedding IS '向量内容';
+COMMENT ON COLUMN rag_chunk_embedding.content_hash IS '向量生成内容哈希';
+COMMENT ON COLUMN rag_chunk_embedding.enabled IS '是否启用';
+COMMENT ON COLUMN rag_chunk_embedding.version IS '版本';
+COMMENT ON COLUMN rag_chunk_embedding.create_time IS '创建时间';
+COMMENT ON COLUMN rag_chunk_embedding.update_time IS '更新时间';
+COMMENT ON COLUMN rag_chunk_embedding.remark IS '备注';
+COMMENT ON COLUMN rag_chunk_embedding.create_by IS '创建者';
+COMMENT ON COLUMN rag_chunk_embedding.update_by IS '更新者';
diff --git a/script/sql/rag_chunk_strategy_enum.sql b/script/sql/rag_chunk_strategy_enum.sql
new file mode 100644
index 0000000..ea1ed52
--- /dev/null
+++ b/script/sql/rag_chunk_strategy_enum.sql
@@ -0,0 +1,15 @@
+-- Seeds or refreshes the six rag/chunk_strategy rows of sys_enum; the conflict target needs a unique index on (catalog, type, name), which is not shown here.
+INSERT INTO sys_enum (catalog, type, name, value, strvalue, sort, version, remark)
+VALUES
+ ('rag', 'chunk_strategy', '固定长度切片', 1, 'FIXED_LENGTH', 1, 1, 'RAG文档切片方式'),
+ ('rag', 'chunk_strategy', '按段落切片', 2, 'PARAGRAPH', 2, 1, 'RAG文档切片方式'),
+ ('rag', 'chunk_strategy', '按标题层级切片', 3, 'HEADING', 3, 1, 'RAG文档切片方式'),
+ ('rag', 'chunk_strategy', '按表格行切片', 4, 'TABLE_ROW', 4, 1, 'RAG文档切片方式'),
+ ('rag', 'chunk_strategy', '按分隔符切片', 5, 'DELIMITER', 5, 1, 'RAG文档切片方式'),
+ ('rag', 'chunk_strategy', '语义切片', 6, 'SEMANTIC', 6, 1, 'RAG文档切片方式')
+ON CONFLICT (catalog, type, name) DO UPDATE SET
+ strvalue = EXCLUDED.strvalue,
+ value = EXCLUDED.value,
+ remark = EXCLUDED.remark,
+ sort = EXCLUDED.sort,
+ update_time = CURRENT_TIMESTAMP; -- version is left untouched on existing rows
diff --git a/src/main/java/com/bruce/common/config/MybatisPlusConfig.java b/src/main/java/com/bruce/common/config/MybatisPlusConfig.java
new file mode 100644
index 0000000..dde5150
--- /dev/null
+++ b/src/main/java/com/bruce/common/config/MybatisPlusConfig.java
@@ -0,0 +1,17 @@
+package com.bruce.common.config;
+
+import com.baomidou.mybatisplus.extension.plugins.MybatisPlusInterceptor;
+import com.baomidou.mybatisplus.extension.plugins.inner.OptimisticLockerInnerInterceptor;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+/** Spring configuration that publishes the MyBatis-Plus interceptor chain as a bean. */
+@Configuration
+public class MybatisPlusConfig {
+ /** Creates the interceptor chain with the optimistic-locker inner interceptor as its only member. */
+ @Bean
+ public MybatisPlusInterceptor mybatisPlusInterceptor() {
+ MybatisPlusInterceptor interceptor = new MybatisPlusInterceptor();
+ interceptor.addInnerInterceptor(new OptimisticLockerInnerInterceptor()); // NOTE(review): presumably pairs with a @Version field on the entities; confirm
+ return interceptor;
+ }
+}
diff --git a/src/main/java/com/bruce/common/document/parse/DocumentParseContext.java b/src/main/java/com/bruce/common/document/parse/DocumentParseContext.java
new file mode 100644
index 0000000..b148600
--- /dev/null
+++ b/src/main/java/com/bruce/common/document/parse/DocumentParseContext.java
@@ -0,0 +1,21 @@
+package com.bruce.common.document.parse;
+
+import lombok.Data;
+
+import java.nio.file.Path;
+/** Input handed to a DocumentParser: identifiers plus the name, type hints and location of the file to parse. */
+@Data
+public class DocumentParseContext {
+ /** ID of the document being parsed; not read by the Tika-based parsers in this change. */
+ private Long documentId;
+ /** ID of the attachment backing the document; not read by the Tika-based parsers in this change. */
+ private Long attachmentId;
+ /** Original file name; forwarded to Tika as the resource-name hint. */
+ private String originalName;
+ /** File suffix; parsers trim and lower-case it before matching against their suffix sets. */
+ private String suffix;
+ /** Content type; parsers prefix-match it and pass it to Tika as a hint when non-blank. */
+ private String contentType;
+ /** Path of the file whose bytes are read for parsing. */
+ private Path filePath;
+}
diff --git a/src/main/java/com/bruce/common/document/parse/DocumentParseException.java b/src/main/java/com/bruce/common/document/parse/DocumentParseException.java
new file mode 100644
index 0000000..345ed96
--- /dev/null
+++ b/src/main/java/com/bruce/common/document/parse/DocumentParseException.java
@@ -0,0 +1,12 @@
+package com.bruce.common.document.parse;
+/** Unchecked exception raised when a document cannot be parsed or no parser supports it. */
+public class DocumentParseException extends RuntimeException {
+ /** Creates the exception with a message only. */
+ public DocumentParseException(String message) {
+ super(message);
+ }
+ /** Creates the exception with a message and the underlying cause. */
+ public DocumentParseException(String message, Throwable cause) {
+ super(message, cause);
+ }
+}
diff --git a/src/main/java/com/bruce/common/document/parse/DocumentParseResult.java b/src/main/java/com/bruce/common/document/parse/DocumentParseResult.java
new file mode 100644
index 0000000..bc35e66
--- /dev/null
+++ b/src/main/java/com/bruce/common/document/parse/DocumentParseResult.java
@@ -0,0 +1,20 @@
+package com.bruce.common.document.parse;
+
+import lombok.Data;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+/** Output of a DocumentParser: the extracted text plus counters and parser-reported metadata. */
+@Data
+public class DocumentParseResult {
+ /** Extracted text; the Tika-based parsers store it trimmed and never null. */
+ private String text;
+ /** Length of {@code text} in chars. */
+ private Integer textLength;
+ /** Page count; not populated by the Tika-based parsers in this change. */
+ private Integer pageCount;
+ /** Sheet count; not populated by the Tika-based parsers in this change. */
+ private Integer sheetCount;
+ /** Insertion-ordered metadata; the Tika-based parsers put "contentType" and "resourceName" here. */
+ private Map metadata = new LinkedHashMap<>();
+}
diff --git a/src/main/java/com/bruce/common/document/parse/DocumentParser.java b/src/main/java/com/bruce/common/document/parse/DocumentParser.java
new file mode 100644
index 0000000..8de2ef7
--- /dev/null
+++ b/src/main/java/com/bruce/common/document/parse/DocumentParser.java
@@ -0,0 +1,8 @@
+package com.bruce.common.document.parse;
+/** Strategy interface for turning a file described by a DocumentParseContext into a DocumentParseResult. */
+public interface DocumentParser {
+ /** Returns whether this parser can handle the file described by the context. */
+ boolean supports(DocumentParseContext context);
+ /** Parses the file described by the context and returns the extracted result. */
+ DocumentParseResult parse(DocumentParseContext context);
+}
diff --git a/src/main/java/com/bruce/common/document/parse/DocumentParserFactory.java b/src/main/java/com/bruce/common/document/parse/DocumentParserFactory.java
new file mode 100644
index 0000000..f1fb6e9
--- /dev/null
+++ b/src/main/java/com/bruce/common/document/parse/DocumentParserFactory.java
@@ -0,0 +1,37 @@
+package com.bruce.common.document.parse;
+
+import org.springframework.stereotype.Component;
+import org.springframework.util.StringUtils;
+
+import java.util.List;
+import java.util.Locale;
+/** Picks the DocumentParser to use for a parse context from the injected parser list. */
+@Component
+public class DocumentParserFactory {
+
+ private final List parsers; // candidates, consulted in list order
+
+ public DocumentParserFactory(List parsers) {
+ this.parsers = parsers;
+ }
+ /** Returns the first parser whose supports(context) is true; throws DocumentParseException when none matches. */
+ public DocumentParser resolve(DocumentParseContext context) {
+ return parsers.stream()
+ .filter(parser -> parser.supports(context))
+ .findFirst()
+ .orElseThrow(() -> new DocumentParseException("不支持的文档类型: " + resolveType(context)));
+ }
+ /** Names the context for the error message: trimmed lower-cased suffix, else trimmed content type, else "unknown". */
+ private String resolveType(DocumentParseContext context) {
+ if (context == null) {
+ return "unknown";
+ }
+ if (StringUtils.hasText(context.getSuffix())) {
+ return context.getSuffix().trim().toLowerCase(Locale.ROOT);
+ }
+ if (StringUtils.hasText(context.getContentType())) {
+ return context.getContentType().trim();
+ }
+ return "unknown";
+ }
+}
diff --git a/src/main/java/com/bruce/common/document/parse/impl/AbstractTikaDocumentParser.java b/src/main/java/com/bruce/common/document/parse/impl/AbstractTikaDocumentParser.java
new file mode 100644
index 0000000..65e3e3b
--- /dev/null
+++ b/src/main/java/com/bruce/common/document/parse/impl/AbstractTikaDocumentParser.java
@@ -0,0 +1,64 @@
+package com.bruce.common.document.parse.impl;
+
+import com.bruce.common.document.parse.DocumentParseContext;
+import com.bruce.common.document.parse.DocumentParseException;
+import com.bruce.common.document.parse.DocumentParseResult;
+import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.springframework.util.StringUtils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.util.Locale;
+import java.util.Set;
+/** Package-private base for the Tika-backed parsers: suffix/content-type matching helpers and text extraction. */
+abstract class AbstractTikaDocumentParser {
+
+ private static final int MAX_TEXT_LENGTH = -1; // passed to Tika.parseToString as maxLength; -1 disables Tika's output limit, so the whole text is held in memory
+
+ private final Tika tika = new Tika(); // one facade per parser instance
+ /** True when the context's suffix, trimmed and lower-cased, is contained in the given set. */
+ boolean supportsSuffix(DocumentParseContext context, Set suffixes) {
+ return context != null
+ && StringUtils.hasText(context.getSuffix())
+ && suffixes.contains(context.getSuffix().trim().toLowerCase(Locale.ROOT));
+ }
+ /** True when the context's content type, trimmed and lower-cased, starts with prefix; prefix itself is not normalised. */
+ boolean supportsContentType(DocumentParseContext context, String prefix) {
+ return context != null
+ && StringUtils.hasText(context.getContentType())
+ && context.getContentType().trim().toLowerCase(Locale.ROOT).startsWith(prefix);
+ }
+ /** Extracts text from the file at context.getFilePath() via Tika; throws DocumentParseException on missing input or on IOException/TikaException. */
+ DocumentParseResult parseWithTika(DocumentParseContext context) {
+ if (context == null || context.getFilePath() == null) {
+ throw new DocumentParseException("解析文件不能为空");
+ }
+ try {
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, context.getOriginalName()); // file-name hint for Tika
+ if (StringUtils.hasText(context.getContentType())) {
+ metadata.set(Metadata.CONTENT_TYPE, context.getContentType()); // content-type hint, only when provided
+ }
+ String text;
+ try (InputStream inputStream = Files.newInputStream(context.getFilePath())) { // stream is closed even if parsing throws
+ text = tika.parseToString(inputStream, metadata, MAX_TEXT_LENGTH);
+ }
+ DocumentParseResult result = new DocumentParseResult();
+ result.setText(text == null ? "" : text.trim()); // never null, always trimmed
+ result.setTextLength(result.getText().length());
+ result.getMetadata().put("contentType", firstNonBlank(metadata.get(Metadata.CONTENT_TYPE), context.getContentType())); // value left in Tika's metadata after parsing wins over the caller's hint
+ result.getMetadata().put("resourceName", firstNonBlank(metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY), context.getOriginalName()));
+ return result; // pageCount and sheetCount are left unset
+ } catch (IOException | TikaException e) {
+ throw new DocumentParseException("文档解析失败: " + e.getMessage(), e);
+ }
+ }
+ /** Returns first when it has text, otherwise fallback (which may itself be null or blank). */
+ private String firstNonBlank(String first, String fallback) {
+ return StringUtils.hasText(first) ? first : fallback;
+ }
+}
diff --git a/src/main/java/com/bruce/common/document/parse/impl/ExcelDocumentParser.java b/src/main/java/com/bruce/common/document/parse/impl/ExcelDocumentParser.java
new file mode 100644
index 0000000..abe7a9b
--- /dev/null
+++ b/src/main/java/com/bruce/common/document/parse/impl/ExcelDocumentParser.java
@@ -0,0 +1,26 @@
+package com.bruce.common.document.parse.impl;
+
+import com.bruce.common.document.parse.DocumentParseContext;
+import com.bruce.common.document.parse.DocumentParser;
+import com.bruce.common.document.parse.DocumentParseResult;
+import org.springframework.stereotype.Component;
+
+import java.util.Set;
+/** Parser bean for Excel workbooks; matching is by suffix or content-type prefix, extraction is delegated to Tika. */
+@Component
+public class ExcelDocumentParser extends AbstractTikaDocumentParser implements DocumentParser {
+
+ private static final Set SUFFIXES = Set.of("xls", "xlsx"); // lower-case, without a leading dot
+ /** Supports xls/xlsx suffixes and content types starting with the legacy or OOXML spreadsheet prefixes. */
+ @Override
+ public boolean supports(DocumentParseContext context) {
+ return supportsSuffix(context, SUFFIXES)
+ || supportsContentType(context, "application/vnd.ms-excel")
+ || supportsContentType(context, "application/vnd.openxmlformats-officedocument.spreadsheetml");
+ }
+ /** Delegates to parseWithTika. */
+ @Override
+ public DocumentParseResult parse(DocumentParseContext context) {
+ return parseWithTika(context);
+ }
+}
diff --git a/src/main/java/com/bruce/common/document/parse/impl/PdfDocumentParser.java b/src/main/java/com/bruce/common/document/parse/impl/PdfDocumentParser.java
new file mode 100644
index 0000000..37cf8f0
--- /dev/null
+++ b/src/main/java/com/bruce/common/document/parse/impl/PdfDocumentParser.java
@@ -0,0 +1,24 @@
+package com.bruce.common.document.parse.impl;
+
+import com.bruce.common.document.parse.DocumentParseContext;
+import com.bruce.common.document.parse.DocumentParser;
+import com.bruce.common.document.parse.DocumentParseResult;
+import org.springframework.stereotype.Component;
+
+import java.util.Set;
+/** Parser bean for PDF files; matching is by suffix or content-type prefix, extraction is delegated to Tika. */
+@Component
+public class PdfDocumentParser extends AbstractTikaDocumentParser implements DocumentParser {
+
+ private static final Set SUFFIXES = Set.of("pdf"); // lower-case, without a leading dot
+ /** Supports the pdf suffix and content types starting with application/pdf. */
+ @Override
+ public boolean supports(DocumentParseContext context) {
+ return supportsSuffix(context, SUFFIXES) || supportsContentType(context, "application/pdf");
+ }
+ /** Delegates to parseWithTika. */
+ @Override
+ public DocumentParseResult parse(DocumentParseContext context) {
+ return parseWithTika(context);
+ }
+}
diff --git a/src/main/java/com/bruce/common/document/parse/impl/TxtDocumentParser.java b/src/main/java/com/bruce/common/document/parse/impl/TxtDocumentParser.java
new file mode 100644
index 0000000..98646eb
--- /dev/null
+++ b/src/main/java/com/bruce/common/document/parse/impl/TxtDocumentParser.java
@@ -0,0 +1,24 @@
+package com.bruce.common.document.parse.impl;
+
+import com.bruce.common.document.parse.DocumentParseContext;
+import com.bruce.common.document.parse.DocumentParser;
+import com.bruce.common.document.parse.DocumentParseResult;
+import org.springframework.stereotype.Component;
+
+import java.util.Set;
+/** Parser bean for plain-text style files; matching is by suffix or content-type prefix, extraction is delegated to Tika. */
+@Component
+public class TxtDocumentParser extends AbstractTikaDocumentParser implements DocumentParser {
+
+ private static final Set SUFFIXES = Set.of("txt", "md", "log"); // lower-case, without a leading dot
+ /** Supports txt/md/log suffixes and any content type starting with "text/" (so every text/* subtype matches). */
+ @Override
+ public boolean supports(DocumentParseContext context) {
+ return supportsSuffix(context, SUFFIXES) || supportsContentType(context, "text/");
+ }
+ /** Delegates to parseWithTika. */
+ @Override
+ public DocumentParseResult parse(DocumentParseContext context) {
+ return parseWithTika(context);
+ }
+}
diff --git a/src/main/java/com/bruce/common/document/parse/impl/WordDocumentParser.java b/src/main/java/com/bruce/common/document/parse/impl/WordDocumentParser.java
new file mode 100644
index 0000000..c34e752
--- /dev/null
+++ b/src/main/java/com/bruce/common/document/parse/impl/WordDocumentParser.java
@@ -0,0 +1,26 @@
+package com.bruce.common.document.parse.impl;
+
+import com.bruce.common.document.parse.DocumentParseContext;
+import com.bruce.common.document.parse.DocumentParser;
+import com.bruce.common.document.parse.DocumentParseResult;
+import org.springframework.stereotype.Component;
+
+import java.util.Set;
+/** Parser bean for Word documents; matching is by suffix or content-type prefix, extraction is delegated to Tika. */
+@Component
+public class WordDocumentParser extends AbstractTikaDocumentParser implements DocumentParser {
+
+ private static final Set SUFFIXES = Set.of("doc", "docx"); // lower-case, without a leading dot
+ /** Supports doc/docx suffixes and content types starting with the legacy or OOXML word-processing prefixes. */
+ @Override
+ public boolean supports(DocumentParseContext context) {
+ return supportsSuffix(context, SUFFIXES)
+ || supportsContentType(context, "application/msword")
+ || supportsContentType(context, "application/vnd.openxmlformats-officedocument.wordprocessingml");
+ }
+ /** Delegates to parseWithTika. */
+ @Override
+ public DocumentParseResult parse(DocumentParseContext context) {
+ return parseWithTika(context);
+ }
+}
diff --git a/src/main/java/com/bruce/rag/controller/RagDocumentController.java b/src/main/java/com/bruce/rag/controller/RagDocumentController.java
index 777797c..e5cdbb8 100644
--- a/src/main/java/com/bruce/rag/controller/RagDocumentController.java
+++ b/src/main/java/com/bruce/rag/controller/RagDocumentController.java
@@ -2,9 +2,12 @@ package com.bruce.rag.controller;
import com.bruce.common.domain.model.RequestResult;
import com.bruce.rag.dto.request.RagDocumentBatchUploadRequest;
+import com.bruce.rag.dto.request.RagDocumentParseRequest;
import com.bruce.rag.dto.request.RagDocumentQueryRequest;
import com.bruce.rag.dto.request.RagDocumentSaveRequest;
+import com.bruce.rag.dto.response.RagDocumentParseResponse;
import com.bruce.rag.dto.response.RagDocumentResponse;
+import com.bruce.rag.service.IRagDocumentParseService;
import com.bruce.rag.service.IRagDocumentService;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
@@ -29,6 +32,9 @@ public class RagDocumentController {
@Autowired
private IRagDocumentService ragDocumentService;
+ @Autowired
+ private IRagDocumentParseService ragDocumentParseService;
+
@Operation(summary = "查询全部知识库文档")
@PostMapping("/list")
public RequestResult> list() {
@@ -85,4 +91,13 @@ public class RagDocumentController {
request.getStoreId(), responses.size());
return RequestResult.success(responses);
}
+ /** POST /parse: hands the request to the parse service and wraps the returned responses in a success result. */
+ @Operation(summary = "解析知识库文档")
+ @PostMapping("/parse")
+ public RequestResult> parse(@RequestBody RagDocumentParseRequest request) {
+ log.info("RagDocumentController.parse start, request={}", request);
+ List responses = ragDocumentParseService.parse(request); // exceptions from the service propagate to the caller unhandled here
+ log.info("RagDocumentController.parse success, count={}", responses.size());
+ return RequestResult.success(responses);
+ }
}
diff --git a/src/main/java/com/bruce/rag/dto/request/RagDocumentParseRequest.java b/src/main/java/com/bruce/rag/dto/request/RagDocumentParseRequest.java
new file mode 100644
index 0000000..8fc93c0
--- /dev/null
+++ b/src/main/java/com/bruce/rag/dto/request/RagDocumentParseRequest.java
@@ -0,0 +1,26 @@
+package com.bruce.rag.dto.request;
+
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+
+import java.util.List;
+/** Request body for parsing knowledge-base documents: which documents to parse and the chunking options. */
+@Data
+@Schema(description = "RAG知识库文档解析请求")
+public class RagDocumentParseRequest {
+ /** IDs of the documents to parse; the batch service maps over this list. */
+ @Schema(description = "文档ID列表")
+ private List documentIds;
+ /** Chunk strategy; NOTE(review): presumably a RagChunkStrategyEnum name, confirm against the service validation. */
+ @Schema(description = "切片方式")
+ private String chunkStrategy;
+ /** Chunk length; the unit (characters or tokens) is not visible here. */
+ @Schema(description = "切片长度")
+ private Integer chunkSize;
+ /** Overlap length between neighbouring chunks; unit not visible here. */
+ @Schema(description = "重叠长度")
+ private Integer chunkOverlap;
+ /** Delimiter; NOTE(review): presumably used only by the delimiter strategy, confirm. */
+ @Schema(description = "分隔符")
+ private String delimiter;
+}
diff --git a/src/main/java/com/bruce/rag/dto/response/RagDocumentParseResponse.java b/src/main/java/com/bruce/rag/dto/response/RagDocumentParseResponse.java
new file mode 100644
index 0000000..17c7e3e
--- /dev/null
+++ b/src/main/java/com/bruce/rag/dto/response/RagDocumentParseResponse.java
@@ -0,0 +1,33 @@
+package com.bruce.rag.dto.response;
+
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import com.fasterxml.jackson.databind.ser.std.ToStringSerializer;
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+/** Per-document outcome returned by the parse endpoint. */
+@Data
+@Schema(description = "RAG知识库文档解析响应")
+public class RagDocumentParseResponse {
+ /** Document ID; serialized to JSON as a string via ToStringSerializer. */
+ @Schema(description = "文档ID")
+ @JsonSerialize(using = ToStringSerializer.class)
+ private Long documentId;
+ /** Parse status; the set of values is defined outside this class. */
+ @Schema(description = "解析状态")
+ private String parseStatus;
+ /** Length of the extracted text. */
+ @Schema(description = "文本长度")
+ private Integer textLength;
+ /** Page count. */
+ @Schema(description = "页数")
+ private Integer pageCount;
+ /** Worksheet count. */
+ @Schema(description = "工作表数量")
+ private Integer sheetCount;
+ /** Parse metadata, kept in insertion order; defaults to an empty map. */
+ @Schema(description = "解析元数据")
+ private Map metadata = new LinkedHashMap<>();
+}
diff --git a/src/main/java/com/bruce/rag/entity/RagChunk.java b/src/main/java/com/bruce/rag/entity/RagChunk.java
new file mode 100644
index 0000000..fcd3a7a
--- /dev/null
+++ b/src/main/java/com/bruce/rag/entity/RagChunk.java
@@ -0,0 +1,67 @@
+package com.bruce.rag.entity;
+
+import com.baomidou.mybatisplus.annotation.TableField;
+import com.baomidou.mybatisplus.annotation.TableName;
+import com.bruce.common.domain.model.BaseEntity;
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+import lombok.NoArgsConstructor;
+/** MyBatis-Plus entity mapped to table rag_chunk; other columns are presumably inherited from BaseEntity (not shown). */
+@Data
+@NoArgsConstructor
+@EqualsAndHashCode(callSuper = true)
+@TableName("rag_chunk")
+@Schema(description = "RAG知识切片")
+public class RagChunk extends BaseEntity {
+ /** Knowledge-base (store) ID. */
+ @Schema(description = "知识库ID")
+ @TableField("store_id")
+ private Long storeId;
+ /** Document ID. */
+ @Schema(description = "文档ID")
+ @TableField("document_id")
+ private Long documentId;
+ /** Sequence number of the chunk within its document. */
+ @Schema(description = "文档内切片序号")
+ @TableField("chunk_index")
+ private Integer chunkIndex;
+ /** Chunk text. */
+ @Schema(description = "切片内容")
+ @TableField("chunk_content")
+ private String chunkContent;
+ /** Chunk summary. */
+ @Schema(description = "切片摘要")
+ @TableField("chunk_summary")
+ private String chunkSummary;
+ /** Token count. */
+ @Schema(description = "Token数量")
+ @TableField("token_count")
+ private Integer tokenCount;
+ /** Page number. */
+ @Schema(description = "页码")
+ @TableField("page_number")
+ private Integer pageNumber;
+ /** Section title. */
+ @Schema(description = "章节标题")
+ @TableField("section_title")
+ private String sectionTitle;
+ /** Heading path. */
+ @Schema(description = "标题路径")
+ @TableField("heading_path")
+ private String headingPath;
+ /** Vector ID. */
+ @Schema(description = "向量ID")
+ @TableField("vector_id")
+ private String vectorId;
+ /** Chunk-level metadata as JSON text. NOTE(review): the column is JSONB; binding a plain String may need a type handler or stringtype=unspecified; confirm. */
+ @Schema(description = "切片级扩展元数据JSON")
+ @TableField("metadata_json")
+ private String metadataJson;
+ /** Whether the chunk is enabled. */
+ @Schema(description = "是否启用")
+ private Boolean enabled;
+ /** Free-form remark. */
+ @Schema(description = "备注")
+ private String remark;
+}
diff --git a/src/main/java/com/bruce/rag/entity/RagChunkEmbedding.java b/src/main/java/com/bruce/rag/entity/RagChunkEmbedding.java
new file mode 100644
index 0000000..6dd1c32
--- /dev/null
+++ b/src/main/java/com/bruce/rag/entity/RagChunkEmbedding.java
@@ -0,0 +1,50 @@
+package com.bruce.rag.entity;
+
+import com.baomidou.mybatisplus.annotation.TableField;
+import com.baomidou.mybatisplus.annotation.TableName;
+import com.bruce.common.domain.model.BaseEntity;
+import io.swagger.v3.oas.annotations.media.Schema;
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+import lombok.NoArgsConstructor;
+/** MyBatis-Plus entity mapped to table rag_chunk_embedding; other columns are presumably inherited from BaseEntity (not shown). */
+@Data
+@NoArgsConstructor
+@EqualsAndHashCode(callSuper = true)
+@TableName("rag_chunk_embedding")
+@Schema(description = "RAG切片向量")
+public class RagChunkEmbedding extends BaseEntity {
+ /** Knowledge-base (store) ID. */
+ @Schema(description = "知识库ID")
+ @TableField("store_id")
+ private Long storeId;
+ /** Document ID. */
+ @Schema(description = "文档ID")
+ @TableField("document_id")
+ private Long documentId;
+ /** Chunk ID. */
+ @Schema(description = "切片ID")
+ @TableField("chunk_id")
+ private Long chunkId;
+ /** Embedding model name. */
+ @Schema(description = "向量模型")
+ @TableField("embedding_model")
+ private String embeddingModel;
+ /** Embedding dimension. */
+ @Schema(description = "向量维度")
+ @TableField("embedding_dimension")
+ private Integer embeddingDimension;
+ /** Embedding held as text. NOTE(review): the column is VECTOR(1024); binding a plain String may need a type handler or an explicit cast; confirm. */
+ @Schema(description = "向量内容")
+ private String embedding;
+ /** Hash of the content the embedding was generated from. */
+ @Schema(description = "向量生成内容哈希")
+ @TableField("content_hash")
+ private String contentHash;
+ /** Whether the embedding is enabled. */
+ @Schema(description = "是否启用")
+ private Boolean enabled;
+ /** Free-form remark. */
+ @Schema(description = "备注")
+ private String remark;
+}
diff --git a/src/main/java/com/bruce/rag/enums/RagChunkStrategyEnum.java b/src/main/java/com/bruce/rag/enums/RagChunkStrategyEnum.java
new file mode 100644
index 0000000..f514b06
--- /dev/null
+++ b/src/main/java/com/bruce/rag/enums/RagChunkStrategyEnum.java
@@ -0,0 +1,20 @@
+package com.bruce.rag.enums;
+
+import lombok.AllArgsConstructor;
+import lombok.Getter;
+/** Chunking strategies; constant names and numeric values match the rag/chunk_strategy rows seeded into sys_enum. */
+@Getter
+@AllArgsConstructor
+public enum RagChunkStrategyEnum {
+
+ FIXED_LENGTH(1, "固定长度切片"), // fixed-length chunks
+ PARAGRAPH(2, "按段落切片"), // split by paragraph
+ HEADING(3, "按标题层级切片"), // split by heading hierarchy
+ TABLE_ROW(4, "按表格行切片"), // split by table row
+ DELIMITER(5, "按分隔符切片"), // split by delimiter
+ SEMANTIC(6, "语义切片"); // semantic chunking
+
+ private final Integer value; // numeric code
+
+ private final String label; // display label
+}
diff --git a/src/main/java/com/bruce/rag/mapper/RagChunkEmbeddingMapper.java b/src/main/java/com/bruce/rag/mapper/RagChunkEmbeddingMapper.java
new file mode 100644
index 0000000..a8b03ab
--- /dev/null
+++ b/src/main/java/com/bruce/rag/mapper/RagChunkEmbeddingMapper.java
@@ -0,0 +1,9 @@
+package com.bruce.rag.mapper;
+
+import com.baomidou.mybatisplus.core.mapper.BaseMapper;
+import com.bruce.rag.entity.RagChunkEmbedding;
+import org.apache.ibatis.annotations.Mapper;
+/** MyBatis mapper for RagChunkEmbedding; declares nothing beyond what it inherits from MyBatis-Plus BaseMapper. */
+@Mapper
+public interface RagChunkEmbeddingMapper extends BaseMapper {
+}
diff --git a/src/main/java/com/bruce/rag/mapper/RagChunkMapper.java b/src/main/java/com/bruce/rag/mapper/RagChunkMapper.java
new file mode 100644
index 0000000..68c4f90
--- /dev/null
+++ b/src/main/java/com/bruce/rag/mapper/RagChunkMapper.java
@@ -0,0 +1,9 @@
+package com.bruce.rag.mapper;
+
+import com.baomidou.mybatisplus.core.mapper.BaseMapper;
+import com.bruce.rag.entity.RagChunk;
+import org.apache.ibatis.annotations.Mapper;
+/** MyBatis mapper for RagChunk; declares nothing beyond what it inherits from MyBatis-Plus BaseMapper. */
+@Mapper
+public interface RagChunkMapper extends BaseMapper {
+}
diff --git a/src/main/java/com/bruce/rag/service/IRagChunkEmbeddingService.java b/src/main/java/com/bruce/rag/service/IRagChunkEmbeddingService.java
new file mode 100644
index 0000000..44c713f
--- /dev/null
+++ b/src/main/java/com/bruce/rag/service/IRagChunkEmbeddingService.java
@@ -0,0 +1,7 @@
+package com.bruce.rag.service;
+
+import com.baomidou.mybatisplus.extension.service.IService;
+import com.bruce.rag.entity.RagChunkEmbedding;
+/** Service contract for RagChunkEmbedding; adds no methods to MyBatis-Plus IService. */
+public interface IRagChunkEmbeddingService extends IService {
+}
diff --git a/src/main/java/com/bruce/rag/service/IRagChunkService.java b/src/main/java/com/bruce/rag/service/IRagChunkService.java
new file mode 100644
index 0000000..581ea6c
--- /dev/null
+++ b/src/main/java/com/bruce/rag/service/IRagChunkService.java
@@ -0,0 +1,7 @@
+package com.bruce.rag.service;
+
+import com.baomidou.mybatisplus.extension.service.IService;
+import com.bruce.rag.entity.RagChunk;
+/** Service contract for RagChunk; adds no methods to MyBatis-Plus IService. */
+public interface IRagChunkService extends IService {
+}
diff --git a/src/main/java/com/bruce/rag/service/IRagDocumentParseService.java b/src/main/java/com/bruce/rag/service/IRagDocumentParseService.java
new file mode 100644
index 0000000..387c161
--- /dev/null
+++ b/src/main/java/com/bruce/rag/service/IRagDocumentParseService.java
@@ -0,0 +1,13 @@
+package com.bruce.rag.service;
+
+import com.bruce.rag.dto.response.RagDocumentParseResponse;
+import com.bruce.rag.dto.request.RagDocumentParseRequest;
+
+import java.util.List;
+/** Service contract for parsing knowledge-base documents, one at a time or in a batch. */
+public interface IRagDocumentParseService {
+ /** Parses the single document with the given ID and returns its parse response. */
+ RagDocumentParseResponse parse(Long documentId);
+ /** Parses every document listed in the request and returns the responses as a list. */
+ List parse(RagDocumentParseRequest request);
+}
diff --git a/src/main/java/com/bruce/rag/service/impl/RagChunkEmbeddingServiceImpl.java b/src/main/java/com/bruce/rag/service/impl/RagChunkEmbeddingServiceImpl.java
new file mode 100644
index 0000000..e71ff6d
--- /dev/null
+++ b/src/main/java/com/bruce/rag/service/impl/RagChunkEmbeddingServiceImpl.java
@@ -0,0 +1,12 @@
+package com.bruce.rag.service.impl;
+
+import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
+import com.bruce.rag.entity.RagChunkEmbedding;
+import com.bruce.rag.mapper.RagChunkEmbeddingMapper;
+import com.bruce.rag.service.IRagChunkEmbeddingService;
+import org.springframework.stereotype.Service;
+/** Spring service bean for RagChunkEmbedding; relies entirely on the MyBatis-Plus ServiceImpl base class. */
+@Service
+public class RagChunkEmbeddingServiceImpl extends ServiceImpl
+ implements IRagChunkEmbeddingService {
+}
diff --git a/src/main/java/com/bruce/rag/service/impl/RagChunkServiceImpl.java b/src/main/java/com/bruce/rag/service/impl/RagChunkServiceImpl.java
new file mode 100644
index 0000000..8d9ba57
--- /dev/null
+++ b/src/main/java/com/bruce/rag/service/impl/RagChunkServiceImpl.java
@@ -0,0 +1,11 @@
+package com.bruce.rag.service.impl;
+
+import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
+import com.bruce.rag.entity.RagChunk;
+import com.bruce.rag.mapper.RagChunkMapper;
+import com.bruce.rag.service.IRagChunkService;
+import org.springframework.stereotype.Service;
+/** Spring service bean for RagChunk; relies entirely on the MyBatis-Plus ServiceImpl base class. */
+@Service
+public class RagChunkServiceImpl extends ServiceImpl implements IRagChunkService {
+}
diff --git a/src/main/java/com/bruce/rag/service/impl/RagDocumentParseServiceImpl.java b/src/main/java/com/bruce/rag/service/impl/RagDocumentParseServiceImpl.java
new file mode 100644
index 0000000..df60746
--- /dev/null
+++ b/src/main/java/com/bruce/rag/service/impl/RagDocumentParseServiceImpl.java
@@ -0,0 +1,151 @@
+package com.bruce.rag.service.impl;
+
+import com.bruce.common.config.AttachmentProperties;
+import com.bruce.common.document.parse.DocumentParseContext;
+import com.bruce.common.document.parse.DocumentParseException;
+import com.bruce.common.document.parse.DocumentParseResult;
+import com.bruce.common.document.parse.DocumentParser;
+import com.bruce.common.document.parse.DocumentParserFactory;
+import com.bruce.common.domain.entity.SysAttachment;
+import com.bruce.common.service.ISysAttachmentService;
+import com.bruce.rag.dto.request.RagDocumentParseRequest;
+import com.bruce.rag.dto.response.RagDocumentParseResponse;
+import com.bruce.rag.entity.RagDocument;
+import com.bruce.rag.enums.RagChunkStrategyEnum;
+import com.bruce.rag.enums.RagParseStatusEnum;
+import com.bruce.rag.service.IRagDocumentParseService;
+import com.bruce.rag.service.IRagDocumentService;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Service;
+import org.springframework.util.StringUtils;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+/**
+ * Default {@link IRagDocumentParseService} implementation.
+ * <p>
+ * Loads the document and its attachment, resolves the attachment file on disk, delegates the
+ * extraction to the {@link DocumentParser} selected by {@link DocumentParserFactory}, and writes
+ * the parse status to the document before and after the attempt.
+ */
+@Slf4j
+@Service
+@RequiredArgsConstructor
+public class RagDocumentParseServiceImpl implements IRagDocumentParseService {
+
+    private final IRagDocumentService ragDocumentService;
+
+    private final ISysAttachmentService sysAttachmentService;
+
+    private final AttachmentProperties attachmentProperties;
+
+    private final DocumentParserFactory documentParserFactory;
+
+    /**
+     * Validates the batch request and parses each requested document in list order.
+     * <p>
+     * The first document that fails aborts the batch by propagating its exception.
+     *
+     * @param request batch request; must carry at least one non-null document ID and a known chunk strategy
+     * @return one response per requested document, in request order
+     * @throws IllegalArgumentException if the request is invalid
+     */
+    @Override
+    public List<RagDocumentParseResponse> parse(RagDocumentParseRequest request) {
+        log.info("RagDocumentParseServiceImpl.parse batch start, request={}", request);
+        validateParseRequest(request);
+        List<RagDocumentParseResponse> responses = request.getDocumentIds().stream()
+                .map(this::parse)
+                .toList();
+        log.info("RagDocumentParseServiceImpl.parse batch success, count={}", responses.size());
+        return responses;
+    }
+
+    /**
+     * Parses one document and records PARSING, then PARSED or FAILED, on the document.
+     *
+     * @param documentId identifier of the document to parse
+     * @return summary of the parse result
+     * @throws IllegalArgumentException if the ID is null, or the document or its attachment cannot be found
+     * @throws RuntimeException         any failure raised while parsing, rethrown after the FAILED status is recorded
+     */
+    @Override
+    public RagDocumentParseResponse parse(Long documentId) {
+        log.info("RagDocumentParseServiceImpl.parse start, documentId={}", documentId);
+        if (documentId == null) {
+            throw new IllegalArgumentException("文档ID不能为空");
+        }
+
+        RagDocument document = ragDocumentService.getById(documentId);
+        if (document == null) {
+            throw new IllegalArgumentException("文档不存在,ID: " + documentId);
+        }
+        if (document.getAttachmentId() == null) {
+            throw new IllegalArgumentException("文档附件ID不能为空");
+        }
+
+        SysAttachment attachment = sysAttachmentService.getById(document.getAttachmentId());
+        if (attachment == null) {
+            throw new IllegalArgumentException("附件不存在,ID: " + document.getAttachmentId());
+        }
+
+        updateParseStatus(documentId, RagParseStatusEnum.PARSING, null);
+        try {
+            DocumentParseContext context = buildParseContext(document, attachment);
+            DocumentParser parser = documentParserFactory.resolve(context);
+            DocumentParseResult result = parser.parse(context);
+            updateParseStatus(documentId, RagParseStatusEnum.PARSED, null);
+            RagDocumentParseResponse response = toResponse(documentId, result);
+            log.info("RagDocumentParseServiceImpl.parse success, documentId={}, textLength={}",
+                    documentId, response.getTextLength());
+            return response;
+        } catch (RuntimeException e) {
+            markFailed(documentId, e);
+            // The trailing throwable makes SLF4J print the stack trace alongside the message.
+            log.warn("RagDocumentParseServiceImpl.parse failed, documentId={}, message={}", documentId, e.getMessage(), e);
+            throw e;
+        }
+    }
+
+    /**
+     * Records the FAILED status without letting a persistence error replace the original failure.
+     *
+     * @param documentId document whose parse attempt failed
+     * @param cause      the parse failure; a status-update failure is attached to it as suppressed
+     */
+    private void markFailed(Long documentId, RuntimeException cause) {
+        try {
+            updateParseStatus(documentId, RagParseStatusEnum.FAILED, cause.getMessage());
+        } catch (RuntimeException updateFailure) {
+            cause.addSuppressed(updateFailure);
+            log.error("RagDocumentParseServiceImpl.parse could not record FAILED status, documentId={}",
+                    documentId, updateFailure);
+        }
+    }
+
+    /**
+     * Rejects a batch request before any document is touched.
+     * <p>
+     * The chunk strategy is only validated here; this class does not use it while parsing.
+     *
+     * @param request the batch request to check
+     * @throws IllegalArgumentException if the request, its ID list, its chunk strategy or any ID is invalid
+     */
+    private void validateParseRequest(RagDocumentParseRequest request) {
+        if (request == null) {
+            throw new IllegalArgumentException("解析请求不能为空");
+        }
+        if (request.getDocumentIds() == null || request.getDocumentIds().isEmpty()) {
+            throw new IllegalArgumentException("文档ID列表不能为空");
+        }
+        Set<String> strategies = Arrays.stream(RagChunkStrategyEnum.values())
+                .map(Enum::name)
+                .collect(Collectors.toSet());
+        if (request.getChunkStrategy() == null || !strategies.contains(request.getChunkStrategy())) {
+            throw new IllegalArgumentException("不支持的切片方式: " + request.getChunkStrategy());
+        }
+        // Fail fast on null IDs so the batch does not parse some documents and then abort half-way.
+        if (request.getDocumentIds().stream().anyMatch(id -> id == null)) {
+            throw new IllegalArgumentException("文档ID不能为空");
+        }
+    }
+
+    /**
+     * Builds the parser input from the document and attachment, checking that the file exists.
+     *
+     * @param document   the document being parsed
+     * @param attachment the attachment that holds the file location and type information
+     * @return a populated parse context
+     * @throws DocumentParseException if the resolved path is not a regular file
+     */
+    private DocumentParseContext buildParseContext(RagDocument document, SysAttachment attachment) {
+        Path filePath = resolveFilePath(attachment);
+        if (!Files.isRegularFile(filePath)) {
+            throw new DocumentParseException("解析文件不存在: " + filePath);
+        }
+
+        DocumentParseContext context = new DocumentParseContext();
+        context.setDocumentId(document.getId());
+        context.setAttachmentId(attachment.getId());
+        context.setOriginalName(attachment.getOriginalName());
+        context.setSuffix(attachment.getFileSuffix());
+        context.setContentType(attachment.getContentType());
+        context.setFilePath(filePath);
+        return context;
+    }
+
+    /**
+     * Turns the attachment's stored path into a normalized path.
+     * <p>
+     * Absolute paths are used as they are; relative paths are resolved against the configured base path.
+     *
+     * @param attachment the attachment whose file path is resolved
+     * @return the normalized file path
+     * @throws DocumentParseException if the attachment has no file path
+     */
+    private Path resolveFilePath(SysAttachment attachment) {
+        if (!StringUtils.hasText(attachment.getFilePath())) {
+            throw new DocumentParseException("附件文件路径不能为空");
+        }
+        // NOTE(review): neither branch checks that the result stays under the base path, so an
+        // absolute path or a relative one containing ".." can point outside it. Confirm the stored
+        // paths are always server-generated.
+        Path filePath = Path.of(attachment.getFilePath());
+        if (filePath.isAbsolute()) {
+            return filePath.normalize();
+        }
+        return Path.of(attachmentProperties.getBasePath()).resolve(filePath).normalize();
+    }
+
+    /**
+     * Writes the parse status and error message for a document through {@code updateById}.
+     *
+     * @param documentId   the document to update
+     * @param status       the status whose name is stored
+     * @param errorMessage the failure message; blank values are passed on as {@code null}
+     */
+    private void updateParseStatus(Long documentId, RagParseStatusEnum status, String errorMessage) {
+        RagDocument update = new RagDocument();
+        update.setId(documentId);
+        update.setParseStatus(status.name());
+        // NOTE(review): if updateById skips null fields (the usual MyBatis-Plus default), a null here
+        // will not clear an error message left by an earlier failed attempt. Confirm the entity's
+        // update strategy.
+        update.setErrorMessage(StringUtils.hasText(errorMessage) ? errorMessage : null);
+        ragDocumentService.updateById(update);
+    }
+
+    /**
+     * Copies the parse result summary into the response returned to callers.
+     *
+     * @param documentId the parsed document
+     * @param result     the parser output
+     * @return a response marked PARSED with the length, page/sheet counts and metadata of the result
+     */
+    private RagDocumentParseResponse toResponse(Long documentId, DocumentParseResult result) {
+        RagDocumentParseResponse response = new RagDocumentParseResponse();
+        response.setDocumentId(documentId);
+        response.setParseStatus(RagParseStatusEnum.PARSED.name());
+        response.setTextLength(result.getTextLength());
+        response.setPageCount(result.getPageCount());
+        response.setSheetCount(result.getSheetCount());
+        response.setMetadata(result.getMetadata());
+        return response;
+    }
+}
diff --git a/src/main/java/com/bruce/rag/service/impl/RagDocumentServiceImpl.java b/src/main/java/com/bruce/rag/service/impl/RagDocumentServiceImpl.java
index 378d1a5..9c36673 100644
--- a/src/main/java/com/bruce/rag/service/impl/RagDocumentServiceImpl.java
+++ b/src/main/java/com/bruce/rag/service/impl/RagDocumentServiceImpl.java
@@ -82,10 +82,18 @@ public class RagDocumentServiceImpl extends ServiceImpl factory.resolve(context("zip"))
+ );
+
+ assertEquals("不支持的文档类型: zip", exception.getMessage());
+ }
+
+    /**
+     * Creates a parse context with only the suffix and a matching {@code sample.<suffix>} path set.
+     *
+     * @param suffix file suffix to place on the context
+     * @return the populated context
+     */
+    private DocumentParseContext context(String suffix) {
+        String sampleFileName = "sample." + suffix;
+        DocumentParseContext parseContext = new DocumentParseContext();
+        parseContext.setSuffix(suffix);
+        parseContext.setFilePath(Path.of(sampleFileName));
+        return parseContext;
+    }
+}
diff --git a/src/test/java/com/bruce/common/document/parse/TxtDocumentParserTests.java b/src/test/java/com/bruce/common/document/parse/TxtDocumentParserTests.java
new file mode 100644
index 0000000..b9c1d67
--- /dev/null
+++ b/src/test/java/com/bruce/common/document/parse/TxtDocumentParserTests.java
@@ -0,0 +1,47 @@
+package com.bruce.common.document.parse;
+
+import com.bruce.common.document.parse.impl.TxtDocumentParser;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+class TxtDocumentParserTests {
+
+ @TempDir
+ private Path tempDir;
+
+ @Test
+ void parseShouldReadPlainTextContent() throws Exception {
+ Path file = tempDir.resolve("people.txt");
+ Files.writeString(file, "张三 是 产品经理\n李四 是 后端工程师", StandardCharsets.UTF_8);
+ DocumentParseContext context = new DocumentParseContext();
+ context.setOriginalName("people.txt");
+ context.setSuffix("txt");
+ context.setContentType("text/plain");
+ context.setFilePath(file);
+
+ DocumentParseResult result = new TxtDocumentParser().parse(context);
+
+ assertEquals("张三 是 产品经理\n李四 是 后端工程师", result.getText());
+ assertEquals(result.getText().length(), result.getTextLength());
+ assertTrue(result.getMetadata().get("contentType").toString().startsWith("text/plain"));
+ }
+
+ @Test
+ void supportsShouldAcceptTextSuffixAndContentType() {
+ TxtDocumentParser parser = new TxtDocumentParser();
+ DocumentParseContext suffixContext = new DocumentParseContext();
+ suffixContext.setSuffix("TXT");
+ DocumentParseContext contentTypeContext = new DocumentParseContext();
+ contentTypeContext.setContentType("text/plain");
+
+ assertTrue(parser.supports(suffixContext));
+ assertTrue(parser.supports(contentTypeContext));
+ }
+}
diff --git a/src/test/java/com/bruce/common/enumconfig/EnumDefinitionTests.java b/src/test/java/com/bruce/common/enumconfig/EnumDefinitionTests.java
index b146fe6..6c119c3 100644
--- a/src/test/java/com/bruce/common/enumconfig/EnumDefinitionTests.java
+++ b/src/test/java/com/bruce/common/enumconfig/EnumDefinitionTests.java
@@ -3,6 +3,7 @@ package com.bruce.common.enumconfig;
import com.bruce.common.enums.CommonStatusEnum;
import com.bruce.common.enums.EnableStatusEnum;
import com.bruce.rag.enums.RagIndexStatusEnum;
+import com.bruce.rag.enums.RagChunkStrategyEnum;
import com.bruce.rag.enums.RagParseStatusEnum;
import org.junit.jupiter.api.Test;
@@ -24,6 +25,9 @@ class EnumDefinitionTests {
assertEquals(4, RagParseStatusEnum.FAILED.getValue());
assertEquals(1, RagIndexStatusEnum.PENDING.getValue());
assertEquals(3, RagIndexStatusEnum.INDEXED.getValue());
+ assertEquals(1, RagChunkStrategyEnum.FIXED_LENGTH.getValue());
+ assertEquals(5, RagChunkStrategyEnum.DELIMITER.getValue());
+ assertEquals(6, RagChunkStrategyEnum.SEMANTIC.getValue());
}
@Test
@@ -38,5 +42,8 @@ class EnumDefinitionTests {
assertEquals("解析失败", RagParseStatusEnum.FAILED.getLabel());
assertEquals("待索引", RagIndexStatusEnum.PENDING.getLabel());
assertEquals("已索引", RagIndexStatusEnum.INDEXED.getLabel());
+ assertEquals("固定长度切片", RagChunkStrategyEnum.FIXED_LENGTH.getLabel());
+ assertEquals("按分隔符切片", RagChunkStrategyEnum.DELIMITER.getLabel());
+ assertEquals("语义切片", RagChunkStrategyEnum.SEMANTIC.getLabel());
}
}
diff --git a/src/test/java/com/bruce/common/enumconfig/SysEnumDataInitTests.java b/src/test/java/com/bruce/common/enumconfig/SysEnumDataInitTests.java
index 8eb7e5d..72a0cf9 100644
--- a/src/test/java/com/bruce/common/enumconfig/SysEnumDataInitTests.java
+++ b/src/test/java/com/bruce/common/enumconfig/SysEnumDataInitTests.java
@@ -5,6 +5,7 @@ import com.bruce.common.domain.entity.SysEnum;
import com.bruce.common.enums.CommonStatusEnum;
import com.bruce.common.enums.EnableStatusEnum;
import com.bruce.common.service.ISysEnumService;
+import com.bruce.rag.enums.RagChunkStrategyEnum;
import com.bruce.rag.enums.RagIndexStatusEnum;
import com.bruce.rag.enums.RagParseStatusEnum;
import org.junit.jupiter.api.Test;
@@ -40,6 +41,13 @@ class SysEnumDataInitTests {
saveOrUpdate("rag", "index_status", RagIndexStatusEnum.INDEXING.getLabel(), RagIndexStatusEnum.INDEXING.getValue(), 2, "RAG文档索引状态");
saveOrUpdate("rag", "index_status", RagIndexStatusEnum.INDEXED.getLabel(), RagIndexStatusEnum.INDEXED.getValue(), 3, "RAG文档索引状态");
saveOrUpdate("rag", "index_status", RagIndexStatusEnum.FAILED.getLabel(), RagIndexStatusEnum.FAILED.getValue(), 4, "RAG文档索引状态");
+
+ saveOrUpdate("rag", "chunk_strategy", RagChunkStrategyEnum.FIXED_LENGTH.getLabel(), RagChunkStrategyEnum.FIXED_LENGTH.getValue(), 1, "RAG文档切片方式");
+ saveOrUpdate("rag", "chunk_strategy", RagChunkStrategyEnum.PARAGRAPH.getLabel(), RagChunkStrategyEnum.PARAGRAPH.getValue(), 2, "RAG文档切片方式");
+ saveOrUpdate("rag", "chunk_strategy", RagChunkStrategyEnum.HEADING.getLabel(), RagChunkStrategyEnum.HEADING.getValue(), 3, "RAG文档切片方式");
+ saveOrUpdate("rag", "chunk_strategy", RagChunkStrategyEnum.TABLE_ROW.getLabel(), RagChunkStrategyEnum.TABLE_ROW.getValue(), 4, "RAG文档切片方式");
+ saveOrUpdate("rag", "chunk_strategy", RagChunkStrategyEnum.DELIMITER.getLabel(), RagChunkStrategyEnum.DELIMITER.getValue(), 5, "RAG文档切片方式");
+ saveOrUpdate("rag", "chunk_strategy", RagChunkStrategyEnum.SEMANTIC.getLabel(), RagChunkStrategyEnum.SEMANTIC.getValue(), 6, "RAG文档切片方式");
}
private void saveOrUpdate(String catalog, String type, String name, Integer value, Integer sort, String remark) {
diff --git a/src/test/java/com/bruce/rag/RagComponentStructureTests.java b/src/test/java/com/bruce/rag/RagComponentStructureTests.java
index d7c9bdc..c9f586a 100644
--- a/src/test/java/com/bruce/rag/RagComponentStructureTests.java
+++ b/src/test/java/com/bruce/rag/RagComponentStructureTests.java
@@ -8,18 +8,29 @@ import com.bruce.rag.constant.RagSystemConstants;
import com.bruce.rag.controller.RagDocumentController;
import com.bruce.rag.controller.RagStoreController;
import com.bruce.rag.dto.request.RagDocumentQueryRequest;
+import com.bruce.rag.dto.request.RagDocumentParseRequest;
import com.bruce.rag.dto.request.RagStoreQueryRequest;
import com.bruce.rag.dto.request.RagStoreSaveRequest;
+import com.bruce.rag.dto.response.RagDocumentParseResponse;
import com.bruce.rag.dto.response.RagStoreDocumentOverviewResponse;
import com.bruce.rag.dto.response.RagStoreOverviewResponse;
import com.bruce.rag.dto.response.RagDocumentResponse;
import com.bruce.rag.dto.response.RagStoreResponse;
+import com.bruce.rag.entity.RagChunk;
+import com.bruce.rag.entity.RagChunkEmbedding;
import com.bruce.rag.entity.RagDocument;
import com.bruce.rag.entity.RagStore;
+import com.bruce.rag.mapper.RagChunkEmbeddingMapper;
+import com.bruce.rag.mapper.RagChunkMapper;
import com.bruce.rag.mapper.RagDocumentMapper;
import com.bruce.rag.mapper.RagStoreMapper;
+import com.bruce.rag.service.IRagChunkEmbeddingService;
+import com.bruce.rag.service.IRagChunkService;
+import com.bruce.rag.service.IRagDocumentParseService;
import com.bruce.rag.service.IRagDocumentService;
import com.bruce.rag.service.IRagStoreService;
+import com.bruce.rag.service.impl.RagChunkEmbeddingServiceImpl;
+import com.bruce.rag.service.impl.RagChunkServiceImpl;
import com.bruce.rag.service.impl.RagDocumentServiceImpl;
import com.bruce.rag.service.impl.RagStoreServiceImpl;
import org.junit.jupiter.api.Test;
@@ -39,10 +50,16 @@ class RagComponentStructureTests {
void ragComponentsShouldReuseMybatisPlusBaseTypes() {
assertTrue(BaseMapper.class.isAssignableFrom(RagStoreMapper.class));
assertTrue(BaseMapper.class.isAssignableFrom(RagDocumentMapper.class));
+ assertTrue(BaseMapper.class.isAssignableFrom(RagChunkMapper.class));
+ assertTrue(BaseMapper.class.isAssignableFrom(RagChunkEmbeddingMapper.class));
assertTrue(IService.class.isAssignableFrom(IRagStoreService.class));
assertTrue(IService.class.isAssignableFrom(IRagDocumentService.class));
+ assertTrue(IService.class.isAssignableFrom(IRagChunkService.class));
+ assertTrue(IService.class.isAssignableFrom(IRagChunkEmbeddingService.class));
assertTrue(ServiceImpl.class.isAssignableFrom(RagStoreServiceImpl.class));
assertTrue(ServiceImpl.class.isAssignableFrom(RagDocumentServiceImpl.class));
+ assertTrue(ServiceImpl.class.isAssignableFrom(RagChunkServiceImpl.class));
+ assertTrue(ServiceImpl.class.isAssignableFrom(RagChunkEmbeddingServiceImpl.class));
}
@Test
@@ -63,8 +80,10 @@ class RagComponentStructureTests {
Method documentListMethod = RagDocumentController.class.getMethod("list");
Method documentQueryMethod = RagDocumentController.class.getMethod("query", RagDocumentQueryRequest.class);
+ Method documentParseMethod = RagDocumentController.class.getMethod("parse", RagDocumentParseRequest.class);
Method documentResponseListMethod = IRagDocumentService.class.getMethod("listResponses");
Method documentServiceQueryMethod = IRagDocumentService.class.getMethod("query", RagDocumentQueryRequest.class);
+ Method documentParseServiceMethod = IRagDocumentParseService.class.getMethod("parse", RagDocumentParseRequest.class);
assertEquals(RequestResult.class, storeListMethod.getReturnType());
assertEquals(RequestResult.class, storeQueryMethod.getReturnType());
@@ -89,11 +108,14 @@ class RagComponentStructureTests {
assertEquals(RequestResult.class, documentListMethod.getReturnType());
assertEquals(RequestResult.class, documentQueryMethod.getReturnType());
+ assertEquals(RequestResult.class, documentParseMethod.getReturnType());
assertEquals(List.class, documentServiceQueryMethod.getReturnType());
+ assertEquals(List.class, documentParseServiceMethod.getReturnType());
assertTrue(documentResponseListMethod.getGenericReturnType().getTypeName().contains("RagDocumentResponse"));
assertTrue(documentServiceQueryMethod.getGenericReturnType().getTypeName().contains("RagDocumentResponse"));
assertTrue(documentListMethod.getGenericReturnType().getTypeName().contains("RagDocumentResponse"));
assertTrue(documentQueryMethod.getGenericReturnType().getTypeName().contains("RagDocumentResponse"));
+ assertTrue(documentParseMethod.getGenericReturnType().getTypeName().contains("RagDocumentParseResponse"));
assertEquals(RagDocumentResponse.class, RagDocumentResponse.class.getMethod("fromEntity", RagDocument.class).getReturnType());
}
@@ -121,4 +143,34 @@ class RagComponentStructureTests {
assertTrue(RagStoreController.class.getSimpleName().contains("RagStoreController"));
assertTrue(RagDocumentController.class.getSimpleName().contains("RagDocumentController"));
}
+
+    /**
+     * Checks by reflection that {@link RagChunk} declares each chunk field with the expected type.
+     */
+    @Test
+    void ragChunkStructureShouldSupportChunkMetadata() throws NoSuchFieldException {
+        // Each row is {field name, expected declared type}; rows are checked in order.
+        Object[][] expectedFields = {
+                {"storeId", Long.class},
+                {"documentId", Long.class},
+                {"chunkIndex", Integer.class},
+                {"chunkContent", String.class},
+                {"chunkSummary", String.class},
+                {"tokenCount", Integer.class},
+                {"pageNumber", Integer.class},
+                {"sectionTitle", String.class},
+                {"headingPath", String.class},
+                {"vectorId", String.class},
+                {"metadataJson", String.class},
+                {"enabled", Boolean.class},
+                {"remark", String.class},
+        };
+        for (Object[] expectedField : expectedFields) {
+            String fieldName = (String) expectedField[0];
+            assertEquals(expectedField[1], RagChunk.class.getDeclaredField(fieldName).getType());
+        }
+    }
+
+    /**
+     * Checks by reflection that {@link RagChunkEmbedding} declares each embedding field with the expected type.
+     */
+    @Test
+    void ragChunkEmbeddingStructureShouldSupportPgvectorMetadata() throws NoSuchFieldException {
+        // Each row is {field name, expected declared type}; rows are checked in order.
+        Object[][] expectedFields = {
+                {"storeId", Long.class},
+                {"documentId", Long.class},
+                {"chunkId", Long.class},
+                {"embeddingModel", String.class},
+                {"embeddingDimension", Integer.class},
+                {"embedding", String.class},
+                {"contentHash", String.class},
+                {"enabled", Boolean.class},
+                {"remark", String.class},
+        };
+        for (Object[] expectedField : expectedFields) {
+            String fieldName = (String) expectedField[0];
+            assertEquals(expectedField[1], RagChunkEmbedding.class.getDeclaredField(fieldName).getType());
+        }
+    }
}
diff --git a/src/test/java/com/bruce/rag/RagDocumentParseServiceImplTests.java b/src/test/java/com/bruce/rag/RagDocumentParseServiceImplTests.java
new file mode 100644
index 0000000..f660c3f
--- /dev/null
+++ b/src/test/java/com/bruce/rag/RagDocumentParseServiceImplTests.java
@@ -0,0 +1,160 @@
+package com.bruce.rag;
+
+import com.bruce.common.config.AttachmentProperties;
+import com.bruce.common.document.parse.DocumentParseContext;
+import com.bruce.common.document.parse.DocumentParseResult;
+import com.bruce.common.document.parse.DocumentParser;
+import com.bruce.common.document.parse.DocumentParserFactory;
+import com.bruce.common.domain.entity.SysAttachment;
+import com.bruce.common.service.ISysAttachmentService;
+import com.bruce.rag.dto.request.RagDocumentParseRequest;
+import com.bruce.rag.dto.response.RagDocumentParseResponse;
+import com.bruce.rag.entity.RagDocument;
+import com.bruce.rag.enums.RagParseStatusEnum;
+import com.bruce.rag.service.IRagDocumentService;
+import com.bruce.rag.service.impl.RagDocumentParseServiceImpl;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.ArgumentCaptor;
+import org.mockito.Mock;
+import org.mockito.junit.jupiter.MockitoExtension;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.when;
+
+/**
+ * Unit tests for {@link RagDocumentParseServiceImpl}, using mocked document and attachment
+ * services and a stub parser.
+ */
+@ExtendWith(MockitoExtension.class)
+class RagDocumentParseServiceImplTests {
+
+    @TempDir
+    private Path tempDir;
+
+    @Mock
+    private IRagDocumentService ragDocumentService;
+
+    @Mock
+    private ISysAttachmentService sysAttachmentService;
+
+    /**
+     * A successful single-document parse writes PARSING then PARSED and returns the parser's result.
+     */
+    @Test
+    void parseShouldUpdateStatusAndReturnParseResponse() throws Exception {
+        Path file = tempDir.resolve("rag").resolve("people.txt");
+        Files.createDirectories(file.getParent());
+        Files.writeString(file, "people profiles");
+
+        RagDocument document = new RagDocument();
+        document.setId(1001L);
+        document.setStoreId(2002L);
+        document.setAttachmentId(3003L);
+        document.setParseStatus(RagParseStatusEnum.UPLOADED.name());
+
+        SysAttachment attachment = new SysAttachment();
+        attachment.setId(3003L);
+        attachment.setOriginalName("people.txt");
+        attachment.setFileSuffix("txt");
+        attachment.setContentType("text/plain");
+        // Relative path: the service resolves it against the configured base path (the temp dir).
+        attachment.setFilePath("rag/people.txt");
+
+        AttachmentProperties attachmentProperties = new AttachmentProperties();
+        attachmentProperties.setBasePath(tempDir.toString());
+        DocumentParser parser = new FixedDocumentParser("people profiles");
+        RagDocumentParseServiceImpl service = new RagDocumentParseServiceImpl(
+                ragDocumentService,
+                sysAttachmentService,
+                attachmentProperties,
+                new DocumentParserFactory(List.of(parser))
+        );
+
+        when(ragDocumentService.getById(1001L)).thenReturn(document);
+        when(sysAttachmentService.getById(3003L)).thenReturn(attachment);
+        when(ragDocumentService.updateById(any(RagDocument.class))).thenReturn(true);
+
+        RagDocumentParseResponse response = service.parse(1001L);
+
+        assertEquals(1001L, response.getDocumentId());
+        assertEquals(RagParseStatusEnum.PARSED.name(), response.getParseStatus());
+        assertEquals(15, response.getTextLength());
+        assertEquals("fixed", response.getMetadata().get("parser"));
+
+        // The captor must be typed so the captured values can be read as RagDocument below.
+        ArgumentCaptor<RagDocument> captor = ArgumentCaptor.forClass(RagDocument.class);
+        verify(ragDocumentService, times(2)).updateById(captor.capture());
+        List<RagDocument> updates = captor.getAllValues();
+        assertEquals(RagParseStatusEnum.PARSING.name(), updates.get(0).getParseStatus());
+        assertEquals(RagParseStatusEnum.PARSED.name(), updates.get(1).getParseStatus());
+        assertTrue(parser.supports(new DocumentParseContext()));
+    }
+
+    /**
+     * A batch request with one document and a valid chunk strategy yields one PARSED response.
+     */
+    @Test
+    void parseShouldSupportBatchRequestAndChunkStrategyStructure() throws Exception {
+        Path file = tempDir.resolve("rag").resolve("batch.txt");
+        Files.createDirectories(file.getParent());
+        Files.writeString(file, "batch profiles");
+
+        RagDocument document = new RagDocument();
+        document.setId(1002L);
+        document.setStoreId(2002L);
+        document.setAttachmentId(3004L);
+        document.setParseStatus(RagParseStatusEnum.UPLOADED.name());
+
+        SysAttachment attachment = new SysAttachment();
+        attachment.setId(3004L);
+        attachment.setOriginalName("batch.txt");
+        attachment.setFileSuffix("txt");
+        attachment.setContentType("text/plain");
+        attachment.setFilePath("rag/batch.txt");
+
+        AttachmentProperties attachmentProperties = new AttachmentProperties();
+        attachmentProperties.setBasePath(tempDir.toString());
+        RagDocumentParseServiceImpl service = new RagDocumentParseServiceImpl(
+                ragDocumentService,
+                sysAttachmentService,
+                attachmentProperties,
+                new DocumentParserFactory(List.of(new FixedDocumentParser("batch profiles")))
+        );
+        RagDocumentParseRequest request = new RagDocumentParseRequest();
+        request.setDocumentIds(List.of(1002L));
+        request.setChunkStrategy("DELIMITER");
+        request.setDelimiter("。");
+
+        when(ragDocumentService.getById(1002L)).thenReturn(document);
+        when(sysAttachmentService.getById(3004L)).thenReturn(attachment);
+        when(ragDocumentService.updateById(any(RagDocument.class))).thenReturn(true);
+
+        List<RagDocumentParseResponse> responses = service.parse(request);
+
+        assertEquals(1, responses.size());
+        assertEquals(1002L, responses.getFirst().getDocumentId());
+        assertEquals(RagParseStatusEnum.PARSED.name(), responses.getFirst().getParseStatus());
+    }
+
+    /**
+     * Stub parser that accepts every context and returns a fixed text, ignoring the file contents.
+     */
+    private static class FixedDocumentParser implements DocumentParser {
+
+        private final String text;
+
+        private FixedDocumentParser(String text) {
+            this.text = text;
+        }
+
+        @Override
+        public boolean supports(DocumentParseContext context) {
+            return true;
+        }
+
+        @Override
+        public DocumentParseResult parse(DocumentParseContext context) {
+            DocumentParseResult result = new DocumentParseResult();
+            result.setText(text);
+            result.setTextLength(text.length());
+            result.setMetadata(Map.of("parser", "fixed"));
+            return result;
+        }
+    }
+}
diff --git a/src/test/java/com/bruce/rag/RagDocumentServiceImplTests.java b/src/test/java/com/bruce/rag/RagDocumentServiceImplTests.java
index 419f89c..4d21850 100644
--- a/src/test/java/com/bruce/rag/RagDocumentServiceImplTests.java
+++ b/src/test/java/com/bruce/rag/RagDocumentServiceImplTests.java
@@ -102,13 +102,13 @@ class RagDocumentServiceImplTests {
request.setRemark(" 备注信息 ");
doReturn(existingDocument).when(ragDocumentService).getById(3003L);
- doReturn(true).when(ragDocumentService).saveOrUpdate(any(RagDocument.class));
+ doReturn(true).when(ragDocumentService).updateById(any(RagDocument.class));
boolean result = ragDocumentService.saveOrUpdate(request);
assertTrue(result);
ArgumentCaptor documentCaptor = ArgumentCaptor.forClass(RagDocument.class);
- verify(ragDocumentService).saveOrUpdate(documentCaptor.capture());
+ verify(ragDocumentService).updateById(documentCaptor.capture());
RagDocument savedDocument = documentCaptor.getValue();
assertEquals(3003L, savedDocument.getId());
assertEquals(1001L, savedDocument.getStoreId());
@@ -121,4 +121,40 @@ class RagDocumentServiceImplTests {
assertEquals("已修复", savedDocument.getErrorMessage());
assertEquals("备注信息", savedDocument.getRemark());
}
+
+    /**
+     * A request that sets only some fields must leave the other fields of the existing document
+     * unchanged in the entity passed to {@code updateById}.
+     */
+    @Test
+    void saveOrUpdateShouldPreserveExistingFieldsForPartialUpdate() {
+        RagDocument existingDocument = new RagDocument();
+        existingDocument.setId(3003L);
+        existingDocument.setStoreId(1001L);
+        existingDocument.setAttachmentId(2002L);
+        existingDocument.setDocumentTitle("people_profiles.txt");
+        existingDocument.setDocumentSummary("测试人员信息,有多条人员信息");
+        existingDocument.setParseStatus(RagParseStatusEnum.UPLOADED.name());
+        existingDocument.setIndexStatus(RagIndexStatusEnum.PENDING.name());
+        existingDocument.setEnabled(true);
+        existingDocument.setRemark("测试人员信息");
+
+        // Only id, store, title and enabled are supplied; everything else is left unset.
+        RagDocumentSaveRequest request = new RagDocumentSaveRequest();
+        request.setId(3003L);
+        request.setStoreId(1001L);
+        request.setDocumentTitle("people_profiles.txt");
+        request.setEnabled(false);
+
+        doReturn(existingDocument).when(ragDocumentService).getById(3003L);
+        doReturn(true).when(ragDocumentService).updateById(any(RagDocument.class));
+
+        boolean result = ragDocumentService.saveOrUpdate(request);
+
+        assertTrue(result);
+        // The captor must be typed so getValue() can be assigned to a RagDocument.
+        ArgumentCaptor<RagDocument> documentCaptor = ArgumentCaptor.forClass(RagDocument.class);
+        verify(ragDocumentService).updateById(documentCaptor.capture());
+        RagDocument savedDocument = documentCaptor.getValue();
+        assertEquals(2002L, savedDocument.getAttachmentId());
+        assertEquals("测试人员信息,有多条人员信息", savedDocument.getDocumentSummary());
+        assertEquals(RagParseStatusEnum.UPLOADED.name(), savedDocument.getParseStatus());
+        assertEquals(RagIndexStatusEnum.PENDING.name(), savedDocument.getIndexStatus());
+        assertEquals(false, savedDocument.getEnabled());
+        assertEquals("测试人员信息", savedDocument.getRemark());
+    }
}