feat(rag): add document parsing structures

This commit is contained in:
2026-05-21 23:20:09 +08:00
parent 2ab02fb574
commit 1de773405f
38 changed files with 1240 additions and 9 deletions

View File

@@ -0,0 +1,19 @@
package com.bruce.common.config;
import com.baomidou.mybatisplus.extension.plugins.inner.OptimisticLockerInnerInterceptor;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Unit test for {@link MybatisPlusConfig}: checks the inner interceptors exposed by the
 * interceptor that {@code mybatisPlusInterceptor()} builds.
 */
class MybatisPlusConfigTests {

    /**
     * The interceptor returned by {@code mybatisPlusInterceptor()} must contain at least one
     * {@link OptimisticLockerInnerInterceptor} among its inner interceptors.
     */
    @Test
    void mybatisPlusInterceptorShouldRegisterOptimisticLocker() {
        var pluginChain = new MybatisPlusConfig().mybatisPlusInterceptor();

        // Scan the registered inner interceptors and stop at the first optimistic locker.
        boolean optimisticLockerRegistered = false;
        for (var inner : pluginChain.getInterceptors()) {
            if (inner instanceof OptimisticLockerInnerInterceptor) {
                optimisticLockerRegistered = true;
                break;
            }
        }

        assertTrue(optimisticLockerRegistered);
    }
}

View File

@@ -0,0 +1,50 @@
package com.bruce.common.document.parse;
import com.bruce.common.document.parse.impl.ExcelDocumentParser;
import com.bruce.common.document.parse.impl.PdfDocumentParser;
import com.bruce.common.document.parse.impl.TxtDocumentParser;
import com.bruce.common.document.parse.impl.WordDocumentParser;
import org.junit.jupiter.api.Test;
import java.nio.file.Path;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
/**
 * Unit tests for {@link DocumentParserFactory#resolve}: parser selection driven by the
 * suffix carried in a {@link DocumentParseContext}.
 */
class DocumentParserFactoryTests {

    /**
     * With the txt, Word, PDF and Excel parsers registered, each of the suffixes
     * {@code txt}, {@code docx}, {@code pdf} and {@code xlsx} must resolve to the
     * parser of the matching class.
     */
    @Test
    void resolveShouldChooseParserByFileSuffix() {
        DocumentParserFactory allParsers = factoryWithAllParsers();

        assertResolvesTo(allParsers, "txt", TxtDocumentParser.class);
        assertResolvesTo(allParsers, "docx", WordDocumentParser.class);
        assertResolvesTo(allParsers, "pdf", PdfDocumentParser.class);
        assertResolvesTo(allParsers, "xlsx", ExcelDocumentParser.class);
    }

    /**
     * A factory that only knows the txt parser must throw a {@link DocumentParseException}
     * for a {@code zip} context, and the exception message must name the rejected suffix.
     */
    @Test
    void resolveShouldRejectUnsupportedSuffix() {
        DocumentParserFactory txtOnly = new DocumentParserFactory(List.of(new TxtDocumentParser()));

        DocumentParseException rejection =
                assertThrows(DocumentParseException.class, () -> txtOnly.resolve(contextWithSuffix("zip")));

        assertEquals("不支持的文档类型: zip", rejection.getMessage());
    }

    /** Builds a factory holding the txt, Word, PDF and Excel parsers, in that order. */
    private static DocumentParserFactory factoryWithAllParsers() {
        return new DocumentParserFactory(List.of(
                new TxtDocumentParser(),
                new WordDocumentParser(),
                new PdfDocumentParser(),
                new ExcelDocumentParser()
        ));
    }

    /** Asserts that resolving a context with {@code suffix} yields exactly {@code expectedParserType}. */
    private static void assertResolvesTo(DocumentParserFactory factory, String suffix, Class<?> expectedParserType) {
        assertEquals(expectedParserType, factory.resolve(contextWithSuffix(suffix)).getClass());
    }

    /**
     * Creates a parse context whose suffix is {@code suffix} and whose file path is
     * {@code sample.<suffix>}; no other property is set.
     */
    private static DocumentParseContext contextWithSuffix(String suffix) {
        DocumentParseContext parseContext = new DocumentParseContext();
        parseContext.setSuffix(suffix);
        parseContext.setFilePath(Path.of("sample." + suffix));
        return parseContext;
    }
}

View File

@@ -0,0 +1,47 @@
package com.bruce.common.document.parse;
import com.bruce.common.document.parse.impl.TxtDocumentParser;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Unit tests for {@link TxtDocumentParser}: parsing a UTF-8 text file and the
 * {@code supports} check.
 */
class TxtDocumentParserTests {

    /** Two-line sample written to disk and expected back unchanged from the parser. */
    private static final String SAMPLE_TEXT = "张三 是 产品经理\n李四 是 后端工程师";

    /** Per-test temporary directory supplied by JUnit. */
    @TempDir
    private Path tempDir;

    /**
     * Parsing a UTF-8 file must return its text unchanged, report a text length equal to
     * the length of that text, and expose a {@code contentType} metadata entry whose string
     * form starts with {@code text/plain}.
     */
    @Test
    void parseShouldReadPlainTextContent() throws Exception {
        // Files.writeString returns the path it wrote to, so the result is the sample file itself.
        Path sampleFile = Files.writeString(tempDir.resolve("people.txt"), SAMPLE_TEXT, StandardCharsets.UTF_8);

        DocumentParseContext parseContext = new DocumentParseContext();
        parseContext.setOriginalName("people.txt");
        parseContext.setSuffix("txt");
        parseContext.setContentType("text/plain");
        parseContext.setFilePath(sampleFile);

        TxtDocumentParser parser = new TxtDocumentParser();
        DocumentParseResult parsed = parser.parse(parseContext);

        assertEquals(SAMPLE_TEXT, parsed.getText());
        assertEquals(parsed.getText().length(), parsed.getTextLength());

        String reportedContentType = parsed.getMetadata().get("contentType").toString();
        assertTrue(reportedContentType.startsWith("text/plain"));
    }

    /**
     * {@code supports} must accept a context that carries only the upper-case suffix
     * {@code TXT}, and a context that carries only the content type {@code text/plain}.
     */
    @Test
    void supportsShouldAcceptTextSuffixAndContentType() {
        TxtDocumentParser parser = new TxtDocumentParser();

        DocumentParseContext onlySuffix = new DocumentParseContext();
        onlySuffix.setSuffix("TXT");
        assertTrue(parser.supports(onlySuffix));

        DocumentParseContext onlyContentType = new DocumentParseContext();
        onlyContentType.setContentType("text/plain");
        assertTrue(parser.supports(onlyContentType));
    }
}

View File

@@ -3,6 +3,7 @@ package com.bruce.common.enumconfig;
import com.bruce.common.enums.CommonStatusEnum;
import com.bruce.common.enums.EnableStatusEnum;
import com.bruce.rag.enums.RagIndexStatusEnum;
import com.bruce.rag.enums.RagChunkStrategyEnum;
import com.bruce.rag.enums.RagParseStatusEnum;
import org.junit.jupiter.api.Test;
@@ -24,6 +25,9 @@ class EnumDefinitionTests {
assertEquals(4, RagParseStatusEnum.FAILED.getValue());
assertEquals(1, RagIndexStatusEnum.PENDING.getValue());
assertEquals(3, RagIndexStatusEnum.INDEXED.getValue());
assertEquals(1, RagChunkStrategyEnum.FIXED_LENGTH.getValue());
assertEquals(5, RagChunkStrategyEnum.DELIMITER.getValue());
assertEquals(6, RagChunkStrategyEnum.SEMANTIC.getValue());
}
@Test
@@ -38,5 +42,8 @@ class EnumDefinitionTests {
assertEquals("解析失败", RagParseStatusEnum.FAILED.getLabel());
assertEquals("待索引", RagIndexStatusEnum.PENDING.getLabel());
assertEquals("已索引", RagIndexStatusEnum.INDEXED.getLabel());
assertEquals("固定长度切片", RagChunkStrategyEnum.FIXED_LENGTH.getLabel());
assertEquals("按分隔符切片", RagChunkStrategyEnum.DELIMITER.getLabel());
assertEquals("语义切片", RagChunkStrategyEnum.SEMANTIC.getLabel());
}
}

View File

@@ -5,6 +5,7 @@ import com.bruce.common.domain.entity.SysEnum;
import com.bruce.common.enums.CommonStatusEnum;
import com.bruce.common.enums.EnableStatusEnum;
import com.bruce.common.service.ISysEnumService;
import com.bruce.rag.enums.RagChunkStrategyEnum;
import com.bruce.rag.enums.RagIndexStatusEnum;
import com.bruce.rag.enums.RagParseStatusEnum;
import org.junit.jupiter.api.Test;
@@ -40,6 +41,13 @@ class SysEnumDataInitTests {
saveOrUpdate("rag", "index_status", RagIndexStatusEnum.INDEXING.getLabel(), RagIndexStatusEnum.INDEXING.getValue(), 2, "RAG文档索引状态");
saveOrUpdate("rag", "index_status", RagIndexStatusEnum.INDEXED.getLabel(), RagIndexStatusEnum.INDEXED.getValue(), 3, "RAG文档索引状态");
saveOrUpdate("rag", "index_status", RagIndexStatusEnum.FAILED.getLabel(), RagIndexStatusEnum.FAILED.getValue(), 4, "RAG文档索引状态");
saveOrUpdate("rag", "chunk_strategy", RagChunkStrategyEnum.FIXED_LENGTH.getLabel(), RagChunkStrategyEnum.FIXED_LENGTH.getValue(), 1, "RAG文档切片方式");
saveOrUpdate("rag", "chunk_strategy", RagChunkStrategyEnum.PARAGRAPH.getLabel(), RagChunkStrategyEnum.PARAGRAPH.getValue(), 2, "RAG文档切片方式");
saveOrUpdate("rag", "chunk_strategy", RagChunkStrategyEnum.HEADING.getLabel(), RagChunkStrategyEnum.HEADING.getValue(), 3, "RAG文档切片方式");
saveOrUpdate("rag", "chunk_strategy", RagChunkStrategyEnum.TABLE_ROW.getLabel(), RagChunkStrategyEnum.TABLE_ROW.getValue(), 4, "RAG文档切片方式");
saveOrUpdate("rag", "chunk_strategy", RagChunkStrategyEnum.DELIMITER.getLabel(), RagChunkStrategyEnum.DELIMITER.getValue(), 5, "RAG文档切片方式");
saveOrUpdate("rag", "chunk_strategy", RagChunkStrategyEnum.SEMANTIC.getLabel(), RagChunkStrategyEnum.SEMANTIC.getValue(), 6, "RAG文档切片方式");
}
private void saveOrUpdate(String catalog, String type, String name, Integer value, Integer sort, String remark) {