feat(rag): add document parsing structures
This commit is contained in:
@@ -0,0 +1,50 @@
|
||||
package com.bruce.common.document.parse;
|
||||
|
||||
import com.bruce.common.document.parse.impl.ExcelDocumentParser;
|
||||
import com.bruce.common.document.parse.impl.PdfDocumentParser;
|
||||
import com.bruce.common.document.parse.impl.TxtDocumentParser;
|
||||
import com.bruce.common.document.parse.impl.WordDocumentParser;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
|
||||
class DocumentParserFactoryTests {
|
||||
|
||||
@Test
|
||||
void resolveShouldChooseParserByFileSuffix() {
|
||||
DocumentParserFactory factory = new DocumentParserFactory(List.of(
|
||||
new TxtDocumentParser(),
|
||||
new WordDocumentParser(),
|
||||
new PdfDocumentParser(),
|
||||
new ExcelDocumentParser()
|
||||
));
|
||||
|
||||
assertEquals(TxtDocumentParser.class, factory.resolve(context("txt")).getClass());
|
||||
assertEquals(WordDocumentParser.class, factory.resolve(context("docx")).getClass());
|
||||
assertEquals(PdfDocumentParser.class, factory.resolve(context("pdf")).getClass());
|
||||
assertEquals(ExcelDocumentParser.class, factory.resolve(context("xlsx")).getClass());
|
||||
}
|
||||
|
||||
@Test
|
||||
void resolveShouldRejectUnsupportedSuffix() {
|
||||
DocumentParserFactory factory = new DocumentParserFactory(List.of(new TxtDocumentParser()));
|
||||
|
||||
DocumentParseException exception = assertThrows(
|
||||
DocumentParseException.class,
|
||||
() -> factory.resolve(context("zip"))
|
||||
);
|
||||
|
||||
assertEquals("不支持的文档类型: zip", exception.getMessage());
|
||||
}
|
||||
|
||||
private DocumentParseContext context(String suffix) {
|
||||
DocumentParseContext context = new DocumentParseContext();
|
||||
context.setSuffix(suffix);
|
||||
context.setFilePath(Path.of("sample." + suffix));
|
||||
return context;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
package com.bruce.common.document.parse;
|
||||
|
||||
import com.bruce.common.document.parse.impl.TxtDocumentParser;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
class TxtDocumentParserTests {
|
||||
|
||||
@TempDir
|
||||
private Path tempDir;
|
||||
|
||||
@Test
|
||||
void parseShouldReadPlainTextContent() throws Exception {
|
||||
Path file = tempDir.resolve("people.txt");
|
||||
Files.writeString(file, "张三 是 产品经理\n李四 是 后端工程师", StandardCharsets.UTF_8);
|
||||
DocumentParseContext context = new DocumentParseContext();
|
||||
context.setOriginalName("people.txt");
|
||||
context.setSuffix("txt");
|
||||
context.setContentType("text/plain");
|
||||
context.setFilePath(file);
|
||||
|
||||
DocumentParseResult result = new TxtDocumentParser().parse(context);
|
||||
|
||||
assertEquals("张三 是 产品经理\n李四 是 后端工程师", result.getText());
|
||||
assertEquals(result.getText().length(), result.getTextLength());
|
||||
assertTrue(result.getMetadata().get("contentType").toString().startsWith("text/plain"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void supportsShouldAcceptTextSuffixAndContentType() {
|
||||
TxtDocumentParser parser = new TxtDocumentParser();
|
||||
DocumentParseContext suffixContext = new DocumentParseContext();
|
||||
suffixContext.setSuffix("TXT");
|
||||
DocumentParseContext contentTypeContext = new DocumentParseContext();
|
||||
contentTypeContext.setContentType("text/plain");
|
||||
|
||||
assertTrue(parser.supports(suffixContext));
|
||||
assertTrue(parser.supports(contentTypeContext));
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user