feat(rag): 增加文档切片工厂基础层
This commit is contained in:
50
src/test/java/com/bruce/rag/parse/ChunkerFactoryTests.java
Normal file
50
src/test/java/com/bruce/rag/parse/ChunkerFactoryTests.java
Normal file
@@ -0,0 +1,50 @@
|
||||
package com.bruce.rag.parse;
|
||||
|
||||
import com.bruce.rag.entity.RagChunk;
|
||||
import com.bruce.rag.enums.RagChunkStrategyEnum;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
|
||||
class ChunkerFactoryTests {
|
||||
|
||||
@Test
|
||||
void resolveShouldReturnMatchingChunker() {
|
||||
Chunker supported = new StubChunker(RagChunkStrategyEnum.FIXED_LENGTH);
|
||||
Chunker unsupported = new StubChunker(RagChunkStrategyEnum.DELIMITER);
|
||||
ChunkerFactory factory = new ChunkerFactory(List.of(supported, unsupported));
|
||||
|
||||
Chunker resolved = factory.resolve(RagChunkStrategyEnum.FIXED_LENGTH);
|
||||
|
||||
assertSame(supported, resolved);
|
||||
}
|
||||
|
||||
@Test
|
||||
void resolveShouldRejectUnsupportedStrategy() {
|
||||
ChunkerFactory factory = new ChunkerFactory(List.of(new StubChunker(RagChunkStrategyEnum.FIXED_LENGTH)));
|
||||
|
||||
assertThrows(IllegalArgumentException.class, () -> factory.resolve(RagChunkStrategyEnum.SEMANTIC));
|
||||
}
|
||||
|
||||
private static class StubChunker implements Chunker {
|
||||
|
||||
private final RagChunkStrategyEnum strategy;
|
||||
|
||||
private StubChunker(RagChunkStrategyEnum strategy) {
|
||||
this.strategy = strategy;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean supports(RagChunkStrategyEnum strategy) {
|
||||
return this.strategy == strategy;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<RagChunk> chunk(RagChunkCommand command) {
|
||||
return List.of();
|
||||
}
|
||||
}
|
||||
}
|
||||
64
src/test/java/com/bruce/rag/parse/DelimiterChunkerTests.java
Normal file
64
src/test/java/com/bruce/rag/parse/DelimiterChunkerTests.java
Normal file
@@ -0,0 +1,64 @@
|
||||
package com.bruce.rag.parse;
|
||||
|
||||
import com.bruce.common.document.parse.DocumentParseResult;
|
||||
import com.bruce.rag.entity.RagChunk;
|
||||
import com.bruce.rag.entity.RagDocument;
|
||||
import com.bruce.rag.enums.RagChunkStrategyEnum;
|
||||
import com.bruce.rag.parse.impl.DelimiterChunker;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
class DelimiterChunkerTests {
|
||||
|
||||
@Test
|
||||
void chunkShouldSplitByDelimiterAndIgnoreBlankSegments() {
|
||||
DelimiterChunker chunker = new DelimiterChunker();
|
||||
|
||||
RagChunkCommand command = new RagChunkCommand();
|
||||
command.setDocument(buildDocument());
|
||||
command.setParseResult(buildParseResult("第一段。第二段。。第三段"));
|
||||
command.setChunkStrategy(RagChunkStrategyEnum.DELIMITER.getValue());
|
||||
command.setDelimiter("。");
|
||||
|
||||
List<RagChunk> chunks = chunker.chunk(command);
|
||||
|
||||
assertEquals(3, chunks.size());
|
||||
assertEquals("第一段", chunks.get(0).getChunkContent());
|
||||
assertEquals("第二段", chunks.get(1).getChunkContent());
|
||||
assertEquals("第三段", chunks.get(2).getChunkContent());
|
||||
assertEquals(0, chunks.get(0).getChunkIndex());
|
||||
assertEquals(1, chunks.get(1).getChunkIndex());
|
||||
assertEquals(2, chunks.get(2).getChunkIndex());
|
||||
}
|
||||
|
||||
@Test
|
||||
void chunkShouldReturnEmptyListForBlankText() {
|
||||
DelimiterChunker chunker = new DelimiterChunker();
|
||||
|
||||
RagChunkCommand command = new RagChunkCommand();
|
||||
command.setDocument(buildDocument());
|
||||
command.setParseResult(buildParseResult(" "));
|
||||
command.setChunkStrategy(RagChunkStrategyEnum.DELIMITER.getValue());
|
||||
command.setDelimiter("。");
|
||||
|
||||
assertTrue(chunker.chunk(command).isEmpty());
|
||||
}
|
||||
|
||||
private static RagDocument buildDocument() {
|
||||
RagDocument document = new RagDocument();
|
||||
document.setId(66L);
|
||||
document.setStoreId(55L);
|
||||
return document;
|
||||
}
|
||||
|
||||
private static DocumentParseResult buildParseResult(String text) {
|
||||
DocumentParseResult result = new DocumentParseResult();
|
||||
result.setText(text);
|
||||
result.setTextLength(text.length());
|
||||
return result;
|
||||
}
|
||||
}
|
||||
69
src/test/java/com/bruce/rag/parse/FixedLengthChunkerTests.java
Normal file
@@ -0,0 +1,69 @@
|
||||
package com.bruce.rag.parse;
|
||||
|
||||
import com.bruce.common.document.parse.DocumentParseResult;
|
||||
import com.bruce.rag.entity.RagChunk;
|
||||
import com.bruce.rag.entity.RagDocument;
|
||||
import com.bruce.rag.enums.RagChunkStrategyEnum;
|
||||
import com.bruce.rag.parse.impl.FixedLengthChunker;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
class FixedLengthChunkerTests {
|
||||
|
||||
@Test
|
||||
void chunkShouldSplitTextByChunkSizeAndOverlap() {
|
||||
FixedLengthChunker chunker = new FixedLengthChunker();
|
||||
|
||||
RagChunkCommand command = new RagChunkCommand();
|
||||
command.setDocument(buildDocument());
|
||||
command.setParseResult(buildParseResult("abcdefghij"));
|
||||
command.setChunkStrategy(RagChunkStrategyEnum.FIXED_LENGTH.getValue());
|
||||
command.setChunkSize(4);
|
||||
command.setChunkOverlap(1);
|
||||
|
||||
List<RagChunk> chunks = chunker.chunk(command);
|
||||
|
||||
assertEquals(3, chunks.size());
|
||||
assertEquals("abcd", chunks.get(0).getChunkContent());
|
||||
assertEquals("defg", chunks.get(1).getChunkContent());
|
||||
assertEquals("ghij", chunks.get(2).getChunkContent());
|
||||
assertEquals(0, chunks.get(0).getChunkIndex());
|
||||
assertEquals(1, chunks.get(1).getChunkIndex());
|
||||
assertEquals(2, chunks.get(2).getChunkIndex());
|
||||
assertEquals(99L, chunks.get(0).getDocumentId());
|
||||
assertEquals(88L, chunks.get(0).getStoreId());
|
||||
assertTrue(Boolean.TRUE.equals(chunks.get(0).getEnabled()));
|
||||
}
|
||||
|
||||
@Test
|
||||
void chunkShouldReturnEmptyListForBlankText() {
|
||||
FixedLengthChunker chunker = new FixedLengthChunker();
|
||||
|
||||
RagChunkCommand command = new RagChunkCommand();
|
||||
command.setDocument(buildDocument());
|
||||
command.setParseResult(buildParseResult(" "));
|
||||
command.setChunkStrategy(RagChunkStrategyEnum.FIXED_LENGTH.getValue());
|
||||
command.setChunkSize(4);
|
||||
command.setChunkOverlap(1);
|
||||
|
||||
assertTrue(chunker.chunk(command).isEmpty());
|
||||
}
|
||||
|
||||
private static RagDocument buildDocument() {
|
||||
RagDocument document = new RagDocument();
|
||||
document.setId(99L);
|
||||
document.setStoreId(88L);
|
||||
return document;
|
||||
}
|
||||
|
||||
private static DocumentParseResult buildParseResult(String text) {
|
||||
DocumentParseResult result = new DocumentParseResult();
|
||||
result.setText(text);
|
||||
result.setTextLength(text.length());
|
||||
return result;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user