feat(rag): 增加文档切片工厂基础层

This commit is contained in:
2026-05-24 21:11:27 +08:00
parent dfc9847e4d
commit bd8bfeb607
9 changed files with 908 additions and 0 deletions

View File

@@ -0,0 +1,26 @@
package com.bruce.rag.parse;
import com.bruce.rag.entity.RagChunk;
import com.bruce.rag.enums.RagChunkStrategyEnum;
import java.util.List;
/**
 * Common contract for all chunking strategies.
 * <p>
 * An implementation has exactly two responsibilities:
 * 1. report which chunk strategy it handles, so a factory can select it;
 * 2. produce the chunks for a given chunk command.
 */
public interface Chunker {
/**
 * Tells whether this implementation handles the given chunk strategy.
 *
 * @param strategy the strategy being looked up
 * @return {@code true} if this chunker should be used for {@code strategy}
 */
boolean supports(RagChunkStrategyEnum strategy);
/**
 * Performs the chunking and returns the resulting chunk objects as an in-memory list.
 *
 * @param command the document, its parse result and the chunking options
 * @return the chunks produced for the command
 */
List<RagChunk> chunk(RagChunkCommand command);
}

View File

@@ -0,0 +1,32 @@
package com.bruce.rag.parse;
import com.bruce.rag.enums.RagChunkStrategyEnum;
import org.springframework.stereotype.Component;
import java.util.List;
/**
 * Factory that picks the {@link Chunker} responsible for a chunk strategy.
 * <p>
 * Spring injects every bean implementing {@link Chunker}; the factory then selects
 * the implementation whose {@link Chunker#supports} accepts the requested strategy,
 * so business code does not need its own if-else or switch.
 */
@Component
public class ChunkerFactory {
    private final List<Chunker> chunkers;

    /**
     * Creates the factory from the available chunker implementations.
     *
     * @param chunkers all registered chunkers; must not be {@code null} or contain {@code null}
     * @throws NullPointerException if {@code chunkers} is {@code null} or contains {@code null}
     */
    public ChunkerFactory(List<Chunker> chunkers) {
        // Take an immutable snapshot so later changes to the caller's list
        // cannot alter which chunkers this factory resolves.
        this.chunkers = List.copyOf(chunkers);
    }

    /**
     * Resolves the chunker implementation for the given strategy.
     * <p>
     * Chunkers are probed in list order and the first one that supports the
     * strategy wins.
     *
     * @param strategy the requested chunk strategy
     * @return the first chunker that supports {@code strategy}
     * @throws IllegalArgumentException if no registered chunker supports {@code strategy}
     */
    public Chunker resolve(RagChunkStrategyEnum strategy) {
        return chunkers.stream()
                .filter(chunker -> chunker.supports(strategy))
                .findFirst()
                .orElseThrow(() -> new IllegalArgumentException("不支持的切片方式: " + strategy));
    }
}

View File

@@ -0,0 +1,45 @@
package com.bruce.rag.parse;
import com.bruce.common.document.parse.DocumentParseResult;
import com.bruce.rag.entity.RagDocument;
import lombok.Data;
/**
 * Context parameters for a single chunking request.
 * <p>
 * The document, its parse result and the chunking options are gathered into one
 * object so the chunker interface does not grow harder to maintain as more
 * parameters are added.
 */
@Data
public class RagChunkCommand {
/**
 * The document entity being chunked.
 */
private RagDocument document;
/**
 * The text result produced by parsing the document.
 */
private DocumentParseResult parseResult;
/**
 * Numeric value of the chunk strategy enum; usually comes from the front-end request.
 */
private Integer chunkStrategy;
/**
 * Target length of a single chunk; mainly used by fixed-length chunking.
 */
private Integer chunkSize;
/**
 * Overlap length between adjacent chunks; mainly used by fixed-length chunking.
 */
private Integer chunkOverlap;
/**
 * Custom delimiter; mainly used by delimiter chunking.
 */
private String delimiter;
}

View File

@@ -0,0 +1,68 @@
package com.bruce.rag.parse.impl;
import com.bruce.common.document.parse.DocumentParseResult;
import com.bruce.rag.entity.RagChunk;
import com.bruce.rag.entity.RagDocument;
import com.bruce.rag.enums.RagChunkStrategyEnum;
import com.bruce.rag.parse.Chunker;
import com.bruce.rag.parse.RagChunkCommand;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
 * Delimiter-based chunker.
 * <p>
 * Splits the parsed text on the caller-supplied delimiter (taken literally, not as
 * a regular expression), drops blank segments and emits the remaining segments as
 * sequentially indexed chunks.
 */
@Component
public class DelimiterChunker implements Chunker {

    @Override
    public boolean supports(RagChunkStrategyEnum strategy) {
        return RagChunkStrategyEnum.DELIMITER == strategy;
    }

    /**
     * Splits the command's parsed text on its delimiter.
     *
     * @param command the chunk request; may be {@code null}
     * @return an empty list when the command, parse result or text is missing or
     *         blank, or when the delimiter is {@code null} or empty; otherwise the
     *         trimmed non-blank segments in document order
     */
    @Override
    public List<RagChunk> chunk(RagChunkCommand command) {
        String text = extractText(command);
        String delimiter = command == null ? null : command.getDelimiter();
        // The delimiter is checked with hasLength rather than hasText: a delimiter
        // made only of whitespace ("\n", "\n\n", "\t", ...) is a perfectly valid
        // separator and must not be rejected.
        if (!StringUtils.hasText(text) || !StringUtils.hasLength(delimiter)) {
            return List.of();
        }
        // Pattern.quote escapes regex metacharacters so the delimiter is matched literally.
        String[] parts = text.split(Pattern.quote(delimiter));
        List<RagChunk> chunks = new ArrayList<>();
        for (String part : parts) {
            if (!StringUtils.hasText(part)) {
                continue;
            }
            // chunks.size() is the next free index, so skipped blank parts leave no gaps.
            chunks.add(buildChunk(command.getDocument(), chunks.size(), part.trim()));
        }
        return chunks;
    }

    /**
     * Returns the parsed text of the command, or {@code null} when the command or
     * its parse result is absent.
     */
    private String extractText(RagChunkCommand command) {
        DocumentParseResult parseResult = command == null ? null : command.getParseResult();
        return parseResult == null ? null : parseResult.getText();
    }

    /**
     * Builds the basic chunk structure only; persistence and vectorization are not
     * handled here. Store and document ids are copied only when a document is present.
     */
    private RagChunk buildChunk(RagDocument document, int index, String content) {
        RagChunk chunk = new RagChunk();
        if (document != null) {
            chunk.setStoreId(document.getStoreId());
            chunk.setDocumentId(document.getId());
        }
        chunk.setChunkIndex(index);
        chunk.setChunkContent(content);
        chunk.setEnabled(Boolean.TRUE);
        return chunk;
    }
}

View File

@@ -0,0 +1,91 @@
package com.bruce.rag.parse.impl;
import com.bruce.common.document.parse.DocumentParseResult;
import com.bruce.rag.entity.RagChunk;
import com.bruce.rag.entity.RagDocument;
import com.bruce.rag.enums.RagChunkStrategyEnum;
import com.bruce.rag.parse.Chunker;
import com.bruce.rag.parse.RagChunkCommand;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import java.util.ArrayList;
import java.util.List;
/**
 * Fixed-length chunker.
 * <p>
 * Walks the text from the beginning, cutting windows of {@code chunkSize}
 * characters; {@code chunkOverlap} controls how many characters adjacent windows
 * share. Lengths are counted the way {@link String#length()} counts them.
 */
@Component
public class FixedLengthChunker implements Chunker {

    @Override
    public boolean supports(RagChunkStrategyEnum strategy) {
        return strategy == RagChunkStrategyEnum.FIXED_LENGTH;
    }

    /**
     * Cuts the command's parsed text into fixed-size, optionally overlapping chunks.
     *
     * @param command the chunk request; may be {@code null}
     * @return an empty list when there is no non-blank text, otherwise the chunks
     *         in document order with consecutive indexes starting at 0
     */
    @Override
    public List<RagChunk> chunk(RagChunkCommand command) {
        String content = extractText(command);
        if (!StringUtils.hasText(content)) {
            return List.of();
        }

        int total = content.length();
        int windowSize = resolveChunkSize(command, total);
        int shared = resolveChunkOverlap(command, windowSize);
        // The stride is the window size minus the overlap; it is kept at 1 or more
        // so the loop always advances.
        int stride = Math.max(1, windowSize - shared);

        List<RagChunk> result = new ArrayList<>();
        int from = 0;
        while (from < total) {
            int to = Math.min(total, from + windowSize);
            String piece = content.substring(from, to);
            result.add(buildChunk(command.getDocument(), result.size(), piece));
            if (to >= total) {
                // The window already reached the end of the text; stop here so no
                // redundant tail chunk is produced.
                break;
            }
            from += stride;
        }
        return result;
    }

    /**
     * Returns the parsed text of the command, or {@code null} when the command or
     * its parse result is absent.
     */
    private String extractText(RagChunkCommand command) {
        if (command == null) {
            return null;
        }
        DocumentParseResult parsed = command.getParseResult();
        return parsed != null ? parsed.getText() : null;
    }

    /**
     * Resolves the effective chunk size. A missing or non-positive size falls back
     * to the full text length, i.e. the whole text becomes a single chunk.
     */
    private int resolveChunkSize(RagChunkCommand command, int textLength) {
        if (command == null) {
            return textLength;
        }
        Integer requested = command.getChunkSize();
        boolean usable = requested != null && requested > 0;
        return usable ? requested : textLength;
    }

    /**
     * Resolves the effective overlap. It is never negative and always strictly
     * smaller than {@code chunkSize}; otherwise the stride would become zero or negative.
     */
    private int resolveChunkOverlap(RagChunkCommand command, int chunkSize) {
        Integer requested = command == null ? null : command.getChunkOverlap();
        if (requested == null || requested < 0) {
            return 0;
        }
        int ceiling = Math.max(0, chunkSize - 1);
        return Math.min(requested, ceiling);
    }

    /**
     * Builds only the basic chunk object; extended fields such as summary or
     * vector are left to be filled in later. Store and document ids are copied
     * only when a document is present.
     */
    private RagChunk buildChunk(RagDocument document, int index, String content) {
        RagChunk built = new RagChunk();
        if (document != null) {
            built.setStoreId(document.getStoreId());
            built.setDocumentId(document.getId());
        }
        built.setChunkIndex(index);
        built.setChunkContent(content);
        built.setEnabled(Boolean.TRUE);
        return built;
    }
}

View File

@@ -0,0 +1,50 @@
package com.bruce.rag.parse;
import com.bruce.rag.entity.RagChunk;
import com.bruce.rag.enums.RagChunkStrategyEnum;
import org.junit.jupiter.api.Test;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertThrows;
/**
 * Tests for {@link ChunkerFactory}: resolving a registered strategy and rejecting
 * one that no chunker supports.
 */
class ChunkerFactoryTests {

    @Test
    void resolveShouldReturnMatchingChunker() {
        Chunker fixedLength = new StubChunker(RagChunkStrategyEnum.FIXED_LENGTH);
        Chunker delimiter = new StubChunker(RagChunkStrategyEnum.DELIMITER);
        ChunkerFactory factory = new ChunkerFactory(List.of(fixedLength, delimiter));

        assertSame(fixedLength, factory.resolve(RagChunkStrategyEnum.FIXED_LENGTH));
    }

    @Test
    void resolveShouldRejectUnsupportedStrategy() {
        List<Chunker> onlyFixedLength = List.of(new StubChunker(RagChunkStrategyEnum.FIXED_LENGTH));
        ChunkerFactory factory = new ChunkerFactory(onlyFixedLength);

        assertThrows(IllegalArgumentException.class, () -> factory.resolve(RagChunkStrategyEnum.SEMANTIC));
    }

    /** Minimal chunker that supports exactly one strategy and produces no chunks. */
    private static class StubChunker implements Chunker {
        private final RagChunkStrategyEnum supported;

        private StubChunker(RagChunkStrategyEnum supported) {
            this.supported = supported;
        }

        @Override
        public boolean supports(RagChunkStrategyEnum strategy) {
            return strategy == supported;
        }

        @Override
        public List<RagChunk> chunk(RagChunkCommand command) {
            return List.of();
        }
    }
}

View File

@@ -0,0 +1,64 @@
package com.bruce.rag.parse;
import com.bruce.common.document.parse.DocumentParseResult;
import com.bruce.rag.entity.RagChunk;
import com.bruce.rag.entity.RagDocument;
import com.bruce.rag.enums.RagChunkStrategyEnum;
import com.bruce.rag.parse.impl.DelimiterChunker;
import org.junit.jupiter.api.Test;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Tests for {@link DelimiterChunker}: splitting on a literal delimiter while
 * skipping blank segments, and returning nothing for blank text.
 */
class DelimiterChunkerTests {

    @Test
    void chunkShouldSplitByDelimiterAndIgnoreBlankSegments() {
        DelimiterChunker chunker = new DelimiterChunker();
        RagChunkCommand command = new RagChunkCommand();
        command.setDocument(buildDocument());
        command.setParseResult(buildParseResult("第一段。第二段。。第三段"));
        command.setChunkStrategy(RagChunkStrategyEnum.DELIMITER.getValue());
        // The delimiter must be the separator used in the text above; an empty
        // delimiter makes the chunker return no chunks at all.
        command.setDelimiter("。");

        List<RagChunk> chunks = chunker.chunk(command);

        // The doubled delimiter yields an empty segment, which must be skipped.
        assertEquals(3, chunks.size());
        assertEquals("第一段", chunks.get(0).getChunkContent());
        assertEquals("第二段", chunks.get(1).getChunkContent());
        assertEquals("第三段", chunks.get(2).getChunkContent());
        assertEquals(0, chunks.get(0).getChunkIndex());
        assertEquals(1, chunks.get(1).getChunkIndex());
        assertEquals(2, chunks.get(2).getChunkIndex());
        assertEquals(66L, chunks.get(0).getDocumentId());
        assertEquals(55L, chunks.get(0).getStoreId());
        assertTrue(Boolean.TRUE.equals(chunks.get(0).getEnabled()));
    }

    @Test
    void chunkShouldReturnEmptyListForBlankText() {
        DelimiterChunker chunker = new DelimiterChunker();
        RagChunkCommand command = new RagChunkCommand();
        command.setDocument(buildDocument());
        command.setParseResult(buildParseResult(" "));
        command.setChunkStrategy(RagChunkStrategyEnum.DELIMITER.getValue());
        // Use a valid delimiter so the empty result is caused by the blank text
        // alone, not by a rejected delimiter.
        command.setDelimiter("。");

        assertTrue(chunker.chunk(command).isEmpty());
    }

    /** Builds a document with fixed ids so chunk ownership can be asserted. */
    private static RagDocument buildDocument() {
        RagDocument document = new RagDocument();
        document.setId(66L);
        document.setStoreId(55L);
        return document;
    }

    /** Builds a parse result carrying the given text and its length. */
    private static DocumentParseResult buildParseResult(String text) {
        DocumentParseResult result = new DocumentParseResult();
        result.setText(text);
        result.setTextLength(text.length());
        return result;
    }
}

View File

@@ -0,0 +1,69 @@
package com.bruce.rag.parse;
import com.bruce.common.document.parse.DocumentParseResult;
import com.bruce.rag.entity.RagChunk;
import com.bruce.rag.entity.RagDocument;
import com.bruce.rag.enums.RagChunkStrategyEnum;
import com.bruce.rag.parse.impl.FixedLengthChunker;
import org.junit.jupiter.api.Test;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Tests for {@link FixedLengthChunker}: windowing with overlap, and returning
 * nothing for blank text.
 */
class FixedLengthChunkerTests {

    @Test
    void chunkShouldSplitTextByChunkSizeAndOverlap() {
        RagChunkCommand command = newCommand("abcdefghij");

        List<RagChunk> produced = new FixedLengthChunker().chunk(command);

        // Size 4 with overlap 1 gives a stride of 3: windows start at 0, 3 and 6.
        assertEquals(3, produced.size());
        RagChunk first = produced.get(0);
        RagChunk second = produced.get(1);
        RagChunk third = produced.get(2);
        assertEquals("abcd", first.getChunkContent());
        assertEquals("defg", second.getChunkContent());
        assertEquals("ghij", third.getChunkContent());
        assertEquals(0, first.getChunkIndex());
        assertEquals(1, second.getChunkIndex());
        assertEquals(2, third.getChunkIndex());
        assertEquals(99L, first.getDocumentId());
        assertEquals(88L, first.getStoreId());
        assertTrue(Boolean.TRUE.equals(first.getEnabled()));
    }

    @Test
    void chunkShouldReturnEmptyListForBlankText() {
        RagChunkCommand command = newCommand(" ");

        List<RagChunk> produced = new FixedLengthChunker().chunk(command);

        assertTrue(produced.isEmpty());
    }

    /** Builds a fixed-length command (size 4, overlap 1) over the given text. */
    private static RagChunkCommand newCommand(String text) {
        RagChunkCommand command = new RagChunkCommand();
        command.setDocument(buildDocument());
        command.setParseResult(buildParseResult(text));
        command.setChunkStrategy(RagChunkStrategyEnum.FIXED_LENGTH.getValue());
        command.setChunkSize(4);
        command.setChunkOverlap(1);
        return command;
    }

    /** Builds a document with fixed ids so chunk ownership can be asserted. */
    private static RagDocument buildDocument() {
        RagDocument doc = new RagDocument();
        doc.setId(99L);
        doc.setStoreId(88L);
        return doc;
    }

    /** Builds a parse result carrying the given text and its length. */
    private static DocumentParseResult buildParseResult(String text) {
        DocumentParseResult parsed = new DocumentParseResult();
        parsed.setText(text);
        parsed.setTextLength(text.length());
        return parsed;
    }
}