feat(rag): 增加文档切片工厂基础层

This commit is contained in:
2026-05-24 21:11:27 +08:00
parent dfc9847e4d
commit bd8bfeb607
9 changed files with 908 additions and 0 deletions

View File

@@ -0,0 +1,463 @@
# RAG Chunker Foundation Implementation Plan
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
**Goal:** Build the standalone chunking foundation for the RAG module with a factory, fixed-length and delimiter implementations, and focused unit tests.
**Architecture:** Add a dedicated chunking abstraction under `com.bruce.rag.parse` so parsing and chunk generation stay decoupled. The factory will resolve a `RagChunkStrategyEnum` to a `Chunker` implementation, and each implementation will convert a command object into in-memory `RagChunk` entities without touching persistence.
**Tech Stack:** Java 21, Spring Boot, MyBatis-Plus entities, JUnit 5, Mockito
---
### Task 1: Define Chunking Contracts
**Files:**
- Create: `src/main/java/com/bruce/rag/parse/RagChunkCommand.java`
- Create: `src/main/java/com/bruce/rag/parse/Chunker.java`
- Create: `src/main/java/com/bruce/rag/parse/ChunkerFactory.java`
- Test: `src/test/java/com/bruce/rag/parse/ChunkerFactoryTests.java`
- [ ] **Step 1: Write the failing test**
```java
package com.bruce.rag.parse;
import com.bruce.rag.entity.RagDocument;
import com.bruce.rag.entity.RagChunk;
import com.bruce.rag.enums.RagChunkStrategyEnum;
import org.junit.jupiter.api.Test;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertThrows;
/**
 * Unit tests for {@link ChunkerFactory}: resolving a registered strategy and
 * rejecting a strategy that no registered chunker supports.
 */
class ChunkerFactoryTests {

    /** The factory must hand back the very instance whose strategy matches the request. */
    @Test
    void resolveShouldReturnMatchingChunker() {
        Chunker fixedLengthStub = new StubChunker(RagChunkStrategyEnum.FIXED_LENGTH);
        Chunker delimiterStub = new StubChunker(RagChunkStrategyEnum.DELIMITER);
        ChunkerFactory factory = new ChunkerFactory(List.of(fixedLengthStub, delimiterStub));

        assertSame(fixedLengthStub, factory.resolve(RagChunkStrategyEnum.FIXED_LENGTH));
    }

    /** Asking for a strategy with no registered chunker must raise IllegalArgumentException. */
    @Test
    void resolveShouldRejectUnsupportedStrategy() {
        List<Chunker> registered = List.of(new StubChunker(RagChunkStrategyEnum.FIXED_LENGTH));
        ChunkerFactory factory = new ChunkerFactory(registered);

        assertThrows(
                IllegalArgumentException.class,
                () -> factory.resolve(RagChunkStrategyEnum.SEMANTIC));
    }

    /** Minimal chunker that supports exactly one strategy and never produces chunks. */
    private static class StubChunker implements Chunker {

        private final RagChunkStrategyEnum supportedStrategy;

        private StubChunker(RagChunkStrategyEnum supportedStrategy) {
            this.supportedStrategy = supportedStrategy;
        }

        @Override
        public boolean supports(RagChunkStrategyEnum strategy) {
            return strategy == supportedStrategy;
        }

        @Override
        public List<RagChunk> chunk(RagChunkCommand command) {
            return List.of();
        }
    }
}
```
- [ ] **Step 2: Run test to verify it fails**
Run: `mvn -Dtest=ChunkerFactoryTests test`
Expected: FAIL with compilation errors because `Chunker`, `ChunkerFactory`, and `RagChunkCommand` do not exist yet
- [ ] **Step 3: Write minimal implementation**
```java
package com.bruce.rag.parse;
import com.bruce.common.document.parse.DocumentParseResult;
import com.bruce.rag.entity.RagDocument;
import lombok.Data;
/**
 * Input handed to a {@link Chunker}: the document being chunked, its parse
 * result, and the chunking parameters. Accessors are generated by Lombok's
 * {@code @Data}.
 */
@Data
public class RagChunkCommand {
/** Document whose id and store id the chunkers copy onto every generated chunk. */
private RagDocument document;
/** Parse output; the chunkers read its text as the content to split. */
private DocumentParseResult parseResult;
/**
 * Chunking strategy name, e.g. "FIXED_LENGTH" or "DELIMITER".
 * NOTE(review): presumably mirrors the RagChunkStrategyEnum constant names - confirm.
 */
private String chunkStrategy;
/** Chunk length in characters for fixed-length chunking; may be null. */
private Integer chunkSize;
/** Number of characters shared by consecutive fixed-length chunks; may be null. */
private Integer chunkOverlap;
/** Separator used by delimiter chunking; it is matched literally, not as a regex. */
private String delimiter;
}
```
```java
package com.bruce.rag.parse;
import com.bruce.rag.entity.RagChunk;
import com.bruce.rag.enums.RagChunkStrategyEnum;
import java.util.List;
/**
 * Strategy contract for turning a {@link RagChunkCommand} into in-memory
 * {@link RagChunk} entities. One implementation exists per chunking strategy.
 */
public interface Chunker {
/**
 * Tells whether this implementation handles the given strategy.
 *
 * @param strategy the requested chunking strategy
 * @return {@code true} when this chunker should be used for {@code strategy}
 */
boolean supports(RagChunkStrategyEnum strategy);
/**
 * Builds the chunks described by the command.
 *
 * @param command document, parse result and chunking parameters
 * @return the generated chunks
 */
List<RagChunk> chunk(RagChunkCommand command);
}
```
```java
package com.bruce.rag.parse;
import com.bruce.rag.enums.RagChunkStrategyEnum;
import org.springframework.stereotype.Component;
import java.util.List;
/**
 * Looks up the {@link Chunker} responsible for a chunking strategy.
 *
 * <p>Registered as a Spring component; the candidate chunkers are supplied
 * through the constructor and consulted in list order.
 */
@Component
public class ChunkerFactory {

    private final List<Chunker> chunkers;

    /**
     * @param chunkers candidate implementations, consulted in iteration order
     */
    public ChunkerFactory(List<Chunker> chunkers) {
        this.chunkers = chunkers;
    }

    /**
     * Returns the first chunker that reports support for the strategy.
     *
     * @param strategy the requested chunking strategy
     * @return the first matching chunker
     * @throws IllegalArgumentException when no candidate supports the strategy
     */
    public Chunker resolve(RagChunkStrategyEnum strategy) {
        for (Chunker candidate : chunkers) {
            if (candidate.supports(strategy)) {
                return candidate;
            }
        }
        throw new IllegalArgumentException("不支持的切片方式: " + strategy);
    }
}
```
- [ ] **Step 4: Run test to verify it passes**
Run: `mvn -Dtest=ChunkerFactoryTests test`
Expected: PASS
- [ ] **Step 5: Commit**
```bash
git add src/main/java/com/bruce/rag/parse/RagChunkCommand.java src/main/java/com/bruce/rag/parse/Chunker.java src/main/java/com/bruce/rag/parse/ChunkerFactory.java src/test/java/com/bruce/rag/parse/ChunkerFactoryTests.java
git commit -m "feat: add rag chunker contracts"
```
### Task 2: Add Fixed-Length Chunker
**Files:**
- Create: `src/main/java/com/bruce/rag/parse/impl/FixedLengthChunker.java`
- Test: `src/test/java/com/bruce/rag/parse/FixedLengthChunkerTests.java`
- [ ] **Step 1: Write the failing test**
```java
package com.bruce.rag.parse;
import com.bruce.common.document.parse.DocumentParseResult;
import com.bruce.rag.entity.RagChunk;
import com.bruce.rag.entity.RagDocument;
import com.bruce.rag.parse.impl.FixedLengthChunker;
import org.junit.jupiter.api.Test;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Unit tests for {@link FixedLengthChunker}: windowed splitting with overlap
 * and the empty result for blank input.
 */
class FixedLengthChunkerTests {

    /** Ten characters, size 4, overlap 1: windows start at 0, 3 and 6. */
    @Test
    void chunkShouldSplitTextByChunkSizeAndOverlap() {
        List<RagChunk> chunks = new FixedLengthChunker().chunk(buildCommand("abcdefghij"));

        List<String> expectedContents = List.of("abcd", "defg", "ghij");
        assertEquals(expectedContents.size(), chunks.size());
        for (int i = 0; i < expectedContents.size(); i++) {
            assertEquals(expectedContents.get(i), chunks.get(i).getChunkContent());
            assertEquals(i, chunks.get(i).getChunkIndex());
        }

        RagChunk first = chunks.get(0);
        assertEquals(99L, first.getDocumentId());
        assertEquals(88L, first.getStoreId());
        assertTrue(Boolean.TRUE.equals(first.getEnabled()));
    }

    /** Whitespace-only text yields no chunks. */
    @Test
    void chunkShouldReturnEmptyListForBlankText() {
        List<RagChunk> chunks = new FixedLengthChunker().chunk(buildCommand(" "));

        assertTrue(chunks.isEmpty());
    }

    /** Command shared by both tests: FIXED_LENGTH, chunk size 4, overlap 1. */
    private static RagChunkCommand buildCommand(String text) {
        RagChunkCommand command = new RagChunkCommand();
        command.setDocument(buildDocument());
        command.setParseResult(buildParseResult(text));
        command.setChunkStrategy("FIXED_LENGTH");
        command.setChunkSize(4);
        command.setChunkOverlap(1);
        return command;
    }

    /** Document with id 99 in store 88, the values asserted on the produced chunks. */
    private static RagDocument buildDocument() {
        RagDocument document = new RagDocument();
        document.setId(99L);
        document.setStoreId(88L);
        return document;
    }

    /** Parse result carrying the given text and its length. */
    private static DocumentParseResult buildParseResult(String text) {
        DocumentParseResult parsed = new DocumentParseResult();
        parsed.setText(text);
        parsed.setTextLength(text.length());
        return parsed;
    }
}
```
- [ ] **Step 2: Run test to verify it fails**
Run: `mvn -Dtest=FixedLengthChunkerTests test`
Expected: FAIL with compilation errors because `FixedLengthChunker` does not exist yet
- [ ] **Step 3: Write minimal implementation**
```java
package com.bruce.rag.parse.impl;
import com.bruce.common.document.parse.DocumentParseResult;
import com.bruce.rag.entity.RagChunk;
import com.bruce.rag.entity.RagDocument;
import com.bruce.rag.enums.RagChunkStrategyEnum;
import com.bruce.rag.parse.Chunker;
import com.bruce.rag.parse.RagChunkCommand;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import java.util.ArrayList;
import java.util.List;
/**
 * {@link Chunker} that cuts the parsed text into windows of a fixed number of
 * characters, optionally overlapping consecutive windows.
 */
@Component
public class FixedLengthChunker implements Chunker {

    /** @return {@code true} only for {@link RagChunkStrategyEnum#FIXED_LENGTH} */
    @Override
    public boolean supports(RagChunkStrategyEnum strategy) {
        return RagChunkStrategyEnum.FIXED_LENGTH == strategy;
    }

    /**
     * Splits the parsed text into consecutive windows of {@code chunkSize} characters.
     *
     * <p>A {@code null} chunk size means "the whole text in one chunk"; a {@code null}
     * or negative overlap is treated as 0. When the overlap is not smaller than the
     * chunk size the window still advances by one character so the loop terminates.
     *
     * @param command carries the document, the parse result and the size/overlap settings
     * @return the chunks in text order, indexed from 0; an empty list when there is no
     *         parse result or its text is null or blank
     * @throws IllegalArgumentException when the chunk size is zero or negative
     */
    @Override
    public List<RagChunk> chunk(RagChunkCommand command) {
        DocumentParseResult parseResult = command.getParseResult();
        String text = parseResult == null ? null : parseResult.getText();
        if (!StringUtils.hasText(text)) {
            return List.of();
        }

        int length = text.length();
        int chunkSize = command.getChunkSize() == null ? length : command.getChunkSize();
        if (chunkSize <= 0) {
            // A zero size would emit one empty chunk per character; a negative size
            // would make substring() fail with an index error.
            throw new IllegalArgumentException("切片长度必须大于0: " + chunkSize);
        }
        // A negative overlap would push the step past the window and skip text.
        int overlap = command.getChunkOverlap() == null ? 0 : Math.max(0, command.getChunkOverlap());
        int step = Math.max(1, chunkSize - overlap);

        List<RagChunk> chunks = new ArrayList<>();
        for (int start = 0; start < length; start += step) {
            // Written as start + min(...) so a very large chunkSize cannot overflow.
            int end = start + Math.min(chunkSize, length - start);
            chunks.add(buildChunk(command.getDocument(), chunks.size(), text.substring(start, end)));
            if (end == length) {
                // The tail has been emitted; another window would only repeat the overlap.
                break;
            }
        }
        return chunks;
    }

    /** Creates an enabled chunk that inherits the store id and id of the document. */
    private RagChunk buildChunk(RagDocument document, int index, String content) {
        RagChunk chunk = new RagChunk();
        chunk.setStoreId(document.getStoreId());
        chunk.setDocumentId(document.getId());
        chunk.setChunkIndex(index);
        chunk.setChunkContent(content);
        chunk.setEnabled(Boolean.TRUE);
        return chunk;
    }
}
```
- [ ] **Step 4: Run test to verify it passes**
Run: `mvn -Dtest=FixedLengthChunkerTests test`
Expected: PASS
- [ ] **Step 5: Commit**
```bash
git add src/main/java/com/bruce/rag/parse/impl/FixedLengthChunker.java src/test/java/com/bruce/rag/parse/FixedLengthChunkerTests.java
git commit -m "feat: add fixed length rag chunker"
```
### Task 3: Add Delimiter Chunker
**Files:**
- Create: `src/main/java/com/bruce/rag/parse/impl/DelimiterChunker.java`
- Test: `src/test/java/com/bruce/rag/parse/DelimiterChunkerTests.java`
- [ ] **Step 1: Write the failing test**
```java
package com.bruce.rag.parse;
import com.bruce.common.document.parse.DocumentParseResult;
import com.bruce.rag.entity.RagChunk;
import com.bruce.rag.entity.RagDocument;
import com.bruce.rag.parse.impl.DelimiterChunker;
import org.junit.jupiter.api.Test;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Unit tests for {@link DelimiterChunker}: splitting on a literal delimiter,
 * skipping blank segments, and the empty result for blank input.
 */
class DelimiterChunkerTests {

    /** Delimiter that separates the segments of the sample text. */
    private static final String DELIMITER = "。";

    /** The doubled delimiter produces an empty segment, which must not become a chunk. */
    @Test
    void chunkShouldSplitByDelimiterAndIgnoreBlankSegments() {
        DelimiterChunker chunker = new DelimiterChunker();
        RagChunkCommand command = buildCommand("第一段。第二段。。第三段");

        List<RagChunk> chunks = chunker.chunk(command);

        assertEquals(3, chunks.size());
        assertEquals("第一段", chunks.get(0).getChunkContent());
        assertEquals("第二段", chunks.get(1).getChunkContent());
        assertEquals("第三段", chunks.get(2).getChunkContent());
        assertEquals(0, chunks.get(0).getChunkIndex());
        assertEquals(1, chunks.get(1).getChunkIndex());
        assertEquals(2, chunks.get(2).getChunkIndex());
    }

    /** A real delimiter is set so the empty result comes from the blank text, not from a missing delimiter. */
    @Test
    void chunkShouldReturnEmptyListForBlankText() {
        DelimiterChunker chunker = new DelimiterChunker();

        assertTrue(chunker.chunk(buildCommand(" ")).isEmpty());
    }

    /** DELIMITER-strategy command over the given text, split on {@link #DELIMITER}. */
    private static RagChunkCommand buildCommand(String text) {
        RagChunkCommand command = new RagChunkCommand();
        command.setDocument(buildDocument());
        command.setParseResult(buildParseResult(text));
        command.setChunkStrategy("DELIMITER");
        command.setDelimiter(DELIMITER);
        return command;
    }

    /** Document with id 66 in store 55. */
    private static RagDocument buildDocument() {
        RagDocument document = new RagDocument();
        document.setId(66L);
        document.setStoreId(55L);
        return document;
    }

    /** Parse result carrying the given text and its length. */
    private static DocumentParseResult buildParseResult(String text) {
        DocumentParseResult result = new DocumentParseResult();
        result.setText(text);
        result.setTextLength(text.length());
        return result;
    }
}
```
- [ ] **Step 2: Run test to verify it fails**
Run: `mvn -Dtest=DelimiterChunkerTests test`
Expected: FAIL with compilation errors because `DelimiterChunker` does not exist yet
- [ ] **Step 3: Write minimal implementation**
```java
package com.bruce.rag.parse.impl;
import com.bruce.common.document.parse.DocumentParseResult;
import com.bruce.rag.entity.RagChunk;
import com.bruce.rag.entity.RagDocument;
import com.bruce.rag.enums.RagChunkStrategyEnum;
import com.bruce.rag.parse.Chunker;
import com.bruce.rag.parse.RagChunkCommand;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
@Component
public class DelimiterChunker implements Chunker {
@Override
public boolean supports(RagChunkStrategyEnum strategy) {
return RagChunkStrategyEnum.DELIMITER == strategy;
}
@Override
public List<RagChunk> chunk(RagChunkCommand command) {
DocumentParseResult parseResult = command.getParseResult();
String text = parseResult == null ? null : parseResult.getText();
if (!StringUtils.hasText(text) || !StringUtils.hasText(command.getDelimiter())) {
return List.of();
}
String[] parts = text.split(Pattern.quote(command.getDelimiter()));
List<RagChunk> chunks = new ArrayList<>();
for (String part : parts) {
if (!StringUtils.hasText(part)) {
continue;
}
chunks.add(buildChunk(command.getDocument(), chunks.size(), part.trim()));
}
return chunks;
}
private RagChunk buildChunk(RagDocument document, int index, String content) {
RagChunk chunk = new RagChunk();
chunk.setStoreId(document.getStoreId());
chunk.setDocumentId(document.getId());
chunk.setChunkIndex(index);
chunk.setChunkContent(content);
chunk.setEnabled(Boolean.TRUE);
return chunk;
}
}
```
- [ ] **Step 4: Run test to verify it passes**
Run: `mvn -Dtest=DelimiterChunkerTests test`
Expected: PASS
- [ ] **Step 5: Commit**
```bash
git add src/main/java/com/bruce/rag/parse/impl/DelimiterChunker.java src/test/java/com/bruce/rag/parse/DelimiterChunkerTests.java
git commit -m "feat: add delimiter rag chunker"
```