From bd8bfeb607832ae9016dfdce7e62303896a3b933 Mon Sep 17 00:00:00 2001 From: bruce Date: Sun, 24 May 2026 21:11:27 +0800 Subject: [PATCH] =?UTF-8?q?feat(rag):=20=E5=A2=9E=E5=8A=A0=E6=96=87?= =?UTF-8?q?=E6=A1=A3=E5=88=87=E7=89=87=E5=B7=A5=E5=8E=82=E5=9F=BA=E7=A1=80?= =?UTF-8?q?=E5=B1=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2026-05-24-rag-chunker-foundation.md | 463 ++++++++++++++++++ .../java/com/bruce/rag/parse/Chunker.java | 26 + .../com/bruce/rag/parse/ChunkerFactory.java | 32 ++ .../com/bruce/rag/parse/RagChunkCommand.java | 45 ++ .../rag/parse/impl/DelimiterChunker.java | 68 +++ .../rag/parse/impl/FixedLengthChunker.java | 91 ++++ .../bruce/rag/parse/ChunkerFactoryTests.java | 50 ++ .../rag/parse/DelimiterChunkerTests.java | 64 +++ .../rag/parse/FixedLengthChunkerTests.java | 69 +++ 9 files changed, 908 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-24-rag-chunker-foundation.md create mode 100644 src/main/java/com/bruce/rag/parse/Chunker.java create mode 100644 src/main/java/com/bruce/rag/parse/ChunkerFactory.java create mode 100644 src/main/java/com/bruce/rag/parse/RagChunkCommand.java create mode 100644 src/main/java/com/bruce/rag/parse/impl/DelimiterChunker.java create mode 100644 src/main/java/com/bruce/rag/parse/impl/FixedLengthChunker.java create mode 100644 src/test/java/com/bruce/rag/parse/ChunkerFactoryTests.java create mode 100644 src/test/java/com/bruce/rag/parse/DelimiterChunkerTests.java create mode 100644 src/test/java/com/bruce/rag/parse/FixedLengthChunkerTests.java diff --git a/docs/superpowers/plans/2026-05-24-rag-chunker-foundation.md b/docs/superpowers/plans/2026-05-24-rag-chunker-foundation.md new file mode 100644 index 0000000..0752369 --- /dev/null +++ b/docs/superpowers/plans/2026-05-24-rag-chunker-foundation.md @@ -0,0 +1,463 @@ +# RAG Chunker Foundation Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build the standalone chunking foundation for the RAG module with a factory, fixed-length and delimiter implementations, and focused unit tests. + +**Architecture:** Add a dedicated chunking abstraction under `com.bruce.rag.parse` so parsing and chunk generation stay decoupled. The factory will resolve a `RagChunkStrategyEnum` to a `Chunker` implementation, and each implementation will convert a command object into in-memory `RagChunk` entities without touching persistence. + +**Tech Stack:** Java 21, Spring Boot, MyBatis-Plus entities, JUnit 5, Mockito + +--- + +### Task 1: Define Chunking Contracts + +**Files:** +- Create: `src/main/java/com/bruce/rag/parse/RagChunkCommand.java` +- Create: `src/main/java/com/bruce/rag/parse/Chunker.java` +- Create: `src/main/java/com/bruce/rag/parse/ChunkerFactory.java` +- Test: `src/test/java/com/bruce/rag/parse/ChunkerFactoryTests.java` + +- [ ] **Step 1: Write the failing test** + +```java +package com.bruce.rag.parse; + +import com.bruce.rag.entity.RagDocument; +import com.bruce.rag.entity.RagChunk; +import com.bruce.rag.enums.RagChunkStrategyEnum; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertThrows; + +class ChunkerFactoryTests { + + @Test + void resolveShouldReturnMatchingChunker() { + Chunker supported = new StubChunker(RagChunkStrategyEnum.FIXED_LENGTH); + Chunker unsupported = new StubChunker(RagChunkStrategyEnum.DELIMITER); + ChunkerFactory factory = new ChunkerFactory(List.of(supported, unsupported)); + + Chunker resolved = factory.resolve(RagChunkStrategyEnum.FIXED_LENGTH); + + assertSame(supported, resolved); + } + + @Test + void resolveShouldRejectUnsupportedStrategy() { + ChunkerFactory factory = new ChunkerFactory(List.of(new StubChunker(RagChunkStrategyEnum.FIXED_LENGTH))); + + assertThrows(IllegalArgumentException.class, () -> factory.resolve(RagChunkStrategyEnum.SEMANTIC)); + } + + private static class StubChunker implements Chunker { + + private final RagChunkStrategyEnum strategy; + + private StubChunker(RagChunkStrategyEnum strategy) { + this.strategy = strategy; + } + + @Override + public boolean supports(RagChunkStrategyEnum strategy) { + return this.strategy == strategy; + } + + @Override + public List chunk(RagChunkCommand command) { + return List.of(); + } + } +} +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `mvn -Dtest=ChunkerFactoryTests test` +Expected: FAIL with compilation errors because `Chunker`, `ChunkerFactory`, and `RagChunkCommand` do not exist yet + +- [ ] **Step 3: Write minimal implementation** + +```java +package com.bruce.rag.parse; + +import com.bruce.common.document.parse.DocumentParseResult; +import com.bruce.rag.entity.RagDocument; +import lombok.Data; + +@Data +public class RagChunkCommand { + + private RagDocument document; + + private DocumentParseResult parseResult; + + private String chunkStrategy; + + private Integer chunkSize; + + private Integer chunkOverlap; + + private String delimiter; +} +``` + +```java +package com.bruce.rag.parse; + +import com.bruce.rag.entity.RagChunk; +import com.bruce.rag.enums.RagChunkStrategyEnum; + +import java.util.List; + +public interface Chunker { + + boolean supports(RagChunkStrategyEnum strategy); + + List chunk(RagChunkCommand command); +} +``` + +```java +package com.bruce.rag.parse; + +import com.bruce.rag.enums.RagChunkStrategyEnum; +import org.springframework.stereotype.Component; + +import java.util.List; + +@Component +public class ChunkerFactory { + + private final List chunkers; + + public ChunkerFactory(List chunkers) { + this.chunkers = chunkers; + } + + public Chunker resolve(RagChunkStrategyEnum strategy) { + return chunkers.stream() + .filter(chunker -> chunker.supports(strategy)) + .findFirst() + .orElseThrow(() -> new IllegalArgumentException("不支持的切片方式: " + strategy)); + } +} +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `mvn -Dtest=ChunkerFactoryTests test` +Expected: PASS + +- [ ] **Step 5: Commit** + +```bash +git add src/main/java/com/bruce/rag/parse/RagChunkCommand.java src/main/java/com/bruce/rag/parse/Chunker.java src/main/java/com/bruce/rag/parse/ChunkerFactory.java src/test/java/com/bruce/rag/parse/ChunkerFactoryTests.java +git commit -m "feat: add rag chunker contracts" +``` + +### Task 2: Add Fixed-Length Chunker + +**Files:** +- Create: `src/main/java/com/bruce/rag/parse/impl/FixedLengthChunker.java` +- Test: `src/test/java/com/bruce/rag/parse/FixedLengthChunkerTests.java` + +- [ ] **Step 1: Write the failing test** + +```java +package com.bruce.rag.parse; + +import com.bruce.common.document.parse.DocumentParseResult; +import com.bruce.rag.entity.RagChunk; +import com.bruce.rag.entity.RagDocument; +import com.bruce.rag.parse.impl.FixedLengthChunker; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class FixedLengthChunkerTests { + + @Test + void chunkShouldSplitTextByChunkSizeAndOverlap() { + FixedLengthChunker chunker = new FixedLengthChunker(); + + RagChunkCommand command = new RagChunkCommand(); + command.setDocument(buildDocument()); + command.setParseResult(buildParseResult("abcdefghij")); + command.setChunkStrategy("FIXED_LENGTH"); + command.setChunkSize(4); + command.setChunkOverlap(1); + + List chunks = chunker.chunk(command); + + assertEquals(3, chunks.size()); + assertEquals("abcd", chunks.get(0).getChunkContent()); + assertEquals("defg", chunks.get(1).getChunkContent()); + assertEquals("ghij", chunks.get(2).getChunkContent()); + assertEquals(0, chunks.get(0).getChunkIndex()); + assertEquals(1, chunks.get(1).getChunkIndex()); + assertEquals(2, chunks.get(2).getChunkIndex()); + assertEquals(99L, chunks.get(0).getDocumentId()); + assertEquals(88L, chunks.get(0).getStoreId()); + assertTrue(Boolean.TRUE.equals(chunks.get(0).getEnabled())); + } + + @Test + void chunkShouldReturnEmptyListForBlankText() { + FixedLengthChunker chunker = new FixedLengthChunker(); + + RagChunkCommand command = new RagChunkCommand(); + command.setDocument(buildDocument()); + command.setParseResult(buildParseResult(" ")); + command.setChunkStrategy("FIXED_LENGTH"); + command.setChunkSize(4); + command.setChunkOverlap(1); + + assertTrue(chunker.chunk(command).isEmpty()); + } + + private static RagDocument buildDocument() { + RagDocument document = new RagDocument(); + document.setId(99L); + document.setStoreId(88L); + return document; + } + + private static DocumentParseResult buildParseResult(String text) { + DocumentParseResult result = new DocumentParseResult(); + result.setText(text); + result.setTextLength(text.length()); + return result; + } +} +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `mvn -Dtest=FixedLengthChunkerTests test` +Expected: FAIL with compilation errors because `FixedLengthChunker` does not exist yet + +- [ ] **Step 3: Write minimal implementation** + +```java +package com.bruce.rag.parse.impl; + +import com.bruce.common.document.parse.DocumentParseResult; +import com.bruce.rag.entity.RagChunk; +import com.bruce.rag.entity.RagDocument; +import com.bruce.rag.enums.RagChunkStrategyEnum; +import com.bruce.rag.parse.Chunker; +import com.bruce.rag.parse.RagChunkCommand; +import org.springframework.stereotype.Component; +import org.springframework.util.StringUtils; + +import java.util.ArrayList; +import java.util.List; + +@Component +public class FixedLengthChunker implements Chunker { + + @Override + public boolean supports(RagChunkStrategyEnum strategy) { + return RagChunkStrategyEnum.FIXED_LENGTH == strategy; + } + + @Override + public List chunk(RagChunkCommand command) { + DocumentParseResult parseResult = command.getParseResult(); + String text = parseResult == null ? null : parseResult.getText(); + if (!StringUtils.hasText(text)) { + return List.of(); + } + + int chunkSize = command.getChunkSize() == null ? text.length() : command.getChunkSize(); + int overlap = command.getChunkOverlap() == null ? 0 : command.getChunkOverlap(); + int step = Math.max(1, chunkSize - overlap); + List chunks = new ArrayList<>(); + for (int start = 0, index = 0; start < text.length(); start += step, index++) { + int end = Math.min(text.length(), start + chunkSize); + chunks.add(buildChunk(command.getDocument(), index, text.substring(start, end))); + if (end >= text.length()) { + break; + } + } + return chunks; + } + + private RagChunk buildChunk(RagDocument document, int index, String content) { + RagChunk chunk = new RagChunk(); + chunk.setStoreId(document.getStoreId()); + chunk.setDocumentId(document.getId()); + chunk.setChunkIndex(index); + chunk.setChunkContent(content); + chunk.setEnabled(Boolean.TRUE); + return chunk; + } +} +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `mvn -Dtest=FixedLengthChunkerTests test` +Expected: PASS + +- [ ] **Step 5: Commit** + +```bash +git add src/main/java/com/bruce/rag/parse/impl/FixedLengthChunker.java src/test/java/com/bruce/rag/parse/FixedLengthChunkerTests.java +git commit -m "feat: add fixed length rag chunker" +``` + +### Task 3: Add Delimiter Chunker + +**Files:** +- Create: `src/main/java/com/bruce/rag/parse/impl/DelimiterChunker.java` +- Test: `src/test/java/com/bruce/rag/parse/DelimiterChunkerTests.java` + +- [ ] **Step 1: Write the failing test** + +```java +package com.bruce.rag.parse; + +import com.bruce.common.document.parse.DocumentParseResult; +import com.bruce.rag.entity.RagChunk; +import com.bruce.rag.entity.RagDocument; +import com.bruce.rag.parse.impl.DelimiterChunker; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class DelimiterChunkerTests { + + @Test + void chunkShouldSplitByDelimiterAndIgnoreBlankSegments() { + DelimiterChunker chunker = new DelimiterChunker(); + + RagChunkCommand command = new RagChunkCommand(); + command.setDocument(buildDocument()); + command.setParseResult(buildParseResult("第一段。第二段。。第三段")); + command.setChunkStrategy("DELIMITER"); + command.setDelimiter("。"); + + List chunks = chunker.chunk(command); + + assertEquals(3, chunks.size()); + assertEquals("第一段", chunks.get(0).getChunkContent()); + assertEquals("第二段", chunks.get(1).getChunkContent()); + assertEquals("第三段", chunks.get(2).getChunkContent()); + assertEquals(0, chunks.get(0).getChunkIndex()); + assertEquals(1, chunks.get(1).getChunkIndex()); + assertEquals(2, chunks.get(2).getChunkIndex()); + } + + @Test + void chunkShouldReturnEmptyListForBlankText() { + DelimiterChunker chunker = new DelimiterChunker(); + + RagChunkCommand command = new RagChunkCommand(); + command.setDocument(buildDocument()); + command.setParseResult(buildParseResult(" ")); + command.setChunkStrategy("DELIMITER"); + command.setDelimiter("。"); + + assertTrue(chunker.chunk(command).isEmpty()); + } + + private static RagDocument buildDocument() { + RagDocument document = new RagDocument(); + document.setId(66L); + document.setStoreId(55L); + return document; + } + + private static DocumentParseResult buildParseResult(String text) { + DocumentParseResult result = new DocumentParseResult(); + result.setText(text); + result.setTextLength(text.length()); + return result; + } +} +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `mvn -Dtest=DelimiterChunkerTests test` +Expected: FAIL with compilation errors because `DelimiterChunker` does not exist yet + +- [ ] **Step 3: Write minimal implementation** + +```java +package com.bruce.rag.parse.impl; + +import com.bruce.common.document.parse.DocumentParseResult; +import com.bruce.rag.entity.RagChunk; +import com.bruce.rag.entity.RagDocument; +import com.bruce.rag.enums.RagChunkStrategyEnum; +import com.bruce.rag.parse.Chunker; +import com.bruce.rag.parse.RagChunkCommand; +import org.springframework.stereotype.Component; +import org.springframework.util.StringUtils; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +@Component +public class DelimiterChunker implements Chunker { + + @Override + public boolean supports(RagChunkStrategyEnum strategy) { + return RagChunkStrategyEnum.DELIMITER == strategy; + } + + @Override + public List chunk(RagChunkCommand command) { + DocumentParseResult parseResult = command.getParseResult(); + String text = parseResult == null ? null : parseResult.getText(); + if (!StringUtils.hasText(text) || !StringUtils.hasText(command.getDelimiter())) { + return List.of(); + } + + String[] parts = text.split(Pattern.quote(command.getDelimiter())); + List chunks = new ArrayList<>(); + for (String part : parts) { + if (!StringUtils.hasText(part)) { + continue; + } + chunks.add(buildChunk(command.getDocument(), chunks.size(), part.trim())); + } + return chunks; + } + + private RagChunk buildChunk(RagDocument document, int index, String content) { + RagChunk chunk = new RagChunk(); + chunk.setStoreId(document.getStoreId()); + chunk.setDocumentId(document.getId()); + chunk.setChunkIndex(index); + chunk.setChunkContent(content); + chunk.setEnabled(Boolean.TRUE); + return chunk; + } +} +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `mvn -Dtest=DelimiterChunkerTests test` +Expected: PASS + +- [ ] **Step 5: Commit** + +```bash +git add src/main/java/com/bruce/rag/parse/impl/DelimiterChunker.java src/test/java/com/bruce/rag/parse/DelimiterChunkerTests.java +git commit -m "feat: add delimiter rag chunker" +``` diff --git a/src/main/java/com/bruce/rag/parse/Chunker.java b/src/main/java/com/bruce/rag/parse/Chunker.java new file mode 100644 index 0000000..7c95e32 --- /dev/null +++ b/src/main/java/com/bruce/rag/parse/Chunker.java @@ -0,0 +1,26 @@ +package com.bruce.rag.parse; + +import com.bruce.rag.entity.RagChunk; +import com.bruce.rag.enums.RagChunkStrategyEnum; + +import java.util.List; + +/** + * 切片策略统一接口。 + *

+ * 这里的职责只有两个: + * 1. 告诉工厂自己支持哪一种切片策略 + * 2. 根据切片命令生成切片结果 + */ +public interface Chunker { + + /** + * 判断当前实现是否支持指定的切片策略。 + */ + boolean supports(RagChunkStrategyEnum strategy); + + /** + * 执行切片,返回内存中的切片对象列表。 + */ + List chunk(RagChunkCommand command); +} diff --git a/src/main/java/com/bruce/rag/parse/ChunkerFactory.java b/src/main/java/com/bruce/rag/parse/ChunkerFactory.java new file mode 100644 index 0000000..844cd70 --- /dev/null +++ b/src/main/java/com/bruce/rag/parse/ChunkerFactory.java @@ -0,0 +1,32 @@ +package com.bruce.rag.parse; + +import com.bruce.rag.enums.RagChunkStrategyEnum; +import org.springframework.stereotype.Component; + +import java.util.List; + +@Component +/** + * 切片策略工厂。 + *

+ * Spring 会把所有实现了 {@link Chunker} 的 Bean 注入进来, + * 工厂再根据切片策略挑出对应实现,避免业务层自己写 if-else 或 switch。 + */ +public class ChunkerFactory { + + private final List chunkers; + + public ChunkerFactory(List chunkers) { + this.chunkers = chunkers; + } + + /** + * 根据切片策略解析出具体的切片器实现。 + */ + public Chunker resolve(RagChunkStrategyEnum strategy) { + return chunkers.stream() + .filter(chunker -> chunker.supports(strategy)) + .findFirst() + .orElseThrow(() -> new IllegalArgumentException("不支持的切片方式: " + strategy)); + } +} diff --git a/src/main/java/com/bruce/rag/parse/RagChunkCommand.java b/src/main/java/com/bruce/rag/parse/RagChunkCommand.java new file mode 100644 index 0000000..ba13fdd --- /dev/null +++ b/src/main/java/com/bruce/rag/parse/RagChunkCommand.java @@ -0,0 +1,45 @@ +package com.bruce.rag.parse; + +import com.bruce.common.document.parse.DocumentParseResult; +import com.bruce.rag.entity.RagDocument; +import lombok.Data; + +@Data +/** + * 一次切片请求的上下文参数。 + *

+ * 这里把文档信息、解析结果和切片配置收拢到一个对象里, + * 这样切片器接口不会因为参数越来越多而变得难维护。 + */ +public class RagChunkCommand { + + /** + * 当前要切片的文档实体。 + */ + private RagDocument document; + + /** + * 文档解析后的文本结果。 + */ + private DocumentParseResult parseResult; + + /** + * 切片策略枚举值,通常来自前端请求。 + */ + private Integer chunkStrategy; + + /** + * 单个切片的目标长度,主要给定长切片使用。 + */ + private Integer chunkSize; + + /** + * 相邻切片之间的重叠长度,主要给定长切片使用。 + */ + private Integer chunkOverlap; + + /** + * 自定义分隔符,主要给分隔符切片使用。 + */ + private String delimiter; +} diff --git a/src/main/java/com/bruce/rag/parse/impl/DelimiterChunker.java b/src/main/java/com/bruce/rag/parse/impl/DelimiterChunker.java new file mode 100644 index 0000000..0560b42 --- /dev/null +++ b/src/main/java/com/bruce/rag/parse/impl/DelimiterChunker.java @@ -0,0 +1,68 @@ +package com.bruce.rag.parse.impl; + +import com.bruce.common.document.parse.DocumentParseResult; +import com.bruce.rag.entity.RagChunk; +import com.bruce.rag.entity.RagDocument; +import com.bruce.rag.enums.RagChunkStrategyEnum; +import com.bruce.rag.parse.Chunker; +import com.bruce.rag.parse.RagChunkCommand; +import org.springframework.stereotype.Component; +import org.springframework.util.StringUtils; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +@Component +/** + * 分隔符切片实现。 + *

+ * 先按外部传入的 delimiter 拆分文本,再过滤空片段,生成顺序切片。 + */ +public class DelimiterChunker implements Chunker { + + @Override + public boolean supports(RagChunkStrategyEnum strategy) { + return RagChunkStrategyEnum.DELIMITER == strategy; + } + + @Override + public List chunk(RagChunkCommand command) { + String text = extractText(command); + String delimiter = command == null ? null : command.getDelimiter(); + if (!StringUtils.hasText(text) || !StringUtils.hasText(delimiter)) { + return List.of(); + } + + // 使用 Pattern.quote 处理正则特殊字符,确保分隔符按字面值切分。 + String[] parts = text.split(Pattern.quote(delimiter)); + List chunks = new ArrayList<>(); + for (String part : parts) { + if (!StringUtils.hasText(part)) { + continue; + } + chunks.add(buildChunk(command.getDocument(), chunks.size(), part.trim())); + } + return chunks; + } + + private String extractText(RagChunkCommand command) { + DocumentParseResult parseResult = command == null ? null : command.getParseResult(); + return parseResult == null ? null : parseResult.getText(); + } + + /** + * 分隔符切片同样只负责生成基础切片结构,不处理持久化和向量化。 + */ + private RagChunk buildChunk(RagDocument document, int index, String content) { + RagChunk chunk = new RagChunk(); + if (document != null) { + chunk.setStoreId(document.getStoreId()); + chunk.setDocumentId(document.getId()); + } + chunk.setChunkIndex(index); + chunk.setChunkContent(content); + chunk.setEnabled(Boolean.TRUE); + return chunk; + } +} diff --git a/src/main/java/com/bruce/rag/parse/impl/FixedLengthChunker.java b/src/main/java/com/bruce/rag/parse/impl/FixedLengthChunker.java new file mode 100644 index 0000000..a7c16e9 --- /dev/null +++ b/src/main/java/com/bruce/rag/parse/impl/FixedLengthChunker.java @@ -0,0 +1,91 @@ +package com.bruce.rag.parse.impl; + +import com.bruce.common.document.parse.DocumentParseResult; +import com.bruce.rag.entity.RagChunk; +import com.bruce.rag.entity.RagDocument; +import com.bruce.rag.enums.RagChunkStrategyEnum; +import com.bruce.rag.parse.Chunker; +import com.bruce.rag.parse.RagChunkCommand; +import org.springframework.stereotype.Component; +import org.springframework.util.StringUtils; + +import java.util.ArrayList; +import java.util.List; + +@Component +/** + * 定长切片实现。 + *

+ * 按 chunkSize 顺序截取文本,并结合 chunkOverlap 控制相邻切片的重叠部分。 + */ +public class FixedLengthChunker implements Chunker { + + @Override + public boolean supports(RagChunkStrategyEnum strategy) { + return RagChunkStrategyEnum.FIXED_LENGTH == strategy; + } + + @Override + public List chunk(RagChunkCommand command) { + String text = extractText(command); + if (!StringUtils.hasText(text)) { + return List.of(); + } + + int chunkSize = resolveChunkSize(command, text.length()); + int overlap = resolveChunkOverlap(command, chunkSize); + // 实际步长等于切片长度减去重叠长度,最小保证为 1,避免死循环。 + int step = Math.max(1, chunkSize - overlap); + List chunks = new ArrayList<>(); + for (int start = 0; start < text.length(); start += step) { + int end = Math.min(text.length(), start + chunkSize); + chunks.add(buildChunk(command.getDocument(), chunks.size(), text.substring(start, end))); + if (end >= text.length()) { + break; + } + } + return chunks; + } + + private String extractText(RagChunkCommand command) { + DocumentParseResult parseResult = command == null ? null : command.getParseResult(); + return parseResult == null ? null : parseResult.getText(); + } + + /** + * 当未传 chunkSize 或传入非法值时,退化为整段文本一个切片。 + */ + private int resolveChunkSize(RagChunkCommand command, int textLength) { + Integer chunkSize = command == null ? null : command.getChunkSize(); + if (chunkSize == null || chunkSize <= 0) { + return textLength; + } + return chunkSize; + } + + /** + * overlap 不能为负,也不能大于等于 chunkSize,否则步长会变成 0 或负数。 + */ + private int resolveChunkOverlap(RagChunkCommand command, int chunkSize) { + Integer overlap = command == null ? null : command.getChunkOverlap(); + if (overlap == null || overlap < 0) { + return 0; + } + return Math.min(overlap, Math.max(0, chunkSize - 1)); + } + + /** + * 这里只构造最基础的切片对象,后续落库时再补充摘要、向量等扩展字段。 + */ + private RagChunk buildChunk(RagDocument document, int index, String content) { + RagChunk chunk = new RagChunk(); + if (document != null) { + chunk.setStoreId(document.getStoreId()); + chunk.setDocumentId(document.getId()); + } + chunk.setChunkIndex(index); + chunk.setChunkContent(content); + chunk.setEnabled(Boolean.TRUE); + return chunk; + } +} diff --git a/src/test/java/com/bruce/rag/parse/ChunkerFactoryTests.java b/src/test/java/com/bruce/rag/parse/ChunkerFactoryTests.java new file mode 100644 index 0000000..4f1896e --- /dev/null +++ b/src/test/java/com/bruce/rag/parse/ChunkerFactoryTests.java @@ -0,0 +1,50 @@ +package com.bruce.rag.parse; + +import com.bruce.rag.entity.RagChunk; +import com.bruce.rag.enums.RagChunkStrategyEnum; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertThrows; + +class ChunkerFactoryTests { + + @Test + void resolveShouldReturnMatchingChunker() { + Chunker supported = new StubChunker(RagChunkStrategyEnum.FIXED_LENGTH); + Chunker unsupported = new StubChunker(RagChunkStrategyEnum.DELIMITER); + ChunkerFactory factory = new ChunkerFactory(List.of(supported, unsupported)); + + Chunker resolved = factory.resolve(RagChunkStrategyEnum.FIXED_LENGTH); + + assertSame(supported, resolved); + } + + @Test + void resolveShouldRejectUnsupportedStrategy() { + ChunkerFactory factory = new ChunkerFactory(List.of(new StubChunker(RagChunkStrategyEnum.FIXED_LENGTH))); + + assertThrows(IllegalArgumentException.class, () -> factory.resolve(RagChunkStrategyEnum.SEMANTIC)); + } + + private static class StubChunker implements Chunker { + + private final RagChunkStrategyEnum strategy; + + private StubChunker(RagChunkStrategyEnum strategy) { + this.strategy = strategy; + } + + @Override + public boolean supports(RagChunkStrategyEnum strategy) { + return this.strategy == strategy; + } + + @Override + public List chunk(RagChunkCommand command) { + return List.of(); + } + } +} diff --git a/src/test/java/com/bruce/rag/parse/DelimiterChunkerTests.java b/src/test/java/com/bruce/rag/parse/DelimiterChunkerTests.java new file mode 100644 index 0000000..d90145b --- /dev/null +++ b/src/test/java/com/bruce/rag/parse/DelimiterChunkerTests.java @@ -0,0 +1,64 @@ +package com.bruce.rag.parse; + +import com.bruce.common.document.parse.DocumentParseResult; +import com.bruce.rag.entity.RagChunk; +import com.bruce.rag.entity.RagDocument; +import com.bruce.rag.enums.RagChunkStrategyEnum; +import com.bruce.rag.parse.impl.DelimiterChunker; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class DelimiterChunkerTests { + + @Test + void chunkShouldSplitByDelimiterAndIgnoreBlankSegments() { + DelimiterChunker chunker = new DelimiterChunker(); + + RagChunkCommand command = new RagChunkCommand(); + command.setDocument(buildDocument()); + command.setParseResult(buildParseResult("第一段。第二段。。第三段")); + command.setChunkStrategy(RagChunkStrategyEnum.DELIMITER.getValue()); + command.setDelimiter("。"); + + List chunks = chunker.chunk(command); + + assertEquals(3, chunks.size()); + assertEquals("第一段", chunks.get(0).getChunkContent()); + assertEquals("第二段", chunks.get(1).getChunkContent()); + assertEquals("第三段", chunks.get(2).getChunkContent()); + assertEquals(0, chunks.get(0).getChunkIndex()); + assertEquals(1, chunks.get(1).getChunkIndex()); + assertEquals(2, chunks.get(2).getChunkIndex()); + } + + @Test + void chunkShouldReturnEmptyListForBlankText() { + DelimiterChunker chunker = new DelimiterChunker(); + + RagChunkCommand command = new RagChunkCommand(); + command.setDocument(buildDocument()); + command.setParseResult(buildParseResult(" ")); + command.setChunkStrategy(RagChunkStrategyEnum.DELIMITER.getValue()); + command.setDelimiter("。"); + + assertTrue(chunker.chunk(command).isEmpty()); + } + + private static RagDocument buildDocument() { + RagDocument document = new RagDocument(); + document.setId(66L); + document.setStoreId(55L); + return document; + } + + private static DocumentParseResult buildParseResult(String text) { + DocumentParseResult result = new DocumentParseResult(); + result.setText(text); + result.setTextLength(text.length()); + return result; + } +} diff --git a/src/test/java/com/bruce/rag/parse/FixedLengthChunkerTests.java b/src/test/java/com/bruce/rag/parse/FixedLengthChunkerTests.java new file mode 100644 index 0000000..d2ed454 --- /dev/null +++ b/src/test/java/com/bruce/rag/parse/FixedLengthChunkerTests.java @@ -0,0 +1,69 @@ +package com.bruce.rag.parse; + +import com.bruce.common.document.parse.DocumentParseResult; +import com.bruce.rag.entity.RagChunk; +import com.bruce.rag.entity.RagDocument; +import com.bruce.rag.enums.RagChunkStrategyEnum; +import com.bruce.rag.parse.impl.FixedLengthChunker; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class FixedLengthChunkerTests { + + @Test + void chunkShouldSplitTextByChunkSizeAndOverlap() { + FixedLengthChunker chunker = new FixedLengthChunker(); + + RagChunkCommand command = new RagChunkCommand(); + command.setDocument(buildDocument()); + command.setParseResult(buildParseResult("abcdefghij")); + command.setChunkStrategy(RagChunkStrategyEnum.FIXED_LENGTH.getValue()); + command.setChunkSize(4); + command.setChunkOverlap(1); + + List chunks = chunker.chunk(command); + + assertEquals(3, chunks.size()); + assertEquals("abcd", chunks.get(0).getChunkContent()); + assertEquals("defg", chunks.get(1).getChunkContent()); + assertEquals("ghij", chunks.get(2).getChunkContent()); + assertEquals(0, chunks.get(0).getChunkIndex()); + assertEquals(1, chunks.get(1).getChunkIndex()); + assertEquals(2, chunks.get(2).getChunkIndex()); + assertEquals(99L, chunks.get(0).getDocumentId()); + assertEquals(88L, chunks.get(0).getStoreId()); + assertTrue(Boolean.TRUE.equals(chunks.get(0).getEnabled())); + } + + @Test + void chunkShouldReturnEmptyListForBlankText() { + FixedLengthChunker chunker = new FixedLengthChunker(); + + RagChunkCommand command = new RagChunkCommand(); + command.setDocument(buildDocument()); + command.setParseResult(buildParseResult(" ")); + command.setChunkStrategy(RagChunkStrategyEnum.FIXED_LENGTH.getValue()); + command.setChunkSize(4); + command.setChunkOverlap(1); + + assertTrue(chunker.chunk(command).isEmpty()); + } + + private static RagDocument buildDocument() { + RagDocument document = new RagDocument(); + document.setId(99L); + document.setStoreId(88L); + return document; + } + + private static DocumentParseResult buildParseResult(String text) { + DocumentParseResult result = new DocumentParseResult(); + result.setText(text); + result.setTextLength(text.length()); + return result; + } +}