Files
common_agent/docs/superpowers/plans/2026-05-24-rag-chunker-foundation.md

14 KiB

RAG Chunker Foundation Implementation Plan

For agentic workers: REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (- [ ]) syntax for tracking.

Goal: Build the standalone chunking foundation for the RAG module with a factory, fixed-length and delimiter implementations, and focused unit tests.

Architecture: Add a dedicated chunking abstraction under com.bruce.rag.parse so parsing and chunk generation stay decoupled. The factory will resolve a RagChunkStrategyEnum to a Chunker implementation, and each implementation will convert a command object into in-memory RagChunk entities without touching persistence.

Tech Stack: Java 21, Spring Boot, MyBatis-Plus entities, JUnit 5, Mockito


Task 1: Define Chunking Contracts

Files:

  • Create: src/main/java/com/bruce/rag/parse/RagChunkCommand.java

  • Create: src/main/java/com/bruce/rag/parse/Chunker.java

  • Create: src/main/java/com/bruce/rag/parse/ChunkerFactory.java

  • Test: src/test/java/com/bruce/rag/parse/ChunkerFactoryTests.java

  • Step 1: Write the failing test

package com.bruce.rag.parse;

import com.bruce.rag.entity.RagDocument;
import com.bruce.rag.entity.RagChunk;
import com.bruce.rag.enums.RagChunkStrategyEnum;
import org.junit.jupiter.api.Test;

import java.util.List;

import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertThrows;

/**
 * Unit tests for {@link ChunkerFactory} strategy resolution.
 */
class ChunkerFactoryTests {

    /** The factory hands back the exact registered instance whose strategy matches the request. */
    @Test
    void resolveShouldReturnMatchingChunker() {
        Chunker fixedLengthStub = new StubChunker(RagChunkStrategyEnum.FIXED_LENGTH);
        Chunker delimiterStub = new StubChunker(RagChunkStrategyEnum.DELIMITER);
        List<Chunker> registered = List.of(fixedLengthStub, delimiterStub);

        ChunkerFactory factory = new ChunkerFactory(registered);

        assertSame(fixedLengthStub, factory.resolve(RagChunkStrategyEnum.FIXED_LENGTH));
    }

    /** Asking for a strategy that no registered chunker supports is rejected. */
    @Test
    void resolveShouldRejectUnsupportedStrategy() {
        Chunker onlyFixedLength = new StubChunker(RagChunkStrategyEnum.FIXED_LENGTH);
        ChunkerFactory factory = new ChunkerFactory(List.of(onlyFixedLength));

        assertThrows(
                IllegalArgumentException.class,
                () -> factory.resolve(RagChunkStrategyEnum.SEMANTIC));
    }

    /** Minimal {@link Chunker} that supports exactly one strategy and never produces chunks. */
    private static class StubChunker implements Chunker {

        private final RagChunkStrategyEnum supportedStrategy;

        private StubChunker(RagChunkStrategyEnum supportedStrategy) {
            this.supportedStrategy = supportedStrategy;
        }

        @Override
        public boolean supports(RagChunkStrategyEnum strategy) {
            return strategy == supportedStrategy;
        }

        @Override
        public List<RagChunk> chunk(RagChunkCommand command) {
            return List.of();
        }
    }
}
  • Step 2: Run test to verify it fails

Run: mvn -Dtest=ChunkerFactoryTests test Expected: FAIL with compilation errors because Chunker, ChunkerFactory, and RagChunkCommand do not exist yet

  • Step 3: Write minimal implementation
package com.bruce.rag.parse;

import com.bruce.common.document.parse.DocumentParseResult;
import com.bruce.rag.entity.RagDocument;
import lombok.Data;

/**
 * Input passed to {@link Chunker#chunk(RagChunkCommand)}: the document being chunked,
 * its parse result, and the chunking parameters.
 *
 * <p>Accessors are generated by Lombok's {@code @Data}. Which parameters are read depends
 * on the chunker: the fixed-length chunker reads {@code chunkSize} and {@code chunkOverlap},
 * the delimiter chunker reads {@code delimiter}.
 */
@Data
public class RagChunkCommand {

    // Owning document; chunkers copy its id and storeId onto every chunk they build.
    private RagDocument document;

    // Parse output; chunkers split the string returned by getText().
    private DocumentParseResult parseResult;

    // Strategy name, e.g. "FIXED_LENGTH" or "DELIMITER" in the tests.
    // NOTE(review): none of the chunkers in this plan read this field; presumably the caller
    // maps it to RagChunkStrategyEnum before calling ChunkerFactory.resolve - confirm.
    private String chunkStrategy;

    // Chunk length in characters for the fixed-length chunker; null means "the whole text".
    private Integer chunkSize;

    // Characters shared between consecutive fixed-length chunks; null means no overlap.
    private Integer chunkOverlap;

    // Literal (non-regex) separator used by the delimiter chunker.
    private String delimiter;
}
package com.bruce.rag.parse;

import com.bruce.rag.entity.RagChunk;
import com.bruce.rag.enums.RagChunkStrategyEnum;

import java.util.List;

/**
 * Strategy interface for turning parsed document text into {@link RagChunk} entities.
 *
 * <p>{@code ChunkerFactory} selects an implementation by calling {@link #supports} on each
 * registered chunker.
 */
public interface Chunker {

    /**
     * Tells whether this chunker handles the given strategy.
     *
     * @param strategy the requested chunk strategy
     * @return {@code true} if this chunker should be used for {@code strategy}
     */
    boolean supports(RagChunkStrategyEnum strategy);

    /**
     * Builds the chunks for one document.
     *
     * @param command the document, its parse result, and the chunking parameters
     * @return the chunks in document order; the implementations in this package return an
     *         empty list when the parsed text is blank
     */
    List<RagChunk> chunk(RagChunkCommand command);
}
package com.bruce.rag.parse;

import com.bruce.rag.enums.RagChunkStrategyEnum;
import org.springframework.stereotype.Component;

import java.util.List;

/**
 * Resolves the {@link Chunker} responsible for a given {@link RagChunkStrategyEnum}.
 *
 * <p>Chunkers are consulted in the order they were supplied; the first one whose
 * {@link Chunker#supports} returns {@code true} wins.
 */
@Component
public class ChunkerFactory {

    /** Unmodifiable snapshot of the registered chunkers, in registration order. */
    private final List<Chunker> chunkers;

    /**
     * Creates a factory over the given chunkers.
     *
     * @param chunkers every available chunker; must not be {@code null} or contain {@code null}
     * @throws NullPointerException if {@code chunkers} is {@code null} or has a {@code null} element
     */
    public ChunkerFactory(List<Chunker> chunkers) {
        // Snapshot the list so later mutation by the caller cannot change resolution,
        // and so a null list fails here rather than on the first resolve() call.
        this.chunkers = List.copyOf(chunkers);
    }

    /**
     * Finds the chunker that supports the requested strategy.
     *
     * @param strategy the chunk strategy to resolve
     * @return the first registered chunker that supports {@code strategy}
     * @throws IllegalArgumentException if no registered chunker supports {@code strategy}
     */
    public Chunker resolve(RagChunkStrategyEnum strategy) {
        return chunkers.stream()
                .filter(chunker -> chunker.supports(strategy))
                .findFirst()
                .orElseThrow(() -> new IllegalArgumentException("不支持的切片方式: " + strategy));
    }
}
  • Step 4: Run test to verify it passes

Run: mvn -Dtest=ChunkerFactoryTests test Expected: PASS

  • Step 5: Commit
git add src/main/java/com/bruce/rag/parse/RagChunkCommand.java src/main/java/com/bruce/rag/parse/Chunker.java src/main/java/com/bruce/rag/parse/ChunkerFactory.java src/test/java/com/bruce/rag/parse/ChunkerFactoryTests.java
git commit -m "feat: add rag chunker contracts"

Task 2: Add Fixed-Length Chunker

Files:

  • Create: src/main/java/com/bruce/rag/parse/impl/FixedLengthChunker.java

  • Test: src/test/java/com/bruce/rag/parse/FixedLengthChunkerTests.java

  • Step 1: Write the failing test

package com.bruce.rag.parse;

import com.bruce.common.document.parse.DocumentParseResult;
import com.bruce.rag.entity.RagChunk;
import com.bruce.rag.entity.RagDocument;
import com.bruce.rag.parse.impl.FixedLengthChunker;
import org.junit.jupiter.api.Test;

import java.util.List;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
 * Unit tests for {@link FixedLengthChunker}, using a chunk size of 4 and an overlap of 1.
 */
class FixedLengthChunkerTests {

    // JUnit creates a fresh test instance per method, so each test gets its own chunker.
    private final FixedLengthChunker chunker = new FixedLengthChunker();

    /** Ten characters with size 4 / overlap 1 advance by 3 and yield three overlapping chunks. */
    @Test
    void chunkShouldSplitTextByChunkSizeAndOverlap() {
        List<RagChunk> chunks = chunker.chunk(commandFor("abcdefghij"));

        List<String> expectedContents = List.of("abcd", "defg", "ghij");
        assertEquals(expectedContents.size(), chunks.size());
        for (int i = 0; i < expectedContents.size(); i++) {
            RagChunk chunk = chunks.get(i);
            assertEquals(expectedContents.get(i), chunk.getChunkContent());
            assertEquals(i, chunk.getChunkIndex());
        }

        RagChunk first = chunks.get(0);
        assertEquals(99L, first.getDocumentId());
        assertEquals(88L, first.getStoreId());
        assertEquals(Boolean.TRUE, first.getEnabled());
    }

    /** Whitespace-only text produces no chunks at all. */
    @Test
    void chunkShouldReturnEmptyListForBlankText() {
        assertTrue(chunker.chunk(commandFor("   ")).isEmpty());
    }

    /** Builds a FIXED_LENGTH command (size 4, overlap 1) for document 99 in store 88. */
    private static RagChunkCommand commandFor(String text) {
        RagDocument document = new RagDocument();
        document.setId(99L);
        document.setStoreId(88L);

        DocumentParseResult parseResult = new DocumentParseResult();
        parseResult.setText(text);
        parseResult.setTextLength(text.length());

        RagChunkCommand command = new RagChunkCommand();
        command.setDocument(document);
        command.setParseResult(parseResult);
        command.setChunkStrategy("FIXED_LENGTH");
        command.setChunkSize(4);
        command.setChunkOverlap(1);
        return command;
    }
}
  • Step 2: Run test to verify it fails

Run: mvn -Dtest=FixedLengthChunkerTests test Expected: FAIL with compilation errors because FixedLengthChunker does not exist yet

  • Step 3: Write minimal implementation
package com.bruce.rag.parse.impl;

import com.bruce.common.document.parse.DocumentParseResult;
import com.bruce.rag.entity.RagChunk;
import com.bruce.rag.entity.RagDocument;
import com.bruce.rag.enums.RagChunkStrategyEnum;
import com.bruce.rag.parse.Chunker;
import com.bruce.rag.parse.RagChunkCommand;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;

import java.util.ArrayList;
import java.util.List;

/**
 * {@link Chunker} that slices the parsed text into windows of a fixed number of characters,
 * optionally overlapping consecutive windows.
 */
@Component
public class FixedLengthChunker implements Chunker {

    /**
     * @param strategy the requested chunk strategy
     * @return {@code true} only for {@link RagChunkStrategyEnum#FIXED_LENGTH}
     */
    @Override
    public boolean supports(RagChunkStrategyEnum strategy) {
        return RagChunkStrategyEnum.FIXED_LENGTH == strategy;
    }

    /**
     * Splits the parsed text into chunks of at most {@code chunkSize} characters.
     *
     * <p>A {@code null} chunk size means "the whole text" and a {@code null} overlap means 0.
     * Each window starts {@code chunkSize - chunkOverlap} characters after the previous one;
     * if the overlap is not smaller than the chunk size the window still advances by one
     * character so the loop always terminates.
     *
     * @param command the document, its parse result, and the chunk size / overlap
     * @return the chunks in text order, indexed from 0; empty when there is no parse result
     *         or its text is blank
     * @throws IllegalArgumentException if the chunk size is not positive or the overlap is negative
     */
    @Override
    public List<RagChunk> chunk(RagChunkCommand command) {
        DocumentParseResult parseResult = command.getParseResult();
        String text = parseResult == null ? null : parseResult.getText();
        if (!StringUtils.hasText(text)) {
            return List.of();
        }

        int length = text.length();
        int chunkSize = command.getChunkSize() == null ? length : command.getChunkSize();
        if (chunkSize <= 0) {
            // A zero size used to emit one empty chunk per character and a negative size
            // made substring() throw; reject both up front with a clear message.
            throw new IllegalArgumentException("切片长度必须大于0: " + chunkSize);
        }
        int overlap = command.getChunkOverlap() == null ? 0 : command.getChunkOverlap();
        if (overlap < 0) {
            // A negative overlap would make the step larger than the chunk size and
            // silently drop the characters between consecutive chunks.
            throw new IllegalArgumentException("切片重叠长度不能小于0: " + overlap);
        }

        int step = Math.max(1, chunkSize - overlap);
        List<RagChunk> chunks = new ArrayList<>();
        for (int start = 0, index = 0; start < length; start += step, index++) {
            // Clamp against the remaining length first so start + chunkSize cannot overflow.
            int end = start + Math.min(chunkSize, length - start);
            chunks.add(buildChunk(command.getDocument(), index, text.substring(start, end)));
            if (end >= length) {
                // The window reached the end of the text; stepping further would only
                // produce chunks fully contained in this one.
                break;
            }
        }
        return chunks;
    }

    /** Creates an enabled chunk that carries the document's store id and id. */
    private RagChunk buildChunk(RagDocument document, int index, String content) {
        RagChunk chunk = new RagChunk();
        chunk.setStoreId(document.getStoreId());
        chunk.setDocumentId(document.getId());
        chunk.setChunkIndex(index);
        chunk.setChunkContent(content);
        chunk.setEnabled(Boolean.TRUE);
        return chunk;
    }
}
  • Step 4: Run test to verify it passes

Run: mvn -Dtest=FixedLengthChunkerTests test Expected: PASS

  • Step 5: Commit
git add src/main/java/com/bruce/rag/parse/impl/FixedLengthChunker.java src/test/java/com/bruce/rag/parse/FixedLengthChunkerTests.java
git commit -m "feat: add fixed length rag chunker"

Task 3: Add Delimiter Chunker

Files:

  • Create: src/main/java/com/bruce/rag/parse/impl/DelimiterChunker.java

  • Test: src/test/java/com/bruce/rag/parse/DelimiterChunkerTests.java

  • Step 1: Write the failing test

package com.bruce.rag.parse;

import com.bruce.common.document.parse.DocumentParseResult;
import com.bruce.rag.entity.RagChunk;
import com.bruce.rag.entity.RagDocument;
import com.bruce.rag.parse.impl.DelimiterChunker;
import org.junit.jupiter.api.Test;

import java.util.List;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
 * Unit tests for {@link DelimiterChunker}, splitting on the Chinese full stop.
 */
class DelimiterChunkerTests {

    // JUnit creates a fresh test instance per method, so each test gets its own chunker.
    private final DelimiterChunker chunker = new DelimiterChunker();

    /** Consecutive delimiters create an empty segment, which must not become a chunk. */
    @Test
    void chunkShouldSplitByDelimiterAndIgnoreBlankSegments() {
        List<RagChunk> chunks = chunker.chunk(commandFor("第一段。第二段。。第三段"));

        List<String> expectedContents = List.of("第一段", "第二段", "第三段");
        assertEquals(expectedContents.size(), chunks.size());
        for (int i = 0; i < expectedContents.size(); i++) {
            RagChunk chunk = chunks.get(i);
            assertEquals(expectedContents.get(i), chunk.getChunkContent());
            assertEquals(i, chunk.getChunkIndex());
        }
    }

    /** Whitespace-only text produces no chunks at all. */
    @Test
    void chunkShouldReturnEmptyListForBlankText() {
        assertTrue(chunker.chunk(commandFor(" ")).isEmpty());
    }

    /** Builds a DELIMITER command splitting on "。" for document 66 in store 55. */
    private static RagChunkCommand commandFor(String text) {
        RagDocument document = new RagDocument();
        document.setId(66L);
        document.setStoreId(55L);

        DocumentParseResult parseResult = new DocumentParseResult();
        parseResult.setText(text);
        parseResult.setTextLength(text.length());

        RagChunkCommand command = new RagChunkCommand();
        command.setDocument(document);
        command.setParseResult(parseResult);
        command.setChunkStrategy("DELIMITER");
        command.setDelimiter("。");
        return command;
    }
}
  • Step 2: Run test to verify it fails

Run: mvn -Dtest=DelimiterChunkerTests test Expected: FAIL with compilation errors because DelimiterChunker does not exist yet

  • Step 3: Write minimal implementation
package com.bruce.rag.parse.impl;

import com.bruce.common.document.parse.DocumentParseResult;
import com.bruce.rag.entity.RagChunk;
import com.bruce.rag.entity.RagDocument;
import com.bruce.rag.enums.RagChunkStrategyEnum;
import com.bruce.rag.parse.Chunker;
import com.bruce.rag.parse.RagChunkCommand;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

@Component
public class DelimiterChunker implements Chunker {

    @Override
    public boolean supports(RagChunkStrategyEnum strategy) {
        return RagChunkStrategyEnum.DELIMITER == strategy;
    }

    @Override
    public List<RagChunk> chunk(RagChunkCommand command) {
        DocumentParseResult parseResult = command.getParseResult();
        String text = parseResult == null ? null : parseResult.getText();
        if (!StringUtils.hasText(text) || !StringUtils.hasText(command.getDelimiter())) {
            return List.of();
        }

        String[] parts = text.split(Pattern.quote(command.getDelimiter()));
        List<RagChunk> chunks = new ArrayList<>();
        for (String part : parts) {
            if (!StringUtils.hasText(part)) {
                continue;
            }
            chunks.add(buildChunk(command.getDocument(), chunks.size(), part.trim()));
        }
        return chunks;
    }

    private RagChunk buildChunk(RagDocument document, int index, String content) {
        RagChunk chunk = new RagChunk();
        chunk.setStoreId(document.getStoreId());
        chunk.setDocumentId(document.getId());
        chunk.setChunkIndex(index);
        chunk.setChunkContent(content);
        chunk.setEnabled(Boolean.TRUE);
        return chunk;
    }
}
  • Step 4: Run test to verify it passes

Run: mvn -Dtest=DelimiterChunkerTests test Expected: PASS

  • Step 5: Commit
git add src/main/java/com/bruce/rag/parse/impl/DelimiterChunker.java src/test/java/com/bruce/rag/parse/DelimiterChunkerTests.java
git commit -m "feat: add delimiter rag chunker"