feat(rag): add document parsing structures
This commit is contained in:
@@ -8,18 +8,29 @@ import com.bruce.rag.constant.RagSystemConstants;
|
||||
import com.bruce.rag.controller.RagDocumentController;
|
||||
import com.bruce.rag.controller.RagStoreController;
|
||||
import com.bruce.rag.dto.request.RagDocumentQueryRequest;
|
||||
import com.bruce.rag.dto.request.RagDocumentParseRequest;
|
||||
import com.bruce.rag.dto.request.RagStoreQueryRequest;
|
||||
import com.bruce.rag.dto.request.RagStoreSaveRequest;
|
||||
import com.bruce.rag.dto.response.RagDocumentParseResponse;
|
||||
import com.bruce.rag.dto.response.RagStoreDocumentOverviewResponse;
|
||||
import com.bruce.rag.dto.response.RagStoreOverviewResponse;
|
||||
import com.bruce.rag.dto.response.RagDocumentResponse;
|
||||
import com.bruce.rag.dto.response.RagStoreResponse;
|
||||
import com.bruce.rag.entity.RagChunk;
|
||||
import com.bruce.rag.entity.RagChunkEmbedding;
|
||||
import com.bruce.rag.entity.RagDocument;
|
||||
import com.bruce.rag.entity.RagStore;
|
||||
import com.bruce.rag.mapper.RagChunkEmbeddingMapper;
|
||||
import com.bruce.rag.mapper.RagChunkMapper;
|
||||
import com.bruce.rag.mapper.RagDocumentMapper;
|
||||
import com.bruce.rag.mapper.RagStoreMapper;
|
||||
import com.bruce.rag.service.IRagChunkEmbeddingService;
|
||||
import com.bruce.rag.service.IRagChunkService;
|
||||
import com.bruce.rag.service.IRagDocumentParseService;
|
||||
import com.bruce.rag.service.IRagDocumentService;
|
||||
import com.bruce.rag.service.IRagStoreService;
|
||||
import com.bruce.rag.service.impl.RagChunkEmbeddingServiceImpl;
|
||||
import com.bruce.rag.service.impl.RagChunkServiceImpl;
|
||||
import com.bruce.rag.service.impl.RagDocumentServiceImpl;
|
||||
import com.bruce.rag.service.impl.RagStoreServiceImpl;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@@ -39,10 +50,16 @@ class RagComponentStructureTests {
|
||||
void ragComponentsShouldReuseMybatisPlusBaseTypes() {
|
||||
assertTrue(BaseMapper.class.isAssignableFrom(RagStoreMapper.class));
|
||||
assertTrue(BaseMapper.class.isAssignableFrom(RagDocumentMapper.class));
|
||||
assertTrue(BaseMapper.class.isAssignableFrom(RagChunkMapper.class));
|
||||
assertTrue(BaseMapper.class.isAssignableFrom(RagChunkEmbeddingMapper.class));
|
||||
assertTrue(IService.class.isAssignableFrom(IRagStoreService.class));
|
||||
assertTrue(IService.class.isAssignableFrom(IRagDocumentService.class));
|
||||
assertTrue(IService.class.isAssignableFrom(IRagChunkService.class));
|
||||
assertTrue(IService.class.isAssignableFrom(IRagChunkEmbeddingService.class));
|
||||
assertTrue(ServiceImpl.class.isAssignableFrom(RagStoreServiceImpl.class));
|
||||
assertTrue(ServiceImpl.class.isAssignableFrom(RagDocumentServiceImpl.class));
|
||||
assertTrue(ServiceImpl.class.isAssignableFrom(RagChunkServiceImpl.class));
|
||||
assertTrue(ServiceImpl.class.isAssignableFrom(RagChunkEmbeddingServiceImpl.class));
|
||||
}
|
||||
|
||||
@Test
|
||||
@@ -63,8 +80,10 @@ class RagComponentStructureTests {
|
||||
|
||||
Method documentListMethod = RagDocumentController.class.getMethod("list");
|
||||
Method documentQueryMethod = RagDocumentController.class.getMethod("query", RagDocumentQueryRequest.class);
|
||||
Method documentParseMethod = RagDocumentController.class.getMethod("parse", RagDocumentParseRequest.class);
|
||||
Method documentResponseListMethod = IRagDocumentService.class.getMethod("listResponses");
|
||||
Method documentServiceQueryMethod = IRagDocumentService.class.getMethod("query", RagDocumentQueryRequest.class);
|
||||
Method documentParseServiceMethod = IRagDocumentParseService.class.getMethod("parse", RagDocumentParseRequest.class);
|
||||
|
||||
assertEquals(RequestResult.class, storeListMethod.getReturnType());
|
||||
assertEquals(RequestResult.class, storeQueryMethod.getReturnType());
|
||||
@@ -89,11 +108,14 @@ class RagComponentStructureTests {
|
||||
|
||||
assertEquals(RequestResult.class, documentListMethod.getReturnType());
|
||||
assertEquals(RequestResult.class, documentQueryMethod.getReturnType());
|
||||
assertEquals(RequestResult.class, documentParseMethod.getReturnType());
|
||||
assertEquals(List.class, documentServiceQueryMethod.getReturnType());
|
||||
assertEquals(List.class, documentParseServiceMethod.getReturnType());
|
||||
assertTrue(documentResponseListMethod.getGenericReturnType().getTypeName().contains("RagDocumentResponse"));
|
||||
assertTrue(documentServiceQueryMethod.getGenericReturnType().getTypeName().contains("RagDocumentResponse"));
|
||||
assertTrue(documentListMethod.getGenericReturnType().getTypeName().contains("RagDocumentResponse"));
|
||||
assertTrue(documentQueryMethod.getGenericReturnType().getTypeName().contains("RagDocumentResponse"));
|
||||
assertTrue(documentParseMethod.getGenericReturnType().getTypeName().contains("RagDocumentParseResponse"));
|
||||
assertEquals(RagDocumentResponse.class, RagDocumentResponse.class.getMethod("fromEntity", RagDocument.class).getReturnType());
|
||||
}
|
||||
|
||||
@@ -121,4 +143,34 @@ class RagComponentStructureTests {
|
||||
assertTrue(RagStoreController.class.getSimpleName().contains("RagStoreController"));
|
||||
assertTrue(RagDocumentController.class.getSimpleName().contains("RagDocumentController"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void ragChunkStructureShouldSupportChunkMetadata() throws NoSuchFieldException {
|
||||
assertEquals(Long.class, RagChunk.class.getDeclaredField("storeId").getType());
|
||||
assertEquals(Long.class, RagChunk.class.getDeclaredField("documentId").getType());
|
||||
assertEquals(Integer.class, RagChunk.class.getDeclaredField("chunkIndex").getType());
|
||||
assertEquals(String.class, RagChunk.class.getDeclaredField("chunkContent").getType());
|
||||
assertEquals(String.class, RagChunk.class.getDeclaredField("chunkSummary").getType());
|
||||
assertEquals(Integer.class, RagChunk.class.getDeclaredField("tokenCount").getType());
|
||||
assertEquals(Integer.class, RagChunk.class.getDeclaredField("pageNumber").getType());
|
||||
assertEquals(String.class, RagChunk.class.getDeclaredField("sectionTitle").getType());
|
||||
assertEquals(String.class, RagChunk.class.getDeclaredField("headingPath").getType());
|
||||
assertEquals(String.class, RagChunk.class.getDeclaredField("vectorId").getType());
|
||||
assertEquals(String.class, RagChunk.class.getDeclaredField("metadataJson").getType());
|
||||
assertEquals(Boolean.class, RagChunk.class.getDeclaredField("enabled").getType());
|
||||
assertEquals(String.class, RagChunk.class.getDeclaredField("remark").getType());
|
||||
}
|
||||
|
||||
@Test
|
||||
void ragChunkEmbeddingStructureShouldSupportPgvectorMetadata() throws NoSuchFieldException {
|
||||
assertEquals(Long.class, RagChunkEmbedding.class.getDeclaredField("storeId").getType());
|
||||
assertEquals(Long.class, RagChunkEmbedding.class.getDeclaredField("documentId").getType());
|
||||
assertEquals(Long.class, RagChunkEmbedding.class.getDeclaredField("chunkId").getType());
|
||||
assertEquals(String.class, RagChunkEmbedding.class.getDeclaredField("embeddingModel").getType());
|
||||
assertEquals(Integer.class, RagChunkEmbedding.class.getDeclaredField("embeddingDimension").getType());
|
||||
assertEquals(String.class, RagChunkEmbedding.class.getDeclaredField("embedding").getType());
|
||||
assertEquals(String.class, RagChunkEmbedding.class.getDeclaredField("contentHash").getType());
|
||||
assertEquals(Boolean.class, RagChunkEmbedding.class.getDeclaredField("enabled").getType());
|
||||
assertEquals(String.class, RagChunkEmbedding.class.getDeclaredField("remark").getType());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,160 @@
|
||||
package com.bruce.rag;
|
||||
|
||||
import com.bruce.common.config.AttachmentProperties;
|
||||
import com.bruce.common.document.parse.DocumentParseContext;
|
||||
import com.bruce.common.document.parse.DocumentParseResult;
|
||||
import com.bruce.common.document.parse.DocumentParser;
|
||||
import com.bruce.common.document.parse.DocumentParserFactory;
|
||||
import com.bruce.common.domain.entity.SysAttachment;
|
||||
import com.bruce.common.service.ISysAttachmentService;
|
||||
import com.bruce.rag.dto.request.RagDocumentParseRequest;
|
||||
import com.bruce.rag.dto.response.RagDocumentParseResponse;
|
||||
import com.bruce.rag.entity.RagDocument;
|
||||
import com.bruce.rag.enums.RagParseStatusEnum;
|
||||
import com.bruce.rag.service.IRagDocumentService;
|
||||
import com.bruce.rag.service.impl.RagDocumentParseServiceImpl;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.ArgumentCaptor;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.Mockito.verify;
|
||||
import static org.mockito.Mockito.times;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
class RagDocumentParseServiceImplTests {
|
||||
|
||||
@TempDir
|
||||
private Path tempDir;
|
||||
|
||||
@Mock
|
||||
private IRagDocumentService ragDocumentService;
|
||||
|
||||
@Mock
|
||||
private ISysAttachmentService sysAttachmentService;
|
||||
|
||||
@Test
|
||||
void parseShouldUpdateStatusAndReturnParseResponse() throws Exception {
|
||||
Path file = tempDir.resolve("rag").resolve("people.txt");
|
||||
Files.createDirectories(file.getParent());
|
||||
Files.writeString(file, "people profiles");
|
||||
|
||||
RagDocument document = new RagDocument();
|
||||
document.setId(1001L);
|
||||
document.setStoreId(2002L);
|
||||
document.setAttachmentId(3003L);
|
||||
document.setParseStatus(RagParseStatusEnum.UPLOADED.name());
|
||||
|
||||
SysAttachment attachment = new SysAttachment();
|
||||
attachment.setId(3003L);
|
||||
attachment.setOriginalName("people.txt");
|
||||
attachment.setFileSuffix("txt");
|
||||
attachment.setContentType("text/plain");
|
||||
attachment.setFilePath("rag/people.txt");
|
||||
|
||||
AttachmentProperties attachmentProperties = new AttachmentProperties();
|
||||
attachmentProperties.setBasePath(tempDir.toString());
|
||||
DocumentParser parser = new FixedDocumentParser("people profiles");
|
||||
RagDocumentParseServiceImpl service = new RagDocumentParseServiceImpl(
|
||||
ragDocumentService,
|
||||
sysAttachmentService,
|
||||
attachmentProperties,
|
||||
new DocumentParserFactory(List.of(parser))
|
||||
);
|
||||
|
||||
when(ragDocumentService.getById(1001L)).thenReturn(document);
|
||||
when(sysAttachmentService.getById(3003L)).thenReturn(attachment);
|
||||
when(ragDocumentService.updateById(any(RagDocument.class))).thenReturn(true);
|
||||
|
||||
RagDocumentParseResponse response = service.parse(1001L);
|
||||
|
||||
assertEquals(1001L, response.getDocumentId());
|
||||
assertEquals(RagParseStatusEnum.PARSED.name(), response.getParseStatus());
|
||||
assertEquals(15, response.getTextLength());
|
||||
assertEquals("fixed", response.getMetadata().get("parser"));
|
||||
|
||||
ArgumentCaptor<RagDocument> captor = ArgumentCaptor.forClass(RagDocument.class);
|
||||
verify(ragDocumentService, times(2)).updateById(captor.capture());
|
||||
List<RagDocument> updates = captor.getAllValues();
|
||||
assertEquals(RagParseStatusEnum.PARSING.name(), updates.get(0).getParseStatus());
|
||||
assertEquals(RagParseStatusEnum.PARSED.name(), updates.get(1).getParseStatus());
|
||||
assertTrue(parser.supports(new DocumentParseContext()));
|
||||
}
|
||||
|
||||
@Test
|
||||
void parseShouldSupportBatchRequestAndChunkStrategyStructure() throws Exception {
|
||||
Path file = tempDir.resolve("rag").resolve("batch.txt");
|
||||
Files.createDirectories(file.getParent());
|
||||
Files.writeString(file, "batch profiles");
|
||||
|
||||
RagDocument document = new RagDocument();
|
||||
document.setId(1002L);
|
||||
document.setStoreId(2002L);
|
||||
document.setAttachmentId(3004L);
|
||||
document.setParseStatus(RagParseStatusEnum.UPLOADED.name());
|
||||
|
||||
SysAttachment attachment = new SysAttachment();
|
||||
attachment.setId(3004L);
|
||||
attachment.setOriginalName("batch.txt");
|
||||
attachment.setFileSuffix("txt");
|
||||
attachment.setContentType("text/plain");
|
||||
attachment.setFilePath("rag/batch.txt");
|
||||
|
||||
AttachmentProperties attachmentProperties = new AttachmentProperties();
|
||||
attachmentProperties.setBasePath(tempDir.toString());
|
||||
RagDocumentParseServiceImpl service = new RagDocumentParseServiceImpl(
|
||||
ragDocumentService,
|
||||
sysAttachmentService,
|
||||
attachmentProperties,
|
||||
new DocumentParserFactory(List.of(new FixedDocumentParser("batch profiles")))
|
||||
);
|
||||
RagDocumentParseRequest request = new RagDocumentParseRequest();
|
||||
request.setDocumentIds(List.of(1002L));
|
||||
request.setChunkStrategy("DELIMITER");
|
||||
request.setDelimiter("。");
|
||||
|
||||
when(ragDocumentService.getById(1002L)).thenReturn(document);
|
||||
when(sysAttachmentService.getById(3004L)).thenReturn(attachment);
|
||||
when(ragDocumentService.updateById(any(RagDocument.class))).thenReturn(true);
|
||||
|
||||
List<RagDocumentParseResponse> responses = service.parse(request);
|
||||
|
||||
assertEquals(1, responses.size());
|
||||
assertEquals(1002L, responses.getFirst().getDocumentId());
|
||||
assertEquals(RagParseStatusEnum.PARSED.name(), responses.getFirst().getParseStatus());
|
||||
}
|
||||
|
||||
private static class FixedDocumentParser implements DocumentParser {
|
||||
|
||||
private final String text;
|
||||
|
||||
private FixedDocumentParser(String text) {
|
||||
this.text = text;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean supports(DocumentParseContext context) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocumentParseResult parse(DocumentParseContext context) {
|
||||
DocumentParseResult result = new DocumentParseResult();
|
||||
result.setText(text);
|
||||
result.setTextLength(text.length());
|
||||
result.setMetadata(Map.of("parser", "fixed"));
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -102,13 +102,13 @@ class RagDocumentServiceImplTests {
|
||||
request.setRemark(" 备注信息 ");
|
||||
|
||||
doReturn(existingDocument).when(ragDocumentService).getById(3003L);
|
||||
doReturn(true).when(ragDocumentService).saveOrUpdate(any(RagDocument.class));
|
||||
doReturn(true).when(ragDocumentService).updateById(any(RagDocument.class));
|
||||
|
||||
boolean result = ragDocumentService.saveOrUpdate(request);
|
||||
|
||||
assertTrue(result);
|
||||
ArgumentCaptor<RagDocument> documentCaptor = ArgumentCaptor.forClass(RagDocument.class);
|
||||
verify(ragDocumentService).saveOrUpdate(documentCaptor.capture());
|
||||
verify(ragDocumentService).updateById(documentCaptor.capture());
|
||||
RagDocument savedDocument = documentCaptor.getValue();
|
||||
assertEquals(3003L, savedDocument.getId());
|
||||
assertEquals(1001L, savedDocument.getStoreId());
|
||||
@@ -121,4 +121,40 @@ class RagDocumentServiceImplTests {
|
||||
assertEquals("已修复", savedDocument.getErrorMessage());
|
||||
assertEquals("备注信息", savedDocument.getRemark());
|
||||
}
|
||||
|
||||
@Test
|
||||
void saveOrUpdateShouldPreserveExistingFieldsForPartialUpdate() {
|
||||
RagDocument existingDocument = new RagDocument();
|
||||
existingDocument.setId(3003L);
|
||||
existingDocument.setStoreId(1001L);
|
||||
existingDocument.setAttachmentId(2002L);
|
||||
existingDocument.setDocumentTitle("people_profiles.txt");
|
||||
existingDocument.setDocumentSummary("测试人员信息,有多条人员信息");
|
||||
existingDocument.setParseStatus(RagParseStatusEnum.UPLOADED.name());
|
||||
existingDocument.setIndexStatus(RagIndexStatusEnum.PENDING.name());
|
||||
existingDocument.setEnabled(true);
|
||||
existingDocument.setRemark("测试人员信息");
|
||||
|
||||
RagDocumentSaveRequest request = new RagDocumentSaveRequest();
|
||||
request.setId(3003L);
|
||||
request.setStoreId(1001L);
|
||||
request.setDocumentTitle("people_profiles.txt");
|
||||
request.setEnabled(false);
|
||||
|
||||
doReturn(existingDocument).when(ragDocumentService).getById(3003L);
|
||||
doReturn(true).when(ragDocumentService).updateById(any(RagDocument.class));
|
||||
|
||||
boolean result = ragDocumentService.saveOrUpdate(request);
|
||||
|
||||
assertTrue(result);
|
||||
ArgumentCaptor<RagDocument> documentCaptor = ArgumentCaptor.forClass(RagDocument.class);
|
||||
verify(ragDocumentService).updateById(documentCaptor.capture());
|
||||
RagDocument savedDocument = documentCaptor.getValue();
|
||||
assertEquals(2002L, savedDocument.getAttachmentId());
|
||||
assertEquals("测试人员信息,有多条人员信息", savedDocument.getDocumentSummary());
|
||||
assertEquals(RagParseStatusEnum.UPLOADED.name(), savedDocument.getParseStatus());
|
||||
assertEquals(RagIndexStatusEnum.PENDING.name(), savedDocument.getIndexStatus());
|
||||
assertEquals(false, savedDocument.getEnabled());
|
||||
assertEquals("测试人员信息", savedDocument.getRemark());
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user