feat(rag): add document parsing structures

This commit is contained in:
2026-05-21 23:20:09 +08:00
parent 2ab02fb574
commit 1de773405f
38 changed files with 1240 additions and 9 deletions

View File

@@ -0,0 +1,17 @@
package com.bruce.common.config;
import com.baomidou.mybatisplus.extension.plugins.MybatisPlusInterceptor;
import com.baomidou.mybatisplus.extension.plugins.inner.OptimisticLockerInnerInterceptor;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
/**
 * Spring configuration that exposes the MyBatis-Plus interceptor chain as a bean.
 *
 * <p>The chain built here contains exactly one inner interceptor:
 * {@link OptimisticLockerInnerInterceptor}.
 */
@Configuration
public class MybatisPlusConfig {

    /**
     * Builds the {@link MybatisPlusInterceptor} bean with the optimistic-locker
     * inner interceptor registered on it.
     *
     * @return the configured interceptor chain
     */
    @Bean
    public MybatisPlusInterceptor mybatisPlusInterceptor() {
        OptimisticLockerInnerInterceptor optimisticLocker = new OptimisticLockerInnerInterceptor();
        MybatisPlusInterceptor chain = new MybatisPlusInterceptor();
        chain.addInnerInterceptor(optimisticLocker);
        return chain;
    }
}

View File

@@ -0,0 +1,21 @@
package com.bruce.common.document.parse;
import lombok.Data;
import java.nio.file.Path;
/**
 * Input handed to a {@link DocumentParser}: identifies the file to parse and carries
 * the hints (suffix, content type) that parsers use to decide whether they apply.
 *
 * <p>Getters, setters, {@code equals}/{@code hashCode} and {@code toString} are
 * generated by Lombok's {@code @Data}.
 */
@Data
public class DocumentParseContext {
/** Identifier of the document; not read by the parsers in this package (presumably a persistence key — confirm with callers). */
private Long documentId;
/** Identifier of the attachment; not read by the parsers in this package (presumably a persistence key — confirm with callers). */
private Long attachmentId;
/** Original file name; passed to Tika as the resource name and used as the fallback "resourceName" metadata value. */
private String originalName;
/** File suffix such as "pdf"; parsers trim and lower-case it before matching, so case and surrounding whitespace do not matter. */
private String suffix;
/** Content (MIME) type; parsers match it by lower-cased prefix and forward it to Tika as a hint when it has text. */
private String contentType;
/** Location of the file to read; parsing fails with a DocumentParseException when this is null. */
private Path filePath;
}

View File

@@ -0,0 +1,12 @@
package com.bruce.common.document.parse;
/**
 * Unchecked exception signalling that a document could not be parsed, or that no
 * parser supports it.
 */
public class DocumentParseException extends RuntimeException {

    /**
     * Creates an exception carrying only a description of the failure.
     *
     * @param message human-readable description of what went wrong
     */
    public DocumentParseException(final String message) {
        super(message);
    }

    /**
     * Creates an exception that wraps the underlying failure so its stack trace is kept.
     *
     * @param message human-readable description of what went wrong
     * @param cause   the original exception that triggered this one
     */
    public DocumentParseException(final String message, final Throwable cause) {
        super(message, cause);
    }
}

View File

@@ -0,0 +1,20 @@
package com.bruce.common.document.parse;
import lombok.Data;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * Output of a {@link DocumentParser}: the extracted text plus descriptive metadata.
 *
 * <p>Getters, setters, {@code equals}/{@code hashCode} and {@code toString} are
 * generated by Lombok's {@code @Data}.
 */
@Data
public class DocumentParseResult {
/** Extracted text; the Tika-based parsers store it trimmed and use "" when extraction yields null. */
private String text;
/** Length of {@link #text} in chars; the Tika-based parsers set it from {@code text.length()}. */
private Integer textLength;
/** Page count; not populated by the Tika-based parsers in this package, so it may be null. */
private Integer pageCount;
/** Sheet count; not populated by the Tika-based parsers in this package, so it may be null. */
private Integer sheetCount;
/** Insertion-ordered metadata; the Tika-based parsers add the keys "contentType" and "resourceName" (values may be null). */
private Map<String, Object> metadata = new LinkedHashMap<>();
}

View File

@@ -0,0 +1,8 @@
package com.bruce.common.document.parse;
/**
 * Strategy for turning one kind of document into a {@link DocumentParseResult}.
 *
 * <p>{@link DocumentParserFactory} asks each registered parser {@link #supports} in
 * turn and returns the first one that answers {@code true}.
 */
public interface DocumentParser {
/**
 * Tells whether this parser can handle the described document.
 *
 * @param context description of the document; the implementations in this package
 *                return {@code false} for a {@code null} context
 * @return {@code true} if this parser should be used for the document
 */
boolean supports(DocumentParseContext context);
/**
 * Parses the described document.
 *
 * @param context description of the document to parse
 * @return the parse result
 * @throws DocumentParseException thrown by the implementations in this package when
 *                                the context or file path is missing or extraction fails
 */
DocumentParseResult parse(DocumentParseContext context);
}

View File

@@ -0,0 +1,37 @@
package com.bruce.common.document.parse;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import java.util.List;
import java.util.Locale;
/**
 * Picks the {@link DocumentParser} responsible for a given document.
 *
 * <p>Parsers are consulted in the order of the injected list; the first one whose
 * {@link DocumentParser#supports} returns {@code true} wins.
 */
@Component
public class DocumentParserFactory {

    /** Label used in the error message when neither suffix nor content type is available. */
    private static final String UNKNOWN_TYPE = "unknown";

    private final List<DocumentParser> parsers;

    /**
     * @param parsers all available parsers, in lookup order
     */
    public DocumentParserFactory(List<DocumentParser> parsers) {
        this.parsers = parsers;
    }

    /**
     * Finds the first parser that supports the given document.
     *
     * @param context description of the document to parse
     * @return the first supporting parser
     * @throws DocumentParseException if no registered parser supports the document
     */
    public DocumentParser resolve(DocumentParseContext context) {
        for (DocumentParser candidate : parsers) {
            if (candidate.supports(context)) {
                return candidate;
            }
        }
        throw new DocumentParseException("不支持的文档类型: " + resolveType(context));
    }

    /**
     * Produces a short type label for error messages: the normalised suffix if present,
     * otherwise the trimmed content type, otherwise {@code "unknown"}.
     */
    private String resolveType(DocumentParseContext context) {
        String label = UNKNOWN_TYPE;
        if (context != null) {
            String suffix = context.getSuffix();
            String contentType = context.getContentType();
            if (StringUtils.hasText(suffix)) {
                label = suffix.trim().toLowerCase(Locale.ROOT);
            } else if (StringUtils.hasText(contentType)) {
                label = contentType.trim();
            }
        }
        return label;
    }
}

View File

@@ -0,0 +1,64 @@
package com.bruce.common.document.parse.impl;
import com.bruce.common.document.parse.DocumentParseContext;
import com.bruce.common.document.parse.DocumentParseException;
import com.bruce.common.document.parse.DocumentParseResult;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.springframework.util.StringUtils;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.util.Locale;
import java.util.Set;
/**
 * Shared Apache Tika plumbing for the concrete parsers in this package: matching
 * helpers for suffix / content type and a common text-extraction routine.
 */
abstract class AbstractTikaDocumentParser {

    /** Passed to Tika as the maximum extracted length; Tika's API documents -1 as "no limit". */
    private static final int MAX_TEXT_LENGTH = -1;

    private final Tika tika = new Tika();

    /**
     * Checks whether the context's suffix (trimmed, lower-cased) is one of {@code suffixes}.
     *
     * @return {@code false} when the context is null or its suffix has no text
     */
    boolean supportsSuffix(DocumentParseContext context, Set<String> suffixes) {
        if (context == null || !StringUtils.hasText(context.getSuffix())) {
            return false;
        }
        String normalized = context.getSuffix().trim().toLowerCase(Locale.ROOT);
        return suffixes.contains(normalized);
    }

    /**
     * Checks whether the context's content type (trimmed, lower-cased) starts with {@code prefix}.
     *
     * @return {@code false} when the context is null or its content type has no text
     */
    boolean supportsContentType(DocumentParseContext context, String prefix) {
        if (context == null || !StringUtils.hasText(context.getContentType())) {
            return false;
        }
        String normalized = context.getContentType().trim().toLowerCase(Locale.ROOT);
        return normalized.startsWith(prefix);
    }

    /**
     * Extracts the text of the file referenced by {@code context} using Tika.
     *
     * @param context description of the document; must be non-null with a non-null file path
     * @return result holding the trimmed text, its length, and "contentType" /
     *         "resourceName" metadata entries
     * @throws DocumentParseException if the context or file path is missing, or if
     *                                reading or Tika extraction fails
     */
    DocumentParseResult parseWithTika(DocumentParseContext context) {
        boolean missingFile = context == null || context.getFilePath() == null;
        if (missingFile) {
            throw new DocumentParseException("解析文件不能为空");
        }
        try {
            Metadata tikaMetadata = describe(context);
            String extracted = readText(context, tikaMetadata);
            // Metadata is read only after extraction, so values updated during parsing are seen.
            return toResult(extracted, tikaMetadata, context);
        } catch (IOException | TikaException e) {
            throw new DocumentParseException("文档解析失败: " + e.getMessage(), e);
        }
    }

    /** Builds the Tika metadata hints (resource name, optional content type) from the context. */
    private Metadata describe(DocumentParseContext context) {
        Metadata hints = new Metadata();
        hints.set(TikaCoreProperties.RESOURCE_NAME_KEY, context.getOriginalName());
        if (StringUtils.hasText(context.getContentType())) {
            hints.set(Metadata.CONTENT_TYPE, context.getContentType());
        }
        return hints;
    }

    /** Opens the file and lets Tika extract its text, closing the stream afterwards. */
    private String readText(DocumentParseContext context, Metadata hints) throws IOException, TikaException {
        try (InputStream source = Files.newInputStream(context.getFilePath())) {
            return tika.parseToString(source, hints, MAX_TEXT_LENGTH);
        }
    }

    /** Assembles the result, falling back to the context's values where the metadata is blank. */
    private DocumentParseResult toResult(String extracted, Metadata tikaMetadata, DocumentParseContext context) {
        String body = extracted == null ? "" : extracted.trim();
        DocumentParseResult result = new DocumentParseResult();
        result.setText(body);
        result.setTextLength(result.getText().length());
        String contentType = firstNonBlank(tikaMetadata.get(Metadata.CONTENT_TYPE), context.getContentType());
        result.getMetadata().put("contentType", contentType);
        String resourceName = firstNonBlank(
                tikaMetadata.get(TikaCoreProperties.RESOURCE_NAME_KEY), context.getOriginalName());
        result.getMetadata().put("resourceName", resourceName);
        return result;
    }

    /** Returns {@code first} when it contains text, otherwise {@code fallback} (which may be null). */
    private String firstNonBlank(String first, String fallback) {
        if (StringUtils.hasText(first)) {
            return first;
        }
        return fallback;
    }
}

View File

@@ -0,0 +1,26 @@
package com.bruce.common.document.parse.impl;
import com.bruce.common.document.parse.DocumentParseContext;
import com.bruce.common.document.parse.DocumentParser;
import com.bruce.common.document.parse.DocumentParseResult;
import org.springframework.stereotype.Component;
import java.util.Set;
/**
 * Parser for Excel workbooks, selected by an {@code xls}/{@code xlsx} suffix or by an
 * Excel content type. Extraction is delegated to the shared Tika routine.
 */
@Component
public class ExcelDocumentParser extends AbstractTikaDocumentParser implements DocumentParser {

    private static final Set<String> EXCEL_SUFFIXES = Set.of("xls", "xlsx");
    private static final String LEGACY_CONTENT_TYPE = "application/vnd.ms-excel";
    private static final String OOXML_CONTENT_TYPE =
            "application/vnd.openxmlformats-officedocument.spreadsheetml";

    /**
     * @return {@code true} if the suffix is xls/xlsx or the content type starts with
     *         one of the Excel content-type prefixes
     */
    @Override
    public boolean supports(DocumentParseContext context) {
        if (supportsSuffix(context, EXCEL_SUFFIXES)) {
            return true;
        }
        if (supportsContentType(context, LEGACY_CONTENT_TYPE)) {
            return true;
        }
        return supportsContentType(context, OOXML_CONTENT_TYPE);
    }

    /** Extracts the workbook's text via {@code parseWithTika}. */
    @Override
    public DocumentParseResult parse(DocumentParseContext context) {
        DocumentParseResult parsed = parseWithTika(context);
        return parsed;
    }
}

View File

@@ -0,0 +1,24 @@
package com.bruce.common.document.parse.impl;
import com.bruce.common.document.parse.DocumentParseContext;
import com.bruce.common.document.parse.DocumentParser;
import com.bruce.common.document.parse.DocumentParseResult;
import org.springframework.stereotype.Component;
import java.util.Set;
/**
 * Parser for PDF files, selected by a {@code pdf} suffix or a content type starting
 * with {@code application/pdf}. Extraction is delegated to the shared Tika routine.
 */
@Component
public class PdfDocumentParser extends AbstractTikaDocumentParser implements DocumentParser {

    private static final Set<String> PDF_SUFFIXES = Set.of("pdf");
    private static final String PDF_CONTENT_TYPE = "application/pdf";

    /**
     * @return {@code true} if the suffix is pdf or the content type starts with
     *         {@code application/pdf}
     */
    @Override
    public boolean supports(DocumentParseContext context) {
        if (supportsSuffix(context, PDF_SUFFIXES)) {
            return true;
        }
        return supportsContentType(context, PDF_CONTENT_TYPE);
    }

    /** Extracts the PDF's text via {@code parseWithTika}. */
    @Override
    public DocumentParseResult parse(DocumentParseContext context) {
        DocumentParseResult parsed = parseWithTika(context);
        return parsed;
    }
}

View File

@@ -0,0 +1,24 @@
package com.bruce.common.document.parse.impl;
import com.bruce.common.document.parse.DocumentParseContext;
import com.bruce.common.document.parse.DocumentParser;
import com.bruce.common.document.parse.DocumentParseResult;
import org.springframework.stereotype.Component;
import java.util.Set;
/**
 * Parser for plain-text style files, selected by a {@code txt}/{@code md}/{@code log}
 * suffix or by any content type starting with {@code text/}. Extraction is delegated
 * to the shared Tika routine.
 */
@Component
public class TxtDocumentParser extends AbstractTikaDocumentParser implements DocumentParser {

    private static final Set<String> TEXT_SUFFIXES = Set.of("txt", "md", "log");
    /** Prefix match, so every {@code text/*} content type is accepted. */
    private static final String TEXT_CONTENT_TYPE_PREFIX = "text/";

    /**
     * @return {@code true} if the suffix is txt/md/log or the content type starts
     *         with {@code text/}
     */
    @Override
    public boolean supports(DocumentParseContext context) {
        if (supportsSuffix(context, TEXT_SUFFIXES)) {
            return true;
        }
        return supportsContentType(context, TEXT_CONTENT_TYPE_PREFIX);
    }

    /** Extracts the file's text via {@code parseWithTika}. */
    @Override
    public DocumentParseResult parse(DocumentParseContext context) {
        DocumentParseResult parsed = parseWithTika(context);
        return parsed;
    }
}

View File

@@ -0,0 +1,26 @@
package com.bruce.common.document.parse.impl;
import com.bruce.common.document.parse.DocumentParseContext;
import com.bruce.common.document.parse.DocumentParser;
import com.bruce.common.document.parse.DocumentParseResult;
import org.springframework.stereotype.Component;
import java.util.Set;
/**
 * Parser for Word documents, selected by a {@code doc}/{@code docx} suffix or by a
 * Word content type. Extraction is delegated to the shared Tika routine.
 */
@Component
public class WordDocumentParser extends AbstractTikaDocumentParser implements DocumentParser {

    private static final Set<String> WORD_SUFFIXES = Set.of("doc", "docx");
    private static final String LEGACY_CONTENT_TYPE = "application/msword";
    private static final String OOXML_CONTENT_TYPE =
            "application/vnd.openxmlformats-officedocument.wordprocessingml";

    /**
     * @return {@code true} if the suffix is doc/docx or the content type starts with
     *         one of the Word content-type prefixes
     */
    @Override
    public boolean supports(DocumentParseContext context) {
        if (supportsSuffix(context, WORD_SUFFIXES)) {
            return true;
        }
        if (supportsContentType(context, LEGACY_CONTENT_TYPE)) {
            return true;
        }
        return supportsContentType(context, OOXML_CONTENT_TYPE);
    }

    /** Extracts the document's text via {@code parseWithTika}. */
    @Override
    public DocumentParseResult parse(DocumentParseContext context) {
        DocumentParseResult parsed = parseWithTika(context);
        return parsed;
    }
}