|
|
|
package org.jeecg.modules.airag.app.service.impl;
|
|
|
|
|
|
|
|
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.loader.FileSystemDocumentLoader;
import dev.langchain4j.data.document.parser.TextDocumentParser;
import dev.langchain4j.data.document.splitter.DocumentByParagraphSplitter;
import dev.langchain4j.data.document.splitter.DocumentSplitters;
import dev.langchain4j.data.embedding.Embedding;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.embedding.EmbeddingModel;
import dev.langchain4j.model.output.Response;
import org.apache.commons.io.FilenameUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.jeecg.common.api.vo.Result;
import org.jeecg.common.util.CommonUtils;
import org.jeecg.modules.airag.app.entity.QuestionEmbedding;
import org.jeecg.modules.airag.app.mapper.QuestionEmbeddingMapper;
import org.jeecg.modules.airag.app.service.IQuestionEmbeddingService;
import org.jeecg.modules.airag.app.utils.AiModelUtils;
import org.jeecg.modules.airag.common.handler.IAIChatHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
|
|
|
|
|
|
|
|
@Service
|
|
|
|
public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService {
|
|
|
|
|
|
|
|
private static final Logger log = LoggerFactory.getLogger(QuestionEmbeddingServiceImpl.class);
|
|
|
|
|
|
|
|
@Autowired
|
|
|
|
private QuestionEmbeddingMapper questionEmbeddingMapper;
|
|
|
|
|
|
|
|
@Autowired
|
|
|
|
private AiModelUtils aiModelUtils;
|
|
|
|
|
|
|
|
@Autowired
|
|
|
|
private IAIChatHandler aiChatHandler;
|
|
|
|
|
|
|
|
@Value("${jeecg.upload.path}")
|
|
|
|
private String uploadPath;
|
|
|
|
|
|
|
|
private static final Set<String> ALLOWED_EXTENSIONS = Set.of("txt", "doc", "docx");
|
|
|
|
private static final Pattern SPECIAL_CHARS_PATTERN = Pattern.compile("[^a-zA-Z0-9\\u4e00-\\u9fa5\\s]");
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public List<QuestionEmbedding> findAll() {
|
|
|
|
return questionEmbeddingMapper.findAll();
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public QuestionEmbedding findById(String id) {
|
|
|
|
return questionEmbeddingMapper.findById(id);
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public int insert(QuestionEmbedding record) {
|
|
|
|
return questionEmbeddingMapper.insert(record);
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public int update(QuestionEmbedding record) {
|
|
|
|
return questionEmbeddingMapper.update(record);
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public int deleteById(String id) {
|
|
|
|
return questionEmbeddingMapper.deleteById(id);
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public List<QuestionEmbedding> similaritySearchByQuestion(String question, int limit, Double minSimilarity) {
|
|
|
|
return questionEmbeddingMapper.similaritySearchByQuestion(question, limit, minSimilarity);
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public List<QuestionEmbedding> similaritySearch(float[] vector, int limit) {
|
|
|
|
return questionEmbeddingMapper.similaritySearch(vector, limit);
|
|
|
|
}
|
|
|
|
|
|
|
|
public Result<?> processZipUpload(MultipartFile zipFile) {
|
|
|
|
try {
|
|
|
|
Path tempDir = Files.createTempDirectory("zip_upload_");
|
|
|
|
List<Path> validFiles = extractAndFilterZip(zipFile, tempDir);
|
|
|
|
|
|
|
|
if (validFiles.isEmpty()) {
|
|
|
|
return Result.error("ZIP文件中没有有效的TXT或Word文档");
|
|
|
|
}
|
|
|
|
|
|
|
|
for (Path filePath : validFiles) {
|
|
|
|
processSingleFile(filePath);
|
|
|
|
}
|
|
|
|
|
|
|
|
return Result.OK("文件上传和处理成功");
|
|
|
|
} catch (Exception e) {
|
|
|
|
log.error("处理ZIP文件上传失败", e);
|
|
|
|
return Result.error("处理ZIP文件失败: " + e.getMessage());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private List<Path> extractAndFilterZip(MultipartFile zipFile, Path tempDir) throws IOException {
|
|
|
|
List<Path> validFiles = new ArrayList<>();
|
|
|
|
|
|
|
|
try (ZipInputStream zipIn = new ZipInputStream(zipFile.getInputStream())) {
|
|
|
|
ZipEntry entry;
|
|
|
|
while ((entry = zipIn.getNextEntry()) != null) {
|
|
|
|
if (!entry.isDirectory()) {
|
|
|
|
String fileName = entry.getName();
|
|
|
|
String ext = FilenameUtils.getExtension(fileName).toLowerCase();
|
|
|
|
|
|
|
|
if (ALLOWED_EXTENSIONS.contains(ext)) {
|
|
|
|
String safeFileName = new File(fileName).getName();
|
|
|
|
Path outputPath = tempDir.resolve(safeFileName);
|
|
|
|
Files.copy(zipIn, outputPath, StandardCopyOption.REPLACE_EXISTING);
|
|
|
|
validFiles.add(outputPath);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
zipIn.closeEntry();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return validFiles;
|
|
|
|
}
|
|
|
|
|
|
|
|
private void processSingleFile(Path filePath) throws Exception {
|
|
|
|
String originalFileName = filePath.getFileName().toString();
|
|
|
|
String fileExt = FilenameUtils.getExtension(originalFileName);
|
|
|
|
String newFileName = FilenameUtils.removeExtension(originalFileName) + "_" + UUID.randomUUID() + "." + fileExt;
|
|
|
|
Path targetPath = Paths.get(uploadPath, newFileName);
|
|
|
|
Files.move(filePath, targetPath, StandardCopyOption.REPLACE_EXISTING);
|
|
|
|
|
|
|
|
List<String> segments;
|
|
|
|
if (fileExt.equalsIgnoreCase("txt")) {
|
|
|
|
String fileContent = readFileContent(targetPath);
|
|
|
|
String cleanedContent = cleanText(fileContent);
|
|
|
|
segments = splitTxtDocument(cleanedContent);
|
|
|
|
} else {
|
|
|
|
segments = splitWordDocument(targetPath.toString());
|
|
|
|
}
|
|
|
|
|
|
|
|
saveSegmentsToDatabase(segments, originalFileName, newFileName);
|
|
|
|
}
|
|
|
|
|
|
|
|
private String readFileContent(Path filePath) throws IOException {
|
|
|
|
return new String(Files.readAllBytes(filePath));
|
|
|
|
}
|
|
|
|
|
|
|
|
private String cleanText(String text) {
|
|
|
|
text = SPECIAL_CHARS_PATTERN.matcher(text).replaceAll("");
|
|
|
|
return text.replaceAll("\\s+", " ").trim();
|
|
|
|
}
|
|
|
|
|
|
|
|
private List<String> splitTxtDocument(String content) {
|
|
|
|
DocumentSplitter splitter = new DocumentByParagraphSplitter(1000, 200);
|
|
|
|
Document document = Document.from(content);
|
|
|
|
return splitter.split(document).stream()
|
|
|
|
.map(TextSegment::text)
|
|
|
|
.map(this::cleanText)
|
|
|
|
.collect(Collectors.toList());
|
|
|
|
}
|
|
|
|
|
|
|
|
public List<String> splitWordDocument(String filePath) throws Exception {
|
|
|
|
List<String> result = new ArrayList<>();
|
|
|
|
String ext = FilenameUtils.getExtension(filePath).toLowerCase();
|
|
|
|
StringBuilder fullContent = new StringBuilder();
|
|
|
|
String fileName = new File(filePath).getName();
|
|
|
|
fileName = fileName.substring(0, fileName.lastIndexOf('.')); // 去掉后缀
|
|
|
|
|
|
|
|
if (ext.equals("docx")) {
|
|
|
|
try (XWPFDocument doc = new XWPFDocument(new FileInputStream(filePath))) {
|
|
|
|
StringBuilder currentSection = new StringBuilder();
|
|
|
|
boolean isTableSection = false;
|
|
|
|
|
|
|
|
for (IBodyElement element : doc.getBodyElements()) {
|
|
|
|
if (element instanceof XWPFParagraph) {
|
|
|
|
XWPFParagraph para = (XWPFParagraph) element;
|
|
|
|
String text = cleanText(para.getText());
|
|
|
|
fullContent.append(text).append("\n");
|
|
|
|
|
|
|
|
if (isTableSection) {
|
|
|
|
result.add(currentSection.toString().trim());
|
|
|
|
currentSection = new StringBuilder();
|
|
|
|
isTableSection = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
String style = para.getStyle();
|
|
|
|
if (style != null && style.matches("Heading\\d")) {
|
|
|
|
if (currentSection.length() > 0) {
|
|
|
|
result.add(currentSection.toString().trim());
|
|
|
|
}
|
|
|
|
currentSection = new StringBuilder(text).append("\n");
|
|
|
|
} else {
|
|
|
|
currentSection.append(text).append("\n");
|
|
|
|
}
|
|
|
|
} else if (element instanceof XWPFTable) {
|
|
|
|
String tableContent = extractTableContent((XWPFTable) element);
|
|
|
|
fullContent.append(tableContent).append("\n");
|
|
|
|
|
|
|
|
if (!isTableSection) {
|
|
|
|
if (currentSection.length() > 0) {
|
|
|
|
result.add(currentSection.toString().trim());
|
|
|
|
}
|
|
|
|
currentSection = new StringBuilder();
|
|
|
|
isTableSection = true;
|
|
|
|
}
|
|
|
|
currentSection.append(tableContent).append("\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (currentSection.length() > 0) {
|
|
|
|
result.add(currentSection.toString().trim());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (ext.equals("doc")) {
|
|
|
|
try (HWPFDocument doc = new HWPFDocument(new FileInputStream(filePath))) {
|
|
|
|
Range range = doc.getRange();
|
|
|
|
StringBuilder currentSection = new StringBuilder();
|
|
|
|
boolean isTableSection = false;
|
|
|
|
|
|
|
|
for (int i = 0; i < range.numParagraphs(); i++) {
|
|
|
|
Paragraph para = range.getParagraph(i);
|
|
|
|
String text = cleanText(para.text());
|
|
|
|
fullContent.append(text).append("\n");
|
|
|
|
|
|
|
|
if (para.isInTable()) {
|
|
|
|
if (!isTableSection) {
|
|
|
|
if (currentSection.length() > 0) {
|
|
|
|
result.add(currentSection.toString().trim());
|
|
|
|
}
|
|
|
|
currentSection = new StringBuilder();
|
|
|
|
isTableSection = true;
|
|
|
|
}
|
|
|
|
currentSection.append(text).append("\n");
|
|
|
|
} else {
|
|
|
|
if (isTableSection) {
|
|
|
|
result.add(currentSection.toString().trim());
|
|
|
|
currentSection = new StringBuilder();
|
|
|
|
isTableSection = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (isHeading(para, range)) {
|
|
|
|
if (currentSection.length() > 0) {
|
|
|
|
result.add(currentSection.toString().trim());
|
|
|
|
}
|
|
|
|
currentSection = new StringBuilder(text).append("\n");
|
|
|
|
} else {
|
|
|
|
currentSection.append(text).append("\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (currentSection.length() > 0) {
|
|
|
|
result.add(currentSection.toString().trim());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (fullContent.length() < 1000) {
|
|
|
|
return Collections.singletonList(fileName + "\n" + fullContent.toString().trim());
|
|
|
|
}
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
private String extractTableContent(XWPFTable table) {
|
|
|
|
StringBuilder tableContent = new StringBuilder();
|
|
|
|
table.getRows().forEach(row -> {
|
|
|
|
row.getTableCells().forEach(cell -> {
|
|
|
|
tableContent.append("| ").append(cleanText(cell.getText())).append(" ");
|
|
|
|
});
|
|
|
|
tableContent.append("|\n");
|
|
|
|
});
|
|
|
|
return tableContent.toString();
|
|
|
|
}
|
|
|
|
|
|
|
|
private static boolean isHeading(Paragraph para, Range range) {
|
|
|
|
int styleIndex = para.getStyleIndex();
|
|
|
|
if (styleIndex >= 1 && styleIndex <= 9) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
try {
|
|
|
|
CharacterRun run = para.getCharacterRun(0);
|
|
|
|
if (run.isBold() || run.getFontSize() > 12) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
} catch (Exception e) {
|
|
|
|
log.warn("获取字符格式失败", e);
|
|
|
|
}
|
|
|
|
|
|
|
|
String text = para.text().trim();
|
|
|
|
return text.toUpperCase().equals(text) &&
|
|
|
|
text.length() < 100 &&
|
|
|
|
!text.contains(".") &&
|
|
|
|
!text.contains("\t");
|
|
|
|
}
|
|
|
|
|
|
|
|
private void saveSegmentsToDatabase(List<String> segments, String originalFileName, String storedFileName) {
|
|
|
|
if (segments.isEmpty()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
String fileNameWithoutExt = originalFileName.substring(0, originalFileName.lastIndexOf('.'));
|
|
|
|
String question = segments.size() == 1 ? fileNameWithoutExt : null;
|
|
|
|
|
|
|
|
for (String segment : segments) {
|
|
|
|
if (segment.trim().isEmpty()) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
QuestionEmbedding record = new QuestionEmbedding();
|
|
|
|
record.setId(UUID.randomUUID().toString());
|
|
|
|
|
|
|
|
if (question != null) {
|
|
|
|
record.setQuestion(question);
|
|
|
|
} else {
|
|
|
|
String firstLine = segment.lines().findFirst().orElse("未命名问题");
|
|
|
|
record.setQuestion(cleanText(firstLine));
|
|
|
|
}
|
|
|
|
|
|
|
|
record.setAnswer(segment.trim());
|
|
|
|
record.setText("");
|
|
|
|
record.setMetadata("{\"fileName\":\"" + storedFileName + "\"}");
|
|
|
|
|
|
|
|
Response<Embedding> embeddingResponse = aiModelUtils.getEmbedding("1925730210204721154", record.getQuestion());
|
|
|
|
record.setEmbedding(embeddingResponse.content().vector());
|
|
|
|
|
|
|
|
questionEmbeddingMapper.insert(record);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} |
|
|
\ No newline at end of file |
...
|
...
|
|