|
...
|
...
|
@@ -6,13 +6,9 @@ import org.apache.poi.hwpf.usermodel.Paragraph; |
|
|
|
import org.apache.poi.hwpf.usermodel.Range;
|
|
|
|
import dev.langchain4j.data.document.Document;
|
|
|
|
import dev.langchain4j.data.document.DocumentSplitter;
|
|
|
|
import dev.langchain4j.data.document.loader.FileSystemDocumentLoader;
|
|
|
|
import dev.langchain4j.data.document.parser.TextDocumentParser;
|
|
|
|
import dev.langchain4j.data.document.splitter.DocumentByParagraphSplitter;
|
|
|
|
import dev.langchain4j.data.document.splitter.DocumentSplitters;
|
|
|
|
import dev.langchain4j.data.embedding.Embedding;
|
|
|
|
import dev.langchain4j.data.segment.TextSegment;
|
|
|
|
import dev.langchain4j.model.embedding.EmbeddingModel;
|
|
|
|
import dev.langchain4j.model.output.Response;
|
|
|
|
import org.apache.commons.io.FilenameUtils;
|
|
|
|
import org.apache.poi.xwpf.usermodel.IBodyElement;
|
|
...
|
...
|
@@ -20,7 +16,6 @@ import org.apache.poi.xwpf.usermodel.XWPFDocument; |
|
|
|
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
|
|
|
import org.apache.poi.xwpf.usermodel.XWPFTable;
|
|
|
|
import org.jeecg.common.api.vo.Result;
|
|
|
|
import org.jeecg.common.util.CommonUtils;
|
|
|
|
import org.jeecg.modules.airag.app.entity.QuestionEmbedding;
|
|
|
|
import org.jeecg.modules.airag.app.mapper.QuestionEmbeddingMapper;
|
|
|
|
import org.jeecg.modules.airag.app.service.IQuestionEmbeddingService;
|
|
...
|
...
|
@@ -36,6 +31,7 @@ import org.springframework.web.multipart.MultipartFile; |
|
|
|
import java.io.File;
|
|
|
|
import java.io.FileInputStream;
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.nio.charset.Charset;
|
|
|
|
import java.nio.file.Files;
|
|
|
|
import java.nio.file.Path;
|
|
|
|
import java.nio.file.Paths;
|
|
...
|
...
|
@@ -65,6 +61,7 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
|
|
|
|
|
|
private static final Set<String> ALLOWED_EXTENSIONS = Set.of("txt", "doc", "docx");
|
|
|
|
private static final Pattern SPECIAL_CHARS_PATTERN = Pattern.compile("[^a-zA-Z0-9\\u4e00-\\u9fa5\\s]");
|
|
|
|
private static final Pattern UUID_PATTERN = Pattern.compile("_[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}");
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public List<QuestionEmbedding> findAll() {
|
|
...
|
...
|
@@ -101,7 +98,8 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
|
|
return questionEmbeddingMapper.similaritySearch(vector, limit);
|
|
|
|
}
|
|
|
|
|
|
|
|
public Result<?> processZipUpload(MultipartFile zipFile) {
|
|
|
|
@Override
|
|
|
|
public Result<?> processZipUpload(MultipartFile zipFile, String knowledgeId) {
|
|
|
|
try {
|
|
|
|
Path tempDir = Files.createTempDirectory("zip_upload_");
|
|
|
|
List<Path> validFiles = extractAndFilterZip(zipFile, tempDir);
|
|
...
|
...
|
@@ -111,7 +109,7 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
|
|
}
|
|
|
|
|
|
|
|
for (Path filePath : validFiles) {
|
|
|
|
processSingleFile(filePath);
|
|
|
|
processSingleFile(filePath, knowledgeId);
|
|
|
|
}
|
|
|
|
|
|
|
|
return Result.OK("文件上传和处理成功");
|
|
...
|
...
|
@@ -124,7 +122,7 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
|
|
private List<Path> extractAndFilterZip(MultipartFile zipFile, Path tempDir) throws IOException {
|
|
|
|
List<Path> validFiles = new ArrayList<>();
|
|
|
|
|
|
|
|
try (ZipInputStream zipIn = new ZipInputStream(zipFile.getInputStream())) {
|
|
|
|
try (ZipInputStream zipIn = new ZipInputStream(zipFile.getInputStream(), Charset.forName("GBK"))) {
|
|
|
|
ZipEntry entry;
|
|
|
|
while ((entry = zipIn.getNextEntry()) != null) {
|
|
|
|
if (!entry.isDirectory()) {
|
|
...
|
...
|
@@ -144,11 +142,13 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
|
|
return validFiles;
|
|
|
|
}
|
|
|
|
|
|
|
|
private void processSingleFile(Path filePath) throws Exception {
|
|
|
|
private void processSingleFile(Path filePath, String knowledgeId) throws Exception {
|
|
|
|
String originalFileName = filePath.getFileName().toString();
|
|
|
|
String fileExt = FilenameUtils.getExtension(originalFileName);
|
|
|
|
String newFileName = FilenameUtils.removeExtension(originalFileName) + "_" + UUID.randomUUID() + "." + fileExt;
|
|
|
|
Path targetPath = Paths.get(uploadPath, newFileName);
|
|
|
|
|
|
|
|
// 生成带UUID的文件名用于存储
|
|
|
|
String storedFileName = generateStoredFileName(originalFileName);
|
|
|
|
Path targetPath = Paths.get(uploadPath, storedFileName);
|
|
|
|
Files.move(filePath, targetPath, StandardCopyOption.REPLACE_EXISTING);
|
|
|
|
|
|
|
|
List<String> segments;
|
|
...
|
...
|
@@ -160,7 +160,13 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
|
|
segments = splitWordDocument(targetPath.toString());
|
|
|
|
}
|
|
|
|
|
|
|
|
saveSegmentsToDatabase(segments, originalFileName, newFileName);
|
|
|
|
saveSegmentsToDatabase(segments, originalFileName, storedFileName, knowledgeId);
|
|
|
|
}
|
|
|
|
|
|
|
|
private String generateStoredFileName(String originalFileName) {
|
|
|
|
String baseName = FilenameUtils.removeExtension(originalFileName);
|
|
|
|
String ext = FilenameUtils.getExtension(originalFileName);
|
|
|
|
return baseName + "_" + UUID.randomUUID() + "." + ext;
|
|
|
|
}
|
|
|
|
|
|
|
|
private String readFileContent(Path filePath) throws IOException {
|
|
...
|
...
|
@@ -168,10 +174,44 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
|
|
}
|
|
|
|
|
|
|
|
private String cleanText(String text) {
|
|
|
|
text = SPECIAL_CHARS_PATTERN.matcher(text).replaceAll("");
|
|
|
|
// 保留基本的标点符号,包括 . : - 等
|
|
|
|
Pattern preservedCharsPattern = Pattern.compile("[^a-zA-Z0-9\\u4e00-\\u9fa5\\s.,:、,:;。;-]");
|
|
|
|
text = preservedCharsPattern.matcher(text).replaceAll("");
|
|
|
|
|
|
|
|
// 将多个换行符缩减为一个换行符
|
|
|
|
text = text.replaceAll("(\r?\n){2,}", "\n");
|
|
|
|
|
|
|
|
// 处理其他空白字符
|
|
|
|
return text.replaceAll("\\s+", " ").trim();
|
|
|
|
}
|
|
|
|
|
|
|
|
// 修改isHeading方法中的判断条件,不再排除包含.的文本
|
|
|
|
private static boolean isHeading(Paragraph para, Range range) {
|
|
|
|
int styleIndex = para.getStyleIndex();
|
|
|
|
if (styleIndex >= 1 && styleIndex <= 9) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
try {
|
|
|
|
CharacterRun run = para.getCharacterRun(0);
|
|
|
|
if (run.isBold() || run.getFontSize() > 12) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
} catch (Exception e) {
|
|
|
|
log.warn("获取字符格式失败", e);
|
|
|
|
}
|
|
|
|
|
|
|
|
String text = para.text().trim();
|
|
|
|
return text.toUpperCase().equals(text) &&
|
|
|
|
text.length() < 100 &&
|
|
|
|
!text.contains("\t"); // 移除了 !text.contains(".") 的判断
|
|
|
|
}
|
|
|
|
|
|
|
|
private String removeUuidSuffix(String fileName) {
|
|
|
|
// 移除UUID后缀部分
|
|
|
|
return UUID_PATTERN.matcher(fileName).replaceFirst("");
|
|
|
|
}
|
|
|
|
|
|
|
|
private List<String> splitTxtDocument(String content) {
|
|
|
|
DocumentSplitter splitter = new DocumentByParagraphSplitter(1000, 200);
|
|
|
|
Document document = Document.from(content);
|
|
...
|
...
|
@@ -185,8 +225,10 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
|
|
List<String> result = new ArrayList<>();
|
|
|
|
String ext = FilenameUtils.getExtension(filePath).toLowerCase();
|
|
|
|
StringBuilder fullContent = new StringBuilder();
|
|
|
|
String fileName = new File(filePath).getName();
|
|
|
|
fileName = fileName.substring(0, fileName.lastIndexOf('.')); // 去掉后缀
|
|
|
|
|
|
|
|
// 获取无UUID的文件名用于显示
|
|
|
|
String displayFileName = removeUuidSuffix(new File(filePath).getName());
|
|
|
|
displayFileName = FilenameUtils.removeExtension(displayFileName);
|
|
|
|
|
|
|
|
if (ext.equals("docx")) {
|
|
|
|
try (XWPFDocument doc = new XWPFDocument(new FileInputStream(filePath))) {
|
|
...
|
...
|
@@ -207,7 +249,7 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
|
|
|
|
|
|
String style = para.getStyle();
|
|
|
|
if (style != null && style.matches("Heading\\d")) {
|
|
|
|
if (currentSection.length() > 0) {
|
|
|
|
if (!currentSection.isEmpty()) {
|
|
|
|
result.add(currentSection.toString().trim());
|
|
|
|
}
|
|
|
|
currentSection = new StringBuilder(text).append("\n");
|
|
...
|
...
|
@@ -219,7 +261,7 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
|
|
fullContent.append(tableContent).append("\n");
|
|
|
|
|
|
|
|
if (!isTableSection) {
|
|
|
|
if (currentSection.length() > 0) {
|
|
|
|
if (!currentSection.isEmpty()) {
|
|
|
|
result.add(currentSection.toString().trim());
|
|
|
|
}
|
|
|
|
currentSection = new StringBuilder();
|
|
...
|
...
|
@@ -229,7 +271,7 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (currentSection.length() > 0) {
|
|
|
|
if (!currentSection.isEmpty()) {
|
|
|
|
result.add(currentSection.toString().trim());
|
|
|
|
}
|
|
|
|
}
|
|
...
|
...
|
@@ -246,7 +288,7 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
|
|
|
|
|
|
if (para.isInTable()) {
|
|
|
|
if (!isTableSection) {
|
|
|
|
if (currentSection.length() > 0) {
|
|
|
|
if (!currentSection.isEmpty()) {
|
|
|
|
result.add(currentSection.toString().trim());
|
|
|
|
}
|
|
|
|
currentSection = new StringBuilder();
|
|
...
|
...
|
@@ -261,7 +303,7 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
|
|
}
|
|
|
|
|
|
|
|
if (isHeading(para, range)) {
|
|
|
|
if (currentSection.length() > 0) {
|
|
|
|
if (!currentSection.isEmpty()) {
|
|
|
|
result.add(currentSection.toString().trim());
|
|
|
|
}
|
|
|
|
currentSection = new StringBuilder(text).append("\n");
|
|
...
|
...
|
@@ -271,59 +313,49 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (currentSection.length() > 0) {
|
|
|
|
if (!currentSection.isEmpty()) {
|
|
|
|
result.add(currentSection.toString().trim());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (fullContent.length() < 1000) {
|
|
|
|
return Collections.singletonList(fileName + "\n" + fullContent.toString().trim());
|
|
|
|
return Collections.singletonList(displayFileName + "\n" + fullContent.toString().trim());
|
|
|
|
}
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
private String extractTableContent(XWPFTable table) {
|
|
|
|
StringBuilder tableContent = new StringBuilder();
|
|
|
|
StringBuilder tableContent = new StringBuilder("\n"); // 表格前加换行
|
|
|
|
table.getRows().forEach(row -> {
|
|
|
|
row.getTableCells().forEach(cell -> {
|
|
|
|
tableContent.append("| ").append(cleanText(cell.getText())).append(" ");
|
|
|
|
// 处理单元格内容中的多个换行
|
|
|
|
String cellText = cleanText(cell.getText()).replaceAll("(\r?\n){2,}", "\n");
|
|
|
|
tableContent.append("| ").append(cellText).append(" ");
|
|
|
|
});
|
|
|
|
tableContent.append("|\n");
|
|
|
|
});
|
|
|
|
return tableContent.toString();
|
|
|
|
}
|
|
|
|
|
|
|
|
private static boolean isHeading(Paragraph para, Range range) {
|
|
|
|
int styleIndex = para.getStyleIndex();
|
|
|
|
if (styleIndex >= 1 && styleIndex <= 9) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
try {
|
|
|
|
CharacterRun run = para.getCharacterRun(0);
|
|
|
|
if (run.isBold() || run.getFontSize() > 12) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
} catch (Exception e) {
|
|
|
|
log.warn("获取字符格式失败", e);
|
|
|
|
}
|
|
|
|
|
|
|
|
String text = para.text().trim();
|
|
|
|
return text.toUpperCase().equals(text) &&
|
|
|
|
text.length() < 100 &&
|
|
|
|
!text.contains(".") &&
|
|
|
|
!text.contains("\t");
|
|
|
|
}
|
|
|
|
|
|
|
|
private void saveSegmentsToDatabase(List<String> segments, String originalFileName, String storedFileName) {
|
|
|
|
private void saveSegmentsToDatabase(List<String> segments, String originalFileName, String storedFileName, String knowledgeId) {
|
|
|
|
if (segments.isEmpty()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
String fileNameWithoutExt = originalFileName.substring(0, originalFileName.lastIndexOf('.'));
|
|
|
|
String question = segments.size() == 1 ? fileNameWithoutExt : null;
|
|
|
|
// 从存储文件名中提取UUID部分
|
|
|
|
String uuid = storedFileName.substring(
|
|
|
|
storedFileName.lastIndexOf('_') + 1,
|
|
|
|
storedFileName.lastIndexOf('.')
|
|
|
|
);
|
|
|
|
|
|
|
|
// 获取无UUID和扩展名的文件名用于显示
|
|
|
|
String displayFileName = removeUuidSuffix(originalFileName);
|
|
|
|
displayFileName = FilenameUtils.removeExtension(displayFileName);
|
|
|
|
|
|
|
|
// 判断是否是单一段落
|
|
|
|
boolean isSingleSegment = segments.size() == 1;
|
|
|
|
|
|
|
|
for (String segment : segments) {
|
|
|
|
if (segment.trim().isEmpty()) {
|
|
...
|
...
|
@@ -333,20 +365,26 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
|
|
QuestionEmbedding record = new QuestionEmbedding();
|
|
|
|
record.setId(UUID.randomUUID().toString());
|
|
|
|
|
|
|
|
if (question != null) {
|
|
|
|
record.setQuestion(question);
|
|
|
|
if (isSingleSegment) {
|
|
|
|
record.setQuestion(displayFileName);
|
|
|
|
record.setAnswer(segment.trim());
|
|
|
|
} else {
|
|
|
|
String firstLine = segment.lines().findFirst().orElse("未命名问题");
|
|
|
|
record.setQuestion(cleanText(firstLine));
|
|
|
|
String firstLine = segment.lines().findFirst().orElse("");
|
|
|
|
record.setQuestion(displayFileName + ": " + cleanText(firstLine));
|
|
|
|
record.setAnswer(segment.trim());
|
|
|
|
}
|
|
|
|
|
|
|
|
record.setAnswer(segment.trim());
|
|
|
|
record.setText("");
|
|
|
|
record.setMetadata("{\"fileName\":\"" + storedFileName + "\"}");
|
|
|
|
record.setMetadata(String.format(
|
|
|
|
"{\"docId\":\"%s\",\"docName\":\"%s\",\"knowledgeId\":\"%s\"}",
|
|
|
|
uuid,
|
|
|
|
originalFileName,
|
|
|
|
knowledgeId
|
|
|
|
));
|
|
|
|
|
|
|
|
Response<Embedding> embeddingResponse = aiModelUtils.getEmbedding("1925730210204721154", record.getQuestion());
|
|
|
|
record.setEmbedding(embeddingResponse.content().vector());
|
|
|
|
|
|
|
|
record.setKnowledgeId(knowledgeId);
|
|
|
|
questionEmbeddingMapper.insert(record);
|
|
|
|
}
|
|
|
|
}
|
...
|
...
|
|