|
@@ -13,10 +13,7 @@ import dev.langchain4j.data.embedding.Embedding; |
|
@@ -13,10 +13,7 @@ import dev.langchain4j.data.embedding.Embedding; |
|
13
|
import dev.langchain4j.data.segment.TextSegment;
|
13
|
import dev.langchain4j.data.segment.TextSegment;
|
|
14
|
import dev.langchain4j.model.output.Response;
|
14
|
import dev.langchain4j.model.output.Response;
|
|
15
|
import org.apache.commons.io.FilenameUtils;
|
15
|
import org.apache.commons.io.FilenameUtils;
|
|
16
|
-import org.apache.poi.xwpf.usermodel.IBodyElement;
|
|
|
|
17
|
-import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
|
|
|
18
|
-import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
|
|
|
19
|
-import org.apache.poi.xwpf.usermodel.XWPFTable;
|
16
|
+import org.apache.poi.xwpf.usermodel.*;
|
|
20
|
import org.jeecg.common.api.vo.Result;
|
17
|
import org.jeecg.common.api.vo.Result;
|
|
21
|
import org.jeecg.modules.airag.app.entity.QuestionEmbedding;
|
18
|
import org.jeecg.modules.airag.app.entity.QuestionEmbedding;
|
|
22
|
import org.jeecg.modules.airag.app.mapper.QuestionEmbeddingMapper;
|
19
|
import org.jeecg.modules.airag.app.mapper.QuestionEmbeddingMapper;
|
|
@@ -67,12 +64,14 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
@@ -67,12 +64,14 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
67
|
|
64
|
|
|
68
|
@Value("${jeecg.upload.path}")
|
65
|
@Value("${jeecg.upload.path}")
|
|
69
|
private String uploadPath;
|
66
|
private String uploadPath;
|
|
|
|
67
|
+ @Value("${jeecg.ai-chat.embedId}")
|
|
|
|
68
|
+ private String embedId;
|
|
70
|
|
69
|
|
|
71
|
private static final Set<String> ALLOWED_EXTENSIONS = Set.of("txt", "doc", "docx");
|
70
|
private static final Set<String> ALLOWED_EXTENSIONS = Set.of("txt", "doc", "docx");
|
|
72
|
private static final Pattern SPECIAL_CHARS_PATTERN = Pattern.compile("[^a-zA-Z0-9\\u4e00-\\u9fa5\\s]");
|
71
|
private static final Pattern SPECIAL_CHARS_PATTERN = Pattern.compile("[^a-zA-Z0-9\\u4e00-\\u9fa5\\s]");
|
|
73
|
private static final Pattern UUID_PATTERN = Pattern.compile("_[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}");
|
72
|
private static final Pattern UUID_PATTERN = Pattern.compile("_[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}");
|
|
74
|
|
73
|
|
|
75
|
- // 新增:数据库连接配置
|
74
|
+ // 数据库连接配置
|
|
76
|
private static final String DB_URL = "jdbc:postgresql://192.168.100.104:5432/postgres";
|
75
|
private static final String DB_URL = "jdbc:postgresql://192.168.100.104:5432/postgres";
|
|
77
|
private static final String DB_USER = "postgres";
|
76
|
private static final String DB_USER = "postgres";
|
|
78
|
private static final String DB_PASSWORD = "postgres";
|
77
|
private static final String DB_PASSWORD = "postgres";
|
|
@@ -188,56 +187,59 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
@@ -188,56 +187,59 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
188
|
saveSegmentsToDatabase(segments, originalFileName, storedFileName, knowledgeId);
|
187
|
saveSegmentsToDatabase(segments, originalFileName, storedFileName, knowledgeId);
|
|
189
|
|
188
|
|
|
190
|
// 新增逻辑:同时保存到embeddings表
|
189
|
// 新增逻辑:同时保存到embeddings表
|
|
191
|
- saveToEmbeddingsTable(segments, originalFileName, knowledgeId);
|
|
|
|
192
|
- }
|
190
|
+ saveToEmbeddingsTable(segments, originalFileName, storedFileName, knowledgeId);
|
|
193
|
|
191
|
|
|
194
|
- // 新增方法:将内容保存到embeddings表
|
|
|
|
195
|
- private void saveToEmbeddingsTable(List<String> segments, String originalFileName, String knowledgeId) {
|
|
|
|
196
|
- if (segments.isEmpty()) {
|
|
|
|
197
|
- return;
|
|
|
|
198
|
- }
|
192
|
+ }
|
|
|
|
193
|
+ private void saveToEmbeddingsTable(List<String> segments, String originalFileName, String storedFileName, String knowledgeId) {
|
|
|
|
194
|
+ if (segments.isEmpty()) return;
|
|
199
|
|
195
|
|
|
200
|
- // 获取无UUID和扩展名的文件名用于显示
|
|
|
|
201
|
String displayFileName = removeUuidSuffix(originalFileName);
|
196
|
String displayFileName = removeUuidSuffix(originalFileName);
|
|
202
|
displayFileName = FilenameUtils.removeExtension(displayFileName);
|
197
|
displayFileName = FilenameUtils.removeExtension(displayFileName);
|
|
203
|
|
198
|
|
|
204
|
- // 为整个文档生成一个唯一的docId
|
|
|
|
205
|
- String docId = UUID.randomUUID().toString();
|
|
|
|
206
|
-
|
|
|
|
207
|
- // 合并所有段落作为完整内容
|
|
|
|
208
|
- String fullContent = String.join("\n\n", segments);
|
|
|
|
209
|
-
|
|
|
|
210
|
try (Connection conn = getConnection()) {
|
199
|
try (Connection conn = getConnection()) {
|
|
211
|
- // 准备元数据
|
|
|
|
212
|
- Map<String, Object> metadata = new HashMap<>();
|
|
|
|
213
|
- metadata.put("docId", docId);
|
|
|
|
214
|
- metadata.put("docName", originalFileName);
|
|
|
|
215
|
- metadata.put("knowledgeId", knowledgeId);
|
|
|
|
216
|
-
|
|
|
|
217
|
- // 获取文本的向量表示
|
|
|
|
218
|
- Response<Embedding> embeddingResponse = aiModelUtils.getEmbedding("1925730210204721154", displayFileName + ": " + fullContent);
|
|
|
|
219
|
- float[] embeddingVector = embeddingResponse.content().vector();
|
|
|
|
220
|
-
|
|
|
|
221
|
- // 插入到embeddings表
|
|
|
|
222
|
- String sql = "INSERT INTO embeddings (embedding_id, embedding, text, metadata) VALUES (?, ?, ?, ?::jsonb)";
|
|
|
|
223
|
- try (PreparedStatement stmt = conn.prepareStatement(sql)) {
|
|
|
|
224
|
- stmt.setString(1, UUID.randomUUID().toString());
|
|
|
|
225
|
- stmt.setObject(2, new PGvector(embeddingVector));
|
|
|
|
226
|
- stmt.setString(3, fullContent);
|
|
|
|
227
|
-
|
|
|
|
228
|
- PGobject jsonObject = new PGobject();
|
|
|
|
229
|
- jsonObject.setType("json");
|
|
|
|
230
|
- jsonObject.setValue(new ObjectMapper().writeValueAsString(metadata));
|
|
|
|
231
|
- stmt.setObject(4, jsonObject);
|
|
|
|
232
|
-
|
|
|
|
233
|
- stmt.executeUpdate();
|
200
|
+ for (String segment : segments) {
|
|
|
|
201
|
+ if (segment.trim().isEmpty()) continue;
|
|
|
|
202
|
+
|
|
|
|
203
|
+ // 回答内容是整个段落
|
|
|
|
204
|
+ String[] parts = segment.split("\\r?\\n", 2);
|
|
|
|
205
|
+ if (parts.length < 2) continue;
|
|
|
|
206
|
+
|
|
|
|
207
|
+ String titlePath = parts[0].trim();
|
|
|
|
208
|
+ String answer = segment.trim(); // 整个回答段(含标题 + 内容)
|
|
|
|
209
|
+
|
|
|
|
210
|
+ // 获取 embedding
|
|
|
|
211
|
+ Response<Embedding> embeddingResponse = aiModelUtils.getEmbedding(embedId, answer);
|
|
|
|
212
|
+ float[] embeddingVector = embeddingResponse.content().vector();
|
|
|
|
213
|
+
|
|
|
|
214
|
+ // 准备 metadata
|
|
|
|
215
|
+ Map<String, Object> metadata = new HashMap<>();
|
|
|
|
216
|
+ metadata.put("docName", originalFileName);
|
|
|
|
217
|
+ metadata.put("storedFileName", storedFileName);
|
|
|
|
218
|
+ metadata.put("knowledgeId", knowledgeId);
|
|
|
|
219
|
+ metadata.put("title", displayFileName + ": " + titlePath);
|
|
|
|
220
|
+
|
|
|
|
221
|
+ // 插入
|
|
|
|
222
|
+ String sql = "INSERT INTO embeddings (embedding_id, embedding, text, metadata) VALUES (?, ?, ?, ?::jsonb)";
|
|
|
|
223
|
+ try (PreparedStatement stmt = conn.prepareStatement(sql)) {
|
|
|
|
224
|
+ stmt.setString(1, UUID.randomUUID().toString());
|
|
|
|
225
|
+ stmt.setObject(2, new PGvector(embeddingVector));
|
|
|
|
226
|
+ stmt.setString(3, answer);
|
|
|
|
227
|
+
|
|
|
|
228
|
+ PGobject jsonObject = new PGobject();
|
|
|
|
229
|
+ jsonObject.setType("json");
|
|
|
|
230
|
+ jsonObject.setValue(new ObjectMapper().writeValueAsString(metadata));
|
|
|
|
231
|
+ stmt.setObject(4, jsonObject);
|
|
|
|
232
|
+
|
|
|
|
233
|
+ stmt.executeUpdate();
|
|
|
|
234
|
+ }
|
|
234
|
}
|
235
|
}
|
|
235
|
} catch (Exception e) {
|
236
|
} catch (Exception e) {
|
|
236
|
- log.error("保存到embeddings表失败", e);
|
237
|
+ log.error("保存分段到embeddings表失败", e);
|
|
237
|
}
|
238
|
}
|
|
238
|
}
|
239
|
}
|
|
239
|
|
240
|
|
|
240
|
- // 新增方法:获取数据库连接
|
241
|
+
|
|
|
|
242
|
+ // 获取数据库连接
|
|
241
|
private Connection getConnection() throws SQLException {
|
243
|
private Connection getConnection() throws SQLException {
|
|
242
|
return DriverManager.getConnection(DB_URL, DB_USER, DB_PASSWORD);
|
244
|
return DriverManager.getConnection(DB_URL, DB_USER, DB_PASSWORD);
|
|
243
|
}
|
245
|
}
|
|
@@ -253,8 +255,8 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
@@ -253,8 +255,8 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
253
|
}
|
255
|
}
|
|
254
|
|
256
|
|
|
255
|
private String cleanText(String text) {
|
257
|
private String cleanText(String text) {
|
|
256
|
- // 保留基本的标点符号,包括 . : - 等
|
|
|
|
257
|
- Pattern preservedCharsPattern = Pattern.compile("[^a-zA-Z0-9\\u4e00-\\u9fa5\\s.,:、,:;。;-]");
|
258
|
+ // 保留基本的标点符号
|
|
|
|
259
|
+ Pattern preservedCharsPattern = Pattern.compile("[^a-zA-Z0-9\\u4e00-\\u9fa5\\s.,:、,:;。;#;-]");
|
|
258
|
text = preservedCharsPattern.matcher(text).replaceAll("");
|
260
|
text = preservedCharsPattern.matcher(text).replaceAll("");
|
|
259
|
|
261
|
|
|
260
|
// 将多个换行符缩减为一个换行符
|
262
|
// 将多个换行符缩减为一个换行符
|
|
@@ -299,182 +301,183 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
@@ -299,182 +301,183 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
299
|
.collect(Collectors.toList());
|
301
|
.collect(Collectors.toList());
|
|
300
|
}
|
302
|
}
|
|
301
|
|
303
|
|
|
302
|
- public List<String> splitWordDocument(String filePath) throws Exception {
|
|
|
|
303
|
- List<String> result = new ArrayList<>();
|
|
|
|
304
|
- String ext = FilenameUtils.getExtension(filePath).toLowerCase();
|
|
|
|
305
|
- StringBuilder fullContent = new StringBuilder();
|
|
|
|
306
|
|
304
|
|
|
307
|
- // 获取无UUID的文件名用于显示
|
|
|
|
308
|
- String displayFileName = removeUuidSuffix(new File(filePath).getName());
|
|
|
|
309
|
- displayFileName = FilenameUtils.removeExtension(displayFileName);
|
305
|
+ // 后备分割方案:按段落结构分割
|
|
|
|
306
|
+ private List<String> splitByContentStructure(XWPFDocument doc) {
|
|
|
|
307
|
+ List<String> segments = new ArrayList<>();
|
|
|
|
308
|
+ StringBuilder currentSegment = new StringBuilder();
|
|
|
|
309
|
+ final int MAX_SEGMENT_LENGTH = 1000; // 最大分段长度
|
|
310
|
|
310
|
|
|
311
|
- if (ext.equals("docx")) {
|
|
|
|
312
|
- try (XWPFDocument doc = new XWPFDocument(new FileInputStream(filePath))) {
|
|
|
|
313
|
- StringBuilder currentSection = new StringBuilder();
|
|
|
|
314
|
- boolean isTableSection = false;
|
|
|
|
315
|
-
|
|
|
|
316
|
- for (IBodyElement element : doc.getBodyElements()) {
|
|
|
|
317
|
- if (element instanceof XWPFParagraph) {
|
|
|
|
318
|
- XWPFParagraph para = (XWPFParagraph) element;
|
|
|
|
319
|
- String text = cleanText(para.getText());
|
|
|
|
320
|
- fullContent.append(text).append("\n");
|
|
|
|
321
|
-
|
|
|
|
322
|
- if (isTableSection) {
|
|
|
|
323
|
- result.add(currentSection.toString().trim());
|
|
|
|
324
|
- currentSection = new StringBuilder();
|
|
|
|
325
|
- isTableSection = false;
|
|
|
|
326
|
- }
|
311
|
+ for (IBodyElement element : doc.getBodyElements()) {
|
|
|
|
312
|
+ String text = "";
|
|
|
|
313
|
+ if (element instanceof XWPFParagraph) {
|
|
|
|
314
|
+ text = ((XWPFParagraph) element).getText().trim();
|
|
|
|
315
|
+ } else if (element instanceof XWPFTable) {
|
|
|
|
316
|
+ text = extractTableContent((XWPFTable) element);
|
|
|
|
317
|
+ }
|
|
327
|
|
318
|
|
|
328
|
- String style = para.getStyle();
|
|
|
|
329
|
- if (style != null && style.matches("Heading\\d")) {
|
|
|
|
330
|
- if (!currentSection.isEmpty()) {
|
|
|
|
331
|
- result.add(currentSection.toString().trim());
|
|
|
|
332
|
- }
|
|
|
|
333
|
- currentSection = new StringBuilder(text).append("\n");
|
|
|
|
334
|
- } else {
|
|
|
|
335
|
- currentSection.append(text).append("\n");
|
|
|
|
336
|
- }
|
|
|
|
337
|
- } else if (element instanceof XWPFTable) {
|
|
|
|
338
|
- String tableContent = extractTableContent((XWPFTable) element);
|
|
|
|
339
|
- fullContent.append(tableContent).append("\n");
|
|
|
|
340
|
-
|
|
|
|
341
|
- if (!isTableSection) {
|
|
|
|
342
|
- if (!currentSection.isEmpty()) {
|
|
|
|
343
|
- result.add(currentSection.toString().trim());
|
|
|
|
344
|
- }
|
|
|
|
345
|
- currentSection = new StringBuilder();
|
|
|
|
346
|
- isTableSection = true;
|
|
|
|
347
|
- }
|
|
|
|
348
|
- currentSection.append(tableContent).append("\n");
|
|
|
|
349
|
- }
|
|
|
|
350
|
- }
|
319
|
+ if (text.isEmpty()) continue;
|
|
351
|
|
320
|
|
|
352
|
- if (!currentSection.isEmpty()) {
|
|
|
|
353
|
- result.add(currentSection.toString().trim());
|
|
|
|
354
|
- }
|
321
|
+ // 当遇到空行或达到最大长度时分段
|
|
|
|
322
|
+ if (currentSegment.length() + text.length() > MAX_SEGMENT_LENGTH
|
|
|
|
323
|
+ && currentSegment.length() > 0) {
|
|
|
|
324
|
+ segments.add(currentSegment.toString().trim());
|
|
|
|
325
|
+ currentSegment = new StringBuilder();
|
|
355
|
}
|
326
|
}
|
|
356
|
- } else if (ext.equals("doc")) {
|
|
|
|
357
|
- try (HWPFDocument doc = new HWPFDocument(new FileInputStream(filePath))) {
|
|
|
|
358
|
- Range range = doc.getRange();
|
|
|
|
359
|
- StringBuilder currentSection = new StringBuilder();
|
|
|
|
360
|
- boolean isTableSection = false;
|
|
|
|
361
|
-
|
|
|
|
362
|
- for (int i = 0; i < range.numParagraphs(); i++) {
|
|
|
|
363
|
- Paragraph para = range.getParagraph(i);
|
|
|
|
364
|
- String text = cleanText(para.text());
|
|
|
|
365
|
- fullContent.append(text).append("\n");
|
|
|
|
366
|
-
|
|
|
|
367
|
- if (para.isInTable()) {
|
|
|
|
368
|
- if (!isTableSection) {
|
|
|
|
369
|
- if (!currentSection.isEmpty()) {
|
|
|
|
370
|
- result.add(currentSection.toString().trim());
|
|
|
|
371
|
- }
|
|
|
|
372
|
- currentSection = new StringBuilder();
|
|
|
|
373
|
- isTableSection = true;
|
|
|
|
374
|
- }
|
|
|
|
375
|
- currentSection.append(text).append("\n");
|
|
|
|
376
|
- } else {
|
|
|
|
377
|
- if (isTableSection) {
|
|
|
|
378
|
- result.add(currentSection.toString().trim());
|
|
|
|
379
|
- currentSection = new StringBuilder();
|
|
|
|
380
|
- isTableSection = false;
|
|
|
|
381
|
- }
|
|
|
|
382
|
|
327
|
|
|
383
|
- if (isHeading(para, range)) {
|
|
|
|
384
|
- if (!currentSection.isEmpty()) {
|
|
|
|
385
|
- result.add(currentSection.toString().trim());
|
|
|
|
386
|
- }
|
|
|
|
387
|
- currentSection = new StringBuilder(text).append("\n");
|
|
|
|
388
|
- } else {
|
|
|
|
389
|
- currentSection.append(text).append("\n");
|
|
|
|
390
|
- }
|
|
|
|
391
|
- }
|
|
|
|
392
|
- }
|
328
|
+ currentSegment.append(text).append("\n\n");
|
|
|
|
329
|
+ }
|
|
|
|
330
|
+
|
|
|
|
331
|
+ if (currentSegment.length() > 0) {
|
|
|
|
332
|
+ segments.add(currentSegment.toString().trim());
|
|
|
|
333
|
+ }
|
|
|
|
334
|
+ return segments;
|
|
|
|
335
|
+ }
|
|
393
|
|
336
|
|
|
394
|
- if (!currentSection.isEmpty()) {
|
|
|
|
395
|
- result.add(currentSection.toString().trim());
|
337
|
+ // 按标题分割文本
|
|
|
|
338
|
+ private List<String> splitByHeadings(String content) {
|
|
|
|
339
|
+ List<String> segments = new ArrayList<>();
|
|
|
|
340
|
+ StringBuilder currentSegment = new StringBuilder();
|
|
|
|
341
|
+ String[] lines = content.split("\\r?\\n");
|
|
|
|
342
|
+
|
|
|
|
343
|
+ for (String line : lines) {
|
|
|
|
344
|
+ // 检测标题行(以1-6个#开头,后面跟着空格)
|
|
|
|
345
|
+ if (line.trim().matches("^#{1,6}\\s+.*")) {
|
|
|
|
346
|
+ // 保存当前分段
|
|
|
|
347
|
+ if (!currentSegment.isEmpty()) {
|
|
|
|
348
|
+ segments.add(currentSegment.toString().trim());
|
|
|
|
349
|
+ currentSegment = new StringBuilder();
|
|
396
|
}
|
350
|
}
|
|
397
|
}
|
351
|
}
|
|
|
|
352
|
+ currentSegment.append(line).append("\n");
|
|
398
|
}
|
353
|
}
|
|
399
|
|
354
|
|
|
400
|
- if (fullContent.length() < 1000) {
|
|
|
|
401
|
- return Collections.singletonList(displayFileName + "\n" + fullContent.toString().trim());
|
355
|
+ // 添加最后一个分段
|
|
|
|
356
|
+ if (!currentSegment.isEmpty()) {
|
|
|
|
357
|
+ segments.add(currentSegment.toString().trim());
|
|
402
|
}
|
358
|
}
|
|
403
|
|
359
|
|
|
404
|
- return result;
|
360
|
+ return segments;
|
|
405
|
}
|
361
|
}
|
|
406
|
|
362
|
|
|
407
|
private String extractTableContent(XWPFTable table) {
|
363
|
private String extractTableContent(XWPFTable table) {
|
|
408
|
- StringBuilder tableContent = new StringBuilder("\n"); // 表格前加换行
|
364
|
+ StringBuilder tableContent = new StringBuilder();
|
|
409
|
table.getRows().forEach(row -> {
|
365
|
table.getRows().forEach(row -> {
|
|
|
|
366
|
+ StringBuilder rowContent = new StringBuilder("| ");
|
|
410
|
row.getTableCells().forEach(cell -> {
|
367
|
row.getTableCells().forEach(cell -> {
|
|
411
|
- // 处理单元格内容中的多个换行
|
|
|
|
412
|
- String cellText = cleanText(cell.getText()).replaceAll("(\r?\n){2,}", "\n");
|
|
|
|
413
|
- tableContent.append("| ").append(cellText).append(" ");
|
368
|
+ String cellText = cell.getText().replaceAll("(\r?\n){2,}", " ");
|
|
|
|
369
|
+ rowContent.append(cellText).append(" | ");
|
|
414
|
});
|
370
|
});
|
|
415
|
- tableContent.append("|\n");
|
371
|
+ tableContent.append(rowContent.toString().trim()).append("\n");
|
|
416
|
});
|
372
|
});
|
|
417
|
return tableContent.toString();
|
373
|
return tableContent.toString();
|
|
418
|
}
|
374
|
}
|
|
|
|
375
|
+ public List<String> splitWordDocument(String filePath) throws Exception {
|
|
|
|
376
|
+ List<String> result = new ArrayList<>();
|
|
|
|
377
|
+ String ext = FilenameUtils.getExtension(filePath).toLowerCase();
|
|
419
|
|
378
|
|
|
|
|
379
|
+ if (!ext.equals("docx")) return result;
|
|
|
|
380
|
+
|
|
|
|
381
|
+ try (XWPFDocument doc = new XWPFDocument(new FileInputStream(filePath))) {
|
|
|
|
382
|
+ StringBuilder currentContent = new StringBuilder();
|
|
|
|
383
|
+ List<String> titlePath = new ArrayList<>();
|
|
|
|
384
|
+ String lastOutput = null;
|
|
|
|
385
|
+
|
|
|
|
386
|
+ for (IBodyElement element : doc.getBodyElements()) {
|
|
|
|
387
|
+ if (element instanceof XWPFParagraph) {
|
|
|
|
388
|
+ XWPFParagraph para = (XWPFParagraph) element;
|
|
|
|
389
|
+ String text = para.getText().trim();
|
|
|
|
390
|
+ if (text.isEmpty()) continue;
|
|
|
|
391
|
+
|
|
|
|
392
|
+ int headingLevel = getHeadingLevel(para);
|
|
|
|
393
|
+ if (headingLevel > 0) {
|
|
|
|
394
|
+ // 存在之前内容,保存上一个段
|
|
|
|
395
|
+ if (currentContent.length() > 0 && !titlePath.isEmpty()) {
|
|
|
|
396
|
+ String fullBlock = String.join("", titlePath) + "\n" + currentContent.toString().trim();
|
|
|
|
397
|
+ result.add(fullBlock);
|
|
|
|
398
|
+ currentContent.setLength(0);
|
|
|
|
399
|
+ }
|
|
420
|
|
400
|
|
|
421
|
- private void saveSegmentsToDatabase(List<String> segments, String originalFileName, String storedFileName, String knowledgeId) {
|
|
|
|
422
|
- if (segments.isEmpty()) {
|
|
|
|
423
|
- return;
|
401
|
+ // 更新标题路径
|
|
|
|
402
|
+ while (titlePath.size() >= headingLevel) {
|
|
|
|
403
|
+ titlePath.remove(titlePath.size() - 1);
|
|
|
|
404
|
+ }
|
|
|
|
405
|
+ titlePath.add(text);
|
|
|
|
406
|
+ } else {
|
|
|
|
407
|
+ currentContent.append(text).append("\n");
|
|
|
|
408
|
+ }
|
|
|
|
409
|
+ } else if (element instanceof XWPFTable) {
|
|
|
|
410
|
+ String tableText = extractTableContent((XWPFTable) element);
|
|
|
|
411
|
+ currentContent.append(tableText).append("\n");
|
|
|
|
412
|
+ }
|
|
|
|
413
|
+ }
|
|
|
|
414
|
+
|
|
|
|
415
|
+ // 最后一段
|
|
|
|
416
|
+ if (currentContent.length() > 0 && !titlePath.isEmpty()) {
|
|
|
|
417
|
+ String fullBlock = String.join("", titlePath) + "\n" + currentContent.toString().trim();
|
|
|
|
418
|
+ result.add(fullBlock);
|
|
|
|
419
|
+ }
|
|
|
|
420
|
+ }
|
|
|
|
421
|
+
|
|
|
|
422
|
+ return result;
|
|
|
|
423
|
+ }
|
|
|
|
424
|
+
|
|
|
|
425
|
+ // 获取标题等级
|
|
|
|
426
|
+ private int getHeadingLevel(XWPFParagraph para) {
|
|
|
|
427
|
+ String style = para.getStyle();
|
|
|
|
428
|
+ if (style != null && style.matches("Heading\\d|标题\\d|\\d")) {
|
|
|
|
429
|
+ return Integer.parseInt(style.replaceAll("[^\\d]", ""));
|
|
424
|
}
|
430
|
}
|
|
425
|
|
431
|
|
|
426
|
- // 从存储文件名中提取UUID部分作为docId
|
|
|
|
427
|
- String docId = storedFileName.substring(
|
|
|
|
428
|
- storedFileName.lastIndexOf('_') + 1,
|
|
|
|
429
|
- storedFileName.lastIndexOf('.')
|
|
|
|
430
|
- );
|
432
|
+ if (para.getRuns().size() > 0) {
|
|
|
|
433
|
+ XWPFRun run = para.getRuns().get(0);
|
|
|
|
434
|
+ if (run.isBold() || (run.getFontSize() > 12 && run.getFontSize() != -1)) {
|
|
|
|
435
|
+ return 2; // 可能是二级标题
|
|
|
|
436
|
+ }
|
|
|
|
437
|
+ }
|
|
431
|
|
438
|
|
|
432
|
- // 获取无UUID和扩展名的文件名用于显示
|
439
|
+ return 0;
|
|
|
|
440
|
+ }
|
|
|
|
441
|
+ private void saveSegmentsToDatabase(List<String> segments, String originalFileName, String storedFileName, String knowledgeId) {
|
|
|
|
442
|
+ if (segments.isEmpty()) return;
|
|
|
|
443
|
+
|
|
|
|
444
|
+ String docId = storedFileName.substring(storedFileName.lastIndexOf('_') + 1, storedFileName.lastIndexOf('.'));
|
|
433
|
String displayFileName = removeUuidSuffix(originalFileName);
|
445
|
String displayFileName = removeUuidSuffix(originalFileName);
|
|
434
|
displayFileName = FilenameUtils.removeExtension(displayFileName);
|
446
|
displayFileName = FilenameUtils.removeExtension(displayFileName);
|
|
435
|
|
447
|
|
|
436
|
- // 判断是否是单一段落
|
|
|
|
437
|
- boolean isSingleSegment = segments.size() == 1;
|
|
|
|
438
|
-
|
|
|
|
439
|
for (String segment : segments) {
|
448
|
for (String segment : segments) {
|
|
440
|
- if (segment.trim().isEmpty()) {
|
|
|
|
441
|
- continue;
|
|
|
|
442
|
- }
|
449
|
+ if (segment.trim().isEmpty()) continue;
|
|
443
|
|
450
|
|
|
444
|
- QuestionEmbedding record = new QuestionEmbedding();
|
|
|
|
445
|
- record.setId(UUID.randomUUID().toString());
|
451
|
+ String[] parts = segment.split("\\r?\\n", 2);
|
|
|
|
452
|
+ if (parts.length < 2) continue;
|
|
446
|
|
453
|
|
|
447
|
- if (isSingleSegment) {
|
|
|
|
448
|
- record.setQuestion(displayFileName);
|
|
|
|
449
|
- record.setAnswer(segment.trim());
|
|
|
|
450
|
- } else {
|
|
|
|
451
|
- String firstLine = segment.lines().findFirst().orElse("");
|
|
|
|
452
|
- record.setQuestion(displayFileName + ": " + cleanText(firstLine));
|
|
|
|
453
|
- record.setAnswer(segment.trim());
|
|
|
|
454
|
- }
|
454
|
+ String titleLine = parts[0].trim(); // 如 掌静脉设备: 产品特点
|
|
|
|
455
|
+ String content = parts[1].trim();
|
|
|
|
456
|
+
|
|
|
|
457
|
+ // 构造问题:文件名: 标题层级路径
|
|
|
|
458
|
+ String question = displayFileName + titleLine;
|
|
455
|
|
459
|
|
|
|
|
460
|
+ QuestionEmbedding record = new QuestionEmbedding();
|
|
|
|
461
|
+ record.setId(UUID.randomUUID().toString());
|
|
|
|
462
|
+ record.setQuestion(question);
|
|
|
|
463
|
+ record.setAnswer(titleLine + "\n" + content);
|
|
456
|
record.setText("");
|
464
|
record.setText("");
|
|
457
|
|
465
|
|
|
458
|
- // 构建metadata JSON对象
|
|
|
|
459
|
- Map<String, String> metadata = new LinkedHashMap<>(); // 使用LinkedHashMap保持字段顺序
|
466
|
+ Map<String, String> metadata = new LinkedHashMap<>();
|
|
460
|
metadata.put("docId", docId);
|
467
|
metadata.put("docId", docId);
|
|
461
|
- metadata.put("docName", originalFileName); // 上传前的原始文件名
|
|
|
|
462
|
- metadata.put("storedFileName", storedFileName); // 上传后的带UUID的文件名
|
468
|
+ metadata.put("docName", originalFileName);
|
|
|
|
469
|
+ metadata.put("storedFileName", storedFileName);
|
|
463
|
metadata.put("knowledgeId", knowledgeId);
|
470
|
metadata.put("knowledgeId", knowledgeId);
|
|
464
|
|
471
|
|
|
465
|
- // 使用ObjectMapper转换为JSON字符串
|
|
|
|
466
|
try {
|
472
|
try {
|
|
467
|
record.setMetadata(new ObjectMapper().writeValueAsString(metadata));
|
473
|
record.setMetadata(new ObjectMapper().writeValueAsString(metadata));
|
|
468
|
} catch (JsonProcessingException e) {
|
474
|
} catch (JsonProcessingException e) {
|
|
469
|
log.error("生成metadata JSON失败", e);
|
475
|
log.error("生成metadata JSON失败", e);
|
|
470
|
- // 使用默认值
|
|
|
|
471
|
- record.setMetadata(String.format(
|
|
|
|
472
|
- "{\"docId\":\"%s\",\"docName\":\"%s\",\"storedFileName\":\"%s\",\"knowledgeId\":\"%s\"}",
|
|
|
|
473
|
- docId, originalFileName, storedFileName, knowledgeId
|
|
|
|
474
|
- ));
|
|
|
|
475
|
}
|
476
|
}
|
|
476
|
|
477
|
|
|
477
|
- Response<Embedding> embeddingResponse = aiModelUtils.getEmbedding("1925730210204721154", record.getQuestion());
|
478
|
+ log.info("保存分段: title={}, content_length={}", question, segment.length());
|
|
|
|
479
|
+
|
|
|
|
480
|
+ Response<Embedding> embeddingResponse = aiModelUtils.getEmbedding(embedId, record.getQuestion());
|
|
478
|
record.setEmbedding(embeddingResponse.content().vector());
|
481
|
record.setEmbedding(embeddingResponse.content().vector());
|
|
479
|
record.setKnowledgeId(knowledgeId);
|
482
|
record.setKnowledgeId(knowledgeId);
|
|
480
|
questionEmbeddingMapper.insert(record);
|
483
|
questionEmbeddingMapper.insert(record);
|