|
@@ -266,26 +266,6 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
@@ -266,26 +266,6 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
266
|
return text.replaceAll("\\s+", " ").trim();
|
266
|
return text.replaceAll("\\s+", " ").trim();
|
|
267
|
}
|
267
|
}
|
|
268
|
|
268
|
|
|
269
|
- private static boolean isHeading(Paragraph para, Range range) {
|
|
|
|
270
|
- int styleIndex = para.getStyleIndex();
|
|
|
|
271
|
- if (styleIndex >= 1 && styleIndex <= 9) {
|
|
|
|
272
|
- return true;
|
|
|
|
273
|
- }
|
|
|
|
274
|
-
|
|
|
|
275
|
- try {
|
|
|
|
276
|
- CharacterRun run = para.getCharacterRun(0);
|
|
|
|
277
|
- if (run.isBold() || run.getFontSize() > 12) {
|
|
|
|
278
|
- return true;
|
|
|
|
279
|
- }
|
|
|
|
280
|
- } catch (Exception e) {
|
|
|
|
281
|
- log.warn("获取字符格式失败", e);
|
|
|
|
282
|
- }
|
|
|
|
283
|
-
|
|
|
|
284
|
- String text = para.text().trim();
|
|
|
|
285
|
- return text.toUpperCase().equals(text) &&
|
|
|
|
286
|
- text.length() < 100 &&
|
|
|
|
287
|
- !text.contains("\t");
|
|
|
|
288
|
- }
|
|
|
|
289
|
|
269
|
|
|
290
|
private String removeUuidSuffix(String fileName) {
|
270
|
private String removeUuidSuffix(String fileName) {
|
|
291
|
// 移除UUID后缀部分
|
271
|
// 移除UUID后缀部分
|
|
@@ -302,64 +282,6 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
@@ -302,64 +282,6 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
302
|
}
|
282
|
}
|
|
303
|
|
283
|
|
|
304
|
|
284
|
|
|
305
|
- // 后备分割方案:按段落结构分割
|
|
|
|
306
|
- private List<String> splitByContentStructure(XWPFDocument doc) {
|
|
|
|
307
|
- List<String> segments = new ArrayList<>();
|
|
|
|
308
|
- StringBuilder currentSegment = new StringBuilder();
|
|
|
|
309
|
- final int MAX_SEGMENT_LENGTH = 1000; // 最大分段长度
|
|
|
|
310
|
-
|
|
|
|
311
|
- for (IBodyElement element : doc.getBodyElements()) {
|
|
|
|
312
|
- String text = "";
|
|
|
|
313
|
- if (element instanceof XWPFParagraph) {
|
|
|
|
314
|
- text = ((XWPFParagraph) element).getText().trim();
|
|
|
|
315
|
- } else if (element instanceof XWPFTable) {
|
|
|
|
316
|
- text = extractTableContent((XWPFTable) element);
|
|
|
|
317
|
- }
|
|
|
|
318
|
-
|
|
|
|
319
|
- if (text.isEmpty()) continue;
|
|
|
|
320
|
-
|
|
|
|
321
|
- // 当遇到空行或达到最大长度时分段
|
|
|
|
322
|
- if (currentSegment.length() + text.length() > MAX_SEGMENT_LENGTH
|
|
|
|
323
|
- && currentSegment.length() > 0) {
|
|
|
|
324
|
- segments.add(currentSegment.toString().trim());
|
|
|
|
325
|
- currentSegment = new StringBuilder();
|
|
|
|
326
|
- }
|
|
|
|
327
|
-
|
|
|
|
328
|
- currentSegment.append(text).append("\n\n");
|
|
|
|
329
|
- }
|
|
|
|
330
|
-
|
|
|
|
331
|
- if (currentSegment.length() > 0) {
|
|
|
|
332
|
- segments.add(currentSegment.toString().trim());
|
|
|
|
333
|
- }
|
|
|
|
334
|
- return segments;
|
|
|
|
335
|
- }
|
|
|
|
336
|
-
|
|
|
|
337
|
- // 按标题分割文本
|
|
|
|
338
|
- private List<String> splitByHeadings(String content) {
|
|
|
|
339
|
- List<String> segments = new ArrayList<>();
|
|
|
|
340
|
- StringBuilder currentSegment = new StringBuilder();
|
|
|
|
341
|
- String[] lines = content.split("\\r?\\n");
|
|
|
|
342
|
-
|
|
|
|
343
|
- for (String line : lines) {
|
|
|
|
344
|
- // 检测标题行(以1-6个#开头,后面跟着空格)
|
|
|
|
345
|
- if (line.trim().matches("^#{1,6}\\s+.*")) {
|
|
|
|
346
|
- // 保存当前分段
|
|
|
|
347
|
- if (!currentSegment.isEmpty()) {
|
|
|
|
348
|
- segments.add(currentSegment.toString().trim());
|
|
|
|
349
|
- currentSegment = new StringBuilder();
|
|
|
|
350
|
- }
|
|
|
|
351
|
- }
|
|
|
|
352
|
- currentSegment.append(line).append("\n");
|
|
|
|
353
|
- }
|
|
|
|
354
|
-
|
|
|
|
355
|
- // 添加最后一个分段
|
|
|
|
356
|
- if (!currentSegment.isEmpty()) {
|
|
|
|
357
|
- segments.add(currentSegment.toString().trim());
|
|
|
|
358
|
- }
|
|
|
|
359
|
-
|
|
|
|
360
|
- return segments;
|
|
|
|
361
|
- }
|
|
|
|
362
|
-
|
|
|
|
363
|
private String extractTableContent(XWPFTable table) {
|
285
|
private String extractTableContent(XWPFTable table) {
|
|
364
|
StringBuilder tableContent = new StringBuilder();
|
286
|
StringBuilder tableContent = new StringBuilder();
|
|
365
|
table.getRows().forEach(row -> {
|
287
|
table.getRows().forEach(row -> {
|
|
@@ -381,7 +303,6 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
@@ -381,7 +303,6 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
381
|
try (XWPFDocument doc = new XWPFDocument(new FileInputStream(filePath))) {
|
303
|
try (XWPFDocument doc = new XWPFDocument(new FileInputStream(filePath))) {
|
|
382
|
StringBuilder currentContent = new StringBuilder();
|
304
|
StringBuilder currentContent = new StringBuilder();
|
|
383
|
List<String> titlePath = new ArrayList<>();
|
305
|
List<String> titlePath = new ArrayList<>();
|
|
384
|
- String lastOutput = null;
|
|
|
|
385
|
|
306
|
|
|
386
|
for (IBodyElement element : doc.getBodyElements()) {
|
307
|
for (IBodyElement element : doc.getBodyElements()) {
|
|
387
|
if (element instanceof XWPFParagraph) {
|
308
|
if (element instanceof XWPFParagraph) {
|