|
...
|
...
|
@@ -266,26 +266,6 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
|
|
return text.replaceAll("\\s+", " ").trim();
|
|
|
|
}
|
|
|
|
|
|
|
|
private static boolean isHeading(Paragraph para, Range range) {
|
|
|
|
int styleIndex = para.getStyleIndex();
|
|
|
|
if (styleIndex >= 1 && styleIndex <= 9) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
try {
|
|
|
|
CharacterRun run = para.getCharacterRun(0);
|
|
|
|
if (run.isBold() || run.getFontSize() > 12) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
} catch (Exception e) {
|
|
|
|
log.warn("获取字符格式失败", e);
|
|
|
|
}
|
|
|
|
|
|
|
|
String text = para.text().trim();
|
|
|
|
return text.toUpperCase().equals(text) &&
|
|
|
|
text.length() < 100 &&
|
|
|
|
!text.contains("\t");
|
|
|
|
}
|
|
|
|
|
|
|
|
private String removeUuidSuffix(String fileName) {
|
|
|
|
// 移除UUID后缀部分
|
|
...
|
...
|
@@ -302,64 +282,6 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 后备分割方案:按段落结构分割
|
|
|
|
private List<String> splitByContentStructure(XWPFDocument doc) {
|
|
|
|
List<String> segments = new ArrayList<>();
|
|
|
|
StringBuilder currentSegment = new StringBuilder();
|
|
|
|
final int MAX_SEGMENT_LENGTH = 1000; // 最大分段长度
|
|
|
|
|
|
|
|
for (IBodyElement element : doc.getBodyElements()) {
|
|
|
|
String text = "";
|
|
|
|
if (element instanceof XWPFParagraph) {
|
|
|
|
text = ((XWPFParagraph) element).getText().trim();
|
|
|
|
} else if (element instanceof XWPFTable) {
|
|
|
|
text = extractTableContent((XWPFTable) element);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (text.isEmpty()) continue;
|
|
|
|
|
|
|
|
// 当遇到空行或达到最大长度时分段
|
|
|
|
if (currentSegment.length() + text.length() > MAX_SEGMENT_LENGTH
|
|
|
|
&& currentSegment.length() > 0) {
|
|
|
|
segments.add(currentSegment.toString().trim());
|
|
|
|
currentSegment = new StringBuilder();
|
|
|
|
}
|
|
|
|
|
|
|
|
currentSegment.append(text).append("\n\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
if (currentSegment.length() > 0) {
|
|
|
|
segments.add(currentSegment.toString().trim());
|
|
|
|
}
|
|
|
|
return segments;
|
|
|
|
}
|
|
|
|
|
|
|
|
// 按标题分割文本
|
|
|
|
private List<String> splitByHeadings(String content) {
|
|
|
|
List<String> segments = new ArrayList<>();
|
|
|
|
StringBuilder currentSegment = new StringBuilder();
|
|
|
|
String[] lines = content.split("\\r?\\n");
|
|
|
|
|
|
|
|
for (String line : lines) {
|
|
|
|
// 检测标题行(以1-6个#开头,后面跟着空格)
|
|
|
|
if (line.trim().matches("^#{1,6}\\s+.*")) {
|
|
|
|
// 保存当前分段
|
|
|
|
if (!currentSegment.isEmpty()) {
|
|
|
|
segments.add(currentSegment.toString().trim());
|
|
|
|
currentSegment = new StringBuilder();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
currentSegment.append(line).append("\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
// 添加最后一个分段
|
|
|
|
if (!currentSegment.isEmpty()) {
|
|
|
|
segments.add(currentSegment.toString().trim());
|
|
|
|
}
|
|
|
|
|
|
|
|
return segments;
|
|
|
|
}
|
|
|
|
|
|
|
|
private String extractTableContent(XWPFTable table) {
|
|
|
|
StringBuilder tableContent = new StringBuilder();
|
|
|
|
table.getRows().forEach(row -> {
|
|
...
|
...
|
@@ -381,7 +303,6 @@ public class QuestionEmbeddingServiceImpl implements IQuestionEmbeddingService { |
|
|
|
try (XWPFDocument doc = new XWPFDocument(new FileInputStream(filePath))) {
|
|
|
|
StringBuilder currentContent = new StringBuilder();
|
|
|
|
List<String> titlePath = new ArrayList<>();
|
|
|
|
String lastOutput = null;
|
|
|
|
|
|
|
|
for (IBodyElement element : doc.getBodyElements()) {
|
|
|
|
if (element instanceof XWPFParagraph) {
|
...
|
...
|
|