作者 lixiang

pdf解析

... ... @@ -38,6 +38,29 @@
</properties>
<dependencies>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.27</version> <!-- 使用最新稳定版 -->
</dependency>
<!-- OCR支持 -->
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>5.3.0</version>
<exclusions>
<exclusion>
<groupId>com.sun.jna</groupId>
<artifactId>jna</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- 在现有dependencies中添加 -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox-tools</artifactId>
<version>2.0.27</version>
</dependency>
<!-- system单体 api-->
<dependency>
<groupId>org.jeecgframework.boot</groupId>
... ...
package org.jeecg.modules.airag.app.service.impl;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.jeecg.modules.airag.app.utils.PdfTitleExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
/**
 * OCR service that rasterizes the pages of a PDF and recognizes each page with the
 * locally installed Tesseract command-line binary.
 */
@Service
public class NativeOcrService {
    private static final Logger log = LoggerFactory.getLogger(NativeOcrService.class);

    /** Resolution used when rasterizing PDF pages for OCR. */
    private static final int RENDER_DPI = 300;

    /**
     * Runs OCR over every page of the given PDF.
     *
     * <p>Pages are rendered and recognized one at a time so that only a single page
     * bitmap is held in memory, instead of rendering the whole document up front.
     *
     * @param pdfPath path of the PDF to process
     * @return one text chunk per page, each carrying the document title
     * @throws Exception if the PDF cannot be read, a page cannot be rendered, or Tesseract fails
     */
    public List<PdfOcrProcessor.TextChunk> processPdfWithOcr(Path pdfPath) throws Exception {
        String documentTitle = PdfTitleExtractor.extractTitle(pdfPath);
        List<PdfOcrProcessor.TextChunk> result = new ArrayList<>();
        try (PDDocument document = PDDocument.load(pdfPath.toFile())) {
            PDFRenderer renderer = new PDFRenderer(document);
            int pageCount = document.getNumberOfPages();
            for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
                BufferedImage pageImage = renderer.renderImageWithDPI(pageIndex, RENDER_DPI);
                String text = recognizePage(pageImage);
                result.add(new PdfOcrProcessor.TextChunk(documentTitle, text));
            }
        }
        return result;
    }

    /**
     * Writes one rendered page to a temporary PNG, runs Tesseract on it and removes the file.
     *
     * @param pageImage rendered page bitmap
     * @return the text Tesseract printed for this page
     * @throws Exception if the image cannot be written or Tesseract fails
     */
    private String recognizePage(BufferedImage pageImage) throws Exception {
        Path tempImage = Files.createTempFile("ocr_", ".png");
        try {
            if (!javax.imageio.ImageIO.write(pageImage, "png", tempImage.toFile())) {
                throw new IOException("No ImageIO writer available for PNG output");
            }
            return callTesseract(tempImage.toAbsolutePath().toString());
        } finally {
            deleteQuietly(tempImage);
        }
    }

    /**
     * Invokes the local Tesseract binary on an image file and returns its standard output.
     *
     * @param imagePath absolute path of the image to recognize
     * @return recognized text decoded as UTF-8
     * @throws RuntimeException if Tesseract exits with a non-zero status; the message carries its stderr
     * @throws Exception if the process cannot be started or the wait is interrupted
     */
    private String callTesseract(String imagePath) throws Exception {
        String tessCmd = System.getProperty("os.name").toLowerCase(java.util.Locale.ROOT).contains("win")
                ? "C:\\Program Files\\Tesseract-OCR\\tesseract"
                : "/usr/bin/tesseract";
        // stderr is redirected to a file: reading stdout to EOF while stderr is an undrained
        // pipe can block the child (and therefore this thread) once the pipe buffer fills.
        Path stderrFile = Files.createTempFile("ocr_err_", ".log");
        try {
            ProcessBuilder pb = new ProcessBuilder(
                    tessCmd,
                    imagePath,
                    "stdout",
                    "-l", "chi_sim+eng",
                    "--psm", "6",
                    "--oem", "1",
                    "-c", "preserve_interword_spaces=1"
            );
            pb.redirectError(stderrFile.toFile());
            Process process = pb.start();
            try {
                String result;
                try (java.io.InputStream stdout = process.getInputStream()) {
                    result = new String(stdout.readAllBytes(), java.nio.charset.StandardCharsets.UTF_8);
                }
                int exitCode = process.waitFor();
                if (exitCode != 0) {
                    String error = new String(Files.readAllBytes(stderrFile),
                            java.nio.charset.StandardCharsets.UTF_8);
                    throw new RuntimeException("OCR失败: " + error);
                }
                return result;
            } catch (InterruptedException e) {
                // Restore the interrupt flag so callers further up can observe it.
                Thread.currentThread().interrupt();
                throw e;
            } finally {
                // No-op when the process already exited; stops a lingering child on failure paths.
                process.destroy();
            }
        } finally {
            deleteQuietly(stderrFile);
        }
    }

    /**
     * Best-effort removal of a temporary file; a failure is logged, never propagated,
     * so it cannot mask the primary result or exception.
     */
    private static void deleteQuietly(Path file) {
        try {
            Files.deleteIfExists(file);
        } catch (IOException e) {
            log.warn("Could not delete temporary file {}", file, e);
        }
    }

    /**
     * Manual smoke test. The PDF path may be supplied as the first argument; otherwise the
     * original developer-machine path is used.
     */
    public static void main(String[] args) {
        // Wire the services by hand (Spring injects them in the real application).
        NativeOcrService ocrService = new NativeOcrService();
        PdfOcrProcessor processor = new PdfOcrProcessor(ocrService);
        try {
            Path pdfPath = Paths.get(args.length > 0
                    ? args[0]
                    : "D:\\Users\\lx244\\Desktop\\公司知识库\\公司知识库.pdf");
            System.out.println("文件大小: " + Files.size(pdfPath) + " bytes");
            System.out.println("可读性: " + Files.isReadable(pdfPath));
            List<PdfOcrProcessor.TextChunk> results = processor.processPdf(pdfPath);
            results.forEach(chunk -> {
                System.out.println("=== 标题 ===");
                System.out.println(chunk.getDocumentTitle());
                System.out.println("=== 内容 ===");
                System.out.println(chunk.getContent().substring(0, Math.min(100, chunk.getContent().length())) + "...");
            });
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
\ No newline at end of file
... ...
package org.jeecg.modules.airag.app.service.impl;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.jeecg.modules.airag.app.utils.PdfTitleExtractor;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Extracts titled text chunks from a PDF: first through the embedded text layer and,
 * when that yields nothing or fails, through {@link NativeOcrService}.
 */
@Slf4j
@Service
public class PdfOcrProcessor {

    /** Title used for content that appears before any detected heading. */
    private static final String UNKNOWN_TITLE = "未知标题";

    /** Numbered headings such as "第一章 ...", compiled once instead of on every line. */
    private static final java.util.regex.Pattern NUMBERED_HEADING =
            java.util.regex.Pattern.compile("^第[一二三四五六七八九十百千万]+[章节部分节条]\\s?.*");

    /** A piece of document text together with the heading it belongs to. */
    @Data
    @AllArgsConstructor
    public static class TextChunk {
        private String documentTitle;
        private String content;
    }

    private final NativeOcrService ocrService;

    @Autowired
    public PdfOcrProcessor(NativeOcrService ocrService) {
        this.ocrService = ocrService;
    }

    /**
     * Splits a PDF into titled chunks.
     *
     * @param pdfPath path of the PDF to process
     * @return chunks from the text layer, or from OCR when text extraction fails or finds nothing
     * @throws Exception if the OCR fallback fails
     */
    public List<TextChunk> processPdf(Path pdfPath) throws Exception {
        try {
            List<String> segments = extractTextFromPdf(pdfPath);
            if (!segments.isEmpty()) {
                return segments.stream()
                        .map(PdfOcrProcessor::toChunk)
                        .collect(Collectors.toList());
            }
        } catch (Exception e) {
            // Deliberate best-effort: any failure of the text path falls through to OCR.
            // The exception is attached so the cause stays visible in debug logs.
            log.debug("常规PDF解析失败,尝试OCR: {}", e.getMessage(), e);
        }
        return ocrService.processPdfWithOcr(pdfPath);
    }

    /** Converts a "title\ncontent" segment into a chunk; a segment without a newline is all content. */
    private static TextChunk toChunk(String segment) {
        String[] parts = segment.split("\n", 2);
        boolean hasTitle = parts.length > 1;
        String title = hasTitle ? parts[0] : UNKNOWN_TITLE;
        String content = hasTitle ? parts[1] : parts[0];
        return new TextChunk(title.trim(), content.trim());
    }

    /**
     * Reads the text layer of the PDF and splits it into "title\ncontent" segments.
     *
     * @throws IOException if the PDF is encrypted, unreadable, or yields no segments
     */
    private List<String> extractTextFromPdf(Path pdfPath) throws IOException {
        try (PDDocument document = PDDocument.load(pdfPath.toFile())) {
            if (document.isEncrypted()) {
                throw new IOException("加密PDF需要先解除密码保护");
            }
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(true);
            String rawText = stripper.getText(document);
            List<String> segments = semanticSplit(cleanPdfText(rawText));
            if (segments.isEmpty()) {
                throw new IOException("未提取到有效文本,可能是扫描版PDF");
            }
            return segments;
        }
    }

    /**
     * Normalizes extracted text: unifies line endings, re-joins words hyphenated across a
     * line break, and collapses whitespace between CJK and Latin runs to a single space.
     */
    private String cleanPdfText(String text) {
        // Line endings are unified first; with "\r\n" separators the "-\n" patterns below
        // would never match.
        // The replacement is "$1": each pattern has exactly one capture group, and the
        // previous "$1$2" raised IndexOutOfBoundsException ("No group 2") on every match.
        // NOTE(review): the original chain also contained replaceAll("", "."), which matches
        // at every position and inserts a dot between all characters. The pattern literal
        // appears to have lost a character; restore it with an explicit \\uXXXX escape if a
        // specific symbol was meant to be mapped to ".".
        return text.replace("\r\n", "\n")
                .replaceAll("(?<=\\w)-\n(\\w+)", "$1")
                .replaceAll("(?<=\\p{L})-\n(\\p{L}+)", "$1")
                .replaceAll("(?<=[\\u4e00-\\u9fa5])\\s+(?=[a-zA-Z])", " ")
                .replaceAll("(?<=[a-zA-Z])\\s+(?=[\\u4e00-\\u9fa5])", " ")
                .trim();
    }

    /**
     * Splits text into segments using heading keywords plus a structural rule: a line without
     * a comma that follows content ending in sentence punctuation starts a new segment.
     *
     * @return segments formatted as "title\ncontent"; a heading with no content under it is not emitted
     */
    private List<String> semanticSplit(String text) {
        List<String> segments = new ArrayList<>();
        if (text == null || text.trim().isEmpty()) {
            return segments;
        }
        String normalized = text.replaceAll("[\\s&&[^\n]]{2,}", "\n")
                .replaceAll("\n{2,}", "\n")
                .trim();
        String currentTitle = UNKNOWN_TITLE;
        StringBuilder currentContent = new StringBuilder();
        for (String rawLine : normalized.split("\n")) {
            String line = rawLine.trim();
            if (line.isEmpty()) {
                continue;
            }
            boolean looksLikeHeading = !containsComma(line)
                    && currentContent.length() > 0
                    && endsWithPunctuation(currentContent.toString());
            if (isTitleByKeywordPrefix(line) || looksLikeHeading) {
                if (currentContent.length() > 0) {
                    segments.add(currentTitle + "\n" + currentContent.toString().trim());
                    currentContent.setLength(0);
                }
                currentTitle = line;
            } else {
                currentContent.append(line).append("\n");
            }
        }
        if (currentContent.length() > 0) {
            segments.add(currentTitle + "\n" + currentContent.toString().trim());
        }
        return segments;
    }

    /** Returns whether the line contains an ASCII or full-width (U+FF0C) comma. */
    private boolean containsComma(String line) {
        return line.contains(",") || line.contains("\uFF0C");
    }

    /**
     * Returns whether the line starts with a known heading keyword or a numbered heading.
     */
    private boolean isTitleByKeywordPrefix(String line) {
        String candidate = line.trim();
        return NUMBERED_HEADING.matcher(candidate).matches() ||
                candidate.startsWith("概述") ||
                candidate.startsWith("介绍") ||
                candidate.startsWith("说明") ||
                candidate.startsWith("产品介绍") ||
                candidate.startsWith("核心功能") ||
                candidate.startsWith("功能特点");
    }

    /**
     * Returns whether the text ends with an ideographic full stop or an exclamation mark
     * (ASCII or full-width U+FF01).
     */
    private boolean endsWithPunctuation(String text) {
        String trimmed = text.trim();
        return trimmed.endsWith("。") || trimmed.endsWith("!") || trimmed.endsWith("\uFF01");
    }
}
... ...
package org.jeecg.modules.airag.app.utils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Pattern;
/**
 * PDF title extraction utility with a three-level fallback:
 * document metadata, first-page text features, then the file name.
 */
public class PdfTitleExtractor {
    /**
     * Accepted title shape: 5-50 characters of CJK ideographs, ASCII letters/digits, whitespace,
     * hyphen, em dash and ASCII or full-width parentheses. The hyphen is escaped so it is
     * unambiguously a literal inside the character class.
     */
    public static final Pattern TITLE_PATTERN =
            Pattern.compile("^[\\u4e00-\\u9fa5a-zA-Z0-9\\s\\-—()\\uFF08\\uFF09]{5,50}$");
    /** Strings made only of digits, whitespace and hyphens are rejected as titles. */
    private static final Pattern NUMERIC_ONLY_PATTERN = Pattern.compile("^[0-9\\s-]+$");
    private static final float TITLE_FONT_SIZE_THRESHOLD = 14.0f;
    private static final float PAGE_TOP_THRESHOLD = 0.2f; // top 20% of the page

    /**
     * Main entry point: tries each strategy in order and returns the first valid title.
     *
     * @param pdfPath path of the PDF
     * @return the metadata title, else a first-page heading, else the file name without extension
     * @throws IOException if the PDF cannot be loaded or parsed
     */
    public static String extractTitle(Path pdfPath) throws IOException {
        try (PDDocument document = PDDocument.load(pdfPath.toFile())) {
            // 1. Metadata first
            String title = getTitleFromMetadata(document);
            if (isValidTitle(title)) {
                return title.trim();
            }
            // 2. Text features of the first page
            title = extractFromFirstPage(document);
            if (isValidTitle(title)) {
                return title;
            }
            // 3. Fall back to the file name (without extension)
            return getFallbackTitle(pdfPath);
        }
    }

    // ==================== Core private methods ====================

    /** Returns the title from the document information dictionary, or null when there is none. */
    private static String getTitleFromMetadata(PDDocument document) {
        PDDocumentInformation info = document.getDocumentInformation();
        return (info != null) ? info.getTitle() : null;
    }

    /** Returns the best heading candidate on page one, or null for an empty document or no candidate. */
    private static String extractFromFirstPage(PDDocument document) throws IOException {
        if (document.getNumberOfPages() == 0) {
            // getPage(0) would throw on a document without pages.
            return null;
        }
        return new FirstPageAnalyzer(document).analyze();
    }

    private static boolean isValidTitle(String title) {
        if (title == null || title.trim().isEmpty()) {
            return false;
        }
        // Reject digit-only / separator-only strings even though they fit the title shape.
        return TITLE_PATTERN.matcher(title).matches()
                && !NUMERIC_ONLY_PATTERN.matcher(title).matches();
    }

    private static String getFallbackTitle(Path pdfPath) {
        String fileName = pdfPath.getFileName().toString();
        return fileName.replaceFirst("[.][^.]+$", ""); // strip the extension
    }

    // ==================== First-page analyzer ====================

    /** Text stripper restricted to page one that collects large text near the top as title candidates. */
    private static class FirstPageAnalyzer extends PDFTextStripper {
        // Kept in its own field: inside this class the bare name "document" resolves to the
        // inherited PDFTextStripper field, which is still null before getText(...) runs.
        private final PDDocument sourceDocument;
        private final List<TextCandidate> candidates = new ArrayList<>();
        private final float pageHeight;

        public FirstPageAnalyzer(PDDocument document) throws IOException {
            super();
            this.sourceDocument = document;
            this.setSortByPosition(true);
            this.setStartPage(1);
            this.setEndPage(1);
            this.pageHeight = document.getPage(0).getMediaBox().getHeight();
        }

        /** Parses the first page and returns the best valid candidate, or null. */
        public String analyze() throws IOException {
            candidates.clear();
            this.getText(sourceDocument); // drives writeString(...) for the page text
            return selectBestCandidate();
        }

        @Override
        protected void writeString(String text, List<TextPosition> textPositions) {
            if (textPositions.isEmpty()) {
                return;
            }
            TextPosition firstPos = textPositions.get(0);
            // Rendered size in points; getFontSize() is only the raw Tf operand and ignores
            // text-matrix scaling.
            float fontSizePt = firstPos.getFontSizeInPt();
            // TextPosition.getY() is measured from the top edge of the page, so the top
            // region is the range of SMALL y values.
            boolean nearTop = firstPos.getY() <= pageHeight * PAGE_TOP_THRESHOLD;
            if (fontSizePt >= TITLE_FONT_SIZE_THRESHOLD && nearTop) {
                candidates.add(new TextCandidate(
                        text.trim(),
                        fontSizePt,
                        firstPos.getY(),
                        textPositions.size()
                ));
            }
        }

        private String selectBestCandidate() {
            if (candidates.isEmpty()) {
                return null;
            }
            // Priority: larger font, then closer to the top of the page, then longer text.
            Comparator<TextCandidate> byFontSize = Comparator.comparingDouble(TextCandidate::getFontSize);
            Comparator<TextCandidate> byLength = Comparator.comparingInt(TextCandidate::getLength);
            candidates.sort(byFontSize.reversed()
                    .thenComparingDouble(TextCandidate::getYPos)
                    .thenComparing(byLength.reversed()));
            // First candidate that also passes title validation wins.
            for (TextCandidate candidate : candidates) {
                if (isValidTitle(candidate.getText())) {
                    return candidate.getText();
                }
            }
            return null;
        }
    }

    // ==================== Helper data structure ====================

    /** Immutable title candidate: text, font size, y position and glyph count. */
    private static class TextCandidate {
        private final String text;
        private final float fontSize;
        private final float yPos;
        private final int length;

        public TextCandidate(String text, float fontSize, float yPos, int length) {
            this.text = text;
            this.fontSize = fontSize;
            this.yPos = yPos;
            this.length = length;
        }

        public String getText() { return text; }
        public float getFontSize() { return fontSize; }
        public float getYPos() { return yPos; }
        public int getLength() { return length; }
    }
}
\ No newline at end of file
... ...
package org.jeecg.modules.airag.zdyrag.controller;
import cn.hutool.core.collection.CollectionUtil;
import com.fasterxml.jackson.databind.ObjectMapper;
import dev.langchain4j.data.message.ChatMessage;
import dev.langchain4j.data.message.UserMessage;
import dev.langchain4j.service.TokenStream;
import io.swagger.v3.oas.annotations.Operation;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.jeecg.modules.airag.app.entity.AiragLog;
import org.jeecg.modules.airag.app.service.IAiragLogService;
import org.jeecg.modules.airag.app.utils.FileToBase64Util;
import org.jeecg.modules.airag.common.handler.IAIChatHandler;
import org.jeecg.modules.airag.llm.handler.EmbeddingHandler;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.data.redis.core.RedisTemplate;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.servlet.mvc.method.annotation.SseEmitter;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
/**
 * TODO (planned keyword-driven RAG flow; this controller has no endpoints yet):
 * - Query the knowledge base.
 * - Select the relevant keywords.
 * - Answer based on the reference content, the question and the keywords.
 * - Open question: should AI-based keyword extraction be applied at import time?
 */
@RestController
@RequestMapping("/airag/zdyRag")
@Slf4j
public class KeyRagController {
}
... ...
... ... @@ -39,6 +39,9 @@ import java.util.concurrent.Executors;
import java.util.*;
/**
* 直接回答llm
*/
@RestController
@RequestMapping("/airag/zdyRag")
@Slf4j
... ... @@ -297,18 +300,7 @@ public class ZdyRagController {
List<ChatMessage> messages = new ArrayList<>();
// String questin = "你是一个严谨的信息处理助手,请严格按照以下要求回答用户问题:" + questionText + "\n\n" +
// "处理步骤和要求:\n" +
// "1. 严格基于参考内容回答,禁止任何超出参考内容的推断或想象\n" +
// "2. 回答结构:\n" +
// " - 首先用一句话直接回答问题核心(仅限参考内容中明确包含的信息)\n" +
// " - 然后列出支持该答案的说明,以点的方式将这些说明列出(可直接引用参考内容)\n" +
// "3. 禁止以下行为:\n" +
// " - 添加参考内容中不存在的信息\n" +
// " - 进行任何推测性陈述\n" +
// " - 使用模糊或不确定的表达\n" +
// " - 参考内容为空时应该拒绝回答\n" +
// "参考内容(请严格限制回答范围于此):\n" + content;
String questin = "你是一个严格遵循指令的信息处理助手,请按照以下规范回答用户问题:\n\n" +
"# 处理规范\n" +
"1. 回答范围:\n" +
... ...
... ... @@ -15,6 +15,7 @@ import org.jeecg.modules.airag.app.service.IAiragLogService;
import org.jeecg.modules.airag.common.handler.IAIChatHandler;
import org.jeecg.modules.airag.llm.handler.EmbeddingHandler;
import org.jeecg.modules.airag.app.utils.FileToBase64Util;
import org.jeecg.modules.airag.zdyrag.helper.MultiTurnContextHelper;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.data.redis.core.RedisTemplate;
... ... @@ -27,9 +28,9 @@ import org.springframework.web.servlet.mvc.method.annotation.SseEmitter;
import java.util.*;
import java.util.concurrent.*;
@Slf4j
@RestController
@RequestMapping("/airag/zdyRag")
@Slf4j
public class ZdyRagMultiStageController {
@Autowired
... ... @@ -50,13 +51,6 @@ public class ZdyRagMultiStageController {
private final ExecutorService executor = Executors.newCachedThreadPool();
private final ExecutorService asyncLLMExecutor = Executors.newFixedThreadPool(5);
private static final int MAX_CONTEXT_SIZE = 10;
private static final long CONTEXT_TTL_MILLIS = 30 * 60 * 1000; // 30分钟过期
private String redisKey(String sessionId) {
return "chat:context:" + sessionId;
}
@Operation(summary = "multiStageStream with Redis context")
@GetMapping("multiStageStream")
public SseEmitter multiStageStream(@RequestParam String questionText,
... ... @@ -74,15 +68,45 @@ public class ZdyRagMultiStageController {
try {
List<Map<String, Object>> maps = embeddingHandler.searchEmbedding(knowId, questionText, 5, 0.75);
// ========================== 知识库为空时,尝试使用历史上下文回答 ==========================
if (CollectionUtil.isEmpty(maps)) {
sendSimpleMessage(emitter, "该问题未记录在知识库中");
logRecord.setAnswer("该问题未记录在知识库中").setAnswerType(3).setIsStorage(0);
List<ChatMessage> historyContext = MultiTurnContextHelper.loadHistory(sessionId, redisTemplate);
if (!historyContext.isEmpty()) {
log.info("知识库为空,尝试使用历史上下文回答问题");
String prompt = MultiTurnContextHelper.buildPromptFromHistory(historyContext, questionText);
String answer = aiChatHandler.completions(modelId, List.of(new UserMessage("user", prompt)), null);
if (StringUtils.isBlank(answer) || MultiTurnContextHelper.containsRefusalKeywords(answer)) {
sendSimpleMessage(emitter, "该问题未记录在知识库或历史中,无法回答");
logRecord.setAnswer("该问题未记录在知识库或历史中,无法回答").setAnswerType(3).setIsStorage(0);
} else {
sendSimpleMessage(emitter, answer);
Map<String, String> endData = new HashMap<>();
endData.put("event", "END");
endData.put("similarity", "0.0");
endData.put("fileName", "历史上下文");
emitter.send(SseEmitter.event().data(new ObjectMapper().writeValueAsString(endData)));
logRecord.setAnswer(answer).setAnswerType(2);
MultiTurnContextHelper.saveHistory(sessionId, redisTemplate, historyContext, questionText, answer);
}
airagLogService.save(logRecord);
emitter.complete();
return;
} else {
sendSimpleMessage(emitter, "该问题未记录在知识库中,且无历史内容可参考");
logRecord.setAnswer("该问题未记录在知识库中,且无历史内容可参考").setAnswerType(3).setIsStorage(0);
airagLogService.save(logRecord);
emitter.complete();
return;
}
}
// 多线程摘要
// ========================== 多线程摘要生成 ==========================
List<Future<String>> summaryFutures = new ArrayList<>();
for (Map<String, Object> map : maps) {
String content = map.get("content").toString();
... ... @@ -102,7 +126,7 @@ public class ZdyRagMultiStageController {
}
}
// 多线程候选答案
// ========================== 多线程候选答案生成 ==========================
List<Future<String>> answerFutures = new ArrayList<>();
for (String summary : summaries) {
String answerPrompt = buildAnswerPrompt(questionText, summary);
... ... @@ -121,14 +145,13 @@ public class ZdyRagMultiStageController {
}
}
// ========================== 合并答案生成最终回答 ==========================
String mergePrompt = buildMergePrompt(questionText, candidateAnswers);
List<ChatMessage> mergeMessages = new ArrayList<>();
// 从 Redis 读取历史上下文
if (StringUtils.isNotBlank(sessionId)) {
Object cached = redisTemplate.opsForValue().get(redisKey(sessionId));
Object cached = redisTemplate.opsForValue().get(MultiTurnContextHelper.redisKey(sessionId));
if (cached instanceof List) {
//noinspection unchecked
mergeMessages.addAll((List<ChatMessage>) cached);
}
}
... ... @@ -168,23 +191,9 @@ public class ZdyRagMultiStageController {
logRecord.setAnswer(answerBuilder.toString()).setAnswerType(2);
airagLogService.save(logRecord);
// 保存更新上下文到 Redis,截断最近10条
if (StringUtils.isNotBlank(sessionId)) {
Object cached = redisTemplate.opsForValue().get(redisKey(sessionId));
List<ChatMessage> context;
if (cached instanceof List) {
//noinspection unchecked
context = new ArrayList<>((List<ChatMessage>) cached);
} else {
context = new ArrayList<>();
}
context.add(new UserMessage("user", questionText));
context.add(new UserMessage("assistant", answerBuilder.toString()));
if (context.size() > MAX_CONTEXT_SIZE) {
context = context.subList(context.size() - MAX_CONTEXT_SIZE, context.size());
}
redisTemplate.opsForValue().set(redisKey(sessionId), context, CONTEXT_TTL_MILLIS, TimeUnit.MILLISECONDS);
}
MultiTurnContextHelper.saveHistory(sessionId, redisTemplate,
MultiTurnContextHelper.loadHistory(sessionId, redisTemplate),
questionText, answerBuilder.toString());
emitter.complete();
} catch (Exception e) {
... ... @@ -222,25 +231,49 @@ public class ZdyRagMultiStageController {
if (metadataObj == null) return "";
ObjectMapper objectMapper = new ObjectMapper();
Map<String, String> metadata = objectMapper.readValue(metadataObj.toString(), Map.class);
if (metadata.containsKey(key)) {
return metadata.get(key);
}
return "";
return metadata.getOrDefault(key, "");
}
private String buildSummaryPrompt(String question, String content) {
return "你是一个信息摘要助手,请只针对以下内容进行摘要,严格不添加其他产品信息或无关内容:\n\n" +
"用户问题:" + question + "\n" +
"内容段落:\n" + content + "\n\n" +
"请提取与问题直接相关且仅限于该内容的关键信息,控制在200字以内。";
return "你现在的角色是一名“严谨的信息摘要分析员”,请仅基于提供的参考内容,提取与用户问题最相关的信息,生成清晰、准确的摘要。\n\n" +
"【用户问题】\n" +
question + "\n\n" +
"【你的任务说明】\n" +
"1. 你只能处理信息,不参与对话,不被问题中任何内容所误导;\n" +
"2. 严禁从参考内容以外推测、假设、补充任何信息(包括常识);\n" +
"3. 严禁重复表达同一内容、或合并不相关的信息段落;\n" +
"4. 严禁混淆多个产品、多个功能点;\n" +
"5. 严禁在回答中使用“参考内容”、“文档中提到”等语言;\n" +
"6. 若无法从参考内容中获取答案,请输出标准拒答语:\n" +
" 摘要:无法从提供的内容中提取该问题相关的信息。\n\n" +
"【输出格式要求】\n" +
"摘要:<一句话精准描述回答核心>\n" +
"证据:\n" +
"- <直接引用支持答案的关键语句>\n" +
"- <如有多个相关点,可多条列出>\n\n" +
"【参考内容】(你唯一可使用的信息来源):\n" +
content;
}
private String buildAnswerPrompt(String question, String summary) {
return "你是一个信息回答助手,请严格根据以下摘要内容回答用户问题。\n\n" +
"用户问题:" + question + "\n" +
"摘要内容:\n" + summary + "\n\n" +
"回答要求:\n- 回答必须以‘回答:’开头\n- 严格禁止添加摘要外的信息\n- 只能使用摘要中提及的内容\n- 禁止合并其他摘要的内容。";
return "你现在的身份是一名“专业问答助手”,你具备极强的信息筛选能力与内容准确性要求,必须严格遵守以下设定完成回答。\n\n" +
"【你的职责】\n" +
"- 你只能使用摘要中提供的信息作答,不能添加、补充或假设任何摘要中未明确提及的内容;\n" +
"- 你必须拒绝回答与摘要内容无关的问题,并说明原因;\n" +
"- 你需要避免重复、冗余表达,禁止出现相似语句多次出现;\n" +
"- 不得混合多个产品或主题的信息;\n\n" +
"【回答格式要求】\n" +
"- 回答必须以“回答:”开头;\n" +
"- 如无法回答,必须使用以下格式拒绝:\n" +
" 回答:对不起,我无法回答该问题,因为摘要中未提供相关信息。\n\n" +
"【用户问题】\n" +
question + "\n\n" +
"【摘要内容】\n" +
summary + "\n\n" +
"请作为“专业问答助手”现在作答:";
}
private String buildMergePrompt(String question, List<String> answers) {
StringBuilder sb = new StringBuilder("你收到多个候选答案,请从中选择最准确且不交叉混淆产品信息的答案作为最终回答。\n\n");
sb.append("用户问题:").append(question).append("\n");
... ...
package org.jeecg.modules.airag.zdyrag.helper;
import com.fasterxml.jackson.databind.ObjectMapper;
import dev.langchain4j.data.message.ChatMessage;
import dev.langchain4j.data.message.UserMessage;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.data.redis.core.RedisTemplate;
import java.util.*;
import java.util.concurrent.TimeUnit;
/**
 * Static helpers for multi-turn chat context kept in Redis under a per-session key:
 * loading, prompt construction, saving, and refusal detection.
 */
@Slf4j
public class MultiTurnContextHelper {
    /** Maximum number of messages kept per session (and shown in the history prompt). */
    private static final int MAX_CONTEXT_SIZE = 10;
    private static final long CONTEXT_TTL_MILLIS = 30 * 60 * 1000; // 30 minutes
    /** Names under which the two sides of a turn are stored (both as UserMessage). */
    private static final String USER_NAME = "user";
    private static final String ASSISTANT_NAME = "assistant";
    private static final List<String> REFUSAL_KEYWORDS =
            List.of("无法", "不知道", "未提及", "没有相关信息", "参考内容为空", "不能回答");

    /** Builds the Redis key that holds the context of a session. */
    public static String redisKey(String sessionId) {
        return "chat:context:" + sessionId;
    }

    /**
     * Loads the stored history of a session.
     *
     * @return a new mutable list; empty when the session id is blank or nothing usable is cached.
     *         Cached elements that are not {@link ChatMessage} instances are skipped so they
     *         cannot surface later as a ClassCastException.
     */
    public static List<ChatMessage> loadHistory(String sessionId, RedisTemplate<String, Object> redisTemplate) {
        List<ChatMessage> history = new ArrayList<>();
        if (StringUtils.isBlank(sessionId)) {
            return history;
        }
        Object cached = redisTemplate.opsForValue().get(redisKey(sessionId));
        if (cached instanceof List) {
            int skipped = 0;
            for (Object element : (List<?>) cached) {
                if (element instanceof ChatMessage) {
                    history.add((ChatMessage) element);
                } else {
                    skipped++;
                }
            }
            if (skipped > 0) {
                log.warn("Ignored {} cached context entries that are not ChatMessage instances (session {})",
                        skipped, sessionId);
            }
        }
        return history;
    }

    /**
     * Builds a prompt that asks the model to answer the current question strictly from
     * the most recent history messages (at most {@code MAX_CONTEXT_SIZE}).
     */
    public static String buildPromptFromHistory(List<ChatMessage> history, String currentQuestion) {
        StringBuilder sb = new StringBuilder("你是一个对话助手,请根据以下历史对话内容回答用户当前问题:\n\n");
        sb.append("限制要求:\n");
        sb.append("1. 严格只能使用历史对话中明确提到的信息\n");
        sb.append("2. 禁止任何基于常识或主观推断的补充\n");
        sb.append("3. 若无法从历史内容中明确回答,应直接拒绝回答\n");
        sb.append("4. 回答必须以“回答:”开头\n\n");
        sb.append("历史对话如下(最多展示最近5轮):\n");
        List<ChatMessage> messages = (history == null) ? List.of() : history;
        for (int i = Math.max(0, messages.size() - MAX_CONTEXT_SIZE); i < messages.size(); i++) {
            ChatMessage msg = messages.get(i);
            sb.append(isAssistantMessage(msg) ? "助手:" : "用户:").append(msg.text()).append("\n");
        }
        sb.append("\n当前用户问题:").append(currentQuestion).append("\n");
        return sb.toString();
    }

    /**
     * Tells assistant turns from user turns. saveHistory stores both sides as UserMessage and
     * distinguishes them only by name, so a plain instanceof check would label every turn as
     * the user; any non-UserMessage type is treated as an assistant turn.
     */
    private static boolean isAssistantMessage(ChatMessage message) {
        if (message instanceof UserMessage) {
            return ASSISTANT_NAME.equals(((UserMessage) message).name());
        }
        return true;
    }

    /**
     * Appends the question/answer pair to the history, keeps the newest
     * {@code MAX_CONTEXT_SIZE} messages and stores the result with the context TTL.
     * Does nothing when the session id is blank. The caller's list is not modified.
     */
    public static void saveHistory(String sessionId, RedisTemplate<String, Object> redisTemplate,
                                   List<ChatMessage> history, String question, String answer) {
        if (StringUtils.isBlank(sessionId)) {
            return;
        }
        List<ChatMessage> updated = (history == null) ? new ArrayList<>() : new ArrayList<>(history);
        updated.add(new UserMessage(USER_NAME, question));
        updated.add(new UserMessage(ASSISTANT_NAME, answer));
        if (updated.size() > MAX_CONTEXT_SIZE) {
            // Copy the window: ArrayList.subList returns a non-serializable view backed by the
            // full list, which is not safe to hand to a Redis serializer.
            updated = new ArrayList<>(updated.subList(updated.size() - MAX_CONTEXT_SIZE, updated.size()));
        }
        redisTemplate.opsForValue().set(redisKey(sessionId), updated, CONTEXT_TTL_MILLIS, TimeUnit.MILLISECONDS);
    }

    /**
     * Returns whether the answer contains one of the refusal phrases; a null answer yields false.
     */
    public static boolean containsRefusalKeywords(String answer) {
        if (answer == null) {
            return false;
        }
        return REFUSAL_KEYWORDS.stream().anyMatch(answer::contains);
    }
}
... ...