Document Splitting
Endpoint
POST /api/dataset/document/split
Request Parameters
file (binary): the document file to upload and split.
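For reference, here is a minimal client sketch using OkHttp; the host and the sample file name are placeholders, and the multipart field name file matches the parameter above.
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;

import okhttp3.MediaType;
import okhttp3.MultipartBody;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.RequestBody;
import okhttp3.Response;

public class SplitClientExample {
  public static void main(String[] args) throws IOException {
    byte[] pdf = Files.readAllBytes(Paths.get("ICS111_31391_Miller_Syllabus_F24.pdf"));
    RequestBody filePart = RequestBody.create(MediaType.parse("application/pdf"), pdf);
    RequestBody body = new MultipartBody.Builder()
        .setType(MultipartBody.FORM)
        .addFormDataPart("file", "ICS111_31391_Miller_Syllabus_F24.pdf", filePart)
        .build();
    Request request = new Request.Builder()
        .url("http://localhost/api/dataset/document/split") // placeholder host
        .post(body)
        .build();
    try (Response response = new OkHttpClient().newCall(request).execute()) {
      System.out.println(response.body().string());
    }
  }
}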
Response
{
"code": 200,
"message": "成功",
"data": [
{
"name": "ICS111_31391_Miller_Syllabus_F24.pdf",
"id": "file_id"
"content": [
{
"title": "",
"content": "ICS 111- Introduction to Computer Science I, ...n"
}
]
}
]
}
Document Upload
Receives an uploaded document and stores it on the local file system.
Data Model
package com.litongjava.maxkb.model;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.Accessors;
@Data
@NoArgsConstructor
@AllArgsConstructor
@Accessors(chain = true)
public class UploadResultVo {
private Long id;
private String filename;
private String url;
private String md5;
}
Controller
package com.litongjava.maxkb.controller;
import java.io.IOException;
import com.litongjava.annotation.RequestPath;
import com.litongjava.jfinal.aop.Aop;
import com.litongjava.maxkb.model.UploadResultVo;
import com.litongjava.maxkb.service.DatasetDocumentSplitService;
import com.litongjava.maxkb.service.SystemFileService;
import com.litongjava.model.result.ResultVo;
import com.litongjava.tio.http.common.UploadFile;
@RequestPath("/api/dataset/document")
public class ApiDatasetDocumentController {
/**
 * Split an uploaded document.
 *
 * @param file the uploaded file
 * @return the split result
 * @throws IOException on I/O failure
 */
public ResultVo split(UploadFile file) throws IOException {
if (file == null) {
return ResultVo.fail("No file found in the request body");
}
SystemFileService systemFileService = Aop.get(SystemFileService.class);
UploadResultVo vo = systemFileService.upload(file, "default", "default");
return Aop.get(DatasetDocumentSplitService.class).split(file.getData(), vo);
}
}
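With @RequestPath routing in tio-boot, the action method name is appended to the class-level path, so the split method above serves the POST /api/dataset/document/split endpoint documented at the top of this section.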
File Service
package com.litongjava.maxkb.service;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import com.litongjava.db.activerecord.Row;
import com.litongjava.jfinal.aop.Aop;
import com.litongjava.maxkb.dao.SystemUploadFileDao;
import com.litongjava.maxkb.model.UploadResultVo;
import com.litongjava.tio.http.common.UploadFile;
import com.litongjava.tio.utils.crypto.Md5Utils;
import com.litongjava.tio.utils.environment.EnvUtils;
import com.litongjava.tio.utils.hutool.FileUtil;
import com.litongjava.tio.utils.hutool.FilenameUtils;
import com.litongjava.tio.utils.snowflake.SnowflakeIdUtils;
public class SystemFileService {
/**
 * Upload a file and store it on the local file system.
 *
 * @param uploadFile the uploaded file
 * @param bucketName the bucket name
 * @param category the file category
 * @return UploadResultVo the upload result
 */
public UploadResultVo upload(UploadFile uploadFile, String bucketName, String category) {
if (uploadFile != null) {
byte[] fileData = uploadFile.getData();
String digestHex = Md5Utils.digestHex(fileData);
SystemUploadFileDao systemUploadFileDao = Aop.get(SystemUploadFileDao.class);
Row row = systemUploadFileDao.getFileBasicInfoByMd5(bucketName, digestHex);
// If the file already exists, return the existing record
if (row != null) {
Long id = row.getLong("id");
String filename = row.getStr("filename");
String targetName = row.getStr("target_name");
String url = getUrl(bucketName, targetName);
return new UploadResultVo(id, filename, url, digestHex);
}
// Generate a new file name and path
String originFilename = uploadFile.getName();
String suffix = FilenameUtils.getSuffix(originFilename);
long id = SnowflakeIdUtils.id();
String filename = id + "." + suffix;
Path path = Paths.get("pages", bucketName, category);
// Create the directory if it does not exist
try {
Files.createDirectories(path);
} catch (IOException e) {
e.printStackTrace();
return null;
}
// Full file path
Path filePath = path.resolve(filename);
File file = filePath.toFile();
// Write the file data to the target path
FileUtil.writeBytes(fileData, file);
String targetName = category + "/" + filename;
String url = getUrl(bucketName, targetName);
systemUploadFileDao.save(id, digestHex, originFilename, fileData.length, "local", bucketName, targetName);
return new UploadResultVo(id, originFilename, url, digestHex);
}
return null;
}
/**
 * Build the file access URL from the bucket and target name.
 *
 * @param bucketName the bucket name
 * @param targetName the target name
 * @return the file access URL
 */
public String getUrl(String bucketName, String targetName) {
String prefixUrl = EnvUtils.getStr("file_prefix_url");
return prefixUrl + "/" + bucketName + "/" + targetName;
}
/**
 * Look up a file's URL by its ID.
 *
 * @param id the file ID
 * @return UploadResultVo the upload result
 */
public UploadResultVo getUrlById(Long id) {
SystemUploadFileDao systemUploadFileDao = Aop.get(SystemUploadFileDao.class);
Row row = systemUploadFileDao.getFileBasicInfoById(id);
String md5 = row.getStr("md5");
String filename = row.getStr("filename");
String bucketName = row.getStr("bucket_name");
String targetName = row.getStr("target_name");
String url = getUrl(bucketName, targetName);
return new UploadResultVo(id, filename, url, md5);
}
/**
 * Look up a file's URL by its MD5.
 *
 * @param bucketName the bucket name
 * @param md5 the file MD5
 * @return UploadResultVo the upload result
 */
public UploadResultVo getUrlByMd5(String bucketName, String md5) {
SystemUploadFileDao systemUploadFileDao = Aop.get(SystemUploadFileDao.class);
Row row = systemUploadFileDao.getFileBasicInfoByMd5(bucketName, md5);
Long id = row.getLong("id");
String filename = row.getStr("filename");
String targetName = row.getStr("target_name");
String url = getUrl(bucketName, targetName);
return new UploadResultVo(id, filename, url, md5);
}
}
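Note that getUrl reads the file_prefix_url key through EnvUtils, so that prefix must be configured; with a placeholder value such as file_prefix_url=http://localhost/files, uploads resolve to URLs of the form http://localhost/files/{bucketName}/{targetName}.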
Data Access Object (DAO)
package com.litongjava.maxkb.dao;
import com.litongjava.db.activerecord.Db;
import com.litongjava.db.activerecord.Row;
public class SystemUploadFileDao {
public static final String tableName = "max_kb_file";
/**
 * Fetch basic file info by MD5.
 *
 * @param bucketName the bucket name
 * @param md5 the file MD5
 * @return Row the basic file info
 */
public Row getFileBasicInfoByMd5(String bucketName, String md5) {
String sql = String.format("SELECT id, filename, bucket_name, target_name FROM %s WHERE bucket_name=? AND md5=? AND deleted=0", tableName);
return Db.findFirst(sql, bucketName, md5);
}
/**
 * Fetch basic file info by file ID.
 *
 * @param id the file ID
 * @return Row the basic file info
 */
public Row getFileBasicInfoById(long id) {
String sql = String.format("SELECT md5, filename, bucket_name, target_name FROM %s WHERE id=? AND deleted=0", tableName);
return Db.findFirst(sql, id);
}
/**
 * Save a file record.
 *
 * @param id the file ID
 * @param md5 the file MD5
 * @param originFilename the original file name
 * @param fileSize the file size in bytes
 * @param platform the storage platform
 * @param bucketName the bucket name
 * @param targetName the target name
 * @return whether the save succeeded
 */
public boolean save(long id, String md5, String originFilename, int fileSize, String platform, String bucketName, String targetName) {
Row row = Row.by("id", id)
.set("md5", md5)
.set("filename", originFilename)
.set("file_size", fileSize)
.set("platform", platform)
.set("bucket_name", bucketName)
.set("target_name", targetName);
return Db.save(tableName, row);
}
}
Document Parsing and Splitting
There are many approaches to document parsing; here we use a large language model, with a parsing prompt driving the extraction that feeds the document split.
Example parsing prompt:
You are a helpful OCR assistant. Please extract the text from the following image and text.
- Ignore the header and footer, but keep the title.
- Do not include any additional information or explanations.
- Recognize mathematical, physical, and chemical symbols and formulas, and keep their format.
- Recognize tables, titles, and page numbers.
- If you encounter an image, display a placeholder for it.
- The output format is Markdown.
- Output the Markdown directly, without wrapping it in Markdown code fences.
- If you cannot recognize the content, please output "not_working".
markdown:
PDF Parsing Workflow
1. Convert each PDF page to an image: use PDFBox to render every page of the PDF as an image.
2. Start the configured number of concurrent workers: open multiple threads according to configuration and send each image, together with the prompt, to the large model for parsing.
3. Cache each page's result: cache the per-image parse results so identical images are never parsed twice.
4. Concatenate and store the document: join all parsed text into one complete document and save it locally.
5. Split the document: use a document splitter to chunk the document by the configured rules for later processing and storage.
6. Return the chunks to the front end: send the chunked result back to the front-end application as JSON.
Single-Threaded Approach
Prompt Constant
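The original does not include the constant's source; below is a minimal sketch, assuming the prompt shown earlier is stored in a static field. The class and field names (MaxKbPrompt.image_to_text) are taken from the services that reference them.
package com.litongjava.maxkb.constant;

public class MaxKbPrompt {

  // Prompt sent to the vision model together with each rendered page image
  public static final String image_to_text = "You are a helpful OCR assistant. Please extract the text from the following image and text.\n"
      + "- Ignore the header and footer, but keep the title.\n"
      + "- Do not include any additional information or explanations.\n"
      + "- Recognize mathematical, physical, and chemical symbols and formulas, and keep their format.\n"
      + "- Recognize tables, titles, and page numbers.\n"
      + "- If you encounter an image, display a placeholder for it.\n"
      + "- The output format is Markdown.\n"
      + "- Output the Markdown directly, without wrapping it in Markdown code fences.\n"
      + "- If you cannot recognize the content, please output \"not_working\".\n"
      + "markdown:";
}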
Controller (single-threaded)
package com.litongjava.maxkb.controller;
import java.io.IOException;
import com.litongjava.annotation.RequestPath;
import com.litongjava.jfinal.aop.Aop;
import com.litongjava.maxkb.model.UploadResultVo;
import com.litongjava.maxkb.service.DatasetDocumentSplitService;
import com.litongjava.maxkb.service.SystemFileService;
import com.litongjava.model.result.ResultVo;
import com.litongjava.tio.http.common.UploadFile;
@RequestPath("/api/dataset/document")
public class ApiDatasetDocumentController {
public ResultVo split(UploadFile file) throws IOException {
if (file == null) {
return ResultVo.fail("No file found in the request body");
}
SystemFileService systemFileService = Aop.get(SystemFileService.class);
UploadResultVo vo = systemFileService.upload(file, "default", "default");
return Aop.get(DatasetDocumentSplitService.class).split(file.getData(), vo);
}
}
Document Splitting Service (single-threaded)
package com.litongjava.maxkb.service;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import com.jfinal.kit.Kv;
import com.litongjava.db.TableInput;
import com.litongjava.db.activerecord.Db;
import com.litongjava.db.activerecord.Row;
import com.litongjava.maxkb.constant.MaxKbPrompt;
import com.litongjava.maxkb.constant.TableNames;
import com.litongjava.maxkb.model.UploadResultVo;
import com.litongjava.model.result.ResultVo;
import com.litongjava.openai.chat.ChatResponseUsage;
import com.litongjava.openai.chat.ChatResponseVo;
import com.litongjava.openai.client.OpenAiClient;
import com.litongjava.openai.constants.OpenAiModels;
import com.litongjava.table.services.ApiTable;
import com.litongjava.tio.utils.crypto.Md5Utils;
import com.litongjava.tio.utils.environment.EnvUtils;
import com.litongjava.tio.utils.hutool.FileUtil;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.splitter.DocumentSplitters;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.openai.OpenAiTokenizer;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class DatasetDocumentSplitService {
/**
 * Split a document.
 *
 * @param data the binary document data
 * @param vo the upload result
 * @return the split result
 * @throws IOException on I/O failure
 */
public ResultVo split(byte[] data, UploadResultVo vo) throws IOException {
String filename = vo.getFilename();
String suffix = "png";
String apiKey = EnvUtils.getStr("OPENAI_API_KEY");
String markdown = toMarkdown(apiKey, data, suffix);
Document document = new Document(markdown);
DocumentSplitter splitter = DocumentSplitters.recursive(500, 100, new OpenAiTokenizer());
List<TextSegment> segments = splitter.split(document);
Kv fileSplitResult = Kv.by("name", filename);
List<Kv> contents = new ArrayList<>();
for (TextSegment textSegment : segments) {
contents.add(Kv.by("title", "").set("content", textSegment.text()));
}
fileSplitResult.set("content", contents);
List<Kv> results = new ArrayList<>();
results.add(fileSplitResult);
return ResultVo.ok(results);
}
/**
 * Convert a document to Markdown.
 *
 * @param apiKey the OpenAI API key
 * @param data the binary document data
 * @param suffix the image file suffix
 * @return the Markdown text
 * @throws IOException on I/O failure
 */
private String toMarkdown(String apiKey, byte[] data, String suffix) throws IOException {
String md5 = Md5Utils.digestHex(data);
TableInput ti = TableInput.create().columns("target");
String target = ApiTable.queryStr(TableNames.max_kb_document_markdown_cache, md5, ti);
boolean exists = false;
if (target != null) {
exists = true;
// Read the cached file and return its content
File file = new File(target);
if (file.exists()) {
return FileUtil.readString(file);
}
}
// Load the PDF document and get the page count
List<byte[]> documentBytes = new ArrayList<>();
int totalPages = 0;
try (PDDocument document = PDDocument.load(new ByteArrayInputStream(data))) {
totalPages = document.getNumberOfPages();
log.info("Total pages: {}", totalPages);
PDFRenderer renderer = new PDFRenderer(document);
for (int i = 0; i < totalPages; i++) {
BufferedImage bufferedImage = renderer.renderImageWithDPI(i, 144);
byte[] imageBytes = toBytes(bufferedImage, suffix);
documentBytes.add(imageBytes);
}
} catch (IOException e) {
log.error("Error loading PDF document: {}", e.getMessage(), e);
throw e;
}
List<String> markdowns = new ArrayList<>(documentBytes.size());
for (byte[] imageBytes : documentBytes) {
String markdown = convertPdfToMarkdown(apiKey, imageBytes, suffix);
markdowns.add(markdown);
}
StringBuilder combinedMarkdown = new StringBuilder();
for (String string : markdowns) {
combinedMarkdown.append(string);
}
target = "markdowns/" + md5 + ".md";
new File(target).getParentFile().mkdirs();
// Save to the file cache
FileUtil.writeString(combinedMarkdown.toString(), target, "UTF-8");
// Record the cache row (as the concurrent version below does) so later requests skip re-parsing
if (exists) {
Db.update(TableNames.max_kb_document_markdown_cache, Row.by("id", md5).set("target", target));
} else {
Db.save(TableNames.max_kb_document_markdown_cache, Row.by("id", md5).set("target", target));
}
return combinedMarkdown.toString();
}
/**
 * Convert a BufferedImage to a byte array.
 *
 * @param bufferedImage the image
 * @param suffix the image file suffix
 * @return the byte array
 * @throws IOException on I/O failure
 */
public byte[] toBytes(BufferedImage bufferedImage, String suffix) throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
ImageIO.write(bufferedImage, suffix, baos);
baos.flush();
byte[] imageBytes = baos.toByteArray();
baos.close();
return imageBytes;
}
/**
 * Convert one PDF page image to Markdown.
 *
 * @param apiKey the OpenAI API key
 * @param imageBytes the image bytes
 * @param suffix the image file suffix
 * @return the Markdown text
 * @throws IOException on I/O failure
 */
private String convertPdfToMarkdown(String apiKey, byte[] imageBytes, String suffix) throws IOException {
String id = Md5Utils.digestHex(imageBytes);
String sql = String.format("SELECT content FROM %s WHERE id=?", TableNames.max_kb_document_markdown_page_cache);
String content = Db.queryStr(sql, id);
if (content != null) {
return content;
}
String imageName = id + "." + suffix;
String imagePath = "images/" + imageName;
File imageFile = new File(imagePath);
imageFile.getParentFile().mkdirs();
FileUtil.writeBytes(imageBytes, imageFile);
long start = System.currentTimeMillis();
// Call the large model to parse the page
ChatResponseVo chatResponseVo = OpenAiClient.chatWithImage(apiKey, MaxKbPrompt.image_to_text, imageBytes, suffix);
long end = System.currentTimeMillis();
content = chatResponseVo.getChoices().get(0).getMessage().getContent();
if (content.startsWith("```markdown")) {
content = content.substring(11, content.length() - 3);
}
ChatResponseUsage usage = chatResponseVo.getUsage();
long elapsed = end - start;
TableInput saveInput = TableInput.by("id", id)
.set("target", imagePath)
.set("content", content)
.set("elapsed", elapsed)
.set("model", OpenAiModels.gpt_4o)
.set("system_fingerprint", chatResponseVo.getSystem_fingerprint())
.set("completion_tokens", usage.getCompletion_tokens())
.set("prompt_tokens", usage.getPrompt_tokens())
.set("total_tokens", usage.getTotal_tokens());
ApiTable.save(TableNames.max_kb_document_markdown_page_cache, saveInput);
return content;
}
}
Concurrent Approach
CompletionService
CompletionService is an interface in the Java concurrency library for managing a group of concurrent tasks. It provides a convenient way to submit tasks and retrieve results in completion order, without tracking submission order or execution time.
Core methods:
- submit(Callable task): submits a task to the underlying Executor.
- take(): blocks until a task completes and returns its result; results arrive in completion order.
- poll(): retrieves a completed result without blocking, returning null if no task has finished.
Typical use:
CompletionService makes it easy to consume results from a batch of asynchronous tasks as they finish, for example when parsing many files or running many database queries, handing you each result as soon as it is ready.
Caveat:
CompletionService does not preserve submission order: results come back in the order tasks finish, which ignores page order.
Workaround:
To keep parsed pages in order, bind each task's page index to its Future at submission time, then reorder the results by index after all tasks complete, as the sketch below shows.
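Before the full implementation, here is a minimal, self-contained sketch of this index-binding pattern; the parsePage stub is hypothetical and stands in for the real model call.
package com.litongjava.maxkb.example;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class OrderedCompletionExample {

  // Hypothetical stand-in for the real page-parsing call
  static String parsePage(int pageIndex) {
    return "page-" + pageIndex + "\n";
  }

  public static void main(String[] args) throws InterruptedException, ExecutionException {
    ExecutorService pool = Executors.newFixedThreadPool(4);
    CompletionService<String> cs = new ExecutorCompletionService<>(pool);
    int pages = 8;
    // Submit one task per page and remember each Future's position in the list
    List<Future<String>> futures = new ArrayList<>();
    for (int i = 0; i < pages; i++) {
      final int page = i;
      futures.add(cs.submit(() -> parsePage(page)));
    }
    // take() yields tasks in completion order; futures.indexOf restores page order
    List<String> ordered = new ArrayList<>(Collections.nCopies(pages, (String) null));
    for (int i = 0; i < pages; i++) {
      Future<String> done = cs.take();
      ordered.set(futures.indexOf(done), done.get());
    }
    pool.shutdown();
    System.out.print(String.join("", ordered));
  }
}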
Example Code
package com.litongjava.maxkb.utils;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
public class ExecutorServiceUtils {
private static final int CONCURRENT_REQUESTS = 100;
private static final ExecutorService executorService = Executors.newFixedThreadPool(CONCURRENT_REQUESTS);
public static ExecutorService getExecutorService() {
return executorService;
}
}
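The fixed pool of 100 threads caps how many model requests are in flight at once; CONCURRENT_REQUESTS should be tuned to the API provider's rate limits and to the memory cost of holding rendered page images.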
package com.litongjava.maxkb.service;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.Future;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import com.jfinal.kit.Kv;
import com.litongjava.db.TableInput;
import com.litongjava.db.TableResult;
import com.litongjava.db.activerecord.Db;
import com.litongjava.db.activerecord.Row;
import com.litongjava.maxkb.constant.MaxKbPrompt;
import com.litongjava.maxkb.constant.TableNames;
import com.litongjava.maxkb.utils.ExecutorServiceUtils;
import com.litongjava.maxkb.model.UploadResultVo;
import com.litongjava.model.result.ResultVo;
import com.litongjava.openai.chat.ChatResponseUsage;
import com.litongjava.openai.chat.ChatResponseVo;
import com.litongjava.openai.client.OpenAiClient;
import com.litongjava.openai.constants.OpenAiModels;
import com.litongjava.table.services.ApiTable;
import com.litongjava.tio.utils.crypto.Md5Utils;
import com.litongjava.tio.utils.environment.EnvUtils;
import com.litongjava.tio.utils.hutool.FileUtil;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.splitter.DocumentSplitters;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.openai.OpenAiTokenizer;
import lombok.extern.slf4j.Slf4j;
/**
 * DatasetDocumentSplitService
 *
 * Splits an uploaded PDF document into Markdown segments, using the OpenAI API
 * to convert page images to text. Pages are processed concurrently for throughput.
 */
@Slf4j
public class DatasetDocumentSplitService {
private static final int MAX_HEIGHT = 2200;
private final Object lock = new Object();
/**
 * Split a document (concurrent version).
 *
 * @param data the binary document data
 * @param vo the upload result
 * @return the split result
 * @throws IOException on I/O failure
 * @throws InterruptedException if a worker thread is interrupted
 * @throws ExecutionException if a page-conversion task fails
 */
public ResultVo split(byte[] data, UploadResultVo vo) throws IOException, InterruptedException, ExecutionException {
String filename = vo.getFilename();
String suffix = "png";
String apiKey = EnvUtils.getStr("OPENAI_API_KEY");
String markdown = toMarkdown(apiKey, data, suffix);
Document document = new Document(markdown);
// Use a smaller chunk size (150) with an overlap of 100 for finer-grained segments
DocumentSplitter splitter = DocumentSplitters.recursive(150, 100, new OpenAiTokenizer());
List<TextSegment> segments = splitter.split(document);
// Build the KV object carrying the file name and ID
Kv fileSplitResult = Kv.by("name", filename).set("id", vo.getId());
List<Kv> contents = new ArrayList<>();
for (TextSegment textSegment : segments) {
contents.add(Kv.by("title", "").set("content", textSegment.text()));
}
fileSplitResult.set("content", contents);
List<Kv> results = new ArrayList<>();
results.add(fileSplitResult);
return ResultVo.ok(results);
}
/**
 * Convert a document to Markdown (concurrent version).
 *
 * @param apiKey the OpenAI API key
 * @param data the binary document data
 * @param suffix the image file suffix
 * @return the Markdown text
 * @throws IOException on I/O failure
 * @throws InterruptedException if a worker thread is interrupted
 * @throws ExecutionException if a page-conversion task fails
 */
private String toMarkdown(String apiKey, byte[] data, String suffix) throws IOException, InterruptedException, ExecutionException {
String md5 = Md5Utils.digestHex(data);
log.info("Processing document with MD5: {}", md5);
// Check the cache to see whether this document was already processed
TableInput ti = TableInput.create().columns("target,content").set("id", md5);
TableResult<Row> tableResult = ApiTable.get(TableNames.max_kb_document_markdown_cache, ti);
Row row = tableResult.getData();
boolean exists = false;
String target = null;
if (row != null) {
target = row.getStr("target");
if (target != null) {
exists = true;
File file = new File(target);
if (file.exists()) {
log.info("Markdown found in cache at {}", target);
return FileUtil.readString(file);
}
}
String content = row.getStr("content");
if (content != null) {
log.info("Markdown content found in cache");
return content;
}
}
// Render each PDF page to image bytes, splitting images that are too tall
List<byte[]> documentBytes = new ArrayList<>();
int totalPages = 0;
try (PDDocument document = PDDocument.load(new ByteArrayInputStream(data))) {
totalPages = document.getNumberOfPages();
PDFRenderer renderer = new PDFRenderer(document);
for (int i = 0; i < totalPages; i++) {
BufferedImage bufferedImage = renderer.renderImageWithDPI(i, 144);
// Split the image if its height exceeds MAX_HEIGHT
List<byte[]> splitImages = toBytes(bufferedImage, suffix);
documentBytes.addAll(splitImages);
}
}
// Use a CompletionService to manage the concurrent tasks
CompletionService<String> completionService = new ExecutorCompletionService<>(ExecutorServiceUtils.getExecutorService());
List<Future<String>> futures = new ArrayList<>();
for (byte[] imageBytes : documentBytes) {
futures.add(completionService.submit(() -> convertPdfPageToMarkdown(apiKey, imageBytes, suffix)));
}
// Wait for all tasks to finish and store results in submission order
List<String> markdowns = new ArrayList<>(Collections.nCopies(documentBytes.size(), null));
for (int i = 0; i < futures.size(); i++) {
Future<String> future = completionService.take();
int pageIndex = futures.indexOf(future); // recover the task's submission index
markdowns.set(pageIndex, future.get()); // store the result at its page position
}
// Combine all Markdown content
StringBuilder combinedMarkdown = new StringBuilder();
for (String markdown : markdowns) {
combinedMarkdown.append(markdown);
}
// Save the Markdown to a file
target = "markdowns/" + md5 + ".md";
new File(target).getParentFile().mkdirs();
FileUtil.writeString(combinedMarkdown.toString(), target, "UTF-8");
log.info("Markdown saved to {}", target);
// Update or insert the cache record
if (exists) {
Db.update(TableNames.max_kb_document_markdown_cache, Row.by("id", md5).set("target", target));
log.info("Cache updated for document MD5: {}", md5);
} else {
Db.save(TableNames.max_kb_document_markdown_cache, Row.by("id", md5).set("target", target));
log.info("Cache saved for new document MD5: {}", md5);
}
return combinedMarkdown.toString();
}
/**
 * Convert a BufferedImage to byte arrays, splitting the image vertically when
 * its height exceeds MAX_HEIGHT.
 *
 * @param bufferedImage the image
 * @param suffix the image file suffix
 * @return the list of byte arrays
 * @throws IOException on I/O failure
 */
private List<byte[]> toBytes(BufferedImage bufferedImage, String suffix) throws IOException {
List<byte[]> imageBytesList = new ArrayList<>();
if (bufferedImage.getHeight() <= MAX_HEIGHT) {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
ImageIO.write(bufferedImage, suffix, baos);
imageBytesList.add(baos.toByteArray());
} else {
int numParts = (int) Math.ceil((double) bufferedImage.getHeight() / MAX_HEIGHT);
for (int i = 0; i < numParts; i++) {
int y = i * MAX_HEIGHT;
int height = Math.min(MAX_HEIGHT, bufferedImage.getHeight() - y);
BufferedImage subImage = bufferedImage.getSubimage(0, y, bufferedImage.getWidth(), height);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
ImageIO.write(subImage, suffix, baos);
imageBytesList.add(baos.toByteArray());
log.debug("Image split into part {}/{}", i + 1, numParts);
}
}
return imageBytesList;
}
/**
 * Convert one PDF page image to Markdown (concurrent version).
 *
 * @param apiKey the OpenAI API key
 * @param imageBytes the image bytes
 * @param suffix the image file suffix
 * @return the Markdown text
 * @throws IOException on I/O failure
 */
private String convertPdfPageToMarkdown(String apiKey, byte[] imageBytes, String suffix) throws IOException {
String id = Md5Utils.digestHex(imageBytes);
String sql = String.format("SELECT content FROM %s WHERE id=?", TableNames.max_kb_document_markdown_page_cache);
// Check the page cache to avoid reprocessing
String content = Db.queryStr(sql, id);
if (content != null) {
log.debug("Content found in page cache for ID: {}", id);
return content;
}
// Save the image file
String imageName = id + "." + suffix;
String imagePath = "images/" + imageName;
File imageFile = new File(imagePath);
imageFile.getParentFile().mkdirs();
FileUtil.writeBytes(imageBytes, imageFile);
log.debug("Image saved to {}", imagePath);
// Call the OpenAI API to convert the image to text
long start = System.currentTimeMillis();
ChatResponseVo chatResponseVo = null;
try {
chatResponseVo = OpenAiClient.chatWithImage(apiKey, MaxKbPrompt.image_to_text, imageBytes, suffix);
} catch (Exception e) {
// Naive retry: attempt the call up to two more times before letting the exception propagate
try {
chatResponseVo = OpenAiClient.chatWithImage(apiKey, MaxKbPrompt.image_to_text, imageBytes, suffix);
} catch (Exception e1) {
chatResponseVo = OpenAiClient.chatWithImage(apiKey, MaxKbPrompt.image_to_text, imageBytes, suffix);
}
}
content = chatResponseVo.getChoices().get(0).getMessage().getContent();
if (content.startsWith("```markdown")) {
content = content.substring(11, content.length() - 3);
} else if (content.startsWith("```")) {
content = content.substring(3, content.length() - 3);
}
ChatResponseUsage usage = chatResponseVo.getUsage();
TableInput saveInput = TableInput.by("id", id).set("target", imagePath).set("content", content).set("elapsed", System.currentTimeMillis() - start).set("model", OpenAiModels.gpt_4o)
.set("system_fingerprint", chatResponseVo.getSystem_fingerprint()).set("completion_tokens", usage.getCompletion_tokens()).set("prompt_tokens", usage.getPrompt_tokens())
.set("total_tokens", usage.getTotal_tokens());
// Check the cache again to avoid duplicate saves under concurrency
String cacheContent = Db.queryStr(sql, id);
if (cacheContent != null) {
log.debug("Content found in page cache during save for ID: {}", id);
return cacheContent;
}
// Synchronize the save to avoid concurrent writes
synchronized (lock) {
// Double-check inside the lock for thread safety
cacheContent = Db.queryStr(sql, id);
if (cacheContent == null) {
ApiTable.save(TableNames.max_kb_document_markdown_page_cache, saveInput);
log.debug("Content cached for page ID: {}", id);
return content;
} else {
return cacheContent;
}
}
}
}
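The services above also reference a TableNames constants class that is not shown. A minimal sketch, assuming each constant simply holds its table's name (the literal table names are assumptions inferred from the identifiers):
package com.litongjava.maxkb.constant;

public class TableNames {
  // Cache of whole-document Markdown conversions, keyed by file MD5 (assumed table name)
  public static final String max_kb_document_markdown_cache = "max_kb_document_markdown_cache";
  // Cache of per-page conversions, keyed by page-image MD5 (assumed table name)
  public static final String max_kb_document_markdown_page_cache = "max_kb_document_markdown_page_cache";
}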