@@ -1,10 +1,15 @@
-import json,os
+import json,os,time
+import re
 from typing import List
 from PyPDF2 import PdfReader, PdfWriter
-from model import SplitModel,PageConfig
+from tools.pdf_split.model import SplitModel,PageConfig
 from PIL import Image
 import io
 import pymupdf
+import tools.utils as utils
+from tools.utils.file_helper import encode_image
+from tools.utils.ai_helper import AiHelper
+from tools.pdf_split.mysql_store import MysqlStore


 class PDFProcessor:
     """PDF processor class; performs PDF file splitting operations."""
@@ -170,87 +175,251 @@ class PDFProcessor:
     def extract_and_merge_images(input_file: str, output_file: str = None) -> str:
         try:
             pdf_document = pymupdf.open(input_file)
-            images = []
-            total_height = 0
-            max_width = 0
+            # Derive the image directory from the input path unless the caller
+            # supplied one: .../pdf/<name>.pdf -> .../img/<name>/
+            if output_file is None:
+                parts = input_file.rsplit('/pdf/', 1)
+                output_file = '/pdf/'.join(parts[:-1]) + '/img/' + parts[-1]
+            output_dir = os.path.splitext(output_file)[0]
+            os.makedirs(output_dir, exist_ok=True)
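+            # Example (hypothetical path): "./output/pdf/spec.pdf" maps to
+            # "./output/img/spec/", which receives one PNG per page.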
             # Iterate over every page and render it to an image
             for page_num in range(pdf_document.page_count):
                 page = pdf_document[page_num]
-
                 # Render the whole page (embedded images are not extracted separately)
-                pix = page.get_pixmap(matrix=pymupdf.Matrix(2, 2))  # 2x zoom for better quality
-                img_data = pix.tobytes("png")
-
-                # Convert the image bytes into a PIL Image object
-                image = Image.open(io.BytesIO(img_data))
-                if image.mode != 'RGB':
-                    image = image.convert('RGB')
-
-                images.append(image)
-                total_height += image.height
-                max_width = max(max_width, image.width)
-
-            # If no images were found
-            if not images:
-                print("No images found in the PDF")
-                return ''
-
-            # Create a new canvas for stitching
-            merged_image = Image.new('RGB', (max_width, total_height))
-            y_offset = 0
-
-            # Stitch all images vertically
-            for img in images:
-                x_offset = (max_width - img.width) // 2
-                merged_image.paste(img, (x_offset, y_offset))
-                y_offset += img.height
-
-            # Determine the output path
-            if output_file is None:
-                parts = input_file.rsplit('/pdf/', 1)
-                output_file = '/pdf/'.join(parts[:-1]) + '/img/' + parts[-1]
-                output_file = os.path.splitext(output_file)[0] + "_merged.png"
-            os.makedirs(os.path.dirname(output_file), exist_ok=True)
-
-            # Compute the target size from the image count
-            target_size_per_image = 100 * 1024  # 100KB per image
-            max_size = target_size_per_image * len(images)
-            scale = 1.0
-            quality = 95
-
-            while True:
-                temp_buffer = io.BytesIO()
-                if scale < 1.0:
-                    new_size = (int(merged_image.width * scale), int(merged_image.height * scale))
-                    resized_image = merged_image.resize(new_size, Image.Resampling.LANCZOS)
-                    resized_image.save(temp_buffer, 'PNG', optimize=True, quality=quality)
-                else:
-                    merged_image.save(temp_buffer, 'PNG', optimize=True, quality=quality)
-
-                size = temp_buffer.tell()
-
-                if size <= max_size:
-                    with open(output_file, 'wb') as f:
-                        f.write(temp_buffer.getvalue())
-                    print(f"Image saved [{size // 1024} KB]: {output_file}")
-                    break
-
-                if scale > 0.5:
-                    scale *= 0.9
-                else:
-                    # Minimum scale reached; save the current result as-is
-                    with open(output_file, 'wb') as f:
-                        f.write(temp_buffer.getvalue())
-                    print(f"Image saved [{size // 1024} KB]: {output_file}")
-                    break
-
-            return output_file
+                # Initialize the per-page compression parameters
+                scale = 1.0
+                img_data = None
+                max_size = 200 * 1024  # 200KB per-page size limit
+                # Re-render at smaller scales until the page fits the limit
+                while scale >= 0.5:  # minimum scale: 50%
+                    # Render a candidate image at the current scale
+                    temp_pix = page.get_pixmap(matrix=pymupdf.Matrix(1.5 * scale, 1.5 * scale))
+                    img_data = temp_pix.tobytes("png")
+
+                    if len(img_data) <= max_size:  # within the 200KB limit
+                        break
+                    scale *= 0.9  # shrink by 10% per iteration
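+                # If the minimum scale is reached, the last rendered page is
+                # kept even when it still exceeds max_size.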
+                # Build a zero-padded sequential file name, e.g. 01.png
+                img_path = os.path.join(output_dir, f"{page_num + 1:02d}.png")
+                # Save the single-page image
+                with open(img_path, 'wb') as f:
+                    f.write(img_data)
+                print(f"Image saved ({len(img_data) // 1024}KB): {img_path}")
+            return output_dir

         except Exception as e:
             print(f"Error while processing images: {str(e)}")
             return ''
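+
+    # Usage sketch (hypothetical path, assuming the .../pdf/... layout above):
+    #   img_dir = PDFProcessor.extract_and_merge_images("./output/pdf/spec.pdf")
+    #   # -> "./output/img/spec" containing 01.png, 02.png, ...
+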
+    @staticmethod
+    def process_image_to_txt(filename: str):
+        """Merge the images in each subdirectory into a single Markdown file"""
+        version = "v1"
+        base_output_dir = "./temp_files/pdf/output/"
+        output_dir = f"{base_output_dir}/{version}/{filename}/"
+        image_dir = f"{output_dir}/img/"
+        txt_dir = f"{output_dir}/txt/"
+
+        db_store = MysqlStore()
+        ai_helper = AiHelper()
+
+        # Create the standard record if it does not exist yet
+        if not db_store.get_standard_by_name(filename):
+            db_store.create_standard(code=filename, name=filename)
+
+        try:
+            # Walk every subdirectory of the image directory
+            for dir_path, dir_names, file_names in os.walk(image_dir):
+                # Skip the root directory itself
+                if dir_path == image_dir:
+                    continue
+
+                # Parse the chapter structure from the directory name
+                dir_rel_path = os.path.relpath(dir_path, image_dir)
+                chapter_parts = dir_rel_path.split('@')
+                if len(chapter_parts) < 3:
+                    continue  # skip directories that do not follow the naming scheme
+
+                # Collect all images in the current directory
+                image_files = sorted(
+                    [f for f in file_names if f.lower().endswith(('.png', '.jpg', '.jpeg'))],
+                    key=lambda x: int(x.split('.')[0])
+                )
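+                # Note: the sort key assumes purely numeric file names such as
+                # "01.png"; any other name would raise ValueError here.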
+
+                if not image_files:
+                    continue
+
+                # Create one merged markdown file per directory
+                md_filename = f"{dir_rel_path.replace('@', '_')}.md"
+                md_path = os.path.join(txt_dir, md_filename)
+                os.makedirs(os.path.dirname(md_path), exist_ok=True)
+                md_content = f"# {filename}\n## {'/'.join(chapter_parts)}\n\n"
+
+                # Analyze every image in the directory and append the result
+                all_images = []
+                for img_file in image_files:
+                    img_path = os.path.join(dir_path, img_file)
+                    img_name = os.path.basename(img_path)
+                    try:
+                        # Ask the AI model to analyze the image
+                        page_content = ai_helper.analyze_image_with_ai(img_path)
+                        # Sleep 5 seconds to throttle API requests
+                        time.sleep(5)
+                        utils.get_logger().info(f"Processed image {img_path} successfully")
+                        md_content += f"########### {img_path} ####################\n"
+                        md_content += f"--start{img_name}--\n\n"
+                        md_content += f"\n\n{page_content}\n\n"
+                        md_content += f"--end{img_name}--\n\n"
+                        all_images.append(img_path)
+                    except Exception as e:
+                        print(f"Failed to process image {img_file}: {str(e)}")
+                        continue
+
+                # Skip the directory entirely if every image failed
+                if not all_images:
+                    continue
+
+                # Save the merged document
+                with open(md_path, 'w', encoding='utf-8') as f:
+                    f.write(md_content)
+
+                # Insert a database record for the merged chapter
+                db_store.add_pdf_record(
+                    standard_name=filename,
+                    # Map the image directory back to its source PDF
+                    # (.../img/<name>/01.png -> .../pdf/<name>.pdf)
+                    pdf_path=os.path.abspath(os.path.dirname(all_images[0]).replace("/img/", "/pdf/") + '.pdf'),
+                    image_path='\n'.join([os.path.abspath(p) for p in all_images]),
+                    markdown_text=md_content,
+                    chapter=chapter_parts[0],
+                    section=chapter_parts[1],
+                    subsection=chapter_parts[2]
+                )
+
+                print(f"Merged document generated: {md_path}")
+
+        except Exception as e:
+            print(f"Error during processing: {str(e)}")
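+
+    # Usage sketch ("GB50010" is a hypothetical standard name):
+    #   PDFProcessor.process_image_to_txt("GB50010")
+    #   reads  ./temp_files/pdf/output/v1/GB50010/img/<chapter@section@subsection>/NN.png
+    #   writes ./temp_files/pdf/output/v1/GB50010/txt/<chapter_section_subsection>.md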
+
+    @staticmethod
+    def regenerate_markdown(img_path: str):
+        """Regenerate the Markdown content for a given image or directory"""
+        processor = PDFProcessor()
+        db_store = MysqlStore()
+        ai_helper = AiHelper()
+
+        if os.path.isdir(img_path):
+            # Process an entire directory
+            dir_path = img_path
+            # Derive the standard name and structure from the image path
+            parts = dir_path.split('/img/')
+            if len(parts) < 2:
+                print("Invalid directory path")
+                return
+
+            # Locate the original Markdown file: images live under
+            # .../img/<name>/ while the markdown is .../txt/<name>.md
+            txt_root = os.path.dirname(dir_path.replace('/img/', '/txt/'))
+            expected_md = os.path.basename(dir_path).replace('@', '_') + '.md'
+            md_files = [f for f in os.listdir(txt_root) if f == expected_md]
+            if not md_files:
+                print("No matching Markdown file found")
+                return
+
+            md_path = os.path.join(txt_root, md_files[0])
+            # Regenerate the whole directory
+            processor._process_directory(dir_path, md_path, db_store, ai_helper)
+
+        elif os.path.isfile(img_path):
+            # Process a single image
+            dir_path = os.path.dirname(img_path)
+            # Locate the corresponding Markdown file (same mapping as above)
+            txt_root = os.path.dirname(dir_path.replace('/img/', '/txt/'))
+            expected_md = os.path.basename(dir_path).replace('@', '_') + '.md'
+            md_files = [f for f in os.listdir(txt_root) if f == expected_md]
+            if not md_files:
+                print("No matching Markdown file found")
+                return
+
+            md_path = os.path.join(txt_root, md_files[0])
+            # Update the content for the single image
+            processor._update_single_image(img_path, md_path, db_store, ai_helper)
+
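+    # Usage sketch (hypothetical paths):
+    #   PDFProcessor.regenerate_markdown(".../img/chapter@1@2")        # whole directory
+    #   PDFProcessor.regenerate_markdown(".../img/chapter@1@2/01.png") # single image
+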
+    def _process_directory(self, dir_path: str, md_path: str, db_store, ai_helper):
+        """Regenerate the Markdown for an entire directory"""
+        # Collect all images in the directory (numeric names assumed, as above)
+        image_files = sorted(
+            [f for f in os.listdir(dir_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))],
+            key=lambda x: int(x.split('.')[0])
+        )
+
+        # Rebuild the Markdown content
+        new_content = f"# {os.path.basename(os.path.dirname(md_path))}\n"
+        all_images = []
+
+        for img_file in image_files:
+            img_path = os.path.join(dir_path, img_file)
+            page_content = ai_helper.analyze_image_with_ai(img_path)
+            # Tag names must match those written by process_image_to_txt,
+            # which uses the full file name including its extension
+            img_name = img_file
+
+            new_content += f"########### {img_path} ####################\n"
+            new_content += f"--start{img_name}--\n\n"
+            new_content += f"\n\n{page_content}\n\n"
+            new_content += f"--end{img_name}--\n\n"
+            all_images.append(img_path)
+
+        # Nothing to update if no images were processed
+        if not all_images:
+            return
+
+        # Write the regenerated content
+        with open(md_path, 'w', encoding='utf-8') as f:
+            f.write(new_content)
+
+        # Update the database record
+        db_store.update_pdf_record(
+            markdown_text=new_content,
+            image_paths=','.join(all_images),
+            by_image_path=all_images[0]
+        )
+
+    def _update_single_image(self, img_path: str, md_path: str, db_store, ai_helper):
+        """Regenerate the content for a single image"""
+        # Tag names must match those written by process_image_to_txt,
+        # which uses the full file name including its extension
+        img_name = os.path.basename(img_path)
+
+        # Read the existing content
+        with open(md_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+
+        # Build the replacement pattern
+        start_tag = f"--start{img_name}--"
+        end_tag = f"--end{img_name}--"
+        pattern = re.compile(f'{re.escape(start_tag)}(.*?){re.escape(end_tag)}', re.DOTALL)
+
+        # Ask the AI model to re-analyze the image
+        new_content = ai_helper.analyze_image_with_ai(img_path)
+        updated_section = f"{start_tag}\n\n{new_content}\n\n{end_tag}"
+
+        # Replace the old section; a callable replacement avoids having
+        # backslashes in the AI output interpreted as group references
+        new_md_content = pattern.sub(lambda _: updated_section, content)
+
+        # Write the updated content
+        with open(md_path, 'w', encoding='utf-8') as f:
+            f.write(new_md_content)
+
+        # Update the database record
+        # db_store.update_pdf_record(
+        #     markdown_text=new_md_content,
+        #     image_paths=img_path,
+        #     by_image_path=img_path
+        # )
+