processor.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507
  1. import json,os,time
  2. import re
  3. from typing import List
  4. from PyPDF2 import PdfReader, PdfWriter
  5. from tools.pdf_split.model import SplitModel,PageConfig
  6. from PIL import Image
  7. import io
  8. import pymupdf
  9. import tools.utils as utils
  10. from tools.utils.file_helper import encode_image
  11. from tools.utils.ai_helper import AiHelper
  12. from tools.pdf_split.mysql_store import MysqlStore
  13. class PDFProcessor:
  14. """PDF处理器类,负责执行PDF文件的拆分操作"""
  15. def __init__(self):
  16. pass
  17. s="""
  18. 按照我提供的目录整理信息,页数要准确,要求:
  19. 1. 每章的标题精确到每节的一、二..的页数。 例如 第一章 第一节 一、xxxx 二、xxxx 。
  20. 2. 返回的结构体:```typescript
  21. type PageConfig = {
  22. start_page: number; // 起始页码
  23. end_page: number; // 结束页码
  24. output_name: string; // 输出文件名称
  25. };
  26. type SplitModel = {
  27. output_dir: string; // 输出目录
  28. page_configs: PageConfig[]; // 页面配置数组
  29. };
  30. ```
  31. 3. 输出文件名 格式 章@节@小节.pdf 例如 第一章 第一节 一、xxxx 的格式为 01xxxx@01xxxx@01xxxx.pdf (xxxx为具体标题内容)
  32. 4. 输出目录路径为 章节/ 例如 第一章 就是 01xxxx/
  33. 5. 目录一定要完整,不能有遗漏,不能有多余的目录
  34. 6. 帮我整理1,2,3,4,5,6,7,8,9,10章的目录信息,并返回SplitModel的json数组,一个章节一个SplitModel
  35. 7. end_page不能与下一个start_page相同
  36. """
  37. _generated_pdfs = [] # 类变量,用于存储生成的PDF文件路径
  38. @staticmethod
  39. def split(filename:str)-> None:
  40. """将PDF文件按指定配置拆分成多个PDF文件"""
  41. version = "v1"
  42. base_json_dir = "./tools/pdf_json/"
  43. base_input_dir = "./temp_files/pdf/source/"
  44. base_output_dir = "./temp_files/pdf/output/"
  45. json_file = f"{base_json_dir}/{version}/{filename}.json"
  46. input_file = f"{base_input_dir}/{version}/{filename}.pdf"
  47. output_dir = f"{base_output_dir}/{version}/{filename}/"
  48. # 清空生成的PDF文件列表
  49. PDFProcessor._generated_pdfs = []
  50. try:
  51. # 读取并解析JSON文件
  52. with open(json_file, 'r', encoding='utf-8') as f:
  53. json_data = json.load(f)
  54. # 将JSON数据转换为SplitModel对象列表
  55. split_models = []
  56. for item in json_data:
  57. page_configs = [PageConfig(**page) for page in item['page_configs']]
  58. split_model = SplitModel(output_dir=item['output_dir'], page_configs=page_configs)
  59. split_models.append(split_model)
  60. # 调用batch_split_pdf进行处理
  61. PDFProcessor.batch_split_pdf(input_file, output_dir, split_models)
  62. # 所有PDF文件拆分完成后,执行图片转换
  63. PDFProcessor.convert_pdf_images(PDFProcessor._generated_pdfs)
  64. print("PDF文件拆分成功!")
  65. except FileNotFoundError:
  66. print(f"错误: 找不到JSON文件 {json_file}")
  67. return
  68. except json.JSONDecodeError as e:
  69. print(f"错误: JSON文件格式无效 {str(e)}")
  70. return
  71. except Exception as e:
  72. print(f"处理过程中发生错误: {str(e)}")
  73. return
  74. @staticmethod
  75. def batch_split_pdf(input_file: str, base_output_dir: str, split_models: List[SplitModel]) -> None:
  76. """批量处理多个PDF拆分任务
  77. Args:
  78. input_file: 输入PDF文件路径
  79. base_output_dir: 基础输出目录路径
  80. split_models: SplitModel配置对象数组
  81. """
  82. try:
  83. for split_model in split_models:
  84. try:
  85. PDFProcessor.split_pdf(input_file, base_output_dir, split_model)
  86. except Exception as e:
  87. print(f"处理拆分任务时发生错误: {str(e)}")
  88. continue
  89. except Exception as e:
  90. print(f"批量处理PDF文件时发生错误: {str(e)}")
  91. return
  92. @staticmethod
  93. def split_pdf(input_file: str, base_output_dir: str, split_model: SplitModel) -> None:
  94. """将PDF文件按指定配置拆分成多个PDF文件,并为每个拆分的PDF文件执行图片转换
  95. Args:
  96. input_file: 输入PDF文件路径
  97. base_output_dir: 基础输出目录路径
  98. split_model: SplitModel配置对象
  99. """
  100. try:
  101. # 确保输出目录存在
  102. output_dir = os.path.join(f"{base_output_dir}pdf/", split_model.output_dir)
  103. os.makedirs(output_dir, exist_ok=True)
  104. # 读取PDF文件
  105. reader = PdfReader(input_file)
  106. total_pages = len(reader.pages)
  107. # 处理每个页面配置
  108. for page_config in split_model.page_configs:
  109. try:
  110. # 验证页码范围
  111. if page_config.start_page < 1 or page_config.end_page > total_pages or page_config.start_page > page_config.end_page:
  112. print(f"警告: 页码范围 {page_config.start_page}-{page_config.end_page} 无效,已跳过")
  113. continue
  114. # 创建新的PDF文件
  115. writer = PdfWriter()
  116. # 添加指定范围的页面
  117. for page_num in range(page_config.start_page - 1, page_config.end_page):
  118. writer.add_page(reader.pages[page_num])
  119. # 生成输出文件名
  120. output_name = page_config.output_name
  121. if not page_config.output_name.endswith(".pdf"):
  122. output_name = f"{output_name}.pdf"
  123. output_file = os.path.join(output_dir, output_name)
  124. # 保存拆分后的PDF文件
  125. with open(output_file, 'wb') as output:
  126. writer.write(output)
  127. print(f"成功创建文件: {output_file}")
  128. PDFProcessor._generated_pdfs.append(output_file)
  129. except Exception as e:
  130. print(f"处理页面配置时发生错误: {str(e)}")
  131. continue
  132. except Exception as e:
  133. print(f"处理PDF文件时发生错误: {str(e)}")
  134. return
  135. @staticmethod
  136. def convert_pdf_images(generated_pdfs: List[str]) -> None:
  137. """处理PDF文件的图片转换
  138. Args:
  139. generated_pdfs: 生成的PDF文件路径列表
  140. """
  141. print("开始处理图片转换...")
  142. for pdf_file in generated_pdfs:
  143. try:
  144. result = PDFProcessor.extract_and_merge_images(pdf_file)
  145. if not result:
  146. print(f"图片转换失败: {pdf_file}")
  147. except Exception as e:
  148. print(f"图片转换过程中发生错误: {str(e)}")
  149. continue
  150. @staticmethod
  151. def extract_and_merge_images(input_file: str, output_file: str = None) -> str:
  152. try:
  153. pdf_document = pymupdf.open(input_file)
  154. # 根据输入路径生成图片目录
  155. output_name = os.path.splitext(os.path.basename(input_file))[0]
  156. parts = input_file.rsplit('/pdf/', 1)
  157. output_file = '/pdf/'.join(parts[:-1]) + '/img/' + parts[-1]
  158. output_dir = os.path.splitext(output_file)[0]
  159. #output_dir = output_file + f'/{output_name}/'
  160. os.makedirs(output_dir, exist_ok=True)
  161. # 遍历每一页提取图片
  162. for page_num in range(pdf_document.page_count):
  163. page = pdf_document[page_num]
  164. # 获取页面上的所有图片,包括内嵌图片
  165. # pix = page.get_pixmap(matrix=pymupdf.Matrix(2, 2)) # 实际使用的缩放参数
  166. # img_data = pix.tobytes("png")
  167. # 初始化压缩参数
  168. scale = 1.0
  169. img_data = None
  170. max_size = 200 * 1024
  171. # 循环调整缩放直到符合大小要求
  172. while scale >= 0.5: # 最小缩放比例50%
  173. # 生成临时图片数据
  174. temp_pix = page.get_pixmap(matrix=pymupdf.Matrix(1.5 * scale, 1.5 * scale))
  175. img_data = temp_pix.tobytes("png")
  176. if len(img_data) <= max_size: # 100KB限制
  177. break
  178. scale *= 0.9 # 每次缩小10%
  179. # 生成序列文件名
  180. img_path = os.path.join(output_dir, f"{page_num + 1:02d}.png")
  181. # 保存单页图片
  182. with open(img_path, 'wb') as f:
  183. f.write(img_data)
  184. print(f"成功保存图片({len(img_data) // 1024}KB): {img_path}")
  185. return output_dir
  186. except Exception as e:
  187. print(f"处理图片时发生错误: {str(e)}")
  188. return ''
  189. # @staticmethod
  190. # def extract_and_merge_images(input_file: str, output_file: str = None) -> str:
  191. # try:
  192. # pdf_document = pymupdf.open(input_file)
  193. # images = []
  194. # total_height = 0
  195. # max_width = 0
  196. #
  197. # # 遍历每一页提取图片
  198. # for page_num in range(pdf_document.page_count):
  199. # page = pdf_document[page_num]
  200. #
  201. # # 获取页面上的所有图片,包括内嵌图片
  202. # pix = page.get_pixmap(matrix=pymupdf.Matrix(2, 2)) # 使用2倍缩放以获得更好的质量
  203. # img_data = pix.tobytes("png")
  204. #
  205. # # 将图片字节转换为PIL Image对象
  206. # image = Image.open(io.BytesIO(img_data))
  207. # if image.mode != 'RGB':
  208. # image = image.convert('RGB')
  209. #
  210. # images.append(image)
  211. # total_height += image.height
  212. # max_width = max(max_width, image.width)
  213. #
  214. # # 如果没有找到图片
  215. # if not images:
  216. # print("未在PDF中找到任何图片")
  217. # return ''
  218. #
  219. # # 创建新的图片用于拼接
  220. # merged_image = Image.new('RGB', (max_width, total_height))
  221. # y_offset = 0
  222. #
  223. # # 将所有图片垂直拼接
  224. # for img in images:
  225. # x_offset = (max_width - img.width) // 2
  226. # merged_image.paste(img, (x_offset, y_offset))
  227. # y_offset += img.height
  228. #
  229. # # 设置输出路径
  230. # if output_file is None:
  231. # parts = input_file.rsplit('/pdf/', 1)
  232. # output_file = '/pdf/'.join(parts[:-1]) + '/img/' + parts[-1]
  233. # output_file = os.path.splitext(output_file)[0] + "_merged.png"
  234. # os.makedirs(os.path.dirname(output_file), exist_ok=True)
  235. #
  236. # # 根据图片数量计算目标大小
  237. # target_size_per_image = 200 * 1024 # 每张图片100KB
  238. # max_size = target_size_per_image * len(images)
  239. # scale = 1.0
  240. # quality = 95
  241. #
  242. # while True:
  243. # temp_buffer = io.BytesIO()
  244. # if scale < 1.0:
  245. # new_size = (int(merged_image.width * scale), int(merged_image.height * scale))
  246. # resized_image = merged_image.resize(new_size, Image.Resampling.LANCZOS)
  247. # resized_image.save(temp_buffer, 'PNG', optimize=True, quality=quality)
  248. # else:
  249. # merged_image.save(temp_buffer, 'PNG', optimize=True, quality=quality)
  250. #
  251. # size = temp_buffer.tell()
  252. #
  253. # if size <= max_size:
  254. # with open(output_file, 'wb') as f:
  255. # f.write(temp_buffer.getvalue())
  256. # print(f"成功保存图片:[{(size // 1024)} KB] {output_file}")
  257. # break
  258. #
  259. # if scale > 0.5:
  260. # scale *= 0.9
  261. # else:
  262. # # 如果达到最小缩放比例,直接保存当前结果
  263. # with open(output_file, 'wb') as f:
  264. # f.write(temp_buffer.getvalue())
  265. # print(f"成功保存图片:[{(size // 1024)} KB] {output_file}")
  266. # break
  267. #
  268. # return output_file
  269. #
  270. # except Exception as e:
  271. # print(f"处理图片时发生错误: {str(e)}")
  272. # return ''
  273. @staticmethod
  274. def process_image_to_txt(filename: str):
  275. """将目录下的多张图片合并生成一个Markdown文件"""
  276. version = "v1"
  277. base_output_dir = "./temp_files/pdf/output/"
  278. output_dir = f"{base_output_dir}/{version}/{filename}/"
  279. image_dir = f"{output_dir}/img/"
  280. txt_dir = f"{output_dir}/txt/"
  281. db_store = MysqlStore()
  282. ai_helper = AiHelper()
  283. # 创建标准记录(如果不存在)
  284. if not db_store.get_standard_by_name(filename):
  285. db_store.create_standard(code=filename, name=filename)
  286. try:
  287. # 遍历图片目录中的每个子目录(新增目录处理逻辑)
  288. for dir_path, dir_names, file_names in os.walk(image_dir):
  289. # 跳过根目录
  290. if dir_path == image_dir:
  291. continue
  292. # 解析目录结构(新增章节解析)
  293. dir_rel_path = os.path.relpath(dir_path, image_dir)
  294. chapter_parts = dir_rel_path.split('@')
  295. if len(chapter_parts) < 3:
  296. continue # 跳过不符合命名规范的目录
  297. # 生成对应的txt目录
  298. #txt_subdir = os.path.join(txt_dir, dir_rel_path)
  299. #os.makedirs(txt_subdir, exist_ok=True)
  300. # 收集当前目录的所有图片(新增图片收集逻辑)
  301. image_files = sorted(
  302. [f for f in file_names if f.lower().endswith(('.png', '.jpg', '.jpeg'))],
  303. key=lambda x: int(x.split('.')[0])
  304. )
  305. if not image_files:
  306. continue
  307. # 创建合并的markdown文件(修改文件生成逻辑)
  308. md_filename = f"{dir_rel_path.replace('@', '_')}.md"
  309. md_path = os.path.join(txt_dir, md_filename)
  310. os.makedirs(os.path.dirname(md_path), exist_ok=True)
  311. md_content = f"# {filename}\n## {'/'.join(chapter_parts)}\n\n"
  312. # 处理目录下所有图片(新增合并循环)
  313. all_images = []
  314. for img_file in image_files:
  315. img_path = os.path.join(dir_path, img_file)
  316. img_name = os.path.basename(img_path)
  317. try:
  318. # 调用AI分析图片
  319. page_content = ai_helper.analyze_image_with_ai(img_path)
  320. # 添加5秒延时以控制API请求频率
  321. time.sleep(5)
  322. # 生成图片相对路径
  323. #rel_path = os.path.relpath(img_path, txt_subdir)
  324. utils.get_logger().info(f"处理图片 {img_path} 成功")
  325. md_content += f"########### {img_path} ####################\n"
  326. md_content += f"--start{img_name}--\n\n"
  327. md_content += f"\n\n{page_content}\n\n"
  328. md_content += f"--end{img_name}--\n\n"
  329. all_images.append(img_path)
  330. except Exception as e:
  331. print(f"处理图片 {img_file} 失败: {str(e)}")
  332. continue
  333. # 保存合并的文档(修改保存逻辑)
  334. with open(md_path, 'w', encoding='utf-8') as f:
  335. f.write(md_content)
  336. # 插入数据库记录(新增批量记录)
  337. db_store.add_pdf_record(
  338. standard_name=filename,
  339. pdf_path=os.path.abspath(all_images[0].replace("/img/", "/pdf/").rsplit('.', 1)[0] + '.pdf'),
  340. image_path='\n'.join([os.path.abspath(p) for p in all_images]),
  341. markdown_text=md_content,
  342. chapter=chapter_parts[0],
  343. section=chapter_parts[1],
  344. subsection=chapter_parts[2]
  345. )
  346. print(f"成功生成合并文档: {md_path}")
  347. except Exception as e:
  348. print(f"处理过程中发生错误: {str(e)}")
  349. @staticmethod
  350. def regenerate_markdown(img_path: str):
  351. """重新生成指定图片或目录的Markdown内容"""
  352. processor = PDFProcessor()
  353. db_store = MysqlStore()
  354. ai_helper = AiHelper()
  355. if os.path.isdir(img_path):
  356. # 处理整个目录
  357. dir_path = img_path
  358. # 通过图片路径反向推导标准名称和目录结构
  359. parts = dir_path.split('/img/')
  360. if len(parts) < 2:
  361. print("无效的目录路径")
  362. return
  363. # 获取原Markdown文件路径
  364. txt_root = dir_path.replace('/img/', '/txt/')
  365. md_files = [f for f in os.listdir(txt_root) if f.endswith('.md')]
  366. if not md_files:
  367. print("找不到对应的Markdown文件")
  368. return
  369. md_path = os.path.join(txt_root, md_files[0])
  370. # 重新生成整个目录内容
  371. processor._process_directory(dir_path, md_path, db_store, ai_helper)
  372. elif os.path.isfile(img_path):
  373. # 处理单个图片
  374. img_file = os.path.basename(img_path)
  375. dir_path = os.path.dirname(img_path)
  376. # 查找对应的Markdown文件
  377. txt_dir = dir_path.replace('/img/', '/txt/')
  378. md_files = [f for f in os.listdir(txt_dir) if f.endswith('.md')]
  379. if not md_files:
  380. print("找不到对应的Markdown文件")
  381. return
  382. md_path = os.path.join(txt_dir, md_files[0])
  383. # 更新单个图片内容
  384. processor._update_single_image(img_path, md_path, db_store, ai_helper)
  385. def _process_directory(self, dir_path: str, md_path: str, db_store, ai_helper):
  386. """处理整个目录重新生成"""
  387. # 收集目录下所有图片
  388. image_files = sorted(
  389. [f for f in os.listdir(dir_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))],
  390. key=lambda x: int(x.split('.')[0])
  391. )
  392. # 重新生成Markdown内容
  393. new_content = f"# {os.path.basename(os.path.dirname(md_path))}\n"
  394. all_images = []
  395. for img_file in image_files:
  396. img_path = os.path.join(dir_path, img_file)
  397. page_content = ai_helper.analyze_image_with_ai(img_path)
  398. img_name = os.path.splitext(img_file)[0]
  399. new_content += f"########### {img_path} ####################\n"
  400. new_content += f"--start{img_name}--\n\n"
  401. new_content += f"\n\n{page_content}\n\n"
  402. new_content += f"--end{img_name}--\n\n"
  403. all_images.append(img_path)
  404. # 写入更新后的内容
  405. with open(md_path, 'w', encoding='utf-8') as f:
  406. f.write(new_content)
  407. # 更新数据库记录
  408. db_store.update_pdf_record(
  409. markdown_text=new_content,
  410. image_paths=','.join(all_images),
  411. by_image_path=all_images[0]
  412. )
  413. def _update_single_image(self, img_path: str, md_path: str, db_store, ai_helper):
  414. """更新单个图片内容"""
  415. img_name = os.path.splitext(os.path.basename(img_path))[0]
  416. # 读取原有内容
  417. with open(md_path, 'r', encoding='utf-8') as f:
  418. content = f.read()
  419. # 生成新内容
  420. start_tag = f"--start{img_name}--"
  421. end_tag = f"--end{img_name}--"
  422. pattern = re.compile(f'{re.escape(start_tag)}(.*?){re.escape(end_tag)}', re.DOTALL)
  423. # 调用AI重新分析
  424. new_content = ai_helper.analyze_image_with_ai(img_path)
  425. updated_section = f"{start_tag}\n\n{new_content}\n\n{end_tag}"
  426. # 替换内容
  427. new_md_content = re.sub(pattern, updated_section, content)
  428. # 写入更新后的内容
  429. with open(md_path, 'w', encoding='utf-8') as f:
  430. f.write(new_md_content)
  431. # 更新数据库记录
  432. # db_store.update_pdf_record(
  433. # markdown_text=new_md_content,
  434. # image_paths=img_path,
  435. # by_image_path=img_path
  436. # )