| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138 |
- import csv, json, tools.utils as utils, os
- from tools.stores.mysql_store import MysqlStore
- from tools.models.standard_model import StandardModel
- class ImageExtractor:
- def __init__(self):
- self._logger = utils.get_logger()
- self._db_store = MysqlStore()
- self._base_path = "./temp_files/images/output"
- self._complete_path=""
- self._ai = utils.AiHelper()
- self._err_files=[]
- self._file_name = ""
- self._sys_prompt = "请提取图片中的表格,用json格式输出。"
- self._user_prompt = """提取表格信息,要求:
- 1. 提取结构化信息:```typescript
- type item {
- a: string; //书号
- b: string; //定额编号
- c:string; //定额名称
- d: string; //工作内容
- e: string; //单位
- f: string; //基本定额
- g: float; //基价(元)
- h: float; //单重(t)
- i: float; //工费
- j: float; //料费
- k: float; //机费
- l: string; //主材
- }
- ```
- 2. 提取的文字中间的空格需要保留,数据没有就留空
- 3. 确保符号提取准确,例如 kg,m²,m³,直径符号∅等
- 4. 返回压缩成一行的item数组的json字符串
- """
- def extract(self,file_name: str):
- self._file_name = file_name
- self._err_files =[]
- path = f"{self._base_path}/img/{self._file_name}/"
- self._complete_path = f"{self._base_path}/img_complete/{self._file_name}/"
- os.makedirs(self._complete_path , exist_ok=True)
- try:
- self._logger.info(f"开始处理目录: {path}")
- # 确保目录存在
- if not os.path.exists(path):
- self._logger.error(f"目录不存在: {path}")
- return
- # 遍历目录下的所有文件
- for root, dirs, files in os.walk(path):
- for file in files:
- # 检查是否为图片文件
- if file.lower().endswith(('.png', '.jpg', '.jpeg')):
- image_path = os.path.join(root, file)
- self.extract_image(image_path)
- self._logger.info(f"目录处理完成: {path}")
- if len(self._err_files)>0:
- self._logger.error(f"----【处理图片失败】-----: {self._err_files}")
- except Exception as e:
- self._logger.error(f"处理目录失败 {path}: {e}")
- def extract_image(self, image_path: str) -> None:
- try:
- self._logger.info(f"开始处理图片: {image_path}")
- # content = self._ai.call_openai_with_image(image_path,self._sys_prompt,self._user_prompt,api_model="qwen2.5-vl-72b-instruct")
- api_key= utils.get_config_value("fastgpt.api_key")
- content = self._ai.call_fastgpt_ai_with_image(image_path,self._user_prompt,api_key)
- self.save_to_db(content)
- # 保存成功后移动文件到已处理目录
- os.rename(image_path, os.path.join(self._complete_path,os.path.basename(image_path)))
- self._logger.info(f"图片处理完成: {image_path}")
- except Exception as e:
- self._err_files.append(image_path)
- self._logger.error(f"处理图片失败 {image_path}: {e}")
- def save_to_db(self, data_list: str|list) -> None:
- try:
- self._logger.info(f"开始保存图片内到数据库:{data_list}")
- if isinstance(data_list,str):
- data_list = json.loads(data_list)
- for item in data_list:
- try :
- standard = StandardModel(
- book_number=item['a'],
- quota_number=item['b'],
- quota_name=item['c'],
- work_content=item['d'],
- unit=item['e'],
- basic_quota=item['f'],
- base_price=item['g'],
- unit_weight=item['h'],
- labor_cost=item['i'],
- material_cost=item['j'],
- machine_cost=item['k'],
- main_material=item['l']
- )
- if not self._db_store.insert_standard(standard):
- self._logger.error(f"保存数据到数据库失败: {item}")
- except Exception as e:
- self._logger.error(f"保存图片内容失败: {e}")
- continue
- except Exception as e:
- self._logger.error(f"保存图片内容失败: {e}")
- def export(self):
- try:
- self._logger.info(f"开始导出数据库数据")
- data = self._db_store.query_standard_group_by_book()
- for k, v in data.items():
- # 数据保存为 csv
- csv_file = f"{self._base_path}/csv/{k}.csv"
- # 确保目录存在
- os.makedirs(os.path.dirname(csv_file), exist_ok=True)
- with open(csv_file, 'w', newline='', encoding='utf-8-sig') as f:
- writer = csv.writer(f)
- writer.writerow(['书号', '定额编号', '定额名称', '工作内容', '单位', '基本定额', '基价(元)', '单重(t)', '工费', '料费', '机费', '主材'])
- for item in v:
- # 将 StandardModel 对象的属性提取出来,构造成一个列表
- row = [
- item.book_number,
- item.quota_number,
- item.quota_name,
- item.work_content,
- item.unit,
- item.basic_quota,
- item.base_price,
- item.unit_weight,
- item.labor_cost,
- item.material_cost,
- item.machine_cost,
- item.main_material
- ]
- writer.writerow(row)
- self._logger.info(f"成功导出数据库数据")
- return data
- except Exception as e:
- self._logger.error(f"导出数据库数据失败: {e}")
|