Bläddra i källkod

Update 添加图片分割解析提取表格

YueYunyun 4 månader sedan
förälder
incheckning
a25ea1f911

+ 25 - 0
SourceCode/DataMiddleware/.script/cmd/app.run.xml

@@ -0,0 +1,25 @@
+<component name="ProjectRunConfigurationManager">
+  <configuration default="false" name="app" type="PythonConfigurationType" factoryName="Python">
+    <module name="DataMiddleware" />
+    <option name="ENV_FILES" value="" />
+    <option name="INTERPRETER_OPTIONS" value="" />
+    <option name="PARENT_ENVS" value="true" />
+    <envs>
+      <env name="PYTHONUNBUFFERED" value="1" />
+    </envs>
+    <option name="SDK_HOME" value="" />
+    <option name="WORKING_DIRECTORY" value="D:\01Work\1012_DataMiddleware\SourceCode\DataMiddleware" />
+    <option name="IS_MODULE_SDK" value="true" />
+    <option name="ADD_CONTENT_ROOTS" value="true" />
+    <option name="ADD_SOURCE_ROOTS" value="false" />
+    <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+    <option name="SCRIPT_NAME" value="$PROJECT_DIR$/app/main.py" />
+    <option name="PARAMETERS" value="" />
+    <option name="SHOW_COMMAND_LINE" value="false" />
+    <option name="EMULATE_TERMINAL" value="false" />
+    <option name="MODULE_MODE" value="false" />
+    <option name="REDIRECT_INPUT" value="false" />
+    <option name="INPUT_FILE" value="" />
+    <method v="2" />
+  </configuration>
+</component>

+ 25 - 0
SourceCode/DataMiddleware/.script/cmd/img_spilt.run.xml

@@ -0,0 +1,25 @@
+<component name="ProjectRunConfigurationManager">
+  <configuration default="false" name="img_spilt" type="PythonConfigurationType" factoryName="Python">
+    <module name="DataMiddleware" />
+    <option name="ENV_FILES" value="" />
+    <option name="INTERPRETER_OPTIONS" value="" />
+    <option name="PARENT_ENVS" value="true" />
+    <envs>
+      <env name="PYTHONUNBUFFERED" value="1" />
+    </envs>
+    <option name="SDK_HOME" value="" />
+    <option name="WORKING_DIRECTORY" value="D:\01Work\1012_DataMiddleware\SourceCode\DataMiddleware" />
+    <option name="IS_MODULE_SDK" value="true" />
+    <option name="ADD_CONTENT_ROOTS" value="true" />
+    <option name="ADD_SOURCE_ROOTS" value="true" />
+    <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+    <option name="SCRIPT_NAME" value="$PROJECT_DIR$/tools/img_spilt/run.py" />
+    <option name="PARAMETERS" value="" />
+    <option name="SHOW_COMMAND_LINE" value="false" />
+    <option name="EMULATE_TERMINAL" value="false" />
+    <option name="MODULE_MODE" value="false" />
+    <option name="REDIRECT_INPUT" value="false" />
+    <option name="INPUT_FILE" value="" />
+    <method v="2" />
+  </configuration>
+</component>

+ 25 - 0
SourceCode/DataMiddleware/.script/cmd/pdf_split.run.xml

@@ -0,0 +1,25 @@
+<component name="ProjectRunConfigurationManager">
+  <configuration default="false" name="pdf_split" type="PythonConfigurationType" factoryName="Python">
+    <module name="DataMiddleware" />
+    <option name="ENV_FILES" value="" />
+    <option name="INTERPRETER_OPTIONS" value="" />
+    <option name="PARENT_ENVS" value="true" />
+    <envs>
+      <env name="PYTHONUNBUFFERED" value="1" />
+    </envs>
+    <option name="SDK_HOME" value="" />
+    <option name="WORKING_DIRECTORY" value="D:\01Work\1012_DataMiddleware\SourceCode\DataMiddleware" />
+    <option name="IS_MODULE_SDK" value="true" />
+    <option name="ADD_CONTENT_ROOTS" value="true" />
+    <option name="ADD_SOURCE_ROOTS" value="false" />
+    <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+    <option name="SCRIPT_NAME" value="$PROJECT_DIR$/tools/pdf_split/main.py" />
+    <option name="PARAMETERS" value="" />
+    <option name="SHOW_COMMAND_LINE" value="false" />
+    <option name="EMULATE_TERMINAL" value="false" />
+    <option name="MODULE_MODE" value="false" />
+    <option name="REDIRECT_INPUT" value="false" />
+    <option name="INPUT_FILE" value="" />
+    <method v="2" />
+  </configuration>
+</component>

+ 126 - 28
SourceCode/DataMiddleware/app/ai/fast_gpt.py

@@ -1,6 +1,5 @@
-from httpx import stream
 
-import utils,json,requests
+import utils,json,requests, os
 import re  # 添加正则表达式库
 
 class FastGPTAi:
@@ -11,19 +10,19 @@ class FastGPTAi:
         self._logger = utils.get_logger()
         self._headers ={}
 
-    def call_ai(self, msg: str) -> json:
+    def call_ai(self, msg: str, api_key: str=None) -> json:
         try:
-            if self._api_key is None:
+            api_key = api_key if api_key else self._api_key
+            if api_key is None:
                 self._logger.error("fastgpt.api_key 没有配置")
                 raise Exception("fastgpt.api_key 没有配置")
 
-            self._headers = {
-                "Authorization": f"Bearer {self._api_key}",
+            headers = {
+                "Authorization": f"Bearer {api_key}",
                 "Content-Type": "application/json"
             }
             url = f"{self._api_url}/v1/chat/completions"
             data = {
-                # "model":"",
                 "stream": False,
                 "detail": False,
                 "messages": [
@@ -37,27 +36,126 @@ class FastGPTAi:
                 }
             }
 
-            response = requests.post(url, headers=self._headers, json=data)
-            if response.status_code == 200:
-                result = response.json()
-                self._logger.info(f"Response: {result}")
-                content = result.get("choices", [{}])[0].get("message", {}).get("content", "")
-                if content:
-                    # 使用正则表达式去除 content 前后的特定字符串
-                    content = re.sub(r'^```json\n', '', content.strip())
-                    content = re.sub(r'\n```$', '', content.strip())
-                    try:
-                        data = json.loads(content)
-                        self._logger.info(f"Response_JSON: {data}")
-                        return data
-                    except json.JSONDecodeError as e:
-                        self._logger.error(f"Failed to decode JSON: {e}")
-                        return None
-                return None
-            else:
-                error_msg = f"Error: {response.status_code} - {response.text}"
-                self._logger.error(error_msg)
-                return None
+            return self.process_response(data, headers, url)
+        except Exception as e:
+            self._logger.error(f"Error: {str(e)}")
+            return None
+
+    def call_ai_with_image(self,image_path, msg: str,api_key: str=None) -> json:
+        """Send a text prompt together with one image to the FastGPT chat endpoint.
+
+        Args:
+            image_path: Path of the image on local disk; a missing file is an error.
+            msg: Text prompt placed in the same user message as the image.
+            api_key: Optional key for this call; ``self._api_key`` is used when falsy.
+
+        Returns:
+            Whatever ``process_response`` returns, or ``None`` when any step raises
+            (the exception is logged here and not propagated).
+        """
+        self._logger.info("调用 fastgpt 的AI_Image接口")
+        try:
+            if not os.path.exists(image_path):
+                utils.get_logger().error(f"图片文件不存在: {image_path}")
+                raise Exception(f"图片文件不存在: {image_path}")
+            # A per-call key overrides the instance-level key.
+            api_key = api_key if api_key else  self._api_key
+            if api_key is None:
+                self._logger.error("fastgpt.api_key 没有配置")
+                raise Exception("fastgpt.api_key 没有配置")
+
+            headers = {
+                "Authorization": f"Bearer {api_key}",
+                "Content-Type": "application/json"
+            }
+            url = f"{self._api_url}/v1/chat/completions"
+            # The image is sent inline as the encoded string produced by utils.encode_file.
+            base64_str = utils.encode_file(image_path)
+            data = {
+                # "model":"",
+                "stream": False,
+                "detail": False,
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": msg
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": base64_str
+                                }
+                            }
+                        ]
+                    }
+                ],
+                "response_format": {
+                    "type": "json_object"
+                }
+            }
+
+            return self.process_response(data, headers, url)
         except Exception as e:
             self._logger.error(f"Error: {str(e)}")
             return None
+
+    def call_ai_with_file(self,file_path, msg: str,api_key: str=None) -> json:
+        """Send a text prompt together with one file attachment to the FastGPT chat endpoint.
+
+        Args:
+            file_path: Path of the file on local disk; a missing file is an error.
+            msg: Text prompt placed in the same user message as the file.
+            api_key: Optional key for this call; ``self._api_key`` is used when falsy.
+
+        Returns:
+            The parsed result from ``process_response`` when it is truthy, otherwise
+            ``None``. Any exception is logged and also results in ``None``.
+        """
+        self._logger.info("调用 fastgpt 的AI_File接口")
+        try:
+            if not os.path.exists(file_path):
+                utils.get_logger().error(f"文件不存在: {file_path}")
+                raise Exception(f"文件不存在: {file_path}")
+
+            # A per-call key overrides the instance-level key.
+            effective_key = api_key or self._api_key
+            if effective_key is None:
+                self._logger.error("fastgpt.api_key 没有配置")
+                raise Exception("fastgpt.api_key 没有配置")
+
+            request_headers = {
+                "Authorization": f"Bearer {effective_key}",
+                "Content-Type": "application/json"
+            }
+            endpoint = f"{self._api_url}/v1/chat/completions"
+
+            # The file travels inline: its base name plus the encoded content.
+            attachment = {
+                "type": "file_url",
+                "name": os.path.basename(file_path),
+                "url": utils.encode_file(file_path)
+            }
+            prompt_part = {"type": "text", "text": msg}
+            payload = {
+                "stream": False,
+                "detail": False,
+                "messages": [
+                    {"role": "user", "content": [prompt_part, attachment]}
+                ]
+            }
+
+            # Falsy results (None, empty list/dict) are reported as None.
+            return self.process_response(payload, request_headers, endpoint) or None
+        except Exception as exc:
+            self._logger.error(f"Error: {str(exc)}")
+            return None
+
+    def process_response(self, data, headers, url, timeout=None)->json:
+        """POST a chat-completions request and return the JSON found in the reply.
+
+        Args:
+            data: Request body, sent as JSON.
+            headers: HTTP headers for the request.
+            url: Endpoint to post to.
+            timeout: Optional ``requests`` timeout in seconds; ``None`` (the default)
+                keeps the previous behaviour of waiting indefinitely.
+
+        Returns:
+            The object decoded from the first choice's message content, or ``None``
+            when the status is not 200, the reply carries no content, or the content
+            is not valid JSON.
+        """
+        response = requests.post(url, headers=headers, json=data, timeout=timeout)
+        if response.status_code != 200:
+            error_msg = f"Error: {response.status_code} - {response.text}"
+            self._logger.error(error_msg)
+            return None
+
+        result = response.json()
+        self._logger.info(f"Response: {result}")
+        # Tolerate a missing/empty "choices" list and a null "message" instead of
+        # failing with IndexError/AttributeError.
+        choices = result.get("choices") or [{}]
+        content = (choices[0].get("message") or {}).get("content", "")
+        if not content:
+            return None
+
+        # Strip an optional Markdown code fence (with or without the "json" tag)
+        # wrapped around the payload.
+        content = re.sub(r'^```(?:json)?\s*', '', content.strip())
+        content = re.sub(r'\s*```$', '', content.strip())
+        try:
+            parsed = json.loads(content)
+        except json.JSONDecodeError as e:
+            self._logger.error(f"Failed to decode JSON: {e}")
+            return None
+        self._logger.info(f"Response_JSON: {parsed}")
+        return parsed

+ 4 - 1
SourceCode/DataMiddleware/app/config.yml

@@ -25,6 +25,9 @@ anythingLLm:
   workspace: datamiddleware
   thread: fd218d56-8717-4519-a6e0-09c64b732091
 fastgpt:
-  api_key: fastgpt-wdJruTQNjqeH7oRdf54Ilb12pHAspEdW7Io3hcsuaifx5U1OOFzW8Qrc
+#  api_key: fastgpt-wdJruTQNjqeH7oRdf54Ilb12pHAspEdW7Io3hcsuaifx5U1OOFzW8Qrc
+  api_key_pre_process: fastgpt-o4CF7Pu1FRTvHjWFqeNcClBS6ApyflNfkBGXo9p51fuBMAX1L0erU8yz8
+  api_key_process: fastgpt-wdJruTQNjqeH7oRdf54Ilb12pHAspEdW7Io3hcsuaifx5U1OOFzW8Qrc
+  api_key: fastgpt-o4CF7Pu1FRTvHjWFqeNcClBS6ApyflNfkBGXo9p51fuBMAX1L0erU8yz8
   api_url: http://192.168.0.104:8020/api
   app_id: 67c6be7d686fc1d3f0cc1cce

+ 70 - 12
SourceCode/DataMiddleware/app/data_process/pre_process.py

@@ -1,15 +1,17 @@
-import utils, pandas as pd
+import utils, pandas as pd,os
 from pathlib import Path
 
 from models.project_data import ProjectModel, ProjectItemModel
 from stores.mysql_store import MysqlStore
+from ai.fast_gpt import FastGPTAi
 
 class PreProcess:
 
     def __init__(self):
         self._logger= utils.get_logger()
         self._store= MysqlStore()
-        self._ai_helper = utils.AiHelper()
+        # self._ai = utils.AiHelper()
+        self._ai = FastGPTAi()
         self._ai_sys_prompt = "从给定信息中提取结构化信息,并返回json压缩成一行的字符串,如果部分信息为空或nan,则该字段返回为空。"
         self.separate_ai_calls = False
         self._data={}
@@ -131,24 +133,28 @@ class PreProcess:
 
     def call_ai(self, project_id:int, excel_data):
         project = ProjectModel(project_id=project_id)
+        api_key = utils.get_config_value("fastgpt.api_key_pre_process")
         if self.separate_ai_calls:
             # 初始化self._data[project_no],避免在循环中重复初始化
             for data in excel_data:
                 prompt = self.prompt_template([data])
-                response = self._ai_helper.call_openai(self._ai_sys_prompt, prompt)
+                response = self._ai.call_ai(prompt,api_key)
+                project.items.extend(self.format_data(project_id, response))
                 # response = {'data': [{'n': '阻燃型导线穿管敷设', 'm': 'WDZB1-BYJ-750V-2.5mm2', 'u': '米', 'c': 900.0}, {'n': '阻燃型导线穿管敷设', 'm': 'WDZB1-BYJ-750V-4mm2', 'u': '米', 'c': 800.0}, {'n': '耐火型导线穿管敷设', 'm': 'WDZB1N-BYJ-750V-2.5mm2', 'u': '米', 'c': 200.0}, {'n': '防火堵料', 'm': '', 'u': '公斤', 'c': 10.0}, {'n': '防火漆', 'm': '10kg/', '': '桶', 'c': 2.0}, {'n': '镀锌钢管', 'm': 'SC20', 'u': '米', 'c': 580.0}, {'n': '接地线', 'm': '热浸镀锌扁钢25x4', 'u': '米', 'c': 50.0}, {'n': '局部等电位端子箱', 'm': '', 'u': '个', 'c': 2.0}, {'n': '双联单控照明开关', 'm': '~250V 10A', 'u': '个', 'c': 4.0}, {'n': '密闭双联单控照明开关', 'm': '~250V 10A', 'u': '个', 'c': 4.0}, {'n': '配合空调室外机移位', 'm': '', 'u': '项', 'c': 1.0}, {'n': '应急照明灯', 'm': '220V,10W', 'u': '套', 'c': 4.0}, {'n': '门禁', 'm': '', 'u': '套', 'c': 4.0}, {'n': '配电线路改移', 'm': '开槽、移点位等', 'u': '项', 'c': 1.0}, {'n': '烘手器插座', 'm': '220V,10A,密闭型', 'u': '个', 'c': 2.0}], 'completion_tokens': 439, 'prompt_tokens': 766, 'total_tokens': 1205}
                 # 更新数据部分
-                project.items.extend(self.format_data(project_id, response["data"]))
-                project.completion_tokens += response["completion_tokens"]
-                project.prompt_tokens += response["prompt_tokens"]
-                project.total_tokens += response["total_tokens"]
+                # project.items.extend(self.format_data(project_id, response["data"]))
+                # project.completion_tokens += response["completion_tokens"]
+                # project.prompt_tokens += response["prompt_tokens"]
+                # project.total_tokens += response["total_tokens"]
         else:
             prompt = self.prompt_template(excel_data)
-            response = self._ai_helper.call_openai(self._ai_sys_prompt, prompt)
-            project.completion_tokens = response["completion_tokens"]
-            project.prompt_tokens = response["prompt_tokens"]
-            project.total_tokens = response["total_tokens"]
-            project.items = self.format_data(project_id, response["data"])
+            response = self._ai.call_ai(prompt,api_key)
+            project.items.extend(self.format_data(project_id, response))
+
+            # project.completion_tokens = response["completion_tokens"]
+            # project.prompt_tokens = response["prompt_tokens"]
+            # project.total_tokens = response["total_tokens"]
+            # project.items = self.format_data(project_id, response["data"])
         self._data[project_id] = project
 
 
@@ -175,3 +181,55 @@ class PreProcess:
                 self._logger.error(f"格式化数据时出错: {data} {e}")
 
         return formatted_data
+
+
+    def run_1(self, project: ProjectModel) ->bool:
+        """Pre-process a project by sending its uploaded spreadsheet to the AI as a file.
+
+        The first entry of ``<file.source_path>/project/<project_no>/`` is used as the
+        upload. Status 21 is written at the start and status 31 after the extracted
+        items were stored.
+
+        Args:
+            project: Project whose uploaded file should be processed.
+
+        Returns:
+            ``True`` when items were extracted and stored, ``False`` otherwise
+            (the reason is logged).
+        """
+        self._logger.info(f"开始预处理项目:{project.project_name}[{project.project_no}_{project.standard_version}] ")
+        self._store.update_project_status(project.id, 21)
+        # Resolve the config value first: reusing double quotes inside a
+        # double-quoted f-string is a SyntaxError before Python 3.12.
+        source_root = utils.get_config_value("file.source_path", "./temp_files")
+        file_path = f"{source_root}/project/{project.project_no}/"
+        try:
+            files = os.listdir(file_path)
+        except OSError as e:
+            self._logger.error(f"项目:{project.project_no} 没有找到文件: {e}")
+            return False
+        # An empty directory used to raise IndexError before the check could run.
+        if not files:
+            self._logger.error(f"项目:{project.project_no} 没有找到文件")
+            return False
+        file = files[0]
+        if not self.check_file_type(file):
+            self._logger.error(f"项目:{project.project_no} 文件格式不正确")
+            return False
+        try:
+            prompt="""从上传的表格数据中提取信息,要求:
+            1. 识别字段类型:数值/文本/日期
+            2. 提取信息结构体:```typescript
+                type item { 
+                    n: string; //物料名称
+                    m: string; //型号规格
+                    u:string; //单位
+                    c: float; //数量,数量多列的话要求和
+                }
+            ```
+            3. 返回压缩成一行的item数组的json字符串
+            """
+            api_key= utils.get_config_value("fastgpt.api_key_pre_process")
+            data = self._ai.call_ai_with_file(os.path.join(file_path, file), prompt,api_key)
+            if isinstance(data, str):
+                import json
+                data= json.loads(data)
+            # The AI call returns None on any failure; do not format a missing result.
+            if not data:
+                self._logger.error(f"项目:{project.project_no} 文件处理失败: {data}")
+                return False
+            res_data = self.format_data(project.id, data)
+            if not res_data:
+                self._logger.error(f"项目:{project.project_no} 文件处理失败: {data}")
+                return False
+            project.items = res_data
+            self._store.re_insert_project(project)
+            self._logger.info(
+                f"结束预处理项目:{project.project_name}[{project.project_no}_{project.standard_version}] [设备条数:{len(project.items)}]")
+            self._store.update_project_status(project.id, 31)
+            return True
+        except Exception as e:
+            self._logger.error(f"项目:{project.project_no} 文件处理失败: {e}")
+            return False
+
+
+    @staticmethod
+    def check_file_type(file_name:str) ->bool:
+        """Return True when *file_name* has an Excel extension (``.xlsx`` or ``.xls``).
+
+        The comparison ignores case, and a name without any extension is rejected
+        even if the whole name happens to be ``xlsx``/``xls``.
+        """
+        extension = os.path.splitext(file_name)[1].lower()
+        return extension in ('.xlsx', '.xls')

+ 2 - 2
SourceCode/DataMiddleware/app/data_process/process.py

@@ -59,9 +59,9 @@ class Process:
 
     def call_ai_process(self, message:str) ->list[ProjectItemModel]:
         try:
-
+            api_key = utils.get_config_value("fastgpt.api_key_process")
             self._logger.info(f"开始调用AI:\n {message}")
-            json_data = self._ai.call_ai(message)
+            json_data = self._ai.call_ai(message,api_key)
             self._logger.info(f"AI返回结果:{json_data}")
             data=[]
             for item in json_data:

+ 2 - 0
SourceCode/DataMiddleware/app/utils/__init__.py

@@ -126,6 +126,8 @@ def clean_report_file(day: int):
     """
     FileHelper().clean_report_file(day)
 
+def encode_file(path: str):
+    """
+    Encode the file at *path*; thin module-level shortcut for ``FileHelper.encode_file``.
+    """
+    return FileHelper.encode_file(path)
 
 def to_array(s: str, split: str = ",") -> list[str]:
     """

+ 17 - 0
SourceCode/DataMiddleware/app/utils/file_helper.py

@@ -6,6 +6,8 @@ from urllib.parse import urlparse
 
 import pandas as pd
 import requests
+import mimetypes
+import base64
 
 
 class FileHelper:
@@ -167,3 +169,18 @@ class FileHelper:
                         continue
         except Exception as e:
             utils.get_logger().error(f"Report 文件清理失败。Exception: {e}")
+
+    @staticmethod
+    def encode_file(file_path: str):
+        """Encode a file as a base64 ``data:`` URI string.
+
+        Args:
+            file_path: Path of the file to read.
+
+        Returns:
+            ``data:<mime>;base64,<payload>``, where the MIME type is guessed from
+            the file name.
+
+        Raises:
+            FileNotFoundError: If *file_path* does not exist (also logged).
+        """
+        if not os.path.exists(file_path):
+            utils.get_logger().error(f"文件不存在: {file_path}")
+            raise FileNotFoundError(f"文件不存在: {file_path}")
+        # Guess the MIME type from the extension; unknown types fall back to JPEG.
+        guessed = mimetypes.guess_type(file_path)[0]
+        mime_type = 'image/jpeg' if guessed is None else guessed
+        # Read the raw bytes and turn them into base64 text.
+        with open(file_path, "rb") as source:
+            payload = base64.b64encode(source.read()).decode("utf-8")
+        return f"data:{mime_type};base64,{payload}"

+ 3 - 3
SourceCode/DataMiddleware/tools/config.yml

@@ -1,7 +1,7 @@
 mysql:
   host: 192.168.0.81
   port: 3307
-  db: iwb_data_tielu_pdf_dev
+  db: iwb_data_tielu_standard_dev
   user: root
   password: Iwb-2024
   charset: utf8mb4
@@ -20,5 +20,5 @@ ai:
   max_tokens: 10240
 
 fastgpt:
-  url: http://192.168.0.104:8020/api/v1/chat/completions
-  key: fastgpt-pzXtKVjkBU8NW8MUqZ7WnEfqK3m8qP6wmDdfcBgOaK2PZDekoHM1
+  api_url: http://192.168.0.104:8020/api
+  api_key: fastgpt-rSTjfs9BPv6KHUnqtz9vWHNSiPqJneOeBYDtMxqgsu6JgW2trJC7

+ 0 - 0
SourceCode/DataMiddleware/tools/image_extract/__init__.py


+ 138 - 0
SourceCode/DataMiddleware/tools/image_extract/extract.py

@@ -0,0 +1,138 @@
+import csv, json, tools.utils as utils, os
+
+from tools.stores.mysql_store import MysqlStore
+from tools.models.standard_model import StandardModel
+
+
+class ImageExtractor:
+    """Extract table rows from table images through the AI helper and store them.
+
+    Images are read from ``<base>/img/<file_name>/``. An image is moved to
+    ``<base>/img_complete/<file_name>/`` only after every row extracted from it
+    was stored; otherwise it stays in place and is listed in the error summary.
+    """
+    def __init__(self):
+        self._logger = utils.get_logger()
+        self._db_store = MysqlStore()
+        self._base_path = "./temp_files/images/output"
+        self._complete_path=""
+        self._ai = utils.AiHelper()
+        self._err_files=[]
+        self._file_name = ""
+        self._sys_prompt = "请提取图片中的表格,用json格式输出。"
+        self._user_prompt = """提取表格信息,要求:
+        1. 提取结构化信息:```typescript
+        type item { 
+        a: string; //书号
+        b: string; //定额编号
+        c:string; //定额名称
+        d: string; //工作内容
+        e: string; //单位
+        f: string; //基本定额
+        g: float; //基价(元)
+        h: float; //单重(t)
+        i: float; //工费
+        j: float; //料费
+        k: float; //机费
+        l: string; //主材
+        }
+        ```
+        2. 提取的文字中间的空格需要保留,数据没有就留空
+        3. 确保符号提取准确,例如 kg,m²,m³,直径符号∅等
+        4. 返回压缩成一行的item数组的json字符串
+        """
+    def extract(self,file_name: str):
+        """Process every PNG/JPEG image below ``<base>/img/<file_name>/``.
+
+        Failures of single images are collected and logged once at the end.
+        """
+        self._file_name = file_name
+        self._err_files =[]
+        path = f"{self._base_path}/img/{self._file_name}/"
+        self._complete_path = f"{self._base_path}/img_complete/{self._file_name}/"
+        os.makedirs(self._complete_path , exist_ok=True)
+        try:
+            self._logger.info(f"开始处理目录: {path}")
+            # Nothing to do when the source directory is missing.
+            if not os.path.exists(path):
+                self._logger.error(f"目录不存在: {path}")
+                return
+            # Walk the directory tree and handle image files only.
+            for root, dirs, files in os.walk(path):
+                for file in files:
+                    if file.lower().endswith(('.png', '.jpg', '.jpeg')):
+                        image_path = os.path.join(root, file)
+                        self.extract_image(image_path)
+
+            self._logger.info(f"目录处理完成: {path}")
+            if len(self._err_files)>0:
+                self._logger.error(f"----【处理图片失败】-----: {self._err_files}")
+        except Exception as e:
+            self._logger.error(f"处理目录失败 {path}: {e}")
+    def extract_image(self, image_path: str) -> None:
+        """Extract one image, store its rows and move it to the completed directory.
+
+        The image is moved only when the AI returned content and ``save_to_db``
+        reported full success; any failure records the path in the error list.
+        """
+        try:
+            self._logger.info(f"开始处理图片: {image_path}")
+            api_key= utils.get_config_value("fastgpt.api_key")
+            content = self._ai.call_fastgpt_ai_with_image(image_path,self._user_prompt,api_key)
+            # An empty AI result must not count as a processed image.
+            if not content:
+                raise ValueError("AI 未返回可用内容")
+            if not self.save_to_db(content):
+                raise RuntimeError("图片内容未能完整保存到数据库")
+            # Move the file only after a successful save so failed images can be retried.
+            os.rename(image_path, os.path.join(self._complete_path,os.path.basename(image_path)))
+            self._logger.info(f"图片处理完成: {image_path}")
+        except Exception as e:
+            self._err_files.append(image_path)
+            self._logger.error(f"处理图片失败 {image_path}: {e}")
+
+    def save_to_db(self, data_list: str|list) -> bool:
+        """Store the extracted rows.
+
+        Args:
+            data_list: List of row dicts keyed ``a``..``l``, or a JSON string of one.
+
+        Returns:
+            ``True`` when the payload could be read and every row was stored,
+            ``False`` otherwise. Failures are logged and never raised.
+        """
+        self._logger.info(f"开始保存图片内到数据库:{data_list}")
+        try:
+            if isinstance(data_list,str):
+                data_list = json.loads(data_list)
+            rows = list(data_list)
+        except Exception as e:
+            self._logger.error(f"保存图片内容失败: {e}")
+            return False
+
+        all_saved = True
+        for item in rows:
+            try :
+                standard = StandardModel(
+                    book_number=item['a'],
+                    quota_number=item['b'],
+                    quota_name=item['c'],
+                    work_content=item['d'],
+                    unit=item['e'],
+                    basic_quota=item['f'],
+                    base_price=item['g'],
+                    unit_weight=item['h'],
+                    labor_cost=item['i'],
+                    material_cost=item['j'],
+                    machine_cost=item['k'],
+                    main_material=item['l']
+                )
+                if not self._db_store.insert_standard(standard):
+                    self._logger.error(f"保存数据到数据库失败: {item}")
+                    all_saved = False
+            except Exception as e:
+                # Keep going so one malformed row does not block the others.
+                self._logger.error(f"保存图片内容失败: {e}")
+                all_saved = False
+        return all_saved
+
+    def export(self):
+        """Write the stored rows to one CSV file per book under ``<base>/csv/``.
+
+        Returns:
+            The mapping returned by ``query_standard_group_by_book``, or ``None``
+            when the export failed (the error is logged).
+        """
+        try:
+            self._logger.info("开始导出数据库数据")
+            data = self._db_store.query_standard_group_by_book()
+            for k, v in data.items():
+                csv_file = f"{self._base_path}/csv/{k}.csv"
+                # Make sure the target directory exists.
+                os.makedirs(os.path.dirname(csv_file), exist_ok=True)
+                with open(csv_file, 'w', newline='', encoding='utf-8-sig') as f:
+                    writer = csv.writer(f)
+                    writer.writerow(['书号', '定额编号', '定额名称', '工作内容', '单位', '基本定额', '基价(元)', '单重(t)', '工费', '料费', '机费', '主材'])
+                    for item in v:
+                        # One CSV row per model, columns in header order.
+                        row = [
+                            item.book_number,
+                            item.quota_number,
+                            item.quota_name,
+                            item.work_content,
+                            item.unit,
+                            item.basic_quota,
+                            item.base_price,
+                            item.unit_weight,
+                            item.labor_cost,
+                            item.material_cost,
+                            item.machine_cost,
+                            item.main_material
+                        ]
+                        writer.writerow(row)
+            self._logger.info("成功导出数据库数据")
+            return data
+        except Exception as e:
+            self._logger.error(f"导出数据库数据失败: {e}")
+            return None

+ 0 - 0
SourceCode/DataMiddleware/tools/img_spilt/__init__.py


+ 9 - 0
SourceCode/DataMiddleware/tools/img_spilt/run.py

@@ -0,0 +1,9 @@
+from  tools.img_spilt.spilt import ImageSpilt
+from tools.image_extract.extract import ImageExtractor
+
+# Manual entry point: extract the already-split images of one batch, then export.
+if __name__ == '__main__':
+    # Name of the batch directory passed to the extractor.
+    file_name ="01"
+    # ImageSpilt().split(file_name)
+    ImageExtractor().extract(file_name)
+    # ImageExtractor().extract_image("./temp_files/images/output/img/01/part_1.png")
+    ImageExtractor().export()

+ 71 - 0
SourceCode/DataMiddleware/tools/img_spilt/spilt.py

@@ -0,0 +1,71 @@
+import tools.utils as utils
+
+from PIL import Image
+import os
+
+
+class ImageSpilt:
+    """Cut a tall table image into smaller images that each repeat the header.
+
+    Args:
+        header_height: Height in pixels of the header strip at the top of the image.
+        row_height: Height in pixels of one table row.
+        row_count: Maximum number of rows per output image.
+    """
+
+    def __init__(self,header_height:int=30,row_height:int=23,row_count:int=10):
+        self._logger = utils.get_logger()
+        self._header_height = header_height
+        self._row_height = row_height
+        self._row_count = row_count
+
+    def split(self,image_name:str, output_dir:str=None):
+        """Split ``./temp_files/images/source/<image_name>.png`` into ``part_<n>.png`` files.
+
+        Args:
+            image_name: Source file name without the ``.png`` extension.
+            output_dir: Target directory; defaults to
+                ``./temp_files/images/output/img/<image_name>``.
+        """
+        self._logger.info(f"开始处理图片{image_name}")
+        if output_dir is None:
+            output_dir = f"./temp_files/images/output/img/{image_name}"
+        # Make sure the output directory exists.
+        os.makedirs(output_dir, exist_ok=True)
+
+        image_path = f"./temp_files/images/source/{image_name}.png"
+        # The context manager closes the source file once all parts are written.
+        with Image.open(image_path) as image:
+            width, height = image.size
+            self._logger.info(f"图片尺寸为{width}x{height}")
+            # Header strip that is repeated on top of every part.
+            header = image.crop((0, 0, width, self._header_height))
+            # Only complete rows are counted; a partial trailing row is dropped.
+            content_height = height - self._header_height
+            total_rows = content_height // self._row_height
+
+            # Ceiling division: number of output images needed.
+            image_count = (total_rows + self._row_count - 1) // self._row_count
+            self._logger.info(f"总行数:{total_rows},需要生成{image_count}张图片")
+
+            for img_index in range(image_count):
+                # Row range covered by this part.
+                start_row = img_index * self._row_count
+                end_row = min((img_index + 1) * self._row_count, total_rows)
+                current_rows = end_row - start_row
+
+                # New canvas: header height plus the rows of this part.
+                new_image = Image.new('RGB', (width, self._header_height + current_rows * self._row_height))
+                new_image.paste(header, (0, 0))
+
+                # The rows are contiguous in the source, so one band crop gives the
+                # same pixels as copying them row by row.
+                band_top = self._header_height + start_row * self._row_height
+                band_bottom = self._header_height + end_row * self._row_height
+                band = image.crop((0, band_top, width, band_bottom))
+                new_image.paste(band, (0, self._header_height))
+
+                # Save the part.
+                output_path = os.path.join(output_dir, f"part_{img_index + 1}.png")
+                new_image.save(output_path)
+                self._logger.info(f"已保存第{img_index + 1}张图片(包含{current_rows}行): {output_path}")
+
+        self._logger.info(f"图片处理完成,共生成{image_count}张图片")
+        

+ 22 - 0
SourceCode/DataMiddleware/tools/init.sql

@@ -1,6 +1,28 @@
+-- Create the database that holds the extracted standard data
+CREATE DATABASE IF NOT EXISTS iwb_data_tielu_standard_dev CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci;
+USE iwb_data_tielu_standard_dev;
+
+-- Create the standard data table; IF NOT EXISTS keeps the script re-runnable
+CREATE TABLE IF NOT EXISTS standard_data (
+    `book_number` VARCHAR(20) COMMENT '书号',
+    `quota_number` VARCHAR(20) COMMENT '定额编号',
+    `quota_name` VARCHAR(200) COMMENT '定额名称',
+    `work_content` TEXT COMMENT '工作内容',
+    `unit` VARCHAR(10) COMMENT '单位',
+    `basic_quota` VARCHAR(50) COMMENT '基本定额',
+    `base_price` DECIMAL(12,2) COMMENT '基价(元)',
+    `unit_weight` DECIMAL(12,2) COMMENT '单重(t)',
+    `labor_cost` DECIMAL(12,2) COMMENT '工费',
+    `material_cost` DECIMAL(12,2) COMMENT '料费',
+    `machine_cost` DECIMAL(12,2) COMMENT '机费',
+    `main_material` VARCHAR(100) COMMENT '主材',
+    `created_at`  datetime COMMENT '创建时间',
+    `updated_at`  datetime COMMENT '更新时间',
+    PRIMARY KEY (`book_number`, `quota_number`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='标准数据表';
+
 -- 创建数据库
 CREATE DATABASE IF NOT EXISTS iwb_data_tielu_pdf_dev CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci;
 USE iwb_data_tielu_pdf_dev;
+
 -- 创建标准表
 CREATE TABLE IF NOT EXISTS pdf_standard (
     id INT AUTO_INCREMENT PRIMARY KEY,

+ 53 - 0
SourceCode/DataMiddleware/tools/models/standard_model.py

@@ -0,0 +1,53 @@
+from datetime import datetime
+
+class StandardModel:
+    """Plain value holder for one standard-data record.
+
+    All fields are optional; the two timestamps default to the current time
+    when they are not supplied.
+    """
+
+    def __init__(self,
+                 book_number: str = None,
+                 quota_number: str = None,
+                 quota_name: str = None,
+                 work_content: str = None,
+                 unit: str = None,
+                 basic_quota: str = None,
+                 base_price: str = None,
+                 unit_weight: str = None,
+                 labor_cost: str = None,
+                 material_cost: str = None,
+                 machine_cost: str = None,
+                 main_material: str = None,
+                 data_id: int = None,
+                 created_at: datetime = None,
+                 updated_at: datetime = None):
+        # Identity
+        self.id = data_id
+        self.book_number, self.quota_number = book_number, quota_number
+        # Descriptive fields
+        self.quota_name, self.work_content = quota_name, work_content
+        self.unit, self.basic_quota = unit, basic_quota
+        # Price and weight fields
+        self.base_price, self.unit_weight = base_price, unit_weight
+        self.labor_cost, self.material_cost = labor_cost, material_cost
+        self.machine_cost, self.main_material = machine_cost, main_material
+        # Timestamps fall back to "now" independently of each other.
+        self.created_at = created_at if created_at else datetime.now()
+        self.updated_at = updated_at if updated_at else datetime.now()
+
+    def to_dict(self) -> dict:
+        """Return the record as a dict; timestamps become ``YYYY-MM-DD HH:MM:SS`` or None."""
+        def _stamp(value):
+            return value.strftime('%Y-%m-%d %H:%M:%S') if value else None
+
+        plain_fields = (
+            'id', 'book_number', 'quota_number', 'quota_name', 'work_content',
+            'unit', 'basic_quota', 'base_price', 'unit_weight', 'labor_cost',
+            'material_cost', 'machine_cost', 'main_material',
+        )
+        record = {name: getattr(self, name) for name in plain_fields}
+        record['created_at'] = _stamp(self.created_at)
+        record['updated_at'] = _stamp(self.updated_at)
+        return record

+ 0 - 0
SourceCode/DataMiddleware/tools/stores/__init__.py


+ 105 - 0
SourceCode/DataMiddleware/tools/stores/mysql_store.py

@@ -0,0 +1,105 @@
+from datetime import datetime
+
+from httplib2.auth import params
+from pymupdf.utils import insert_text
+
+import tools.utils as utils
+from tools.utils.mysql_helper import MySQLHelper
+from tools.models.standard_model import StandardModel
+
+class MysqlStore:
+    """Read/write access to the ``standard_data`` table through ``MySQLHelper``."""
+
+    # Single definition of the write statement so the single-row and batch
+    # writers cannot drift apart. In MySQL, REPLACE deletes a row that collides
+    # on a primary/unique key and inserts the new one in its place.
+    _REPLACE_STANDARD_SQL = """
+        REPLACE INTO standard_data (book_number, quota_number, quota_name, work_content, unit, basic_quota, base_price, unit_weight, labor_cost, material_cost, machine_cost, main_material, created_at)
+        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+    """
+
+    def __init__(self):
+        """Create the database helper and the logger used by every method."""
+        self._db_helper = MySQLHelper()
+        self._logger = utils.get_logger()
+
+    @staticmethod
+    def _to_params(data: StandardModel) -> tuple:
+        """Build the parameter tuple for ``_REPLACE_STANDARD_SQL`` from one model."""
+        return (
+            data.book_number,
+            data.quota_number,
+            data.quota_name,
+            data.work_content,
+            data.unit,
+            data.basic_quota,
+            data.base_price,
+            data.unit_weight,
+            data.labor_cost,
+            data.material_cost,
+            data.machine_cost,
+            data.main_material,
+            datetime.now(),  # value for the created_at column
+        )
+
+    @staticmethod
+    def _from_row(row) -> StandardModel:
+        """Build a ``StandardModel`` from one result row indexed by column name."""
+        return StandardModel(
+            book_number=row['book_number'],
+            quota_number=row['quota_number'],
+            quota_name=row['quota_name'],
+            work_content=row['work_content'],
+            unit=row['unit'],
+            basic_quota=row['basic_quota'],
+            base_price=row['base_price'],
+            unit_weight=row['unit_weight'],
+            labor_cost=row['labor_cost'],
+            material_cost=row['material_cost'],
+            machine_cost=row['machine_cost'],
+            main_material=row['main_material'],
+        )
+
+    def insert_standard(self, data: StandardModel) -> bool:
+        """Write one record with REPLACE semantics.
+
+        Returns:
+            True on success; False if anything raised (the error is logged).
+        """
+        try:
+            with self._db_helper as db:
+                db.execute(self._REPLACE_STANDARD_SQL, self._to_params(data))
+            return True
+        except Exception as e:
+            self._logger.error(f"Error inserting standard: {str(e)}")
+            return False
+
+    def insert_standard_batch(self, data_list: list[StandardModel]) -> bool:
+        """Write many records in one ``execute_many`` call.
+
+        Returns:
+            True on success (including an empty list); False if anything
+            raised (the error is logged).
+        """
+        try:
+            insert_params = [self._to_params(data) for data in data_list]
+            if not insert_params:
+                # Nothing to write, so do not open the database context at all.
+                return True
+            with self._db_helper as db:
+                db.execute_many(self._REPLACE_STANDARD_SQL, insert_params)
+            return True
+        except Exception as e:
+            self._logger.error(f"Error inserting standard batch: {str(e)}")
+            return False
+
+    def query_standard_group_by_book(self) -> dict:
+        """Return every record grouped by book number.
+
+        Returns:
+            A dict mapping each distinct ``book_number`` (ascending) to the list
+            of its ``StandardModel`` records; ``{}`` when the table yields no rows.
+        """
+        sql = """
+            SELECT book_number
+            FROM standard_data
+            GROUP BY book_number
+            ORDER BY book_number
+        """
+        with self._db_helper:
+            result = self._db_helper.execute_query(sql)
+        if not result:
+            return {}
+
+        # The per-book lookup opens the helper's context itself, so it runs only
+        # after the block above has been left; this avoids re-entering a context
+        # that is still open.
+        # NOTE(review): assumes execute_query returns fully fetched rows - confirm.
+        grouped_data = {}
+        for row in result:
+            book_number = row['book_number']
+            # The grouping query carries only book_number; fetch the full rows.
+            standards = self.query_standards_by_book_number(book_number)
+            grouped_data.setdefault(book_number, []).extend(standards)
+        return grouped_data
+
+    def query_standards_by_book_number(self, book_number: str) -> list[StandardModel]:
+        """Return the records of one book, ordered by ``quota_number``.
+
+        Returns:
+            A list of ``StandardModel``; empty when the query yields nothing.
+        """
+        sql = """
+            SELECT book_number, quota_number, quota_name, work_content, unit, basic_quota, base_price, unit_weight, labor_cost, material_cost, machine_cost, main_material
+            FROM standard_data
+            WHERE book_number = %s
+            ORDER BY quota_number
+        """
+        with self._db_helper:
+            result = self._db_helper.execute_query(sql, (book_number,))
+            if not result:
+                # Guard against a falsy result (e.g. None) before iterating.
+                return []
+            return [self._from_row(row) for row in result]

+ 180 - 28
SourceCode/DataMiddleware/tools/utils/ai_helper.py

@@ -1,5 +1,4 @@
-import json
-import re
+import json, os, re, requests
 
 import tools.utils as utils
 from tools.utils.file_helper import encode_image
@@ -13,29 +12,19 @@ class AiHelper:
     _ai_max_tokens = 150
 
     def __init__(self, api_url: str=None, api_key: str=None, api_model: str=None):
+        """Initialise the helper from explicit arguments, falling back to configuration.
+
+        Args:
+            api_url: AI API base URL; when falsy, config value ``ai.url`` is used.
+            api_key: AI API key; when falsy, config value ``ai.key`` is used.
+            api_model: Model name; when falsy, config value ``ai.model`` is used.
+        """
+        self._logger = utils.get_logger()
         self._ai_api_url = api_url if api_url else utils.get_config_value("ai.url")
         self._ai_api_key = api_key if api_key else utils.get_config_value("ai.key")
         self._api_model = api_model if api_model else utils.get_config_value("ai.model")
+        # FastGPT settings are always read from configuration; there is no
+        # constructor argument for them.
+        self._fastgpt_api_key = utils.get_config_value("fastgpt.api_key")
+        self._fastgpt_api_url = utils.get_config_value("fastgpt.api_url")
         max_tokens = utils.get_config_value("ai.max_tokens")
+        # Keep the class-level default when ai.max_tokens is not configured.
         if max_tokens:
             self._ai_max_tokens = int(max_tokens)
 
     def call_openai(self, system_prompt: str, user_prompt: str,api_url: str=None,api_key: str=None,api_model: str=None) -> json:
-        if api_url:
-            self._ai_api_url = api_url
-        if api_key:
-            self._ai_api_key = api_key
-        if api_model:
-            self._api_model = api_model
-        if self._ai_api_key is None:
-            raise Exception("AI API key 没有配置")
-        if self._ai_api_url is None:
-            raise Exception("AI API url 没有配置")
-        if self._api_model is None:
-            raise Exception("AI API model 没有配置")
-
+        self.check_api(api_key, api_model, api_url)
         utils.get_logger().info(f"调用AI API ==> Url:{self._ai_api_url},Model:{self._api_model}")
-
         client = OpenAI(api_key=self._ai_api_key, base_url=self._ai_api_url)
         completion = client.chat.completions.create(
             model=self._api_model,
@@ -73,6 +62,20 @@ class AiHelper:
         except Exception as e:
             raise Exception(f"解析 AI 响应错误: {e}")
 
+    def check_api(self, api_key, api_model, api_url):
+        """Apply per-call overrides, then verify the API settings are present.
+
+        Truthy arguments replace the stored URL, key and model on the instance
+        (the change persists for later calls); falsy ones leave the stored
+        value untouched.
+
+        Raises:
+            Exception: if the resulting key, URL or model is ``None``
+                (checked in that order).
+        """
+        overrides = (
+            ("_ai_api_url", api_url),
+            ("_ai_api_key", api_key),
+            ("_api_model", api_model),
+        )
+        for attr_name, candidate in overrides:
+            if candidate:
+                setattr(self, attr_name, candidate)
+
+        required = (
+            ("_ai_api_key", "AI API key 没有配置"),
+            ("_ai_api_url", "AI API url 没有配置"),
+            ("_api_model", "AI API model 没有配置"),
+        )
+        for attr_name, complaint in required:
+            if getattr(self, attr_name) is None:
+                raise Exception(complaint)
+
     @staticmethod
     def _extract_message_content(response_json: dict) -> str:
         utils.get_logger().info(f"AI Response JSON: {response_json}")
@@ -120,21 +123,47 @@ class AiHelper:
             else:
                 raise Exception(f"解析 AI 响应错误: {response} {e}")
 
+    def call_openai_with_image(self, image_path, system_prompt: str, user_prompt: str, api_url: str=None, api_key: str=None, api_model: str=None) -> str:
+        """Send a system prompt plus a text-and-image user message to the chat API.
+
+        Args:
+            image_path: Path of the image file to attach; must exist.
+            system_prompt: Content of the system message.
+            user_prompt: Text part of the user message.
+            api_url: Optional override of the stored API URL (see ``check_api``).
+            api_key: Optional override of the stored API key.
+            api_model: Optional override of the stored model name.
+
+        Returns:
+            The message content of the first choice, requested in
+            ``json_object`` response format (returned as-is, not parsed).
+
+        Raises:
+            Exception: wrapping any failure (missing configuration, missing
+                file, API error); the original error is chained as the cause.
+        """
+        try:
+            self.check_api(api_key, api_model, api_url)
+            utils.get_logger().info(f"调用AI API IMAGE==> Url:{self._ai_api_url},Model:{self._api_model} {image_path}")
+            # Validate the input before building a client that would go unused.
+            if not os.path.exists(image_path):
+                utils.get_logger().error(f"图片文件不存在: {image_path}")
+                raise Exception(f"图片文件不存在: {image_path}")
+            client = OpenAI(api_key=self._ai_api_key, base_url=self._ai_api_url)
+            # NOTE(review): the value is passed directly as the image URL, so
+            # encode_image presumably returns a data URL - confirm.
+            base64_str = encode_image(image_path)
+            response = client.chat.completions.create(
+                model=self._api_model,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": system_prompt,
+                    },
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": f"{user_prompt}",
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": base64_str,
+                                },
+                            },
+                        ],
+                    },
+                ],
+                response_format={"type": "json_object"},
+                timeout=600,
+            )
+            return response.choices[0].message.content
+        except Exception as e:
+            # Chain the original error so its traceback stays attached.
+            raise Exception(f"调用 AI 错误: {e}") from e
+
     def analyze_image_with_ai(self,image_path, api_url: str=None,api_key: str=None,api_model: str=None):
         """调用OpenAI的API分析图片内容"""
-        if api_url:
-            self._ai_api_url = api_url
-        if api_key:
-            self._ai_api_key = api_key
-        if api_model:
-            self._api_model = api_model
-        if self._ai_api_key is None:
-            raise Exception("AI API key 没有配置")
-        if self._ai_api_url is None:
-            raise Exception("AI API url 没有配置")
-        if self._api_model is None:
-            raise Exception("AI API model 没有配置")
         try:
+            self.check_api(api_key, api_model, api_url)
             client = OpenAI(api_key=self._ai_api_key, base_url=self._ai_api_url)
             base64_str = encode_image(image_path)
             response = client.chat.completions.create(
@@ -160,3 +189,126 @@ class AiHelper:
         except Exception as e:
             print(f"调用AI接口时出错: {e}")
         return ''
+
+
+    def call_fastgpt_ai(self, msg: str, api_key: str=None) -> json:
+        """Send a text message to the FastGPT chat endpoint.
+
+        Args:
+            msg: Text sent as the single user message.
+            api_key: Optional key that replaces the stored FastGPT key (the
+                replacement persists on the instance).
+
+        Returns:
+            The reply content parsed from JSON, or ``None`` on any failure
+            (missing configuration, HTTP error, empty or non-JSON reply).
+            Failures are logged, never raised.
+        """
+        self._logger.info("调用fastgpt的AI接口")
+        try:
+            self._use_fastgpt_key(api_key)
+            return self._post_fastgpt_chat(msg)
+        except Exception as e:
+            self._logger.error(f"Error: {str(e)}")
+            return None
+
+    def call_fastgpt_ai_with_image(self, image_path, msg: str, api_key: str=None) -> json:
+        """Send a text message together with an image to the FastGPT chat endpoint.
+
+        Args:
+            image_path: Path of the image file to attach; must exist.
+            msg: Text part of the user message.
+            api_key: Optional key that replaces the stored FastGPT key (the
+                replacement persists on the instance).
+
+        Returns:
+            The reply content parsed from JSON, or ``None`` on any failure
+            (missing file or configuration, HTTP error, empty or non-JSON
+            reply). Failures are logged, never raised.
+        """
+        self._logger.info("调用fastgpt的AI_Image接口")
+        try:
+            if not os.path.exists(image_path):
+                self._logger.error(f"图片文件不存在: {image_path}")
+                raise Exception(f"图片文件不存在: {image_path}")
+            self._use_fastgpt_key(api_key)
+            # NOTE(review): the value is passed directly as the image URL, so
+            # encode_image presumably returns a data URL - confirm.
+            base64_str = encode_image(image_path)
+            content = [
+                {
+                    "type": "text",
+                    "text": msg,
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": base64_str,
+                    },
+                },
+            ]
+            return self._post_fastgpt_chat(content)
+        except Exception as e:
+            self._logger.error(f"Error: {str(e)}")
+            return None
+
+    def _use_fastgpt_key(self, api_key) -> None:
+        """Store a per-call FastGPT key override and require that a key exists."""
+        if api_key is not None:
+            self._fastgpt_api_key = api_key
+        if self._fastgpt_api_key is None:
+            self._logger.error("fastgpt.api_key 没有配置")
+            raise Exception("fastgpt.api_key 没有配置")
+
+    def _post_fastgpt_chat(self, content, timeout: int = 600):
+        """POST one user message to FastGPT and return its JSON-decoded reply, or None."""
+        if not self._fastgpt_api_url:
+            # Without this check the request would go to "None/v1/chat/completions".
+            self._logger.error("fastgpt.api_url 没有配置")
+            raise Exception("fastgpt.api_url 没有配置")
+
+        headers = {
+            "Authorization": f"Bearer {self._fastgpt_api_key}",
+            "Content-Type": "application/json"
+        }
+        url = f"{self._fastgpt_api_url}/v1/chat/completions"
+        payload = {
+            "stream": False,
+            "detail": False,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": content
+                }
+            ],
+            "response_format": {
+                "type": "json_object"
+            }
+        }
+
+        # requests waits indefinitely unless a timeout is given; bound the call.
+        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
+        if response.status_code != 200:
+            error_msg = f"Error: {response.status_code} - {response.text}"
+            self._logger.error(error_msg)
+            return None
+
+        result = response.json()
+        self._logger.info(f"Response: {result}")
+        # Treat a missing or empty "choices" list the same as an empty reply.
+        choices = result.get("choices") or [{}]
+        reply = choices[0].get("message", {}).get("content", "")
+        if not reply:
+            return None
+        return self._parse_fastgpt_reply(reply)
+
+    def _parse_fastgpt_reply(self, reply: str):
+        """Strip an optional Markdown code fence from the reply and decode it as JSON."""
+        # Accept both "```json" and a bare "```" opening fence.
+        reply = re.sub(r'^```(?:json)?[ \t]*\n', '', reply.strip())
+        reply = re.sub(r'\n[ \t]*```$', '', reply.strip())
+        try:
+            parsed = json.loads(reply)
+        except json.JSONDecodeError as e:
+            self._logger.error(f"Failed to decode JSON: {e}")
+            return None
+        self._logger.info(f"Response_JSON: {parsed}")
+        return parsed