瀏覽代碼

Update AI处理优化

YueYunyun 6 月之前
父節點
當前提交
d33211d0c1

+ 1 - 1
SourceCode/TenderCrawler/.script/Build_Dockerfile.run.xml

@@ -2,7 +2,7 @@
   <configuration default="false" name="Build_Dockerfile" type="docker-deploy" factoryName="dockerfile" activateToolWindowBeforeRun="false" server-name="81">
     <deployment type="dockerfile">
       <settings>
-        <option name="imageTag" value="y_tender-crawler-app:2.0.0" />
+        <option name="imageTag" value="y_tender-crawler-app:2.1.0" />
         <option name="buildOnly" value="true" />
         <option name="containerName" value="" />
         <option name="sourceFilePath" value="Dockerfile" />

+ 24 - 9
SourceCode/TenderCrawler/app/adapters/ccgp_data_collection_adapter.py

@@ -28,6 +28,8 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
 
     def _collect(self, keyword: str):
         items = self._search(keyword)
+        if len(items) <= 0:
+            return
         self._process_list(items)
         if utils.get_config_bool(self.batch_save_key):
             self.store.save_collect_data(True)
@@ -36,7 +38,11 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
         try:
             if not keyword:
                 raise Exception("搜索关键字不能为空")
-            self._wait_until(ec.presence_of_element_located((By.ID, "searchForm")))
+            self.driver.get(self._url)
+            if not self._wait_until(
+                ec.presence_of_element_located((By.ID, "searchForm"))
+            ):
+                return []
             search_el = self.driver.find_element(By.ID, "kw")
             sleep(2)
             search_el.clear()
@@ -47,9 +53,10 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
             sleep(1)
             search_btn.click()
             self._next_count = 0
-            self._wait_until(
+            if not self._wait_until(
                 ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result"))
-            )
+            ):
+                return []
             default_search_txt = "近1周"
             search_txt = utils.get_config_value(self.search_day_key, default_search_txt)
             utils.get_logger().debug(f"搜索日期条件: {search_txt}")
@@ -60,9 +67,10 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
                         sleep(1)
                         last_el.click()
                         break
-                self._wait_until(
+                if not self._wait_until(
                     ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result"))
-                )
+                ):
+                    return []
             else:
                 sleep(1)
             try:
@@ -97,6 +105,8 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
             self._process_item(item)
         sleep(2)
         next_items = self._next_page()
+        if len(items) <= 0:
+            return []
         return self._process_list(next_items)
 
     def _next_page(self) -> list:
@@ -113,9 +123,10 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
                 f"下一页[{self._next_count+1}]: {self.driver.current_url}"
             )
             sleep(1)
-            self._wait_until(
+            if not self._wait_until(
                 ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result"))
-            )
+            ):
+                return []
             items = self.driver.find_elements(
                 By.XPATH, "//ul[@class='vT-srch-result-list-bid']/li/a"
             )
@@ -136,13 +147,17 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
             utils.get_logger().debug(f"跳转详情")
             sleep(1)
             item.click()
-            self._wait_until(ec.number_of_windows_to_be(2))
+            if not self._wait_until(ec.number_of_windows_to_be(2)):
+                return
             handles = self.driver.window_handles
             for handle in handles:
                 if handle != main_handle:
                     self.driver.switch_to.window(handle)
                     break
-            self._wait_until(ec.presence_of_element_located((By.TAG_NAME, "body")))
+            if not self._wait_until(
+                ec.presence_of_element_located((By.TAG_NAME, "body"))
+            ):
+                return
 
             content = self.driver.find_element(
                 By.XPATH, "//div[@class='vF_deail_maincontent']"

+ 34 - 9
SourceCode/TenderCrawler/app/adapters/chinabidding_data_collection_adapter.py

@@ -3,6 +3,7 @@ from time import sleep
 from selenium.common.exceptions import TimeoutException, NoSuchElementException
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as ec
+from selenium.webdriver.support.wait import WebDriverWait
 
 import utils
 from adapters.data_collection_adapter_interface import IDataCollectionAdapter
@@ -28,14 +29,21 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
                 By.XPATH, "//div[@id='loginRight']/a[@class='login']"
             )
             login_el.click()
-            self._wait_until(ec.presence_of_element_located((By.ID, "userpass")))
+            wait = WebDriverWait(self.driver, 10, 1)
+            wait.until(ec.presence_of_element_located((By.ID, "userpass")))
+            # if not self._wait_until(
+            #     ec.presence_of_element_located((By.ID, "userpass"))
+            # ):
+            #     raise TimeoutException(f"id='userpass' 元素没有找到")
             un_el = self.driver.find_element(By.ID, "username")
             un_el.send_keys(username)
             pass_el = self.driver.find_element(By.ID, "userpass")
             pass_el.send_keys(password)
             login_btn = self.driver.find_element(By.ID, "login-button")
             login_btn.click()
-            self._wait_until(ec.presence_of_element_located((By.ID, "site-content")))
+            wait.until(ec.presence_of_element_located((By.ID, "site-content")))
+            # if not self._wait_until(ec.presence_of_element_located((By.ID, "site-content"))):
+            #     raise TimeoutException(f"id='site-content' 元素没有找到")
         except TimeoutException as e:
             raise Exception(f"登录失败 [{self._adapter_type}] [超时]: {e}")
         except NoSuchElementException as e:
@@ -64,7 +72,8 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
                     By.XPATH, "//div[@id='z-b-jg-gg']/h2/a[@class='more']"
                 )
             el.click()
-            self._wait_until(ec.number_of_windows_to_be(2))
+            if not self._wait_until(ec.number_of_windows_to_be(2)):
+                return []
             self.driver.close()
             self.driver.switch_to.window(self.driver.window_handles[0])
             return self._search(keyword)
@@ -74,7 +83,10 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
             raise Exception(f"搜索失败 [{self._adapter_type}] [找不到元素]: {e}")
 
     def _search(self, keyword: str) -> list:
-        self._wait_until(ec.presence_of_element_located((By.ID, "searchBidProjForm")))
+        if not self._wait_until(
+            ec.presence_of_element_located((By.ID, "searchBidProjForm"))
+        ):
+            return []
         search_el = self.driver.find_element(
             By.XPATH, "//form[@id='searchBidProjForm']/ul/li/input[@id='fullText']"
         )
@@ -85,7 +97,10 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
         )
         search_btn.click()
         self._next_count = 0
-        self._wait_until(ec.presence_of_element_located((By.ID, "site-content")))
+        if not self._wait_until(
+            ec.presence_of_element_located((By.ID, "site-content"))
+        ):
+            return []
         default_search_txt = "全部"
         search_txt = utils.get_config_value(self.search_day_key, default_search_txt)
         utils.get_logger().debug(f"搜索日期条件: {search_txt}")
@@ -93,7 +108,10 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
             last_el = self.driver.find_element(By.LINK_TEXT, search_txt)
             sleep(1)
             last_el.click()
-            self._wait_until(ec.presence_of_element_located((By.ID, "site-content")))
+            if not self._wait_until(
+                ec.presence_of_element_located((By.ID, "site-content"))
+            ):
+                return []
         else:
             sleep(1)
         try:
@@ -132,7 +150,10 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
             utils.get_logger().debug(
                 f"下一页[{self._next_count+1}]: {self.driver.current_url}"
             )
-            self._wait_until(ec.presence_of_element_located((By.ID, "site-content")))
+            if not self._wait_until(
+                ec.presence_of_element_located((By.ID, "site-content"))
+            ):
+                return []
             items = self.driver.find_elements(
                 By.XPATH, "//ul[@class='as-pager-body']/li/a"
             )
@@ -151,7 +172,8 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
                 close = False
                 return
             item.click()
-            self._wait_until(ec.number_of_windows_to_be(2))
+            if not self._wait_until(ec.number_of_windows_to_be(2)):
+                return
             handles = self.driver.window_handles
             for handle in handles:
                 if handle != main_handle:
@@ -159,7 +181,10 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
                     break
             url = self.driver.current_url
             utils.get_logger().debug(f"跳转详情")
-            self._wait_until(ec.presence_of_element_located((By.CLASS_NAME, "content")))
+            if not self._wait_until(
+                ec.presence_of_element_located((By.CLASS_NAME, "content"))
+            ):
+                return
             content = self.driver.find_element(By.CLASS_NAME, "content").text
             if self._check_content(content):
                 self._save_db(url, content, data_type)

+ 41 - 8
SourceCode/TenderCrawler/app/adapters/data_collection_adapter_interface.py

@@ -26,6 +26,8 @@ class IDataCollectionAdapter(ABC):
     _error_count = 0
     _max_error_count = utils.get_config_int("adapter.max_error_count", 3)
 
+    _err_keywords = {}
+
     @property
     def search_day_key(self) -> str:
         return f"adapter.{self._adapter_type}.search_day"
@@ -91,8 +93,13 @@ class IDataCollectionAdapter(ABC):
                 utils.get_logger().info(f"采集关键字[{count}]: {keyword}")
                 self._error_count = 0
                 self._collect(keyword)
+                if self.cur_keyword in self._err_keywords:
+                    del self._err_keywords[self.cur_keyword]  # 删除键
             except Exception as e:
-                raise Exception(f"采集数据失败: {e}")
+                utils.get_logger().error(f"==> {e}")
+            # except Exception as e:
+            #     raise Exception(f"采集数据失败: {e}")
+        self._collect_error_keywors()
 
     @abstractmethod
     def login(self, username: str, password: str) -> None:
@@ -119,17 +126,43 @@ class IDataCollectionAdapter(ABC):
         method: Callable[[D], Union[Literal[False], T]],
         timeout=20,
         poll_frequency=1,
-    ):
+    ) -> bool:
         try:
             self._wait(timeout, poll_frequency).until(method)
-        except TimeoutException as e:
-            self._error_count += 1
-            if self._error_count > self._max_error_count:
-                raise e
-            self._wait_until(method)
+            return True
+        except TimeoutException:
+            err_count = (
+                self._err_keywords[self.cur_keyword]
+                if self.cur_keyword in self._err_keywords
+                else 0
+            )
+            err_count += 1
             utils.get_logger().error(
-                f"采集数据 超时 [{self._error_count}/{self._max_error_count}]"
+                f"采集数据 超时 [{self.cur_keyword}][{err_count}/{self._max_error_count}]"
             )
+            self._err_keywords[self.cur_keyword] = err_count
+            if err_count > self._max_error_count:
+                del self._err_keywords[self.cur_keyword]  # 删除键
+            return False
+            # raise TimeoutException(
+            #     f"采集数据 超时 {self.cur_keyword} [{err_count}/{self._max_error_count}]"
+            # )
+
+    def _collect_error_keywors(self):
+        """Re-collect every keyword still recorded in ``_err_keywords``.
+
+        Passes are repeated until the mapping is empty. A keyword leaves the
+        mapping when ``_collect`` returns normally for it, or once its
+        recorded failure count exceeds ``_max_error_count``.
+        """
+        # A loop instead of self-recursion: a keyword that keeps failing can
+        # no longer run into the interpreter's recursion limit.
+        while self._err_keywords:
+            # Iterate over a snapshot: entries are removed during the pass, and
+            # resizing a dict while iterating over it raises RuntimeError.
+            for keyword, err_count in list(self._err_keywords.items()):
+                try:
+                    utils.get_logger().info(
+                        f"重新采集错误关键字[{err_count}/{self._max_error_count}]: {keyword}"
+                    )
+                    self._cur_keyword = keyword
+                    self._collect(keyword)
+                    # Normal return: the keyword no longer needs a retry.
+                    self._err_keywords.pop(keyword, None)
+                except Exception as e:
+                    utils.get_logger().error(f"失败: {e}")
+                    # Count this failure as well; otherwise a keyword whose
+                    # retry raises would be retried forever with an unchanged
+                    # count. A key already dropped during the attempt stays
+                    # dropped.
+                    if keyword in self._err_keywords:
+                        failures = self._err_keywords[keyword] + 1
+                        if failures > self._max_error_count:
+                            del self._err_keywords[keyword]
+                        else:
+                            self._err_keywords[keyword] = failures
 
     @abstractmethod
     def _collect(self, keyword: str) -> None:

+ 36 - 4
SourceCode/TenderCrawler/app/config.yml

@@ -37,9 +37,41 @@ ai:
   url: https://dashscope.aliyuncs.com/compatible-mode/v1
   model: qwen-plus
   max_tokens: 1024
-  system_prompt: 分析文本,提取出关键信息,并以json格式字符串返回,如果部分信息为空,则该字段返回为空,返回的一定是可以解析的json对象。
-  prompt_template_1: 在以上内容中提取信息:编号(no) 、标题(title)、在哪个城市招标(area)、开标的时间(date)、开标的地点(address)、发布时间(release_date)、150字左右的招标条件要求及联系方式等内容摘要(summary),设备(devices)。提取出相关设备的名称信息,多个设备以逗号分割。返回包含no,title,area,date,address,release_date,summary,devices字段的json格式字符串,没有找到或未提供的信息json字段为空,返回的一定是可以解析的json字符串。
-  prompt_template_2: 在以上内容中提取信息:编号(no) 、标题(title)、公告时间,格式为yyyy-MM-dd(date)、标中的总价格(price)、标中的公司,多个以逗号分割(bidder)、150-300字的标的物说明,标的物价格,公司的明细等内容摘要(summary),标中的涉及光谱仪的设备(devices)提取出相关设备的名称信息以及对应的品牌厂商,品牌厂商写在设备后面用括号括起来,多个设备以逗号分割,例:XX设备(xx品牌),XX设备(xx品牌)。返回包含no,title,date,devices,price,bidde,summary字段的json格式字符串,没有找到或未提供的信息json字段为空。
+  system_prompt: "从给定信息中提取出关键信息,并以给定的类型返回json字符串,如果部分信息为空,则该字段返回为空"
+  prompt_template_1: "在以上内容中提取招标信息:
+            ```typescript
+            export interface Tender { //招标信息
+                no: string; // 招标项目编号
+                title: string; // 招标公告标题
+                provice: string; // 招标单位省份
+                city: string; // 招标单位城市
+                date: string; // 项目开标的时间
+                address: string; // 项目开标的地点
+                release_date: string; // 招标信息的发布时间
+                summary: string; // 100字左右的招标条件,联系方式等内容摘要
+                devices: string; // 只涉及到光谱仪相关的设备,其他无关设备不需要,多个设备以逗号分割 ,例如 红外光谱仪,拉曼光谱仪等
+            }
+            ```"
+  prompt_template_2: "在以上内容中提取中标信息:
+            ```typescript
+            export interface Instrument { // 中标仪器信息
+              company: string; // 中标单位名称,参与竞标并中标的公司名称
+              name: string; // 仪器名称,例如:红外光谱仪
+              manufacturer: string; // 仪器厂商,例如:赛默飞、Bruker
+              model: string; // 仪器的型号/规格,例如:NIR25S
+              quantity: number; // 中标仪器的数量,台数,例如:2
+              unit_price: number; // 仪器的单价,单位转换为元,例如:178000.00
+            }
+            export interface BiddingAcceptance { //中标信息
+              no: string; // 项目编号
+              title: string; // 中标公告标题
+              date: string; // 中标公告时间
+              provice: string; // 招标单位省份
+              city: string; // 招标单位城市
+              summary: string; // 公告摘要信息,100字左右
+              instruments: Instrument[]; // 中标设备的信息
+            }
+            ```"
 email:
 #  smtp_server: smtp.exmail.qq.com
 #  smtp_port: 465
@@ -51,7 +83,7 @@ email:
   smtp_user: yueyunyun88@163.com
   smtp_password: FWRwBZKHTLHjHT5F
   from_email: yueyunyun88@163.com
-  error_email: yueyy@iwbnet.com
+  error_email:
 job:
   event_id: 1 # 改变这个值,整点会检测重新加载任务
   sleep_interval: 10

+ 51 - 21
SourceCode/TenderCrawler/app/jobs/data_process.py

@@ -9,15 +9,43 @@ class DataProcess:
 
     _store = None
 
-    DEFAULT_AI_SYSTEM_PROMPT = "请帮我分析以下文字,提取出关键信息,并以json格式字符串返回,如果部分信息为空,则该字段返回为空。"
-    DEFAULT_AI_PROMPT_TEMPLATE_1 = """在以上内容中提取信息:
-            编号(no) 、标题(title)、在哪个城市招标(area)、开标的时间(date)、开标的地点(address)、发布时间(release_date)、150字左右的招标条件要求及联系方式等内容摘要(summary), 设备(devices)。
-            提取出相关设备的名称信息, 多个设备以逗号分割。
-            返回包含no, title, area, date, address, release_date, summary, devices字段的json格式字符串,没有找到或未提供的信息json字段为空。
+    DEFAULT_AI_SYSTEM_PROMPT = "从给定信息中提取出关键信息,并以给定的类型返回json字符串,如果部分信息为空,则该字段返回为空。"
+    DEFAULT_AI_PROMPT_TEMPLATE_1 = """在以上内容中提取招标信息:
+            ```typescript
+            export interface Tender { //招标信息
+                no: string; // 招标项目编号
+                title: string; // 招标公告标题
+                provice: string; // 招标单位省份
+                city: string; // 招标单位城市
+                date: string; // 项目开标的时间
+                address: string; // 项目开标的地点
+                release_date: string; // 招标信息的发布时间
+                summary: string; // 100字左右的招标条件,联系方式等内容摘要
+                devices: string; // 只涉及到光谱仪相关的设备,其他无关设备不需要,多个设备以逗号分割 ,例如 红外光谱仪,拉曼光谱仪等
+            }
+            ```
             """
-    DEFAULT_AI_PROMPT_TEMPLATE_2 = """在以上内容中提取信息:
-            编号(no) 、标题(title)、公告时间(date)、标中的总价格(price)、标中的涉及光谱仪的设备及厂商(devices)、标中的公司,多个以逗号分割(bidder)、150-300字的标的物说明,标的物价格,公司的明细等内容摘要(summary),设备(devices)。
-            提取出相关设备的名称信息,多个设备以逗号分割。返回包含no,title,date,devices,price,bidder,summary字段的json格式字符串,没有找到或未提供的信息json字段为空  """
+    DEFAULT_AI_PROMPT_TEMPLATE_2 = """在以上内容中提取中标信息:
+            ```typescript
+            export interface Instrument { // 中标仪器信息
+              company: string; // 中标单位名称,参与竞标并中标的公司名称
+              name: string; // 仪器名称,例如:红外光谱仪
+              manufacturer: string; // 仪器厂商,例如:赛默飞、Bruker
+              model: string; // 仪器的型号/规格,例如:NIR25S
+              quantity: number; // 中标仪器的数量,台数,例如:2
+              unit_price: number; // 仪器的单价,单位转换为元,例如:178000.00
+            }
+            export interface BiddingAcceptance { //中标信息
+              no: string; // 项目编号
+              title: string; // 中标公告标题
+              date: string; // 中标公告时间,格式为yyyy-MM-dd 例如:2025-01-01
+              provice: string; // 招标单位省份
+              city: string; // 招标单位城市
+              summary: string; // 公告摘要信息,100字左右
+              instruments: Instrument[]; // 中标设备的信息,关于光谱仪的设备,其他设备不要
+            }
+            ```
+           """
 
     def __init__(self, store: IDataStore):
         self._store = store
@@ -111,20 +139,21 @@ class DataProcess:
             data = utils.call_openai(
                 self._ai_system_prompt, f"{item.content} {self._ai_prompt_template_1}"
             )
-            area_str = data.get("area")
-
-            if "省" in area_str:
-                area_str_arr = area_str.split("省")
-                area_str = area_str_arr[1] if len(area_str_arr) > 1 else area_str_arr[0]
-            if "市" in area_str:
-                area_str_arr = area_str.split("市")
-                area_str = area_str_arr[0]
+            # area_str = data.get("area")
+            #
+            # if "省" in area_str:
+            #     area_str_arr = area_str.split("省")
+            #     area_str = area_str_arr[1] if len(area_str_arr) > 1 else area_str_arr[0]
+            # if "市" in area_str:
+            #     area_str_arr = area_str.split("市")
+            #     area_str = area_str_arr[0]
 
             return ProcessData(
                 no=data.get("no"),
                 title=data.get("title"),
                 date=data.get("date"),
-                area=area_str,
+                provice=data.get("provice"),
+                city=data.get("city"),
                 address=data.get("address"),
                 devices=data.get("devices"),
                 summary=data.get("summary"),
@@ -142,18 +171,19 @@ class DataProcess:
             data = utils.call_openai(
                 self._ai_system_prompt, f"{item.content} {self._ai_prompt_template_2}"
             )
-            return ProcessResultData(
+            result = ProcessResultData(
                 no=data.get("no"),
                 title=data.get("title"),
+                provice=data.get("provice"),
+                city=data.get("city"),
                 date=data.get("date"),
-                devices=data.get("devices"),
-                price=data.get("price"),
-                bidder=data.get("bidder"),
+                instruments_o=data.get("instruments"),
                 summary=data.get("summary"),
                 prompt_tokens=data.get("prompt_tokens"),
                 completion_tokens=data.get("completion_tokens"),
                 total_tokens=data.get("total_tokens"),
             )
+            return result
         except Exception as e:
             utils.get_logger().error(f"AI 提取数据失败2: {item.url} {e}")
             return None

+ 109 - 49
SourceCode/TenderCrawler/app/jobs/data_send.py

@@ -3,7 +3,7 @@ from datetime import datetime
 
 import utils
 from models.process_data import ProcessData
-from models.process_result_data import ProcessResultData
+from models.process_result_data import ProcessResultData, InstrumentData
 from stores.data_store_interface import IDataStore
 
 
@@ -58,12 +58,12 @@ class DataSend:
             utils.get_logger().info("发送中标报告邮件成功")
 
     def _send_item(self, item: ProcessData) -> None:
-        utils.get_logger().info(f"开始发送邮件,地区为:{item.area} ,URL为 {item.url}")
-        email = self._get_email_by_area(item.area)
+        utils.get_logger().info(f"开始发送邮件,地区为:{item.city} ,URL为 {item.url}")
+        email = self._get_email_by_area(item.city)
         if not email:
-            utils.get_logger().error(f"{item.area} 下没有找到email")
-            if item.area not in self._error_arr:
-                self._error_arr.append(item.area)
+            utils.get_logger().error(f"{item.city} 下没有找到email")
+            if item.city not in self._error_arr:
+                self._error_arr.append(item.city)
             return
         title_prev = utils.get_config_value("email.title_prev", "【招标信息】")
         body = self._build_email_html(item)
@@ -77,13 +77,9 @@ class DataSend:
         self, area: str, count: int = 0, virtual_area: str = None
     ) -> str:
         email = None
-        area_str = area
-        # if "省" in area:
-        #     area_str_arr = area.split("省")
-        #     area_str = area_str_arr[1] if len(area_str) > 1 else area_str_arr[0]
-        # if "市" in area:
-        #     area_str_arr = area.split("市")
-        #     area_str = area_str_arr[0]
+        area_str = (
+            area.replace("省", "").replace("市", "").replace("区", "").replace("县", "")
+        )
         for area_item in self._email_area_arr:
             if area_str in area_item.area:
                 email = area_item.email
@@ -159,11 +155,13 @@ class DataSend:
         <body>
             <div class="container">
                 <h1>{item.title}</h1>
-                <p><strong>发布日期:</strong> {item.release_date}</p>
-                <p><strong>招标编号:</strong> {item.no}</p>
-                <p><strong>开标时间:</strong> {item.date}</p>
-                <p><strong>开标地点:</strong> {item.address}</p>
-                <p><strong>标书摘要:</strong> {item.summary}</p>
+                <p><strong>招标编号:</strong> {item.no if item.no else ""}</p>
+                <p><strong>项目区域:</strong> {item.provice if item.provice else ""}{item.city if item.city else ""}</p>
+                <p><strong>相关设备:</strong> {item.devices if item.devices else ""}</p>
+                <p><strong>开标时间:</strong> {item.date if item.date else ""}</p>
+                <p><strong>开标地点:</strong> {item.address if item.address else ""}</p>
+                <p><strong>发布日期:</strong> {item.release_date if item.release_date else ""}</p>
+                <p><strong>标书摘要:</strong> {item.summary if item.summary else ""}</p>
                 <div class="button-container">
                     <a href="{item.url}" class="button">查看详情</a>
                 </div>
@@ -195,7 +193,7 @@ class DataSend:
                     color: #333;
                 }}
                 .container {{
-                    max-width: 1000px;
+                    max-width: 1200px;
                     margin: 0 auto;
                     background-color: #fff;
                     padding: 20px;
@@ -211,7 +209,7 @@ class DataSend:
                     width: 100%;
                 }}
                 .table {{
-                    width: 1000px;
+                    width: 1200px;
                     background-color: #ffffff;
                     border: 1px solid #dddddd;
                     border-radius: 8px;
@@ -227,9 +225,10 @@ class DataSend:
                     text-align: center;
                     font-size:12px;
                 }}
-                .table th:not(:first-child), .table td:not(:first-child) {{
+                .table th:not(:first-child), .table td:not(:first-child) ,.table tr.instrument-row th,.table tr.instrument-row td{{
                     border-left: 1px solid #dddddd;
                 }}
+                 
                 .table th {{
                     padding: 10px;
                     background-color: #f8f9fa;
@@ -255,8 +254,7 @@ class DataSend:
         """
         return html
 
-    @staticmethod
-    def _build_report_email_body(items: list[ProcessResultData]) -> str:
+    def _build_report_email_body(self, items: list[ProcessResultData]) -> str:
         if not items:
             return ""
 
@@ -264,40 +262,102 @@ class DataSend:
         <div class="table-container">
             <table class="table">
                 <tr>
-                    <th style="width:250px">项目名称</th>
-                    <th>设备</th>
-                    <th style="width:150px">公告日期</th>
-                    <th style="width:120px">价格</th>
+                    <th style="width:200px" rowspan="2">项目名称</th>
+                    <th style="width:100px" rowspan="2">地区</th>
+                    <th colspan="6" style="width:800px">中标设备</th>
+                    <th style="width:100px" rowspan="2">公告日期</th>
+                </tr>
+                <tr class='instrument-row'>
+                    <th style="180px">仪器名称</th>
+                    <th style="200px">仪器厂商</th>
+                    <th style="130px">仪器型号</th>
+                    <th style="80px">数量</th>
+                    <th style="140px">单价(元)</th>
+                    <th>中标单位</th>
                 </tr>
         """
         for item in items:
-            body += f"""
-            <tr>
-                <td><a  title="点击查看详情" href="{item.url}">{item.title}</a></td>
-                <td>{item.devices}</td>
-                <td>{item.date}</td>
-                <td>{item.price}</td>
-            </tr>
-            """
+            body += self._gen_report_body_item(item)
         body += "</table></div>"
         return body
 
-    @staticmethod
-    def _gen_report_exlecl(title, items: list[ProcessResultData]) -> str:
-        if not items:
+    def _gen_report_body_item(self, item: ProcessResultData):
+        """Build the report-table rows (HTML) for a single result item.
+
+        The first ``<tr>`` holds the project cells (title link, region, date)
+        together with the first instrument; each further instrument gets its
+        own ``instrument-row`` ``<tr>``. Returns "" when ``item`` is falsy.
+        """
+        if not item:
             return ""
-            # 将 list 数据转换为 DataFrame
-        data = {
-            "项目编号": [item.no for item in items],
-            "项目名称": [item.title for item in items],
-            "公告日期": [item.date for item in items],
-            "相关设备": [item.devices for item in items],
-            "价格": [item.price for item in items],
-            "中标人": [item.bidder for item in items],
-            "公告摘要": [item.summary for item in items],
-            "URL": [item.url for item in items],
+        # The project cells span one row per instrument; with no instruments a
+        # single row is still emitted and carries the placeholder cell.
+        row_count = len(item.instruments) if item.instruments else 1
+        html = f"""
+                  <tr>
+                      <td rowspan="{row_count}"><a title="点击查看详情" href="{item.url}">{item.title}</a></td>
+                      <td rowspan="{row_count}">{item.provice if item.provice else ''}{item.city if item.city else ''}</td>
+                        {self._gen_report_body_item_instrument(item.instruments[0] if item.instruments else None)}
+                      <td rowspan="{row_count}">{item.date if item.date else ''}</td>
+                  </tr>
+                  """
+        # Remaining instruments: one extra row each, instrument cells only.
+        if row_count > 1:
+            for instrument in item.instruments[1:]:
+                html += f"""
+                  <tr class="instrument-row">
+                     {self._gen_report_body_item_instrument(instrument)}
+                  </tr>
+                  """
+        return html
+
+    @staticmethod
+    def _gen_report_body_item_instrument(instrument):
+        """Render the six instrument ``<td>`` cells for one report row.
+
+        Cell order: name, manufacturer, model, quantity, unit price, company.
+        A falsy ``instrument`` yields a single placeholder cell spanning all
+        six columns. Falsy field values render as an empty cell.
+        """
+        # Local import so the block does not depend on the file's import list.
+        from html import escape
+
+        if not instrument:
+            return '<td colspan="6">无设备信息</td>'
+
+        def cell(value):
+            # The values are text extracted from crawled pages; escape them so
+            # characters such as "<" or "&" cannot break the e-mail markup.
+            return escape(str(value)) if value else ""
+
+        return f"""
+                      <td>{cell(instrument.name)}</td>
+                      <td>{cell(instrument.manufacturer)}</td>
+                      <td>{cell(instrument.model)}</td>
+                      <td>{cell(instrument.quantity)}</td>
+                      <td>{cell(instrument.unit_price)}</td>
+                      <td>{cell(instrument.company)}</td>
+                 """
+
+    def _gen_report_exlecl(self, title, items: list[ProcessResultData]) -> str:
+        """Flatten ``items`` into one row per instrument and save them as Excel.
+
+        Only the first instrument row of an item carries the project columns;
+        the following rows of the same item leave them blank. An item without
+        instruments still produces one row. Returns "" when ``items`` is empty
+        or None, otherwise whatever ``utils.save_reort_excel`` returns.
+        """
+        # Guard restored from the previous implementation: nothing to export.
+        if not items:
+            return ""
+
+        all_data = []
+        for item in items:
+            if not item.instruments:
+                # No instruments: emit the project fields only.
+                all_data.append(self._gen_report_row_data(item, None))
+                continue
+            for index, instrument in enumerate(item.instruments):
+                # Project fields are written once, on the first instrument row.
+                owner = item if index == 0 else None
+                all_data.append(self._gen_report_row_data(owner, instrument))
+
+        return utils.save_reort_excel(all_data, title)
+
+    @staticmethod
+    def _gen_report_row_data(
+        data: ProcessResultData | None, instrument: InstrumentData | None
+    ):
+        """Build one report row as a dict keyed by the column titles.
+
+        Either argument may be None; the columns it would fill are then set
+        to "". Falsy field values are also written as "".
+        """
+        return {
+            "项目编号": data.no if data and data.no else "",
+            "项目名称": data.title if data and data.title else "",
+            "公告日期": data.date if data and data.date else "",
+            "招标省份": data.provice if data and data.provice else "",
+            "招标城市": data.city if data and data.city else "",
+            # Guard against instrument being None, like every other
+            # instrument column; callers pass None for items without
+            # instruments.
+            "中标单位名称": (
+                instrument.company if instrument and instrument.company else ""
+            ),
+            "仪器名称": instrument.name if instrument and instrument.name else "",
+            "仪器厂商": (
+                instrument.manufacturer
+                if instrument and instrument.manufacturer
+                else ""
+            ),
+            "仪器型号": instrument.model if instrument and instrument.model else "",
+            "数量": instrument.quantity if instrument and instrument.quantity else "",
+            "单价": (
+                instrument.unit_price if instrument and instrument.unit_price else ""
+            ),
+            "公告摘要": data.summary if data and data.summary else "",
+            "URL": data.url if data and data.url else "",
         }
-        return utils.save_reort_excel(data, title)
 
     def _send_email_no_found(self) -> None:
         email = utils.get_config_value("email.error_email")

+ 1 - 1
SourceCode/TenderCrawler/app/jobs/job_runner.py

@@ -88,7 +88,7 @@ class JobRunner:
                 # self._clean_job()
                 # self._process_job()
                 # self._send_job()
-                # self._send_current_month_report_job()
+                # DataSend(self.store).send_report_current_month()
 
         except Exception as e:
             utils.get_logger().error(f"应用程序停止: {e}")

+ 16 - 10
SourceCode/TenderCrawler/app/models/process_data.py

@@ -13,7 +13,8 @@ class ProcessData:
         url=None,
         keyword=None,
         date=None,
-        area=None,
+        provice=None,
+        city=None,
         address=None,
         summary=None,
         release_date=None,
@@ -32,9 +33,12 @@ class ProcessData:
         self.title = title
         self.url = url
         self.date = date
-        if not area:
-            area = utils.get_config_value("default_area", "全国")
-        self.area = area.replace(" ", "")
+        self.provice = provice.replace("省", "").replace("市", "") if provice else ""
+        self.city = (
+            city.replace("市", "").replace("区", "").replace("县", "") if city else ""
+        )
+        if self.provice == self.city:
+            self.provice = ""
         self.keyword = keyword
         self.address = address
         self.summary = summary
@@ -53,14 +57,14 @@ class ProcessData:
     def __repr__(self):
         return (
             f"ProcessData(no={self.no}, title={self.title}, date={self.date}, "
-            f"area={self.area}, address={self.address}, summary={self.summary}, "
+            f"provice={self.provice},city={self.city}, address={self.address}, summary={self.summary}, "
             f"status={self.status}, create_time={self.create_time}, "
             f"send_time={self.send_time}, remark={self.remark})"
         )
 
     _insert_query = """
-              INSERT IGNORE INTO t_data (no, title, url, keyword, date, area, address, summary, release_date, devices, attach_path, status, create_time, prompt_tokens, completion_tokens, total_tokens)
-              VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+              INSERT IGNORE INTO t_data (no, title, url, keyword, date, provice, city, address, summary, release_date, devices, attach_path, status, create_time, prompt_tokens, completion_tokens, total_tokens)
+              VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
           """
 
     # _update_query = """
@@ -76,7 +80,8 @@ class ProcessData:
             process_data.url,
             process_data.keyword,
             process_data.date,
-            process_data.area,
+            process_data.provice,
+            process_data.city,
             process_data.address,
             process_data.summary,
             process_data.release_date,
@@ -110,7 +115,8 @@ class ProcessData:
                 process_data.url,
                 process_data.keyword,
                 process_data.date,
-                process_data.area,
+                process_data.provice,
+                process_data.city,
                 process_data.address,
                 process_data.summary,
                 process_data.release_date,
@@ -171,7 +177,7 @@ class ProcessData:
             )
             return data
 
-    _not_send_query = "SELECT no, title, url, keyword, date, area, address, summary, attach_path, release_date FROM t_data WHERE status = 0"
+    _not_send_query = "SELECT no, title, url, keyword, devices,date, city, address, summary, attach_path, release_date FROM t_data WHERE status = 0"
 
     def fetch_not_send(self):
         with MySQLHelper() as db_helper:

+ 68 - 19
SourceCode/TenderCrawler/app/models/process_result_data.py

@@ -1,9 +1,31 @@
+import json
 import utils
 from datetime import datetime
 
 from utils.mysql_helper import MySQLHelper
 
 
+class InstrumentData:
+    def __init__(
+        self,
+        company: str,
+        name: str,
+        manufacturer: str,
+        model: str,
+        quantity: int,
+        unit_price: float,
+    ):
+        self.company = company  # 中标单位名称,参与竞标并中标的公司名称
+        self.name = name  # 仪器名称,例如:红外光谱仪
+        self.manufacturer = manufacturer  # 仪器厂商,例如:赛默飞、Bruker
+        self.model = model  # 仪器的型号 / 规格,例如:NIR25S
+        self.quantity = quantity  # 中标仪器的数量,台数,例如:2
+        self.unit_price = unit_price  # 仪器的单价,单位转
+
+    def to_str(self) -> str:
+        return f"[名称:{self.name},中标单位:{self.company},仪器厂商:{self.manufacturer},仪器规格:{self.model},数量:{self.quantity},单价:{self.unit_price}]"
+
+
 class ProcessResultData:
 
     def __init__(
@@ -12,10 +34,11 @@ class ProcessResultData:
         title=None,
         url=None,
         keyword=None,
+        provice=None,
+        city=None,
         date=None,
-        devices=None,
-        price=None,
-        bidder=None,
+        instruments=None,
+        instruments_o=None,
         summary=None,
         attach_path=None,
         status=None,
@@ -32,9 +55,15 @@ class ProcessResultData:
         self.url = url
         self.keyword = keyword
         self.date = date
-        self.devices = devices
-        self.price = price
-        self.bidder = bidder
+        self.instruments_str = ""
+        self.instruments = []
+        self.set_instruments(instruments, instruments_o)
+        self.provice = provice.replace("省", "").replace("市", "") if provice else ""
+        self.city = (
+            city.replace("市", "").replace("区", "").replace("县", "") if city else ""
+        )
+        if self.provice == self.city:
+            self.provice = ""
         self.summary = summary
         self.attach_path = attach_path
         self.status = status
@@ -49,13 +78,33 @@ class ProcessResultData:
     def __repr__(self):
         return (
             f"ProcessResultData(no={self.no}, title={self.title}, date={self.date}, "
-            f"keyword={self.keyword}, price={self.price}, bidder={self.bidder}, summary={self.summary}, attach_path={self.attach_path}, "
+            f"keyword={self.keyword}, provice={self.provice},city={self.city},instruments={self.instruments_str} summary={self.summary}, attach_path={self.attach_path}, "
             f"status={self.status}, create_time={self.create_time}, "
             f"send_time={self.send_time}, remark={self.remark})"
         )
 
+    def set_instruments(self, instruments_str: str, instruments):
+        if instruments is None:
+            instruments = []
+        if instruments_str:
+            self.instruments_str = instruments_str
+            self.instruments = [
+                InstrumentData(**instrument)
+                for instrument in json.loads(self.instruments_str)
+            ]
+        else:
+            self.instruments = instruments or []
+            self.instruments_str = (
+                json.dumps(
+                    instruments,
+                    ensure_ascii=False,
+                )
+                if len(instruments) > 0
+                else ""
+            )
+
     _insert_query = """
-              INSERT IGNORE INTO t_data_result (no, title, url, keyword, date, devices, price,  bidder, summary, attach_path, status, create_time, prompt_tokens, completion_tokens, total_tokens)
+              INSERT IGNORE INTO t_data_result (no, title, url, keyword, date, provice, city, instruments, summary, attach_path, status, create_time, prompt_tokens, completion_tokens, total_tokens)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
           """
     # _update_query = """
@@ -72,9 +121,9 @@ class ProcessResultData:
             process_result_data.url,
             process_result_data.keyword,
             process_result_data.date,
-            process_result_data.devices,
-            process_result_data.price,
-            process_result_data.bidder,
+            process_result_data.provice,
+            process_result_data.city,
+            process_result_data.instruments_str,
             process_result_data.summary,
             process_result_data.attach_path,
             0,
@@ -107,9 +156,9 @@ class ProcessResultData:
                 process_result_data.url,
                 process_result_data.keyword,
                 process_result_data.date,
-                process_result_data.devices,
-                process_result_data.price,
-                process_result_data.bidder,
+                process_result_data.provice,
+                process_result_data.city,
+                process_result_data.instruments_str,
                 process_result_data.summary,
                 process_result_data.attach_path,
                 0,
@@ -142,10 +191,10 @@ class ProcessResultData:
             if not result:
                 return None
             data = ProcessResultData(
-                url=result["url"],
                 no=result["no"],
-                other_urls=result["other_urls"],
+                url=result["url"],
                 attach_path=result["attach_path"],
+                other_urls=result["other_urls"],
             )
             return data
 
@@ -159,14 +208,14 @@ class ProcessResultData:
             if not result:
                 return None
             data = ProcessResultData(
-                url=result["url"],
                 no=result["no"],
-                other_urls=result["other_urls"],
+                url=result["url"],
                 attach_path=result["attach_path"],
+                other_urls=result["other_urls"],
             )
             return data
 
-    _not_send_query = "SELECT no, title, url, keyword, date, devices, price, bidder, summary, attach_path, status, create_time, send_time FROM t_data_result WHERE status = 0"
+    _not_send_query = "SELECT no, title, url, keyword, date, provice, city, instruments, summary, attach_path, status, create_time, send_time FROM t_data_result WHERE status = 0"
 
     def fetch_not_send(self):
         with MySQLHelper() as db_helper:

+ 4 - 1
SourceCode/TenderCrawler/app/utils/ai_helper.py

@@ -43,6 +43,8 @@ class AiHelper:
             ],
             stream=False,
             temperature=0.7,
+            response_format={"type": "json_object"},
+            # max_tokens=self._ai_max_tokens,
         )
         try:
             response = completion.model_dump_json()
@@ -55,7 +57,7 @@ class AiHelper:
                 result["completion_tokens"] = usage.get("completion_tokens", 0)
                 result["prompt_tokens"] = usage.get("prompt_tokens", 0)
                 result["total_tokens"] = usage.get("total_tokens", 0)
-                # utils.get_logger().info(f"AI Process JSON: {result}")
+                utils.get_logger().info(f"AI Process JSON: {result}")
             else:
                 utils.get_logger().info(f"AI Response: {response}")
             return result
@@ -64,6 +66,7 @@ class AiHelper:
 
     @staticmethod
     def _extract_message_content(response_json: dict) -> str:
+        utils.get_logger().info(f"AI Response JSON: {response_json}")
         if "choices" in response_json and len(response_json["choices"]) > 0:
             choice = response_json["choices"][0]
             message_content = choice.get("message", {}).get("content", "")

+ 2 - 2
SourceCode/TenderCrawler/docker-compose.yml

@@ -41,7 +41,7 @@ services:
 
   crawler-app:
     build: .
-    image: y_tender-crawler-app:2.0.0
+    image: y_tender-crawler-app:2.1.0
     container_name: y_tender-crawler-app
     depends_on:
       - crawler-mysql
@@ -60,7 +60,7 @@ services:
       - APP_JOB__COLLECT=20:00,12:00
       - APP_JOB__PROCESS=23:00,4:00,13:00
       - APP_JOB__SEND_EMAIL=08:20,14:00
-      - APP_JOB__RUN_NOW=0
+      - APP_JOB__RUN_NOW=1
       - APP_SELENIUM__REMOTE_DRIVER_URL=http://y_selenium:4444/wd/hub
     volumes:
       - /home/docker/tender-crawler_v2/app/config.yml:/app/config.yml

+ 7 - 5
SourceCode/TenderCrawler/init.sql

@@ -62,7 +62,7 @@ INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`,
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('上海', '黄浦区,徐汇区,长宁区,静安区,普陀区,虹口区,杨浦区,宝山区,闵行区,嘉定区,浦东新区,金山区,松江区,青浦区,奉贤区,崇明区', '', 1, 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('天津', '和平区,河东区,河西区,南开区,河北区,红桥区,东丽区,西青区,北辰区,武清区,宝坻区,滨海新区,宁河区,静海区,蓟州区', '', 1, 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('重庆', '渝中区,大渡口区,沙坪坝区,九龙坡区,南岸区,北碚区,渝北区,巴南区,长寿区,江北区,沙坪坝区,九龙坡区,南岸区,北碚区,渝北区,巴南区,长寿区,永川区,合川区,南川区,璧山区,江津区,合川区,永川区,南川区,璧山区,江津区,合川区,永川区,南川区,璧山区,江津区', '', 1, 1, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('江苏', '南京,苏州,无锡,常州,镇江,南通,扬州,盐城,连云港,淮安,宿迁,泰州,徐州', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('江苏', '南京,苏州,无锡,常州,镇江,南通,扬州,盐城,连云港,淮安,宿迁,泰州,徐州,溧阳', '', 1, 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('黑龙江','哈尔滨,齐齐哈尔,鸡西,鹤岗,双鸭山,大庆,伊春,佳木斯,七台河,牡丹江,黑河,绥化,大兴安岭地区', '', 1, 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('吉林', '长春,吉林,白山,延边朝鲜族自治州', '', 1, 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('辽宁', '沈阳,大连,鞍山,抚顺,本溪,丹东,锦州,营口,阜新,辽阳,盘锦,铁岭,朝阳,葫芦岛', '', 1, 1, NULL);
@@ -133,7 +133,8 @@ CREATE TABLE `t_data`  (
   `no` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标编号',
   `title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标标题',
   `date` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标时间',
-  `area` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标地区',
+  `provice` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标单位省份',
+  `city` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标单位城市',
   `address` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '详细地点',
   `summary` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL COMMENT '招标摘要',
   `release_date` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '发布时间',
@@ -160,9 +161,10 @@ CREATE TABLE `t_data_result`  (
   `no` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标编号',
   `title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标标题',
   `date` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '公告时间',
-  `price` varchar(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '中标金额',
-  `bidder` varchar(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '中标人',
-  `summary` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL COMMENT '招标摘要',
+  `provice` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标单位省份',
+  `city` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标单位城市',
+  `instruments` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '相关设备仪器',
+  `summary` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL COMMENT '公告摘要',
   `attach_path` varchar(2000) NULL DEFAULT NULL COMMENT '附件路径',
   `status` int(4) NULL DEFAULT 0 COMMENT '状态 0:未推送 1:已推送',
   `create_time` datetime NULL DEFAULT NULL COMMENT '创建时间',