Browse Source

Merge branch 'V2.0.0' of Crawler/TenderCrawler into dev

YueYunyun 6 months ago
parent
commit
49b95eee8e

+ 201 - 0
SourceCode/TenderCrawler/README.md

@@ -0,0 +1,201 @@
+# 招投标数据采集系统
+
+## 项目说明
+
+本系统用于自动采集和处理招投标信息,主要功能包括:
+
+- 数据采集:从多个招投标网站自动采集数据
+- 数据处理:使用 AI 处理和分析采集的数据
+- 数据分发:通过邮件发送处理后的数据
+- 数据清理:定期清理历史数据
+
+## 系统架构
+
+### 核心模块
+
+- 数据采集模块 (DataCollector)
+- 数据处理模块 (DataProcess)
+- 数据发送模块 (DataSend)
+- 数据清理模块 (DataClean)
+
+### 关键组件
+
+- 适配器 (Adapters): 负责对接不同的数据源
+- 存储层 (Stores): 负责数据持久化
+- 工具类 (Utils): 提供通用功能支持
+
+## 配置说明
+
+### 基础配置
+
+```yaml
+# 数据采集配置
+save:
+  collect_data_key: '红外光谱仪,拉曼光谱仪' # 采集数据关键词过滤
+  collect_batch_size: 100 # 采集数据批量保存大小
+  process_batch_size: 1 # AI处理数据批量大小
+  attach_file_path: './temp_files/attaches/' # 附件保存路径
+  report_file_path: './temp_files/report/' # 报表保存路径
+
+# 任务调度配置
+job:
+  event_id: 1 # 任务ID,改变此值会触发任务重新加载
+  sleep_interval: 10 # 任务检查间隔(秒)
+  collect: '06:00,22:00' # 每天数据采集时间点
+  process: '07:00,10:00' # 每天数据处理时间点
+  send_email: '08:20,14:00' # 每天邮件发送时间点
+  clean_data: '00:05' # 每天数据清理时间点
+
+  # 月度报告配置
+  send_current_month_report_day: 30 # 每月发送当月报告的日期
+  send_current_month_report_time: '08:20' # 发送当月报告的时间点
+  send_prev_month_report_day: 1 # 每月发送上月报告的日期
+  send_prev_month_report_time: '08:20' # 发送上月报告的时间点
+
+  run_now: false # 是否立即执行一次任务
+
+# 数据清理配置
+clean:
+  day: 30 # 默认清理天数(所有类型默认使用此值)
+  attach: 30 # 附件清理天数
+  log: 30 # 日志清理天数
+  collect_data: 30 # 采集数据清理天数
+  process_data: 30 # 招标数据清理天数
+  process_result_data: 60 # 中标数据清理天数(最小45天)
+  report: 90 # 报表清理天数(最小60天)
+
+# AI配置
+ai:
+  key: 'your-api-key' # AI API密钥
+  url: 'https://api-url' # AI API地址
+  model: 'model-name' # 使用的模型名称
+  # 系统提示词
+  system_prompt: '从给定信息中提取出关键信息,并以给定的类型返回json字符串,如果部分信息为空,则该字段返回为空' 
+  # 模板1 处理招标信息
+  prompt_template_1: '在以上内容中提取招标信息:
+    ```typescript
+    export interface Tender { //招标信息
+    no: string; // 招标项目编号
+    title: string; // 招标公告标题
+    province: string; // 招标单位省份
+    city: string; // 招标单位城市
+    date: string; // 项目开标的时间
+    address: string; // 项目开标的地点
+    release_date: string; // 招标信息的发布时间
+    summary: string; // 100字左右的招标条件,联系方式等内容摘要
+    devices: string; // 只涉及到光谱仪相关的设备,其他无关设备不需要,多个设备以逗号分割 ,例如 红外光谱仪,拉曼光谱仪等
+    }
+    ```' 
+  # 模板2 处理中标信息
+  prompt_template_2: '在以上内容中提取中标信息:
+    ```typescript
+    export interface Instrument { // 中标仪器信息
+    company: string; // 中标单位名称,参与竞标并中标的公司名称
+    name: string; // 仪器名称,例如:红外光谱仪
+    manufacturer: string; // 仪器厂商,例如:赛默飞、Bruker
+    model: string; // 仪器的型号/规格,例如:NIR25S
+    quantity: number; // 中标仪器的数量,台数,例如:2
+    unit_price: number; // 仪器的单价,单位转换为元,例如:178000.00
+    }
+    export interface BiddingAcceptance { //中标信息
+    no: string; // 项目编号
+    title: string; // 中标公告标题
+    date: string; // 中标公告时间
+    province: string; // 招标单位省份
+    city: string; // 招标单位城市
+    summary: string; // 公告摘要信息,100字左右
+    instruments: Instrument[]; // 中标设备的信息
+    }
+    ```'
+# 邮件配置
+email:
+  smtp_server: 'smtp.example.com' # SMTP服务器地址
+  smtp_port: 465 # SMTP端口
+  smtp_user: 'user@example.com' # SMTP用户名
+  smtp_password: 'password' # SMTP密码
+  from_email: 'from@example.com' # 发件人地址
+  error_email: 'error@example.com' # 错误通知邮箱
+  default_email: 'default@example.com' # 默认收件人地址,所有业务邮件都发送到该地址
+
+# 数据库配置(也可以通过 docker-compose 中的环境变量进行配置)
+mysql:
+  host: 'localhost' # 数据库主机
+  port: 3306 # 数据库端口
+  db: 'database_name' # 数据库名
+  user: 'root' # 数据库用户名
+  password: 'password' # 数据库密码
+  charset: 'utf8mb4' # 字符集
+```
+
+### 数据源配置
+
+```yaml
+adapter:
+  max_retries: 3 # 最大重试次数
+  # 中国政府采购网配置
+  ccgp:
+    search_day: '近3日' # 搜索时间范围
+    model_name: 'ccgp_data_collection_adapter' # 适配器模块名
+    class_name: 'CCGPDataCollectionAdapter' # 适配器类名
+    batch_save: false # 是否批量保存数据
+
+  # 中国采购与招标网配置
+  chinabidding:
+    search_day: '近一周' # 搜索时间范围
+    model_name: 'chinabidding_data_collection_adapter' # 适配器模块名
+    class_name: 'ChinabiddingDataCollectionAdapter' # 适配器类名
+    batch_save: true # 是否批量保存数据
+
+# Selenium配置
+selenium:
+  remote_driver_url: 'http://127.0.0.1:3534/wd/hub' # WebDriver地址
+```
+
+### 日志配置
+
+```yaml
+logger:
+  file-path: './logs/' # 日志文件路径
+  level: 'debug' # 日志级别
+```
+
+## 部署说明
+
+### 环境要求
+
+- Python 3.8+
+- MySQL 5.7+
+- Selenium WebDriver
+
+### 安装步骤
+
+1. 安装依赖: `pip install -r requirements.txt`
+2. 配置数据库: 执行 `init.sql`
+3. 修改配置: 编辑 `config.yml`
+4. 启动应用: `python app/main.py`
+
+### 目录结构
+
+```
+app/
+├── adapters/           # 数据源适配器
+├── drivers/            # 浏览器驱动
+├── jobs/              # 任务处理模块
+├── models/            # 数据模型
+├── stores/            # 数据存储
+├── utils/             # 工具类
+├── config.yml         # 配置文件
+└── main.py           # 主程序
+```
+
+## 数据采集说明
+
+### 采集流程
+
+1. 系统按配置的时间点(`job.collect`)自动启动采集任务
+2. 根据配置的数据源(`adapter`)和关键词(`save.collect_data_key`)进行数据采集
+3. 采集到的数据经过关键词过滤后保存到数据库
+4. 相关附件下载到指定目录(`save.attach_file_path`)
+
+
+

+ 0 - 4
SourceCode/TenderCrawler/app/adapters/__init__.py

@@ -4,7 +4,3 @@ from stores.data_store_interface import IDataStore
 
 def collect(adapter: IDataCollectionAdapter, keywords: str, store: IDataStore = None):
     adapter.collect(keywords, store)
-
-
-def teardown(adapter: IDataCollectionAdapter):
-    adapter.teardown()

+ 194 - 135
SourceCode/TenderCrawler/app/adapters/ccgp_data_collection_adapter.py

@@ -1,172 +1,213 @@
 from time import sleep
+from typing import List, Optional
 
 from selenium.common.exceptions import TimeoutException, NoSuchElementException
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as ec
 
-
 import utils
 from adapters.data_collection_adapter_interface import IDataCollectionAdapter
-from stores.data_store_interface import IDataStore
 
 
-class CcgpDataCollectionAdapter(IDataCollectionAdapter):
-    """
-    中国政府采购网数据采集适配器
-    """
+class CCGPDataCollectionAdapter(IDataCollectionAdapter):
+    """中国政府采购网数据采集适配器"""
 
-    def __init__(self, url: str, store: IDataStore = None):
-        self._url = url
-        self._store = store
-        self._driver = None
-        self._keyword = None
-        self._adapter_type = "ccgp"
-        self._next_count = 0
+    def __init__(self, url: str):
+        """初始化适配器
+
+        Args:
+            url: 目标网站URL
+        """
+        super().__init__(url, "ccgp", "近1周")
 
     def login(self, username: str, password: str) -> None:
+        """登录网站(CCGP无需登录)"""
         pass
 
-    def _collect(self, keyword: str):
-        items = self._search(keyword)
-        if len(items) <= 0:
-            return
-        self._process_list(items)
-        if utils.get_config_bool(self.batch_save_key):
-            self.store.save_collect_data(True)
+    def _collect(self, keyword: str) -> None:
+        """执行数据采集
 
-    def _search(self, keyword: str) -> list:
+        Args:
+            keyword: 单个搜索关键词
+        """
         try:
-            if not keyword:
-                raise Exception("搜索关键字不能为空")
-            self.driver.get(self._url)
-            if not self._wait_until(
-                ec.presence_of_element_located((By.ID, "searchForm"))
-            ):
-                return []
-            search_el = self.driver.find_element(By.ID, "kw")
-            sleep(2)
-            search_el.clear()
-            search_el.send_keys(keyword)
-            search_btn = self.driver.find_element(
-                By.XPATH, "//form[@id='searchForm']/input[@id='doSearch2']"
-            )
-            sleep(1)
-            search_btn.click()
-            self._next_count = 0
-            if not self._wait_until(
-                ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result"))
-            ):
-                return []
-            default_search_txt = "近1周"
-            search_txt = utils.get_config_value(self.search_day_key, default_search_txt)
-            utils.get_logger().debug(f"搜索日期条件: {search_txt}")
-            if search_txt != default_search_txt:
+            # 获取搜索时间范围
+
+            self.logger.info(f"开始采集关键词: {keyword}, 时间范围: {self._search_txt}")
+
+            # 搜索数据
+            items = self._search(keyword)
+            if not items:
+                return
+
+            # 处理数据列表
+            self._process_list(items)
+
+        except Exception as e:
+            self.logger.error(f"采集失败: {e}")
+            raise
+
+    def _search(self, keyword: str) -> List:
+        """搜索数据
+
+        Args:
+            keyword: 搜索关键词
+
+        Returns:
+            List: 搜索结果列表
+        """
+        # 打开搜索页面
+        self.driver.get(self.url)
+
+        # 等待搜索框
+        self._wait_for(
+            ec.presence_of_element_located((By.ID, "searchForm")),
+            message="搜索框加载超时",
+        )
+
+        # 输入关键词
+        search_el = self.driver.find_element(By.ID, "kw")
+        sleep(2)
+        search_el.clear()
+        search_el.send_keys(keyword)
+
+        # 点击搜索
+        search_btn = self.driver.find_element(
+            By.XPATH, "//form[@id='searchForm']/input[@id='doSearch2']"
+        )
+        sleep(1)
+        search_btn.click()
+
+        # 等待结果加载
+        self._next_count = 0
+        self._wait_for(
+            ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result")),
+            message="搜索结果加载超时",
+        )
+
+        # 设置时间范围
+        self._set_search_date()
+
+        # 获取结果列表
+        items = self.driver.find_elements(
+            By.XPATH, "//ul[@class='vT-srch-result-list-bid']/li/a"
+        )
+        return items
+
+    def _set_search_date(self) -> None:
+        """设置搜索时间范围"""
+        try:
+            if self._search_txt != self._default_search_txt:
                 last_els = self.driver.find_elements(By.XPATH, "//ul[@id='datesel']/li")
                 for last_el in last_els:
-                    if search_txt == last_el.text:
+                    if self._search_txt == last_el.text:
                         sleep(1)
                         last_el.click()
                         break
-                if not self._wait_until(
-                    ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result"))
-                ):
-                    return []
+
+                self._wait_for(
+                    ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result")),
+                    message="设置时间范围后页面加载超时",
+                )
             else:
                 sleep(1)
-            try:
-                p_els = self.driver.find_elements(
-                    By.XPATH, "//body/div[@class='vT_z']/div/div/p"
-                )
-                if len(p_els) > 0:
-                    utils.get_logger().debug(f" {p_els[0].text}")
-                else:
-                    a_links = self.driver.find_elements(
-                        By.XPATH, "//div[@class='vT-srch-result-list']/p/a"
-                    )
-                    count = len(a_links)
-                    if count > 1:
-                        count = count - 1
-                    utils.get_logger().debug(f"共查询到 {count} 页,每页 20 条")
-            except Exception as e:
-                utils.get_logger().error(f"搜索失败[尝试查询页数]: {e}")
-            items = self.driver.find_elements(
-                By.XPATH, "//ul[@class='vT-srch-result-list-bid']/li/a"
-            )
-            return items
-        except TimeoutException as e:
-            raise Exception(f"搜索失败 [{self._adapter_type}] [超时]: {e}")
-        except NoSuchElementException as e:
-            raise Exception(f"搜索失败 [{self._adapter_type}] [找不到元素]: {e}")
 
-    def _process_list(self, items: list) -> list:
+        except Exception as e:
+            self.logger.error(f"设置时间范围失败: {e}")
+
+    def _process_list(self, items: List) -> None:
+        """处理数据列表
+
+        Args:
+            items: 数据列表
+        """
         if not items:
-            return []
+            return
+
+        # 处理当前页
         for item in items:
             self._process_item(item)
         sleep(2)
+
+        # 处理下一页
         next_items = self._next_page()
-        if len(items) <= 0:
-            return []
-        return self._process_list(next_items)
+        if next_items:
+            self._process_list(next_items)
 
-    def _next_page(self) -> list:
+    def _next_page(self) -> Optional[List]:
+        """获取下一页数据"""
         try:
+            # 查找下一页按钮
             next_path = "//div[@class='vT-srch-result-list']/p/a[@class='next']"
             try:
                 btn = self.driver.find_element(By.XPATH, next_path)
             except NoSuchElementException:
-                utils.get_logger().debug(f"翻页结束 [{self._adapter_type}]")
-                return []
+                self.logger.debug("已到最后一页")
+                return None
+
+            # 点击下一页
             btn.click()
             self._next_count += 1
-            utils.get_logger().debug(
-                f"下一页[{self._next_count+1}]: {self.driver.current_url}"
-            )
+            self.logger.debug(f"下一页[{self._next_count+1}]")
             sleep(1)
-            if not self._wait_until(
-                ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result"))
-            ):
-                return []
+
+            # 等待页面加载
+            self._wait_for(
+                ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result")),
+                message="下一页加载超时",
+            )
+
+            # 获取数据列表
             items = self.driver.find_elements(
                 By.XPATH, "//ul[@class='vT-srch-result-list-bid']/li/a"
             )
             return items
+
         except NoSuchElementException as e:
-            raise Exception(f"翻页失败 [{self._adapter_type}] [找不到元素]: {e}")
-        except TimeoutException as e:
-            raise Exception(f"翻页失败 [{self._adapter_type}] [超时]: {e}")
+            raise Exception(f"页面元素未找到: {e}")
 
-    def _process_item(self, item):
+    def _process_item(self, item) -> None:
+        """处理单条数据"""
         main_handle = self.driver.current_window_handle
         close = True
+
         try:
+            # 检查URL是否已采集
             url = item.get_attribute("href")
             if self._check_is_collect_by_url(url):
                 close = False
                 return
-            utils.get_logger().debug(f"跳转详情")
+
+            # 打开详情页
+            self.logger.debug("打开详情页")
             sleep(1)
             item.click()
-            if not self._wait_until(ec.number_of_windows_to_be(2)):
-                return
+
+            # 切换窗口
+            self._wait_for(ec.number_of_windows_to_be(2), message="新窗口打开超时")
+
             handles = self.driver.window_handles
             for handle in handles:
                 if handle != main_handle:
                     self.driver.switch_to.window(handle)
                     break
-            if not self._wait_until(
-                ec.presence_of_element_located((By.TAG_NAME, "body"))
-            ):
-                return
 
+            # 等待页面加载
+            self._wait_for(
+                ec.presence_of_element_located((By.TAG_NAME, "body")),
+                message="详情页加载超时",
+            )
+
+            # 获取内容
             content = self.driver.find_element(
                 By.XPATH, "//div[@class='vF_deail_maincontent']"
             ).text
-            # 排除其他公告
+
+            # 判断公告类型
             if self._check_type("其他公告"):
                 self._save_db(url, content, 3, is_invalid=True)
                 return
-            # 判断是否为投标公告
+
             data_type = (
                 1
                 if self._check_type("中标公告")
@@ -174,20 +215,19 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
                 or self._check_type("终止公告")
                 else 0
             )
+
+            # 检查关键词并保存
             if self._check_content(content):
                 attach_str = self._attach_download()
                 self._save_db(url, content, data_type, attach_str)
             else:
                 self._save_db(url, content, data_type, is_invalid=True)
+
         except TimeoutException as e:
-            utils.get_logger().error(
-                f"采集发生异常 [{self._adapter_type}] Timeout: {self.driver.current_url}。Exception: {e}"
-            )
+            self.logger.error(f"处理数据超时: {e}")
         except NoSuchElementException as e:
-            utils.get_logger().error(
-                f"采集发生异常 [{self._adapter_type}] NoSuchElement: {self.driver.current_url}。Exception: {e}"
-            )
-            raise Exception(f"采集失败 [{self._adapter_type}] [找不到元素]: {e}")
+            self.logger.error(f"页面元素未找到: {e}")
+            raise
         finally:
             if close:
                 sleep(1)
@@ -195,56 +235,75 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
                 self.driver.switch_to.window(main_handle)
 
     def _check_type(self, type_str: str) -> bool:
+        """检查公告类型
+
+        Args:
+            type_str: 类型文本
+
+        Returns:
+            bool: 是否匹配
+        """
         links = self.driver.find_elements(By.LINK_TEXT, type_str)
-        if len(links) > 0:
-            utils.get_logger().info(f"{type_str}")
+        if links:
+            self.logger.info(f"公告类型: {type_str}")
             return True
         return False
 
-    def _attach_download(self):
+    def _attach_download(self) -> Optional[str]:
+        """下载附件
+
+        Returns:
+            str: 附件路径
+        """
         paths = []
 
+        # 查找附件链接
         attach_els = self.driver.find_elements(
             By.XPATH, "//td[@class='bid_attachtab_content']/a"
         )
         attach_2_els = self.driver.find_elements(By.XPATH, "//a[@ignore='1']")
-        # 合并两个列表
         all_attachments = attach_els + attach_2_els
-        utils.get_logger().debug(
-            f"附件检索数量: {len(attach_els)}/{len(attach_2_els)}/{len(all_attachments)}"
+
+        self.logger.debug(
+            f"附件数量: {len(attach_els)}/{len(attach_2_els)}/{len(all_attachments)}"
         )
+
+        # 下载附件
         attach_urls = []
-        if len(all_attachments) > 0:
-            for attach_el in all_attachments:
+        for attach_el in all_attachments:
+            try:
+                # 获取附件信息
                 attach_url = attach_el.get_attribute("href")
-                if attach_url not in attach_urls:
-                    attach_urls.append(attach_url)
-                else:
-                    utils.get_logger().info(f"重复附件: {attach_url}")
+                if attach_url in attach_urls:
+                    self.logger.info(f"重复附件: {attach_url}")
                     continue
+                attach_urls.append(attach_url)
+
+                # 获取文件名
                 file_name = (
                     attach_el.text
                     or attach_el.get_attribute("download")
                     or attach_url.split("/")[-1]
                 )
-                if not file_name:
-                    continue
-                # 检查 file_name 是否包含文件扩展名
-                if "." not in file_name:
-                    utils.get_logger().warning(
-                        f"文件名 {file_name} 不包含扩展名,跳过下载。"
-                    )
+                if not file_name or "." not in file_name:
+                    self.logger.warning(f"无效文件名: {file_name}")
                     continue
-                utils.get_logger().debug(
-                    f"开始下载附件: {file_name} 链接: {attach_url}"
-                )
+
+                # 下载文件
+                self.logger.debug(f"下载附件: {file_name}")
                 path = utils.download_remote_file(attach_url, file_name)
                 if path:
-                    utils.get_logger().debug(f"下载附件路径: {path}")
+                    self.logger.debug(f"下载成功: {path}")
                     paths.append(path)
                 else:
-                    utils.get_logger().warning(f"下载附件失败: {file_name}")
+                    self.logger.warning(f"下载失败: {file_name}")
+
+            except Exception as e:
+                self.logger.error(f"处理附件失败: {e}")
+                continue
+
+        # 返回附件路径
         attach_str = ",".join(paths)
         if attach_str:
-            utils.get_logger().info(f"附件下载完成: {attach_str}")
+            self.logger.info(f"附件下载完成: {attach_str}")
         return attach_str

+ 198 - 117
SourceCode/TenderCrawler/app/adapters/chinabidding_data_collection_adapter.py

@@ -1,206 +1,287 @@
 from time import sleep
+from typing import List, Optional
 
 from selenium.common.exceptions import TimeoutException, NoSuchElementException
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as ec
-from selenium.webdriver.support.wait import WebDriverWait
 
 import utils
 from adapters.data_collection_adapter_interface import IDataCollectionAdapter
-from stores.data_store_interface import IDataStore
 
 
 class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
-    """
-    中国招标网数据采集适配器
-    """
-
-    def __init__(self, url: str, store: IDataStore = None):
-        self._url = url
-        self._store = store
-        self._driver = None
-        self._keyword = None
-        self._adapter_type = "chinabidding"
-        self._next_count = 0
+    """中国采购与招标网数据采集适配器"""
+
+    def __init__(self, url: str):
+        """初始化适配器
+
+        Args:
+            url: 目标网站URL
+        """
+        super().__init__(url, "chinabidding", "全部")
 
     def login(self, username: str, password: str) -> None:
+        """登录网站
+
+        Args:
+            username: 用户名
+            password: 密码
+        """
         try:
+            # 点击登录按钮
             login_el = self.driver.find_element(
                 By.XPATH, "//div[@id='loginRight']/a[@class='login']"
             )
             login_el.click()
-            wait = WebDriverWait(self.driver, 10, 1)
-            wait.until(ec.presence_of_element_located((By.ID, "userpass")))
-            # if not self._wait_until(
-            #     ec.presence_of_element_located((By.ID, "userpass"))
-            # ):
-            #     raise TimeoutException(f"id='userpass' 元素没有找到")
+
+            # 等待登录框加载
+            self._wait_for(
+                ec.presence_of_element_located((By.ID, "userpass")),
+                timeout=10,
+                message="登录框加载超时",
+            )
+
+            # 输入用户名密码
             un_el = self.driver.find_element(By.ID, "username")
             un_el.send_keys(username)
             pass_el = self.driver.find_element(By.ID, "userpass")
             pass_el.send_keys(password)
+
+            # 点击登录
             login_btn = self.driver.find_element(By.ID, "login-button")
             login_btn.click()
-            wait.until(ec.presence_of_element_located((By.ID, "site-content")))
-            # if not self._wait_until(ec.presence_of_element_located((By.ID, "site-content"))):
-            #     raise TimeoutException(f"id='site-content' 元素没有找到")
+
+            # 等待登录成功
+            self._wait_for(
+                ec.presence_of_element_located((By.ID, "site-content")),
+                message="登录成功页面加载超时",
+            )
+            self.logger.info("登录成功")
+
         except TimeoutException as e:
-            raise Exception(f"登录失败 [{self._adapter_type}] [超时]: {e}")
+            raise Exception(f"登录超时: {e}")
         except NoSuchElementException as e:
-            raise Exception(f"登录失败 [{self._adapter_type}] [找不到元素]: {e}")
+            raise Exception(f"页面元素未找到: {e}")
 
-    def _collect(self, keyword: str):
-        items = self._search_by_type(keyword, 0)
-        self._process_list(items, 0)
-        sleep(2)
-        items = self._search_by_type(keyword, 1)
-        self._process_list(items, 1)
-        if utils.get_config_bool(self.batch_save_key):
-            self.store.save_collect_data(True)
+    def _collect(self, keyword: str) -> None:
+        """执行数据采集
 
-    def _search_by_type(self, keyword: str, data_type):
+        Args:
+            keyword: 单个搜索关键词
+        """
         try:
-            self.driver.get(self._url)
-            if data_type == 0:
-                utils.get_logger().info(f"开始采集 招标公告")
-                el = self.driver.find_element(
-                    By.XPATH, "//div[@id='z-b-g-g']/h2/a[@class='more']"
-                )
-            else:
-                utils.get_logger().info(f"开始采集 中标结果公告")
-                el = self.driver.find_element(
-                    By.XPATH, "//div[@id='z-b-jg-gg']/h2/a[@class='more']"
-                )
-            el.click()
-            if not self._wait_until(ec.number_of_windows_to_be(2)):
-                return []
-            self.driver.close()
-            self.driver.switch_to.window(self.driver.window_handles[0])
-            return self._search(keyword)
-        except TimeoutException as e:
-            raise Exception(f"搜索失败 [{self._adapter_type}] [超时]: {e}")
-        except NoSuchElementException as e:
-            raise Exception(f"搜索失败 [{self._adapter_type}] [找不到元素]: {e}")
 
-    def _search(self, keyword: str) -> list:
-        if not self._wait_until(
-            ec.presence_of_element_located((By.ID, "searchBidProjForm"))
-        ):
-            return []
+            self.logger.info(f"开始采集关键词: {keyword}, 时间范围: {self._search_txt}")
+
+            # 采集招标公告
+            self.logger.info("开始采集招标公告")
+            items = self._search_by_type(keyword, 0)
+            self._process_list(items, 0)
+            sleep(2)
+
+            # 采集中标公告
+            self.logger.info("开始采集中标公告")
+            items = self._search_by_type(keyword, 1)
+            self._process_list(items, 1)
+
+        except Exception as e:
+            self.logger.error(f"采集失败: {e}")
+            raise
+
+    def _search_by_type(self, keyword: str, data_type: int) -> List:
+        """根据类型搜索数据
+
+        Args:
+            keyword: 搜索关键词
+            data_type: 数据类型(0:招标,1:中标)
+
+        Returns:
+            List: 搜索结果列表
+        """
+        # 打开首页
+        self.driver.get(self.url)
+
+        # 选择公告类型
+        if data_type == 0:
+            el = self.driver.find_element(
+                By.XPATH, "//div[@id='z-b-g-g']/h2/a[@class='more']"
+            )
+        else:
+            el = self.driver.find_element(
+                By.XPATH, "//div[@id='z-b-jg-gg']/h2/a[@class='more']"
+            )
+        el.click()
+
+        # 切换窗口
+        self._wait_for(ec.number_of_windows_to_be(2), message="新窗口打开超时")
+
+        self.driver.close()
+        self.driver.switch_to.window(self.driver.window_handles[0])
+
+        # 执行搜索
+        return self._search(keyword)
+
+    def _search(self, keyword: str) -> List:
+        """执行搜索"""
+        # 等待搜索框加载
+        self._wait_for(
+            ec.presence_of_element_located((By.ID, "searchBidProjForm")),
+            message="搜索框加载超时",
+        )
+
+        # 输入关键词
         search_el = self.driver.find_element(
             By.XPATH, "//form[@id='searchBidProjForm']/ul/li/input[@id='fullText']"
         )
         search_el.clear()
         search_el.send_keys(keyword)
+
+        # 点击搜索
         search_btn = self.driver.find_element(
             By.XPATH, "//form[@id='searchBidProjForm']/ul/li/button"
         )
         search_btn.click()
+
+        # 等待结果加载
         self._next_count = 0
-        if not self._wait_until(
-            ec.presence_of_element_located((By.ID, "site-content"))
-        ):
-            return []
-        default_search_txt = "全部"
-        search_txt = utils.get_config_value(self.search_day_key, default_search_txt)
-        utils.get_logger().debug(f"搜索日期条件: {search_txt}")
-        if search_txt != default_search_txt:
-            last_el = self.driver.find_element(By.LINK_TEXT, search_txt)
-            sleep(1)
-            last_el.click()
-            if not self._wait_until(
-                ec.presence_of_element_located((By.ID, "site-content"))
-            ):
-                return []
-        else:
-            sleep(1)
-        try:
-            a_links = self.driver.find_elements(
-                By.XPATH, "//form[@id='pagerSubmitForm']/a"
-            )
-            count = len(a_links)
-            if count > 1:
-                count = count - 1
-            utils.get_logger().debug(f"共查询到 {count} 页,每页 10 条")
-        except Exception as e:
-            utils.get_logger().error(f"搜索失败[尝试查询页数]: {e}")
+        self._wait_for(
+            ec.presence_of_element_located((By.ID, "site-content")),
+            message="搜索结果加载超时",
+        )
+
+        # 设置时间范围
+        self._set_search_date()
+
+        # 获取结果列表
         items = self.driver.find_elements(By.XPATH, "//ul[@class='as-pager-body']/li/a")
         return items
 
-    def _process_list(self, items: list, data_type) -> list:
+    def _set_search_date(self) -> None:
+        """设置搜索时间范围"""
+        try:
+            if self._search_txt != self._default_search_txt:
+                last_el = self.driver.find_element(By.LINK_TEXT, self._search_txt)
+                sleep(1)
+                last_el.click()
+
+                self._wait_for(
+                    ec.presence_of_element_located((By.ID, "site-content")),
+                    message="设置时间范围后页面加载超时",
+                )
+            else:
+                sleep(1)
+
+        except Exception as e:
+            self.logger.error(f"设置时间范围失败: {e}")
+
+    def _process_list(self, items: List, data_type: int) -> None:
+        """处理数据列表
+
+        Args:
+            items: 数据列表
+            data_type: 数据类型(0:招标,1:中标)
+        """
         if not items:
-            return []
+            return
+
+        # 处理当前页
         for item in items:
             self._process_item(item, data_type)
         sleep(2)
+
+        # 处理下一页
         next_items = self._next_page()
-        return self._process_list(next_items, data_type)
+        if next_items:
+            self._process_list(next_items, data_type)
+
+    def _next_page(self) -> Optional[List]:
+        """获取下一页数据
 
-    def _next_page(self) -> list:
+        Returns:
+            List: 下一页数据列表
+        """
         try:
+            # 查找下一页按钮
             try:
                 btn = self.driver.find_element(
                     By.XPATH, "//form[@id='pagerSubmitForm']/a[@class='next']"
                 )
             except NoSuchElementException:
-                utils.get_logger().debug(f"翻页结束 [{self._adapter_type}]")
-                return []
+                self.logger.debug("已到最后一页")
+                return None
+
+            # 点击下一页
             btn.click()
             self._next_count += 1
-            utils.get_logger().debug(
-                f"下一页[{self._next_count+1}]: {self.driver.current_url}"
+            self.logger.debug(f"下一页[{self._next_count+1}]")
+
+            # 等待页面加载
+            self._wait_for(
+                ec.presence_of_element_located((By.ID, "site-content")),
+                message="下一页加载超时",
             )
-            if not self._wait_until(
-                ec.presence_of_element_located((By.ID, "site-content"))
-            ):
-                return []
+
+            # 获取数据列表
             items = self.driver.find_elements(
                 By.XPATH, "//ul[@class='as-pager-body']/li/a"
             )
             return items
+
         except NoSuchElementException as e:
-            raise Exception(f"翻页失败 [{self._adapter_type}] [找不到元素]: {e}")
-        except TimeoutException as e:
-            raise Exception(f"翻页失败 [{self._adapter_type}] [超时]: {e}")
+            raise Exception(f"页面元素未找到: {e}")
 
-    def _process_item(self, item, data_type):
+    def _process_item(self, item, data_type: int) -> None:
+        """处理单条数据
+
+        Args:
+            item: 数据项
+            data_type: 数据类型(0:招标,1:中标)
+        """
         main_handle = self.driver.current_window_handle
         close = True
+
         try:
+            # 检查URL是否已采集
             url = item.get_attribute("href")
             if self._check_is_collect_by_url(url):
                 close = False
                 return
+
+            # 打开详情页
             item.click()
-            if not self._wait_until(ec.number_of_windows_to_be(2)):
-                return
+            self._wait_for(ec.number_of_windows_to_be(2), message="新窗口打开超时")
+
+            # 切换窗口
             handles = self.driver.window_handles
             for handle in handles:
                 if handle != main_handle:
                     self.driver.switch_to.window(handle)
                     break
+
+            # 获取URL
             url = self.driver.current_url
-            utils.get_logger().debug(f"跳转详情")
-            if not self._wait_until(
-                ec.presence_of_element_located((By.CLASS_NAME, "content"))
-            ):
-                return
+            self.logger.debug(f"打开详情页: {url}")
+
+            # 等待内容加载
+            self._wait_for(
+                ec.presence_of_element_located((By.CLASS_NAME, "content")),
+                message="详情页加载超时",
+            )
+
+            # 获取内容
             content = self.driver.find_element(By.CLASS_NAME, "content").text
+
+            # 检查关键词并保存
             if self._check_content(content):
                 self._save_db(url, content, data_type)
             else:
                 self._save_db(url, content, data_type, is_invalid=True)
 
         except TimeoutException as e:
-            utils.get_logger().error(
-                f"采集发生异常 [{self._adapter_type}] Timeout: {self.driver.current_url}。Exception: {e}"
-            )
-            # raise Exception(f"采集失败 [超时]: {e}")
+            self.logger.error(f"处理数据超时: {e}")
         except NoSuchElementException as e:
-            utils.get_logger().error(
-                f"采集发生异常 [{self._adapter_type}] NoSuchElement: {self.driver.current_url}。Exception: {e}"
-            )
-            raise Exception(f"采集失败 [{self._adapter_type}] [找不到元素]: {e}")
+            self.logger.error(f"页面元素未找到: {e}")
+            raise
         finally:
             if close:
                 sleep(2)

+ 208 - 167
SourceCode/TenderCrawler/app/adapters/data_collection_adapter_interface.py

@@ -1,9 +1,10 @@
 from abc import ABC, abstractmethod
-from typing import Callable, Union, Literal
+from typing import Optional, List, Any
+from datetime import datetime
 
-from selenium import webdriver
+from selenium.webdriver.remote.webdriver import WebDriver
+from selenium.webdriver.support.wait import WebDriverWait
 from selenium.common.exceptions import TimeoutException
-from selenium.webdriver.support.wait import WebDriverWait, D, T
 
 import drivers
 import utils
@@ -12,221 +13,261 @@ from stores.data_store_interface import IDataStore
 
 
 class IDataCollectionAdapter(ABC):
-    """
-    数据收集适配器抽象类
-    """
-
-    _url = ""
-    _store = None
-    _driver = None
-    _adapter_type = ""
-    _cur_keyword = None
-    _keywords = None
-    _keyword_array = None
-    _error_count = 0
-    _max_error_count = utils.get_config_int("adapter.max_error_count", 3)
-
-    _err_keywords = {}
+    """数据采集适配器基类"""
+
+    def __init__(self, url: str, adapter_type: str, default_search_txt: str = ""):
+        """Initialise adapter state and create the WebDriver.
+
+        Args:
+            url: Target site URL; also handed to ``drivers.gen_driver``.
+            adapter_type: Adapter identifier used to build the
+                ``adapter.<type>.search_day`` / ``adapter.<type>.batch_save``
+                configuration keys.
+            default_search_txt: Value passed as the default when reading the
+                ``search_day`` configuration entry.
+
+        Raises:
+            Exception: Re-raised unchanged when the WebDriver cannot be created.
+        """
+        self._url = url
+        # Must be set before search_day_key is read and must not be reset
+        # afterwards: search_day_key, batch_save_key and the public
+        # adapter_type property are all derived from it.
+        self._adapter_type = adapter_type
+        self._default_search_txt = default_search_txt
+        self._search_txt = utils.get_config_value(
+            self.search_day_key, default_search_txt
+        )
+        self._next_count = 0
+
+        self._store: Optional[IDataStore] = None
+        self._driver: Optional[WebDriver] = None
+        self._keyword: Optional[str] = None
+        self._keywords: List[str] = []
+        self._timeout = utils.get_config_int("selenium.page_load_timeout", 30)
+        self._max_retries = utils.get_config_int("adapter.max_retries", 3)
+        self._retry_keywords: dict = {}  # keyword -> retry count
+
+        try:
+            self._driver = drivers.gen_driver(url)
+            self.logger.info(f"初始化WebDriver成功: {url}")
+        except Exception as e:
+            self.logger.error(f"初始化WebDriver失败: {e}")
+            raise
 
     @property
-    def search_day_key(self) -> str:
-        return f"adapter.{self._adapter_type}.search_day"
+    def logger(self):
+        return utils.get_logger()
 
     @property
-    def batch_save_key(self) -> str:
-        return f"adapter.{self._adapter_type}.batch_save"
+    def driver(self) -> WebDriver:
+        return self._driver
 
     @property
     def store(self) -> IDataStore:
         return self._store
 
     @property
-    def url(self):
+    def url(self) -> str:
         return self._url
 
     @property
-    def cur_keyword(self):
-        return self._cur_keyword
+    def keyword(self) -> str:
+        return self._keyword
 
     @property
-    def keywords(self):
-        return self._keywords
+    def adapter_type(self) -> str:
+        return self._adapter_type
 
     @property
-    def keyword_array(self):
-        return self._keyword_array
+    def batch_save_key(self) -> str:
+        return f"adapter.{self._adapter_type}.batch_save"
 
     @property
-    def driver(self) -> webdriver:
-        if not self._driver:
-            try:
-                self._driver = drivers.gen_driver(self.url)
-            except Exception as e:
-                raise Exception(f"创建驱动器失败: {e}")
-        return self._driver
+    def search_day_key(self) -> str:
+        return f"adapter.{self._adapter_type}.search_day"
 
-    def collect(self, keywords: str, store: IDataStore) -> None:
-        """
-        处理搜索结果列表,返回处理后的数据列表
-
-        :param keywords: 搜索结果列表
-        :param store: 数据储存库
-        :type keywords: str
-        :return: 处理后的数据列表
-        :rtype: list
-        :raises Exception: 如果处理失败,应抛出异常
+    def collect(self, keyword: str, store: Optional[IDataStore] = None) -> None:
+        """执行数据采集
+
+        Args:
+            keyword: 搜索关键词,多个关键词以逗号分隔
+            store: 数据存储器
         """
-        if store:
+        try:
             self._store = store
-        if not keywords:
-            raise Exception("未指定搜索关键字")
-        utils.get_logger().info(f"开始采集: {keywords}")
-        self._error_count = 0
-        self._keyword_array = keywords.split(",")
-        count = 0
-        for keyword in self._keyword_array:
-            if not keyword:
-                continue
-            try:
-                count += 1
-                self._cur_keyword = keyword
-                utils.get_logger().info(f"采集关键字[{count}]: {keyword}")
-                self._error_count = 0
-                self._collect(keyword)
-                if self.cur_keyword in self._err_keywords:
-                    del self._err_keywords[self.cur_keyword]  # 删除键
-            except Exception as e:
-                utils.get_logger().error(f"==> {e}")
-            # except Exception as e:
-            #     raise Exception(f"采集数据失败: {e}")
-        self._collect_error_keywors()
+            self._keyword = keyword
+            self._keywords = utils.to_array(keyword)
+            self._retry_keywords.clear()
 
-    @abstractmethod
-    def login(self, username: str, password: str) -> None:
-        """
-        如果需要登录,则登录后跳转到搜索页面(不自动跳转的需要手动执行)
+            # 首次采集所有关键词
+            for kw in self._keywords:
+                try:
+                    self.logger.debug(f"开始采集关键词: {kw}")
+                    self._collect(kw)
+                except TimeoutException as e:
+                    self.logger.warning(f"采集关键词 {kw} 超时: {e}")
+                    self._retry_keywords[kw] = 1
+                    continue
+                except Exception as e:
+                    self.logger.error(f"采集关键词 {kw} 失败: {e}")
+                    continue
 
-        :param username: 用户名
-        :type username: str
-        :param password: 密码
-        :type password: str
-        :raises Exception: 如果登录失败,应抛出异常
-        """
-        try:
-            # 实现登录逻辑
-            pass
-        except Exception as e:
-            raise Exception(f"登录失败: {e}")
+            # 重试超时的关键词
+            while self._retry_keywords:
+                retry_kws = list(self._retry_keywords.keys())
+                for kw in retry_kws:
+                    retry_count = self._retry_keywords[kw]
+                    if retry_count >= self._max_retries:
+                        self.logger.error(f"关键词 {kw} 超过最大重试次数")
+                        del self._retry_keywords[kw]
+                        continue
+
+                    try:
+                        self.logger.info(f"重试采集关键词[{retry_count}]: {kw}")
+                        self._collect(kw)
+                        del self._retry_keywords[kw]
+                    except TimeoutException as e:
+                        self.logger.warning(f"重试采集关键词 {kw} 超时: {e}")
+                        self._retry_keywords[kw] = retry_count + 1
+                        continue
+                    except Exception as e:
+                        self.logger.error(f"重试采集关键词 {kw} 失败: {e}")
+                        del self._retry_keywords[kw]
+                        continue
 
-    def _wait(self, timeout=20, poll_frequency=1):
-        return WebDriverWait(self.driver, timeout, poll_frequency)
+            # 批量保存
+            if utils.get_config_bool(self.batch_save_key):
+                self.store.save_collect_data(True)
 
-    def _wait_until(
+        except Exception as e:
+            self.logger.error(f"采集失败: {e}")
+            raise
+        finally:
+            self.cleanup()
+
+    def _wait_for(
         self,
-        method: Callable[[D], Union[Literal[False], T]],
-        timeout=20,
-        poll_frequency=1,
-    ) -> bool:
-        try:
-            self._wait(timeout, poll_frequency).until(method)
-            return True
-        except TimeoutException:
-            err_count = (
-                self._err_keywords[self.cur_keyword]
-                if self.cur_keyword in self._err_keywords
-                else 0
-            )
-            err_count += 1
-            utils.get_logger().error(
-                f"采集数据 超时 [{self.cur_keyword}][{err_count}/{self._max_error_count}]"
-            )
-            self._err_keywords[self.cur_keyword] = err_count
-            if err_count > self._max_error_count:
-                del self._err_keywords[self.cur_keyword]  # 删除键
-            return False
-            # raise TimeoutException(
-            #     f"采集数据 超时 {self.cur_keyword} [{err_count}/{self._max_error_count}]"
-            # )
-
-    def _collect_error_keywors(self):
-        if not self._err_keywords:
-            return
-        for keyword, err_count in self._err_keywords.items():
-            try:
-                utils.get_logger().info(
-                    f"重新采集错误关键字[{err_count}/{self._max_error_count}]: {keyword}"
-                )
-                self._cur_keyword = keyword
-                self._collect(keyword)
-                if self.cur_keyword in self._err_keywords:
-                    del self._err_keywords[self.cur_keyword]  # 删除键
-            except Exception as e:
-                utils.get_logger().error(f"失败: {e}")
-        self._collect_error_keywors()
+        condition: Any,
+        timeout: Optional[int] = None,
+        message: Optional[str] = None,
+    ) -> Any:
+        """等待条件满足
 
-    @abstractmethod
-    def _collect(self, keyword: str) -> None:
-        """
-        根据关键字采集
-        :param keyword: 搜索关键字
-        :type keyword: str
-        """
-        pass
+        Args:
+            condition: 等待条件
+            timeout: 超时时间(秒),默认使用全局超时时间
+            message: 超时错误消息
+
+        Returns:
+            Any: 条件满足时的返回值
 
-    def teardown(self) -> None:
+        Raises:
+            TimeoutException: 等待超时
         """
-        关闭浏览器驱动器
+        if not timeout:
+            timeout = self._timeout
+
+        wait = WebDriverWait(self.driver, timeout)
+        return wait.until(condition, message)
+
+    def _check_is_collect_by_url(self, url: str) -> bool:
+        """检查URL是否已采集
 
-        :raises Exception: 如果关闭驱动器失败,应抛出异常
+        Args:
+            url: 目标URL
+
+        Returns:
+            bool: 是否已采集
         """
+        if not self.store:
+            return False
         try:
-            if self.driver:
-                self.driver.quit()
+            old = self.store.query_one_collect_url(url)
+            if old:
+                self.logger.debug(f"URL已采集: {url}")
+                return True
+            return False
         except Exception as e:
-            raise Exception(f"关闭驱动器失败: {e}")
+            self.logger.error(f"检查URL采集状态失败: {e}")
+            return False
 
-    def _check_is_collect_by_url(self, url: str) -> bool:
-        old = self.store.query_one_collect_url(url)
-        if old:
-            utils.get_logger().debug(f"已采集过: {url}")
-            return True
-        return False
+    def _check_content(self, content: str) -> bool:
+        """Check whether the content mentions one of the search keywords.
 
-    def _check_content(self, content) -> bool:
-        collect_data_key = utils.get_config_value("save.collect_data_key")
-        if not collect_data_key:
-            utils.get_logger().info("未配置 save.collect_data_key,跳过内容检查")
+        ``self._keyword`` holds the string given to ``collect``, which may be
+        a comma-separated list, so every keyword is tested on its own instead
+        of searching for the joined string.
+
+        Args:
+            content: Page text to inspect.
+
+        Returns:
+            bool: True when at least one non-empty keyword occurs in
+            ``content``; False for empty content or when no keyword is set.
+        """
+        if not content:
+            return False
+
+        # Accept both ASCII and full-width commas as separators; a missing
+        # keyword (None) is treated as "no keywords" rather than raising.
+        candidates = [
+            part.strip()
+            for part in (self._keyword or "").replace(",", ",").split(",")
+        ]
+        matched = next((kw for kw in candidates if kw and kw in content), None)
+        if matched is not None:
+            self.logger.info(f"内容包含关键词: {matched}")
             return True
-        # utils.get_logger().info(f"检查数据有效性: {collect_data_key}")
-        collect_data_key = collect_data_key.replace(",", ",")
-        keys = collect_data_key.split(",")
-        keys = [key.strip() for key in keys]
-        for key in keys:
-            key = key.strip()
-            # utils.get_logger().info(f"检查数据有效性: {key}")
-            if key in content:
-                utils.get_logger().info(f"有效数据: {self.driver.current_url}")
-                return True
+
         return False
 
-    def _save_db(self, url, content, data_type=0, attach_str=None, is_invalid=False):
+    def _save_db(
+        self,
+        url: str,
+        content: str,
+        data_type: int = 0,
+        attach_str: str = None,
+        is_invalid: bool = False,
+    ) -> bool:
+        """保存数据到数据库
+
+        Args:
+            url: 数据URL
+            content: 数据内容
+            data_type: 数据类型(0:招标,1:中标)
+            attach_str: 附件路径
+            is_invalid: 是否无效数据
+
+        Returns:
+            bool: 是否保存成功
+        """
         if not self.store:
-            utils.get_logger().info(
-                f"DataStore 未指定: {url},关键字{self.cur_keyword}"
-            )
+            self.logger.info(f"未设置存储器: {url}")
             return False
-        else:
+
+        try:
             status = 2 if is_invalid else 0
             data = CollectData(
                 url=url,
-                keyword=self.cur_keyword,
+                keyword=self.keyword,
                 content=content,
                 data_type=data_type,
                 attach_path=attach_str,
                 status=status,
+                create_time=datetime.now(),
             )
             self.store.insert_collect_data(
                 data, utils.get_config_bool(self.batch_save_key)
             )
             return True
+        except Exception as e:
+            self.logger.error(f"保存数据失败: {e}")
+            return False
+
+    def cleanup(self):
+        """Quit the WebDriver if one exists; failures are logged, not raised."""
+        try:
+            browser = self.driver
+            if browser:
+                browser.quit()
+        except Exception as err:
+            self.logger.error(f"清理资源失败: {err}")
+
+    @abstractmethod
+    def login(self, username: str, password: str) -> None:
+        """Log in to the target site; concrete adapters must override this.
+
+        Args:
+            username: Account name.
+            password: Account password.
+        """
+
+    @abstractmethod
+    def _collect(self, keyword: str) -> None:
+        """Collect data for one keyword; concrete adapters must override this.
+
+        Args:
+            keyword: Search keyword.
+        """

+ 52 - 45
SourceCode/TenderCrawler/app/config.yml

@@ -1,18 +1,20 @@
 #file: noinspection SpellCheckingInspection,SpellCheckingInspection,SpellCheckingInspection
 adapter:
-  max_error_count: 5
+  max_retries: 3
   chinabidding:
     #search_day: '今天'
     search_day: '近一周'
     model_name: 'chinabidding_data_collection_adapter'
     class_name: 'ChinabiddingDataCollectionAdapter'
     batch_save: True
+    timeout: 30
   ccgp:
     #search_day: '今日'
     search_day: '近3日'
     model_name: 'ccgp_data_collection_adapter'
-    class_name: 'CcgpDataCollectionAdapter'
+    class_name: 'CCGPDataCollectionAdapter'
     batch_save: False
+    timeout: 30
 default_area: '全国'
 logger:
   file-path: './logs/'
@@ -31,59 +33,61 @@ mysql:
   password: Iwb-2024
   charset: utf8mb4
 ai:
-#  url: http://192.168.0.109:7580/api/chat
-#  model: qwen2.5:7b
+  #  url: http://192.168.0.109:7580/api/chat
+  #  model: qwen2.5:7b
   key: sk-febca8fea4a247f096cedeea9f185520
   url: https://dashscope.aliyuncs.com/compatible-mode/v1
   model: qwen-plus
   max_tokens: 1024
-  system_prompt: "从给定信息中提取出关键信息,并以给定的类型返回json字符串,如果部分信息为空,则该字段返回为空"
-  prompt_template_1: "在以上内容中提取招标信息:
-            ```typescript
-            export interface Tender { //招标信息
-                no: string; // 招标项目编号
-                title: string; // 招标公告标题
-                provice: string; // 招标单位省份
-                city: string; // 招标单位城市
-                date: string; // 项目开标的时间
-                address: string; // 项目开标的地点
-                release_date: string; // 招标信息的发布时间
-                summary: string; // 100字左右的招标条件,联系方式等内容摘要
-                devices: string; // 只涉及到光谱仪相关的设备,其他无关设备不需要,多个设备以逗号分割 ,例如 红外光谱仪,拉曼光谱仪等
-            }
-            ```"
-  prompt_template_2: "在以上内容中提取中标信息:
-            ```typescript
-            export interface Instrument { // 中标仪器信息
-              company: string; // 中标单位名称,参与竞标并中标的公司名称
-              name: string; // 仪器名称,例如:红外光谱仪
-              manufacturer: string; // 仪器厂商,例如:赛默飞、Bruker
-              model: string; // 仪器的型号/规格,例如:NIR25S
-              quantity: number; // 中标仪器的数量,台数,例如:2
-              unit_price: number; // 仪器的单价,单位转换为元,例如:178000.00
-            }
-            export interface BiddingAcceptance { //中标信息
-              no: string; // 项目编号
-              title: string; // 中标公告标题
-              date: string; // 中标公告时间
-              provice: string; // 招标单位省份
-              city: string; // 招标单位城市
-              summary: string; // 公告摘要信息,100字左右
-              instruments: Instrument[]; // 中标设备的信息
-            }
-            ```"
+  system_prompt: '从给定信息中提取出关键信息,并以给定的类型返回json字符串,如果部分信息为空,则该字段返回为空'
+  prompt_template_1: '在以上内容中提取招标信息:
+    ```typescript
+    export interface Tender { //招标信息
+    no: string; // 招标项目编号
+    title: string; // 招标公告标题
+    province: string; // 招标单位省份
+    city: string; // 招标单位城市
+    date: string; // 项目开标的时间
+    address: string; // 项目开标的地点
+    budget: string; // 项目预算金额,单位换成元,没有留空
+    release_date: string; // 招标信息的发布时间
+    summary: string; // 100字左右的招标条件,联系方式等内容摘要
+    devices: string; // 只涉及到光谱仪相关的设备,其他无关设备不需要,多个设备以逗号分割 ,例如 红外光谱仪,拉曼光谱仪等
+    }
+    ```'
+  prompt_template_2: '在以上内容中提取中标信息:
+    ```typescript
+    export interface Instrument { // 中标仪器信息
+    company: string; // 中标单位名称,参与竞标并中标的公司名称
+    name: string; // 仪器名称,例如:红外光谱仪
+    manufacturer: string; // 仪器厂商,例如:赛默飞、Bruker
+    model: string; // 仪器的型号/规格,例如:NIR25S
+    quantity: number; // 中标仪器的数量,台数,例如:2
+    unit_price: number; // 仪器的单价,单位转换为元,例如:178000.00
+    }
+    export interface BiddingAcceptance { //中标信息
+    no: string; // 项目编号
+    title: string; // 中标公告标题
+    date: string; // 中标公告时间
+    province: string; // 招标单位省份
+    city: string; // 招标单位城市
+    summary: string; // 公告摘要信息,100字左右
+    instruments: Instrument[]; // 中标设备的信息
+    }
+    ```'
 email:
-#  smtp_server: smtp.exmail.qq.com
-#  smtp_port: 465
-#  smtp_user: yueyy@iwbnet.com
-#  smtp_password: EXN38AtT97FX635c
-#  from_email: yueyy@iwbnet.com
+  #  smtp_server: smtp.exmail.qq.com
+  #  smtp_port: 465
+  #  smtp_user: yueyy@iwbnet.com
+  #  smtp_password: EXN38AtT97FX635c
+  #  from_email: yueyy@iwbnet.com
   smtp_server: smtp.163.com
   smtp_port: 465
   smtp_user: yueyunyun88@163.com
   smtp_password: FWRwBZKHTLHjHT5F
   from_email: yueyunyun88@163.com
   error_email:
+  default_email: 349977741@qq.com
 job:
   event_id: 1 # 改变这个值,整点会检测重新加载任务
   sleep_interval: 10
@@ -98,7 +102,10 @@ job:
   clean_data: 00:05 # 每日清理数据时间
   run_now: false
 selenium:
-  remote_driver_url: http://127.0.0.1:3534/wd/hub
+  remote_driver_url: 'http://127.0.0.1:3534/wd/hub'
+  page_load_timeout: 30
+  implicit_wait: 10
+  headless: true
 clean:
   day: 30 # 清理多少天前的数据 0不清理
   # 下面的没有配置 默认使用 day 的配置

+ 109 - 43
SourceCode/TenderCrawler/app/jobs/data_collector.py

@@ -1,36 +1,56 @@
 import importlib
+from typing import Optional
 
-from selenium import webdriver
+from selenium.webdriver.remote.webdriver import WebDriver
 
 import adapters
 import utils
 from adapters.data_collection_adapter_interface import IDataCollectionAdapter
 from stores.data_store_interface import IDataStore
-from stores.default_data_store import DefaultDataStore
 
 
 class DataCollector:
-
-    _adapter = None
-    _driver = None
-    _store = None
+    """数据采集器"""
 
     def __init__(
         self, adapter_type: str, url: str, un: str, up: str, store: IDataStore = None
     ):
-        self._adapter = self._gen_adapter(adapter_type, url)
-        self._driver = self.adapter.driver
-        # if type == "chinabidding":
-        #     return
-        self.adapter.login(un, up)
-        if store:
-            self._store = store
-        else:
-            self._store = DefaultDataStore()
+        """
+        初始化数据采集器
+
+        Args:
+            adapter_type: 适配器类型
+            url: 目标URL
+            un: 用户名
+            up: 密码
+            store: 数据存储器(可选)
+        """
+        self._adapter: Optional[IDataCollectionAdapter] = None
+        self._store: Optional[IDataStore] = None
+        self._retry_count = 0
+        self._max_retries = utils.get_config_int("adapter.max_retries", 3)
+
+        try:
+            self._adapter = self._gen_adapter(adapter_type, url)
+
+            if store:
+                self._store = store
+
+            # 登录处理
+            if un and up:
+                self.adapter.login(un, up)
+
+        except Exception as e:
+            self.logger.error(f"初始化采集器失败: {e}")
+            raise
+
+    @property
+    def logger(self):
+        """Logger obtained from ``utils.get_logger()`` on each access."""
+        log = utils.get_logger()
+        return log
 
     @property
-    def driver(self) -> webdriver:
-        return self._driver
+    def driver(self) -> WebDriver:
+        return self.adapter.driver
 
     @property
     def store(self) -> IDataStore:
@@ -40,43 +60,89 @@ class DataCollector:
     def adapter(self) -> IDataCollectionAdapter:
         return self._adapter
 
-    def set_store(self, store: IDataStore) -> None:
-        self._store = store
-
     def collect(self, keywords: str):
+        """
+        执行数据采集
+
+        Args:
+            keywords: 搜索关键词,多个关键词用逗号分隔
+        """
         if not self.store:
             raise Exception("未设置存储器")
-        adapters.collect(self.adapter, keywords, self.store)
+
+        try:
+            self.logger.info(f"开始采集数据, 关键词: {keywords}")
+            self._retry_count = 0
+
+            while self._retry_count < self._max_retries:
+                try:
+                    adapters.collect(self.adapter, keywords, self.store)
+                    break
+                except Exception as e:
+                    self._retry_count += 1
+                    if self._retry_count >= self._max_retries:
+                        self.logger.error(f"采集失败,已达最大重试次数: {e}")
+                        raise
+                    self.logger.warning(
+                        f"采集失败,准备第{self._retry_count}次重试: {e}"
+                    )
+                    self._reset_adapter()
+
+        except Exception as e:
+            self.logger.error(f"采集过程发生异常: {e}")
+            raise
 
     def close(self):
-        utils.get_logger().info(f"关闭浏览器驱动,URL: {self.adapter.url}")
-        adapters.teardown(self.adapter)
+        """Close the collector and release the adapter's WebDriver.
+
+        Delegates to the adapter's ``cleanup``. Any failure is logged and not
+        raised, so the method is safe to call from shutdown paths and when no
+        adapter was created.
+        """
+        try:
+            if self.adapter:
+                self.logger.info(f"关闭浏览器驱动,URL: {self.adapter.url}")
+                self.adapter.cleanup()
+        except Exception as e:
+            self.logger.error(f"关闭采集器失败: {e}")
+
+    def _reset_adapter(self):
+        """Replace the current adapter with a newly generated one.
+
+        The new adapter is built from the current adapter's ``adapter_type``
+        and ``url``. Failures are logged and re-raised.
+        """
+        current = self.adapter
+        try:
+            adapter_type = current.adapter_type
+            target_url = current.url
+            self._adapter = self._gen_adapter(adapter_type, target_url)
+        except Exception as err:
+            self.logger.error(f"重置适配器失败: {err}")
+            raise
 
     @staticmethod
-    def _gen_adapter(adapter_type: str, url: str):
+    def _gen_adapter(adapter_type: str, url: str) -> IDataCollectionAdapter:
+        """
+        生成数据源适配器
+
+        Args:
+            adapter_type: 适配器类型
+            url: 目标URL
+
+        Returns:
+            IDataCollectionAdapter: 适配器实例
+        """
         adapter_model_name = utils.get_config_value(
             f"adapter.{adapter_type}.model_name"
         )
         adapter_class_name = utils.get_config_value(
             f"adapter.{adapter_type}.class_name"
         )
-        if adapter_class_name:
-            try:
-                utils.get_logger().info(
-                    f"生成适配器 TYPE:{adapter_type},适配器: {adapter_class_name},URL:{url}"
-                )
-                # 使用 importlib 动态导入模块
-                adapter_module = importlib.import_module(
-                    f"adapters.{adapter_model_name}"
-                )
-                adapter_class = getattr(adapter_module, adapter_class_name)
-                adapter = adapter_class(url)
-            except ImportError as e:
-                raise ImportError(f"无法导入适配器模块 {adapter_model_name}") from e
-            except AttributeError as e:
-                raise AttributeError(
-                    f"适配器模块 {adapter_model_name} 中找不到类 {adapter_class_name}"
-                ) from e
-        else:
+
+        if not adapter_class_name:
             raise Exception("不支持的适配器类型")
-        return adapter
+
+        try:
+            utils.get_logger().info(
+                f"生成适配器 TYPE:{adapter_type},适配器: {adapter_class_name},URL:{url}"
+            )
+            adapter_module = importlib.import_module(f"adapters.{adapter_model_name}")
+            adapter_class = getattr(adapter_module, adapter_class_name)
+            adapter = adapter_class(url)
+            return adapter
+
+        except ImportError as e:
+            raise ImportError(f"无法导入适配器模块 {adapter_model_name}") from e
+        except AttributeError as e:
+            raise AttributeError(
+                f"适配器模块 {adapter_model_name} 中找不到类 {adapter_class_name}"
+            ) from e

+ 6 - 4
SourceCode/TenderCrawler/app/jobs/data_process.py

@@ -15,10 +15,11 @@ class DataProcess:
             export interface Tender { //招标信息
                 no: string; // 招标项目编号
                 title: string; // 招标公告标题
-                provice: string; // 招标单位省份
+                province: string; // 招标单位省份
                 city: string; // 招标单位城市
                 date: string; // 项目开标的时间
                 address: string; // 项目开标的地点
+                budget: string; // 项目预算金额,单位换成元,没有留空
                 release_date: string; // 招标信息的发布时间
                 summary: string; // 100字左右的招标条件,联系方式等内容摘要
                 devices: string; // 只涉及到光谱仪相关的设备,其他无关设备不需要,多个设备以逗号分割 ,例如 红外光谱仪,拉曼光谱仪等
@@ -39,7 +40,7 @@ class DataProcess:
               no: string; // 项目编号
               title: string; // 中标公告标题
               date: string; // 中标公告时间,格式为yyyy-MM-dd 例如:2025-01-01
-              provice: string; // 招标单位省份
+              province: string; // 招标单位省份
               city: string; // 招标单位城市
               summary: string; // 公告摘要信息,100字左右
               instruments: Instrument[]; // 中标设备的信息,关于光谱仪的设备,其他设备不要
@@ -152,10 +153,11 @@ class DataProcess:
                 no=data.get("no"),
                 title=data.get("title"),
                 date=data.get("date"),
-                provice=data.get("provice"),
+                province=data.get("province"),
                 city=data.get("city"),
                 address=data.get("address"),
                 devices=data.get("devices"),
+                budget=data.get("budget"),
                 summary=data.get("summary"),
                 release_date=data.get("release_date"),
                 prompt_tokens=data.get("prompt_tokens"),
@@ -174,7 +176,7 @@ class DataProcess:
             result = ProcessResultData(
                 no=data.get("no"),
                 title=data.get("title"),
-                provice=data.get("provice"),
+                province=data.get("province"),
                 city=data.get("city"),
                 date=data.get("date"),
                 instruments_o=data.get("instruments"),

+ 20 - 21
SourceCode/TenderCrawler/app/jobs/data_send.py

@@ -17,6 +17,7 @@ class DataSend:
         return self._store
 
     def __init__(self, store: IDataStore):
+        self._default_email = utils.get_config_value("email.default_email", "")
         self._store = store
         self._email_area_arr = self.store.query_all_emails()
         self._email_area_virtual_arr = self.store.query_all_virtual_emails()
@@ -45,6 +46,8 @@ class DataSend:
             f"开始发送中标报告邮件,开始日期:{start_date.strftime("%Y-%m-%d")},结束日期:{end_date.strftime("%Y-%m-%d")}"
         )
         email = self.store.query_master_email()
+        if self._default_email and self._default_email not in email:
+            email = email + "," + self._default_email if email else self._default_email
         if not email:
             utils.get_logger().error("没有找到master email")
             return
@@ -61,21 +64,23 @@ class DataSend:
         utils.get_logger().info(f"开始发送邮件,地区为:{item.city} ,URL为 {item.url}")
         email = self._get_email_by_area(item.city)
         if not email:
+            email = ""
             utils.get_logger().error(f"{item.city} 下没有找到email")
             if item.city not in self._error_arr:
                 self._error_arr.append(item.city)
-            return
-        title_prev = utils.get_config_value("email.title_prev", "【招标信息】")
-        body = self._build_email_html(item)
-        flag = utils.send_email(
-            email, f"{title_prev} {item.title}", body, True, item.attach_path
-        )
+        if self._default_email and self._default_email not in email:
+            email = email + "," + self._default_email if email else self._default_email
+        flag = False
+        if item.title:
+            title_prev = utils.get_config_value("email.title_prev", "【招标信息】")
+            body = self._build_email_html(item)
+            flag = utils.send_email(
+                email, f"{title_prev} {item.title}", body, True, item.attach_path
+            )
         if flag:
             self.store.set_send(item.no)
 
-    def _get_email_by_area(
-        self, area: str, count: int = 0, virtual_area: str = None
-    ) -> str:
+    def _get_email_by_area(self, area: str, count: int = 0) -> str:
         email = None
         area_str = (
             area.replace("省", "").replace("市", "").replace("区", "").replace("县", "")
@@ -83,18 +88,11 @@ class DataSend:
         for area_item in self._email_area_arr:
             if area_str in area_item.area:
                 email = area_item.email
-                if virtual_area:
-                    new_area = f"{area_item.area},{virtual_area}"
-                    self.store.update_area_email_area_by_name(area_item.name, new_area)
-                    self._email_area_arr = self.store.query_all_emails()
                 break
-        if not email and count < 3:
+        if not email and count <= 3:
             area_name = self._get_email_by_area_virtual(area_str)
             if area_name:
-                virtual_area = (
-                    f"{area_str},{virtual_area}" if virtual_area else area_str
-                )
-                email = self._get_email_by_area(area_name, count + 1, virtual_area)
+                email = self._get_email_by_area(area_name, count + 1)
         return email
 
     def _get_email_by_area_virtual(self, area: str) -> str:
@@ -156,7 +154,8 @@ class DataSend:
             <div class="container">
                 <h1>{item.title}</h1>
                 <p><strong>招标编号:</strong> {item.no if item.no else ""}</p>
-                <p><strong>项目区域:</strong> {item.provice if item.provice else ""}{item.city if item.city else ""}</p>
+                <p><strong>项目区域:</strong> {item.province if item.province else ""}{item.city if item.city else ""}</p>
+                {f"<p><strong>项目预算:</strong> {item.budget + "元"}</p>" if item.budget else ""}
                 <p><strong>相关设备:</strong> {item.devices if item.devices else ""}</p>
                 <p><strong>开标时间:</strong> {item.date if item.date else ""}</p>
                 <p><strong>开标地点:</strong> {item.address if item.address else ""}</p>
@@ -288,7 +287,7 @@ class DataSend:
         html = f"""
                   <tr>
                       <td rowspan="{row_count}"><a title="点击查看详情" href="{item.url}">{item.title}</a></td>
-                      <td rowspan="{row_count}">{item.provice if item.provice else ''}{item.city if item.city else ''}</td>
+                      <td rowspan="{row_count}">{item.province if item.province else ''}{item.city if item.city else ''}</td>
                         {self._gen_report_body_item_instrument(item.instruments[0] if item.instruments else None)}
                       <td rowspan="{row_count}">{item.date if item.date else ''}</td>
                   </tr>
@@ -341,7 +340,7 @@ class DataSend:
             "项目编号": data.no if data and data.no else "",
             "项目名称": data.title if data and data.title else "",
             "公告日期": data.date if data and data.date else "",
-            "招标省份": data.provice if data and data.provice else "",
+            "招标省份": data.province if data and data.province else "",
             "招标城市": data.city if data and data.city else "",
             "中标单位名称": instrument.company if instrument.company else "",
             "仪器名称": instrument.name if instrument and instrument.name else "",

+ 27 - 8
SourceCode/TenderCrawler/app/main.py

@@ -11,16 +11,31 @@ from jobs.job_runner import JobRunner
 
 
 class Application:
+    """应用程序主类"""
 
     def __init__(self):
+        """Initialize the application.
+
+        Runs the three setup steps in order: logger first (the later steps
+        log through ``self.logger``), then configuration, then signal
+        handlers.
+        """
+        self._init_logger()
+        self._init_config()
+        self._init_signal_handlers()
+
+    def _init_logger(self):
+        """Set ``self.logger`` from ``utils.get_logger()`` and log that logging is ready."""
         self.logger = utils.get_logger()
+        self.logger.info("日志系统初始化完成")
+
+    def _init_config(self):
+        """Initialize run-state fields and load the polling interval from config."""
+        # Flag polled by the main loop (``while self.running``).
         self.running = True
+        # Starts as None; a JobRunner is assigned once the application runs.
         self.job: Optional[JobRunner] = None
-        self.interval = utils.get_config_int("job.sleep_interval", 10)  # 默认10秒
+        # 10 is the fallback handed to get_config_int; the value is used as
+        # the sleep duration (seconds) of the main loop.
+        self.interval = utils.get_config_int("job.sleep_interval", 10)
+        self.logger.info(f"配置加载完成, 任务检查间隔: {self.interval}秒")
 
-        # 注册信号处理
+    def _init_signal_handlers(self):
+        """Route SIGINT and SIGTERM to ``self._handle_shutdown`` and log the registration."""
         signal.signal(signal.SIGINT, self._handle_shutdown)
         signal.signal(signal.SIGTERM, self._handle_shutdown)
+        self.logger.info("信号处理器注册完成")
 
     def _handle_shutdown(self, signum, frame):
         """处理退出信号"""
@@ -65,15 +80,15 @@ class Application:
             self.job = JobRunner()
             self.job.run_job()
 
-            self.logger.info(f"应用程序启动成功! 任务执行检测间隔: {self.interval}秒")
+            self.logger.info(f"应用程序启动成功! 任务执行间隔: {self.interval}秒")
 
             # 主循环
             while self.running:
                 try:
+                    schedule.run_pending()
                     now = datetime.datetime.now()
                     self._check_reload(now)
                     time.sleep(self.interval)
-                    schedule.run_pending()
                 except Exception as e:
                     self.logger.error(f"主循环执行异常: {e}")
                     time.sleep(self.interval)
@@ -82,10 +97,14 @@ class Application:
             self.logger.error(f"应用程序运行异常: {e}")
             sys.exit(1)
         finally:
-            self.logger.info("应用程序正在关闭...")
-            if self.job:
-                self.job.stop_job()
-            self.logger.info("应用程序已关闭")
+            self._cleanup()
+
+    def _cleanup(self):
+        """Log shutdown progress and stop the job runner if one was created.
+
+        Called from the ``finally`` clause of the run routine, so it executes
+        on both normal exit and error exit.
+        """
+        self.logger.info("应用程序正在关闭...")
+        # self.job is still None when startup failed before JobRunner() was assigned.
+        if self.job:
+            self.job.stop_job()
+        self.logger.info("应用程序已关闭")
 
 
 if __name__ == "__main__":

+ 14 - 10
SourceCode/TenderCrawler/app/models/process_data.py

@@ -13,9 +13,10 @@ class ProcessData:
         url=None,
         keyword=None,
         date=None,
-        provice=None,
+        province=None,
         city=None,
         address=None,
+        budget=None,
         summary=None,
         release_date=None,
         devices=None,
@@ -33,13 +34,14 @@ class ProcessData:
         self.title = title
         self.url = url
         self.date = date
-        self.provice = provice.replace("省", "").replace("市", "") if provice else ""
+        self.province = province.replace("省", "").replace("市", "") if province else ""
         self.city = (
             city.replace("市", "").replace("区", "").replace("县", "") if city else ""
         )
-        if self.provice == self.city:
-            self.provice = ""
+        if self.province == self.city:
+            self.province = ""
         self.keyword = keyword
+        self.budget = budget
         self.address = address
         self.summary = summary
         self.release_date = release_date
@@ -57,14 +59,14 @@ class ProcessData:
     def __repr__(self):
+        """Return a debug string of the record's main fields.
+
+        Fields such as ``url``, ``keyword`` and ``budget`` are not part of
+        the output.
+        """
         return (
             f"ProcessData(no={self.no}, title={self.title}, date={self.date}, "
-            f"provice={self.provice},city={self.city}, address={self.address}, summary={self.summary}, "
+            f"province={self.province},city={self.city}, address={self.address}, summary={self.summary}, "
             f"status={self.status}, create_time={self.create_time}, "
             f"send_time={self.send_time}, remark={self.remark})"
         )
 
     _insert_query = """
-              INSERT IGNORE INTO t_data (no, title, url, keyword, date, provice, city, address, summary, release_date, devices, attach_path, status, create_time, prompt_tokens, completion_tokens, total_tokens)
-              VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+              INSERT IGNORE INTO t_data (no, title, url, keyword, date, province, city, address, budget, summary, release_date, devices, attach_path, status, create_time, prompt_tokens, completion_tokens, total_tokens)
+              VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
           """
 
     # _update_query = """
@@ -80,9 +82,10 @@ class ProcessData:
             process_data.url,
             process_data.keyword,
             process_data.date,
-            process_data.provice,
+            process_data.province,
             process_data.city,
             process_data.address,
+            process_data.budget,
             process_data.summary,
             process_data.release_date,
             process_data.devices,
@@ -115,9 +118,10 @@ class ProcessData:
                 process_data.url,
                 process_data.keyword,
                 process_data.date,
-                process_data.provice,
+                process_data.province,
                 process_data.city,
                 process_data.address,
+                process_data.budget,
                 process_data.summary,
                 process_data.release_date,
                 process_data.devices,
@@ -177,7 +181,7 @@ class ProcessData:
             )
             return data
 
-    _not_send_query = "SELECT no, title, url, keyword, devices,date, city, address, summary, attach_path, release_date FROM t_data WHERE status = 0"
+    # Selects the unsent rows (status = 0) of t_data.
+    # NOTE(review): `city` is selected but `province` is not; if the rows are
+    # turned into ProcessData objects, `province` would presumably stay empty
+    # for them - confirm against fetch_not_send and the e-mail template.
+    # NOTE(review): inserting `budget` shifts the positions of summary,
+    # attach_path and release_date; verify any positional row mapping.
+    _not_send_query = "SELECT no, title, url, keyword, devices,date, city, address, budget, summary, attach_path, release_date FROM t_data WHERE status = 0"
 
     def fetch_not_send(self):
         with MySQLHelper() as db_helper:

+ 42 - 52
SourceCode/TenderCrawler/app/models/process_result_data.py

@@ -6,6 +6,7 @@ from utils.mysql_helper import MySQLHelper
 
 
 class InstrumentData:
+
     def __init__(
         self,
         company: str,
@@ -34,7 +35,7 @@ class ProcessResultData:
         title=None,
         url=None,
         keyword=None,
-        provice=None,
+        province=None,
         city=None,
         date=None,
         instruments=None,
@@ -58,12 +59,12 @@ class ProcessResultData:
         self.instruments_str = ""
         self.instruments = []
         self.set_instruments(instruments, instruments_o)
-        self.provice = provice.replace("省", "").replace("市", "") if provice else ""
-        self.city = (
-            city.replace("市", "").replace("区", "").replace("县", "") if city else ""
-        )
-        if self.provice == self.city:
-            self.provice = ""
+        self.province = province.replace("省", "").replace(
+            "市", "") if province else ""
+        self.city = (city.replace("市", "").replace("区", "").replace("县", "")
+                     if city else "")
+        if self.province == self.city:
+            self.province = ""
         self.summary = summary
         self.attach_path = attach_path
         self.status = status
@@ -78,10 +79,9 @@ class ProcessResultData:
     def __repr__(self):
+        """Return a debug string of the record's main fields.
+
+        Instruments are shown via their serialized form ``instruments_str``.
+        """
         return (
             f"ProcessResultData(no={self.no}, title={self.title}, date={self.date}, "
-            f"keyword={self.keyword}, provice={self.provice},city={self.city},instruments={self.instruments_str} summary={self.summary}, attach_path={self.attach_path}, "
+            f"keyword={self.keyword}, province={self.province},city={self.city},instruments={self.instruments_str} summary={self.summary}, attach_path={self.attach_path}, "
             f"status={self.status}, create_time={self.create_time}, "
-            f"send_time={self.send_time}, remark={self.remark})"
-        )
+            f"send_time={self.send_time}, remark={self.remark})")
 
     def set_instruments(self, instruments_str: str, instruments):
         if instruments is None:
@@ -94,19 +94,16 @@ class ProcessResultData:
             ]
         else:
             self.instruments = instruments or []
-            self.instruments_str = (
-                json.dumps(
-                    instruments,
-                    ensure_ascii=False,
-                )
-                if len(instruments) > 0
-                else ""
-            )
+            self.instruments_str = (json.dumps(
+                instruments,
+                ensure_ascii=False,
+            ) if len(instruments) > 0 else "")
 
     _insert_query = """
-              INSERT IGNORE INTO t_data_result (no, title, url, keyword, date, provice, city, instruments, summary, attach_path, status, create_time, prompt_tokens, completion_tokens, total_tokens)
+              INSERT IGNORE INTO t_data_result (no, title, url, keyword, date, province, city, instruments, summary, attach_path, status, create_time, prompt_tokens, completion_tokens, total_tokens)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
           """
+
     # _update_query = """
     #             UPDATE t_collect_data SET status = 1 WHERE url = %s;
     #         """
@@ -121,7 +118,7 @@ class ProcessResultData:
             process_result_data.url,
             process_result_data.keyword,
             process_result_data.date,
-            process_result_data.provice,
+            process_result_data.province,
             process_result_data.city,
             process_result_data.instruments_str,
             process_result_data.summary,
@@ -142,33 +139,28 @@ class ProcessResultData:
 
     def insert_batch(self, process_result_data_list):
         if not all(
-            isinstance(process_result_data, self.__class__)
-            for process_result_data in process_result_data_list
-        ):
+                isinstance(process_result_data, self.__class__)
+                for process_result_data in process_result_data_list):
             raise TypeError(
-                "process_result_data_list 中的所有元素必须是 ProcessResultData 的实例"
-            )
+                "process_result_data_list 中的所有元素必须是 ProcessResultData 的实例")
 
-        insert_params = [
-            (
-                process_result_data.no,
-                process_result_data.title,
-                process_result_data.url,
-                process_result_data.keyword,
-                process_result_data.date,
-                process_result_data.provice,
-                process_result_data.city,
-                process_result_data.instruments_str,
-                process_result_data.summary,
-                process_result_data.attach_path,
-                0,
-                datetime.now(),
-                process_result_data.prompt_tokens,
-                process_result_data.completion_tokens,
-                process_result_data.total_tokens,
-            )
-            for process_result_data in process_result_data_list
-        ]
+        insert_params = [(
+            process_result_data.no,
+            process_result_data.title,
+            process_result_data.url,
+            process_result_data.keyword,
+            process_result_data.date,
+            process_result_data.province,
+            process_result_data.city,
+            process_result_data.instruments_str,
+            process_result_data.summary,
+            process_result_data.attach_path,
+            0,
+            datetime.now(),
+            process_result_data.prompt_tokens,
+            process_result_data.completion_tokens,
+            process_result_data.total_tokens,
+        ) for process_result_data in process_result_data_list]
 
         # update_params = [(process_result_data.url, )
         #                  for process_result_data in process_result_data_list]
@@ -187,7 +179,7 @@ class ProcessResultData:
 
     def fetch_one_process_by_url(self, url: str):
         with MySQLHelper() as db_helper:
-            result = db_helper.fetch_one(self._one_url_query, (url,))
+            result = db_helper.fetch_one(self._one_url_query, (url, ))
             if not result:
                 return None
             data = ProcessResultData(
@@ -204,7 +196,7 @@ class ProcessResultData:
 
     def fetch_one_process_by_no(self, no: str):
         with MySQLHelper() as db_helper:
-            result = db_helper.fetch_one(self._one_no_query, (no,))
+            result = db_helper.fetch_one(self._one_no_query, (no, ))
             if not result:
                 return None
             data = ProcessResultData(
@@ -215,7 +207,7 @@ class ProcessResultData:
             )
             return data
 
-    _not_send_query = "SELECT no, title, url, keyword, date, provice, city, instruments, summary, attach_path, status, create_time, send_time FROM t_data_result WHERE status = 0"
+    _not_send_query = "SELECT no, title, url, keyword, date, province, city, instruments, summary, attach_path, status, create_time, send_time FROM t_data_result WHERE status = 0"
 
     def fetch_not_send(self):
         with MySQLHelper() as db_helper:
@@ -265,10 +257,8 @@ class ProcessResultData:
         :return:
         """
         with MySQLHelper() as db_helper:
-            params = (date,)
+            params = (date, )
             db_helper.execute_non_query(self._delete_before_date_query, params)
             affected_rows = db_helper.connection.affected_rows()
-            utils.get_logger().info(
-                f"删除 {date} 之前共 {affected_rows} 条 中标处理记录。"
-            )
+            utils.get_logger().info(f"删除 {date} 之前共 {affected_rows} 条 中标处理记录。")
             return affected_rows

+ 1 - 1
SourceCode/TenderCrawler/docker-compose.yml

@@ -56,7 +56,7 @@ services:
       #      - APP_AI__KEY=
       #      - APP_AI__URL=http://192.168.0.109:7580/api/chat
       #      - APP_AI__MODEL=qwen2.5:7b
-      - APP_LOGGER__LEVEL=INFO
+      - APP_LOGGER__LEVEL=DEBUG
       - APP_JOB__COLLECT=20:00,12:00
       - APP_JOB__PROCESS=23:00,4:00,13:00
       - APP_JOB__SEND_EMAIL=08:20,14:00

+ 17 - 12
SourceCode/TenderCrawler/init.sql

@@ -38,11 +38,11 @@ CREATE TABLE `t_area_email`  (
   PRIMARY KEY (`name`) USING BTREE
 ) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
 
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('master', 'master', 'chancelot@foxmail.com,349977741@qq.com', 0,1, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('all', '全国', 'chancelot@foxmail.com,349977741@qq.com', 0,1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('master', 'master', '349977741@qq.com', 0,1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('all', '全国', '349977741@qq.com', 0,1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('张志琼', '黑龙江,吉林,辽宁', 'zhiqiong.zhang@bruker.com', 0,0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('王双', '河北,济南,山东德州', 'shuang.wang@bruker.com', 0,0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('范国春', '山东', 'guochun.fan@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('王双', '河北,山东_02', 'shuang.wang@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('范国春', '山东_01', 'guochun.fan@bruker.com', 0,0, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('尚祖俭', '天津', 'zujian.shang@bruker.com', 0,0, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('赵跃', '北京', 'yue.zhao@bruker.com', 0,0, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('张景灿', '陕西,新疆,宁夏,青海', 'jingcan.zhang@bruker.com', 0,0, NULL);
@@ -54,10 +54,10 @@ INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`,
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('邬歆', '安徽,香港,澳门', 'xin.wu@bruker.com', 0,0, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('冯新宝', '湖北,湖南', 'xinbao.feng@bruker.com', 0,0, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('耿朝曦', '江西,贵州', 'zhaoxi.geng@bruker.com', 0,0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('李华斌', '广西,深圳', 'huabin.li@bruker.com', 0,0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('吕万明', '海南,广州,中山', 'wanming.lv@bruker.com', 0,0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('许建光', '西藏,云南,广东', 'jianguang.xu@bruker.com', 0,0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('全国', '北京,天津,上海,重庆,河北,山西,黑龙江,吉林,辽宁,江苏,浙江,安徽,福建,江西,山东,河南,湖北,湖南,广东,海南,四川,贵州,云南,陕西,甘肃,青海,台湾,内蒙古,广西,西藏,宁夏,新疆,香港,澳门', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('许建光', '西藏,云南,广东_01', 'jianguang.xu@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('李华斌', '广西,广东_02', 'huabin.li@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('吕万明', '海南,广东_03', 'wanming.lv@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('全国', '北京,天津,上海,重庆,河北,山西,黑龙江,吉林,辽宁,江苏,浙江,安徽,福建,江西,山东_01,山东_02,河南,湖北,湖南,广东_01,广东_02,广东_03,海南,四川,贵州,云南,陕西,甘肃,青海,台湾,内蒙古,广西,西藏,宁夏,新疆,香港,澳门', '', 1, 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('北京', '东城区,西城区,朝阳区,丰台区,石景山区,海淀区,门头沟区,房山区,通州区,顺义区,大兴区,昌平区,平谷区,怀柔区,密云区,延庆区', '', 1, 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('上海', '黄浦区,徐汇区,长宁区,静安区,普陀区,虹口区,杨浦区,宝山区,闵行区,嘉定区,浦东新区,金山区,松江区,青浦区,奉贤区,崇明区', '', 1, 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('天津', '和平区,河东区,河西区,南开区,河北区,红桥区,东丽区,西青区,北辰区,武清区,宝坻区,滨海新区,宁河区,静海区,蓟州区', '', 1, 1, NULL);
@@ -74,7 +74,8 @@ INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`,
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('河北', '石家庄,唐山,秦皇岛,邯郸,邢台,保定,张家口,承德,沧州,廊坊,衡水', '', 1, 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('河南', '郑州,开封,洛阳,平顶山,安阳,鹤壁,新乡,焦作,濮阳,许昌,漯河,三门峡,南阳,商丘,周口,驻马店,济源示范区', '', 1, 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('甘肃', '兰州,嘉峪关,金昌,白银,天水,武威,张掖,平凉,酒泉,庆阳,定西,陇南,临夏回族自治州,甘南藏族自治州', '', 1, 1, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('山东', '青岛,淄博,枣庄,东营,烟台,潍坊,济宁,泰安,威海,日照,莱州,临沂,聊城,滨州,菏泽', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('山东_01', '青岛,淄博,枣庄,东营,烟台,潍坊,济宁,泰安,威海,日照,莱州,临沂,聊城,滨州,菏泽', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('山东_02', '济南,德州', '', 1, 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('内蒙古', '呼和浩特,包头,乌海,赤峰,通辽,鄂尔多斯,呼伦贝尔,巴彦淖尔,乌兰察布,兴安盟,锡林郭勒盟,阿拉善盟', '', 1, 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('浙江', '杭州,宁波,温州,嘉兴,湖州,绍兴,金华,衢州,舟山,台州,丽水', '', 1, 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('福建', '福州,厦门,莆田,三明,泉州,漳州,南平,龙岩,宁德', '', 1, 1, NULL);
@@ -85,12 +86,15 @@ INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`,
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('贵州', '贵阳,六盘水,遵义,安顺,毕节,铜仁,黔东南苗族侗族自治州,黔南布依族苗族自治州,黔西南布依族苗族自治州', '', 1, 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('广西', '南宁,柳州,桂林,梧州,北海,防城港,钦州,贵港,玉林,百色,贺州,河池,来宾,崇左', '', 1, 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('海南', '海口,三亚,三沙,儋州,琼海,文昌,万宁,东方,澄迈,定安,屯昌,临高,白沙黎族自治县,昌江黎族自治县,乐东黎族自治县,陵水黎族自治县,保亭黎族苗族自治县,琼中黎族苗族自治县', '', 1, 1, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('广东', '珠海,汕头,佛山,韶关,湛江,肇庆,江门,茂名,惠州,梅州,汕尾,河源,阳江,清远,东莞,潮州,揭阳,云浮', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('广东_01', '珠海,汕头,佛山,韶关,湛江,肇庆,江门,茂名,惠州,梅州,汕尾,河源,阳江,清远,东莞,潮州,揭阳,云浮', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('广东_02', '深圳', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('广东_03', '广州,中山', '', 1, 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('西藏', '拉萨,日喀则,昌都,林芝,山南,那曲,阿里地区', '', 1, 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('云南', '昆明,曲靖,玉溪,保山,昭通,丽江,普洱,临沧,红河哈尼族彝族自治州,文山壮族苗族自治州,西双版纳傣族自治州,大理白族自治州,德宏傣族景颇族自治州,怒江傈僳族自治州,迪庆藏族自治州', '', 1, 1, NULL);
 
 
 
+
 -- ----------------------------
 -- Table structure for t_collect_data
 -- ----------------------------
@@ -133,9 +137,10 @@ CREATE TABLE `t_data`  (
   `no` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标编号',
   `title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标标题',
   `date` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标时间',
-  `provice` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标单位省份',
+  `province` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标单位省份',
   `city` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标单位城市',
   `address` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '详细地点',
+  `budget` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '预算金额(单位:元)',
   `summary` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL COMMENT '招标摘要',
   `release_date` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '发布时间',
   `devices` varchar(1000) NULL DEFAULT NULL COMMENT '相关设备',
@@ -161,7 +166,7 @@ CREATE TABLE `t_data_result`  (
   `no` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标编号',
   `title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标标题',
   `date` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '公告时间',
-  `provice` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标单位省份',
+  `province` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标单位省份',
   `city` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标单位城市',
   `instruments` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '相关设备仪器',
   `summary` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL COMMENT '公告摘要',