
Add ccgp adapter

YueYunyun 6 months ago
parent commit 56cab926d9
25 changed files with 996 additions and 410 deletions
  1. 1 0
      .gitignore
  2. 191 0
      SourceCode/TenderCrawler/app/adapters/ccgp_data_collection_adapter.py
  3. 92 102
      SourceCode/TenderCrawler/app/adapters/chinabidding_data_collection_adapter.py
  4. 83 43
      SourceCode/TenderCrawler/app/adapters/data_collection_adapter_interface.py
  5. 31 12
      SourceCode/TenderCrawler/app/config.yml
  6. 12 3
      SourceCode/TenderCrawler/app/drivers/driver_creator.py
  7. 1 8
      SourceCode/TenderCrawler/app/main.py
  8. 18 22
      SourceCode/TenderCrawler/app/main/data_collector.py
  9. 1 1
      SourceCode/TenderCrawler/app/main/data_send.py
  10. 10 11
      SourceCode/TenderCrawler/app/main/runner.py
  11. 101 25
      SourceCode/TenderCrawler/app/models/collect_data.py
  12. 38 34
      SourceCode/TenderCrawler/app/models/process_data.py
  13. 11 6
      SourceCode/TenderCrawler/app/models/url_setting.py
  14. 6 5
      SourceCode/TenderCrawler/app/stores/data_store_interface.py
  15. 8 5
      SourceCode/TenderCrawler/app/stores/default_data_store.py
  16. 9 17
      SourceCode/TenderCrawler/app/stores/mysql_data_store.py
  17. 75 40
      SourceCode/TenderCrawler/app/utils/ai_helper.py
  18. 17 6
      SourceCode/TenderCrawler/app/utils/config_helper.py
  19. 53 17
      SourceCode/TenderCrawler/app/utils/email_helper.py
  20. 67 0
      SourceCode/TenderCrawler/app/utils/file_helper.py
  21. 6 5
      SourceCode/TenderCrawler/app/utils/logger_helper.py
  22. 77 0
      SourceCode/TenderCrawler/app/utils/string_helper.py
  23. 30 28
      SourceCode/TenderCrawler/docker-compose.yml
  24. 57 20
      SourceCode/TenderCrawler/init.sql
  25. 1 0
      SourceCode/TenderCrawler/requirements.txt

+ 1 - 0
.gitignore

@@ -161,3 +161,4 @@ cython_debug/
 .vscode/
 .dev/
 logs/
+attaches/

+ 191 - 0
SourceCode/TenderCrawler/app/adapters/ccgp_data_collection_adapter.py

@@ -0,0 +1,191 @@
+from time import sleep
+
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as ec
+from selenium.common.exceptions import TimeoutException, NoSuchElementException
+
+from stores.data_store_interface import IDataStore
+from adapters.data_collection_adapter_interface import IDataCollectionAdapter
+from utils.file_helper import FileHelper
+
+
+class CcgpDataCollectionAdapter(IDataCollectionAdapter):
+    """
+    中国政府采购网数据采集适配器
+    """
+    file_helper = FileHelper()
+
+    def __init__(self, url: str, store: IDataStore = None):
+        self._url = url
+        self._store = store
+        self._driver = None
+        self._keyword = None
+        self._adapter_type = "ccgp"
+
+    def login(self, username: str, password: str) -> None:
+        pass
+
+    def collect(self, keyword: str, store: IDataStore):
+        if store:
+            self._store = store
+        self._keyword = keyword
+        items = self._search(keyword)
+        self._process_list(items)
+        if self.config.get_bool(self.batch_save_key):
+            self.store.save_collect_data(True)
+
+    def _search(self, keyword: str) -> list:
+        try:
+            if not keyword:
+                raise Exception("搜索关键字不能为空")
+            wait = WebDriverWait(self.driver, 10, 1)
+            wait.until(
+                ec.presence_of_element_located((By.ID, "searchForm")))
+            search_el = self.driver.find_element(By.ID, "kw")
+            sleep(2)
+            search_el.clear()
+            search_el.send_keys(keyword)
+            search_btn = self.driver.find_element(
+                By.XPATH, "//form[@id='searchForm']/input[@id='doSearch2']")
+            sleep(1)
+            search_btn.click()
+            wait.until(ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result")))
+            default_search_txt = "近一周"
+            search_txt = self.config.get(self.search_day_key, default_search_txt)
+            self.logger.info(f"搜索关键字: {keyword},搜索条件: {search_txt}")
+            if search_txt != default_search_txt:
+                last_els = self.driver.find_elements(By.XPATH, "//ul[@id='datesel']/li")
+                for last_el in last_els:
+                    if search_txt == last_el.text:
+                        sleep(1)
+                        last_el.click()
+                        break
+                wait.until(ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result")))
+            else:
+                sleep(1)
+
+
+            # try:
+            #     a_links = self.driver.find_elements(
+            #         By.XPATH, "//form[@id='pagerSubmitForm']/a")
+            #     count = len(a_links)
+            #     if count > 1:
+            #         count = count - 1
+            #     self.logger.info(f"共查询到 {count} 页")
+            # except Exception as e:
+            #     self.logger.error(f"搜索失败[尝试查询页数]: {e}")
+            items = self.driver.find_elements(By.XPATH,
+                                         "//ul[@class='vT-srch-result-list-bid']/li/a")
+            return items
+        except TimeoutException as e:
+            raise Exception(f"搜索失败 [超时]: {e}")
+        except NoSuchElementException as e:
+            raise Exception(f"搜索失败 [找不到元素]: {e}")
+
+    def _process_list(self, items: list) -> list:
+        if not items:
+            return []
+        for item in items:
+            self._process_item(item)
+        sleep(2)
+        next_items = self._next_page()
+        return self._process_list(next_items)
+
+    def _next_page(self) -> list:
+        try:
+            wait = WebDriverWait(self.driver, 10, 1)
+            next_path = "//div[@class='vT-srch-result-list']/p/a[@class='next']"
+            wait.until(ec.presence_of_element_located((By.XPATH, next_path)))
+            btn = self.driver.find_element(By.XPATH, next_path)
+            btn.click()
+            self.logger.info(f"跳转到下页: {self.driver.current_url}")
+            sleep(5)
+            wait.until(ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result")))
+            items = self.driver.find_elements(By.XPATH,
+                                         "//ul[@class='vT-srch-result-list-bid']/li/a")
+            return items
+        except NoSuchElementException as e:
+            raise Exception(f"翻页失败 [找不到元素]: {e}")
+        except TimeoutException:
+            self.logger.info("翻页结束")
+            return []
+
+    def _process_item(self, item):
+        main_handle = self.driver.current_window_handle
+        wait = WebDriverWait(self.driver, 10, 1)
+        close = True
+        try:
+            url = item.get_attribute('href')
+            if self._check_is_collect_by_url(url):
+                close = False
+                return
+            self.logger.info(f"跳转详情")
+            sleep(1)
+            item.click()
+            wait.until(ec.number_of_windows_to_be(2))
+            handles = self.driver.window_handles
+            for handle in handles:
+                if handle != main_handle:
+                    self.driver.switch_to.window(handle)
+                    break
+            wait.until(ec.presence_of_element_located((By.TAG_NAME, "body")))
+            # 中标/成交/终止公告直接标记为无效数据
+            if self._check_type("中标公告") or self._check_type("成交公告") or self._check_type("终止公告"):
+                self._save_db(url, "", is_invalid=True)
+                return
+            content = self.driver.find_element(By.XPATH, "//div[@class='vF_deail_maincontent']").text
+            if self._check_content(content):
+                paths = []
+
+                attach_els = self.driver.find_elements(By.XPATH, "//td[@class='bid_attachtab_content']/a")
+                attach_2_els = self.driver.find_elements(By.XPATH, "//a[@ignore='1']")
+
+                # 合并两个列表
+                all_attachments = attach_els + attach_2_els
+                attach_urls = []
+                if len(all_attachments) > 0:
+                    for attach_el in all_attachments:
+                        attach_url = attach_el.get_attribute('href')
+                        if attach_url not in attach_urls:
+                            attach_urls.append(attach_url)
+                        else:
+                            self.logger.info(f"重复附件: {attach_url}")
+                            continue
+                        file_name = attach_el.text or attach_el.get_attribute('download') or attach_url.split('/')[-1]
+                        if not file_name:
+                            continue
+                        # 检查 file_name 是否包含文件扩展名
+                        if '.' not in file_name:
+                            self.logger.warning(f"文件名 {file_name} 不包含扩展名,跳过下载。")
+                            continue
+                        path = self.file_helper.download_remote_file(attach_url, file_name)
+                        if path:
+                            paths.append(path)
+                attach_str = ",".join(paths)
+                self._save_db(url, content, attach_str)
+            else:
+                self._save_db(url, content, is_invalid=True)
+        except TimeoutException as e:
+            self.logger.error(
+                f"采集发生异常 Timeout: {self.driver.current_url}。Exception: {e}")
+        except NoSuchElementException as e:
+            self.logger.error(
+                f"采集发生异常 NoSuchElement: {self.driver.current_url}。Exception: {e}")
+            raise Exception(f"采集失败 [找不到元素]: {e}")
+        finally:
+            if close:
+                sleep(1)
+                self.driver.close()
+                self.driver.switch_to.window(main_handle)
+
+    def _check_type(self, type_str: str) -> bool:
+        links = self.driver.find_elements(By.LINK_TEXT, type_str)
+        if len(links) > 0:
+            self.logger.info(f"{type_str},跳过")
+            return True
+        return False
+
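Detail pages open in a new tab, so _process_item records the results-page handle and always switches back in finally (closing the tab only when one was actually opened, tracked by the close flag). The core of that pattern in isolation, assuming a live Selenium driver and a clickable item:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.support import expected_conditions as ec

    def scrape_detail_text(driver, item) -> str:
        """Open a result in its new tab, read the page text, return to the list."""
        main_handle = driver.current_window_handle
        item.click()
        WebDriverWait(driver, 10, 1).until(ec.number_of_windows_to_be(2))
        detail = next(h for h in driver.window_handles if h != main_handle)
        driver.switch_to.window(detail)
        try:
            WebDriverWait(driver, 10, 1).until(
                ec.presence_of_element_located((By.TAG_NAME, "body")))
            return driver.find_element(By.TAG_NAME, "body").text
        finally:
            driver.close()  # close only the detail tab
            driver.switch_to.window(main_handle)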

+ 92 - 102
SourceCode/TenderCrawler/app/adapters/chinabidding_data_collection_adapter.py

@@ -1,16 +1,13 @@
 from time import sleep
 
-from selenium import webdriver
+
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support import expected_conditions as ec
 from selenium.common.exceptions import TimeoutException, NoSuchElementException
 
-from drivers.driver_creator import DriverCreator
 from stores.data_store_interface import IDataStore
 from adapters.data_collection_adapter_interface import IDataCollectionAdapter
-from utils.logger_helper import LoggerHelper
-from utils.config_helper import ConfigHelper
 
 
 class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
@@ -18,50 +15,45 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
     中国招标网数据采集适配器
     """
 
-    logger = LoggerHelper.get_logger()
 
-    def __init__(self, url: str):
+    def __init__(self, url: str,store:IDataStore=None):
         self._url = url
-        self._store = None
+        self._store = store
         self._driver = None
         self._keyword = None
-
-    @property
-    def store(self) -> IDataStore:
-        return self._store
-
-    @property
-    def url(self):
-        return self._url
-
-    @property
-    def keyword(self):
-        return self._keyword
-
-    @property
-    def driver(self):
-        if not self._driver:
-            self._driver = self.create_driver()
-        return self._driver
-
-    def create_driver(self) -> webdriver:
+        self._adapter_type = "chinabidding"
+
+    # @property
+    # def store(self) -> IDataStore:
+    #     return self._store
+    #
+    # @property
+    # def url(self):
+    #     return self._url
+    #
+    # @property
+    # def keyword(self):
+    #     return self._keyword
+    #
+    # @property
+    # def driver(self)->webdriver:
+    #     if not self._driver:
+    #         self._driver = self._create_driver()
+    #     return self._driver
+
+    def login(self, username: str, password: str) -> None:
         try:
-            return DriverCreator().gen_remote_driver(self.url)
-        except Exception as e:
-            raise Exception(f"创建驱动器失败: {e}")
-
-    def login(self, driver, username: str, password: str) -> None:
-        try:
-            login_el = driver.find_element(
+            login_el = self.driver.find_element(
                 By.XPATH, "//div[@id='loginRight']/a[@class='login']")
             login_el.click()
-            wait = WebDriverWait(driver, 10, 1)
+            wait = WebDriverWait(self.driver, 10, 1)
             wait.until(ec.presence_of_element_located((By.ID, "userpass")))
-            un_el = driver.find_element(By.ID, "username")
+            un_el = self.driver.find_element(By.ID, "username")
             un_el.send_keys(username)
-            pass_el = driver.find_element(By.ID, "userpass")
+            pass_el = self.driver.find_element(By.ID, "userpass")
             pass_el.send_keys(password)
-            login_btn = driver.find_element(By.ID, "login-button")
+            login_btn = self.driver.find_element(By.ID, "login-button")
             login_btn.click()
             wait.until(ec.presence_of_element_located((By.ID, "site-content")))
         except TimeoutException as e:
@@ -69,28 +61,39 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
         except NoSuchElementException as e:
             raise Exception(f"登录失败 [找不到元素]: {e}")
 
-    def search(self, driver, keyword: str) -> list:
+
+    def collect(self, keyword: str, store: IDataStore):
+        if store:
+            self._store = store
+        self._keyword = keyword
+        items = self._search(keyword)
+        self._process_list(items)
+        if self.config.get_bool(self.batch_save_key):
+            self.store.save_collect_data(True)
+
+    def _search(self, keyword: str) -> list:
         try:
-            self._keyword = keyword
-            wait = WebDriverWait(driver, 10, 1)
+            wait = WebDriverWait(self.driver, 10, 1)
             wait.until(
                 ec.presence_of_element_located((By.ID, "projSearchForm")))
-            search_el = driver.find_element(By.ID, "fullText")
+            search_el = self.driver.find_element(By.ID, "fullText")
+            search_el.send_keys("")
             search_el.send_keys(keyword)
-            search_btn = driver.find_element(
+            search_btn = self.driver.find_element(
                 By.XPATH, "//form[@id='projSearchForm']/button")
             search_btn.click()
             wait.until(ec.presence_of_element_located((By.ID, "site-content")))
-            # 查询3天内的数据
-            search_txt = ConfigHelper().get("adapter.chinabidding.search_day")
-            if not search_txt:
-                search_txt = "近三天"
+            default_search_txt = "近3日"
+            search_txt = self.config.get(self.search_day_key, default_search_txt)
             self.logger.info(f"搜索关键字: {keyword},搜索条件: {search_txt}")
-            last_el = driver.find_element(By.LINK_TEXT, search_txt)
-            last_el.click()
-            wait.until(ec.presence_of_element_located((By.ID, "site-content")))
+            if search_txt != default_search_txt:
+                last_el = self.driver.find_element(By.LINK_TEXT, search_txt)
+                last_el.click()
+                wait.until(ec.presence_of_element_located((By.ID, "site-content")))
+            else:
+                sleep(1)
             try:
-                a_links = driver.find_elements(
+                a_links = self.driver.find_elements(
                     By.XPATH, "//form[@id='pagerSubmitForm']/a")
                 count = len(a_links)
                 if count > 1:
@@ -98,30 +101,33 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
                 self.logger.info(f"共查询到 {count} 页")
             except Exception as e:
                 self.logger.error(f"搜索失败[尝试查询页数]: {e}")
-            items = driver.find_elements(By.XPATH,
-                                         "//ul[@class='as-pager-body']/li/a")
+            items = self.driver.find_elements(By.XPATH,
+                                              "//ul[@class='as-pager-body']/li/a")
             return items
         except TimeoutException as e:
             raise Exception(f"搜索失败 [超时]: {e}")
         except NoSuchElementException as e:
             raise Exception(f"搜索失败 [找不到元素]: {e}")
 
-    def collect(self, driver, items: list, store: IDataStore) :
-        if store:
-            self._store = store
-        self._process_list(driver, items)
-        self.store.save_collect_data(True)
+    def _process_list(self, items: list) -> list:
+        if not items:
+            return []
+        for item in items:
+            self._process_item(item)
+        sleep(2)
+        next_items = self._next_page()
+        return self._process_list(next_items)
 
-    def _next_page(self, driver) -> list:
+    def _next_page(self) -> list:
         try:
-            wait = WebDriverWait(driver, 10, 1)
+            wait = WebDriverWait(self.driver, 10, 1)
             next_path = "//form[@id='pagerSubmitForm']/a[@class='next']"
             wait.until(ec.presence_of_element_located((By.XPATH, next_path)))
-            btn = driver.find_element(By.XPATH, next_path)
+            btn = self.driver.find_element(By.XPATH, next_path)
             btn.click()
-            self.logger.info(f"跳转到下页: {driver.current_url}")
+            self.logger.info(f"跳转到下页: {self.driver.current_url}")
             wait.until(ec.presence_of_element_located((By.ID, "site-content")))
-            items = driver.find_elements(By.XPATH,
+            items = self.driver.find_elements(By.XPATH,
                                          "//ul[@class='as-pager-body']/li/a")
             return items
         except NoSuchElementException as e:
@@ -130,60 +136,44 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
             self.logger.info("翻页结束")
             return []
 
-    def _process_item(self, driver, item):
-        current_handle = driver.current_window_handle
+    def _process_item(self, item):
+        main_handle = self.driver.current_window_handle
+        close = True
         try:
             url = item.get_attribute('href')
-            old = self.store.query_one_collect_by_url(url)
-            if old:
-                self.logger.info(f"已采集过: {url}")
+            if self._check_is_collect_by_url(url):
+                close = False
                 return
             item.click()
-            wait = WebDriverWait(driver, 10, 1)
+            wait = WebDriverWait(self.driver, 10, 1)
             wait.until(ec.number_of_windows_to_be(2))
-            handles = driver.window_handles
+            handles = self.driver.window_handles
             for handle in handles:
-                if handle != current_handle:
-                    driver.switch_to.window(handle)
+                if handle != main_handle:
+                    self.driver.switch_to.window(handle)
                     break
-            url = driver.current_url
-            self.logger.info(f"跳转详情: {driver.current_url}")
+            url = self.driver.current_url
+            self.logger.info(f"跳转详情")
             wait.until(ec.presence_of_element_located((By.TAG_NAME, "body")))
-            content = driver.find_element(By.TAG_NAME, "body").text
-            self._save(url, content)
-            sleep(1)
-            driver.close()
-            sleep(2)
+            content = self.driver.find_element(By.TAG_NAME, "body").text
+            if self._check_content(content):
+                self._save_db(url, content)
+            else:
+                self._save_db(url, content, is_invalid=True)
+
         except TimeoutException as e:
             self.logger.error(
-                f"采集发生异常 Timeout: {driver.current_url}。Exception: {e}")
+                f"采集发生异常 Timeout: {self.driver.current_url}。Exception: {e}")
             # raise Exception(f"采集失败 [超时]: {e}")
         except NoSuchElementException as e:
             self.logger.error(
-                f"采集发生异常 NoSuchElement: {driver.current_url}。Exception: {e}")
+                f"采集发生异常 NoSuchElement: {self.driver.current_url}。Exception: {e}")
             raise Exception(f"采集失败 [找不到元素]: {e}")
         finally:
-            driver.switch_to.window(current_handle)
+            if close:
+                sleep(2)
+                self.driver.close()
+                self.driver.switch_to.window(main_handle)
 
-    def _save(self, url, content):
-        # self.logger.info(f"保存数据: {url},关键字{self.keyword}")
-        if not self.store:
-            self.logger.info(f"DataStore 未指定: {url},关键字{self.keyword}")
-        else:
-            self.store.insert_collect_data(url, self.keyword, content, True)
 
-    def _process_list(self, driver, items: list) -> list:
-        if not items:
-            return []
-        for item in items:
-            self._process_item(driver, item)
-        sleep(2)
-        next_items = self._next_page(driver)
-        return self._process_list(driver, next_items)
 
-    def teardown(self, driver) -> None:
-        try:
-            if driver:
-                driver.quit()
-        except Exception as e:
-            raise Exception(f"关闭驱动器失败: {e}")

+ 83 - 43
SourceCode/TenderCrawler/app/adapters/data_collection_adapter_interface.py

@@ -1,57 +1,66 @@
 from abc import ABC, abstractmethod
 from selenium import webdriver
 
 from stores.data_store_interface import IDataStore
+from drivers.driver_creator import DriverCreator
+from utils.logger_helper import LoggerHelper
+from utils.config_helper import ConfigHelper
+from models.collect_data import CollectData
+from models.process_data import ProcessData
 
 
 class IDataCollectionAdapter(ABC):
     """
     数据收集适配器抽象类
     """
+    _url = ""
+    _store = None
+    _driver = None
+    _keyword = None
+    _adapter_type = ""
 
-    @property
-    @abstractmethod
-    def url(self):
-        """
-        驱动器初始打开的URL
+    logger = LoggerHelper.get_logger()
+    config = ConfigHelper()
 
-        :return:  驱动器初始打开的URL
-        :rtype: str
-        """
-        pass
 
     @property
-    @abstractmethod
-    def driver(self):
-        """
-        创建的驱动器
+    def search_day_key(self) -> str:
+        return f"adapter.{self._adapter_type}.search_day"
+    @property
+    def batch_save_key(self) -> str:
+        return f"adapter.{self._adapter_type}.batch_save"
+    @property
+    def store(self) -> IDataStore:
+        return self._store
 
-        :return:  驱动器
-        :rtype: webdriver
-        """
-        pass
+    @property
+    def url(self):
+        return self._url
 
-    @abstractmethod
-    def create_driver(self) -> webdriver:
-        """
-        根据URL创建一个浏览器驱动器
+    @property
+    def keyword(self):
+        return self._keyword
 
-        :return: 创建的驱动器
-        :rtype: webdriver
-        :raises Exception: 如果创建驱动器失败,应抛出异常
-        """
+    @property
+    def driver(self) -> webdriver:
+        if not self._driver:
+            self._driver = self._create_driver()
+        return self._driver
+
+    def _create_driver(self) -> webdriver:
         try:
-            # 实现创建驱动器的逻辑
-            pass
+            return DriverCreator().gen_remote_driver(self.url)
+            # return DriverCreator().gen_chrome_driver(self.url)
         except Exception as e:
             raise Exception(f"创建驱动器失败: {e}")
 
     @abstractmethod
-    def login(self, driver, username: str, password: str) -> None:
+    def login(self, username: str, password: str) -> None:
         """
         如果需要登录,则登录后跳转到搜索页面(不自动跳转的需要手动执行)
 
-        :param driver: 浏览器驱动器实例
         :param username: 用户名
         :type username: str
         :param password: 密码
@@ -65,11 +74,10 @@ class IDataCollectionAdapter(ABC):
             raise Exception(f"登录失败: {e}")
 
     @abstractmethod
-    def search(self, driver, keyword: str) -> list:
+    def _search(self, keyword: str) -> list:
         """
         根据关键字搜索,返回搜索结果列表
 
-        :param driver: 浏览器驱动器实例
         :param keyword: 搜索关键字
         :type keyword: str
         :return: 搜索结果列表
@@ -84,37 +92,69 @@ class IDataCollectionAdapter(ABC):
             raise Exception(f"搜索失败: {e}")
 
     @abstractmethod
-    def collect(self, driver, items: list, store: IDataStore) -> list:
+    def collect(self, keyword: str, store: IDataStore) -> None:
         """
         处理搜索结果列表,返回处理后的数据列表
 
-        :param driver: 浏览器驱动器实例
-        :param items: 搜索结果列表
+        :param keyword: 搜索关键字
         :param store: 数据储存库
-        :type items: list
+        :type keyword: str
         :return: 处理后的数据列表
         :rtype: list
         :raises Exception: 如果处理失败,应抛出异常
         """
         try:
-            processed_items = []
-            if items:
+            if keyword:
                 # 实现处理逻辑
                 pass
-            return processed_items
         except Exception as e:
             raise Exception(f"处理失败: {e}")
 
-    @abstractmethod
-    def teardown(self, driver) -> None:
+    def teardown(self) -> None:
         """
         关闭浏览器驱动器
 
-        :param driver: 浏览器驱动器实例
         :raises Exception: 如果关闭驱动器失败,应抛出异常
         """
         try:
-            if driver:
-                driver.quit()
+            if self._driver:
+                self._driver.quit()
         except Exception as e:
             raise Exception(f"关闭驱动器失败: {e}")
+
+    def _check_is_collect_by_url(self, url: str) -> bool:
+        old = self.store.query_one_collect_url(url)
+        if old:
+            self.logger.info(f"已采集过: {url}")
+            return True
+        return False
+
+    def _check_content(self, content) -> bool:
+        collect_data_key = self.config.get("save.collect_data_key")
+        if not collect_data_key:
+            self.logger.info("未配置 save.collect_data_key,跳过内容检查")
+            return True
+        # self.logger.info(f"检查数据有效性: {collect_data_key}")
+        collect_data_key = collect_data_key.replace(",", ",")
+        keys = [key.strip() for key in collect_data_key.split(",")]
+        for key in keys:
+            # self.logger.info(f"检查数据有效性: {key}")
+            if key in content:
+                self.logger.info(f"有效数据: {self.driver.current_url}")
+                return True
+
+        return False
+
+    def _save_db(self, url, content, attach_str=None, is_invalid=False):
+        if not self.store:
+            self.logger.info(f"DataStore 未指定: {url},关键字{self.keyword}")
+            return False
+        else:
+            status = 2 if is_invalid else 0
+            data = CollectData(url, self.keyword, content, attach_str, status)
+            self.store.insert_collect_data(data, self.config.get_bool(self.batch_save_key))
+            return True
+
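_check_content treats save.collect_data_key as a comma-separated keyword list (normalizing Chinese commas first) and keeps a page when any keyword appears in its text. The check reduces to a few lines; a standalone sketch:

    def is_relevant(content: str, collect_data_key: str | None) -> bool:
        if not collect_data_key:  # no filter configured: treat everything as valid
            return True
        keys = [k.strip() for k in collect_data_key.replace(",", ",").split(",")]
        return any(k and k in content for k in keys)

    # With the keywords from config.yml below:
    assert is_relevant("采购红外光谱仪一台", "红外光谱仪,拉曼光谱仪")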

+ 31 - 12
SourceCode/TenderCrawler/app/config.yml

@@ -1,13 +1,25 @@
 adapter:
   chinabidding:
     #search_day: '今天'
-    search_day: '近一月'
+    #search_day: '近一周'
+    search_day: '近三天'
     model_name: 'chinabidding_data_collection_adapter'
     class_name: 'ChinabiddingDataCollectionAdapter'
+    batch_save: True
+  ccgp:
+    #search_day: '今日'
+    search_day: '近3日'
+    model_name: 'ccgp_data_collection_adapter'
+    class_name: 'CcgpDataCollectionAdapter'
+    batch_save: False
 default_area: '全国'
+logger:
+  file-path: './logs/'
 save:
+  collect_data_key: '红外光谱仪,拉曼光谱仪'
   collect_batch_size: 100
   process_batch_size: 1 #AI处理一条插入一条
+  attach_file_path: './attaches/'
 mysql:
   host: 192.168.0.81
   port: 3307
@@ -16,20 +28,27 @@ mysql:
   password: Iwb-2024
   charset: utf8mb4
 ai:
-  key: 1
-  url: http://192.168.0.109:7580/api/chat
-  # url: https://api.qwen.aliyun.com/v1/models/qwen/completions
-  model: qwen2.5:7b
+#  url: http://192.168.0.109:7580/api/chat
+#  model: qwen2.5:7b
+  key: sk-febca8fea4a247f096cedeea9f185520
+  url: https://dashscope.aliyuncs.com/compatible-mode/v1
+  model: qwen-plus
   max_tokens: 1024
   system_prompt: 请帮我分析以下文字,提取出关键信息,并以json格式字符串返回,如果部分信息为空,则该字段返回为空。
-  prompt_template: 在以上内容中提取信息:编号(no) 、标题(title)、在哪个城市招标(area)、开标的时间(date)、开标的地点(address)、发布时间(release_date)、150字左右的招标条件要求及联系方式等内容摘要(summary)。编号一般在“招标编号:”的后面,例如 (招标编号:xxx...), “xxx...”就是编号(no)。返回包含no,title,area,date,address,release_date,summary字段的json格式字符串,没有找到或未提供的信息json字段为空。
+  prompt_template: 在以上内容中提取信息:编号(no) 、标题(title)、在哪个城市招标(area)、开标的时间(date)、开标的地点(address)、发布时间(release_date)、150字左右的招标条件要求及联系方式等内容摘要(summary),设备(device)。提取出相关设备的名称信息,多个设备以逗号分割。返回包含no,title,area,date,address,release_date,summary,device字段的json格式字符串,没有找到或未提供的信息json字段为空。
 email:
-  smtp_server: smtp.exmail.qq.com
-  smtp_port: 587
-  smtp_user: yueyy@iwbnet.com
-  smtp_password: EXN38AtT97FX635c
-  from_email: yueyy@iwbnet.com
-  error_email: yueyy@iwbnet.com
+#  smtp_server: smtp.exmail.qq.com
+#  smtp_port: 465
+#  smtp_user: yueyy@iwbnet.com
+#  smtp_password: EXN38AtT97FX635c
+#  from_email: yueyy@iwbnet.com
+  smtp_server: smtp.163.com
+  smtp_port: 465
+  smtp_user: yueyunyun88@163.com
+  smtp_password: FWRwBZKHTLHjHT5F
+  from_email: yueyunyun88@163.com
+
+  #error_email: yueyy@iwbnet.com
 schedule:
   sleep_interval: 10
   #sleep_interval: 600 #单位:秒 10分钟检查一次
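
The adapters compose dotted keys such as adapter.ccgp.search_day at runtime. ConfigHelper is extended by this commit (config_helper.py, not shown in full here), and a dotted lookup over the parsed YAML presumably reduces to a small loop; a sketch assuming PyYAML:

    import yaml

    def get(config: dict, dotted_key: str, default=None):
        node = config
        for part in dotted_key.split("."):
            if not isinstance(node, dict) or part not in node:
                return default
            node = node[part]
        return node

    with open("config.yml", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    print(get(cfg, "adapter.ccgp.search_day", "近一周"))  # -> 近3日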

+ 12 - 3
SourceCode/TenderCrawler/app/drivers/driver_creator.py

@@ -14,12 +14,19 @@ class DriverCreator:
         # 设置Chrome选项
         options = webdriver.ChromeOptions()
 
-        options.add_argument('--headless')  # 无头模式运行
+        # options.add_argument('--headless')  # 无头模式运行
         options.add_argument('--no-sandbox')
         options.add_argument('--disable-dev-shm-usage')
+        options.add_experimental_option('excludeSwitches',
+                                        ['enable-automation'])
+        options.add_argument('--disable-blink-features=AutomationControlled')
+        options.add_argument('--disable-extensions')
+        # 最大化窗口
+        options.add_argument('--start-maximized')
         # 无痕浏览模式
         options.add_argument('--incognito')
 
         remote_driver_url = ConfigHelper().get('selenium.remote_driver_url')
         if not remote_driver_url:
             remote_driver_url = self.default_remote_driver_url
@@ -58,9 +65,10 @@ class DriverCreator:
         driver = webdriver.Chrome(options=options)  # 创建Chrome浏览器驱动实例
         return self._gen_driver(driver, url)
 
     def _gen_driver(self, driver, url):
-        # 检查是否为 ChromeDriver 或 FirefoxDriver
-        if isinstance(driver, (webdriver.Chrome, webdriver.Firefox)):
+        # 设置user-agent,改变user-agent的值
+        if hasattr(driver, 'execute_cdp_cmd'):
             # 隐藏navigator.webdriver标志,将其值修改为false或undefined
             driver.execute_cdp_cmd(
                 'Page.addScriptToEvaluateOnNewDocument', {
@@ -92,5 +100,6 @@ class DriverCreator:
         self.logger.info(f"创建浏览器驱动,URL: {url}")
         return driver
 
     # def shutdown_driver(self,driver):
     #     driver.quit()
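
The switch from an isinstance check to hasattr(driver, 'execute_cdp_cmd') matters because the CDP command only exists on Chromium-based drivers; a remote Firefox session has no such method. The same anti-detection setup against a local Chrome, in isolation:

    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver = webdriver.Chrome(options=options)
    if hasattr(driver, 'execute_cdp_cmd'):
        # Overwrite navigator.webdriver before any page script can read it.
        driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            'source': "Object.defineProperty(navigator, 'webdriver', "
                      "{get: () => undefined})"
        })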

+ 1 - 8
SourceCode/TenderCrawler/app/main.py

@@ -11,14 +11,7 @@ DEFAULT_USER_SLEEP_INTERVAL = 60 * 30  # 配置默认时间间隔30分钟
 runner = Runner()
 runner.run()
 
-interval_str = ConfigHelper().get("schedule.sleep_interval")
-try:
-    interval = int(interval_str)
-except Exception as e:
-    interval = DEFAULT_USER_SLEEP_INTERVAL
-    logger.warning(
-        f"schedule.sleep_interval {interval_str} 配置不正确, 使用默认配置: {DEFAULT_USER_SLEEP_INTERVAL}秒。 错误:{e}"
-    )
+interval = ConfigHelper().get_int("schedule.sleep_interval", DEFAULT_USER_SLEEP_INTERVAL)
 
 if __name__ == '__main__':
     while True:
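
get_int replaces the inline try/except around int(). Its body lives in config_helper.py, which this commit also touches but does not show; presumably it is close to this sketch:

    def get_int(self, key: str, default: int = 0) -> int:
        value = self.get(key)
        try:
            return int(value)
        except (TypeError, ValueError):
            return default  # missing or malformed values fall back to the default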

+ 18 - 22
SourceCode/TenderCrawler/app/main/data_collector.py

@@ -22,16 +22,16 @@ class DataCollector:
     # _adapterClassMap = {"chinabidding": "ChinabiddingDataCollectionAdapter"}
 
     def __init__(self,
-                 type: str,
+                 adapter_type: str,
                  url: str,
                  un: str,
                  up: str,
                  store: IDataStore = None):
-        self._adapter = self._genAdapter(type, url)
-        self._driver = self.adapter.create_driver()
+        self._adapter = self._gen_adapter(adapter_type, url)
+        self._driver = self.adapter.driver
         # if type == "chinabidding":
         #     return
-        self.adapter.login(self.driver, un, up)
+        self.adapter.login(un, up)
         if store:
             self._store = store
         else:
@@ -49,38 +49,34 @@ class DataCollector:
     def adapter(self) -> IDataCollectionAdapter:
         return self._adapter
 
-    def setStore(self, store: IDataStore) -> None:
+    def set_store(self, store: IDataStore) -> None:
         self._store = store
 
     def collect(self, keyword: str):
-        items = self.adapter.search(self.driver, keyword)
-        self.adapter.collect(self.driver, items, self.store)
+        self.adapter.collect(keyword, self.store)
 
     def close(self):
         self.logger.info(f"关闭浏览器驱动,URL: {self.adapter.url}")
-        self.adapter.teardown(self.driver)
+        self.adapter.teardown()
 
-    def collectWithStore(self, keyword: str, store: IDataStore):
-        self.setStore(store)
-        self.collect(keyword)
 
-    def _genAdapter(self, type: str, url: str):
-        adapterModelName = self.config.get(f"adapter.{type}.model_name")
-        adapterClassName = self.config.get(f"adapter.{type}.class_name")
-        if adapterClassName:
+    def _gen_adapter(self, adapter_type: str, url: str):
+        adapter_model_name = self.config.get(f"adapter.{adapter_type}.model_name")
+        adapter_class_name = self.config.get(f"adapter.{adapter_type}.class_name")
+        if adapter_class_name:
             try:
                 self.logger.info(
-                    f"生成适配器 TYPE:{type},适配器: {adapterClassName},URL:{url}")
+                    f"生成适配器 TYPE:{adapter_type},适配器: {adapter_class_name},URL:{url}")
                 # 使用 importlib 动态导入模块
-                adapterModule = importlib.import_module(
-                    f"adapters.{adapterModelName}")
-                adapterClass = getattr(adapterModule, adapterClassName)
-                adapter = adapterClass(url)
+                adapter_module = importlib.import_module(
+                    f"adapters.{adapter_model_name}")
+                adapter_class = getattr(adapter_module, adapter_class_name)
+                adapter = adapter_class(url)
             except ImportError as e:
-                raise ImportError(f"无法导入适配器模块 {adapterModelName}") from e
+                raise ImportError(f"无法导入适配器模块 {adapter_model_name}") from e
             except AttributeError as e:
                 raise AttributeError(
-                    f"适配器模块 {adapterModelName} 中找不到类 {adapterClassName}"
+                    f"适配器模块 {adapter_model_name} 中找不到类 {adapter_class_name}"
                 ) from e
         else:
             raise Exception("不支持的适配器类型")

+ 1 - 1
SourceCode/TenderCrawler/app/main/data_send.py

@@ -33,7 +33,7 @@ class DataSend:
                 self._error_arr.append(item.area)
             return
         body = self._build_email_content(item)
-        flag = EmailHelper().send_email(email, item.title, body, True, None)
+        flag = EmailHelper().send_email(email, item.title, body, True, item.attach_path)
         if flag:
             self.store.set_send(item.no)
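
send_email now receives item.attach_path, the comma-joined list of files the CCGP adapter downloaded. EmailHelper itself is extended by this commit (53 additions) but not shown here; with only the standard library, attaching those files would look roughly like this sketch:

    import smtplib
    from email.message import EmailMessage
    from pathlib import Path

    def send(server, port, user, password, to, subject, html, attach_paths=""):
        msg = EmailMessage()
        msg["From"], msg["To"], msg["Subject"] = user, to, subject
        msg.add_alternative(html, subtype="html")
        for p in filter(None, attach_paths.split(",")):
            path = Path(p)
            msg.add_attachment(path.read_bytes(), maintype="application",
                               subtype="octet-stream", filename=path.name)
        with smtplib.SMTP_SSL(server, port) as smtp:  # port 465 per config.yml
            smtp.login(user, password)
            smtp.send_message(msg)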
 

+ 10 - 11
SourceCode/TenderCrawler/app/main/runner.py

@@ -9,7 +9,7 @@ from main.data_collector import DataCollector
 from main.data_process import DataProcess
 from main.data_send import DataSend
 from utils.email_helper import EmailHelper
-
+from utils.file_helper import FileHelper
 
 class Runner:
     logger = LoggerHelper.get_logger()
@@ -44,11 +44,10 @@ class Runner:
         for time in send_email_times:
             self.logger.info(f"{time} 执行   发送邮件   任务")
             schedule.every().day.at(time).do(self._send_job)
-        run_now = self.config.get("schedule.run_now")
-        if run_now and (str(run_now).lower() == 'true' or str(run_now) == '1'):
+        if self.config.get_bool("schedule.run_now"):
             self.logger.info("立即执行任务")
             self._collect_process_job()
-            self._send_job()
+            # self._send_job()
             # self._process_job()
 
     def _collect_process_job(self):
@@ -59,11 +58,11 @@ class Runner:
                 data_collector =None
                 try:
                     self.logger.info(f"开始采集: {url_setting.url}")
-                    data_collector = DataCollector(url_setting.type,
-                                                  url_setting.url,
-                                                  url_setting.username,
-                                                  url_setting.password,
-                                                  self.store)
+                    data_collector = DataCollector(url_setting.adapter_type,
+                                                   url_setting.url,
+                                                   url_setting.username,
+                                                   url_setting.password,
+                                                   self.store)
                     keywords = url_setting.keywords
                     keyword_array = keywords.split(',')
                     for keyword in keyword_array:
@@ -72,7 +71,7 @@ class Runner:
                 except Exception as e:
                     self._send_error_email(
                         "数据采集",
-                        f"\n    Type: {url_setting.type} \n    Url: {url_setting.url}\n    错误: {str(e)}"
+                        f"\n    Type: {url_setting.adapter_type} \n    Url: {url_setting.url}\n    错误: {str(e)}"
                     )
                     self.logger.error(f"采集发生异常: {e}")
                 finally:
@@ -86,7 +85,7 @@ class Runner:
                 except Exception as e:
                     self._send_error_email(
                         "AI数据处理",
-                        f"\n    Type: {url_setting.type} \n    Url: {url_setting.url}\n    错误: {str(e)}"
+                        f"\n    Type: {url_setting.adapter_type} \n    Url: {url_setting.url}\n    错误: {str(e)}"
                     )
                     self.logger.error(f"AI处理发生异常: {e}")
                     break  # 中断当前 URL 设置的处理
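
Jobs are registered with the schedule library, and nothing fires unless something keeps calling run_pending(); presumably the while True loop in main.py above does exactly that. The wiring as a self-contained sketch:

    import time
    import schedule

    def collect_process_job():
        print("collect + process")  # stand-in for Runner._collect_process_job

    schedule.every().day.at("08:00").do(collect_process_job)

    while True:
        schedule.run_pending()
        time.sleep(10)  # schedule.sleep_interval from config.yml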

+ 101 - 25
SourceCode/TenderCrawler/app/models/collect_data.py

@@ -6,17 +6,22 @@ from utils.logger_helper import LoggerHelper
 class CollectData:
 
     logger = LoggerHelper.get_logger()
+    UNPROCESSED = 0
+    PROCESSED = 1
+    INVALID = 2
 
     def __init__(self,
                  url=None,
                  keyword=None,
                  content=None,
-                 status=None,
+                 attach_path=None,
+                 status=UNPROCESSED,
                  create_time=None,
                  process_time=None):
         self.url = url
         self.keyword = keyword
         self.content = content
+        self.attach_path = attach_path
         self.status = status
         self.create_time = create_time or datetime.now()
         self.process_time = process_time
@@ -28,17 +33,30 @@ class CollectData:
             f"create_time={self.create_time}, process_time={self.process_time})"
         )
 
+    _insert_query = """
+        INSERT IGNORE INTO t_collect_data (url, keyword, content, attach_path, status, create_time)
+        VALUES (%s, %s, %s, %s, %s, %s);
+        """
+    _insert_query_history = """
+         INSERT IGNORE INTO t_collect_data_history (url, keyword, content, attach_path, status, create_time)
+         VALUES (%s, %s, %s, %s, %s, %s);
+         """
+    _delete_query = """
+         DELETE FROM t_collect_data
+         WHERE url = %s;
+         """
     def insert(self, collect_data):
         if not isinstance(collect_data, self.__class__):
             raise TypeError("collect_data 不是 CollectData 的实例")
         with MySQLHelper() as db_helper:
-            query = """
-                INSERT IGNORE INTO t_collect_data (url, keyword, content, status, create_time)
-                VALUES (%s, %s, %s, %s, %s)
-                """
             params = (collect_data.url, collect_data.keyword,
-                      collect_data.content, 0, datetime.now())
-            db_helper.execute_non_query(query, params)
+                      collect_data.content, collect_data.attach_path,
+                      collect_data.status, datetime.now())
+            if collect_data.status == self.INVALID:
+                db_helper.execute_non_query(self._insert_query_history, params)
+            else:
+                db_helper.execute_non_query(self._insert_query, params)
 
     def insert_batch(self, collect_data_list):
         if not all(
@@ -46,34 +64,46 @@ class CollectData:
                 for collect_data in collect_data_list):
             raise TypeError("collect_data_list 中的所有元素必须是 CollectData 的实例")
 
-        query = """
-            INSERT IGNORE INTO t_collect_data (url, keyword, content, status, create_time)
-            VALUES (%s, %s, %s, %s, %s)
-            """
         params = [
             (
                 collect_data.url,
                 collect_data.keyword,
                 collect_data.content,
+                collect_data.attach_path,
+                collect_data.status,
+                datetime.now()  # 每次调用 datetime.now() 获取当前时间
+            ) for collect_data in collect_data_list
+            if collect_data.status != self.INVALID
+        ]
+        params2 = [
+            (
+                collect_data.url,
+                collect_data.keyword,
+                collect_data.content,
+                collect_data.attach_path,
                 collect_data.status,
                 datetime.now()  # 每次调用 datetime.now() 获取当前时间
             ) for collect_data in collect_data_list
+            if collect_data.status == self.INVALID
         ]
+
         with MySQLHelper() as db_helper:
-            db_helper.execute_non_query(query, params)
+            db_helper.execute_non_query(self._insert_query, params)
             # 获取受影响的行数
             affected_rows = db_helper.connection.affected_rows()
+
+            db_helper.execute_non_query(self._insert_query_history, params2)
             self.logger.info(f"成功插入 {affected_rows} 条数据")
             return affected_rows
 
-    def insert_url(self, url: str, keyword: str, content: str):
-        with MySQLHelper() as db_helper:
-            query = """
-                INSERT IGNORE INTO t_collect_data (url, keyword, content, status, create_time, process_time)
-                VALUES (%s, %s, %s, %s, %s, %s)
-                """
-            params = (url, keyword, content, 0, datetime.now, None)
-            db_helper.execute_non_query(query, params)
+    # def insert_url(self, url: str, keyword: str, content: str):
+    #     with MySQLHelper() as db_helper:
+    #         query = """
+    #             INSERT IGNORE INTO t_collect_data (url, keyword, content, status, create_time, process_time)
+    #             VALUES (%s, %s, %s, %s, %s, %s)
+    #             """
+    #         params = (url, keyword, content, 0, datetime.now, None)
+    #         db_helper.execute_non_query(query, params)
 
     # def fetch_all():
     #     with MySQLHelper() as db_helper:
@@ -101,6 +131,17 @@ class CollectData:
             data = [result['url'] for result in results]
             return data
 
+    def fetch_one_url(self, url: str):
+        with MySQLHelper() as db_helper:
+            query = """
+             SELECT url FROM `t_collect_data_history` WHERE url= %s UNION SELECT url FROM `t_collect_data`  WHERE url= %s LIMIT 1
+            """
+            result = db_helper.fetch_one(query, (url, url))
+            if not result:
+                return None
+            data = result["url"]
+            return data
+
     def fetch_one_collect_by_url(self, url: str):
         with MySQLHelper() as db_helper:
             query = """
@@ -116,13 +157,48 @@ class CollectData:
             return data
 
     def set_process(self, url: str):
+        # with MySQLHelper() as db_helper:
+        #     query = """
+        #     UPDATE t_collect_data
+        #     SET status = 1
+        #     WHERE url = %s
+        #     """
+        #     db_helper.execute_non_query(query, (url))
+        self.move_to_history_and_delete(url)
+
+
+    def move_to_history_and_delete(self, url: str):
         with MySQLHelper() as db_helper:
+            # 查询 t_collect_data 中的数据
             query = """
-            UPDATE t_collect_data
-            SET status = 1
-            WHERE url = %s
-            """
-            db_helper.execute_non_query(query, (url))
+             SELECT url, keyword, content, attach_path, status, create_time, process_time
+             FROM t_collect_data
+             WHERE url = %s
+             """
+            result = db_helper.fetch_one(query, (url,))
+            if not result:
+                self.logger.warning(f"URL {url} 未在 t_collect_data 中找到,无法移动到历史表并删除。")
+                return False
+
+            # 将数据插入到 t_collect_data_history
+            insert_query = self._insert_query_history
+            insert_params = (
+                result["url"],
+                result["keyword"],
+                result["content"],
+                result["attach_path"],
+                result["status"],
+                result["create_time"]
+            )
+            db_helper.execute_non_query(insert_query, insert_params)
+
+            # 删除 t_collect_data 中的数据
+            delete_query = self._delete_query
+            delete_params = (url,)
+            db_helper.execute_non_query(delete_query, delete_params)
+
+            self.logger.info(f"URL {url} 已从 t_collect_data 移动到 t_collect_data_history 并删除。")
+            return True
 
     def fetch_by_status(self, status=0):
         with MySQLHelper() as db_helper:
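
move_to_history_and_delete issues the INSERT and the DELETE as separate statements; unless MySQLHelper wraps the with-block in a single transaction, a failure between the two could leave the row in both tables. A hedged sketch of an atomic variant on a raw PyMySQL-style connection, using INSERT ... SELECT to skip the Python round trip:

    def move_to_history(conn, url: str) -> None:
        try:
            with conn.cursor() as cur:
                cur.execute(
                    "INSERT IGNORE INTO t_collect_data_history "
                    "(url, keyword, content, attach_path, status, create_time) "
                    "SELECT url, keyword, content, attach_path, status, create_time "
                    "FROM t_collect_data WHERE url = %s", (url,))
                cur.execute("DELETE FROM t_collect_data WHERE url = %s", (url,))
            conn.commit()  # both statements take effect together
        except Exception:
            conn.rollback()  # or neither does
            raise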

+ 38 - 34
SourceCode/TenderCrawler/app/models/process_data.py

@@ -18,6 +18,8 @@ class ProcessData:
                  address=None,
                  summary=None,
                  release_date=None,
+                 devices=None,
+                 attach_path=None,
                  status=None,
                  create_time=None,
                  send_time=None,
@@ -28,14 +30,14 @@ class ProcessData:
         self.url = url
         self.date = date
         if not area:
-            area = ConfigHelper().get("default_area")
-        if not area:
-            area = "全国"
+            area = ConfigHelper().get("default_area", "全国")
         self.area = area.replace(" ", "")
         self.keyword = keyword
         self.address = address
         self.summary = summary
         self.release_date = release_date
+        self.devices = devices
+        self.attach_path = attach_path
         self.status = status
         self.create_time = create_time or datetime.now()
         self.send_time = send_time
@@ -49,30 +51,36 @@ class ProcessData:
             f"status={self.status}, create_time={self.create_time}, "
             f"send_time={self.send_time}, remark={self.remark})")
 
+    _insert_query = """
+              INSERT IGNORE INTO t_data (no, title, url, keyword, date, area, address, summary, release_date, devices, attach_path, status, create_time)
+              VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+          """
+    _update_query = """
+                UPDATE t_collect_data SET status = 1 WHERE url = %s;
+            """
     def insert(self, process_data):
         if not isinstance(process_data, self.__class__):
             raise TypeError("process_data 不是 ProcessData 的实例")
 
-        insert_query = """
-            INSERT IGNORE INTO t_data (no, title, url, keyword, date, area, address, summary, release_date, status, create_time)
-            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
-        """
-
-        update_query = """
-            UPDATE t_collect_data SET status = 1 WHERE url = %s;
-        """
-
-        insert_params = (process_data.no, process_data.title, process_data.url,
-                         process_data.keyword, process_data.date,
-                         process_data.area, process_data.address,
-                         process_data.summary, process_data.release_date, 0,
+        insert_params = (process_data.no,
+                         process_data.title,
+                         process_data.url,
+                         process_data.keyword,
+                         process_data.date,
+                         process_data.area,
+                         process_data.address,
+                         process_data.summary,
+                         process_data.release_date,
+                         process_data.devices,
+                         process_data.attach_path,
+                         0,
                          datetime.now())
 
         update_params = (process_data.url, )
 
         with MySQLHelper() as db_helper:
-            db_helper.execute_non_query(insert_query, insert_params)
-            db_helper.execute_non_query(update_query, update_params)
+            db_helper.execute_non_query(self._insert_query, insert_params)
+            db_helper.execute_non_query(self._update_query, update_params)
 
     def insert_batch(self, process_data_list):
         if not all(
@@ -80,14 +88,6 @@ class ProcessData:
                 for process_data in process_data_list):
             raise TypeError("process_data_list 中的所有元素必须是 ProcessData 的实例")
 
-        insert_query = """
-            INSERT IGNORE INTO t_data (no, title, url, keyword, date, area, address, summary, release_date, status, create_time)
-            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
-        """
-
-        update_query = """
-            UPDATE t_collect_data SET status = 1 WHERE url = %s;
-        """
 
         insert_params = [(
             process_data.no,
@@ -99,6 +99,8 @@ class ProcessData:
             process_data.address,
             process_data.summary,
             process_data.release_date,
+            process_data.devices,
+            process_data.attach_path,
             0,
             datetime.now(),
         ) for process_data in process_data_list]
@@ -107,29 +109,31 @@ class ProcessData:
                          for process_data in process_data_list]
 
         with MySQLHelper() as db_helper:
-            db_helper.execute_non_query(insert_query, insert_params)
+            db_helper.execute_non_query(self._insert_query, insert_params)
             affected_rows = db_helper.connection.affected_rows()
             self.logger.info(f"成功插入 {affected_rows} 条数据")
             for param in update_params:
-                db_helper.execute_non_query(update_query, param)
+                db_helper.execute_non_query(self._update_query, param)
             return affected_rows
 
+    _one_query = """
+                    SELECT url,no,other_urls,attach_path FROM t_data WHERE no = %s  LIMIT 1
+                """
     def fetch_one_process_by_no(self, no: str):
         with MySQLHelper() as db_helper:
-            query = """
-                SELECT url,no,other_urls FROM t_data WHERE no = %s  LIMIT 1
-            """
-            result = db_helper.fetch_one(query, (no, ))
+            result = db_helper.fetch_one(self._one_query, (no,))
             if not result:
                 return None
             data = ProcessData(url=result["url"],
                                no=result["no"],
-                               other_urls=result["other_urls"])
+                               other_urls=result["other_urls"],
+                               attach_path=result["attach_path"])
             return data
 
     def fetch_no_send(self):
         with MySQLHelper() as db_helper:
-            query = "SELECT no, title, url, keyword, date, area, address, summary, release_date FROM t_data WHERE status = 0"
+            query = "SELECT no, title, url, keyword, date, area, address, summary, attach_path, release_date FROM t_data WHERE status = 0"
             results = db_helper.execute_query(query)
             data = [ProcessData(**result) for result in results]
             return data
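
insert_batch hands execute_non_query a list of parameter tuples instead of a single tuple. MySQLHelper is not part of this diff; the usual PyMySQL-style dispatch, presumably what it does, is executemany for lists:

    def execute_non_query(cursor, query: str, params):
        # Assumed MySQLHelper behavior: a list of tuples -> executemany.
        if isinstance(params, list):
            return cursor.executemany(query, params)
        return cursor.execute(query, params)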

+ 11 - 6
SourceCode/TenderCrawler/app/models/url_setting.py

@@ -5,28 +5,33 @@ class UrlSetting:
 
     def __init__(self,
                  url=None,
-                 type=None,
+                 adapter_type=None,
                  username=None,
                  password=None,
-                 keywords=None):
+                 keywords=None,
+                 sort=None,
+                 is_active=None):
         self.url = url
-        self.type = type
+        self.adapter_type = adapter_type
         self.username = username
         self.password = password
         if not keywords:
             keywords = ""
         self.keywords = keywords.replace(",", ",")
+        self.sort = sort or 0
+        self.is_active = is_active
+
 
     def __repr__(self):
         return (
-            f"<UrlSetting(url={self.url}, type={self.type}, "
+            f"<UrlSetting(url={self.url}, type={self.adapter_type}, "
             f"username={self.username}, keywords={self.keywords}, is_active={self.is_active})>"
         )
 
     def to_dict(self):
         return {
             'url': self.url,
-            'type': self.type,
+            'type': self.adapter_type,
             'username': self.username,
             'password': self.password,
             'keywords': self.keywords,
@@ -46,7 +51,7 @@ class UrlSetting:
     #                   url_setting.is_active)
     #         db_helper.execute_non_query(query, params)
 
-    _query = "SELECT  url, type, username, password, keywords FROM t_urls WHERE is_active = 1"
+    _query = "SELECT  url, adapter_type, username, password, keywords FROM t_urls WHERE is_active = 1 ORDER BY  sort DESC "
 
     # 查询 URL 设置数据
     def fetch_all(self):

+ 6 - 5
SourceCode/TenderCrawler/app/stores/data_store_interface.py

@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from models.collect_data import CollectData
 from models.process_data import ProcessData
 
 
@@ -8,11 +9,11 @@ class IDataStore(ABC):
     """
 
     @abstractmethod
-    def insert_collect_data(self,
-                            url: str,
-                            keyword: str,
-                            content: str,
-                            is_batch=True) -> None:
+    def query_one_collect_url(self, url: str) -> str | None:
+        raise NotImplementedError("query_one_collect_url 应由子类重写。")
+
+    @abstractmethod
+    def insert_collect_data(self, data: CollectData, is_batch=True) -> None:
         raise NotImplementedError("insert 应由子类重写。")
 
     @abstractmethod

+ 8 - 5
SourceCode/TenderCrawler/app/stores/default_data_store.py

@@ -4,16 +4,18 @@ from stores.data_store_interface import IDataStore
 
 class DefaultDataStore(IDataStore):
 
-    def query_one_process_by_no(self, no):
-        pass
+
 
     logger = LoggerHelper.get_logger()
 
     def __init__(self):
         pass
 
-    def insert_collect_data(self, url, keyword, content, is_batch=True):
-        self.logger.info(f"Default: INSERT {url},关键字:{keyword}")
+    def query_one_collect_url(self, url: str):
+        self.logger.info("Default: query_one_collect_url")
+
+    def insert_collect_data(self, data, is_batch=True):
+        self.logger.info("Default: insert_collect_data")
     def save_collect_data(self, is_force=False):
         self.logger.info("Default: SAVE")
@@ -23,7 +25,8 @@ class DefaultDataStore(IDataStore):
 
     def query_one_collect_by_url(self, url):
         self.logger.info("Default: QUERY_ONE_PROCESS")
+
+    def query_one_process_by_no(self, no):
+        self.logger.info("Default: query_one_process_by_no")
+
     def insert_process_data(self, data):
         self.logger.info("Default: INSERT_PROCESS_DATA")
 

+ 9 - 17
SourceCode/TenderCrawler/app/stores/mysql_data_store.py

@@ -15,23 +15,14 @@ class MysqlDataStore(IDataStore):
     _areaEmail = AreaEmail()
 
     def __init__(self):
-        size = self.config.get('save.collect_batch_size')
-        if not size:
-            size = 1
-        self._collect_size = int(size)
+        self._collect_size = self.config.get_int('save.collect_batch_size', 1)
         self._collect_list = []
-        size = self.config.get('save.process_batch_size')
-        if not size:
-            size = 1
-        self._process_size = int(size)
+        self._process_size = self.config.get_int('save.process_batch_size', 1)
         self._process_list = []
 
-    def insert_collect_data(self,
-                            url: str,
-                            keyword: str,
-                            content: str,
-                            is_batch=True):
-        data = CollectData(url, keyword, content, 0)
+    def query_one_collect_url(self, url: str) -> str | None:
+        return self._collectData.fetch_one_url(url)
+
+    def insert_collect_data(self, data: CollectData, is_batch=True):
         if not is_batch:
             self._collectData.insert(data)
         else:
@@ -39,7 +30,7 @@ class MysqlDataStore(IDataStore):
             self.save_collect_data()
 
     def save_collect_data(self, is_force=False):
-        if is_force or len(self._collect_list) >= self._collect_size:
+        if (is_force and len(self._collect_list) > 0) or len(self._collect_list) >= self._collect_size:
             self.logger.info("批量保存到数据库,数量: " + str(len(self._collect_list)))
             self._collectData.insert_batch(self._collect_list)
             self._collect_list = []
@@ -56,14 +47,15 @@ class MysqlDataStore(IDataStore):
     def insert_process_data(self, data: ProcessData, is_batch=True):
         if not is_batch:
             self._processData.insert(data)
+            self.logger.info(f"保存到数据库: {data.url}" )
         else:
             self._process_list.append(data)
             self.save_process_data()
 
     # Inserting into the database marks the corresponding CollectData as processed
     def save_process_data(self, is_force=False):
-        if is_force or len(self._process_list) >= self._process_size:
-            self.logger.info("批量保存到数据库,数量: " + str(len(self._process_list)))
+        if (is_force and len(self._process_list) > 0) or len(self._process_list) >= self._process_size:
+            self.logger.info(f"批量保存到数据库,数量: {len(self._process_list)}")
             self._processData.insert_batch(self._process_list)
             self._process_list = []
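
Both save methods follow the same size-or-force flush rule: buffer items until the configured batch size is reached, and flush any leftovers when the caller forces it at the end of a run (the added len(...) > 0 guard skips pointless empty writes). The pattern in isolation, as a generic sketch rather than repo code:

    class BatchWriter:
        """Generic size-or-force buffered writer mirroring save_collect_data."""

        def __init__(self, write_batch, size: int = 1):
            self._write_batch = write_batch  # callable that persists a list of items
            self._size = size
            self._buffer = []

        def add(self, item):
            self._buffer.append(item)
            self.flush()

        def flush(self, is_force=False):
            if (is_force and len(self._buffer) > 0) or len(self._buffer) >= self._size:
                self._write_batch(self._buffer)
                self._buffer = []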
 

+ 75 - 40
SourceCode/TenderCrawler/app/utils/ai_helper.py

@@ -1,5 +1,7 @@
 import re
 import requests
+from openai import OpenAI
+import json
 
 from utils.logger_helper import LoggerHelper
 from utils.config_helper import ConfigHelper
@@ -14,11 +16,11 @@ class AiHelper:
     _ai_api_key = None
     _ai_api_url = None
     _ai_max_tokens = 150
-    _ai_system_prompt = "请帮我分析以下文字,提取出关键信息,并以json格式字符串返回,如果部分信息为空,则该字段返回为空。"
-    _ai_prompt_template = """在以上内容中提取信息:编号(no) 、标题(title)、在哪个城市招标(area)、开标的时间(date)、
-    开标的地点(address)、发布时间(release_date)、150字左右的招标条件要求及联系方式等内容摘要(summary)。
-    编号一般在“招标编号:”的后面,例如 (招标编号:xxx...), “xxx...”就是编号(no)。"
-    返回包含no,title,area,date,address,release_date,summary字段的json格式字符串,没有找到的信息json字段为空。"""
+    DEFAULT_AI_SYSTEM_PROMPT = "请帮我分析以下文字,提取出关键信息,并以json格式字符串返回,如果部分信息为空,则该字段返回为空。"
+    DEFAULT_AI_PROMPT_TEMPLATE = """在以上内容中提取信息:编号(no) 、标题(title)、在哪个城市招标(area)、开标的时间(date)、
+    开标的地点(address)、发布时间(release_date)、150字左右的招标条件要求及联系方式等内容摘要(summary),相关采购设备的名称信息,多个设备以逗号分割(device)
+    返回包含no,title,area,date,address,release_date,summary,device字段的json格式字符串,没有找到或未提供的信息json字段为空。
+"""
 
     def __init__(self):
         self._ai_api_key = self.config.get("ai.key")
@@ -27,12 +29,11 @@ class AiHelper:
         max_tokens = self.config.get("ai.max_tokens")
         if max_tokens:
             self._ai_max_tokens = int(max_tokens)
-        system_prompt = self.config.get("ai.system_prompt")
-        if system_prompt:
-            self._ai_system_prompt = system_prompt
-        prompt_template = self.config.get("ai.prompt_template")
-        if prompt_template:
-            self._ai_prompt_template = prompt_template
+        self._ai_system_prompt = self.config.get("ai.system_prompt",
+                                                 self.DEFAULT_AI_SYSTEM_PROMPT)
+        self._ai_prompt_template = self.config.get(
+            "ai.prompt_template", self.DEFAULT_AI_PROMPT_TEMPLATE)
+
 
     def call_ai(self, content: str) -> ProcessData:
         # Truncate to the first 100 characters for logging
@@ -44,35 +45,28 @@ class AiHelper:
             raise Exception("AI API url 没有配置")
         if self._api_model is None:
             raise Exception("AI API model 没有配置")
-        headers = {
-            "Content-Type": "application/json",
-            "Authorization": f"Bearer {self._ai_api_key}"
-        }
-        messages = [{
-            "role": "system",
-            "content": self._ai_system_prompt
-        }, {
-            "role": "user",
-            "content": f"{content} {self._ai_prompt_template}"
-        }]
+        client = OpenAI(api_key=self._ai_api_key, base_url=self._ai_api_url)
+        completion = client.chat.completions.create(
+            model=self._api_model,
+            messages=[{
+                "role": "system",
+                "content": self._ai_system_prompt,
+            }, {
+                "role": "user",
+                "content": f"{content}  {self._ai_prompt_template}",
+            }],
+            stream=False,
+            temperature=0.7,
+            max_tokens=self._ai_max_tokens,  # honor the configured ai.max_tokens
+        )
 
-        data = {
-            "model": self._api_model,
-            "messages": messages,
-            "stream": False,
-            "max_tokens": self._ai_max_tokens
-        }
-        response = requests.post(self._ai_api_url, headers=headers, json=data)
-        if response.status_code == 200:
-            try:
-                self.logger.info(f"AI Response: {response.text}")
-                res_str = self._extract_message_content(response.json())
-                return self._parse_response(res_str, True)
-            except Exception as e:
-                raise Exception(f"解析 AI 响应错误: {e}")
-        else:
-            raise Exception(
-                f"调用 AI 错误: {response.status_code} - {response.text}")
+        self.logger.info(f"AI Response: {completion.model_dump_json()}")
+        response = json.loads(completion.model_dump_json())
+        #self.logger.info(f"AI Response: {response}")
+        try:
+            res_str = self._extract_message_content(response)
+            return self._parse_response(res_str, True)
+        except Exception as e:
+            raise Exception(f"解析 AI 响应错误: {e}")
 
     @staticmethod
     def _extract_message_content(response_json: dict) -> str:
@@ -105,7 +99,6 @@ class AiHelper:
         return message_content
 
     def _parse_response(self, response: str, first=True) -> ProcessData:
-        import json
         self.logger.info(f"AI Response JSON STR: {response}")
         try:
             data = json.loads(response)
@@ -114,6 +107,7 @@ class AiHelper:
                                date=data.get("date"),
                                area=data.get("area"),
                                address=data.get("address"),
+                               devices=data.get("device"),
                                summary=data.get("summary"),
                                release_date=data.get("release_date"))
         except json.JSONDecodeError as e:
@@ -125,3 +119,44 @@ class AiHelper:
                 return self._parse_response(message_content, False)
             else:
                 raise Exception(f"解析 AI 响应错误: {e}")
+
+
+    def call_ai_1(self, content: str) -> ProcessData:
+        # Truncate to the first 100 characters for logging
+        # truncated_content = content[:100]
+        self.logger.info("调用AI API")
+        if self._ai_api_key is None:
+            raise Exception("AI API key 没有配置")
+        if self._ai_api_url is None:
+            raise Exception("AI API url 没有配置")
+        if self._api_model is None:
+            raise Exception("AI API model 没有配置")
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self._ai_api_key}"
+        }
+        messages = [{
+            "role": "system",
+            "content": self._ai_system_prompt
+        }, {
+            "role": "user",
+            "content": f"{content} {self._ai_prompt_template}"
+        }]
+
+        data = {
+            "model": self._api_model,
+            "messages": messages,
+            "stream": False,
+            "max_tokens": self._ai_max_tokens
+        }
+        response = requests.post(self._ai_api_url, headers=headers, json=data)
+        if response.status_code == 200:
+            try:
+                self.logger.info(f"AI Response: {response.text}")
+                res_str = self._extract_message_content(response.json())
+                return self._parse_response(res_str, True)
+            except Exception as e:
+                raise Exception(f"解析 AI 响应错误: {e}")
+        else:
+            raise Exception(
+                f"调用 AI 错误: {response.status_code} - {response.text}")

+ 17 - 6
SourceCode/TenderCrawler/app/utils/config_helper.py

@@ -1,12 +1,10 @@
 import os
 import yaml
 
-from utils.logger_helper import LoggerHelper
 
 
 class ConfigHelper:
     _instance = None
-    logger = LoggerHelper.get_logger()
 
     # Default config file path
     default_config_path = os.path.join(os.path.dirname(__file__), '..',
@@ -24,7 +22,7 @@ class ConfigHelper:
     def load_config(self, path=None):
         if self._config is None:
             if not path:
-                # self.logger.info(f"使用默认配置文件:{self.default_config_path}")
+                # print(f"使用默认配置文件:{self.default_config_path}")
                 self._path = self.default_config_path
             else:
                 self._path = path
@@ -34,7 +32,7 @@ class ConfigHelper:
             self._config = yaml.safe_load(file)
         # Merge environment variable overrides
         self._merge_env_vars()
-        # self.logger.info(f"加载的配置文件内容:{self._config}")
+        # print(f"加载的配置文件内容:{self._config}")
         return self._config
 
     def _merge_env_vars(self, env_prefix="APP_"):  # env var prefix is APP_
@@ -52,7 +50,7 @@ class ConfigHelper:
         else:
             config[keys[0]] = value
 
-    def get(self, key):
+    def get(self, key: str, default: str = None):
         if self._config is None:
             self.load_config(self._path)
         keys = key.split('.')
@@ -61,9 +59,22 @@ class ConfigHelper:
             if isinstance(config, dict) and k in config:
                 config = config[k]
             else:
-                return None
+                return default
         return config
 
+    def get_bool(self, key: str) -> bool:
+        val = str(self.get(key, "0"))
+        return val.lower() == "true" or val == "1"
+
+    def get_int(self, key: str, default: int = 0) -> int:
+        val = self.get(key)
+        if not val:
+            return default
+        try:
+            return int(val)
+        except ValueError:
+            return default
+
     def get_all(self):
         if self._config is None:
             self.load_config(self._path)
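
With the typed getters in place, call sites can drop their manual int() conversions and truthiness checks, as MysqlDataStore's __init__ now does. Hypothetical usage — the first and third keys appear elsewhere in this commit, while save.batch_save is a stand-in:

    config = ConfigHelper()
    batch_size = config.get_int('save.collect_batch_size', 1)  # int; 1 if missing or invalid
    force_batch = config.get_bool('save.batch_save')            # True only for "true"/"1"
    attach_dir = config.get('save.attach_file_path', './attaches/')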

+ 53 - 17
SourceCode/TenderCrawler/app/utils/email_helper.py

@@ -1,27 +1,18 @@
 import smtplib
+import os
+import mimetypes
 from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText
 from email.mime.base import MIMEBase
 from email import encoders
-import os
+
 
 from utils.config_helper import ConfigHelper
 from utils.logger_helper import LoggerHelper
+from utils.string_helper import StringHelper
 
 
-def _attach_file(msg: MIMEMultipart, attachment_path: str):
-    if not os.path.isfile(attachment_path):
-        raise FileNotFoundError(
-            f"The file {attachment_path} does not exist.")
 
-    with open(attachment_path, "rb") as attachment:
-        part = MIMEBase('application', 'octet-stream')
-        part.set_payload(attachment.read())
-        encoders.encode_base64(part)
-        part.add_header(
-            'Content-Disposition',
-            f"attachment; filename= {os.path.basename(attachment_path)}")
-        msg.attach(part)
 
 
 class EmailHelper:
@@ -44,7 +35,7 @@ class EmailHelper:
                    subject: str,
                    body: str,
                    body_is_html: bool = True,
-                   attachment_path: str = None):
+                   attachment_paths: str = None):
         msg = MIMEMultipart()
         msg['From'] = self.from_email
         msg['To'] = ', '.join(to_addr.split(','))
@@ -56,11 +47,13 @@ class EmailHelper:
         else:
             msg.attach(MIMEText(body, 'plain', 'utf-8'))
 
-        if attachment_path:
-            _attach_file(msg, attachment_path)
+        if attachment_paths:
+            attachment_arr = StringHelper.to_array(attachment_paths)
+            for attachment_path in attachment_arr:
+                self._attach_file(msg, attachment_path)
 
         try:
-            with smtplib.SMTP_SSL(self.smtp_server, timeout=10) as server:
+            with smtplib.SMTP_SSL(self.smtp_server, port=self.port, timeout=10) as server:
                 # server.starttls()
                 server.login(self.username, self.password)
                 # Split the to_addr string on ',' into a list for sendmail
@@ -68,6 +61,49 @@ class EmailHelper:
                                 msg.as_string())
             self.logger.info(f"邮件发送成功:{to_addr}")
             return True
+        except smtplib.SMTPAuthenticationError:
+            self.logger.error("SMTP 认证失败")
+            return False
+        except smtplib.SMTPServerDisconnected:
+            self.logger.error("SMTP 服务器断开连接")
+            return False
+        except smtplib.SMTPException as e:
+            self.logger.error(f"SMTP 异常: {e}")
+            return False
         except Exception as e:
             self.logger.error(f"邮件发送失败:{to_addr} {e}")
             return False
+
+
+    def _attach_file(self, msg: MIMEMultipart, attachment_path: str):
+        if not os.path.isfile(attachment_path):
+            self.logger.error(f"文件 {attachment_path} 不存在。")
+            return
+
+        file_size = os.path.getsize(attachment_path)
+        max_size = 1024 * 8192  # 8MB
+
+        if file_size > max_size:
+            self.logger.error(f"文件 {attachment_path} 大小超过限制 ({file_size} bytes > {max_size} bytes),不添加附件。")
+            return
+
+        # Guess the MIME type from the file name extension
+        content_type, _ = mimetypes.guess_type(attachment_path)
+        if content_type is None:
+            content_type = 'application/octet-stream'  # fallback type
+        main_type, sub_type = content_type.split('/', 1)
+
+        with open(attachment_path, "rb") as attachment:
+            part = MIMEBase(main_type, sub_type)
+            part.set_payload(attachment.read())  # size already checked against max_size above
+            # Strip the timestamp before the first '@' from the stored file name
+            name = os.path.basename(attachment_path)
+            at_index = name.find('@')
+            if at_index != -1:
+                name = name[at_index + 1:]
+            part.add_header(
+                'Content-Disposition',
+                f"attachment; filename={name}")
+            part.add_header('Content-ID', '<0>')
+            part.add_header('X-Attachment-Id', '0')
+            encoders.encode_base64(part)
+            msg.attach(part)
+            self.logger.info(f"添加附件 {name} {attachment_path} 到邮件中。")

+ 67 - 0
SourceCode/TenderCrawler/app/utils/file_helper.py

@@ -0,0 +1,67 @@
+import os
+import requests
+from datetime import datetime
+from urllib.parse import urlparse
+
+from utils.logger_helper import LoggerHelper
+from utils.config_helper import ConfigHelper
+
+class FileHelper:
+    logger = LoggerHelper.get_logger()
+    config = ConfigHelper()
+    DEFAULT_ATTACH_PATH = "./attaches/"
+
+    def __init__(self):
+        path = self.config.get("save.attach_file_path", self.DEFAULT_ATTACH_PATH)
+        path = path.replace("\\", "/")
+        path = path.replace("//", "/")
+        self._attach_file_path = path
+
+    def download_remote_file(self, file_url, file_name) -> str | None:
+        self.logger.info(f"下载远程文件: {file_url}  文件名:{file_name}")
+        current_timestamp = datetime.now().strftime("%H%M%S%f")[:-3]  # trim microseconds to milliseconds
+        file_name = f"{current_timestamp}@{file_name}"
+        file_path = os.path.join(self._attach_file_path, f'{datetime.now().strftime("%Y-%m-%d")}')
+        if not os.path.exists(file_path):
+            os.makedirs(file_path)
+        path = os.path.join(file_path, file_name)
+        path = path.replace("\\", "/")
+        path = path.replace("//", "/")
+        # 10 different User-Agent strings to rotate through
+        user_agents = [
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15",
+            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/91.0.4472.124 Safari/605.1.15",
+            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
+            "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1",
+            "Mozilla/5.0 (iPad; CPU OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1",
+            "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36"
+        ]
+
+        # Pick a User-Agent deterministically from the file name length
+        ua_index = len(file_name) % len(user_agents)
+        # Parse file_url to derive the Referer
+        parsed_url = urlparse(file_url)
+        referer = f"{parsed_url.scheme}://{parsed_url.netloc}/".replace("//download.", "//www.")
+        headers = {
+            'User-Agent': user_agents[ua_index],
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
+            'Referer': referer
+        }
+
+        try:
+            response = requests.get(file_url, headers=headers,
+                                    allow_redirects=True, timeout=60)  # timeout added to avoid hangs
+            response.raise_for_status()
+            with open(path, 'wb') as f:
+                f.write(response.content)
+            self.logger.info(f"文件下载成功: {file_name}")
+            return path
+        except requests.exceptions.HTTPError as http_err:
+            self.logger.error(f"HTTP 错误: {http_err}")
+            return None
+        except Exception as e:
+            self.logger.error(f"文件下载失败: {file_name}。Exception: {e}")
+            return None
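
On success the caller gets back the normalized local path, with the millisecond timestamp joined to the original name by an @ (which _attach_file later strips again for the outgoing filename); on failure it gets None. A sketch with an illustrative URL:

    fh = FileHelper()
    path = fh.download_remote_file('https://www.example.com/notice.pdf', 'notice.pdf')
    if path:
        print(path)  # e.g. ./attaches/2025-01-05/142530123@notice.pdf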

+ 6 - 5
SourceCode/TenderCrawler/app/utils/logger_helper.py

@@ -2,6 +2,7 @@ import os
 import logging
 from logging.handlers import TimedRotatingFileHandler
 
+from utils.config_helper import ConfigHelper
 
 class LoggerHelper:
     """
@@ -9,6 +10,7 @@ class LoggerHelper:
     该类实现了单例模式,确保在整个应用程序中只有一个日志记录器实例被创建和使用
     """
     _instance = None
+    config = ConfigHelper()
 
     def __new__(cls, *args, **kwargs):
         """
@@ -34,13 +36,12 @@ class LoggerHelper:
         """
         self._logger = logging.getLogger('app_logger')
         self._logger.setLevel(logging.INFO)
-        log_folder = './logs'
-        if not os.path.exists(log_folder):
-            os.makedirs(log_folder)
+        log_file_path = self.config.get("logger.file_path", "./logs")
+        if not os.path.exists(log_file_path):
+            os.makedirs(log_file_path)
 
         # Create a file handler that rotates at midnight
-        file_handler = TimedRotatingFileHandler(os.path.join(
-            log_folder, 'data_collector.log'),
+        file_handler = TimedRotatingFileHandler(os.path.join(log_file_path, 'crawler.log'),
                                                 when='midnight',
                                                 interval=1,
                                                 backupCount=7,
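
Usage stays the same across the commit: every module pulls the shared singleton once at import time.

    # Pattern used by the other modules in this commit.
    logger = LoggerHelper.get_logger()
    logger.info("crawler started")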

+ 77 - 0
SourceCode/TenderCrawler/app/utils/string_helper.py

@@ -0,0 +1,77 @@
+class StringHelper:
+
+    @staticmethod
+    def check_empty(s: str, default: str) -> str:
+        """
+        Return s if it is non-empty; otherwise return default.
+        """
+        if s:
+            return s
+        return default
+
+    @staticmethod
+    def to_array(s: str, sep: str = ",") -> list[str]:
+        """
+        Split a string into a list on the given separator.
+
+        :param s: The string to split.
+        :param sep: The separator, comma by default.
+        :return: The resulting list of substrings.
+        """
+        if not s:
+            return []
+        if sep == ",":
+            s = s.replace(",", ",")
+        return s.split(sep)
+
+    @staticmethod
+    def startswith(s: str, prefix: str) -> str:
+        """
+        检查字符串是否以特定前缀开头,如果没有则补全。
+
+        :param s: 要检查的字符串。
+        :param prefix: 前缀。
+        :return: 如果字符串以指定前缀开头,返回原字符串;否则返回补全后的字符串。
+        """
+        if not s.startswith(prefix):
+            return prefix + s
+        return s
+
+    @staticmethod
+    def endswith(s: str, suffix: str) -> str:
+        """
+        检查字符串是否以特定后缀结尾,如果没有则补全。
+
+        :param s: 要检查的字符串。
+        :param suffix: 后缀。
+        :return: 如果字符串以指定后缀结尾,返回原字符串;否则返回补全后的字符串。
+        """
+        if not s.endswith(suffix):
+            return s + suffix
+        return s
+
+    @staticmethod
+    def split_and_clean(s: str, sep: str = ",") -> list[str]:
+        """
+        Split a string on the given separator and drop empty entries.
+
+        :param s: The string to split.
+        :param sep: The separator, comma by default.
+        :return: The list of trimmed, non-empty substrings.
+        """
+        if not s:
+            return []
+        parts = StringHelper.to_array(s, sep)
+        return [part.strip() for part in parts if part.strip()]
+
+    @staticmethod
+    def remove_extra_spaces(s: str) -> str:
+        """
+        将字符串中的多个连续空格替换为单个空格。
+
+        :param s: 要处理的字符串。
+        :return: 替换后的字符串。
+        """
+        return ' '.join(s.split())
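
A few illustrative calls, with results as comments (note that to_array also splits on the fullwidth ,):

    StringHelper.to_array('红外,拉曼,气体分析')          # ['红外', '拉曼', '气体分析']
    StringHelper.split_and_clean(' a ,, b ')              # ['a', 'b']
    StringHelper.startswith('example.com', 'https://')    # 'https://example.com'
    StringHelper.endswith('./attaches', '/')              # './attaches/'
    StringHelper.remove_extra_spaces('a    b')            # 'a b'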

+ 30 - 28
SourceCode/TenderCrawler/docker-compose.yml

@@ -1,9 +1,9 @@
 version: '3.8'
 
 services:
-  dc-mysql:
+  crawler-mysql:
     image: mysql:8.0.39
-    container_name: y_data-collect-mysql
+    container_name: y_tender-crawler-mysql
     environment:
       - MYSQL_ROOT_PASSWORD=${MYSQL_ROOT_PASSWORD}
       - MYSQL_DATABASE=${MYSQL_DATABASE}
@@ -12,23 +12,23 @@ services:
       - TZ=Asia/Shanghai
       # - MYSQL_DEFAULT_AUTHENTICATION_PLUGIN=mysql_native_password
     volumes:
-      - /home/docker/data-collect/mysql/log:/var/log/mysql
-      - /home/docker/data-collect/mysql/data:/var/lib/mysql
-      - /home/docker/data-collect/mysql/conf.d:/etc/mysql/conf.d
-      - /etc/localtime:/etc/localtime:ro
-      - /home/docker/data-collect/mysql/init/init.sql:/docker-entrypoint-initdb.d/init.sql # 挂载 init.sql 文件
+      - /home/docker/tender-crawler/mysql/log:/var/log/mysql
+      - /home/docker/tender-crawler/mysql/data:/var/lib/mysql
+      - /home/docker/tender-crawler/mysql/conf.d:/etc/mysql/conf.d
+      - /etc/localtime:/etc/localtime:ro
+      - /home/docker/tender-crawler/app/init.sql:/docker-entrypoint-initdb.d/init.sql # mount init.sql into the MySQL entrypoint
       # - ./.dev/mysql5.7/log:/var/log/mysql
       # - ./.dev/mysql5.7/data:/var/lib/mysql
-      # - ./.dev/mysql8.0.39/log:/var/log/mysql
-      # - ./.dev/mysql8.0.39/data:/var/lib/mysql
-      # - ./init.sql:/docker-entrypoint-initdb.d/init.sql
+#      - ./.dev/mysql8.0.39/log:/var/log/mysql
+#      - ./.dev/mysql8.0.39/data:/var/lib/mysql
+#      - ./init.sql:/docker-entrypoint-initdb.d/init.sql
     ports:
       - '${MYSQL_PORT}:3306'
     networks:
-      - dc-net
+      - crawler-net
     restart: always
 
-  dc-selenium:
+  crawler-selenium:
     image: selenium/standalone-chrome:latest
     container_name: y_selenium
     environment:
@@ -37,26 +37,26 @@ services:
       - '${SELENIUM_CHROME_PORT}:4444'
       - '5900:5900'
     networks:
-      - dc-net
+      - crawler-net
     restart: always
 
-  dc-app:
+  crawler-app:
     build: .
-    image: y_data-collect-app:1.0.0
-    container_name: y_data-collect-app
+    image: y_tender-crawler-app:1.0.0
+    container_name: y_tender-crawler-app
     depends_on:
-      - dc-mysql
-      - dc-selenium
+      - crawler-mysql
+      - crawler-selenium
     environment:
       - TZ=Asia/Shanghai
-      - APP_MYSQL__HOST=y_data-collect-mysql
+      - APP_MYSQL__HOST=y_tender-crawler-mysql
       - APP_MYSQL__PORT=3306
       - APP_MYSQL__DB=${MYSQL_DATABASE}
       - APP_MYSQL__USER=${MYSQL_USER}
       - APP_MYSQL__PASSWORD=${MYSQL_PASSWORD}
-      - APP_AI__KEY=
-      - APP_AI__URL=http://192.168.0.109:7580/api/chat
-      - APP_AI__MODEL=qwen2.5:7b
+#      - APP_AI__KEY=
+#      - APP_AI__URL=http://192.168.0.109:7580/api/chat
+#      - APP_AI__MODEL=qwen2.5:7b
       - APP_AI__MAX_TOKENS=1024
       - APP_SCHEDULE__SLEEP_INTERVAL=600 # unit: seconds; check every 10 minutes
       - APP_SCHEDULE__COLLECT=20:00,12:00
@@ -65,17 +65,19 @@ services:
       - APP_SCHEDULE__RUN_NOW=1
       - APP_SELENIUM__REMOTE_DRIVER_URL=http://y_selenium:4444/wd/hub
     volumes:
-      - /home/docker/data-collect/app/config.yml:/app/config.yml
-      - /home/docker/data-collect/app/logs:/app/logs
-      # - ./.dev/app/config.yml:/app/config.yml
-      # - ./.dev/app/logs:/app/logs
+      - /home/docker/tender-crawler/app/config.yml:/app/config.yml
+      - /home/docker/tender-crawler/app/logs:/app/logs
+      - /home/docker/tender-crawler/app/attaches:/app/attaches
+#      - ./.dev/app/config.yml:/app/config.yml
+#      - ./.dev/app/logs:/app/logs
+#      - ./.dev/app/attaches:/app/attaches
     networks:
-      - dc-net
+      - crawler-net
     # Expose ports if needed
     # ports:
     #   - "8080:8080"
     restart: always
 
 networks:
-  dc-net:
+  crawler-net:
     driver: bridge
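
The APP_* variables above feed ConfigHelper._merge_env_vars. From the names used here and the keys read in code (APP_MYSQL__HOST vs. mysql.host), the mapping appears to drop the APP_ prefix, lowercase the rest, and turn double underscores into dots — inferred, since the method body is elided in this diff:

    # Inferred env-to-key mapping; not copied from the repo.
    def env_to_key(name: str, prefix: str = "APP_") -> str:
        return name[len(prefix):].lower().replace("__", ".")

    print(env_to_key("APP_MYSQL__HOST"))        # mysql.host
    print(env_to_key("APP_AI__MAX_TOKENS"))     # ai.max_tokens
    print(env_to_key("APP_SCHEDULE__RUN_NOW"))  # schedule.run_now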

+ 57 - 20
SourceCode/TenderCrawler/init.sql

@@ -3,6 +3,26 @@
 SET NAMES utf8mb4;
 SET FOREIGN_KEY_CHECKS = 0;
 
+-- ----------------------------
+-- Table structure for t_urls
+-- ----------------------------
+DROP TABLE IF EXISTS `t_urls`;
+CREATE TABLE `t_urls`  (
+  `url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '需访问的URL链接',
+  `adapter_type` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '适配器类型',
+  `username` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '用户名',
+  `password` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '密码',
+  `keywords` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '关键字,多个以”,“分隔',
+  `is_active` int(4) NULL DEFAULT NULL COMMENT '激活状态 1:激活 0:失活',
+  `sort` int(4) NULL DEFAULT NULL COMMENT '排序字段',
+  `remark` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '备注',
+  PRIMARY KEY (`url`) USING BTREE
+) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
+
+INSERT INTO `t_urls` (`url`, `adapter_type`, `username`, `password`, `keywords`, `is_active`, `sort`, `remark`) VALUES ('https://www.ccgp.gov.cn/index.shtml', 'ccgp', '', '', '红外,红外显微镜,傅里叶红外,红外光谱,显微红外,拉曼,激光共聚焦拉曼,拉曼显微镜,拉曼光谱,显微拉曼,气体分析', 1, 100, '中国政府采购网 https://www.ccgp.gov.cn/index.shtml');
+INSERT INTO `t_urls` (`url`, `adapter_type`, `username`, `password`, `keywords`, `is_active`, `sort`, `remark`) VALUES ('https://www.chinabidding.com/', 'chinabidding', 'brukernano2011', '695765FqX', '红外光谱仪', 1,0, '中国国际招标网 (www.chinabidding.com 必联网)');
+
+
 -- ----------------------------
 -- Table structure for t_area_email
 -- ----------------------------
@@ -16,10 +36,26 @@ CREATE TABLE `t_area_email`  (
   PRIMARY KEY (`name`) USING BTREE
 ) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
 
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('盐城', '江苏省盐城市,江苏盐城,盐城市,盐城', '349977741@qq.com', 1, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('南京', '江苏省南京市,江苏南京,南京市,南京', '349977741@qq.com', 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('全国', '全国', 'chancelot@foxmail.com', 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('宁波', '浙江省宁波市,浙江宁波,宁波市,宁波', '349977741@qq.com', 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('武汉', '武汉市,武汉,中国武汉,中国武汉市', 'chancelot@foxmail.com,349977741@qq.com', 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('济南', '山东省济南市,山东济南,济南市,济南', '349977741@qq.com', 1, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('全国', '全国', 'yueyy@iwbnet.com', 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('张志琼', '黑龙江,吉林,辽宁', 'zhiqiong.zhang@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('王双', '河北,山东济南,山东德州', 'shuang.wang@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('尚祖俭', '天津市,天津,中国天津,中国天津市', 'zujian.shang@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('赵跃', '北京', 'yue.zhao@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('张景灿', '陕西,新疆,宁夏,青海', 'jingcan.zhang@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('穆彦竹', '山西,河南,甘肃', 'yanzhu.mu@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('廖然', '内蒙古', 'ran.liao@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('吕小勇', '江苏', 'xiaoyong.lv@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('张潇', '浙江,福建', 'xiao.zhang@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('吴雪美', '上海', 'xuemei.wu@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('邬歆', '安徽,香港,澳门', 'xin.wu@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('冯新宝', '湖北,湖南', 'xinbao.feng@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('耿朝曦', '江西,贵州', 'zhaoxi.geng@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('李华斌', '广西,广东深圳', 'huabin.li@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('吕万明', '海南,广东广州,广东中山', 'wanming.lv@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('许建光', '西藏,云南,广东', 'jianguang.xu@bruker.com', 0, NULL);
 
 
 -- ----------------------------
@@ -30,6 +66,22 @@ CREATE TABLE `t_collect_data`  (
   `url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '页面详情URL',
   `keyword` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '检索到页面的关键字',
   `content` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL COMMENT '页面详情',
+  `attach_path` varchar(1000) NULL DEFAULT NULL COMMENT '附件路径',
+  `status` int(4) NOT NULL DEFAULT 0 COMMENT '状态 0:未处理 1:已处理',
+  `create_time` datetime NULL DEFAULT NULL COMMENT '创建时间',
+  `process_time` datetime NULL DEFAULT NULL COMMENT '处理时间',
+  PRIMARY KEY (`url`) USING BTREE
+) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
+
+-- ----------------------------
+-- Table structure for t_collect_data_history
+-- ----------------------------
+DROP TABLE IF EXISTS `t_collect_data_history`;
+CREATE TABLE `t_collect_data_history`  (
+  `url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '页面详情URL',
+  `keyword` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '检索到页面的关键字',
+  `content` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL COMMENT '页面详情',
+  `attach_path` varchar(1000) NULL DEFAULT NULL COMMENT '附件路径',
   `status` int(4) NOT NULL DEFAULT 0 COMMENT '状态 0:未处理 1:已处理',
   `create_time` datetime NULL DEFAULT NULL COMMENT '创建时间',
   `process_time` datetime NULL DEFAULT NULL COMMENT '处理时间',
@@ -50,6 +102,8 @@ CREATE TABLE `t_data`  (
   `address` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '详细地点',
   `summary` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL COMMENT '招标摘要',
   `release_date` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '发布时间',
+  `devices` varchar(1000) NULL DEFAULT NULL COMMENT '相关设备',
+  `attach_path` varchar(2000) NULL DEFAULT NULL COMMENT '附件路径',
   `status` int(4) NULL DEFAULT NULL COMMENT '状态 0:未推送 1:已推送',
   `create_time` datetime NULL DEFAULT NULL COMMENT '创建时间',
   `send_time` datetime NULL DEFAULT NULL COMMENT '推送时间',
@@ -58,23 +112,6 @@ CREATE TABLE `t_data`  (
   PRIMARY KEY (`url`) USING BTREE
 ) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
 
-INSERT INTO `t_data` (`url`, `no`, `title`, `date`, `area`, `address`, `summary`, `release_date`, `status`, `create_time`, `send_time`, `remark`) VALUES ('https://www.chinabidding.com/bidDetail/260794529.html', 'NWZ241216-2103-049601', '中石化华东油气分公司2024年度210306填料塔框架协议招标采购', '2024年12月27日9时0分', '全国', '中国石化物资电子招投标交易平台(https://bidding.epec.com)', '本招标项目为中国石油化工股份有限公司华东油气分公司2024年度210306填料塔框架协议招标采购,招标编号为NWZ241216-2103-049601。投标人须具备工业管道安装资质或压力管道元件制造资质,并在有效期内;具备A级压力容器制造证书且在有效期内。招标文件于2024年12月16日11时0分开始售卖,截止时间为2024年12月23日9时0分,电子投标文件需在2024年12月27日9时0分前递交。', '2024-12-17', 0, '2024-12-19 15:26:54', NULL, NULL);
-
--- ----------------------------
--- Table structure for t_urls
--- ----------------------------
-DROP TABLE IF EXISTS `t_urls`;
-CREATE TABLE `t_urls`  (
-  `url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '需访问的URL链接',
-  `type` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '适配器类型',
-  `username` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '用户名',
-  `password` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '密码',
-  `keywords` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '关键字,多个以”,“分隔',
-  `is_active` int(4) NULL DEFAULT NULL COMMENT '激活状态 1:激活 0:失活',
-  `remark` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '备注',
-  PRIMARY KEY (`url`) USING BTREE
-) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
 
-INSERT INTO `t_urls` (`url`, `type`, `username`, `password`, `keywords`, `is_active`, `remark`) VALUES ('https://www.chinabidding.com/', 'chinabidding', 'brukernano2011', '695765FqX', '红外光谱仪', 1, '中国国际招标网 (www.chinabidding.com 必联网)\r\nBruker Beijing	 用户名:brukernano2011               密码:695765FqX');
 
 SET FOREIGN_KEY_CHECKS = 1;
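
A quick way to sanity-check the reworked schema is to run UrlSetting's new query by hand. A hedged sketch — pymysql and the connection settings are assumptions, not pinned in requirements.txt:

    import pymysql

    conn = pymysql.connect(host='127.0.0.1', user='root',
                           password='secret', database='tender')
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT url, adapter_type, keywords FROM t_urls "
                        "WHERE is_active = 1 ORDER BY sort DESC")
            for url, adapter_type, keywords in cur.fetchall():
                print(adapter_type, url, keywords)
    finally:
        conn.close()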

+ 1 - 0
SourceCode/TenderCrawler/requirements.txt

@@ -6,3 +6,4 @@ Requests==2.32.3
 schedule==1.2.2
 selenium==4.27.1
 cryptography==41.0.4
+openai==1.58.1