Merge branch 'dev' of Crawler/TenderCrawler into main

YueYunyun, 8 months ago (commit 7070913a2f)

28 changed files with 1059 additions and 461 deletions
  1. + 3 - 1      .gitignore
  2. + 2 - 1      SourceCode/TenderCrawler/.vscode/launch.json
  3. + 191 - 0    SourceCode/TenderCrawler/app/adapters/ccgp_data_collection_adapter.py
  4. + 109 - 119  SourceCode/TenderCrawler/app/adapters/chinabidding_data_collection_adapter.py
  5. + 84 - 43    SourceCode/TenderCrawler/app/adapters/data_collection_adapter_interface.py
  6. + 31 - 12    SourceCode/TenderCrawler/app/config.yml
  7. + 19 - 11    SourceCode/TenderCrawler/app/drivers/driver_creator.py
  8. + 2 - 9      SourceCode/TenderCrawler/app/main.py
  9. + 18 - 22    SourceCode/TenderCrawler/app/main/data_collector.py
  10. + 1 - 1     SourceCode/TenderCrawler/app/main/data_process.py
  11. + 4 - 3     SourceCode/TenderCrawler/app/main/data_send.py
  12. + 22 - 21   SourceCode/TenderCrawler/app/main/runner.py
  13. + 7 - 5     SourceCode/TenderCrawler/app/models/area_email.py
  14. + 101 - 25  SourceCode/TenderCrawler/app/models/collect_data.py
  15. + 38 - 34   SourceCode/TenderCrawler/app/models/process_data.py
  16. + 13 - 7    SourceCode/TenderCrawler/app/models/url_setting.py
  17. + 6 - 5     SourceCode/TenderCrawler/app/stores/data_store_interface.py
  18. + 9 - 3     SourceCode/TenderCrawler/app/stores/default_data_store.py
  19. + 9 - 17    SourceCode/TenderCrawler/app/stores/mysql_data_store.py
  20. + 77 - 41   SourceCode/TenderCrawler/app/utils/ai_helper.py
  21. + 17 - 6    SourceCode/TenderCrawler/app/utils/config_helper.py
  22. + 48 - 12   SourceCode/TenderCrawler/app/utils/email_helper.py
  23. + 67 - 0    SourceCode/TenderCrawler/app/utils/file_helper.py
  24. + 16 - 15   SourceCode/TenderCrawler/app/utils/logger_helper.py
  25. + 77 - 0    SourceCode/TenderCrawler/app/utils/string_helper.py
  26. + 30 - 28   SourceCode/TenderCrawler/docker-compose.yml
  27. + 57 - 20   SourceCode/TenderCrawler/init.sql
  28. + 1 - 0     SourceCode/TenderCrawler/requirements.txt

+ 3 - 1
.gitignore

@@ -158,5 +158,7 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
 
+.vscode/
 .dev/
-logs/
+logs/
+attaches/

+ 2 - 1
SourceCode/TenderCrawler/.vscode/launch.json

@@ -21,7 +21,8 @@
 				"APP_SAVE__PROCESS_BATCH_SIZE": "1",
 				"APP_SCHEDULE__COLLECT": "12:53",
 				"APP_SCHEDULE__SEND_EMAIL": "22:48",
-				"APP_SCHEDULE__RUN_NOW": "1"
+				"APP_SCHEDULE__RUN_NOW": "1",
+
 			}
 		}
 	]
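These APP_* environment variables evidently override the matching config.yml keys, with the double underscore standing for nesting: APP_SCHEDULE__RUN_NOW maps to schedule.run_now, which runner.py below now reads through config.get_bool.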

+ 191 - 0
SourceCode/TenderCrawler/app/adapters/ccgp_data_collection_adapter.py

@@ -0,0 +1,191 @@
+from time import sleep
+
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as ec
+from selenium.common.exceptions import TimeoutException, NoSuchElementException
+
+from stores.data_store_interface import IDataStore
+from adapters.data_collection_adapter_interface import IDataCollectionAdapter
+from utils.file_helper import FileHelper
+
+
+
+class CcgpDataCollectionAdapter(IDataCollectionAdapter):
+    """
+    China Government Procurement network (中国政府采购网) data collection adapter
+    """
+    file_helper = FileHelper()
+
+    def __init__(self, url: str, store: IDataStore = None):
+        self._url = url
+        self._store = store
+        self._driver = None
+        self._keyword = None
+        self._adapter_type = "ccgp"
+
+    def login(self, username: str, password: str) -> None:
+        pass
+
+    def collect(self, keyword: str, store: IDataStore):
+        if store:
+            self._store = store
+        self._keyword = keyword
+        items = self._search(keyword)
+        self._process_list(items)
+        if self.config.get_bool(self.batch_save_key):
+            self.store.save_collect_data(True)
+
+    def _search(self, keyword: str) -> list:
+        try:
+            if not keyword:
+                raise Exception("搜索关键字不能为空")
+            wait = WebDriverWait(self.driver, 10, 1)
+            wait.until(
+                ec.presence_of_element_located((By.ID, "searchForm")))
+            search_el = self.driver.find_element(By.ID, "kw")
+            sleep(2)
+            search_el.clear()
+            search_el.send_keys(keyword)
+            search_btn = self.driver.find_element(
+                By.XPATH, "//form[@id='searchForm']/input[@id='doSearch2']")
+            sleep(1)
+            search_btn.click()
+            wait.until(ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result")))
+            default_search_txt = "近一周"
+            search_txt = self.config.get(self.search_day_key, default_search_txt)
+            self.logger.info(f"搜索关键字: {keyword},搜索条件: {search_txt}")
+            if search_txt != default_search_txt:
+                last_els = self.driver.find_elements(By.XPATH, "//ul[@id='datesel']/li")
+                for last_el in last_els:
+                    if search_txt == last_el.text:
+                        sleep(1)
+                        last_el.click()
+                        break
+                wait.until(ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result")))
+            else:
+                sleep(1)
+
+            # try:
+            #     a_links = self.driver.find_elements(
+            #         By.XPATH, "//form[@id='pagerSubmitForm']/a")
+            #     count = len(a_links)
+            #     if count > 1:
+            #         count = count - 1
+            #     self.logger.info(f"共查询到 {count} 页")
+            # except Exception as e:
+            #     self.logger.error(f"搜索失败[尝试查询页数]: {e}")
+            items = self.driver.find_elements(By.XPATH,
+                                         "//ul[@class='vT-srch-result-list-bid']/li/a")
+            return items
+        except TimeoutException as e:
+            raise Exception(f"搜索失败 [超时]: {e}")
+        except NoSuchElementException as e:
+            raise Exception(f"搜索失败 [找不到元素]: {e}")
+
+
+    def _process_list(self, items: list) -> list:
+        if not items:
+            return []
+        for item in items:
+            self._process_item(item)
+        sleep(2)
+        next_items = self._next_page()
+        return self._process_list(next_items)
+
+
+    def _next_page(self) -> list:
+        try:
+            wait = WebDriverWait(self.driver, 10, 1)
+            next_path = "//div[@class='vT-srch-result-list']/p/a[@class='next']"
+            wait.until(ec.presence_of_element_located((By.XPATH, next_path)))
+            btn = self.driver.find_element(By.XPATH, next_path)
+            btn.click()
+            self.logger.info(f"跳转到下页: {self.driver.current_url}")
+            sleep(5)
+            wait.until(ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result")))
+            items = self.driver.find_elements(By.XPATH,
+                                         "//ul[@class='vT-srch-result-list-bid']/li/a")
+            return items
+        except NoSuchElementException as e:
+            raise Exception(f"翻页失败 [找不到元素]: {e}")
+        except TimeoutException:
+            self.logger.info("翻页结束")
+            return []
+
+    def _process_item(self, item):
+        main_handle = self.driver.current_window_handle
+        wait = WebDriverWait(self.driver, 10, 1)
+        close = True
+        try:
+            url = item.get_attribute('href')
+            if self._check_is_collect_by_url(url):
+                close = False
+                return
+            self.logger.info(f"跳转详情")
+            sleep(1)
+            item.click()
+            wait.until(ec.number_of_windows_to_be(2))
+            handles = self.driver.window_handles
+            for handle in handles:
+                if handle != main_handle:
+                    self.driver.switch_to.window(handle)
+                    break
+            wait.until(ec.presence_of_element_located((By.TAG_NAME, "body")))
+            # skip result-type notices (award / completed / terminated)
+            if self._check_type("中标公告") or self._check_type("成交公告") or self._check_type("终止公告"):
+                self._save_db(url, "", is_invalid=True)
+                return
+            content = self.driver.find_element(By.XPATH, "//div[@class='vF_deail_maincontent']").text
+            if self._check_content(content):
+                paths = []
+
+                attach_els = self.driver.find_elements(By.XPATH, "//td[@class='bid_attachtab_content']/a")
+                attach_2_els = self.driver.find_elements(By.XPATH, "//a[@ignore='1']")
+
+                # merge the two attachment link lists
+                all_attachments = attach_els + attach_2_els
+                attach_urls = []
+                if len(all_attachments) > 0:
+                    for attach_el in all_attachments:
+                        attach_url = attach_el.get_attribute('href')
+                        if attach_url not in attach_urls:
+                            attach_urls.append(attach_url)
+                        else:
+                            self.logger.info(f"重复附件: {attach_url}")
+                            continue
+                        file_name = attach_el.text or attach_el.get_attribute('download') or attach_url.split('/')[-1]
+                        if not file_name:
+                            continue
+                        # check that file_name contains a file extension
+                        if '.' not in file_name:
+                            self.logger.warning(f"文件名 {file_name} 不包含扩展名,跳过下载。")
+                            continue
+                        path = self.file_helper.download_remote_file(attach_url, file_name)
+                        if path:
+                            paths.append(path)
+                attach_str = ",".join(paths)
+                self._save_db(url, content, attach_str)
+            else:
+                self._save_db(url, content, is_invalid=True)
+        except TimeoutException as e:
+            self.logger.error(
+                f"采集发生异常 Timeout: {self.driver.current_url}。Exception: {e}")
+        except NoSuchElementException as e:
+            self.logger.error(
+                f"采集发生异常 NoSuchElement: {self.driver.current_url}。Exception: {e}")
+            raise Exception(f"采集失败 [找不到元素]: {e}")
+        finally:
+            if close:
+                sleep(1)
+                self.driver.close()
+                self.driver.switch_to.window(main_handle)
+
+    def _check_type(self, type_str: str) -> bool:
+        links = self.driver.find_elements(By.LINK_TEXT, type_str)
+        if len(links) > 0:
+            self.logger.info(f"{type_str},跳过")
+            return True
+        return False
+
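A minimal driving sketch for the new adapter (the MysqlDataStore class name is assumed from stores/mysql_data_store.py, and the URL and keyword are illustrative; the keyword mirrors config.yml's collect_data_key):

from adapters.ccgp_data_collection_adapter import CcgpDataCollectionAdapter
from stores.mysql_data_store import MysqlDataStore  # class name assumed

adapter = CcgpDataCollectionAdapter("http://www.ccgp.gov.cn/", MysqlDataStore())
try:
    adapter.login("", "")        # no-op: ccgp requires no login
    adapter.collect("红外光谱仪", adapter.store)  # search, paginate, save
finally:
    adapter.teardown()           # quit the underlying Selenium driver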

+ 109 - 119
SourceCode/TenderCrawler/app/adapters/chinabidding_data_collection_adapter.py

@@ -1,16 +1,13 @@
 from time import sleep
 
-from selenium import webdriver
+
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.wait import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support import expected_conditions as ec
 from selenium.common.exceptions import TimeoutException, NoSuchElementException
 
-from drivers.driver_creator import DriverCreator
 from stores.data_store_interface import IDataStore
 from adapters.data_collection_adapter_interface import IDataCollectionAdapter
-from utils.logger_helper import LoggerHelper
-from utils.config_helper import ConfigHelper
 
 
 class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
@@ -18,110 +15,119 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
     China Bidding network (中国招标网) data collection adapter
     """
 
-    logger = LoggerHelper.get_logger()
 
-    def __init__(self, url: str):
+    def __init__(self, url: str, store: IDataStore = None):
         self._url = url
-        self._store = None
+        self._store = store
         self._driver = None
         self._keyword = None
-
-    @property
-    def store(self) -> IDataStore:
-        return self._store
-
-    @property
-    def url(self):
-        return self._url
-
-    @property
-    def keyword(self):
-        return self._keyword
-
-    @property
-    def driver(self):
-        if not self._driver:
-            self._driver = self.createDriver()
-        return self._driver
-
-    def createDriver(self) -> webdriver:
+        self._adapter_type = "chinabidding"
+
+    # @property
+    # def store(self) -> IDataStore:
+    #     return self._store
+    #
+    # @property
+    # def url(self):
+    #     return self._url
+    #
+    # @property
+    # def keyword(self):
+    #     return self._keyword
+    #
+    # @property
+    # def driver(self)->webdriver:
+    #     if not self._driver:
+    #         self._driver = self._create_driver()
+    #     return self._driver
+
+
+    def login(self, username: str, password: str) -> None:
         try:
-            return DriverCreator().GenRemoteDriver(self.url)
-        except Exception as e:
-            raise Exception(f"创建驱动器失败: {e}")
-
-    def login(self, driver, username: str, password: str) -> None:
-        try:
-            loginEl = driver.find_element(
+            login_el = self.driver.find_element(
                 By.XPATH, "//div[@id='loginRight']/a[@class='login']")
-            loginEl.click()
-            wait = WebDriverWait(driver, 10, 1)
-            wait.until(EC.presence_of_element_located((By.ID, "userpass")))
-            unEl = driver.find_element(By.ID, "username")
-            unEl.send_keys(username)
-            passEl = driver.find_element(By.ID, "userpass")
-            passEl.send_keys(password)
-            loginBtn = driver.find_element(By.ID, "login-button")
-            loginBtn.click()
-            wait.until(EC.presence_of_element_located((By.ID, "site-content")))
+            login_el.click()
+            wait = WebDriverWait(self.driver, 10, 1)
+            wait.until(ec.presence_of_element_located((By.ID, "userpass")))
+            un_el = self.driver.find_element(By.ID, "username")
+            un_el.send_keys(username)
+            pass_el = self.driver.find_element(By.ID, "userpass")
+            pass_el.send_keys(password)
+            login_btn = self.driver.find_element(By.ID, "login-button")
+            login_btn.click()
+            wait.until(ec.presence_of_element_located((By.ID, "site-content")))
         except TimeoutException as e:
             raise Exception(f"登录失败 [超时]: {e}")
         except NoSuchElementException as e:
             raise Exception(f"登录失败 [找不到元素]: {e}")
 
-    def search(self, driver, keyword: str) -> list:
+
+    def collect(self, keyword: str, store: IDataStore):
+        if store:
+            self._store = store
+        self._keyword = keyword
+        items = self._search(keyword)
+        self._process_list(items)
+        if self.config.get_bool(self.batch_save_key):
+            self.store.save_collect_data(True)
+
+    def _search(self, keyword: str) -> list:
         try:
-            self._keyword = keyword
-            wait = WebDriverWait(driver, 10, 1)
+            wait = WebDriverWait(self.driver, 10, 1)
             wait.until(
-                EC.presence_of_element_located((By.ID, "projSearchForm")))
-            searchEl = driver.find_element(By.ID, "fullText")
-            searchEl.send_keys(keyword)
-            searchBtn = driver.find_element(
+                ec.presence_of_element_located((By.ID, "projSearchForm")))
+            search_el = self.driver.find_element(By.ID, "fullText")
+            search_el.send_keys("")
+            search_el.send_keys(keyword)
+            search_btn = self.driver.find_element(
                 By.XPATH, "//form[@id='projSearchForm']/button")
-            searchBtn.click()
-            wait.until(EC.presence_of_element_located((By.ID, "site-content")))
-            # query data from the last 3 days
-            search_txt = ConfigHelper().get("adapter.chinabidding.search_day")
-            if not search_txt:
-                search_txt = "近三天"
+            search_btn.click()
+            wait.until(ec.presence_of_element_located((By.ID, "site-content")))
+            default_search_txt = "近3日"
+            search_txt = self.config.get(self.search_day_key, default_search_txt)
             self.logger.info(f"搜索关键字: {keyword},搜索条件: {search_txt}")
-            lastEl = driver.find_element(By.LINK_TEXT, search_txt)
-            lastEl.click()
-            wait.until(EC.presence_of_element_located((By.ID, "site-content")))
+            if search_txt != default_search_txt:
+                last_el = self.driver.find_element(By.LINK_TEXT, search_txt)
+                last_el.click()
+                wait.until(ec.presence_of_element_located((By.ID, "site-content")))
+            else:
+                sleep(1)
             try:
-                aLinks = driver.find_elements(
+                a_links = self.driver.find_elements(
                     By.XPATH, "//form[@id='pagerSubmitForm']/a")
-                count = len(aLinks)
+                count = len(a_links)
                 if count > 1:
                     count = count - 1
                 self.logger.info(f"共查询到 {count} 页")
             except Exception as e:
                 self.logger.error(f"搜索失败[尝试查询页数]: {e}")
-            items = driver.find_elements(By.XPATH,
-                                         "//ul[@class='as-pager-body']/li/a")
+            items = self.driver.find_elements(By.XPATH,
+                                              "//ul[@class='as-pager-body']/li/a")
             return items
         except TimeoutException as e:
             raise Exception(f"搜索失败 [超时]: {e}")
         except NoSuchElementException as e:
             raise Exception(f"搜索失败 [找不到元素]: {e}")
 
-    def collect(self, driver, items: list, store: IDataStore) -> list:
-        if store:
-            self._store = store
-        self._process_list(driver, items)
-        self.store.save_collect_data(True)
+    def _process_list(self, items: list) -> list:
+        if not items:
+            return []
+        for item in items:
+            self._process_item(item)
+        sleep(2)
+        next_items = self._next_page()
+        return self._process_list(next_items)
 
-    def _next_page(self, driver) -> list:
+    def _next_page(self) -> list:
         try:
-            wait = WebDriverWait(driver, 10, 1)
-            nextPath = "//form[@id='pagerSubmitForm']/a[@class='next']"
-            wait.until(EC.presence_of_element_located((By.XPATH, nextPath)))
-            btn = driver.find_element(By.XPATH, nextPath)
+            wait = WebDriverWait(self.driver, 10, 1)
+            next_path = "//form[@id='pagerSubmitForm']/a[@class='next']"
+            wait.until(ec.presence_of_element_located((By.XPATH, next_path)))
+            btn = self.driver.find_element(By.XPATH, next_path)
             btn.click()
-            self.logger.info(f"跳转到下页: {driver.current_url}")
-            wait.until(EC.presence_of_element_located((By.ID, "site-content")))
-            items = driver.find_elements(By.XPATH,
+            self.logger.info(f"跳转到下页: {self.driver.current_url}")
+            wait.until(ec.presence_of_element_located((By.ID, "site-content")))
+            items = self.driver.find_elements(By.XPATH,
                                          "//ul[@class='as-pager-body']/li/a")
             return items
         except NoSuchElementException as e:
@@ -130,60 +136,44 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
             self.logger.info("翻页结束")
             return []
 
-    def _process_item(self, driver, item):
+    def _process_item(self, item):
+        main_handle = self.driver.current_window_handle
+        close = True
         try:
-            currentHandle = driver.current_window_handle
             url = item.get_attribute('href')
-            old = self.store.query_one_collect_by_url(url)
-            if old:
-                self.logger.info(f"已采集过: {url}")
+            if self._check_is_collect_by_url(url):
+                close = False
                 return
             item.click()
-            wait = WebDriverWait(driver, 10, 1)
-            wait.until(EC.number_of_windows_to_be(2))
-            handles = driver.window_handles
+            wait = WebDriverWait(self.driver, 10, 1)
+            wait.until(ec.number_of_windows_to_be(2))
+            handles = self.driver.window_handles
             for handle in handles:
-                if handle != currentHandle:
-                    driver.switch_to.window(handle)
+                if handle != main_handle:
+                    self.driver.switch_to.window(handle)
                     break
-            url = driver.current_url
-            self.logger.info(f"跳转详情: {driver.current_url}")
-            wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
-            content = driver.find_element(By.TAG_NAME, "body").text
-            self._save(url, content)
-            sleep(1)
-            driver.close()
-            sleep(2)
+            url = self.driver.current_url
+            self.logger.info("跳转详情")
+            wait.until(ec.presence_of_element_located((By.TAG_NAME, "body")))
+            content = self.driver.find_element(By.TAG_NAME, "body").text
+            if self._check_content(content):
+                self._save_db(url, content)
+            else:
+                self._save_db(url, content, is_invalid=True)
+
         except TimeoutException as e:
             self.logger.error(
-                f"采集发生异常 Timeout: {driver.current_url}。Exception: {e}")
+                f"采集发生异常 Timeout: {self.driver.current_url}。Exception: {e}")
             # raise Exception(f"采集失败 [超时]: {e}")
         except NoSuchElementException as e:
             self.logger.error(
-                f"采集发生异常 NoSuchElement: {driver.current_url}。Exception: {e}")
+                f"采集发生异常 NoSuchElement: {self.driver.current_url}。Exception: {e}")
             raise Exception(f"采集失败 [找不到元素]: {e}")
         finally:
-            driver.switch_to.window(currentHandle)
+            if close:
+                sleep(2)
+                self.driver.close()
+                self.driver.switch_to.window(main_handle)
 
-    def _save(self, url, content):
-        # self.logger.info(f"保存数据: {url},关键字{self.keyword}")
-        if not self.store:
-            self.logger.info(f"DataStore 未指定: {url},关键字{self.keyword}")
-        else:
-            self.store.insert_collect_data(url, self.keyword, content, True)
 
-    def _process_list(self, driver, items: list) -> list:
-        if not items:
-            return []
-        for item in items:
-            self._process_item(driver, item)
-        sleep(2)
-        next_items = self._next_page(driver)
-        return self._process_list(driver, next_items)
 
-    def teardown(self, driver) -> None:
-        try:
-            if driver:
-                driver.quit()
-        except Exception as e:
-            raise Exception(f"关闭驱动器失败: {e}")

+ 84 - 43
SourceCode/TenderCrawler/app/adapters/data_collection_adapter_interface.py

@@ -1,57 +1,66 @@
+
+
 from abc import ABC, abstractmethod
 from selenium import webdriver
 
 from stores.data_store_interface import IDataStore
+from drivers.driver_creator import DriverCreator
+from utils.logger_helper import LoggerHelper
+from utils.config_helper import ConfigHelper
+from models.collect_data import CollectData
+from models.process_data import ProcessData
 
 
 class IDataCollectionAdapter(ABC):
     """
     Abstract base class for data collection adapters
     """
+    _url = ""
+    _store = None
+    _driver = None
+    _keyword = None
+    _adapter_type = ""
 
-    @property
-    @abstractmethod
-    def url(self):
-        """
-        The URL the driver opens first
+    logger = LoggerHelper.get_logger()
+    config = ConfigHelper()
 
-        :return: the initial URL
-        :rtype: str
-        """
-        pass
 
     @property
-    @abstractmethod
-    def driver(self):
-        """
-        The created driver
+    def search_day_key(self) -> str:
+        return f"adapter.{self._adapter_type}.search_day"
+
+    @property
+    def batch_save_key(self) -> str:
+        return f"adapter.{self._adapter_type}.batch_save"
+
+    @property
+    def store(self) -> IDataStore:
+        return self._store
 
-        :return: the driver
-        :rtype: webdriver
-        """
-        pass
+    @property
+    def url(self):
+        return self._url
 
-    @abstractmethod
-    def createDriver(self) -> webdriver:
-        """
-        Create a browser driver for the URL
+    @property
+    def keyword(self):
+        return self._keyword
 
-        :return: the created driver
-        :rtype: webdriver
-        :raises Exception: raised if driver creation fails
-        """
+    @property
+    def driver(self) -> webdriver:
+        if not self._driver:
+            self._driver = self._create_driver()
+        return self._driver
+
+    def _create_driver(self) -> webdriver:
         try:
-            # driver-creation logic goes here
-            pass
+            return DriverCreator().gen_remote_driver(self.url)
+            # return DriverCreator().gen_chrome_driver(self.url)
         except Exception as e:
             raise Exception(f"创建驱动器失败: {e}")
 
     @abstractmethod
-    def login(self, driver, username: str, password: str) -> None:
+    def login(self, username: str, password: str) -> None:
         """
         If the site requires a login, log in and land on the search page (sites that do not redirect automatically need a manual jump)
 
-        :param driver: browser driver instance
         :param username: username
         :type username: str
        :param password: password
@@ -65,11 +74,10 @@ class IDataCollectionAdapter(ABC):
             raise Exception(f"登录失败: {e}")
 
     @abstractmethod
-    def search(self, driver, keyword: str) -> list:
+    def _search(self, keyword: str) -> list:
         """
         Search by keyword and return the list of result elements
 
-        :param driver: browser driver instance
         :param keyword: search keyword
         :type keyword: str
         :return: list of search results
@@ -84,36 +92,69 @@ class IDataCollectionAdapter(ABC):
             raise Exception(f"搜索失败: {e}")
 
     @abstractmethod
-    def collect(self, driver, items: list, store: IDataStore) -> list:
+    def collect(self, keyword: str, store: IDataStore) -> None:
         """
         Collect the data for one keyword and persist it through the store
 
-        :param driver: browser driver instance
-        :param items: list of search results
-        :type items: list
+        :param keyword: search keyword
+        :param store: data store for the collected records
+        :type keyword: str
         :raises Exception: raised if processing fails
         """
         try:
-            processed_items = []
-            if items:
+            if keyword:
                 # processing logic goes here
                 pass
-            return processed_items
         except Exception as e:
             raise Exception(f"处理失败: {e}")
 
-    @abstractmethod
-    def teardown(self, driver) -> None:
+    def teardown(self) -> None:
         """
         Close the browser driver
 
-        :param driver: browser driver instance
         :raises Exception: raised if closing the driver fails
         """
         try:
-            if driver:
-                driver.quit()
+            if self.driver:
+                self.driver.quit()
         except Exception as e:
             raise Exception(f"关闭驱动器失败: {e}")
+
+    def _check_is_collect_by_url(self, url: str) -> bool:
+        old = self.store.query_one_collect_url(url)
+        if old:
+            self.logger.info(f"已采集过: {url}")
+            return True
+        return False
+
+    def _check_content(self, content) -> bool:
+        collect_data_key = self.config.get("save.collect_data_key")
+        if not collect_data_key:
+            self.logger.info("未配置 save.collect_data_key,跳过内容检查")
+            return True
+        # self.logger.info(f"检查数据有效性: {collect_data_key}")
+        collect_data_key = collect_data_key.replace(",", ",")
+        keys = collect_data_key.split(",")
+        keys = [key.strip() for key in keys]
+        for key in keys:
+            # self.logger.info(f"检查数据有效性: {key}")
+            if key in content:
+                self.logger.info(f"有效数据: {self.driver.current_url}")
+                return True
+
+        return False
+
+    def _save_db(self, url, content, attach_str=None, is_invalid=False):
+        if not self.store:
+            self.logger.info(f"DataStore 未指定: {url},关键字{self.keyword}")
+            return False
+        else:
+            status = CollectData.INVALID if is_invalid else CollectData.UNPROCESSED
+            data = CollectData(url, self.keyword, content, attach_str, status)
+            self.store.insert_collect_data(data, self.config.get_bool(self.batch_save_key))
+            return True
+
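Under the refactored template, each concrete class supplies login, collect and _search, while driver management and the _check_*/_save_db helpers are inherited. For orientation, a minimal hypothetical subclass (site, element IDs and XPath are invented):

from selenium.webdriver.common.by import By

from stores.data_store_interface import IDataStore
from adapters.data_collection_adapter_interface import IDataCollectionAdapter


class ExampleDataCollectionAdapter(IDataCollectionAdapter):
    """Hypothetical adapter; its keys would live under adapter.example in config.yml."""

    def __init__(self, url: str, store: IDataStore = None):
        self._url = url
        self._store = store
        self._adapter_type = "example"  # feeds search_day_key / batch_save_key

    def login(self, username: str, password: str) -> None:
        pass  # the imagined site needs no login

    def collect(self, keyword: str, store: IDataStore):
        if store:
            self._store = store
        self._keyword = keyword
        for item in self._search(keyword):
            url = item.get_attribute('href')
            if not self._check_is_collect_by_url(url):  # inherited dedup check
                self._save_db(url, item.text)           # inherited persistence

    def _search(self, keyword: str) -> list:
        box = self.driver.find_element(By.ID, "kw")     # invented element id
        box.send_keys(keyword)
        box.submit()
        return self.driver.find_elements(By.XPATH, "//ul[@class='results']/li/a")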

+ 31 - 12
SourceCode/TenderCrawler/app/config.yml

@@ -1,13 +1,25 @@
 adapter:
   chinabidding:
     #search_day: '今天'
-    search_day: '近一月'
+    #search_day: '近一周'
+    search_day: '近三天'
     model_name: 'chinabidding_data_collection_adapter'
     class_name: 'ChinabiddingDataCollectionAdapter'
+    batch_save: True
+  ccgp:
+    #search_day: '今日'
+    search_day: '近3日'
+    model_name: 'ccgp_data_collection_adapter'
+    class_name: 'CcgpDataCollectionAdapter'
+    batch_save: False
 default_area: '全国'
+logger:
+  file-path: './logs/'
 save:
+  collect_data_key: '红外光谱仪,拉曼光谱仪'
   collect_batch_size: 100
   process_batch_size: 1 # insert each record as soon as the AI has processed it
+  attach_file_path: './attaches/'
 mysql:
   host: 192.168.0.81
   port: 3307
@@ -16,20 +28,27 @@ mysql:
   password: Iwb-2024
   charset: utf8mb4
 ai:
-  key: 1
-  url: http://192.168.0.109:7580/api/chat
-  # url: https://api.qwen.aliyun.com/v1/models/qwen/completions
-  model: qwen2.5:7b
+#  url: http://192.168.0.109:7580/api/chat
+#  model: qwen2.5:7b
+  key: sk-febca8fea4a247f096cedeea9f185520
+  url: https://dashscope.aliyuncs.com/compatible-mode/v1
+  model: qwen-plus
   max_tokens: 1024
   system_prompt: 请帮我分析以下文字,提取出关键信息,并以json格式字符串返回,如果部分信息为空,则该字段返回为空。
-  prompt_template: 在以上内容中提取信息:编号(no) 、标题(title)、在哪个城市招标(area)、开标的时间(date)、开标的地点(address)、发布时间(release_date)、150字左右的招标条件要求及联系方式等内容摘要(summary)。编号一般在“招标编号:”的后面,例如 (招标编号:xxxxxxx...), “xxxxxxx...”就是编号(no)。返回包含no,title,area,date,address,release_date,summary字段的json格式字符串,没有找到或未提供的信息json字段为空。
+  prompt_template: 在以上内容中提取信息:编号(no) 、标题(title)、在哪个城市招标(area)、开标的时间(date)、开标的地点(address)、发布时间(release_date)、150字左右的招标条件要求及联系方式等内容摘要(summary),设备(device)。提取出相关设备的名称信息,多个设备以逗号分割。返回包含no,title,area,date,address,release_date,summary,device字段的json格式字符串,没有找到或未提供的信息json字段为空。
 email:
-  smtp_server: smtp.exmail.qq.com
-  smtp_port: 587
-  smtp_user: yueyy@iwbnet.com
-  smtp_password: EXN38AtT97FX635c
-  from_email: yueyy@iwbnet.com
-  error_email: yueyy@iwbnet.com
+#  smtp_server: smtp.exmail.qq.com
+#  smtp_port: 465
+#  smtp_user: yueyy@iwbnet.com
+#  smtp_password: EXN38AtT97FX635c
+#  from_email: yueyy@iwbnet.com
+  smtp_server: smtp.163.com
+  smtp_port: 465
+  smtp_user: yueyunyun88@163.com
+  smtp_password: FWRwBZKHTLHjHT5F
+  from_email: yueyunyun88@163.com
+
+  #error_email: yueyy@iwbnet.com
 schedule:
   sleep_interval: 10
   #sleep_interval: 600 # unit: seconds - check every 10 minutes
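The per-adapter keys added here are read generically through the base class's search_day_key/batch_save_key properties. A quick illustration of how they are consumed (ConfigHelper's get/get_bool/get_int are the accessors already used in the diffs above; values shown are this file's):

from utils.config_helper import ConfigHelper

config = ConfigHelper()
search_day = config.get("adapter.ccgp.search_day", "近3日")    # '近3日'
batch_save = config.get_bool("adapter.ccgp.batch_save")        # False for ccgp
interval = config.get_int("schedule.sleep_interval", 60 * 30)  # seconds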

+ 19 - 11
SourceCode/TenderCrawler/app/drivers/driver_creator.py

@@ -10,16 +10,23 @@ class DriverCreator:
 
     default_remote_driver_url = "http://127.0.0.1:4444/wd/hub"
 
-    def GenRemoteDriver(self, url):
+    def gen_remote_driver(self, url):
         # configure Chrome options
         options = webdriver.ChromeOptions()
 
-        options.add_argument('--headless')  # run headless
+        # options.add_argument('--headless')  # run headless
         options.add_argument('--no-sandbox')
         options.add_argument('--disable-dev-shm-usage')
+        options.add_experimental_option('excludeSwitches',
+                                        ['enable-automation'])
+        options.add_argument('--disable-blink-features=AutomationControlled')
+        options.add_argument('--disable-extensions')
+        # start with the window maximized
+        options.add_argument('--start-maximized')
         # incognito mode
         options.add_argument('--incognito')
 
+
         remote_driver_url = ConfigHelper().get('selenium.remote_driver_url')
         if not remote_driver_url:
             remote_driver_url = self.default_remote_driver_url
@@ -30,9 +37,9 @@ class DriverCreator:
         # create the remote browser driver instance
         driver = webdriver.Remote(command_executor=remote_driver_url,
                                   options=options)
-        return self._genDriver(driver, url)
+        return self._gen_driver(driver, url)
 
-    def GenChromeDriver(self, url):
+    def gen_chrome_driver(self, url):
         # Chrome options: hide Selenium fingerprints, set a proxy IP, and disable some Selenium-related switches
         options = webdriver.ChromeOptions()
         options.add_experimental_option('excludeSwitches',
@@ -40,7 +47,6 @@ class DriverCreator:
         options.add_argument('--disable-blink-features=AutomationControlled')
         options.add_argument('--disable-extensions')
         # options.add_argument('--disable-gpu')
-        # options.add_argument('--disable-infobars')
         options.add_argument('--disable-notifications')
         # options.add_argument('--disable-popup-blocking')
         # options.add_argument('--disable-web-security')
@@ -57,11 +63,12 @@ class DriverCreator:
         # keep the browser window from closing automatically
         # options.add_experimental_option('detach', True)
         driver = webdriver.Chrome(options=options)  # create the Chrome driver instance
-        return self._genDriver(driver, url)
+        return self._gen_driver(driver, url)
 
-    def _genDriver(self, driver, url):
-        # check for ChromeDriver or FirefoxDriver
-        if isinstance(driver, (webdriver.Chrome, webdriver.Firefox)):
+
+    def _gen_driver(self, driver, url):
+        # patch automation fingerprints where the driver supports CDP
+        if hasattr(driver, 'execute_cdp_cmd'):
             # hide the navigator.webdriver flag by setting it to false or undefined
             driver.execute_cdp_cmd(
                 'Page.addScriptToEvaluateOnNewDocument', {
@@ -93,5 +100,6 @@ class DriverCreator:
         self.logger.info(f"创建浏览器驱动,URL: {url}")
         return driver
 
-    def ShutdownDriver(driver):
-        driver.quit()
+
+    # def shutdown_driver(self,driver):
+    #     driver.quit()
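The hunk above cuts off the CDP payload mid-call. A typical script passed to Page.addScriptToEvaluateOnNewDocument for this purpose looks like the following (a sketch, not necessarily the commit's exact source string):

# Sketch: hide navigator.webdriver before any page script runs.
driver.execute_cdp_cmd(
    'Page.addScriptToEvaluateOnNewDocument', {
        'source': "Object.defineProperty(navigator, 'webdriver', "
                  "{get: () => undefined})"
    })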

+ 2 - 9
SourceCode/TenderCrawler/app/main.py

@@ -6,19 +6,12 @@ from utils.logger_helper import LoggerHelper
 from main.runner import Runner
 
 logger = LoggerHelper.get_logger()
-DEFAUlT_SLEEP_INTERVAL = 60 * 30  # default interval: 30 minutes
+DEFAULT_USER_SLEEP_INTERVAL = 60 * 30  # default interval: 30 minutes
 
 runner = Runner()
 runner.run()
 
-try:
-    intervalStr = ConfigHelper().get("schedule.sleep_interval")
-    interval = int(intervalStr)
-except Exception:
-    interval = DEFAUlT_SLEEP_INTERVAL
-    logger.warning(
-        f"schedule.sleep_interval {intervalStr} 配置不正确, 使用默认配置: {DEFAUlT_SLEEP_INTERVAL}秒"
-    )
+interval = ConfigHelper().get_int("schedule.sleep_interval", DEFAULT_USER_SLEEP_INTERVAL)
 
 if __name__ == '__main__':
     while True:
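The loop body falls outside this hunk; given the schedule library that runner.py registers its jobs with, the remainder presumably follows the standard polling pattern (a sketch under that assumption):

import time
import schedule  # the same scheduler runner.py registers jobs with

if __name__ == '__main__':
    while True:
        schedule.run_pending()  # fire any due collect / process / send jobs
        time.sleep(interval)    # 'interval' comes from schedule.sleep_interval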

+ 18 - 22
SourceCode/TenderCrawler/app/main/data_collector.py

@@ -22,16 +22,16 @@ class DataCollector:
     # _adapterClassMap = {"chinabidding": "ChinabiddingDataCollectionAdapter"}
 
     def __init__(self,
-                 type: str,
+                 adapter_type: str,
                  url: str,
                  un: str,
                  up: str,
                  store: IDataStore = None):
-        self._adapter = self._genAdapter(type, url)
-        self._driver = self.adapter.createDriver()
+        self._adapter = self._gen_adapter(adapter_type, url)
+        self._driver = self.adapter.driver
         # if type == "chinabidding":
         #     return
-        self.adapter.login(self.driver, un, up)
+        self.adapter.login(un, up)
         if store:
             self._store = store
         else:
@@ -49,38 +49,34 @@ class DataCollector:
     def adapter(self) -> IDataCollectionAdapter:
         return self._adapter
 
-    def setStore(self, store: IDataStore) -> None:
+    def set_store(self, store: IDataStore) -> None:
         self._store = store
 
     def collect(self, keyword: str):
-        items = self.adapter.search(self.driver, keyword)
-        self.adapter.collect(self.driver, items, self.store)
+        self.adapter.collect(keyword, self.store)
 
     def close(self):
         self.logger.info(f"关闭浏览器驱动,URL: {self.adapter.url}")
-        self.adapter.teardown(self.driver)
+        self.adapter.teardown()
 
-    def collectWithStore(self, keyword: str, store: IDataStore):
-        self.setStore(store)
-        self.collect(keyword)
 
-    def _genAdapter(self, type: str, url: str):
-        adapterModelName = self.config.get(f"adapter.{type}.model_name")
-        adapterClassName = self.config.get(f"adapter.{type}.class_name")
-        if adapterClassName:
+    def _gen_adapter(self, adapter_type: str, url: str):
+        adapter_model_name = self.config.get(f"adapter.{adapter_type}.model_name")
+        adapter_class_name = self.config.get(f"adapter.{adapter_type}.class_name")
+        if adapter_class_name:
             try:
                 self.logger.info(
-                    f"生成适配器 TYPE:{type},适配器: {adapterClassName},URL:{url}")
+                    f"生成适配器 TYPE:{adapter_type},适配器: {adapter_class_name},URL:{url}")
                 # dynamically import the adapter module via importlib
-                adapterModule = importlib.import_module(
-                    f"adapters.{adapterModelName}")
-                adapterClass = getattr(adapterModule, adapterClassName)
-                adapter = adapterClass(url)
+                adapter_module = importlib.import_module(
+                    f"adapters.{adapter_model_name}")
+                adapter_class = getattr(adapter_module, adapter_class_name)
+                adapter = adapter_class(url)
             except ImportError as e:
-                raise ImportError(f"无法导入适配器模块 {adapterModelName}") from e
+                raise ImportError(f"无法导入适配器模块 {adapter_model_name}") from e
             except AttributeError as e:
                 raise AttributeError(
-                    f"适配器模块 {adapterModelName} 中找不到类 {adapterClassName}"
+                    f"适配器模块 {adapter_model_name} 中找不到类 {adapter_class_name}"
                 ) from e
         else:
             raise Exception("不支持的适配器类型")

+ 1 - 1
SourceCode/TenderCrawler/app/main/data_process.py

@@ -55,7 +55,7 @@ class DataProcess:
 
         self.logger.info("END   ==>" + url)
 
-    def _ai_process(self, item: CollectData) -> ProcessData:
+    def _ai_process(self, item: CollectData) -> ProcessData | None:
         try:
             data = AiHelper().call_ai(item.content)
             return data
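Note: the ProcessData | None union syntax requires Python 3.10 or newer; older interpreters would need typing.Optional[ProcessData].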

+ 4 - 3
SourceCode/TenderCrawler/app/main/data_send.py

@@ -29,15 +29,16 @@ class DataSend:
         email = self.store.get_email_by_area(item.area)
         if not email:
             self.logger.error(f"{item.area} 下没有找到email")
-            if (item.area not in self._error_arr):
+            if item.area not in self._error_arr:
                 self._error_arr.append(item.area)
             return
         body = self._build_email_content(item)
-        flag = EmailHelper().send_email(email, item.title, body, True, None)
+        flag = EmailHelper().send_email(email, item.title, body, True, item.attach_path)
         if flag:
             self.store.set_send(item.no)
 
-    def _build_email_content(self, item: ProcessData, other: str = "") -> str:
+    @staticmethod
+    def _build_email_content(item: ProcessData, other: str = "") -> str:
         html_body = f"""
         <html>
         <head>
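The last argument of send_email, previously hard-coded to None, now forwards item.attach_path, so attachments downloaded by the ccgp adapter (stored under save.attach_file_path) ride along on the outgoing mail.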

+ 22 - 21
SourceCode/TenderCrawler/app/main/runner.py

@@ -9,7 +9,7 @@ from main.data_collector import DataCollector
 from main.data_process import DataProcess
 from main.data_send import DataSend
 from utils.email_helper import EmailHelper
-
+from utils.file_helper import FileHelper
 
 class Runner:
     logger = LoggerHelper.get_logger()
@@ -44,47 +44,48 @@ class Runner:
         for time in send_email_times:
             self.logger.info(f"{time} 执行   发送邮件   任务")
             schedule.every().day.at(time).do(self._send_job)
-        run_now = self.config.get("schedule.run_now")
-        if run_now and (str(run_now).lower() == 'true' or str(run_now) == '1'):
+        if self.config.get_bool("schedule.run_now"):
             self.logger.info("立即执行任务")
             self._collect_process_job()
-            self._send_job()
+            # self._send_job()
             # self._process_job()
 
     def _collect_process_job(self):
         try:
             self.logger.info("开始执行数据采集处理任务")
-            urlSetting = UrlSetting()
-            for url_setting in urlSetting.fetch_all():
+            url_settings = UrlSetting()
+            for url_setting in url_settings.fetch_all():
+                data_collector = None
                 try:
                     self.logger.info(f"开始采集: {url_setting.url}")
-                    dataCollector = DataCollector(url_setting.type,
-                                                  url_setting.url,
-                                                  url_setting.username,
-                                                  url_setting.password,
-                                                  self.store)
+                    data_collector = DataCollector(url_setting.adapter_type,
+                                                   url_setting.url,
+                                                   url_setting.username,
+                                                   url_setting.password,
+                                                   self.store)
                     keywords = url_setting.keywords
-                    keywordArray = keywords.split(',')
-                    for keyword in keywordArray:
-                        dataCollector.collect(keyword)
+                    keyword_array = keywords.split(',')
+                    for keyword in keyword_array:
+                        data_collector.collect(keyword)
                     self.logger.info(f"采集完成: {url_setting.url}")
                 except Exception as e:
                     self._send_error_email(
                         "数据采集",
-                        f"\n    Type: {url_setting.type} \n    Url: {url_setting.url}\n    错误: {str(e)}"
+                        f"\n    Type: {url_setting.adapter_type} \n    Url: {url_setting.url}\n    错误: {str(e)}"
                     )
                     self.logger.error(f"采集发生异常: {e}")
                 finally:
-                    dataCollector.close()
+                    if data_collector:
+                        data_collector.close()
 
                 try:
                     self.logger.info(f"开始AI处理: {url_setting.url}")
-                    dataProcess = DataProcess(self.store)
-                    dataProcess.process()
+                    data_process = DataProcess(self.store)
+                    data_process.process()
                 except Exception as e:
                     self._send_error_email(
                         "AI数据处理",
-                        f"\n    Type: {url_setting.type} \n    Url: {url_setting.url}\n    错误: {str(e)}"
+                        f"\n    Type: {url_setting.adapter_type} \n    Url: {url_setting.url}\n    错误: {str(e)}"
                     )
                     self.logger.error(f"AI处理发生异常: {e}")
                     break  # abort processing of the current URL setting
@@ -95,8 +96,8 @@ class Runner:
     def _process_job(self):
         try:
             self.logger.info("开始AI处理数据执行任务")
-            dataProcess = DataProcess(self.store)
-            dataProcess.process()
+            data_process = DataProcess(self.store)
+            data_process.process()
             self.logger.info("AI处理数据任务执行完毕")
         except Exception as e:
             self._send_error_email("AI数据处理", f"\n    错误: {str(e)}")

+ 7 - 5
SourceCode/TenderCrawler/app/models/area_email.py

@@ -3,12 +3,14 @@ from utils.mysql_helper import MySQLHelper
 
 class AreaEmail:
 
-    def __init__(self, name=None, area=None, email=None):
+    def __init__(self, name=None, area=None, email=None, is_active=None, remark=None):
         self.name = name
         self.area = area
         if email is None:
             email = ""
         self.email = email.replace(",", ",")
+        self.is_active = is_active
+        self.remark = remark
 
     def __repr__(self):
         return (
@@ -33,19 +35,19 @@ class AreaEmail:
     #                   area_email.remark)
     #         db_helper.execute_non_query(query, params)
 
+    _query = "SELECT name,area,email FROM t_area_email WHERE is_active = 1"
+    _query_by_area = "SELECT email FROM t_area_email WHERE CONCAT(area,',') like %s AND is_active = 1"
+
     # query AreaEmail rows
     def fetch_all(self):
         with MySQLHelper() as db_helper:
-            query = "SELECT name,area,email FROM t_area_email WHERE is_active = 1"
-            results = db_helper.execute_query(query)
+            results = db_helper.execute_query(self._query)
             data = [AreaEmail(**result) for result in results]
             return data
 
     def fetch_one_by_area(self, area: str):
         with MySQLHelper() as db_helper:
-            query = "SELECT email FROM t_area_email WHERE CONCAT(area,',') like %s AND is_active = 1"
             params = ('%' + area + ',%', )
-            result = db_helper.fetch_one(query, params)
+            result = db_helper.fetch_one(self._query_by_area, params)
             if result is None:
                 return None
             return result["email"]
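Both queries are now class-level constants. The CONCAT(area, ',') LIKE '%<area>,%' trick matches a whole token inside the comma-separated area column, so an area such as 江苏 cannot accidentally match a longer name that merely starts with it.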

+ 101 - 25
SourceCode/TenderCrawler/app/models/collect_data.py

@@ -6,17 +6,22 @@ from utils.logger_helper import LoggerHelper
 class CollectData:
 class CollectData:
 
 
     logger = LoggerHelper.get_logger()
     logger = LoggerHelper.get_logger()
+    UNPROCESSED = 0
+    PROCESSED = 1
+    INVALID = 2
 
 
     def __init__(self,
     def __init__(self,
                  url=None,
                  url=None,
                  keyword=None,
                  keyword=None,
                  content=None,
                  content=None,
-                 status=None,
+                 attach_path=None,
+                 status=UNPROCESSED,
                  create_time=None,
                  create_time=None,
                  process_time=None):
                  process_time=None):
         self.url = url
         self.url = url
         self.keyword = keyword
         self.keyword = keyword
         self.content = content
         self.content = content
+        self.attach_path = attach_path
         self.status = status
         self.status = status
         self.create_time = create_time or datetime.now()
         self.create_time = create_time or datetime.now()
         self.process_time = process_time
         self.process_time = process_time
@@ -28,17 +33,30 @@ class CollectData:
             f"create_time={self.create_time}, process_time={self.process_time})"
             f"create_time={self.create_time}, process_time={self.process_time})"
         )
         )
 
 
+    _insert_query = """
+        INSERT IGNORE INTO t_collect_data (url, keyword, content, attach_path, status, create_time)
+        VALUES (%s, %s, %s, %s, %s, %s);
+        """
+    _insert_query_history = """
+         INSERT IGNORE INTO t_collect_data_history (url, keyword, content, attach_path, status, create_time)
+         VALUES (%s, %s, %s, %s, %s, %s);
+         """
+    _delete_query = """
+         DELETE FROM t_collect_data
+         WHERE url = %s;
+         """
     def insert(self, collect_data):
         if not isinstance(collect_data, self.__class__):
             raise TypeError("collect_data is not an instance of CollectData")
         with MySQLHelper() as db_helper:
-            query = """
-                INSERT IGNORE INTO t_collect_data (url, keyword, content, status, create_time)
-                VALUES (%s, %s, %s, %s, %s)
-                """
+
             params = (collect_data.url, collect_data.keyword,
-                      collect_data.content, 0, datetime.now())
-            db_helper.execute_non_query(query, params)
+                      collect_data.content, collect_data.attach_path,
+                      collect_data.status, datetime.now())
+            if collect_data.status == self.INVALID:
+                db_helper.execute_non_query(self._insert_query_history, params)
+            else:
+                db_helper.execute_non_query(self._insert_query, params)

     def insert_batch(self, collect_data_list):
         if not all(
@@ -46,34 +64,46 @@ class CollectData:
                 for collect_data in collect_data_list):
             raise TypeError("All elements of collect_data_list must be instances of CollectData")

-        query = """
-            INSERT IGNORE INTO t_collect_data (url, keyword, content, status, create_time)
-            VALUES (%s, %s, %s, %s, %s)
-            """
         params = [
             (
                 collect_data.url,
                 collect_data.keyword,
                 collect_data.content,
+                collect_data.attach_path,
+                collect_data.status,
+                datetime.now()  # evaluate datetime.now() for each row
+            ) for collect_data in collect_data_list
+            if collect_data.status != 2
+        ]
+        params2 = [
+            (
+                collect_data.url,
+                collect_data.keyword,
+                collect_data.content,
+                collect_data.attach_path,
                 collect_data.status,
                 datetime.now()  # evaluate datetime.now() for each row
             ) for collect_data in collect_data_list
+            if collect_data.status == 2
         ]
+
         with MySQLHelper() as db_helper:
-            db_helper.execute_non_query(query, params)
+            db_helper.execute_non_query(self._insert_query, params)
             # get the number of affected rows
             affected_rows = db_helper.connection.affected_rows()
+
+            db_helper.execute_non_query(self._insert_query_history, params2)
             self.logger.info(f"Inserted {affected_rows} rows")
             return affected_rows

-    def insert_url(self, url: str, keyword: str, content: str):
-        with MySQLHelper() as db_helper:
-            query = """
-                INSERT IGNORE INTO t_collect_data (url, keyword, content, status, create_time, process_time)
-                VALUES (%s, %s, %s, %s, %s, %s)
-                """
-            params = (url, keyword, content, 0, datetime.now, None)
-            db_helper.execute_non_query(query, params)
+    # def insert_url(self, url: str, keyword: str, content: str):
+    #     with MySQLHelper() as db_helper:
+    #         query = """
+    #             INSERT IGNORE INTO t_collect_data (url, keyword, content, status, create_time, process_time)
+    #             VALUES (%s, %s, %s, %s, %s, %s)
+    #             """
+    #         params = (url, keyword, content, 0, datetime.now, None)
+    #         db_helper.execute_non_query(query, params)
 
 
     # def fetch_all():
     #     with MySQLHelper() as db_helper:
@@ -101,6 +131,17 @@ class CollectData:
             data = [result['url'] for result in results]
             return data
 
 
+    def fetch_one_url(self, url: str):
+        with MySQLHelper() as db_helper:
+            query = """
+             SELECT url FROM `t_collect_data_history` WHERE url= %s UNION SELECT url FROM `t_collect_data`  WHERE url= %s LIMIT 1
+            """
+            result = db_helper.fetch_one(query, (url, url))
+            if not result:
+                return None
+            data = result["url"]
+            return data
+
     def fetch_one_collect_by_url(self, url: str):
         with MySQLHelper() as db_helper:
             query = """
@@ -116,13 +157,48 @@ class CollectData:
             return data
 
 
     def set_process(self, url: str):
+        # with MySQLHelper() as db_helper:
+        #     query = """
+        #     UPDATE t_collect_data
+        #     SET status = 1
+        #     WHERE url = %s
+        #     """
+        #     db_helper.execute_non_query(query, (url))
+        self.move_to_history_and_delete(url)
+
+
+    def move_to_history_and_delete(self, url: str):
         with MySQLHelper() as db_helper:
+            # look up the row in t_collect_data
             query = """
-            UPDATE t_collect_data
-            SET status = 1
-            WHERE url = %s
-            """
-            db_helper.execute_non_query(query, (url))
+             SELECT url, keyword, content, attach_path, status, create_time, process_time
+             FROM t_collect_data
+             WHERE url = %s
+             """
+            result = db_helper.fetch_one(query, (url,))
+            if not result:
+                self.logger.warning(f"URL {url} 未在 t_collect_data 中找到,无法移动到历史表并删除。")
+                return False
+
+            # insert the row into t_collect_data_history
+            insert_query = self._insert_query_history
+            insert_params = (
+                result["url"],
+                result["keyword"],
+                result["content"],
+                result["attach_path"],
+                result["status"],
+                result["create_time"]
+            )
+            db_helper.execute_non_query(insert_query, insert_params)
+
+            # delete the row from t_collect_data
+            delete_query = self._delete_query
+            delete_params = (url,)
+            db_helper.execute_non_query(delete_query, delete_params)
+
+            self.logger.info(f"URL {url} 已从 t_collect_data 移动到 t_collect_data_history 并删除。")
+            return True
 
 
     def fetch_by_status(self, status=0):
     def fetch_by_status(self, status=0):
         with MySQLHelper() as db_helper:
+ 38 - 34
SourceCode/TenderCrawler/app/models/process_data.py

@@ -18,6 +18,8 @@ class ProcessData:
                  address=None,
                  summary=None,
                  release_date=None,
+                 devices=None,
+                 attach_path=None,
                  status=None,
                  create_time=None,
                  send_time=None,
@@ -28,14 +30,14 @@ class ProcessData:
         self.url = url
         self.date = date
         if not area:
-            area = ConfigHelper().get("default_area")
-        if not area:
-            area = "全国"
+            area = ConfigHelper().get("default_area", "全国")
         self.area = area.replace(" ", "")
         self.keyword = keyword
         self.address = address
         self.summary = summary
         self.release_date = release_date
+        self.devices = devices
+        self.attach_path = attach_path
         self.status = status
         self.create_time = create_time or datetime.now()
         self.send_time = send_time
@@ -49,30 +51,36 @@ class ProcessData:
             f"status={self.status}, create_time={self.create_time}, "
             f"send_time={self.send_time}, remark={self.remark})")
 
 
+    _insert_query = """
+              INSERT IGNORE INTO t_data (no, title, url, keyword, date, area, address, summary, release_date, devices, attach_path, status, create_time)
+              VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+          """
+    _update_query = """
+                UPDATE t_collect_data SET status = 1 WHERE url = %s;
+            """
     def insert(self, process_data):
         if not isinstance(process_data, self.__class__):
             raise TypeError("process_data is not an instance of ProcessData")
 
 
-        insert_query = """
-            INSERT IGNORE INTO t_data (no, title, url, keyword, date, area, address, summary, release_date, status, create_time)
-            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
-        """
-
-        update_query = """
-            UPDATE t_collect_data SET status = 1 WHERE url = %s;
-        """
-
-        insert_params = (process_data.no, process_data.title, process_data.url,
-                         process_data.keyword, process_data.date,
-                         process_data.area, process_data.address,
-                         process_data.summary, process_data.release_date, 0,
+        insert_params = (process_data.no,
+                         process_data.title,
+                         process_data.url,
+                         process_data.keyword,
+                         process_data.date,
+                         process_data.area,
+                         process_data.address,
+                         process_data.summary,
+                         process_data.release_date,
+                         process_data.devices,
+                         process_data.attach_path,
+                         0,
                          datetime.now())
 
 
         update_params = (process_data.url, )
 
 
         with MySQLHelper() as db_helper:
-            db_helper.execute_non_query(insert_query, insert_params)
-            db_helper.execute_non_query(update_query, update_params)
+            db_helper.execute_non_query(self._insert_query, insert_params)
+            db_helper.execute_non_query(self._update_query, update_params)
 
 
     def insert_batch(self, process_data_list):
         if not all(
@@ -80,14 +88,6 @@ class ProcessData:
                 for process_data in process_data_list):
             raise TypeError("All elements of process_data_list must be instances of ProcessData")
 
 
-        insert_query = """
-            INSERT IGNORE INTO t_data (no, title, url, keyword, date, area, address, summary, release_date, status, create_time)
-            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
-        """
-
-        update_query = """
-            UPDATE t_collect_data SET status = 1 WHERE url = %s;
-        """
 
 
         insert_params = [(
             process_data.no,
@@ -99,6 +99,8 @@ class ProcessData:
             process_data.address,
             process_data.summary,
             process_data.release_date,
+            process_data.devices,
+            process_data.attach_path,
             0,
             datetime.now(),
         ) for process_data in process_data_list]
@@ -107,29 +109,31 @@ class ProcessData:
                          for process_data in process_data_list]
 
 
         with MySQLHelper() as db_helper:
-            db_helper.execute_non_query(insert_query, insert_params)
+            db_helper.execute_non_query(self._insert_query, insert_params)
             affected_rows = db_helper.connection.affected_rows()
             self.logger.info(f"Inserted {affected_rows} rows")
             for param in update_params:
-                db_helper.execute_non_query(update_query, param)
+                db_helper.execute_non_query(self._update_query, param)
             return affected_rows
 
 
+    _one_query = """
+                    SELECT url,no,other_urls,attach_path FROM t_data WHERE no = %s  LIMIT 1
+                """
     def fetch_one_process_by_no(self, no: str):
         with MySQLHelper() as db_helper:
-            query = """
-                SELECT url,no,other_urls FROM t_data WHERE no = %s  LIMIT 1
-            """
-            result = db_helper.fetch_one(query, (no, ))
+
+            result = db_helper.fetch_one(self._one_query, (no, ))
             if not result:
                 return None
             data = ProcessData(url=result["url"],
                                no=result["no"],
-                               other_urls=result["other_urls"])
+                               other_urls=result["other_urls"],
+                               attach_path=result["attach_path"])
             return data
 
 
     def fetch_no_send(self):
         with MySQLHelper() as db_helper:
-            query = "SELECT no, title, url, keyword, date, area, address, summary, release_date FROM t_data WHERE status = 0"
+            query = "SELECT no, title, url, keyword, date, area, address, summary, attach_path, release_date FROM t_data WHERE status = 0"
             results = db_helper.execute_query(query)
             data = [ProcessData(**result) for result in results]
             return data
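A minimal sketch of the extended model (field values are illustrative; insert also flips the matching t_collect_data row to processed via _update_query):

    pd = ProcessData(no="ZB-2024-001",
                     title="…",
                     url="https://example.com/tender/1",
                     keyword="红外",
                     devices="红外光谱仪",
                     attach_path="attaches/2024-12-20/093015123@file.pdf")
    ProcessData().insert(pd)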

+ 13 - 7
SourceCode/TenderCrawler/app/models/url_setting.py

@@ -5,28 +5,33 @@ class UrlSetting:
 
 
     def __init__(self,
                  url=None,
-                 type=None,
+                 adapter_type=None,
                  username=None,
                  password=None,
-                 keywords=None):
+                 keywords=None,
+                 sort=None,
+                 is_active=None):
         self.url = url
-        self.type = type
+        self.adapter_type = adapter_type
         self.username = username
         self.password = password
         if not keywords:
             keywords = ""
         self.keywords = keywords.replace("，", ",")
+        self.sort = sort or 0
+        self.is_active = is_active
+
 
 
     def __repr__(self):
         return (
-            f"<UrlSetting(url={self.url}, type={self.type}, "
+            f"<UrlSetting(url={self.url}, type={self.adapter_type}, "
             f"username={self.username}, keywords={self.keywords}, is_active={self.is_active})>"
         )
 
 
     def to_dict(self):
         return {
             'url': self.url,
-            'type': self.type,
+            'type': self.adapter_type,
             'username': self.username,
             'password': self.password,
             'keywords': self.keywords,
@@ -46,10 +51,11 @@ class UrlSetting:
     #                   url_setting.is_active)
     #         db_helper.execute_non_query(query, params)
 
 
+    _query = "SELECT  url, adapter_type, username, password, keywords FROM t_urls WHERE is_active = 1 ORDER BY  sort DESC "
+
     # Query URL settings
     def fetch_all(self):
         with MySQLHelper() as db_helper:
-            query = "SELECT  url, type, username, password, keywords FROM t_urls WHERE is_active = 1"
-            results = db_helper.execute_query(query)
+            results = db_helper.execute_query(self._query)
             data = [UrlSetting(**result) for result in results]
             return data
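Because fetch_all builds instances with UrlSetting(**result), the selected column names must match the constructor parameters exactly; a sketch with a hypothetical row:

    row = {"url": "https://www.ccgp.gov.cn/index.shtml", "adapter_type": "ccgp",
           "username": "", "password": "", "keywords": "红外，拉曼"}
    setting = UrlSetting(**row)
    keywords = setting.keywords.split(",")   # full-width commas are normalized in __init__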

+ 6 - 5
SourceCode/TenderCrawler/app/stores/data_store_interface.py

@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from models.collect_data import CollectData
 from models.process_data import ProcessData
 
 
 
 
@@ -8,11 +9,11 @@ class IDataStore(ABC):
     """
     """
 
 
     @abstractmethod
     @abstractmethod
-    def insert_collect_data(self,
-                            url: str,
-                            keyword: str,
-                            content: str,
-                            is_batch=True) -> None:
+    def query_one_collect_url(self, url: str) -> str | None:
+        raise NotImplementedError("query_one_collect_url must be implemented by a subclass.")
+
+    @abstractmethod
+    def insert_collect_data(self, data: CollectData, is_batch=True) -> None:
         raise NotImplementedError("insert must be implemented by a subclass.")
 
 
     @abstractmethod
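A sketch of a store implementing the two reworked members (in-memory, for tests only; the remaining abstract methods of IDataStore are omitted here):

    class InMemoryDataStore(IDataStore):
        def __init__(self):
            self._collected = {}

        def query_one_collect_url(self, url: str) -> str | None:
            return url if url in self._collected else None

        def insert_collect_data(self, data: CollectData, is_batch=True) -> None:
            self._collected[data.url] = data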

+ 9 - 3
SourceCode/TenderCrawler/app/stores/default_data_store.py

@@ -4,13 +4,18 @@ from stores.data_store_interface import IDataStore
 
 
 class DefaultDataStore(IDataStore):

+
+
     logger = LoggerHelper.get_logger()
 
 
     def __init__(self):
         pass
 
 
-    def insert_collect_data(self, url, keyword, content):
-        self.logger.info(f"Default: INSERT {url},关键字:{keyword}")
+    def query_one_collect_url(self, url: str) -> str | None:
+        self.logger.info("Default: QUERY_ONE_COLLECT_URL")
+    def insert_collect_data(self, data, is_batch=True):
+        self.logger.info("Default: INSERT_COLLECT_DATA")
+
 
 
     def save_collect_data(self, is_force=False):
         self.logger.info("Default: SAVE")
@@ -20,7 +25,8 @@ class DefaultDataStore(IDataStore):
 
 
     def query_one_collect_by_url(self, url):
         self.logger.info("Default: QUERY_ONE_PROCESS")
-
+    def query_one_process_by_no(self, no):
+        self.logger.info(f"Default: query_one_process_by_no")
     def insert_process_data(self, data):
         self.logger.info("Default: INSERT_PROCESS_DATA")
 
 

+ 9 - 17
SourceCode/TenderCrawler/app/stores/mysql_data_store.py

@@ -15,23 +15,14 @@ class MysqlDataStore(IDataStore):
     _areaEmail = AreaEmail()
 
 
     def __init__(self):
-        size = self.config.get('save.collect_batch_size')
-        if not size:
-            size = 1
-        self._collect_size = int(size)
+        self._collect_size = self.config.get_int('save.collect_batch_size', 1)
         self._collect_list = []
-        size = self.config.get('save.process_batch_size')
-        if not size:
-            size = 1
-        self._process_size = int(size)
+        self._process_size = self.config.get_int('save.process_batch_size', 1)
         self._process_list = []
 
 
-    def insert_collect_data(self,
-                            url: str,
-                            keyword: str,
-                            content: str,
-                            is_batch=True):
-        data = CollectData(url, keyword, content, 0)
+    def query_one_collect_url(self, url: str) -> str | None:
+        return self._collectData.fetch_one_url(url)
+    def insert_collect_data(self, data: CollectData, is_batch=True):
         if not is_batch:
             self._collectData.insert(data)
         else:
@@ -39,7 +30,7 @@ class MysqlDataStore(IDataStore):
             self.save_collect_data()
 
 
     def save_collect_data(self, is_force=False):
-        if (is_force or len(self._collect_list) >= self._collect_size):
+        if (is_force and len(self._collect_list)>0) or len(self._collect_list) >= self._collect_size:
             self.logger.info("批量保存到数据库,数量: " + str(len(self._collect_list)))
             self.logger.info("批量保存到数据库,数量: " + str(len(self._collect_list)))
             self._collectData.insert_batch(self._collect_list)
             self._collectData.insert_batch(self._collect_list)
             self._collect_list = []
             self._collect_list = []
@@ -56,14 +47,15 @@ class MysqlDataStore(IDataStore):
     def insert_process_data(self, data: ProcessData, is_batch=True):
         if not is_batch:
             self._processData.insert(data)
+            self.logger.info(f"保存到数据库: {data.url}" )
         else:
             self._process_list.append(data)
             self.save_process_data()
 
 
     # Inserting into the database marks the CollectData rows as processed
     def save_process_data(self, is_force=False):
-        if (is_force or len(self._process_list) >= self._process_size):
-            self.logger.info("批量保存到数据库,数量: " + str(len(self._process_list)))
+        if (is_force and len(self._process_list)>0) or len(self._process_list) >= self._process_size:
+            self.logger.info(f"批量保存到数据库,数量: {str(len(self._process_list))}")
             self._processData.insert_batch(self._process_list)
             self._process_list = []
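The changed flush condition means a forced save now writes only when the buffer is non-empty; a sketch assuming APP_SAVE__COLLECT_BATCH_SIZE=2:

    store = MysqlDataStore()
    store.insert_collect_data(data1)        # buffered: 1 < 2, no write yet
    store.insert_collect_data(data2)        # buffer reaches 2, insert_batch fires
    store.save_collect_data(is_force=True)  # no-op here, the buffer is already empty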
 
 

+ 77 - 41
SourceCode/TenderCrawler/app/utils/ai_helper.py

@@ -1,5 +1,7 @@
 import re
 import requests
+from openai import OpenAI
+import json
 
 
 from utils.logger_helper import LoggerHelper
 from utils.config_helper import ConfigHelper
@@ -14,11 +16,11 @@ class AiHelper:
     _ai_api_key = None
     _ai_api_url = None
     _ai_max_tokens = 150
-    _ai_system_prompt = "请帮我分析以下文字,提取出关键信息,并以json格式字符串返回,如果部分信息为空,则该字段返回为空。"
-    _ai_prompt_template = """在以上内容中提取信息:编号(no) 、标题(title)、在哪个城市招标(area)、开标的时间(date)、
-    开标的地点(address)、发布时间(release_date)、150字左右的招标条件要求及联系方式等内容摘要(summary)。
-    编号一般在“招标编号:”的后面,例如 (招标编号:xxxxxxx...), “xxxxxxx...”就是编号(no)。"
-    返回包含no,title,area,date,address,release_date,summary字段的json格式字符串,没有找到的信息json字段为空。"""
+    DEFAULT_AI_SYSTEM_PROMPT = "请帮我分析以下文字,提取出关键信息,并以json格式字符串返回,如果部分信息为空,则该字段返回为空。"
+    DEFAULT_AI_PROMPT_TEMPLATE = """在以上内容中提取信息:编号(no) 、标题(title)、在哪个城市招标(area)、开标的时间(date)、
+    开标的地点(address)、发布时间(release_date)、150字左右的招标条件要求及联系方式等内容摘要(summary),相关采购设备的名称信息,多个设备以逗号分割(device)
+    返回包含no,title,area,date,address,release_date,summary,device字段的json格式字符串,没有找到或未提供的信息json字段为空。
+"""
 
 
     def __init__(self):
         self._ai_api_key = self.config.get("ai.key")
@@ -27,12 +29,11 @@ class AiHelper:
         max_tokens = self.config.get("ai.max_tokens")
         if max_tokens:
             self._ai_max_tokens = int(max_tokens)
-        system_prompt = self.config.get("ai.system_prompt")
-        if system_prompt:
-            self._ai_system_prompt = system_prompt
-        prompt_template = self.config.get("ai.prompt_template")
-        if prompt_template:
-            self._ai_prompt_template = prompt_template
+        self._ai_system_prompt = self.config.get("ai.system_prompt",
+                                                 self.DEFAULT_AI_SYSTEM_PROMPT)
+        self._ai_prompt_template = self.config.get(
+            "ai.prompt_template", self.DEFAULT_AI_PROMPT_TEMPLATE)
+
 
 
     def call_ai(self, content: str) -> ProcessData:
         # truncate to the first 100 characters for logging
@@ -44,37 +45,31 @@ class AiHelper:
             raise Exception("AI API url 没有配置")
             raise Exception("AI API url 没有配置")
         if self._api_model is None:
         if self._api_model is None:
             raise Exception("AI API model 没有配置")
             raise Exception("AI API model 没有配置")
-        headers = {
-            "Content-Type": "application/json",
-            "Authorization": f"Bearer {self._ai_api_key}"
-        }
-        messages = [{
-            "role": "system",
-            "content": self._ai_system_prompt
-        }, {
-            "role": "user",
-            "content": f"{content} {self._ai_prompt_template}"
-        }]
+        client = OpenAI(api_key=self._ai_api_key, base_url=self._ai_api_url)
+        completion = client.chat.completions.create(
+            model=self._api_model,
+            messages=[{
+                "role": "system",
+                "content": self._ai_system_prompt,
+            }, {
+                "role": "user",
+                "content": f"{content}  {self._ai_prompt_template}",
+            }],
+            stream=False,
+            temperature=0.7,
+        )
 
 
-        data = {
-            "model": self._api_model,
-            "messages": messages,
-            "stream": False,
-            "max_tokens": self._ai_max_tokens
-        }
-        response = requests.post(self._ai_api_url, headers=headers, json=data)
-        if response.status_code == 200:
-            try:
-                self.logger.info(f"AI Response: {response.text}")
-                resStr = self._extract_message_content(response.json())
-                return self._parse_response(resStr, True)
-            except Exception as e:
-                raise Exception(f"解析 AI 响应错误: {e}")
-        else:
-            raise Exception(
-                f"调用 AI 错误: {response.status_code} - {response.text}")
+        self.logger.info(f"AI Response: {completion.model_dump_json()}")
+        response = json.loads(completion.model_dump_json())
+        #self.logger.info(f"AI Response: {response}")
+        try:
+            res_str = self._extract_message_content(response)
+            return self._parse_response(res_str, True)
+        except Exception as e:
+            raise Exception(f"解析 AI 响应错误: {e}")
 
 
-    def _extract_message_content(self, response_json: dict) -> str:
+    @staticmethod
+    def _extract_message_content(response_json: dict) -> str:
         if "choices" in response_json and len(response_json["choices"]) > 0:
         if "choices" in response_json and len(response_json["choices"]) > 0:
             choice = response_json["choices"][0]
             choice = response_json["choices"][0]
             message_content = choice.get("message", {}).get("content", "")
             message_content = choice.get("message", {}).get("content", "")
@@ -104,7 +99,6 @@ class AiHelper:
         return message_content
 
 
     def _parse_response(self, response: str, first=True) -> ProcessData:
-        import json
         self.logger.info(f"AI Response JSON STR: {response}")
         self.logger.info(f"AI Response JSON STR: {response}")
         try:
         try:
             data = json.loads(response)
             data = json.loads(response)
@@ -113,6 +107,7 @@ class AiHelper:
                                date=data.get("date"),
                                area=data.get("area"),
                                address=data.get("address"),
+                               devices=data.get("device"),
                                summary=data.get("summary"),
                                release_date=data.get("release_date"))
         except json.JSONDecodeError as e:
@@ -124,3 +119,44 @@ class AiHelper:
                 return self._parse_response(message_content, False)
             else:
                 raise Exception(f"Failed to parse AI response: {e}")
+
+
+    def call_ai_1(self, content: str) -> ProcessData:
+        # 截取前100个字符进行日志记录
+        # truncated_content = content[:100]
+        self.logger.info("调用AI API")
+        if self._ai_api_key is None:
+            raise Exception("AI API key 没有配置")
+        if self._ai_api_url is None:
+            raise Exception("AI API url 没有配置")
+        if self._api_model is None:
+            raise Exception("AI API model 没有配置")
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self._ai_api_key}"
+        }
+        messages = [{
+            "role": "system",
+            "content": self._ai_system_prompt
+        }, {
+            "role": "user",
+            "content": f"{content} {self._ai_prompt_template}"
+        }]
+
+        data = {
+            "model": self._api_model,
+            "messages": messages,
+            "stream": False,
+            "max_tokens": self._ai_max_tokens
+        }
+        response = requests.post(self._ai_api_url, headers=headers, json=data)
+        if response.status_code == 200:
+            try:
+                self.logger.info(f"AI Response: {response.text}")
+                res_str = self._extract_message_content(response.json())
+                return self._parse_response(res_str, True)
+            except Exception as e:
+                raise Exception(f"解析 AI 响应错误: {e}")
+        else:
+            raise Exception(
+                f"调用 AI 错误: {response.status_code} - {response.text}")

+ 17 - 6
SourceCode/TenderCrawler/app/utils/config_helper.py

@@ -1,12 +1,10 @@
 import os
 import os
 import yaml
 
-from utils.logger_helper import LoggerHelper
 
 
 
 
 class ConfigHelper:
 class ConfigHelper:
     _instance = None
 
 
     # 默认配置文件路径
     # default configuration file path
     default_config_path = os.path.join(os.path.dirname(__file__), '..',
     def load_config(self, path=None):
     def load_config(self, path=None):
         if self._config is None:
             if not path:
+                # print(f"使用默认配置文件:{self.default_config_path}")
                 self._path = self.default_config_path
                 self._path = self.default_config_path
             else:
                 self._path = path
             self._config = yaml.safe_load(file)
             self._config = yaml.safe_load(file)
         # merge environment-variable overrides
         self._merge_env_vars()
+        # print(f"加载的配置文件内容:{self._config}")
         return self._config
         return self._config
 
     def _merge_env_vars(self, env_prefix="APP_"):  # 环境变量前缀为 APP_
     def _merge_env_vars(self, env_prefix="APP_"):  # environment variables use the APP_ prefix
         else:
         else:
             config[keys[0]] = value
 
-    def get(self, key):
+    def get(self, key:str, default:str=None):
         if self._config is None:
         if self._config is None:
             self.load_config(self._path)
         keys = key.split('.')
             if isinstance(config, dict) and k in config:
             if isinstance(config, dict) and k in config:
                 config = config[k]
             else:
+                return default
         return config
         return config
 
+    def get_bool(self, key:str)->bool:
+        val = str(self.get(key,"0"))
+        return True if val.lower() == "true" or val == "1" else False
+
+    def get_int(self, key:str, default:int=0)->int:
+        val = self.get(key)
+        if not val:
+            return default
+        try :
+            return int(val)
+        except ValueError:
+            return default
+
     def get_all(self):
     def get_all(self):
         if self._config is None:
             self.load_config(self._path)
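A sketch of the typed getters together with the APP_ environment overrides (assuming the double underscore maps to a dot, as the APP_MYSQL__HOST entries in docker-compose.yml suggest):

    import os
    os.environ["APP_SAVE__COLLECT_BATCH_SIZE"] = "5"
    cfg = ConfigHelper()
    cfg.get_int("save.collect_batch_size", 1)   # -> 5
    cfg.get_bool("schedule.run_now")            # "1"/"true" -> True, otherwise False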

+ 48 - 12
SourceCode/TenderCrawler/app/utils/email_helper.py

@@ -1,12 +1,18 @@
 import smtplib
+import os
+import mimetypes
 from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText
 from email.mime.base import MIMEBase
 from email import encoders
-import os
+
 
 
 from utils.config_helper import ConfigHelper
 from utils.logger_helper import LoggerHelper
+from utils.string_helper import StringHelper
+
+
+
 
 
 
 
 class EmailHelper:
@@ -29,7 +35,7 @@ class EmailHelper:
                    subject: str,
                    body: str,
                    body_is_html: bool = True,
-                   attachment_path: str = None):
+                   attachment_paths: str = None):
         msg = MIMEMultipart()
         msg['From'] = self.from_email
         msg['To'] = ', '.join(to_addr.split(','))
@@ -41,12 +47,13 @@ class EmailHelper:
         else:
             msg.attach(MIMEText(body, 'plain', 'utf-8'))
 
 
-        if attachment_path:
-            self._attach_file(msg, attachment_path)
+        if attachment_paths:
+            attachment_arr = StringHelper.to_array(attachment_paths)
+            for attachment_path in attachment_arr:
+                self._attach_file(msg, attachment_path)
 
 
         try:
-            # with smtplib.SMTP(self.smtp_server, self.port, timeout=10) as server:
-            with smtplib.SMTP_SSL(self.smtp_server, timeout=10) as server:
+            with smtplib.SMTP_SSL(self.smtp_server, port=self.port, timeout=10) as server:
                 # server.starttls()
                 server.login(self.username, self.password)
                 # split the to_addr string on ',' into a list for sendmail
@@ -54,20 +61,49 @@ class EmailHelper:
                                 msg.as_string())
             self.logger.info(f"Email sent successfully: {to_addr}")
             return True
+        except smtplib.SMTPAuthenticationError:
+            self.logger.error("SMTP authentication failed")
+        except smtplib.SMTPServerDisconnected:
+            self.logger.error("SMTP server disconnected")
+        except smtplib.SMTPException as e:
+            self.logger.error(f"SMTP error: {e}")
         except Exception as e:
             self.logger.error(f"Failed to send email: {to_addr} {e}")
             return False
 
 
+
     def _attach_file(self, msg: MIMEMultipart, attachment_path: str):
         if not os.path.isfile(attachment_path):
-            raise FileNotFoundError(
-                f"The file {attachment_path} does not exist.")
+            self.logger.error(f"文件 {attachment_path} 不存在。")
+            return
+
+        file_size = os.path.getsize(attachment_path)
+        max_size = 1024 * 8192  # 8MB
+
+        if file_size > max_size:
+            self.logger.error(f"文件 {attachment_path} 大小超过限制 ({file_size} bytes > {max_size} bytes),不添加附件。")
+            return
+
+        # derive the MIME type from the file extension
+        content_type, _ = mimetypes.guess_type(attachment_path)
+        if content_type is None:
+            content_type = 'application/octet-stream'  # fallback type
+        main_type, sub_type = content_type.split('/', 1)
 
 
         with open(attachment_path, "rb") as attachment:
-            part = MIMEBase('application', 'octet-stream')
-            part.set_payload(attachment.read())
-            encoders.encode_base64(part)
+            # part = MIMEBase('application', 'octet-stream')
+            part = MIMEBase(main_type, sub_type)
+            part.set_payload(attachment.read(max_size))
+            # take the file name and strip everything before the first '@'
+            name = os.path.basename(attachment_path)
+            at_index = name.find('@')
+            if at_index != -1:
+                name = name[at_index + 1:]
             part.add_header(
                 'Content-Disposition',
-                f"attachment; filename= {os.path.basename(attachment_path)}")
+                f"attachment; filename= {name}")
+            part.add_header('Content-ID', '<0>')
+            part.add_header('X-Attachment-Id', '0')
+            encoders.encode_base64(part)
             msg.attach(part)
+            self.logger.info(f"添加附件 {name} {attachment_path} 到邮件中。")

+ 67 - 0
SourceCode/TenderCrawler/app/utils/file_helper.py

@@ -0,0 +1,67 @@
+import os
+import requests
+from datetime import datetime
+from urllib.parse import urlparse
+
+from utils.logger_helper import LoggerHelper
+from utils.config_helper import ConfigHelper
+
+class FileHelper:
+    logger = LoggerHelper.get_logger()
+    config = ConfigHelper()
+    DEFAULT_ATTACH_PATH = "./attaches/"
+    def __init__(self):
+        path = self.config.get("save.attach_file_path", self.DEFAULT_ATTACH_PATH)
+        path = path.replace("\\", "/")
+        path = path.replace("//", "/")
+        self._attach_file_path = path
+
+    def download_remote_file(self, file_url, file_name) -> str | None:
+        self.logger.info(f"下载远程文件: {file_url}  文件名:{file_name}")
+        current_timestamp = datetime.now().strftime("%H%M%S%f")[:-3]  # 取前三位毫秒
+        file_name = f"{current_timestamp}@{file_name}"
+        file_path = os.path.join(self._attach_file_path, f'{datetime.now().strftime("%Y-%m-%d")}')
+        if not os.path.exists(file_path):
+            os.makedirs(file_path)
+        path = os.path.join(file_path, file_name)
+        path = path.replace("\\", "/")
+        path = path.replace("//", "/")
+        # ten different User-Agent strings to rotate through
+        user_agents = [
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15",
+            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/91.0.4472.124 Safari/605.1.15",
+            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
+            "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1",
+            "Mozilla/5.0 (iPad; CPU OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1",
+            "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36"
+        ]
+
+        # pick a User-Agent based on the file-name length
+        ua_index = len(file_name) % len(user_agents)
+        # parse file_url to derive the Referer
+        parsed_url = urlparse(file_url)
+        referer = f"{parsed_url.scheme}://{parsed_url.netloc}/".replace("//download.", "//www.")
+        headers = {
+            'User-Agent': user_agents[ua_index],
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
+            'Referer': referer
+        }
+
+        try:
+            response = requests.get(file_url, headers=headers, allow_redirects=True)
+            response.raise_for_status()
+            with open(path, 'wb') as f:
+                f.write(response.content)
+            self.logger.info(f"文件下载成功: {file_name}")
+            return path
+        except requests.exceptions.HTTPError as http_err:
+            self.logger.error(f"HTTP 错误: {http_err}")
+        except Exception as e:
+            self.logger.error(f"文件下载失败: {file_name}。Exception: {e}")
+            return None
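A minimal sketch of the downloader (URL is hypothetical; the saved name gains a millisecond timestamp prefix and lands in a per-day folder):

    file_helper = FileHelper()
    saved_path = file_helper.download_remote_file(
        "https://download.example.com/files/tender.pdf",
        "tender.pdf",
    )
    if saved_path:
        print(saved_path)   # e.g. ./attaches/2024-12-20/093015123@tender.pdf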

+ 16 - 15
SourceCode/TenderCrawler/app/utils/logger_helper.py

@@ -2,6 +2,7 @@ import os
 import logging
 from logging.handlers import TimedRotatingFileHandler
 
 
+from utils.config_helper import ConfigHelper
 
 
 class LoggerHelper:
     """
@@ -9,20 +10,21 @@ class LoggerHelper:
     This class implements the singleton pattern, ensuring only one logger instance is created and used across the application.
     """
     _instance = None
+    config = ConfigHelper()
 
 
-    def __new__(self, *args, **kwargs):
+    def __new__(cls, *args, **kwargs):
         """
         """
         实现单例模式,确保日志记录器仅被创建一次
         实现单例模式,确保日志记录器仅被创建一次
         如果尚未创建实例,则创建并初始化日志记录器
         如果尚未创建实例,则创建并初始化日志记录器
         """
         """
-        if not self._instance:
-            self._instance = super(LoggerHelper,
-                                   self).__new__(self, *args, **kwargs)
+        if not cls._instance:
+            cls._instance = super(LoggerHelper,
+                                  cls).__new__(cls, *args, **kwargs)
             try:
-                self._instance._initialize_logger()
+                cls._instance._initialize_logger()
             except Exception as e:
                 raise Exception(f"Failed to configure logger: {e}")
-        return self._instance
+        return cls._instance
 
 
     @property
     def logger(self):
@@ -34,13 +36,12 @@ class LoggerHelper:
         """
         """
         self._logger = logging.getLogger('app_logger')
         self._logger = logging.getLogger('app_logger')
         self._logger.setLevel(logging.INFO)
         self._logger.setLevel(logging.INFO)
-        log_folder = './logs'
-        if not os.path.exists(log_folder):
-            os.makedirs(log_folder)
+        log_file_path = self.config.get("logger.file_path", "./logs")
+        if not os.path.exists(log_file_path):
+            os.makedirs(log_file_path)
 
 
         # create a file handler that rotates daily
-        file_handler = TimedRotatingFileHandler(os.path.join(
-            log_folder, 'data_collector.log'),
+        file_handler = TimedRotatingFileHandler(os.path.join(log_file_path, 'crawler.log'),
                                                 when='midnight',
                                                 interval=1,
                                                 backupCount=7,
@@ -64,11 +65,11 @@ class LoggerHelper:
         self._logger.addHandler(console_handler)
 
 
     @classmethod
-    def get_logger(self):
+    def get_logger(cls):
         """
         """
         提供初始化后的日志记录器实例
         提供初始化后的日志记录器实例
         :return: 初始化后的日志记录器实例
         :return: 初始化后的日志记录器实例
         """
         """
-        if not self._instance:
-            self._instance = self()
-        return self._instance._logger
+        if not cls._instance:
+            cls._instance = cls()
+        return cls._instance._logger
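Usage stays the same after the cls-based singleton cleanup; a one-line sketch:

    logger = LoggerHelper.get_logger()
    logger.info("collector started")   # goes to the console and to <logger.file_path>/crawler.log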

+ 77 - 0
SourceCode/TenderCrawler/app/utils/string_helper.py

@@ -0,0 +1,77 @@
+class StringHelper:
+
+    @staticmethod
+    def check_empty(s: str, default: str) -> str:
+        """
+        Return s if it is non-empty, otherwise return default.
+        """
+        if s:
+            return s
+        return default
+
+
+
+    @staticmethod
+    def to_array(s: str, sep: str = ",") -> list[str]:
+        """
+        Split a string into a list by the given separator.
+
+        :param s: the string to split.
+        :param sep: the separator, a comma by default.
+        :return: the list of parts.
+        """
+        if not s:
+            return []
+        if sep == ",":
+            s = s.replace(",", ",")
+        return s.split(sep)
+
+    @staticmethod
+    def startswith(s: str, prefix: str) -> str:
+        """
+        Ensure a string starts with the given prefix, prepending it if missing.
+
+        :param s: the string to check.
+        :param prefix: the prefix.
+        :return: the original string if it already starts with the prefix, otherwise the completed string.
+        """
+        if not s.startswith(prefix):
+            return prefix + s
+        return s
+
+    @staticmethod
+    def endswith(s: str, suffix: str) -> str:
+        """
+        Ensure a string ends with the given suffix, appending it if missing.
+
+        :param s: the string to check.
+        :param suffix: the suffix.
+        :return: the original string if it already ends with the suffix, otherwise the completed string.
+        """
+        if not s.endswith(suffix):
+            return s + suffix
+        return s
+
+    @staticmethod
+    def split_and_clean(s: str, sep: str = ",") -> list[str]:
+        """
+        Split a string by the given separator and drop empty parts.
+
+        :param s: the string to split.
+        :param sep: the separator, a comma by default.
+        :return: the list of parts with empty strings removed.
+        """
+        if not s:
+            return []
+        parts = StringHelper.to_array(s, sep)
+        return [part.strip() for part in parts if part.strip()]
+
+    @staticmethod
+    def remove_extra_spaces(s: str) -> str:
+        """
+        将字符串中的多个连续空格替换为单个空格。
+
+        :param s: 要处理的字符串。
+        :return: 替换后的字符串。
+        """
+        return ' '.join(s.split())
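A few illustrative calls:

    StringHelper.to_array("红外，拉曼，气体分析")   # ['红外', '拉曼', '气体分析'] – full-width commas normalized
    StringHelper.split_and_clean(" a , ,b ")        # ['a', 'b']
    StringHelper.endswith("./attaches", "/")        # './attaches/'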

+ 30 - 28
SourceCode/TenderCrawler/docker-compose.yml

@@ -1,9 +1,9 @@
 version: '3.8'

 services:
-  dc-mysql:
+  crawler-mysql:
     image: mysql:8.0.39
-    container_name: y_data-collect-mysql
+    container_name: y_tender-crawler-mysql
     environment:
       - MYSQL_ROOT_PASSWORD=${MYSQL_ROOT_PASSWORD}
       - MYSQL_DATABASE=${MYSQL_DATABASE}
@@ -12,23 +12,23 @@ services:
       - TZ=Asia/Shanghai
       # - MYSQL_DEFAULT_AUTHENTICATION_PLUGIN=mysql_native_password
     volumes:
-      - /home/docker/data-collect/mysql/log:/var/log/mysql
-      - /home/docker/data-collect/mysql/data:/var/lib/mysql
-      - /home/docker/data-collect/mysql/conf.d:/etc/mysql/conf.d
-      - /etc/localtime:/etc/localtime:ro
-      - /home/docker/data-collect/mysql/init/init.sql:/docker-entrypoint-initdb.d/init.sql # 挂载 init.sql 文件
+       - /home/docker/tender-crawler/mysql/log:/var/log/mysql
+       - /home/docker/tender-crawler/mysql/data:/var/lib/mysql
+       - /home/docker/tender-crawler/mysql/conf.d:/etc/mysql/conf.d
+       - /etc/localtime:/etc/localtime:ro
+       - /home/docker/tender-crawler/app/init.sql:/docker-entrypoint-initdb.d/init.sql # mount the init.sql file
       # - ./.dev/mysql5.7/log:/var/log/mysql
       # - ./.dev/mysql5.7/data:/var/lib/mysql
-      # - ./.dev/mysql8.0.39/log:/var/log/mysql
-      # - ./.dev/mysql8.0.39/data:/var/lib/mysql
-      # - ./init.sql:/docker-entrypoint-initdb.d/init.sql
+#      - ./.dev/mysql8.0.39/log:/var/log/mysql
+#      - ./.dev/mysql8.0.39/data:/var/lib/mysql
+#      - ./init.sql:/docker-entrypoint-initdb.d/init.sql
     ports:
       - '${MYSQL_PORT}:3306'
     networks:
-      - dc-net
+      - crawler-net
     restart: always
 
 
-  dc-selenium:
+  crawler-selenium:
     image: selenium/standalone-chrome:latest
     container_name: y_selenium
     environment:
@@ -37,26 +37,26 @@ services:
       - '${SELENIUM_CHROME_PORT}:4444'
       - '5900:5900'
     networks:
-      - dc-net
+      - crawler-net
     restart: always
 
 
-  dc-app:
+  crawler-app:
     build: .
-    image: y_data-collect-app:1.0.0
-    container_name: y_data-collect-app
+    image: y_tender-crawler-app:1.0.0
+    container_name: y_tender-crawler-app
     depends_on:
-      - dc-mysql
-      - dc-selenium
+      - crawler-mysql
+      - crawler-selenium
     environment:
       - TZ=Asia/Shanghai
-      - APP_MYSQL__HOST=y_data-collect-mysql
+      - APP_MYSQL__HOST=y_tender-crawler-mysql
       - APP_MYSQL__PORT=3306
       - APP_MYSQL__DB=${MYSQL_DATABASE}
       - APP_MYSQL__USER=${MYSQL_USER}
       - APP_MYSQL__PASSWORD=${MYSQL_PASSWORD}
-      - APP_AI__KEY=
-      - APP_AI__URL=http://192.168.0.109:7580/api/chat
-      - APP_AI__MODEL=qwen2.5:7b
+#      - APP_AI__KEY=
+#      - APP_AI__URL=http://192.168.0.109:7580/api/chat
+#      - APP_AI__MODEL=qwen2.5:7b
       - APP_AI__MAX_TOKENS=1024
       - APP_SCHEDULE__SLEEP_INTERVAL=600 # unit: seconds; check every 10 minutes
       - APP_SCHEDULE__COLLECT=20:00,12:00
@@ -65,17 +65,19 @@ services:
       - APP_SCHEDULE__RUN_NOW=1
       - APP_SELENIUM__REMOTE_DRIVER_URL=http://y_selenium:4444/wd/hub
     volumes:
-      - /home/docker/data-collect/app/config.yml:/app/config.yml
-      - /home/docker/data-collect/app/logs:/app/logs
-      # - ./.dev/app/config.yml:/app/config.yml
-      # - ./.dev/app/logs:/app/logs
+      - /home/docker/tender-crawler/app/config.yml:/app/config.yml
+      - /home/docker/tender-crawler/app/logs:/app/logs
+      - /home/docker/tender-crawler/app/attaches:/app/attaches
+#      - ./.dev/app/config.yml:/app/config.yml
+#      - ./.dev/app/logs:/app/logs
+#      - ./.dev/app/attaches:/app/attaches
     networks:
-      - dc-net
+      - crawler-net
     # expose ports if needed
     # ports:
     #   - "8080:8080"
     restart: always
 
 
 networks:
-  dc-net:
+  crawler-net:
     driver: bridge
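The compose file expects its credentials and ports from a .env file next to it; a placeholder sketch (values are illustrative):

    # .env
    MYSQL_ROOT_PASSWORD=change-me
    MYSQL_DATABASE=tender_crawler
    MYSQL_USER=crawler
    MYSQL_PASSWORD=change-me-too
    MYSQL_PORT=3306
    SELENIUM_CHROME_PORT=4444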

+ 57 - 20
SourceCode/TenderCrawler/init.sql

@@ -3,6 +3,26 @@
 SET NAMES utf8mb4;
 SET FOREIGN_KEY_CHECKS = 0;
 
 
+-- ----------------------------
+-- Table structure for t_urls
+-- ----------------------------
+DROP TABLE IF EXISTS `t_urls`;
+CREATE TABLE `t_urls`  (
+  `url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT 'URL to visit',
+  `adapter_type` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT 'Adapter type',
+  `username` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT 'Username',
+  `password` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT 'Password',
+  `keywords` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT 'Keywords, multiple values separated by ","',
+  `is_active` int(4) NULL DEFAULT NULL COMMENT 'Active flag: 1 = active, 0 = inactive',
+  `sort` int(4) NULL DEFAULT NULL COMMENT 'Sort order',
+  `remark` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT 'Remark',
+  PRIMARY KEY (`url`) USING BTREE
+) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
+
+INSERT INTO `t_urls` (`url`, `adapter_type`, `username`, `password`, `keywords`, `is_active`, `sort`, `remark`) VALUES ('https://www.ccgp.gov.cn/index.shtml', 'ccgp', '', '', '红外,红外显微镜,傅里叶红外,红外光谱,显微红外,拉曼,激光共聚焦拉曼,拉曼显微镜,拉曼光谱,显微拉曼,气体分析', 1, 100, '中国政府采购网 https://www.ccgp.gov.cn/index.shtml');
+INSERT INTO `t_urls` (`url`, `adapter_type`, `username`, `password`, `keywords`, `is_active`, `sort`, `remark`) VALUES ('https://www.chinabidding.com/', 'chinabidding', 'brukernano2011', '695765FqX', '红外光谱仪', 1, 0, '中国国际招标网 (www.chinabidding.com 必联网)');
+
+
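The t_urls table above now keys each crawl target to an adapter_type (matching the new ccgp adapter) and adds a sort column for crawl order. A hedged sketch of how these rows might be read, assuming pymysql as the driver (the driver is not visible in this part of the diff; the real model lives in app/models/url_setting.py):

    import pymysql


    def fetch_active_url_settings(conn) -> list[dict]:
        """Return active crawl targets, highest `sort` first, keywords split on ','."""
        sql = (
            "SELECT url, adapter_type, username, password, keywords "
            "FROM t_urls WHERE is_active = 1 ORDER BY sort DESC"
        )
        with conn.cursor(pymysql.cursors.DictCursor) as cursor:
            cursor.execute(sql)
            rows = cursor.fetchall()
        for row in rows:
            # keywords is a ','-separated varchar, per the column comment above
            row["keywords"] = [k for k in (row["keywords"] or "").split(",") if k]
        return rows
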
 -- ----------------------------
 -- Table structure for t_area_email
 -- ----------------------------
@@ -16,10 +36,26 @@ CREATE TABLE `t_area_email`  (
   PRIMARY KEY (`name`) USING BTREE
 ) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
 
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('盐城', '江苏省盐城市,江苏盐城,盐城市,盐城', '349977741@qq.com', 1, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('南京', '江苏省南京市,江苏南京,南京市,南京', '349977741@qq.com', 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('全国', '全国', 'chancelot@foxmail.com', 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('宁波', '浙江省宁波市,浙江宁波,宁波市,宁波', '349977741@qq.com', 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('武汉', '武汉市,武汉,中国武汉,中国武汉市', 'chancelot@foxmail.com,349977741@qq.com', 1, NULL);
 INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('济南', '江苏省济南市,江苏济南,济南市,济南', '349977741@qq.com', 1, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('全国', '全国', 'yueyy@iwbnet.com', 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('张志琼', '黑龙江,吉林,辽宁', 'zhiqiong.zhang@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('王双', '河北,山东济南,山东德州', 'shuang.wang@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('尚祖俭', '天津市,天津,中国天津,中国天津市', 'zujian.shang@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('赵跃', '北京', 'yue.zhao@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('张景灿', '陕西,新疆,宁夏,青海', 'jingcan.zhang@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('穆彦竹', '山西,河南,甘肃', 'yanzhu.mu@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('廖然', '内蒙古', 'ran.liao@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('吕小勇', '江苏', 'xiaoyong.lv@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('张潇', '浙江,福建', 'xiao.zhang@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('吴雪美', '上海', 'xuemei.wu@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('邬歆', '安徽,香港,澳门', 'xin.wu@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('冯新宝', '湖北,湖南', 'xinbao.feng@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('耿朝曦', '江西,贵州', 'zhaoxi.geng@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('李华斌', '广西,广东深圳', 'huabin.li@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('吕万明', '海南,广东广州,广东中山', 'wanming.lv@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('许建光', '西藏,云南,广东', 'jianguang.xu@bruker.com', 0, NULL);
 
 
 
 
 -- ----------------------------
@@ -30,6 +66,22 @@ CREATE TABLE `t_collect_data`  (
   `url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT 'Detail page URL',
   `keyword` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT 'Keyword that matched the page',
   `content` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL COMMENT 'Page detail content',
+  `attach_path` varchar(1000) NULL DEFAULT NULL COMMENT 'Attachment path',
+  `status` int(4) NOT NULL DEFAULT 0 COMMENT 'Status: 0 = unprocessed, 1 = processed',
+  `create_time` datetime NULL DEFAULT NULL COMMENT 'Creation time',
+  `process_time` datetime NULL DEFAULT NULL COMMENT 'Processing time',
+  PRIMARY KEY (`url`) USING BTREE
+) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
+
+-- ----------------------------
+-- Table structure for t_collect_data_history
+-- ----------------------------
+DROP TABLE IF EXISTS `t_collect_data_history`;
+CREATE TABLE `t_collect_data_history`  (
+  `url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT 'Detail page URL',
+  `keyword` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT 'Keyword that matched the page',
+  `content` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL COMMENT 'Page detail content',
+  `attach_path` varchar(1000) NULL DEFAULT NULL COMMENT 'Attachment path',
   `status` int(4) NOT NULL DEFAULT 0 COMMENT 'Status: 0 = unprocessed, 1 = processed',
   `create_time` datetime NULL DEFAULT NULL COMMENT 'Creation time',
   `process_time` datetime NULL DEFAULT NULL COMMENT 'Processing time',
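
t_collect_data_history mirrors t_collect_data column for column, which suggests processed rows get archived out of the hot table. A sketch of that pattern (an assumption — the real batch logic in app/models/collect_data.py may differ, and the shared url primary key means re-archiving the same URL would need conflict handling such as INSERT IGNORE):

    def archive_processed(conn) -> int:
        """Move rows with status = 1 from t_collect_data into the history table."""
        columns = "url, keyword, content, attach_path, status, create_time, process_time"
        try:
            with conn.cursor() as cursor:
                cursor.execute(
                    f"INSERT INTO t_collect_data_history ({columns}) "
                    f"SELECT {columns} FROM t_collect_data WHERE status = 1"
                )
                # pymysql's execute returns the affected row count
                moved = cursor.execute("DELETE FROM t_collect_data WHERE status = 1")
            conn.commit()
            return moved
        except Exception:
            conn.rollback()
            raise
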
@@ -50,6 +102,8 @@ CREATE TABLE `t_data`  (
   `address` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT 'Detailed address',
   `summary` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL COMMENT 'Tender summary',
   `release_date` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT 'Release date',
+  `devices` varchar(1000) NULL DEFAULT NULL COMMENT 'Related devices',
+  `attach_path` varchar(2000) NULL DEFAULT NULL COMMENT 'Attachment path',
   `status` int(4) NULL DEFAULT NULL COMMENT 'Status: 0 = not pushed, 1 = pushed',
   `create_time` datetime NULL DEFAULT NULL COMMENT 'Creation time',
   `send_time` datetime NULL DEFAULT NULL COMMENT 'Push time',
@@ -58,23 +112,6 @@ CREATE TABLE `t_data`  (
   PRIMARY KEY (`url`) USING BTREE
 ) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
 
-INSERT INTO `t_data` (`url`, `no`, `title`, `date`, `area`, `address`, `summary`, `release_date`, `status`, `create_time`, `send_time`, `remark`) VALUES ('https://www.chinabidding.com/bidDetail/260794529.html', 'NWZ241216-2103-049601', '中石化华东油气分公司2024年度210306填料塔框架协议招标采购', '2024年12月27日9时0分', '全国', '中国石化物资电子招投标交易平台(https://bidding.epec.com)', '本招标项目为中国石油化工股份有限公司华东油气分公司2024年度210306填料塔框架协议招标采购,招标编号为NWZ241216-2103-049601。投标人须具备工业管道安装资质或压力管道元件制造资质,并在有效期内;具备A级压力容器制造证书且在有效期内。招标文件于2024年12月16日11时0分开始售卖,截止时间为2024年12月23日9时0分,电子投标文件需在2024年12月27日9时0分前递交。', '2024-12-17', 0, '2024-12-19 15:26:54', NULL, NULL);
-
--- ----------------------------
--- Table structure for t_urls
--- ----------------------------
-DROP TABLE IF EXISTS `t_urls`;
-CREATE TABLE `t_urls`  (
-  `url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT 'URL to visit',
-  `type` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT 'Adapter type',
-  `username` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT 'Username',
-  `password` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT 'Password',
-  `keywords` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT 'Keywords, multiple values separated by ","',
-  `is_active` int(4) NULL DEFAULT NULL COMMENT 'Active flag: 1 = active, 0 = inactive',
-  `remark` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT 'Remark',
-  PRIMARY KEY (`url`) USING BTREE
-) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
 
 
-INSERT INTO `t_urls` (`url`, `type`, `username`, `password`, `keywords`, `is_active`, `remark`) VALUES ('https://www.chinabidding.com/', 'chinabidding', 'brukernano2011', '695765FqX', '红外光谱仪', 1, '中国国际招标网 (www.chinabidding.com 必联网)\r\nBruker Beijing	 用户名:brukernano2011               密码:695765FqX');
 
 
 SET FOREIGN_KEY_CHECKS = 1;

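The new devices and attach_path columns on t_data, together with the status/send_time pair, imply a send loop that picks up unsent rows, mails them with their attachments, and stamps them as pushed. An illustrative fragment of the stamping step (names are assumptions; see app/main/data_send.py for the real flow):

    from datetime import datetime


    def mark_sent(conn, urls: list[str]) -> None:
        """Flag pushed tenders so the next send run skips them."""
        sql = "UPDATE t_data SET status = 1, send_time = %s WHERE url = %s"
        now = datetime.now()
        with conn.cursor() as cursor:
            cursor.executemany(sql, [(now, url) for url in urls])
        conn.commit()
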
+ 1 - 0
SourceCode/TenderCrawler/requirements.txt

@@ -6,3 +6,4 @@ Requests==2.32.3
 schedule==1.2.2
 selenium==4.27.1
 cryptography==41.0.4
+openai==1.58.1
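
The new openai pin lines up with the (now commented-out) APP_AI__* variables in docker-compose.yml, which point at a local qwen2.5:7b endpoint. A minimal sketch of calling such an OpenAI-compatible service (the base_url, key handling, and prompt are assumptions; the project's actual calls live in app/utils/ai_helper.py):

    from openai import OpenAI

    client = OpenAI(
        api_key="not-needed-for-a-local-service",  # APP_AI__KEY
        base_url="http://192.168.0.109:7580/api",  # derived from APP_AI__URL
    )

    response = client.chat.completions.create(
        model="qwen2.5:7b",      # APP_AI__MODEL
        max_tokens=1024,         # APP_AI__MAX_TOKENS
        messages=[
            {"role": "system", "content": "Extract tender fields as JSON."},
            {"role": "user", "content": "<collected page content>"},
        ],
    )
    print(response.choices[0].message.content)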