Quellcode durchsuchen

Merge branch 'V2.0.0' of Crawler/TenderCrawler into dev

YueYunyun vor 6 Monaten
Ursprung
Commit
d51bdb0ac6
39 geänderte Dateien mit 2458 neuen und 1180 gelöschten Zeilen
  1. 1 1
      SourceCode/TenderCrawler/.env
  2. 13 0
      SourceCode/TenderCrawler/.script/Build_Dockerfile.run.xml
  3. 11 0
      SourceCode/TenderCrawler/.script/Run_TenderCrawler.run.xml
  4. 3 0
      SourceCode/TenderCrawler/app/__init__.py
  5. 10 0
      SourceCode/TenderCrawler/app/adapters/__init__.py
  6. 128 90
      SourceCode/TenderCrawler/app/adapters/ccgp_data_collection_adapter.py
  7. 102 93
      SourceCode/TenderCrawler/app/adapters/chinabidding_data_collection_adapter.py
  8. 29 27
      SourceCode/TenderCrawler/app/adapters/data_collection_adapter_interface.py
  9. 21 7
      SourceCode/TenderCrawler/app/config.yml
  10. 7 0
      SourceCode/TenderCrawler/app/drivers/__init__.py
  11. 34 39
      SourceCode/TenderCrawler/app/drivers/driver_creator.py
  12. 0 0
      SourceCode/TenderCrawler/app/jobs/__init__.py
  13. 84 0
      SourceCode/TenderCrawler/app/jobs/data_clean.py
  14. 23 26
      SourceCode/TenderCrawler/app/jobs/data_collector.py
  15. 158 0
      SourceCode/TenderCrawler/app/jobs/data_process.py
  16. 330 0
      SourceCode/TenderCrawler/app/jobs/data_send.py
  17. 254 0
      SourceCode/TenderCrawler/app/jobs/job_runner.py
  18. 21 10
      SourceCode/TenderCrawler/app/main.py
  19. 0 71
      SourceCode/TenderCrawler/app/main/data_process.py
  20. 0 119
      SourceCode/TenderCrawler/app/main/data_send.py
  21. 0 151
      SourceCode/TenderCrawler/app/main/runner.py
  22. 42 6
      SourceCode/TenderCrawler/app/models/area_email.py
  23. 122 92
      SourceCode/TenderCrawler/app/models/collect_data.py
  24. 135 94
      SourceCode/TenderCrawler/app/models/process_data.py
  25. 218 0
      SourceCode/TenderCrawler/app/models/process_result_data.py
  26. 17 19
      SourceCode/TenderCrawler/app/models/url_setting.py
  27. 53 0
      SourceCode/TenderCrawler/app/stores/data_store_interface.py
  28. 54 19
      SourceCode/TenderCrawler/app/stores/default_data_store.py
  29. 86 16
      SourceCode/TenderCrawler/app/stores/mysql_data_store.py
  30. 147 1
      SourceCode/TenderCrawler/app/utils/__init__.py
  31. 51 101
      SourceCode/TenderCrawler/app/utils/ai_helper.py
  32. 11 13
      SourceCode/TenderCrawler/app/utils/config_helper.py
  33. 45 55
      SourceCode/TenderCrawler/app/utils/email_helper.py
  34. 61 22
      SourceCode/TenderCrawler/app/utils/file_helper.py
  35. 41 15
      SourceCode/TenderCrawler/app/utils/logger_helper.py
  36. 30 33
      SourceCode/TenderCrawler/app/utils/mysql_helper.py
  37. 7 9
      SourceCode/TenderCrawler/app/utils/string_helper.py
  38. 21 24
      SourceCode/TenderCrawler/docker-compose.yml
  39. 88 27
      SourceCode/TenderCrawler/init.sql

+ 1 - 1
SourceCode/TenderCrawler/.env

@@ -1,6 +1,6 @@
 SELENIUM_CHROME_PORT=3534
 MYSQL_ROOT_PASSWORD=123456qwertyu
-MYSQL_DATABASE=iwb_data_collect_v1.0
+MYSQL_DATABASE=iwb_data_collect_v2.0
 MYSQL_USER=iwb_data
 MYSQL_PASSWORD=123456iwb
 MYSQL_PORT=3535

+ 13 - 0
SourceCode/TenderCrawler/.script/Build_Dockerfile.run.xml

@@ -0,0 +1,13 @@
+<component name="ProjectRunConfigurationManager">
+  <configuration default="false" name="Build_Dockerfile" type="docker-deploy" factoryName="dockerfile" activateToolWindowBeforeRun="false" server-name="81">
+    <deployment type="dockerfile">
+      <settings>
+        <option name="imageTag" value="y_tender-crawler-app:2.0.1" />
+        <option name="buildOnly" value="true" />
+        <option name="containerName" value="" />
+        <option name="sourceFilePath" value="Dockerfile" />
+      </settings>
+    </deployment>
+    <method v="2" />
+  </configuration>
+</component>

+ 11 - 0
SourceCode/TenderCrawler/.script/Run_TenderCrawler.run.xml

@@ -0,0 +1,11 @@
+<component name="ProjectRunConfigurationManager">
+  <configuration default="false" name="Run_TenderCrawler" type="docker-deploy" factoryName="docker-compose.yml" server-name="81">
+    <deployment type="docker-compose.yml">
+      <settings>
+        <option name="envFilePath" value="" />
+        <option name="sourceFilePath" value="docker-compose.yml" />
+      </settings>
+    </deployment>
+    <method v="2" />
+  </configuration>
+</component>

+ 3 - 0
SourceCode/TenderCrawler/app/__init__.py

@@ -0,0 +1,3 @@
+import utils
+
+utils.reload_config()

+ 10 - 0
SourceCode/TenderCrawler/app/adapters/__init__.py

@@ -0,0 +1,10 @@
+from data_collection_adapter_interface import IDataCollectionAdapter
+from stores.data_store_interface import IDataStore
+
+
+def collect(adapter: IDataCollectionAdapter, keyword: str, store: IDataStore = None):
+    adapter.collect(keyword, store)
+
+
+def teardown(adapter: IDataCollectionAdapter):
+    adapter.teardown()

+ 128 - 90
SourceCode/TenderCrawler/app/adapters/ccgp_data_collection_adapter.py

@@ -1,23 +1,21 @@
 from time import sleep
 
+from selenium.common.exceptions import TimeoutException, NoSuchElementException
 from selenium.webdriver.common.by import By
-from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support import expected_conditions as ec
-from selenium.common.exceptions import TimeoutException, NoSuchElementException
+from selenium.webdriver.support.wait import WebDriverWait
 
-from stores.data_store_interface import IDataStore
+import utils
 from adapters.data_collection_adapter_interface import IDataCollectionAdapter
-from utils.file_helper import FileHelper
-
+from stores.data_store_interface import IDataStore
 
 
 class CcgpDataCollectionAdapter(IDataCollectionAdapter):
     """
     中国政府采购网数据采集适配器
     """
-    file_helper = FileHelper()
 
-    def __init__(self, url: str,store:IDataStore=None):
+    def __init__(self, url: str, store: IDataStore = None):
         self._url = url
         self._store = store
         self._driver = None
@@ -25,7 +23,7 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
         self._adapter_type = "ccgp"
 
     def login(self, username: str, password: str) -> None:
-       pass
+        pass
 
     def collect(self, keyword: str, store: IDataStore):
         if store:
@@ -33,7 +31,7 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
         self._keyword = keyword
         items = self._search(keyword)
         self._process_list(items)
-        if self.config.get_bool(self.batch_save_key):
+        if utils.get_config_bool(self.batch_save_key):
             self.store.save_collect_data(True)
 
     def _search(self, keyword: str) -> list:
@@ -41,20 +39,22 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
             if not keyword:
                 raise Exception("搜索关键字不能为空")
             wait = WebDriverWait(self.driver, 10, 1)
-            wait.until(
-                ec.presence_of_element_located((By.ID, "searchForm")))
+            wait.until(ec.presence_of_element_located((By.ID, "searchForm")))
             search_el = self.driver.find_element(By.ID, "kw")
             sleep(2)
             search_el.clear()
             search_el.send_keys(keyword)
             search_btn = self.driver.find_element(
-                By.XPATH, "//form[@id='searchForm']/input[@id='doSearch2']")
+                By.XPATH, "//form[@id='searchForm']/input[@id='doSearch2']"
+            )
             sleep(1)
             search_btn.click()
-            wait.until(ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result")))
-            default_search_txt = "近一周"
-            search_txt = self.config.get(self.search_day_key, default_search_txt)
-            self.logger.info(f"搜索关键字: {keyword},搜索条件: {search_txt}")
+            wait.until(
+                ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result"))
+            )
+            default_search_txt = "近1周"
+            search_txt = utils.get_config_value(self.search_day_key, default_search_txt)
+            utils.get_logger().info(f"搜索关键字: {keyword},搜索条件: {search_txt}")
             if search_txt != default_search_txt:
                 last_els = self.driver.find_elements(By.XPATH, "//ul[@id='datesel']/li")
                 for last_el in last_els:
@@ -62,68 +62,80 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
                         sleep(1)
                         last_el.click()
                         break
-                wait.until(ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result")))
+                wait.until(
+                    ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result"))
+                )
             else:
                 sleep(1)
-
-
-            # try:
-            #     a_links = self.driver.find_elements(
-            #         By.XPATH, "//form[@id='pagerSubmitForm']/a")
-            #     count = len(a_links)
-            #     if count > 1:
-            #         count = count - 1
-            #     self.logger.info(f"共查询到 {count} 页")
-            # except Exception as e:
-            #     self.logger.error(f"搜索失败[尝试查询页数]: {e}")
-            items = self.driver.find_elements(By.XPATH,
-                                         "//ul[@class='vT-srch-result-list-bid']/li/a")
+            try:
+                p_els = self.driver.find_elements(
+                    By.XPATH, "//body/div[@class='vT_z']/div/div/p"
+                )
+                if len(p_els) > 0:
+                    utils.get_logger().info(f" {p_els[0].text}")
+                else:
+                    a_links = self.driver.find_elements(
+                        By.XPATH, "//div[@class='vT-srch-result-list']/p/a"
+                    )
+                    count = len(a_links)
+                    if count > 1:
+                        count = count - 1
+                    utils.get_logger().info(f"共查询到 {count} 页,每页 20 条")
+            except Exception as e:
+                utils.get_logger().error(f"搜索失败[尝试查询页数]: {e}")
+            items = self.driver.find_elements(
+                By.XPATH, "//ul[@class='vT-srch-result-list-bid']/li/a"
+            )
             return items
         except TimeoutException as e:
-            raise Exception(f"搜索失败 [超时]: {e}")
+            raise Exception(f"搜索失败 [{self._adapter_type}] [超时]: {e}")
         except NoSuchElementException as e:
-            raise Exception(f"搜索失败 [找不到元素]: {e}")
-
+            raise Exception(f"搜索失败 [{self._adapter_type}] [找不到元素]: {e}")
 
-    def _process_list(self,  items: list) -> list:
+    def _process_list(self, items: list) -> list:
         if not items:
             return []
         for item in items:
-            self._process_item( item)
+            self._process_item(item)
         sleep(2)
         next_items = self._next_page()
-        return self._process_list( next_items)
-
+        return self._process_list(next_items)
 
     def _next_page(self) -> list:
         try:
             wait = WebDriverWait(self.driver, 10, 1)
             next_path = "//div[@class='vT-srch-result-list']/p/a[@class='next']"
-            wait.until(ec.presence_of_element_located((By.XPATH, next_path)))
-            btn = self.driver.find_element(By.XPATH, next_path)
+            try:
+                btn = self.driver.find_element(By.XPATH, next_path)
+            except NoSuchElementException:
+                utils.get_logger().info(f"翻页结束 [{self._adapter_type}]")
+                return []
             btn.click()
-            self.logger.info(f"跳转到下页: {self.driver.current_url}")
+            utils.get_logger().info(f"跳转到下页: {self.driver.current_url}")
             sleep(5)
-            wait.until(ec.presence_of_element_located((By.ID, "vT-srch-result")))
-            items = self.driver.find_elements(By.XPATH,
-                                         "//ul[@class='vT-srch-result-list-bid']/li/a")
+            wait.until(
+                ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result"))
+            )
+            items = self.driver.find_elements(
+                By.XPATH, "//ul[@class='vT-srch-result-list-bid']/li/a"
+            )
             return items
         except NoSuchElementException as e:
-            raise Exception(f"翻页失败 [找不到元素]: {e}")
-        except TimeoutException:
-            self.logger.info("翻页结束")
-            return []
+            raise Exception(f"翻页失败 [{self._adapter_type}] [找不到元素]: {e}")
+        except TimeoutException as e:
+            raise Exception(f"翻页结束 [{self._adapter_type}] [超时]: {e}")
 
-    def _process_item(self,  item):
+    def _process_item(self, item):
         main_handle = self.driver.current_window_handle
         wait = WebDriverWait(self.driver, 10, 1)
         close = True
         try:
-            url = item.get_attribute('href')
+            url = item.get_attribute("href")
             if self._check_is_collect_by_url(url):
                 close = False
                 return
-            self.logger.info(f"跳转详情")
+            # utils.get_logger().info(f"跳转详情")
+            print(".", end="")
             sleep(1)
             item.click()
             wait.until(ec.number_of_windows_to_be(2))
@@ -133,59 +145,85 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
                     self.driver.switch_to.window(handle)
                     break
             wait.until(ec.presence_of_element_located((By.TAG_NAME, "body")))
-            # 判断是否为投标公告
-            if self._check_type("中标公告") or  self._check_type("成交公告") or self._check_type("终止公告"):
-                self._save_db(url, "", is_invalid=True)
+
+            content = self.driver.find_element(
+                By.XPATH, "//div[@class='vF_deail_maincontent']"
+            ).text
+            # 排除其他公告
+            if self._check_type("其他公告"):
+                self._save_db(url, content, 3, is_invalid=True)
                 return
-            content = self.driver.find_element(By.XPATH, "//div[@class='vF_deail_maincontent']").text
+            # 判断是否为投标公告
+            data_type = (
+                1
+                if self._check_type("中标公告")
+                or self._check_type("成交公告")
+                or self._check_type("终止公告")
+                else 0
+            )
             if self._check_content(content):
-                paths = []
-
-                attach_els = self.driver.find_elements(By.XPATH, "//td[@class='bid_attachtab_content']/a")
-                attach_2_els = self.driver.find_elements(By.XPATH, "//a[@ignore='1']")
-
-                # 合并两个列表
-                all_attachments = attach_els + attach_2_els
-                attach_urls = []
-                if len(all_attachments) > 0:
-                    for attach_el in attach_els:
-                        attach_url = attach_el.get_attribute('href')
-                        if attach_url not in attach_urls:
-                            attach_urls.append(attach_url)
-                        else:
-                            self.logger.info(f"重复附件: {attach_url}")
-                            continue
-                        file_name =  attach_el.text or attach_el.get_attribute('download') or attach_url.split('/')[-1]
-                        if not file_name:
-                            continue
-                        # 检查 file_name 是否包含文件扩展名
-                        if '.' not in file_name:
-                            self.logger.warning(f"文件名 {file_name} 不包含扩展名,跳过下载。")
-                            continue
-                        path = self.file_helper.download_remote_file(attach_url, file_name)
-                        if path:
-                            paths.append(path)
-                attach_str = ",".join(paths)
-                self._save_db(url, content, attach_str)
+                attach_str = self._attach_download()
+                self._save_db(url, content, data_type, attach_str)
             else:
-                self._save_db(url, content, is_invalid=True)
+                self._save_db(url, content, data_type, is_invalid=True)
         except TimeoutException as e:
-            self.logger.error(
-                f"采集发生异常 Timeout: {self.driver.current_url}。Exception: {e}")
+            utils.get_logger().error(
+                f"采集发生异常 [{self._adapter_type}] Timeout: {self.driver.current_url}。Exception: {e}"
+            )
         except NoSuchElementException as e:
-            self.logger.error(
-                f"采集发生异常 NoSuchElement: {self.driver.current_url}。Exception: {e}")
-            raise Exception(f"采集失败 [找不到元素]: {e}")
+            utils.get_logger().error(
+                f"采集发生异常 [{self._adapter_type}] NoSuchElement: {self.driver.current_url}。Exception: {e}"
+            )
+            raise Exception(f"采集失败 [{self._adapter_type}] [找不到元素]: {e}")
         finally:
             if close:
                 sleep(1)
                 self.driver.close()
                 self.driver.switch_to.window(main_handle)
 
-    def _check_type(self,type_str: str)->bool:
+    def _check_type(self, type_str: str) -> bool:
         links = self.driver.find_elements(By.LINK_TEXT, type_str)
         if len(links) > 0:
-            self.logger.info(f"{type_str},跳过")
+            utils.get_logger().info(f"{type_str}")
             return True
         return False
 
+    def _attach_download(self):
+        paths = []
+
+        attach_els = self.driver.find_elements(
+            By.XPATH, "//td[@class='bid_attachtab_content']/a"
+        )
+        attach_2_els = self.driver.find_elements(By.XPATH, "//a[@ignore='1']")
+
+        # 合并两个列表
+        all_attachments = attach_els + attach_2_els
+        attach_urls = []
+        if len(all_attachments) > 0:
+            for attach_el in attach_els:
+                attach_url = attach_el.get_attribute("href")
+                if attach_url not in attach_urls:
+                    attach_urls.append(attach_url)
+                else:
+                    utils.get_logger().info(f"重复附件: {attach_url}")
+                    continue
+                file_name = (
+                    attach_el.text
+                    or attach_el.get_attribute("download")
+                    or attach_url.split("/")[-1]
+                )
+                if not file_name:
+                    continue
+                # 检查 file_name 是否包含文件扩展名
+                if "." not in file_name:
+                    utils.get_logger().warning(
+                        f"文件名 {file_name} 不包含扩展名,跳过下载。"
+                    )
+                    continue
+                path = utils.download_remote_file(attach_url, file_name)
+                if path:
+                    paths.append(path)
+        attach_str = ",".join(paths)
+        if attach_str:
+            utils.get_logger().info(f"附件下载完成: {attach_str}")
+        return attach_str

+ 102 - 93
SourceCode/TenderCrawler/app/adapters/chinabidding_data_collection_adapter.py

@@ -1,13 +1,13 @@
 from time import sleep
 
-
+from selenium.common.exceptions import TimeoutException, NoSuchElementException
 from selenium.webdriver.common.by import By
-from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support import expected_conditions as ec
-from selenium.common.exceptions import TimeoutException, NoSuchElementException
+from selenium.webdriver.support.wait import WebDriverWait
 
-from stores.data_store_interface import IDataStore
+import utils
 from adapters.data_collection_adapter_interface import IDataCollectionAdapter
+from stores.data_store_interface import IDataStore
 
 
 class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
@@ -15,37 +15,18 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
     中国招标网数据采集适配器
     """
 
-
-    def __init__(self, url: str,store:IDataStore=None):
+    def __init__(self, url: str, store: IDataStore = None):
         self._url = url
         self._store = store
         self._driver = None
         self._keyword = None
         self._adapter_type = "chinabidding"
 
-    # @property
-    # def store(self) -> IDataStore:
-    #     return self._store
-    #
-    # @property
-    # def url(self):
-    #     return self._url
-    #
-    # @property
-    # def keyword(self):
-    #     return self._keyword
-    #
-    # @property
-    # def driver(self)->webdriver:
-    #     if not self._driver:
-    #         self._driver = self._create_driver()
-    #     return self._driver
-
-
     def login(self, username: str, password: str) -> None:
         try:
             login_el = self.driver.find_element(
-                By.XPATH, "//div[@id='loginRight']/a[@class='login']")
+                By.XPATH, "//div[@id='loginRight']/a[@class='login']"
+            )
             login_el.click()
             wait = WebDriverWait(self.driver, 10, 1)
             wait.until(ec.presence_of_element_located((By.ID, "userpass")))
@@ -57,90 +38,118 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
             login_btn.click()
             wait.until(ec.presence_of_element_located((By.ID, "site-content")))
         except TimeoutException as e:
-            raise Exception(f"登录失败 [超时]: {e}")
+            raise Exception(f"登录失败 [{self._adapter_type}] [超时]: {e}")
         except NoSuchElementException as e:
-            raise Exception(f"登录失败 [找不到元素]: {e}")
-
+            raise Exception(f"登录失败 [{self._adapter_type}] [找不到元素]: {e}")
 
     def collect(self, keyword: str, store: IDataStore):
         if store:
             self._store = store
         self._keyword = keyword
-        items = self._search(keyword)
-        self._process_list(items)
-        if self.config.get_bool(self.batch_save_key):
+        items = self._search_by_type(keyword, 0)
+        self._process_list(items, 0)
+        sleep(2)
+        items = self._search_by_type(keyword, 1)
+        self._process_list(items, 1)
+        if utils.get_config_bool(self.batch_save_key):
             self.store.save_collect_data(True)
 
-    def _search(self, keyword: str) -> list:
+    def _search_by_type(self, keyword: str, data_type):
         try:
-            wait = WebDriverWait(self.driver, 10, 1)
-            wait.until(
-                ec.presence_of_element_located((By.ID, "projSearchForm")))
-            search_el = self.driver.find_element(By.ID, "fullText")
-            search_el.send_keys("")
-            search_el.send_keys(keyword)
-            search_btn = self.driver.find_element(
-                By.XPATH, "//form[@id='projSearchForm']/button")
-            search_btn.click()
-            wait.until(ec.presence_of_element_located((By.ID, "site-content")))
-            default_search_txt = "近3日"
-            search_txt = self.config.get(self.search_day_key, default_search_txt)
-            self.logger.info(f"搜索关键字: {keyword},搜索条件: {search_txt}")
-            if search_txt != default_search_txt:
-                last_el = self.driver.find_element(By.LINK_TEXT, search_txt)
-                last_el.click()
-                wait.until(ec.presence_of_element_located((By.ID, "site-content")))
+            self.driver.get(self._url)
+            if data_type == 0:
+                utils.get_logger().info(f"开始采集 招标公告")
+                el = self.driver.find_element(
+                    By.XPATH, "//div[@id='z-b-g-g']/h2/a[@class='more']"
+                )
             else:
-                sleep(1)
-            try:
-                a_links = self.driver.find_elements(
-                    By.XPATH, "//form[@id='pagerSubmitForm']/a")
-                count = len(a_links)
-                if count > 1:
-                    count = count - 1
-                self.logger.info(f"共查询到 {count} 页")
-            except Exception as e:
-                self.logger.error(f"搜索失败[尝试查询页数]: {e}")
-            items = self.driver.find_elements(By.XPATH,
-                                              "//ul[@class='as-pager-body']/li/a")
-            return items
+                utils.get_logger().info(f"开始采集 中标结果公告")
+                el = self.driver.find_element(
+                    By.XPATH, "//div[@id='z-b-jg-gg']/h2/a[@class='more']"
+                )
+            el.click()
+            wait = WebDriverWait(self.driver, 10, 1)
+            wait.until(ec.number_of_windows_to_be(2))
+            self.driver.close()
+            self.driver.switch_to.window(self.driver.window_handles[0])
+            return self._search(keyword)
         except TimeoutException as e:
-            raise Exception(f"搜索失败 [超时]: {e}")
+            raise Exception(f"搜索失败 [{self._adapter_type}] [超时]: {e}")
         except NoSuchElementException as e:
-            raise Exception(f"搜索失败 [找不到元素]: {e}")
+            raise Exception(f"搜索失败 [{self._adapter_type}] [找不到元素]: {e}")
 
-    def _process_list(self, items: list) -> list:
+    def _search(self, keyword: str) -> list:
+        wait = WebDriverWait(self.driver, 10, 1)
+        wait.until(ec.presence_of_element_located((By.ID, "searchBidProjForm")))
+        search_el = self.driver.find_element(
+            By.XPATH, "//form[@id='searchBidProjForm']/ul/li/input[@id='fullText']"
+        )
+        search_el.clear()
+        search_el.send_keys(keyword)
+        search_btn = self.driver.find_element(
+            By.XPATH, "//form[@id='searchBidProjForm']/ul/li/button"
+        )
+        search_btn.click()
+        wait.until(ec.presence_of_element_located((By.ID, "site-content")))
+        default_search_txt = "全部"
+        search_txt = utils.get_config_value(self.search_day_key, default_search_txt)
+        utils.get_logger().info(f"搜索关键字: {keyword},搜索条件: {search_txt}")
+        if search_txt != default_search_txt:
+            last_el = self.driver.find_element(By.LINK_TEXT, search_txt)
+            sleep(1)
+            last_el.click()
+            wait.until(ec.presence_of_element_located((By.ID, "site-content")))
+        else:
+            sleep(1)
+        try:
+            a_links = self.driver.find_elements(
+                By.XPATH, "//form[@id='pagerSubmitForm']/a"
+            )
+            count = len(a_links)
+            if count > 1:
+                count = count - 1
+            utils.get_logger().info(f"共查询到 {count} 页,每页 10 条")
+        except Exception as e:
+            utils.get_logger().error(f"搜索失败[尝试查询页数]: {e}")
+        items = self.driver.find_elements(By.XPATH, "//ul[@class='as-pager-body']/li/a")
+        return items
+
+    def _process_list(self, items: list, data_type) -> list:
         if not items:
             return []
         for item in items:
-            self._process_item(item)
+            self._process_item(item, data_type)
         sleep(2)
         next_items = self._next_page()
-        return self._process_list(next_items)
+        return self._process_list(next_items, data_type)
 
     def _next_page(self) -> list:
         try:
             wait = WebDriverWait(self.driver, 10, 1)
-            next_path = "//form[@id='pagerSubmitForm']/a[@class='next']"
-            wait.until(ec.presence_of_element_located((By.XPATH, next_path)))
-            btn = self.driver.find_element(By.XPATH, next_path)
+            try:
+                btn = self.driver.find_element(
+                    By.XPATH, "//form[@id='pagerSubmitForm']/a[@class='next']"
+                )
+            except NoSuchElementException:
+                utils.get_logger().info(f"翻页结束 [{self._adapter_type}]")
+                return []
             btn.click()
-            self.logger.info(f"跳转到下页: {self.driver.current_url}")
+            utils.get_logger().info(f"跳转到下页: {self.driver.current_url}")
             wait.until(ec.presence_of_element_located((By.ID, "site-content")))
-            items = self.driver.find_elements(By.XPATH,
-                                         "//ul[@class='as-pager-body']/li/a")
+            items = self.driver.find_elements(
+                By.XPATH, "//ul[@class='as-pager-body']/li/a"
+            )
             return items
         except NoSuchElementException as e:
-            raise Exception(f"翻页失败 [找不到元素]: {e}")
-        except TimeoutException:
-            self.logger.info("翻页结束")
-            return []
+            raise Exception(f"翻页失败 [{self._adapter_type}] [找不到元素]: {e}")
+        except TimeoutException as e:
+            raise Exception(f"翻页结束 [{self._adapter_type}] [超时]: {e}")
 
-    def _process_item(self, item):
+    def _process_item(self, item, data_type):
         main_handle = self.driver.current_window_handle
         close = True
         try:
-            url = item.get_attribute('href')
+            url = item.get_attribute("href")
             if self._check_is_collect_by_url(url):
                 close = False
                 return
@@ -153,27 +162,27 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
                     self.driver.switch_to.window(handle)
                     break
             url = self.driver.current_url
-            self.logger.info(f"跳转详情")
-            wait.until(ec.presence_of_element_located((By.TAG_NAME, "body")))
-            content = self.driver.find_element(By.TAG_NAME, "body").text
+            # utils.get_logger().info(f"跳转详情")
+            print(".", end="")
+            wait.until(ec.presence_of_element_located((By.CLASS_NAME, "content")))
+            content = self.driver.find_element(By.CLASS_NAME, "content").text
             if self._check_content(content):
-                self._save_db(url, content)
+                self._save_db(url, content, data_type)
             else:
-                self._save_db(url, content, is_invalid=True)
+                self._save_db(url, content, data_type, is_invalid=True)
 
         except TimeoutException as e:
-            self.logger.error(
-                f"采集发生异常 Timeout: {self.driver.current_url}。Exception: {e}")
+            utils.get_logger().error(
+                f"采集发生异常 [{self._adapter_type}] Timeout: {self.driver.current_url}。Exception: {e}"
+            )
             # raise Exception(f"采集失败 [超时]: {e}")
         except NoSuchElementException as e:
-            self.logger.error(
-                f"采集发生异常 NoSuchElement: {self.driver.current_url}。Exception: {e}")
-            raise Exception(f"采集失败 [找不到元素]: {e}")
+            utils.get_logger().error(
+                f"采集发生异常 [{self._adapter_type}] NoSuchElement: {self.driver.current_url}。Exception: {e}"
+            )
+            raise Exception(f"采集失败 [{self._adapter_type}] [找不到元素]: {e}")
         finally:
             if close:
                 sleep(2)
                 self.driver.close()
                 self.driver.switch_to.window(main_handle)
-
-
-

+ 29 - 27
SourceCode/TenderCrawler/app/adapters/data_collection_adapter_interface.py

@@ -1,36 +1,32 @@
-
-
 from abc import ABC, abstractmethod
+
 from selenium import webdriver
 
-from stores.data_store_interface import IDataStore
-from drivers.driver_creator import DriverCreator
-from utils.logger_helper import LoggerHelper
-from utils.config_helper import ConfigHelper
+import drivers
+import utils
 from models.collect_data import CollectData
-from models.process_data import ProcessData
+from stores.data_store_interface import IDataStore
 
 
 class IDataCollectionAdapter(ABC):
     """
     数据收集适配器抽象类
     """
+
     _url = ""
     _store = None
     _driver = None
     _keyword = None
     _adapter_type = ""
 
-    logger = LoggerHelper.get_logger()
-    config = ConfigHelper()
-
-
     @property
     def search_day_key(self) -> str:
         return f"adapter.{self._adapter_type}.search_day"
+
     @property
     def batch_save_key(self) -> str:
         return f"adapter.{self._adapter_type}.batch_save"
+
     @property
     def store(self) -> IDataStore:
         return self._store
@@ -51,8 +47,7 @@ class IDataCollectionAdapter(ABC):
 
     def _create_driver(self) -> webdriver:
         try:
-            return DriverCreator().gen_remote_driver(self.url)
-            # return DriverCreator().gen_chrome_driver(self.url)
+            return drivers.gen_driver(self.url)
         except Exception as e:
             raise Exception(f"创建驱动器失败: {e}")
 
@@ -125,36 +120,43 @@ class IDataCollectionAdapter(ABC):
     def _check_is_collect_by_url(self, url: str) -> bool:
         old = self.store.query_one_collect_url(url)
         if old:
-            self.logger.info(f"已采集过: {url}")
+            utils.get_logger().info(f"已采集过: {url}")
             return True
         return False
-    def _check_content(self,content) -> bool:
-        collect_data_key = self.config.get("save.collect_data_key")
+
+    def _check_content(self, content) -> bool:
+        collect_data_key = utils.get_config_value("save.collect_data_key")
         if not collect_data_key:
-            self.logger.info("未配置 save.collect_data_key,跳过内容检查")
+            utils.get_logger().info("未配置 save.collect_data_key,跳过内容检查")
             return True
-        # self.logger.info(f"检查数据有效性: {collect_data_key}")
+        # utils.get_logger().info(f"检查数据有效性: {collect_data_key}")
         collect_data_key = collect_data_key.replace(",", ",")
         keys = collect_data_key.split(",")
         keys = [key.strip() for key in keys]
         for key in keys:
             key = key.strip()
-            # self.logger.info(f"检查数据有效性: {key}")
+            # utils.get_logger().info(f"检查数据有效性: {key}")
             if key in content:
-                self.logger.info(f"有效数据: {self.driver.current_url}")
+                utils.get_logger().info(f"有效数据: {self.driver.current_url}")
                 return True
 
         return False
 
-
-
-    def _save_db(self, url, content, attach_str = None,is_invalid=False):
+    def _save_db(self, url, content, data_type=0, attach_str=None, is_invalid=False):
         if not self.store:
-            self.logger.info(f"DataStore 未指定: {url},关键字{self.keyword}")
+            utils.get_logger().info(f"DataStore 未指定: {url},关键字{self.keyword}")
             return False
         else:
             status = 2 if is_invalid else 0
-            data = CollectData(url, self.keyword, content, attach_str, status)
-            self.store.insert_collect_data(data, self.config.get_bool(self.batch_save_key))
+            data = CollectData(
+                url=url,
+                keyword=self.keyword,
+                content=content,
+                data_type=data_type,
+                attach_path=attach_str,
+                status=status,
+            )
+            self.store.insert_collect_data(
+                data, utils.get_config_bool(self.batch_save_key)
+            )
             return True
-

+ 21 - 7
SourceCode/TenderCrawler/app/config.yml

@@ -1,8 +1,8 @@
+#file: noinspection SpellCheckingInspection,SpellCheckingInspection,SpellCheckingInspection
 adapter:
   chinabidding:
     #search_day: '今天'
-    #search_day: '近一周'
-    search_day: '近三天'
+    search_day: '近一周'
     model_name: 'chinabidding_data_collection_adapter'
     class_name: 'ChinabiddingDataCollectionAdapter'
     batch_save: True
@@ -34,8 +34,9 @@ ai:
   url: https://dashscope.aliyuncs.com/compatible-mode/v1
   model: qwen-plus
   max_tokens: 1024
-  system_prompt: 请帮我分析以下文字,提取出关键信息,并以json格式字符串返回,如果部分信息为空,则该字段返回为空。
-  prompt_template: 在以上内容中提取信息:编号(no) 、标题(title)、在哪个城市招标(area)、开标的时间(date)、开标的地点(address)、发布时间(release_date)、150字左右的招标条件要求及联系方式等内容摘要(summary),设备(device)。提取出相关设备的名称信息,多个设备以逗号分割。返回包含no,title,area,date,address,release_date,summary,device字段的json格式字符串,没有找到或未提供的信息json字段为空。
+  system_prompt: 分析文本,提取出关键信息,并以json格式字符串返回,如果部分信息为空,则该字段返回为空,返回的一定是可以解析的json对象。
+  prompt_template_1: 在以上内容中提取信息:编号(no) 、标题(title)、在哪个城市招标(area)、开标的时间(date)、开标的地点(address)、发布时间(release_date)、150字左右的招标条件要求及联系方式等内容摘要(summary),设备(devices)。提取出相关设备的名称信息,多个设备以逗号分割。返回包含no,title,area,date,address,release_date,summary,devices字段的json格式字符串,没有找到或未提供的信息json字段为空,返回的一定是可以解析的json字符串。
+  prompt_template_2: 在以上内容中提取信息:编号(no) 、标题(title)、公告时间(date)、标中的总价格,返回带单位的字符串(price)、标中的公司,多个以逗号分割(bidder)、150-300字的标的物说明,标的物价格,公司的明细等内容摘要(summary),设备(devices)。提取出相关设备的名称信息,多个设备以逗号分割。返回包含no,title,date,price,bidder,summary,devices字段的json格式字符串,没有找到或未提供的信息json字段为空,返回的一定是可以解析的json字符串。
 email:
 #  smtp_server: smtp.exmail.qq.com
 #  smtp_port: 465
@@ -47,14 +48,27 @@ email:
   smtp_user: yueyunyun88@163.com
   smtp_password: FWRwBZKHTLHjHT5F
   from_email: yueyunyun88@163.com
-
-  #error_email: yueyy@iwbnet.com
-schedule:
+  error_email: yueyy@iwbnet.com
+job:
+  event_id: 1 # 改变这个值,整点会检测重新加载任务
   sleep_interval: 10
   #sleep_interval: 600 #单位:秒 10分钟检查一次
   collect: 06:00,22:00 # 每天采集数据时间
   process: 07:00,10:00 # 每天采集数据时间
   send_email: 8:20,14:00 # 每天发送邮件时间
+  send_current_month_report_day: 30 # 每月几号发送本月中标报告
+  send_current_month_report_time: 08:20 # 每月几点发送本月中标报告
+  send_prev_month_report_day: 1 # 每月几号发送上月中标报告
+  send_prev_month_report_time: 08:20 # 每月几点发送上月中标报告
+  clean_data: 00:05 # 每日清理数据时间
   run_now: false
 selenium:
   remote_driver_url: http://127.0.0.1:3534/wd/hub
+clean:
+  day: 30 # 清理多少天前的数据 0不清理
+  # 下面的没有配置 默认使用 day 的配置
+  attach: 30 # 清理多少天前的附件 0不清理
+  log: 30 # 清理多少天前的日志 0不清理
+  collect_data: 30 # 清理多少天前的采集数据 0不清理
+  process_data: 30 # 清理多少天前的处理数据[招标] 0不清理
+  process_result_data: 60 # 清理多少天前的处理数据[中标] 0不清理 小于45会强制设为45

+ 7 - 0
SourceCode/TenderCrawler/app/drivers/__init__.py

@@ -0,0 +1,7 @@
+from selenium import webdriver
+
+from drivers.driver_creator import DriverCreator
+
+
+def gen_driver(url: str) -> webdriver:
+    return DriverCreator().gen_remote_driver(url)

+ 34 - 39
SourceCode/TenderCrawler/app/drivers/driver_creator.py

@@ -1,61 +1,55 @@
 from selenium import webdriver
 
-from utils.logger_helper import LoggerHelper
-from utils.config_helper import ConfigHelper
+import utils
 
 
 class DriverCreator:
 
-    logger = LoggerHelper.get_logger()
-
     default_remote_driver_url = "http://127.0.0.1:4444/wd/hub"
 
     def gen_remote_driver(self, url):
         # 设置Chrome选项
         options = webdriver.ChromeOptions()
 
-        # options.add_argument('--headless')  # 无头模式运行
-        options.add_argument('--no-sandbox')
-        options.add_argument('--disable-dev-shm-usage')
-        options.add_experimental_option('excludeSwitches',
-                                        ['enable-automation'])
-        options.add_argument('--disable-blink-features=AutomationControlled')
-        options.add_argument('--disable-extensions')
+        options.add_argument("--headless")  # 无头模式运行
+        options.add_argument("--no-sandbox")
+        options.add_argument("--disable-dev-shm-usage")
+        options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        options.add_argument("--disable-blink-features=AutomationControlled")
+        options.add_argument("--disable-extensions")
         # 最大化窗口
-        options.add_argument('--start-maximized')
+        options.add_argument("--start-maximized")
         # 无痕浏览模式
-        options.add_argument('--incognito')
-
+        options.add_argument("--incognito")
 
-        remote_driver_url = ConfigHelper().get('selenium.remote_driver_url')
+        remote_driver_url = utils.get_config_value("selenium.remote_driver_url")
         if not remote_driver_url:
             remote_driver_url = self.default_remote_driver_url
-            self.logger.error(
-                f"未配置远程驱动地址,使用默认地址{self.default_remote_driver_url}")
-        self.logger.info(f"远程驱动地址{remote_driver_url}")
+            utils.get_logger().error(
+                f"未配置远程驱动地址,使用默认地址{self.default_remote_driver_url}"
+            )
+        utils.get_logger().info(f"远程驱动地址{remote_driver_url}")
 
         # 创建远程浏览器驱动实例
-        driver = webdriver.Remote(command_executor=remote_driver_url,
-                                  options=options)
+        driver = webdriver.Remote(command_executor=remote_driver_url, options=options)
         return self._gen_driver(driver, url)
 
     def gen_chrome_driver(self, url):
         # 设置Chrome选项,包括隐藏Selenium特征、设置代理IP和排除或关闭一些Selenium相关开关
         options = webdriver.ChromeOptions()
-        options.add_experimental_option('excludeSwitches',
-                                        ['enable-automation'])
-        options.add_argument('--disable-blink-features=AutomationControlled')
-        options.add_argument('--disable-extensions')
+        options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        options.add_argument("--disable-blink-features=AutomationControlled")
+        options.add_argument("--disable-extensions")
         # options.add_argument('--disable-gpu')
-        options.add_argument('--disable-notifications')
+        options.add_argument("--disable-notifications")
         # options.add_argument('--disable-popup-blocking')
         # options.add_argument('--disable-web-security')
         # options.add_argument('--ignore-certificate-errors')
         # options.add_argument('--no-sandbox')
         # 最大化窗口
-        options.add_argument('--start-maximized')
+        options.add_argument("--start-maximized")
         # 无痕浏览模式
-        options.add_argument('--incognito')
+        options.add_argument("--incognito")
         # options.add_argument('--user-data-dir=/dev/null')
         # options.add_argument('--proxy-server={}'.format(proxy_address + ':' + proxy_port))
         # options.add_argument('--proxy-auth={}:{}'.format(proxy_username, proxy_password))
@@ -65,16 +59,17 @@ class DriverCreator:
         driver = webdriver.Chrome(options=options)  # 创建Chrome浏览器驱动实例
         return self._gen_driver(driver, url)
 
-
-    def _gen_driver(self, driver, url):
+    @staticmethod
+    def _gen_driver(driver, url):
         # 设置user-agent,改变user-agent的值
-        if hasattr(driver, 'execute_cdp_cmd'):
+        if hasattr(driver, "execute_cdp_cmd"):
             # 隐藏navigator.webdriver标志,将其值修改为false或undefined
             driver.execute_cdp_cmd(
-                'Page.addScriptToEvaluateOnNewDocument', {
-                    'source':
-                    'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
-                })
+                "Page.addScriptToEvaluateOnNewDocument",
+                {
+                    "source": 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
+                },
+            )
 
             user_agents = [
                 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
@@ -87,19 +82,19 @@ class DriverCreator:
 
             user_agent = user_agents[len(url) % len(user_agents)]
             # 设置user-agent,改变user-agent的值
-            driver.execute_cdp_cmd("Network.setUserAgentOverride",
-                                   {"userAgent": user_agent})
+            driver.execute_cdp_cmd(
+                "Network.setUserAgentOverride", {"userAgent": user_agent}
+            )
         else:
-            self.logger.warning("当前驱动不支持 execute_cdp_cmd 方法")
+            utils.get_logger().warning("当前驱动不支持 execute_cdp_cmd 方法")
         # url 去除空字符串
         url = url.strip()
         driver.get(url)
 
         # 设置隐式等待 5s
         driver.implicitly_wait(5)
-        self.logger.info(f"创建浏览器驱动,URL: {url}")
+        utils.get_logger().info(f"创建浏览器驱动,URL: {url}")
         return driver
 
-
     # def shutdown_driver(self,driver):
     #     driver.quit()

+ 0 - 0
SourceCode/TenderCrawler/app/main/__init__.py → SourceCode/TenderCrawler/app/jobs/__init__.py


+ 84 - 0
SourceCode/TenderCrawler/app/jobs/data_clean.py

@@ -0,0 +1,84 @@
+import utils
+from stores.mysql_data_store import MysqlDataStore
+
+
+class DataClean:
+    _store = None
+
+    def __init__(self):
+        self._clean_day = utils.get_config_int("clean.day", 30)
+        self._clean_attach_day = utils.get_config_int("clean.attach", self._clean_day)
+        self._clean_log_day = utils.get_config_int("clean.log", self._clean_day)
+        self._clean_collect_data_day = utils.get_config_int(
+            "clean.collect_data", self._clean_day
+        )
+        self._clean_process_data_day = utils.get_config_int(
+            "clean.process_data", self._clean_day
+        )
+        self._clean_process_result_data_day = utils.get_config_int(
+            "clean.process_result_data", self._clean_day
+        )
+        if self._clean_process_result_data_day < 45:
+            self._clean_process_result_data_day = 45
+        self._store = MysqlDataStore()
+
+    def clean(self):
+        try:
+            utils.get_logger().info("开始 清除历史文件数据")
+            self._clean_attach()
+            self._clean_log()
+            self._clean_collect_data()
+            self._clean_process_data()
+            self._clean_process_result_data()
+            utils.get_logger().info("清除历史文件数据 完成")
+        except Exception as e:
+            utils.get_logger().error(e)
+
+    def _clean_attach(self):
+        if self._clean_attach_day == 0:
+            utils.get_logger().info("跳过 清除历史附件数据")
+            return
+        utils.get_logger().info("开始 清除历史附件数据")
+        utils.clean_attach_file(self._clean_attach_day)
+        utils.get_logger().info("清除历史附件数据 完成")
+
+    def _clean_log(self):
+        if self._clean_log_day == 0:
+            utils.get_logger().info("跳过 清除历史日志数据")
+            return
+        utils.get_logger().info("开始 清除历史日志数据")
+        utils.clean_log_file(self._clean_log_day)
+        utils.get_logger().info("清除历史日志数据 完成")
+
+    def _clean_collect_data(self):
+        if self._clean_collect_data_day == 0:
+            utils.get_logger().info("跳过 清除历史采集数据")
+            return
+        utils.get_logger().info("开始 清除历史采集数据")
+        date = self._get_before_date(self._clean_collect_data_day)
+        self._store.delete_collect_data_before_date(date)
+        utils.get_logger().info("清除历史采集数据 完成")
+
+    def _clean_process_data(self):
+        if self._clean_process_data_day == 0:
+            utils.get_logger().info("跳过 清除历史处理数据[招标]")
+            return
+        utils.get_logger().info("开始 清除历史处理数据[招标]")
+        date = self._get_before_date(self._clean_process_data_day)
+        self._store.delete_process_data_before_date(date)
+        utils.get_logger().info("清除历史处理数据[招标] 完成")
+
+    def _clean_process_result_data(self):
+        if self._clean_process_result_data_day == 0:
+            utils.get_logger().info("跳过 清除历史处理数据[中标]")
+            return
+        utils.get_logger().info("开始 清除历史处理数据[中标]")
+        date = self._get_before_date(self._clean_process_result_data_day)
+        self._store.delete_process_result_data_before_date(date)
+        utils.get_logger().info("清除历史处理数据[中标] 完成")
+
+    @staticmethod
+    def _get_before_date(day: int) -> str:
+        from datetime import datetime, timedelta
+
+        return (datetime.now() - timedelta(days=day)).strftime("%Y-%m-%d")

+ 23 - 26
SourceCode/TenderCrawler/app/main/data_collector.py → SourceCode/TenderCrawler/app/jobs/data_collector.py

@@ -1,32 +1,23 @@
 import importlib
+
 from selenium import webdriver
 
+import adapters
+import utils
+from adapters.data_collection_adapter_interface import IDataCollectionAdapter
 from stores.data_store_interface import IDataStore
 from stores.default_data_store import DefaultDataStore
-from adapters.data_collection_adapter_interface import IDataCollectionAdapter
-from utils.logger_helper import LoggerHelper
-from utils.config_helper import ConfigHelper
 
 
 class DataCollector:
 
-    logger = LoggerHelper.get_logger()
-    config = ConfigHelper()
     _adapter = None
     _driver = None
     _store = None
 
-    # 使用字典映射域名和适配器类
-    # _adapterModelMap = {"chinabidding": "chinabidding_data_collection_adapter"}
-
-    # _adapterClassMap = {"chinabidding": "ChinabiddingDataCollectionAdapter"}
-
-    def __init__(self,
-                 adapter_type: str,
-                 url: str,
-                 un: str,
-                 up: str,
-                 store: IDataStore = None):
+    def __init__(
+        self, adapter_type: str, url: str, un: str, up: str, store: IDataStore = None
+    ):
         self._adapter = self._gen_adapter(adapter_type, url)
         self._driver = self.adapter.driver
         # if type == "chinabidding":
@@ -53,23 +44,29 @@ class DataCollector:
         self._store = store
 
     def collect(self, keyword: str):
-        self.adapter.collect(keyword, self.store)
+        adapters.collect(self.adapter, keyword, self.store)
 
     def close(self):
-        self.logger.info(f"关闭浏览器驱动,URL: {self.adapter.url}")
-        self.adapter.teardown()
-
+        utils.get_logger().info(f"关闭浏览器驱动,URL: {self.adapter.url}")
+        adapters.teardown(self.adapter)
 
-    def _gen_adapter(self, adapter_type: str, url: str):
-        adapter_model_name = self.config.get(f"adapter.{adapter_type}.model_name")
-        adapter_class_name = self.config.get(f"adapter.{adapter_type}.class_name")
+    @staticmethod
+    def _gen_adapter(adapter_type: str, url: str):
+        adapter_model_name = utils.get_config_value(
+            f"adapter.{adapter_type}.model_name"
+        )
+        adapter_class_name = utils.get_config_value(
+            f"adapter.{adapter_type}.class_name"
+        )
         if adapter_class_name:
             try:
-                self.logger.info(
-                    f"生成适配器 TYPE:{adapter_type},适配器: {adapter_class_name},URL:{url}")
+                utils.get_logger().info(
+                    f"生成适配器 TYPE:{adapter_type},适配器: {adapter_class_name},URL:{url}"
+                )
                 # 使用 importlib 动态导入模块
                 adapter_module = importlib.import_module(
-                    f"adapters.{adapter_model_name}")
+                    f"adapters.{adapter_model_name}"
+                )
                 adapter_class = getattr(adapter_module, adapter_class_name)
                 adapter = adapter_class(url)
             except ImportError as e:

+ 158 - 0
SourceCode/TenderCrawler/app/jobs/data_process.py

@@ -0,0 +1,158 @@
+import utils
+from models.collect_data import CollectData
+from models.process_data import ProcessData
+from models.process_result_data import ProcessResultData
+from stores.data_store_interface import IDataStore
+
+
+class DataProcess:
+
+    _store = None
+
+    DEFAULT_AI_SYSTEM_PROMPT = "请帮我分析以下文字,提取出关键信息,并以json格式字符串返回,如果部分信息为空,则该字段返回为空。"
+    DEFAULT_AI_PROMPT_TEMPLATE_1 = """在以上内容中提取信息:
+            编号(no) 、标题(title)、在哪个城市招标(area)、开标的时间(date)、开标的地点(address)、发布时间(release_date)、150字左右的招标条件要求及联系方式等内容摘要(summary), 设备(devices)。
+            提取出相关设备的名称信息, 多个设备以逗号分割。
+            返回包含no, title, area, date, address, release_date, summary, devices字段的json格式字符串,没有找到或未提供的信息json字段为空。
+            """
+    DEFAULT_AI_PROMPT_TEMPLATE_2 = """在以上内容中提取信息:
+            编号(no) 、标题(title)、公告时间(date)、标中的总价格(price)、标中的公司,多个以逗号分割(bidder)、150-300字的标的物说明,标的物价格,公司的明细等内容摘要(summary),设备(devices)。
+            提取出相关设备的名称信息,多个设备以逗号分割。返回包含no,title,date,price,bidder,summary字段的json格式字符串,没有找到或未提供的信息json字段为空  """
+
+    def __init__(self, store: IDataStore):
+        self._store = store
+        self._ai_system_prompt = utils.get_config_value(
+            "ai.system_prompt", self.DEFAULT_AI_SYSTEM_PROMPT
+        )
+        self._ai_prompt_template_1 = utils.get_config_value(
+            "ai.prompt_template_1", self.DEFAULT_AI_PROMPT_TEMPLATE_1
+        )
+        self._ai_prompt_template_2 = utils.get_config_value(
+            "ai.prompt_template_2", self.DEFAULT_AI_PROMPT_TEMPLATE_2
+        )
+
+    @property
+    def store(self) -> IDataStore:
+        return self._store
+
+    def process(self):
+        try:
+            urls = self.store.query_urls_to_process()
+            for item in urls:
+                self._process_item(item)
+            self.store.save_process_data(True)
+            self.store.save_process_result_data(True)
+        except Exception as e:
+            utils.get_logger().error(f"数据处理发生异常: {e}")
+            raise Exception(f"数据处理发生异常: {e}")
+
+    def _process_item(self, url: str) -> None:
+        try:
+            utils.get_logger().info(f"START ==>URL:{url}")
+            item = self.store.query_one_collect_by_url(url)
+            if not item:
+                utils.get_logger().info(f"END==> NOT FOUND URL:{url}")
+                return
+            if item.status == 1:
+                utils.get_logger().info(f"ALREADY1 URL:{url}")
+                return
+            data = (
+                self.store.query_one_process_by_url(url)
+                if item.data_type == 0
+                else self.store.query_one_process_result_by_url(url)
+            )
+            if data:
+                utils.get_logger().info(f"ALREADY2 [{item.data_type}] URL==> {url}")
+                return
+            data = (
+                self._ai_process_1(item)
+                if item.data_type == 0
+                else self._ai_process_2(item)
+            )
+            if data:
+                old = None
+                if data.no:
+                    old = (
+                        self.store.query_one_process_result_by_no(data.no)
+                        if item.data_type == 0
+                        else self.store.query_one_process_by_no(data.no)
+                    )
+                if not old:
+                    data.url = url
+                    data.keyword = item.keyword
+                    data.attach_path = item.attach_path
+                    if item.data_type == 0:
+                        self.store.insert_process_data(data)
+                    else:
+                        self.store.insert_process_result_data(data)
+                else:
+                    if old.url != url:
+                        if old.other_urls:
+                            old.other_urls += f",{url}"
+                        else:
+                            old.other_urls = url
+                        if item.data_type == 0:
+                            self.store.set_process_other_urls(old.url, old.other_urls)
+                        else:
+                            self.store.set_process_result_other_urls(
+                                old.url, old.other_urls
+                            )
+                    self.store.set_collect_process(old.url)
+                    utils.get_logger().info(
+                        f"ALREADY 编号: {data.no} URL:{old.other_urls}"
+                    )
+
+            utils.get_logger().info("END   ==>" + url)
+        except Exception as e:
+            utils.get_logger().error(f"数据处理发生异常: {url} {e}")
+
+    def _ai_process_1(self, item: CollectData) -> ProcessData | None:
+        try:
+            data = utils.call_openai(
+                self._ai_system_prompt, f"{item.content} {self._ai_prompt_template_1}"
+            )
+            area_str = data.get("area")
+
+            if "省" in area_str:
+                area_str_arr = area_str.split("省")
+                area_str = area_str_arr[1] if len(area_str_arr) > 1 else area_str_arr[0]
+            if "市" in area_str:
+                area_str_arr = area_str.split("市")
+                area_str = area_str_arr[0]
+
+            return ProcessData(
+                no=data.get("no"),
+                title=data.get("title"),
+                date=data.get("date"),
+                area=area_str,
+                address=data.get("address"),
+                devices=data.get("devices"),
+                summary=data.get("summary"),
+                release_date=data.get("release_date"),
+                prompt_tokens=data.get("prompt_tokens"),
+                completion_tokens=data.get("completion_tokens"),
+                total_tokens=data.get("total_tokens"),
+            )
+        except Exception as e:
+            utils.get_logger().error(f"AI 提取数据失败1: {item.url} {e}")
+            return None
+
+    def _ai_process_2(self, item: CollectData) -> ProcessResultData | None:
+        try:
+            data = utils.call_openai(
+                self._ai_system_prompt, f"{item.content} {self._ai_prompt_template_2}"
+            )
+            return ProcessResultData(
+                no=data.get("no"),
+                title=data.get("title"),
+                date=data.get("date"),
+                price=data.get("price"),
+                bidder=data.get("bidder"),
+                summary=data.get("summary"),
+                prompt_tokens=data.get("prompt_tokens"),
+                completion_tokens=data.get("completion_tokens"),
+                total_tokens=data.get("total_tokens"),
+            )
+        except Exception as e:
+            utils.get_logger().error(f"AI 提取数据失败2: {item.url} {e}")
+            return None

+ 330 - 0
SourceCode/TenderCrawler/app/jobs/data_send.py

@@ -0,0 +1,330 @@
+import calendar
+from datetime import datetime
+
+import utils
+from models.process_data import ProcessData
+from models.process_result_data import ProcessResultData
+from stores.data_store_interface import IDataStore
+
+
+class DataSend:
+    _error_arr = []
+    _email_area_arr = []
+    _email_area_virtual_arr = []
+
+    @property
+    def store(self) -> IDataStore:
+        return self._store
+
+    def __init__(self, store: IDataStore):
+        self._store = store
+        self._email_area_arr = self.store.query_all_emails()
+        self._email_area_virtual_arr = self.store.query_all_virtual_emails()
+
+    def send(self) -> None:
+        self._error_arr = []
+        list = self.store.query_to_send()
+        utils.get_logger().info(f"开始发送邮件,数量为 {len(list)}")
+        for item in list:
+            self._send_item(item)
+        if len(self._error_arr) > 0:
+            self._send_email_no_found()
+
+    def send_report_current_month(self):
+        # 查询当月的数据
+        start_date, end_date = self._get_first_and_last_day_of_current_month()
+        self._send_reports(start_date, end_date)
+
+    def send_report_prev_month(self):
+        # 查询上月的数据
+        start_date, end_date = self._get_first_and_last_day_of_prev_month()
+        self._send_reports(start_date, end_date)
+
+    def _send_reports(self, start_date, end_date):
+        utils.get_logger().info(
+            f"开始发送中标报告邮件,开始日期:{start_date.strftime('%Y-%m-%d')},结束日期:{end_date.strftime('%Y-%m-%d')}"
+        )
+        email = self.store.query_master_email()
+        if not email:
+            utils.get_logger().error("没有找到master email")
+            return
+        items = self.store.query_to_report_by_date(start_date, end_date)
+        title_prev = utils.get_config_value("email.report_title_prev", "【中标报告】")
+        title = f"{start_date.month}月中标结果报告"
+        body = self._build_report_email_html(title, items)
+        flag = utils.send_email(email, f"{title_prev} {title}", body, True)
+        if flag:
+            utils.get_logger().info("发送中标报告邮件成功")
+
+    def _send_item(self, item: ProcessData) -> None:
+        utils.get_logger().info(f"开始发送邮件,地区为:{item.area} ,URL为 {item.url}")
+        email = self._get_email_by_area(item.area)
+        if not email:
+            utils.get_logger().error(f"{item.area} 下没有找到email")
+            if item.area not in self._error_arr:
+                self._error_arr.append(item.area)
+            return
+        title_prev = utils.get_config_value("email.title_prev", "【招标信息】")
+        body = self._build_email_html(item)
+        flag = utils.send_email(
+            email, f"{title_prev} {item.title}", body, True, item.attach_path
+        )
+        if flag:
+            self.store.set_send(item.no)
+
+    def _get_email_by_area(
+        self, area: str, count: int = 0, virtual_area: str = None
+    ) -> str:
+        email = None
+        area_str = area
+        # if "省" in area:
+        #     area_str_arr = area.split("省")
+        #     area_str = area_str_arr[1] if len(area_str) > 1 else area_str_arr[0]
+        # if "市" in area:
+        #     area_str_arr = area.split("市")
+        #     area_str = area_str_arr[0]
+        for area_item in self._email_area_arr:
+            if area_str in area_item.area:
+                email = area_item.email
+                if virtual_area:
+                    new_area = f"{area_item.area},{virtual_area}"
+                    self.store.update_area_email_area_by_name(area_item.name, new_area)
+                    self._email_area_arr = self.store.query_all_emails()
+                break
+        if not email and count < 3:
+            area_name = self._get_email_by_area_virtual(area_str)
+            if area_name:
+                virtual_area = (
+                    f"{area_str},{virtual_area}" if virtual_area else area_str
+                )
+                email = self._get_email_by_area(area_name, count + 1, virtual_area)
+        return email
+
+    def _get_email_by_area_virtual(self, area: str) -> str:
+        name = None
+        for area_item in self._email_area_virtual_arr:
+            if area in area_item.area:
+                name = area_item.name
+                break
+        return name
+
+    @staticmethod
+    def _build_email_html(item: ProcessData, other: str = "") -> str:
+        html_body = f"""
+        <html>
+        <head>
+            <style>
+                body {{
+                    background-color: #f4f4f9;
+                    font-family: Arial, sans-serif;
+                    margin: 0;
+                    padding: 20px;
+                }}
+                h1 {{
+                    text-align: center;
+                    color: #333;
+                }}
+                .container {{
+                    max-width: 600px;
+                    margin: 0 auto;
+                    background-color: #fff;
+                    padding: 20px;
+                    border-radius: 8px;
+                    box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
+                }}
+                .button-container {{
+                    text-align: center;
+                    margin-top: 20px;
+                }}
+                .button {{
+                    display: inline-block;
+                    padding: 10px 20px;
+                    font-size: 16px;
+                    color: #fff!important;
+                    background-color: #007bff;
+                    text-decoration: none;
+                    border-radius: 5px;
+                    transition: background-color 0.3s;
+                }}
+                .button:hover {{
+                    background-color: #0056b3;
+                }}
+                .system {{
+                    color: #aaa;
+                }}
+
+            </style>
+        </head>
+        <body>
+            <div class="container">
+                <h1>{item.title}</h1>
+                <p><strong>发布日期:</strong> {item.release_date}</p>
+                <p><strong>招标编号:</strong> {item.no}</p>
+                <p><strong>开标时间:</strong> {item.date}</p>
+                <p><strong>开标地点:</strong> {item.address}</p>
+                <p><strong>标书摘要:</strong> {item.summary}</p>
+                <div class="button-container">
+                    <a href="{item.url}" class="button">查看详情</a>
+                </div>
+                <div>
+                    <h3>{other}</h3>
+                </div>
+                <p class="system">本邮件由系统自动发送,请勿回复。</p>
+
+            </div>
+        </body>
+        </html>
+        """
+        return html_body
+
+    def _build_report_email_html(self, title, items) -> str:
+        body = ""
+        for item in items:
+            body += self._build_report_email_body(item)
+        html = f"""
+        <html>
+        <head>
+            <style>
+                body {{
+                    background-color: #f4f4f9;
+                    font-family: Arial, sans-serif;
+                    margin: 0;
+                    padding: 20px;
+                }}
+                h1 {{
+                    text-align: center;
+                    color: #333;
+                }}
+                .container {{
+                    max-width: 600px;
+                    margin: 0 auto;
+                    background-color: #fff;
+                    padding: 20px;
+                    border-radius: 8px;
+                    box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
+                }}
+                .button-container {{
+                    text-align: center;
+                    margin-top: 20px;
+                }}
+                .button {{
+                    display: inline-block;
+                    padding: 10px 20px;
+                    font-size: 16px;
+                    color: #fff!important;
+                    background-color: #007bff;
+                    text-decoration: none;
+                    border-radius: 5px;
+                    transition: background-color 0.3s;
+                }}
+                .button:hover {{
+                    background-color: #0056b3;
+                }}
+                .system {{
+                    color: #aaa;
+                }}
+                .card {{
+                    background-color: #ffffff;
+                    border: 1px solid #dddddd;
+                    border-radius: 8px;
+                    margin-bottom: 20px;
+                    padding: 20px;
+                    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+                }}
+                .card h2 {{
+                    margin-top: 0;
+                }}
+                .card p {{
+                    margin: 0;
+                }}
+                .button-container {{
+                    text-align: center;
+                    margin-top: 15px;
+                }}
+                .button {{
+                    display: inline-block;
+                    padding: 6px 15px;
+                    font-size: 14px;
+                    color: #fff!important;
+                    background-color: #007bff;
+                    text-decoration: none;
+                    border-radius: 3px;
+                    transition: background-color 0.3s;
+                }}
+                .button:hover {{
+                    background-color: #0056b3;
+                }}
+            </style>
+        </head>
+        <body>
+            <div class="container">
+                <h1>{title}</h1>
+                {body}
+                <p class="system">本邮件由系统自动发送,请勿回复。</p>
+            </div>
+        </body>
+        </html>
+        """
+        return html
+
+    @staticmethod
+    def _build_report_email_body(item: ProcessResultData) -> str:
+        body = f"""
+           <div class="card">
+               <h2>{item.title}</h2>
+               <p><strong>项目编号:</strong> {item.no}</p>
+               <p><strong>公告日期:</strong> {item.date}</p>
+               <p><strong>关键词:</strong> {item.keyword}</p>
+               <p><strong>价格:</strong> {item.price}</p>
+               <p><strong>中标人:</strong> {item.bidder}</p>
+               <p><strong>摘要:</strong> {item.summary}</p>
+               <div class="button-container">
+                <a href="{item.url}" class="button">查看详情</a>
+               </div>
+           </div>
+           """
+        return body
+
+    def _send_email_no_found(self) -> None:
+        email = utils.get_config_value("email.error_email")
+        utils.get_logger().info(f"开始发送区域邮箱未匹配邮件: {email}")
+        if not email:
+            return
+        title = "Warning: 相关地区没有匹配到邮箱,请及时添加相关配置"
+        content = "以下区域中没有配置邮箱:\n\n    "
+        content += "、".join(self._error_arr)
+        content += "\n\n请及时添加相关配置。"
+        utils.send_email(email, title, content, False, None)
+
+    @staticmethod
+    def _get_first_and_last_day_of_current_month():
+        # 获取当前日期
+        today = datetime.today()
+        # 获取这个月的第一天
+        first_day_of_current_month = datetime(today.year, today.month, 1, 0, 0, 0)
+        # 获取这个月的最后一天
+        _, last_day = calendar.monthrange(today.year, today.month)
+        last_day_of_current_month = datetime(
+            today.year, today.month, last_day, 23, 59, 59
+        )
+        return first_day_of_current_month, last_day_of_current_month
+
+    @staticmethod
+    def _get_first_and_last_day_of_prev_month():
+        # 获取当前日期
+        today = datetime.today()
+        # 获取上个月的年份和月份
+        if today.month == 1:
+            prev_month_year = today.year - 1
+            prev_month = 12
+        else:
+            prev_month_year = today.year
+            prev_month = today.month - 1
+        # 获取上个月的第一天
+        first_day_prev_month = datetime(prev_month_year, prev_month, 1, 0, 0, 0)
+        # 获取上个月的最后一天
+        _, last_day = calendar.monthrange(prev_month_year, prev_month)
+        last_day_of_prev_month = datetime(
+            prev_month_year, prev_month, last_day, 23, 59, 59
+        )
+        return first_day_prev_month, last_day_of_prev_month

+ 254 - 0
SourceCode/TenderCrawler/app/jobs/job_runner.py

@@ -0,0 +1,254 @@
+import threading
+from datetime import datetime
+
+import schedule
+from dateutil import parser
+
+import utils
+from jobs.data_clean import DataClean
+from jobs.data_collector import DataCollector
+from jobs.data_process import DataProcess
+from jobs.data_send import DataSend
+from models.url_setting import UrlSetting
+from stores.mysql_data_store import MysqlDataStore
+
+
class JobRunner:
    """Registers and runs all scheduled crawler jobs via the ``schedule``
    library: data collection, AI processing, e-mail delivery, monthly
    reports and data cleanup.

    Job times come from the ``job.*`` configuration keys; failures are
    reported to the configured error mailbox.
    """

    # Shared data store, reused by every job to avoid reconnecting per run.
    store = MysqlDataStore()

    def run_job(self, is_run_now=True):
        """Read job times from configuration and register all scheduled jobs.

        :param is_run_now: when True and the ``job.run_now`` flag is set,
            the collect/process job is also executed once immediately.
        :raises Exception: re-raises any error hit during registration.
        """
        try:
            utils.get_logger().info("加载任务")

            collect_time = utils.get_config_value("job.collect")
            process_time = utils.get_config_value("job.process")
            send_email_time = utils.get_config_value("job.send_email")
            clean_data_time = utils.get_config_value("job.clean_data")

            collect_times = self._validate_and_format_time(collect_time, ["06:00"])
            for run_at in collect_times:
                utils.get_logger().info(f"{run_at} 执行 采集处理数据 任务")
                schedule.every().day.at(run_at).do(self._collect_process_job)

            process_times = self._validate_and_format_time(
                process_time, ["10:00", "15:00", "19:00"]
            )
            for run_at in process_times:
                utils.get_logger().info(f"{run_at} 执行 AI处理数据  任务")
                schedule.every().day.at(run_at).do(self._process_job)

            send_email_times = self._validate_and_format_time(
                send_email_time, ["08:20", "14:00"]
            )
            for run_at in send_email_times:
                utils.get_logger().info(f"{run_at} 执行  发送邮件   任务")
                schedule.every().day.at(run_at).do(self._send_job)

            if utils.get_config_int("job.send_current_month_report_day") > 0:
                report_time = utils.get_config_value(
                    "job.send_current_month_report_time"
                )
                times = self._validate_and_format_time(report_time, ["08:20"])
                # Format the day outside the f-string: reusing double quotes
                # inside a double-quoted f-string is a SyntaxError before
                # Python 3.12.
                day_str = str(self._get_current_month_report_day()).rjust(2, "0")
                for run_at in times:
                    utils.get_logger().info(
                        f"每月{day_str}日 {run_at} 执行  发送当月报告   任务"
                    )
                    # BUG FIX: this branch previously registered
                    # _send_prev_month_report_job, so the current-month
                    # report was never sent.
                    schedule.every().day.at(run_at).do(
                        self._send_current_month_report_job
                    )

            if utils.get_config_int("job.send_prev_month_report_day") > 0:
                report_time = utils.get_config_value("job.send_prev_month_report_time")
                times = self._validate_and_format_time(report_time, ["08:20"])
                day_str = str(self._get_prev_month_report_day()).rjust(2, "0")
                for run_at in times:
                    utils.get_logger().info(
                        f"每月{day_str}日 {run_at} 执行  发送上月报告   任务"
                    )
                    schedule.every().day.at(run_at).do(self._send_prev_month_report_job)

            clean_data_times = self._validate_and_format_time(
                clean_data_time, ["00:05"]
            )
            utils.get_logger().info(f"{clean_data_times[0]} 执行 清理数据 任务")
            schedule.every().day.at(clean_data_times[0]).do(self._clean_job)

            urls = UrlSetting().fetch_all()
            if not urls:
                utils.get_logger().error("未找到任何 URL 设置")
                return
            utils.get_logger().info(f"共找到 {len(urls)} 个 URL 设置")
            for url in urls:
                utils.get_logger().info(f"{url}")

            if is_run_now and utils.get_config_bool("job.run_now"):
                utils.get_logger().info("立即执行采集任务")
                self._collect_process_job()
                # self._clean_job()
                # self._process_job()
                # self._send_job()

        except Exception as e:
            utils.get_logger().error(f"应用程序停止: {e}")
            raise e

    def restart_job(self):
        """Drop every registered job and re-register from fresh config."""
        schedule.clear()
        utils.get_logger().info("定时配置更新,重启任务")
        self.run_job(False)

    def _collect_process_job(self):
        # Run in a worker thread so the scheduler loop is not blocked.
        threading.Thread(target=self._collect_process).start()

    def _collect_process(self):
        """Collect data for every configured URL, then run AI processing.

        Collection errors for one URL are reported and do not stop the other
        URLs; an AI-processing error aborts the remaining URLs.
        """
        try:
            utils.get_logger().info("开始执行 数据采集处理 任务")
            # NOTE: the loop variable no longer shadows a pre-built
            # UrlSetting() instance (previous code reassigned it).
            for url_setting in UrlSetting().fetch_all():
                data_collector = None
                try:
                    utils.get_logger().info(f"开始采集: {url_setting.url}")
                    data_collector = DataCollector(
                        url_setting.adapter_type,
                        url_setting.url,
                        url_setting.username,
                        url_setting.password,
                        self.store,
                    )
                    # One collect pass per comma-separated keyword.
                    for keyword in url_setting.keywords.split(","):
                        data_collector.collect(keyword)
                    utils.get_logger().info(f"采集完成: {url_setting.url}")
                except Exception as e:
                    self._send_error_email(
                        "数据采集",
                        f"\n    Type: {url_setting.adapter_type} \n    Url: {url_setting.url}\n    错误: {str(e)}",
                    )
                    utils.get_logger().error(f"采集发生异常: {e}")
                finally:
                    # Always release the collector (browser/driver resources).
                    if data_collector:
                        data_collector.close()

                try:
                    utils.get_logger().info(f"开始AI处理: {url_setting.url}")
                    data_process = DataProcess(self.store)
                    data_process.process()
                except Exception as e:
                    self._send_error_email(
                        "AI数据处理",
                        f"\n    Type: {url_setting.adapter_type} \n    Url: {url_setting.url}\n    错误: {str(e)}",
                    )
                    utils.get_logger().error(f"AI处理发生异常: {e}")
                    break  # abort processing of the remaining URL settings
            utils.get_logger().info("数据采集处理 任务执行完毕")
        except Exception as e:
            utils.get_logger().error(f"数据采集处理 任务执行失败: {e}")

    def _process_job(self):
        # Run in a worker thread so the scheduler loop is not blocked.
        threading.Thread(target=self._process).start()

    def _process(self):
        """Run the AI data-processing pass over collected data."""
        try:
            utils.get_logger().info("开始执行 AI处理数据 任务")
            data_process = DataProcess(self.store)
            data_process.process()
            utils.get_logger().info("AI处理数据 任务执行完毕")
        except Exception as e:
            self._send_error_email("AI数据处理", f"\n    错误: {str(e)}")
            utils.get_logger().error(f"AI任务 执行失败: {e}")

    def _send_job(self):
        """Send pending notification e-mails."""
        try:
            utils.get_logger().info("开始执行 邮件发送 任务")
            DataSend(self.store).send()
            utils.get_logger().info("邮件发送 任务执行完毕")
        except Exception as e:
            self._send_error_email("邮件发送", f"\n    错误: {str(e)}")
            utils.get_logger().error(f"邮件发送 任务执行失败: {e}")

    def _send_current_month_report_job(self):
        """Send the current-month report, but only on the configured day.

        The job is scheduled daily; this day-of-month guard makes it fire
        once a month.
        """
        try:
            if datetime.today().day == self._get_current_month_report_day():
                utils.get_logger().info("开始执行 邮件发送当月报告 任务")
                DataSend(self.store).send_report_current_month()
                utils.get_logger().info("邮件发送当月报告 任务执行完毕")
        except Exception as e:
            self._send_error_email("邮件发送", f"\n    错误: {str(e)}")
            utils.get_logger().error(f"邮件发送当月报告 任务执行失败: {e}")

    @staticmethod
    def _get_current_month_report_day():
        """Return the configured report day, clamped to the month length.

        NOTE(review): February is always clamped to 28 — leap years
        (Feb 29) are not considered; confirm this is acceptable.
        """
        day = utils.get_config_int("job.send_current_month_report_day", 30)
        if datetime.today().month == 2 and day > 28:
            day = 28
        if datetime.today().month in [4, 6, 9, 11] and day > 30:
            day = 30
        if day > 31:
            day = 31
        return day

    def _send_prev_month_report_job(self):
        """Send the previous-month report, but only on the configured day."""
        try:
            if datetime.today().day == self._get_prev_month_report_day():
                utils.get_logger().info("开始执行 邮件发送上月报告 任务")
                DataSend(self.store).send_report_prev_month()
                utils.get_logger().info("邮件发送上月报告 任务执行完毕")
        except Exception as e:
            self._send_error_email("邮件发送", f"\n    错误: {str(e)}")
            utils.get_logger().error(f"邮件发送上月报告 任务执行失败: {e}")

    @staticmethod
    def _get_prev_month_report_day():
        """Return the configured prev-month report day, clamped to month length.

        NOTE(review): February is always clamped to 28 — leap years are not
        considered; confirm this is acceptable.
        """
        day = utils.get_config_int("job.send_prev_month_report_day", 1)
        if datetime.today().month == 2 and day > 28:
            day = 28
        if datetime.today().month in [4, 6, 9, 11] and day > 30:
            day = 30
        if day > 31:
            day = 31
        return day

    def _clean_job(self):
        """Run the data-cleanup job."""
        try:
            utils.get_logger().info("开始执行 清理数据 任务")
            DataClean().clean()
            utils.get_logger().info("清理数据 任务执行完毕")
        except Exception as e:
            self._send_error_email("清理数据", f"\n    错误: {str(e)}")
            utils.get_logger().error(f"清理数据 任务执行失败: {e}")

    @staticmethod
    def _validate_and_format_time(time_str, default_time: list):
        """Parse a comma-separated config string into HH:MM:SS time strings.

        Accepts full-width commas/colons and quoted items; falls back to
        *default_time* when the input is empty or nothing parses.

        :param time_str: raw config value, e.g. ``"08:20, 14:00"``
        :param default_time: times to use when parsing yields nothing
        :return: list of ``"%H:%M:%S"`` strings
        """
        if not time_str:
            return default_time
        # Normalize full-width commas before splitting.
        time_str = time_str.strip().replace(",", ",")
        items = [item.strip().strip("'").strip('"') for item in time_str.split(",")]

        formatted_times = []

        for item in items:
            if not item:
                continue  # skip empty fragments
            try:
                # Normalize full-width colons, then parse leniently.
                item = item.replace(":", ":")
                parsed_time = parser.parse(item).time().strftime("%H:%M:%S")
                formatted_times.append(parsed_time)
            except Exception as e:
                utils.get_logger().error(f"配置时间解析错误: {item},: {e} ")
        if len(formatted_times) == 0:
            utils.get_logger().error(f"解析时间失败,使用默认时间 {default_time}")
            return default_time
        return formatted_times

    @staticmethod
    def _send_error_email(title: str, error: str) -> None:
        """Send an error notification to the configured error mailbox.

        Silently returns when ``email.error_email`` is not configured.
        """
        email = utils.get_config_value("email.error_email")
        utils.get_logger().info(f"发送错误邮件: {email}")
        if not email:
            return
        title = f"{title}异常"
        content = f"{title},请及时处理。\n\n异常信息:{error}"
        utils.send_email(email, title, content, False, None)

+ 21 - 10
SourceCode/TenderCrawler/app/main.py

@@ -1,20 +1,31 @@
+import datetime
 import time
+
 import schedule
 
-from utils.config_helper import ConfigHelper
-from utils.logger_helper import LoggerHelper
-from main.runner import Runner
+import utils
+from jobs.job_runner import JobRunner
+
+DEFAULT_USER_SLEEP_INTERVAL = 10  # 配置默认时间间隔10秒
 
-logger = LoggerHelper.get_logger()
-DEFAULT_USER_SLEEP_INTERVAL = 60 * 30  # 配置默认时间间隔30分钟
+utils.get_logger().info("应用程序启动...")
 
-runner = Runner()
-runner.run()
+job = JobRunner()
+job.run_job()
 
-interval = ConfigHelper().get_int("schedule.sleep_interval",DEFAULT_USER_SLEEP_INTERVAL)
+interval = utils.get_config_int("job.sleep_interval", DEFAULT_USER_SLEEP_INTERVAL)
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     while True:
-        logger.info(f"等待下次检查执行... {interval}秒后")
         schedule.run_pending()
+        now = datetime.datetime.now()
         time.sleep(interval)
+        # 重新加载配置及任务
+        if now.minute == 0 and now.second <= interval:
+            job_id = utils.get_config_int("job.event_id")
+            utils.reload_config()
+            interval = utils.get_config_int(
+                "job.sleep_interval", DEFAULT_USER_SLEEP_INTERVAL
+            )
+            if job_id != utils.get_config_int("job.event_id"):
+                job.restart_job()

+ 0 - 71
SourceCode/TenderCrawler/app/main/data_process.py

@@ -1,71 +0,0 @@
-from utils.logger_helper import LoggerHelper
-from utils.ai_helper import AiHelper
-from stores.data_store_interface import IDataStore
-from models.collect_data import CollectData
-from models.process_data import ProcessData
-
-
-class DataProcess:
-    logger = LoggerHelper.get_logger()
-
-    _store = None
-
-    def __init__(self, store: IDataStore):
-        self._store = store
-
-    @property
-    def store(self) -> IDataStore:
-        return self._store
-
-    def process(self):
-        try:
-            urls = self.store.query_urls_to_process()
-            for item in urls:
-                self._process_item(item)
-            self.store.save_process_data(True)
-        except Exception as e:
-            self.logger.error(f"数据处理过程中发生异常: {e}")
-
-    def _process_item(self, url: str) -> None:
-        self.logger.info("START ==>" + url)
-        item = self.store.query_one_collect_by_url(url)
-        if not item:
-            self.logger.info("END: NOT FOUND URL==>" + url)
-            return
-        if item.status == 1:
-            self.logger.info("ALREADY URL==>" + url)
-            return
-        data = self._ai_process(item)
-        if data:
-            old = None
-            if data.no:
-                old = self.store.query_one_process_by_no(data.no)
-            if not old:
-                data.url = url
-                data.keyword = item.keyword
-                self.store.insert_process_data(data)
-            else:
-                if old.url != url:
-                    if old.other_urls:
-                        old.other_urls += f",{url}"
-                    else:
-                        old.other_urls = url
-                    self.store.set_process_other_urls(data.url, old.other_urls)
-                self.logger.info(f"ALREADY 编号: {data.no} URL:{old.other_urls}")
-
-        self.logger.info("END   ==>" + url)
-
-    def _ai_process(self, item: CollectData) -> ProcessData | None:
-        try:
-            data = AiHelper().call_ai(item.content)
-            return data
-        except Exception as e:
-            self.logger.error(f"AI 提取数据失败: {item.url} {e}")
-            return None
-
-    # def _generate_unique_id(self) -> str:
-    #     from datetime import datetime
-    #     current_time = datetime.now().strftime("%Y%m%d%H%M%S%f")
-    #     thread_id = threading.current_thread().ident
-    #     unique_id = f"{current_time}-{thread_id}"
-    #     return unique_id

+ 0 - 119
SourceCode/TenderCrawler/app/main/data_send.py

@@ -1,119 +0,0 @@
-from utils.logger_helper import LoggerHelper
-from utils.email_helper import EmailHelper
-from stores.data_store_interface import IDataStore
-from models.process_data import ProcessData
-
-
-class DataSend:
-    logger = LoggerHelper.get_logger()
-    _error_arr = []
-
-    @property
-    def store(self) -> IDataStore:
-        return self._store
-
-    def __init__(self, store: IDataStore):
-        self._store = store
-
-    def send(self) -> None:
-        self._error_arr = []
-        list = self.store.query_to_send()
-        self.logger.info(f"开始发送邮件,数量为 {len(list)}")
-        for item in list:
-            self._send_item(item)
-        if len(self._error_arr) > 0:
-            self._send_email_no_found()
-
-    def _send_item(self, item: ProcessData) -> None:
-        self.logger.info(f"开始发送邮件,地区为:{item.area} ,URL为 {item.url}")
-        email = self.store.get_email_by_area(item.area)
-        if not email:
-            self.logger.error(f"{item.area} 下没有找到email")
-            if item.area not in self._error_arr:
-                self._error_arr.append(item.area)
-            return
-        body = self._build_email_content(item)
-        flag = EmailHelper().send_email(email, item.title, body, True, item.attach_path)
-        if flag:
-            self.store.set_send(item.no)
-
-    @staticmethod
-    def _build_email_content(item: ProcessData, other: str = "") -> str:
-        html_body = f"""
-        <html>
-        <head>
-            <style>
-                body {{
-                    background-color: #f4f4f9;
-                    font-family: Arial, sans-serif;
-                    margin: 0;
-                    padding: 20px;
-                }}
-                h1 {{
-                    text-align: center;
-                    color: #333;
-                }}
-                .container {{
-                    max-width: 600px;
-                    margin: 0 auto;
-                    background-color: #fff;
-                    padding: 20px;
-                    border-radius: 8px;
-                    box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
-                }}
-                .button-container {{
-                    text-align: center;
-                    margin-top: 20px;
-                }}
-                .button {{
-                    display: inline-block;
-                    padding: 10px 20px;
-                    font-size: 16px;
-                    color: #fff!important;
-                    background-color: #007bff;
-                    text-decoration: none;
-                    border-radius: 5px;
-                    transition: background-color 0.3s;
-                }}
-                .button:hover {{
-                    background-color: #0056b3;
-                }}
-                .system {{
-                    color: #aaa;
-                }}
-
-            </style>
-        </head>
-        <body>
-            <div class="container">
-                <h1>{item.title}</h1>
-                <p><strong>搜索关键字:</strong> {item.keyword}</p>
-                <p><strong>发布日期:</strong> {item.release_date}</p>
-                <p><strong>招标编号:</strong> {item.no}</p>
-                <p><strong>开标时间:</strong> {item.date}</p>
-                <p><strong>开标地点:</strong> {item.address}</p>
-                <p><strong>标书摘要:</strong> {item.summary}</p>
-                <div class="button-container">
-                    <a href="{item.url}" class="button">查看详情</a>
-                </div>
-                <div>
-                    <h3>{other}</h3>
-                </div>
-                <p class="system">本邮件由系统自动发送,请勿回复。</p>
-
-            </div>
-        </body>
-        </html>
-        """
-        return html_body
-
-    def _send_email_no_found(self) -> None:
-        email = EmailHelper().config.get("email.error_email")
-        self.logger.info(f"开始发送区域邮箱未匹配邮件: {email}")
-        if not email:
-            return
-        title = "Warning: 相关地区没有匹配到邮箱,请及时添加相关配置"
-        content = "以下区域中没有配置邮箱:\n\n    "
-        content += "、".join(self._error_arr)
-        content += "\n\n请及时添加相关配置。"
-        EmailHelper().send_email(email, title, content, False, None)

+ 0 - 151
SourceCode/TenderCrawler/app/main/runner.py

@@ -1,151 +0,0 @@
-from dateutil import parser
-import schedule
-
-from utils.logger_helper import LoggerHelper
-from utils.config_helper import ConfigHelper
-from stores.mysql_data_store import MysqlDataStore
-from models.url_setting import UrlSetting
-from main.data_collector import DataCollector
-from main.data_process import DataProcess
-from main.data_send import DataSend
-from utils.email_helper import EmailHelper
-from utils.file_helper import FileHelper
-
-class Runner:
-    logger = LoggerHelper.get_logger()
-    config = ConfigHelper()
-    store = MysqlDataStore()  # 复用 store 对象
-
-    def run(self):
-        self.logger.info("应用程序已启动!")
-        urls = UrlSetting().fetch_all()
-        if not urls or len(urls) == 0:
-            self.logger.error("未找到任何 URL 设置")
-            return
-        self.logger.info(f"共找到 {len(urls)} 个 URL 设置")
-
-        collect_time = self.config.get("schedule.collect")
-        process_time = self.config.get("schedule.process")
-        send_email_time = self.config.get("schedule.send_email")
-
-        collect_times = self._validate_and_format_time(collect_time, ["06:00"])
-        for time in collect_times:
-            self.logger.info(f"{time} 执行 采集处理数据 任务")
-            schedule.every().day.at(time).do(self._collect_process_job)
-
-        process_times = self._validate_and_format_time(
-            process_time, ["10:00", "15:00", "19:00"])
-        for time in process_times:
-            self.logger.info(f"{time} 执行  AI处理数据  任务")
-            schedule.every().day.at(time).do(self._process_job)
-
-        send_email_times = self._validate_and_format_time(
-            send_email_time, ["08:20", "14:00"])
-        for time in send_email_times:
-            self.logger.info(f"{time} 执行   发送邮件   任务")
-            schedule.every().day.at(time).do(self._send_job)
-        if self.config.get_bool("schedule.run_now"):
-            self.logger.info("立即执行任务")
-            self._collect_process_job()
-            # self._send_job()
-            # self._process_job()
-
-    def _collect_process_job(self):
-        try:
-            self.logger.info("开始执行数据采集处理任务")
-            url_setting = UrlSetting()
-            for url_setting in url_setting.fetch_all():
-                data_collector =None
-                try:
-                    self.logger.info(f"开始采集: {url_setting.url}")
-                    data_collector = DataCollector(url_setting.adapter_type,
-                                                   url_setting.url,
-                                                   url_setting.username,
-                                                   url_setting.password,
-                                                   self.store)
-                    keywords = url_setting.keywords
-                    keyword_array = keywords.split(',')
-                    for keyword in keyword_array:
-                        data_collector.collect(keyword)
-                    self.logger.info(f"采集完成: {url_setting.url}")
-                except Exception as e:
-                    self._send_error_email(
-                        "数据采集",
-                        f"\n    Type: {url_setting.adapter_type} \n    Url: {url_setting.url}\n    错误: {str(e)}"
-                    )
-                    self.logger.error(f"采集发生异常: {e}")
-                finally:
-                    if data_collector:
-                        data_collector.close()
-
-                try:
-                    self.logger.info(f"开始AI处理: {url_setting.url}")
-                    data_process = DataProcess(self.store)
-                    data_process.process()
-                except Exception as e:
-                    self._send_error_email(
-                        "AI数据处理",
-                        f"\n    Type: {url_setting.adapter_type} \n    Url: {url_setting.url}\n    错误: {str(e)}"
-                    )
-                    self.logger.error(f"AI处理发生异常: {e}")
-                    break  # 中断当前 URL 设置的处理
-            self.logger.info("数据采集处理任务执行完毕")
-        except Exception as e:
-            self.logger.error(f"数据采集处理任务执行失败: {e}")
-
-    def _process_job(self):
-        try:
-            self.logger.info("开始AI处理数据执行任务")
-            data_process = DataProcess(self.store)
-            data_process.process()
-            self.logger.info("AI处理数据任务执行完毕")
-        except Exception as e:
-            self._send_error_email("AI数据处理", f"\n    错误: {str(e)}")
-            self.logger.error(f"AI任务执行失败: {e}")
-
-    def _send_job(self):
-        try:
-            self.logger.info("开始邮件发送执行任务")
-            DataSend(self.store).send()
-            self.logger.info("邮件发送任务执行完毕")
-        except Exception as e:
-            self._send_error_email("邮件发送", f"\n    错误: {str(e)}")
-            self.logger.error(f"邮件发送任务执行失败: {e}")
-
-    def _validate_and_format_time(self, time_str, default_time: list):
-        """验证并格式化时间字符串"""
-        if not time_str:
-            return default_time
-        time_str = time_str.strip().replace(',', ',')
-        # 分割字符串为列表
-        items = [
-            item.strip().strip("'").strip('"') for item in time_str.split(',')
-        ]
-
-        # 初始化结果列表
-        formatted_times = []
-
-        for item in items:
-            if not item:
-                continue  # 跳过空字符串
-            try:
-                item = item.replace(':', ':')
-                # 使用 dateutil.parser 解析时间字符串
-                parsed_time = parser.parse(item).time().strftime('%H:%M:%S')
-                formatted_times.append(parsed_time)
-            except Exception as e:
-                self.logger.error(f"配置时间解析错误: {item},: {e} ")
-        if len(formatted_times) == 0:
-            self.logger.error(f"解析时间失败,使用默认时间 {default_time}")
-            return default_time
-        return formatted_times
-
-    def _send_error_email(self, title: str, error: str) -> None:
-        email_helper = EmailHelper()
-        email = self.config.get("email.error_email")
-        self.logger.info(f"发送错误邮件: {email}")
-        if not email:
-            return
-        title = f"{title}异常"
-        content = f"{title},请及时处理。\n\n异常信息:{error}"
-        email_helper.send_email(email, title, content, False, None)

+ 42 - 6
SourceCode/TenderCrawler/app/models/area_email.py

@@ -3,24 +3,35 @@ from utils.mysql_helper import MySQLHelper
 
 class AreaEmail:
 
-    def __init__(self, name=None, area=None, email=None,is_active=None,remark=None):
+    def __init__(
+        self,
+        name=None,
+        area=None,
+        email=None,
+        is_virtual=None,
+        is_active=None,
+        remark=None,
+    ):
         self.name = name
         self.area = area
         if email is None:
             email = ""
         self.email = email.replace(",", ",")
+        self.is_virtual = is_virtual
         self.is_active = is_active
+
         self.remark = remark
 
     def __repr__(self):
         return (
             f"<AreaEmail(name={self.name},area={self.area}, email={self.email}, "
-            f"is_active={self.is_active}, remark={self.remark})>")
+            f"is_active={self.is_active}, remark={self.remark})>"
+        )
 
     def to_dict(self):
         return {
-            'area': self.area,
-            'email': self.email,
+            "area": self.area,
+            "email": self.email,
         }
 
     # # 插入 AreaEmail 数据
@@ -35,8 +46,13 @@ class AreaEmail:
     #                   area_email.remark)
     #         db_helper.execute_non_query(query, params)
 
-    _query = "SELECT name,area,email FROM t_area_email WHERE is_active = 1"
+    _query = "SELECT name,area,email FROM t_area_email WHERE is_virtual = 0 and is_active = 1"
+    _query_virtual = "SELECT name,area,email FROM t_area_email WHERE is_virtual = 1 and is_active = 1"
+    _query_master = (
+        "SELECT email FROM t_area_email WHERE name='master' AND is_active = 1"
+    )
     _query_by_area = "SELECT email FROM t_area_email WHERE CONCAT(area,',') like %s AND is_active = 1"
+
     # 查询 AreaEmail 数据
     def fetch_all(self):
         with MySQLHelper() as db_helper:
@@ -44,10 +60,30 @@ class AreaEmail:
             data = [AreaEmail(**result) for result in results]
             return data
 
+    def fetch_all_virtual(self):
+        with MySQLHelper() as db_helper:
+            results = db_helper.execute_query(self._query_virtual)
+            data = [AreaEmail(**result) for result in results]
+            return data
+
     def fetch_one_by_area(self, area: str):
         with MySQLHelper() as db_helper:
-            params = ('%' + area + ',%', )
+            params = ("%" + area + ",%",)
             result = db_helper.fetch_one(self._query_by_area, params)
             if result is None:
                 return None
             return result["email"]
+
+    def fetch_master_email(self):
+        with MySQLHelper() as db_helper:
+            result = db_helper.fetch_one(self._query_master)
+            if result is None:
+                return None
+            return result["email"]
+
+    _update_area_query = "UPDATE t_area_email SET area = %s WHERE name = %s"
+
+    def update_area_email_area_by_name(self, name: str, area: str):
+        with MySQLHelper() as db_helper:
+            params = (area, name)
+            db_helper.execute_non_query(self._update_area_query, params)

+ 122 - 92
SourceCode/TenderCrawler/app/models/collect_data.py

@@ -1,26 +1,33 @@
 from datetime import datetime
+
+import utils
 from utils.mysql_helper import MySQLHelper
-from utils.logger_helper import LoggerHelper
 
 
 class CollectData:
 
-    logger = LoggerHelper.get_logger()
     UNPROCESSED = 0
     PROCESSED = 1
     INVALID = 2
 
-    def __init__(self,
-                 url=None,
-                 keyword=None,
-                 content=None,
-                 attach_path=None,
-                 status=UNPROCESSED,
-                 create_time=None,
-                 process_time=None):
+    DATA_TYPE_0 = 0
+    DATA_TYPE_RESULT = 1
+
+    def __init__(
+        self,
+        url=None,
+        keyword=None,
+        content=None,
+        data_type=None,
+        attach_path=None,
+        status=UNPROCESSED,
+        create_time=None,
+        process_time=None,
+    ):
         self.url = url
         self.keyword = keyword
         self.content = content
+        self.data_type = data_type
         self.attach_path = attach_path
         self.status = status
         self.create_time = create_time or datetime.now()
@@ -34,25 +41,31 @@ class CollectData:
         )
 
     _insert_query = """
-        INSERT IGNORE INTO t_collect_data (url, keyword, content, attach_path, status, create_time)
-        VALUES (%s, %s, %s, %s, %s, %s);
+        INSERT IGNORE INTO t_collect_data (url, keyword, content, data_type, attach_path, status, create_time)
+        VALUES (%s, %s, %s, %s, %s, %s, %s);
         """
     _insert_query_history = """
-         INSERT IGNORE INTO t_collect_data_history (url, keyword, content, attach_path, status, create_time)
-         VALUES (%s, %s, %s, %s, %s, %s);
+         INSERT IGNORE INTO t_collect_data_history (url, keyword, content, data_type, attach_path, status, create_time)
+         VALUES (%s, %s, %s, %s, %s, %s, %s);
          """
     _delete_query = """
-         DELETE FROM t_collect_data
-         WHERE url = %s;
+         DELETE FROM t_collect_data  WHERE url = %s;
          """
+
     def insert(self, collect_data):
         if not isinstance(collect_data, self.__class__):
             raise TypeError("collect_data 不是 CollectData 的实例")
         with MySQLHelper() as db_helper:
 
-            params = (collect_data.url, collect_data.keyword,
-                       collect_data.content,collect_data.attach_path,
-                      collect_data.status, datetime.now())
+            params = (
+                collect_data.url,
+                collect_data.keyword,
+                collect_data.content,
+                collect_data.data_type,
+                collect_data.attach_path,
+                collect_data.status,
+                datetime.now(),
+            )
             if collect_data.status == self.INVALID:
                 db_helper.execute_non_query(self._insert_query_history, params)
             else:
@@ -60,8 +73,9 @@ class CollectData:
 
     def insert_batch(self, collect_data_list):
         if not all(
-                isinstance(collect_data, self.__class__)
-                for collect_data in collect_data_list):
+            isinstance(collect_data, self.__class__)
+            for collect_data in collect_data_list
+        ):
             raise TypeError("collect_data_list 中的所有元素必须是 CollectData 的实例")
 
         params = [
@@ -69,10 +83,12 @@ class CollectData:
                 collect_data.url,
                 collect_data.keyword,
                 collect_data.content,
+                collect_data.data_type,
                 collect_data.attach_path,
                 collect_data.status,
-                datetime.now()  # 每次调用 datetime.now() 获取当前时间
-            ) for collect_data in collect_data_list
+                datetime.now(),  # 每次调用 datetime.now() 获取当前时间
+            )
+            for collect_data in collect_data_list
             if collect_data.status != 2
         ]
         params2 = [
@@ -80,21 +96,23 @@ class CollectData:
                 collect_data.url,
                 collect_data.keyword,
                 collect_data.content,
+                collect_data.data_type,
                 collect_data.attach_path,
                 collect_data.status,
-                datetime.now()  # 每次调用 datetime.now() 获取当前时间
-            ) for collect_data in collect_data_list
+                datetime.now(),  # 每次调用 datetime.now() 获取当前时间
+            )
+            for collect_data in collect_data_list
             if collect_data.status == 2
         ]
 
         with MySQLHelper() as db_helper:
             db_helper.execute_non_query(self._insert_query, params)
-            # 获取受影响的行数
-            affected_rows = db_helper.connection.affected_rows()
-
+            affected_rows1 = db_helper.connection.affected_rows()
+            utils.get_logger().info(f"成功插入 {affected_rows1} 条有效数据")
             db_helper.execute_non_query(self._insert_query_history, params2)
-            self.logger.info(f"成功插入 {affected_rows} 条数据")
-            return affected_rows
+            affected_rows2 = db_helper.connection.affected_rows()
+            utils.get_logger().info(f"成功插入 {affected_rows2} 条无效历史数据")
+            return affected_rows1 + affected_rows2
 
     # def insert_url(self, url: str, keyword: str, content: str):
     #     with MySQLHelper() as db_helper:
@@ -112,51 +130,51 @@ class CollectData:
     #         data = [CollectData(**result) for result in results]
     #         return data
 
+    _query = "SELECT url FROM t_collect_data"
+
     def fetch_all_urls(self) -> list[str]:
         with MySQLHelper() as db_helper:
-            query = "SELECT url FROM t_collect_data"
-            results = db_helper.execute_query(query)
+            results = db_helper.execute_query(self._query)
             # 使用列表推导式一次性提取所有 'url' 值
-            data = [result['url'] for result in results]
+            data = [result["url"] for result in results]
             return data
 
+    _process_url_query = "SELECT url FROM t_collect_data WHERE status = 0"
+
     def fetch_urls_to_process(self) -> list[str]:
         with MySQLHelper() as db_helper:
-            query = """
-            SELECT url
-            FROM t_collect_data
-            WHERE status = 0
-            """
-            results = db_helper.execute_query(query)
-            data = [result['url'] for result in results]
+            results = db_helper.execute_query(self._process_url_query)
+            data = [result["url"] for result in results]
             return data
 
+    _one_url_query = "SELECT url FROM `t_collect_data_history` WHERE url= %s UNION SELECT url FROM `t_collect_data`  WHERE url= %s LIMIT 1"
+
     def fetch_one_url(self, url: str):
         with MySQLHelper() as db_helper:
-            query = """
-             SELECT url FROM `t_collect_data_history` WHERE url= %s UNION SELECT url FROM `t_collect_data`  WHERE url= %s LIMIT 1
-            """
-            result = db_helper.fetch_one(query, (url, url))
+            result = db_helper.fetch_one(self._one_url_query, (url, url))
             if not result:
                 return None
             data = result["url"]
             return data
 
+    _one_collect_by_url_query = "SELECT url,keyword,content,data_type,attach_path,status FROM t_collect_data WHERE url = %s  LIMIT 1"
+
     def fetch_one_collect_by_url(self, url: str):
         with MySQLHelper() as db_helper:
-            query = """
-                SELECT url,keyword,content,status FROM t_collect_data WHERE url = %s  LIMIT 1
-            """
-            result = db_helper.fetch_one(query, (url, ))
+            result = db_helper.fetch_one(self._one_collect_by_url_query, (url,))
             if not result:
                 return None
-            data = CollectData(url=result["url"],
-                               keyword=result["keyword"],
-                               content=result["content"],
-                               status=result["status"])
+            data = CollectData(
+                url=result["url"],
+                keyword=result["keyword"],
+                content=result["content"],
+                data_type=result["data_type"],
+                attach_path=result["attach_path"],
+                status=result["status"],
+            )
             return data
 
-    def set_process(self, url: str):
+    def set_process(self, url):
         # with MySQLHelper() as db_helper:
         #     query = """
         #     UPDATE t_collect_data
@@ -164,61 +182,73 @@ class CollectData:
         #     WHERE url = %s
         #     """
         #     db_helper.execute_non_query(query, (url))
-        self.move_to_history_and_delete(url)
+        urls = [url]
+        self.move_to_history_and_delete(urls)
 
+    def set_process_list(self, urls: list):
+        self.move_to_history_and_delete(urls)
 
-    def move_to_history_and_delete(self, url: str):
+    def move_to_history_and_delete(self, urls: list):
         with MySQLHelper() as db_helper:
             # 查询 t_collect_data 中的数据
-            query = """
-             SELECT url, keyword, content, attach_path, status, create_time, process_time
-             FROM t_collect_data
-             WHERE url = %s
-             """
-            result = db_helper.fetch_one(query, (url,))
-            if not result:
-                self.logger.warning(f"URL {url} 未在 t_collect_data 中找到,无法移动到历史表并删除。")
+            placeholders = ", ".join(["%s"] * len(urls))
+            query = f"""
+                       SELECT url, keyword, content, data_type, attach_path, status, create_time, process_time
+                       FROM t_collect_data
+                       WHERE url IN  ({placeholders})
+                       """
+            results = db_helper.execute_query(query, urls)
+            if not results:
+                utils.get_logger().warning(
+                    f"URLs {urls} 未在 t_collect_data 中找到,无法移动到历史表并删除。"
+                )
                 return False
 
             # 将数据插入到 t_collect_data_history
             insert_query = self._insert_query_history
-            insert_params = (
-                result["url"],
-                result["keyword"],
-                result["content"],
-                result["attach_path"],
-                result["status"],
-                result["create_time"]
-            )
+            insert_params = [
+                (
+                    result["url"],
+                    result["keyword"],
+                    result["content"],
+                    result["data_type"],
+                    result["attach_path"],
+                    result["status"],
+                    result["create_time"],
+                )
+                for result in results
+            ]
             db_helper.execute_non_query(insert_query, insert_params)
 
             # 删除 t_collect_data 中的数据
-            delete_query = self._delete_query
-            delete_params = (url,)
-            db_helper.execute_non_query(delete_query, delete_params)
+            delete_query = f"DELETE FROM t_collect_data WHERE url IN ({placeholders})"
+            db_helper.execute_non_query(delete_query, urls)
 
-            self.logger.info(f"URL {url} 已从 t_collect_data 移动到 t_collect_data_history 并删除。")
+            utils.get_logger().info(
+                f"URLs {urls} 已从 t_collect_data 移动到 t_collect_data_history 并删除。"
+            )
             return True
 
-    def fetch_by_status(self, status=0):
-        with MySQLHelper() as db_helper:
-            query = """
-            SELECT url, keyword, content, status, create_time, process_time
-            FROM t_collect_data
-            WHERE status = %s
-            """
-            results = db_helper.execute_query(query, (status, ))
-            data = [CollectData(**result) for result in results]
-            return data
+    _update_status_query = " UPDATE t_collect_data SET status = %s WHERE url = %s"
 
     def set_status(self, collect_data):
-        if not isinstance(collect_data, self):
+        if not isinstance(collect_data, CollectData):
             raise TypeError("collect_data 不是 CollectData 的实例")
         with MySQLHelper() as db_helper:
-            query = """
-            UPDATE t_collect_data
-            SET status = %s
-            WHERE url = %s
-            """
             params = (collect_data.status, collect_data.url)
-            db_helper.execute_non_query(query, params)
+            db_helper.execute_non_query(self._update_status_query, params)
+
+    _delete_before_date_history_query = (
+        "DELETE FROM t_collect_data_history WHERE create_time < %s"
+    )
+    _delete_before_date_query = "DELETE FROM t_collect_data WHERE create_time < %s "
+
+    def delete_before_date(self, date: str):
+        with MySQLHelper() as db_helper:
+            params = (date,)
+            db_helper.execute_non_query(self._delete_before_date_history_query, params)
+            affected_rows = db_helper.connection.affected_rows()
+            db_helper.execute_non_query(self._delete_before_date_query, params)
+            affected_rows += db_helper.connection.affected_rows()
+            utils.get_logger().info(f"删除 {date} 之前共 {affected_rows} 条 采集记录。")
+            return affected_rows

+ 135 - 94
SourceCode/TenderCrawler/app/models/process_data.py

@@ -1,36 +1,39 @@
+import utils
 from datetime import datetime
+
 from utils.mysql_helper import MySQLHelper
-from utils.config_helper import ConfigHelper
-from utils.logger_helper import LoggerHelper
 
 
 class ProcessData:
 
-    logger = LoggerHelper.get_logger()
-
-    def __init__(self,
-                 no=None,
-                 title=None,
-                 url=None,
-                 keyword=None,
-                 date=None,
-                 area=None,
-                 address=None,
-                 summary=None,
-                 release_date=None,
-                 devices=None,
-                 attach_path=None,
-                 status=None,
-                 create_time=None,
-                 send_time=None,
-                 other_urls=None,
-                 remark=None):
+    def __init__(
+        self,
+        no=None,
+        title=None,
+        url=None,
+        keyword=None,
+        date=None,
+        area=None,
+        address=None,
+        summary=None,
+        release_date=None,
+        devices=None,
+        attach_path=None,
+        status=None,
+        create_time=None,
+        send_time=None,
+        other_urls=None,
+        prompt_tokens=None,
+        completion_tokens=None,
+        total_tokens=None,
+        remark=None,
+    ):
         self.no = no
         self.title = title
         self.url = url
         self.date = date
         if not area:
-            area = ConfigHelper().get("default_area", "全国")
+            area = utils.get_config_value("default_area", "全国")
         self.area = area.replace(" ", "")
         self.keyword = keyword
         self.address = address
@@ -42,6 +45,9 @@ class ProcessData:
         self.create_time = create_time or datetime.now()
         self.send_time = send_time
         self.other_urls = other_urls
+        self.prompt_tokens = prompt_tokens
+        self.completion_tokens = completion_tokens
+        self.total_tokens = total_tokens
         self.remark = remark
 
     def __repr__(self):
@@ -49,47 +55,22 @@ class ProcessData:
             f"ProcessData(no={self.no}, title={self.title}, date={self.date}, "
             f"area={self.area}, address={self.address}, summary={self.summary}, "
             f"status={self.status}, create_time={self.create_time}, "
-            f"send_time={self.send_time}, remark={self.remark})")
+            f"send_time={self.send_time}, remark={self.remark})"
+        )
 
     _insert_query = """
-              INSERT IGNORE INTO t_data (no, title, url, keyword, date, area, address, summary, release_date, devices, attach_path, status, create_time)
-              VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+              INSERT IGNORE INTO t_data (no, title, url, keyword, date, area, address, summary, release_date, devices, attach_path, status, create_time, prompt_tokens, completion_tokens, total_tokens)
+              VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
           """
-    _update_query = """
-                UPDATE t_collect_data SET status = 1 WHERE url = %s;
-            """
+
+    # _update_query = """
+    #             UPDATE t_collect_data SET status = 1 WHERE url = %s;
+    #         """
     def insert(self, process_data):
         if not isinstance(process_data, self.__class__):
             raise TypeError("process_data 不是 ProcessData 的实例")
 
-        insert_params = (process_data.no,
-                         process_data.title,
-                         process_data.url,
-                         process_data.keyword,
-                         process_data.date,
-                         process_data.area,
-                         process_data.address,
-                         process_data.summary,
-                         process_data.release_date,
-                         process_data.devices,
-                         process_data.attach_path,
-                         0,
-                         datetime.now())
-
-        update_params = (process_data.url, )
-
-        with MySQLHelper() as db_helper:
-            db_helper.execute_non_query(self._insert_query, insert_params)
-            db_helper.execute_non_query(self._update_query, update_params)
-
-    def insert_batch(self, process_data_list):
-        if not all(
-                isinstance(process_data, self.__class__)
-                for process_data in process_data_list):
-            raise TypeError("process_data_list 中的所有元素必须是 ProcessData 的实例")
-
-
-        insert_params = [(
+        insert_params = (
             process_data.no,
             process_data.title,
             process_data.url,
@@ -103,68 +84,128 @@ class ProcessData:
             process_data.attach_path,
             0,
             datetime.now(),
-        ) for process_data in process_data_list]
+            process_data.prompt_tokens,
+            process_data.completion_tokens,
+            process_data.total_tokens,
+        )
 
-        update_params = [(process_data.url, )
-                         for process_data in process_data_list]
+        # update_params = (process_data.url, )
+
+        with MySQLHelper() as db_helper:
+            db_helper.execute_non_query(self._insert_query, insert_params)
+            # db_helper.execute_non_query(self._update_query, update_params)
+
+    def insert_batch(self, process_data_list):
+        if not all(
+            isinstance(process_data, self.__class__)
+            for process_data in process_data_list
+        ):
+            raise TypeError("process_data_list 中的所有元素必须是 ProcessData 的实例")
+
+        insert_params = [
+            (
+                process_data.no,
+                process_data.title,
+                process_data.url,
+                process_data.keyword,
+                process_data.date,
+                process_data.area,
+                process_data.address,
+                process_data.summary,
+                process_data.release_date,
+                process_data.devices,
+                process_data.attach_path,
+                0,
+                datetime.now(),
+                process_data.prompt_tokens,
+                process_data.completion_tokens,
+                process_data.total_tokens,
+            )
+            for process_data in process_data_list
+        ]
+
+        # update_params = [(process_data.url, )
+        #                  for process_data in process_data_list]
 
         with MySQLHelper() as db_helper:
             db_helper.execute_non_query(self._insert_query, insert_params)
             affected_rows = db_helper.connection.affected_rows()
-            self.logger.info(f"成功插入 {affected_rows} 条数据")
-            for param in update_params:
-                db_helper.execute_non_query(self._update_query, param)
+            utils.get_logger().info(f"成功插入 {affected_rows} 条数据")
+            # for param in update_params:
+            #     db_helper.execute_non_query(self._update_query, param)
             return affected_rows
 
-    _one_query = """
-                    SELECT url,no,other_urls,attach_path FROM t_data WHERE no = %s  LIMIT 1
-                """
+    _one_url_query = (
+        "SELECT url,no,other_urls,attach_path FROM t_data WHERE url = %s  LIMIT 1"
+    )
+
+    def fetch_one_process_by_url(self, url: str):
+        with MySQLHelper() as db_helper:
+            result = db_helper.fetch_one(self._one_url_query, (url,))
+            if not result:
+                return None
+            data = ProcessData(
+                url=result["url"],
+                no=result["no"],
+                other_urls=result["other_urls"],
+                attach_path=result["attach_path"],
+            )
+            return data
+
+    _one_no_query = (
+        "SELECT url,no,other_urls,attach_path FROM t_data WHERE no = %s  LIMIT 1"
+    )
+
     def fetch_one_process_by_no(self, no: str):
         with MySQLHelper() as db_helper:
 
-            result = db_helper.fetch_one(self._one_query, (no, ))
+            result = db_helper.fetch_one(self._one_no_query, (no,))
             if not result:
                 return None
-            data = ProcessData(url=result["url"],
-                               no=result["no"],
-                               other_urls=result["other_urls"],
-                               attach_path=result["attach_path"])
+            data = ProcessData(
+                url=result["url"],
+                no=result["no"],
+                other_urls=result["other_urls"],
+                attach_path=result["attach_path"],
+            )
             return data
 
-    def fetch_no_send(self):
+    _not_send_query = "SELECT no, title, url, keyword, date, area, address, summary, attach_path, release_date FROM t_data WHERE status = 0"
+
+    def fetch_not_send(self):
         with MySQLHelper() as db_helper:
-            query = "SELECT no, title, url, keyword, date, area, address, summary, attach_path, release_date FROM t_data WHERE status = 0"
-            results = db_helper.execute_query(query)
+            results = db_helper.execute_query(self._not_send_query)
             data = [ProcessData(**result) for result in results]
             return data
 
+    _set_send_query = "UPDATE t_data SET status = 1, send_time = %s WHERE no = %s"
+
     def set_send(self, no):
         with MySQLHelper() as db_helper:
-            query = """
-            UPDATE t_data
-            SET status = 1, send_time = %s
-            WHERE no = %s
-            """
+
             params = (datetime.now(), no)
-            db_helper.execute_non_query(query, params)
+            db_helper.execute_non_query(self._set_send_query, params)
+
+    _update_other_urls_query = "UPDATE t_data SET other_urls = %s WHERE url = %s"
 
     def set_other_urls(self, url, other_urls):
         with MySQLHelper() as db_helper:
-            query = """
-            UPDATE t_data
-            SET other_urls = %s
-            WHERE url = %s
-            """
-            update_query = """
-            UPDATE t_collect_data SET status = 1 WHERE url = %s;
-            """
             params = (other_urls, url)
-            db_helper.execute_non_query(query, params)
-            db_helper.execute_non_query(update_query, (url, ))
+            db_helper.execute_non_query(self._update_other_urls_query, params)
+
+    _delete_before_date_query = "DELETE FROM t_data WHERE date < %s"
 
-    def check_is_process_by_url(self, url):
+    def delete_before_date(self, date: str):
+        """
+        删除指定日期之前的数据
+        :param date: 日期字符串,格式为 YYYY-MM-DD
+        :return: 删除的行数
+        """
         with MySQLHelper() as db_helper:
-            query = "SELECT * FROM t_data WHERE url = %s"
-            params = (url, )
-            results = db_helper.execute_query(query, params)
-            return True if results else False
+            params = (date,)
+            db_helper.execute_non_query(self._delete_before_date_query, params)
+            affected_rows = db_helper.connection.affected_rows()
+            utils.get_logger().info(
+                f"删除 {date} 之前共 {affected_rows} 条 招标处理记录。"
+            )
+            return affected_rows

+ 218 - 0
SourceCode/TenderCrawler/app/models/process_result_data.py

@@ -0,0 +1,218 @@
+import utils
+from datetime import datetime
+
+from utils.mysql_helper import MySQLHelper
+
+
+class ProcessResultData:
+
+    def __init__(
+        self,
+        no=None,
+        title=None,
+        url=None,
+        keyword=None,
+        date=None,
+        price=None,
+        bidder=None,
+        summary=None,
+        attach_path=None,
+        status=None,
+        create_time=None,
+        send_time=None,
+        other_urls=None,
+        prompt_tokens=None,
+        completion_tokens=None,
+        total_tokens=None,
+        remark=None,
+    ):
+        self.no = no
+        self.title = title
+        self.url = url
+        self.keyword = keyword
+        self.date = date
+        self.price = price
+        self.bidder = bidder
+        self.summary = summary
+        self.attach_path = attach_path
+        self.status = status
+        self.create_time = create_time or datetime.now()
+        self.send_time = send_time
+        self.other_urls = other_urls
+        self.prompt_tokens = prompt_tokens
+        self.completion_tokens = completion_tokens
+        self.total_tokens = total_tokens
+        self.remark = remark
+
+    def __repr__(self):
+        return (
+            f"ProcessResultData(no={self.no}, title={self.title}, date={self.date}, "
+            f"keyword={self.keyword}, price={self.price}, bidder={self.bidder}, summary={self.summary}, attach_path={self.attach_path}, "
+            f"status={self.status}, create_time={self.create_time}, "
+            f"send_time={self.send_time}, remark={self.remark})"
+        )
+
+    _insert_query = """
+              INSERT IGNORE INTO t_data_result (no, title, url, keyword, date, price,  bidder, summary, attach_path, status, create_time, prompt_tokens, completion_tokens, total_tokens)
+              VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+          """
+    # _update_query = """
+    #             UPDATE t_collect_data SET status = 1 WHERE url = %s;
+    #         """
+
+    def insert(self, process_result_data):
+        if not isinstance(process_result_data, self.__class__):
+            raise TypeError("process_result_data 不是 ProcessResultData 的实例")
+
+        insert_params = (
+            process_result_data.no,
+            process_result_data.title,
+            process_result_data.url,
+            process_result_data.keyword,
+            process_result_data.date,
+            process_result_data.price,
+            process_result_data.bidder,
+            process_result_data.summary,
+            process_result_data.attach_path,
+            0,
+            datetime.now(),
+            process_result_data.prompt_tokens,
+            process_result_data.completion_tokens,
+            process_result_data.total_tokens,
+        )
+
+        # update_params = (process_result_data.url, )
+
+        with MySQLHelper() as db_helper:
+            db_helper.execute_non_query(self._insert_query, insert_params)
+            # db_helper.execute_non_query(self._update_query, update_params)
+
+    def insert_batch(self, process_result_data_list):
+        if not all(
+            isinstance(process_result_data, self.__class__)
+            for process_result_data in process_result_data_list
+        ):
+            raise TypeError(
+                "process_result_data_list 中的所有元素必须是 ProcessResultData 的实例"
+            )
+
+        insert_params = [
+            (
+                process_result_data.no,
+                process_result_data.title,
+                process_result_data.url,
+                process_result_data.keyword,
+                process_result_data.date,
+                process_result_data.price,
+                process_result_data.bidder,
+                process_result_data.summary,
+                process_result_data.attach_path,
+                0,
+                datetime.now(),
+                process_result_data.prompt_tokens,
+                process_result_data.completion_tokens,
+                process_result_data.total_tokens,
+            )
+            for process_result_data in process_result_data_list
+        ]
+
+        # update_params = [(process_result_data.url, )
+        #                  for process_result_data in process_result_data_list]
+
+        with MySQLHelper() as db_helper:
+            db_helper.execute_non_query(self._insert_query, insert_params)
+            affected_rows = db_helper.connection.affected_rows()
+            utils.get_logger().info(f"成功插入 {affected_rows} 条数据")
+            # for param in update_params:
+            #     db_helper.execute_non_query(self._update_query, param)
+            return affected_rows
+
+    _one_url_query = """
+                      SELECT url,no,other_urls,attach_path FROM t_data_result WHERE url = %s  LIMIT 1
+                  """
+
+    def fetch_one_process_by_url(self, url: str):
+        with MySQLHelper() as db_helper:
+            result = db_helper.fetch_one(self._one_url_query, (url,))
+            if not result:
+                return None
+            data = ProcessResultData(
+                url=result["url"],
+                no=result["no"],
+                other_urls=result["other_urls"],
+                attach_path=result["attach_path"],
+            )
+            return data
+
+    _one_no_query = """
+                        SELECT url,no,other_urls,attach_path FROM t_data_result WHERE no = %s  LIMIT 1
+                    """
+
+    def fetch_one_process_by_no(self, no: str):
+        with MySQLHelper() as db_helper:
+            result = db_helper.fetch_one(self._one_no_query, (no,))
+            if not result:
+                return None
+            data = ProcessResultData(
+                url=result["url"],
+                no=result["no"],
+                other_urls=result["other_urls"],
+                attach_path=result["attach_path"],
+            )
+            return data
+
+    _not_send_query = "SELECT no, title, url, keyword, date, price, bidder, summary, attach_path, status, create_time, send_time FROM t_data_result WHERE status = 0"
+
+    def fetch_not_send(self):
+        with MySQLHelper() as db_helper:
+            results = db_helper.execute_query(self._not_send_query)
+            data = [ProcessResultData(**result) for result in results]
+            return data
+
+    _update_send_status_query = """
+           UPDATE t_data_result  SET status = 1, send_time = %s  WHERE no = %s
+           """
+
+    def set_send(self, no):
+        with MySQLHelper() as db_helper:
+            params = (datetime.now(), no)
+            db_helper.execute_non_query(self._update_send_status_query, params)
+
+    _update_other_urls_query = "UPDATE t_data_result SET other_urls = %s WHERE url = %s"
+
+    def set_other_urls(self, url, other_urls):
+        with MySQLHelper() as db_helper:
+            params = (other_urls, url)
+            db_helper.execute_non_query(self._update_other_urls_query, params)
+
+    _query_report = "select * from t_data_result where create_time between %s and %s"
+
+    def fetch_to_report_by_date(self, start_date, end_date):
+        """
+        获取需要生成报表的数据
+        :param start_date:
+        :param end_date:
+        :return:
+        """
+        with MySQLHelper() as db_helper:
+            params = (start_date, end_date)
+            results = db_helper.execute_query(self._query_report, params)
+            data = [ProcessResultData(**result) for result in results]
+            return data
+
+    _delete_before_date_query = "DELETE FROM t_data_result WHERE create_time < %s"
+
+    def delete_before_date(self, date: str):
+        """
+        删除指定日期之前的数据
+        :param date:
+        :return:
+        """
+        with MySQLHelper() as db_helper:
+            params = (date,)
+            db_helper.execute_non_query(self._delete_before_date_query, params)
+            affected_rows = db_helper.connection.affected_rows()
+            utils.get_logger().info(
+                f"删除 {date} 之前共 {affected_rows} 条 中标处理记录。"
+            )
+            return affected_rows

+ 17 - 19
SourceCode/TenderCrawler/app/models/url_setting.py

@@ -3,14 +3,16 @@ from utils.mysql_helper import MySQLHelper
 
 class UrlSetting:
 
-    def __init__(self,
-                 url=None,
-                 adapter_type=None,
-                 username=None,
-                 password=None,
-                 keywords=None,
-                 sort=None,
-                 is_active=None):
+    def __init__(
+        self,
+        url=None,
+        adapter_type=None,
+        username=None,
+        password=None,
+        keywords=None,
+        sort=None,
+        is_active=None,
+    ):
         self.url = url
         self.adapter_type = adapter_type
         self.username = username
@@ -21,21 +23,17 @@ class UrlSetting:
         self.sort = sort or 0
         self.is_active = is_active
 
-
     def __repr__(self):
-        return (
-            f"<UrlSetting(url={self.url}, type={self.adapter_type}, "
-            f"username={self.username}, keywords={self.keywords}, is_active={self.is_active})>"
-        )
+        return f"URL配置[ url: {self.url}  type: {self.adapter_type} keywords: {self.keywords}]"
 
     def to_dict(self):
         return {
-            'url': self.url,
-            'type': self.adapter_type,
-            'username': self.username,
-            'password': self.password,
-            'keywords': self.keywords,
-            'is_active': self.is_active
+            "url": self.url,
+            "type": self.adapter_type,
+            "username": self.username,
+            "password": self.password,
+            "keywords": self.keywords,
+            "is_active": self.is_active,
         }
 
     # # 插入 URL 设置数据

+ 53 - 0
SourceCode/TenderCrawler/app/stores/data_store_interface.py

@@ -1,6 +1,9 @@
 from abc import ABC, abstractmethod
+
+from models.area_email import AreaEmail
 from models.collect_data import CollectData
 from models.process_data import ProcessData
+from models.process_result_data import ProcessResultData
 
 
 class IDataStore(ABC):
@@ -20,6 +23,10 @@ class IDataStore(ABC):
     def save_collect_data(self, is_force=False):
         raise NotImplementedError("save 应由子类重写。")
 
+    @abstractmethod
+    def set_collect_process(self, url):
+        raise NotImplementedError("set_collect_process 应由子类重写。")
+
     @abstractmethod
     def query_urls_to_process(self):
         raise NotImplementedError("query_to_process 应由子类重写。")
@@ -32,6 +39,10 @@ class IDataStore(ABC):
     def query_one_process_by_no(self, no):
         raise NotImplementedError("query_one_process_by_no 应由子类重写。")
 
+    @abstractmethod
+    def query_one_process_by_url(self, no):
+        raise NotImplementedError("query_one_process_by_url 应由子类重写。")
+
     @abstractmethod
     def insert_process_data(self, data: ProcessData):
         raise NotImplementedError("insert_process_data 应由子类重写。")
@@ -44,14 +55,56 @@ class IDataStore(ABC):
     def set_process_other_urls(self, url, other_urls: str):
         raise NotImplementedError("save_process_data 应由子类重写。")
 
+    @abstractmethod
+    def query_one_process_result_by_url(self, url):
+        raise NotImplementedError("query_one_process_result_by_url 应由子类重写。")
+
+    @abstractmethod
+    def query_one_process_result_by_no(self, no):
+        raise NotImplementedError("query_one_process_result_by_no 应由子类重写。")
+
+    @abstractmethod
+    def insert_process_result_data(self,
+                                   data: ProcessResultData,
+                                   is_batch=True):
+        raise NotImplementedError("insert_process_result_data 应由子类重写。")
+
+    @abstractmethod
+    def save_process_result_data(self, is_force=False):
+        raise NotImplementedError("save_process_result_data 应由子类重写。")
+
+    @abstractmethod
+    def set_process_result_other_urls(self, url, other_urls: str):
+        raise NotImplementedError("set_process_result_other_urls 应由子类重写。")
+
     @abstractmethod
     def query_to_send(self):
         raise NotImplementedError("query_to_send 应由子类重写。")
 
+    @abstractmethod
+    def query_to_report_by_date(self, start_date, end_date):
+        raise NotImplementedError("query_to_report_by_date 应由子类重写。")
+
     @abstractmethod
     def set_send(self, no: str):
         raise NotImplementedError("set_send 应由子类重写。")
 
+    @abstractmethod
+    def query_all_emails(self) -> list[AreaEmail]:
+        raise NotImplementedError("get_emails 应由子类重写。")
+
+    @abstractmethod
+    def query_all_virtual_emails(self):
+        raise NotImplementedError("get_email_by_area 应由子类重写。")
+
+    @abstractmethod
+    def query_master_email(self) -> str:
+        raise NotImplementedError("get_master_email 应由子类重写。")
+
     @abstractmethod
     def get_email_by_area(self, area: str):
         raise NotImplementedError("get_email_by_area 应由子类重写。")
+
+    @abstractmethod
+    def update_area_email_area_by_name(self, name: str, area: str):
+        raise NotImplementedError("update_area_email_area_by_name 应由子类重写。")

+ 54 - 19
SourceCode/TenderCrawler/app/stores/default_data_store.py

@@ -1,46 +1,81 @@
-from utils.logger_helper import LoggerHelper
+import utils
+from models.process_result_data import ProcessResultData
 from stores.data_store_interface import IDataStore
 
 
 class DefaultDataStore(IDataStore):
 
-
-
-    logger = LoggerHelper.get_logger()
-
     def __init__(self):
         pass
 
-    def query_one_collect_url(self, url: str) :
-        self.logger.info(f"Default: fetch_one_url")
-    def insert_collect_data(self, data , is_batch=True):
-        self.logger.info(f"Default: insert_collect_data")
+    def query_one_collect_url(self, url: str):
+        utils.get_logger().info("Default: FETCH_ONE_URL")
 
+    def insert_collect_data(self, data, is_batch=True):
+        utils.get_logger().info("Default: INSERT_COLLECT_DATA")
 
     def save_collect_data(self, is_force=False):
-        self.logger.info("Default: SAVE")
+        utils.get_logger().info("Default: SAVE_COLLECT_DATA")
+
+    def set_collect_process(self, url):
+        utils.get_logger().info("Default: SET_COLLECT_PROCESS")
 
     def query_urls_to_process(self):
-        self.logger.info("Default: QUERY_TO_PROCESS")
+        utils.get_logger().info("Default: QUERY_TO_PROCESS")
 
     def query_one_collect_by_url(self, url):
-        self.logger.info("Default: QUERY_ONE_PROCESS")
+        utils.get_logger().info("Default: QUERY_ONE_PROCESS")
+
+    def query_one_process_by_url(self, no):
+        utils.get_logger().info("Default: query_one_process_by_url")
+
     def query_one_process_by_no(self, no):
-        self.logger.info(f"Default: query_one_process_by_no")
+        utils.get_logger().info("Default: query_one_process_by_no")
+
     def insert_process_data(self, data):
-        self.logger.info("Default: INSERT_PROCESS_DATA")
+        utils.get_logger().info("Default: INSERT_PROCESS_DATA")
 
     def save_process_data(self, is_force=False):
-        self.logger.info("Default: SAVE_PROCESS_DATA")
+        utils.get_logger().info("Default: SAVE_PROCESS_DATA")
 
     def set_process_other_urls(self, url, other_urls: str):
-        self.logger.info("Default: SET_PROCESS_OTHER_URLS")
+        utils.get_logger().info("Default: SET_PROCESS_OTHER_URLS")
+
+    def query_one_process_result_by_url(self, url):
+        utils.get_logger().info("Default: QUERY_ONE_PROCESS_RESULT_BY_URL")
+
+    def query_one_process_result_by_no(self, no):
+        utils.get_logger().info("Default: QUERY_ONE_PROCESS_RESULT_BY_NO")
+
+    def insert_process_result_data(self, data: ProcessResultData, is_batch=True):
+        utils.get_logger().info("Default: INSERT_PROCESS_RESULT_DATA")
+
+    def save_process_result_data(self, is_force=False):
+        utils.get_logger().info("Default: SAVE_PROCESS_RESULT_DATA")
+
+    def set_process_result_other_urls(self, url, other_urls: str):
+        utils.get_logger().info("Default: SET_PROCESS_RESULT_OTHER_URLS")
 
     def query_to_send(self):
-        self.logger.info("Default: QUERY_TO_SEND")
+        utils.get_logger().info("Default: QUERY_TO_SEND")
+
+    def query_to_report_by_date(self, start_date, end_date):
+        utils.get_logger().info("Default: QUERY_TO_REPORT_BY_DATE")
 
     def set_send(self, no: str):
-        self.logger.info("Default: SET_SEND")
+        utils.get_logger().info("Default: SET_SEND")
+
+    def query_all_emails(self):
+        utils.get_logger().info("Default: QUERY_ALL_EMAILS")
+
+    def query_all_virtual_emails(self):
+        utils.get_logger().info("Default: QUERY_ALL_VIRTUAL_EMAILS")
+
+    def query_master_email(self):
+        utils.get_logger().info("Default: GET_MASTER_EMAIL")
 
     def get_email_by_area(self, area: str):
-        self.logger.info("Default: GET_EMAIL_BY_AREA")
+        utils.get_logger().info("Default: GET_EMAIL_BY_AREA")
+
+    def update_area_email_area_by_name(self, name: str, area: str):
+        utils.get_logger().info("Default: UPDATE_AREA_EMAIL_AREA_BY_NAME")

+ 86 - 16
SourceCode/TenderCrawler/app/stores/mysql_data_store.py

@@ -1,37 +1,43 @@
-from utils.logger_helper import LoggerHelper
-from utils.config_helper import ConfigHelper
-from stores.data_store_interface import IDataStore
+import utils
+from models.area_email import AreaEmail
 from models.collect_data import CollectData
 from models.process_data import ProcessData
-from models.area_email import AreaEmail
+from models.process_result_data import ProcessResultData
+from stores.data_store_interface import IDataStore
 
 
 class MysqlDataStore(IDataStore):
 
-    logger = LoggerHelper.get_logger()
-    config = ConfigHelper()
     _collectData = CollectData()
     _processData = ProcessData()
+    _processResultData = ProcessResultData()
     _areaEmail = AreaEmail()
 
     def __init__(self):
-        self._collect_size = self.config.get_int('save.collect_batch_size',1)
+        self._collect_size = utils.get_config_int("save.collect_batch_size", 1)
         self._collect_list = []
-        self._process_size = self.config.get_int('save.process_batch_size',1)
+        self._process_size = utils.get_config_int("save.process_batch_size", 1)
         self._process_list = []
+        self._process_result_list = []
 
     def query_one_collect_url(self, url: str) -> str | None:
         return self._collectData.fetch_one_url(url)
+
     def insert_collect_data(self, data: CollectData, is_batch=True):
         if not is_batch:
             self._collectData.insert(data)
+            utils.get_logger().info(f"保存 采集数据 到数据库: {data.url}")
         else:
             self._collect_list.append(data)
             self.save_collect_data()
 
     def save_collect_data(self, is_force=False):
-        if (is_force and len(self._collect_list)>0) or len(self._collect_list) >= self._collect_size:
-            self.logger.info("批量保存到数据库,数量: " + str(len(self._collect_list)))
+        if (is_force and len(self._collect_list) > 0) or len(
+            self._collect_list
+        ) >= self._collect_size:
+            utils.get_logger().info(
+                "批量保存 采集数据 到数据库,数量: " + str(len(self._collect_list))
+            )
             self._collectData.insert_batch(self._collect_list)
             self._collect_list = []
 
@@ -41,35 +47,99 @@ class MysqlDataStore(IDataStore):
     def query_one_collect_by_url(self, url):
         return self._collectData.fetch_one_collect_by_url(url)
 
+    def query_one_process_by_url(self, url):
+        return self._processData.fetch_one_process_by_url(url)
+
     def query_one_process_by_no(self, no):
         return self._processData.fetch_one_process_by_no(no)
 
     def insert_process_data(self, data: ProcessData, is_batch=True):
         if not is_batch:
             self._processData.insert(data)
-            self.logger.info(f"保存到数据库: {data.url}" )
+            self._collectData.set_process(data.url)
+            utils.get_logger().info(f"保存 处理数据 到数据库: {data.url}")
         else:
             self._process_list.append(data)
             self.save_process_data()
 
     # 插入到数据库时会把CollectData设为已处理
     def save_process_data(self, is_force=False):
-        if (is_force and len(self._process_list)>0) or len(self._process_list) >= self._process_size:
-            self.logger.info(f"批量保存到数据库,数量: {str(len(self._process_list))}")
+        if (is_force and len(self._process_list) > 0) or len(
+            self._process_list
+        ) >= self._process_size:
+            utils.get_logger().info(
+                f"批量保存 处理数据 到数据库,数量: {str(len(self._process_list))}"
+            )
             self._processData.insert_batch(self._process_list)
+            urls = [item.url for item in self._process_list]
+            self._collectData.set_process_list(urls)
             self._process_list = []
 
+    def set_collect_process(self, url):
+        return self._collectData.set_process(url)
+
     def set_process_other_urls(self, url, other_urls: str):
         return self._processData.set_other_urls(url, other_urls)
 
-    def check_url_is_process(self, url: str) -> bool:
-        return self._processData.check_is_process_by_url(url)
+    def query_one_process_result_by_url(self, url):
+        return self._processResultData.fetch_one_process_by_url(url)
+
+    def query_one_process_result_by_no(self, no):
+        return self._processResultData.fetch_one_process_by_no(no)
+
+    def insert_process_result_data(self, data: ProcessResultData, is_batch=True):
+        if not is_batch:
+            self._processResultData.insert(data)
+            self._collectData.set_process(data.url)
+            utils.get_logger().info(f"保存 处理数据结果 到数据库: {data.url}")
+        else:
+            self._process_result_list.append(data)
+            self.save_process_result_data()
+
+    def save_process_result_data(self, is_force=False):
+        if (is_force and len(self._process_result_list) > 0) or len(
+            self._process_result_list
+        ) >= self._process_size:
+            utils.get_logger().info(
+                f"批量保存 处理数据结果 到数据库,数量: {str(len(self._process_result_list))}"
+            )
+            self._processResultData.insert_batch(self._process_result_list)
+            urls = [item.url for item in self._process_result_list]
+            self._collectData.set_process_list(urls)
+            self._process_result_list = []
+
+    def set_process_result_other_urls(self, url, other_urls: str):
+        return self._processResultData.set_other_urls(url, other_urls)
 
     def query_to_send(self):
-        return self._processData.fetch_no_send()
+        return self._processData.fetch_not_send()
+
+    def query_to_report_by_date(self, start_date, end_date):
+        return self._processResultData.fetch_to_report_by_date(start_date, end_date)
 
     def set_send(self, no: str):
         self._processData.set_send(no)
 
+    def query_all_emails(self) -> list[AreaEmail]:
+        return self._areaEmail.fetch_all()
+
+    def query_all_virtual_emails(self) -> list[AreaEmail]:
+        return self._areaEmail.fetch_all_virtual()
+
     def get_email_by_area(self, area: str) -> str:
         return self._areaEmail.fetch_one_by_area(area)
+
+    def query_master_email(self) -> str:
+        return self._areaEmail.fetch_master_email()
+
+    def update_area_email_area_by_name(self, name: str, area: str):
+        return self._areaEmail.update_area_email_area_by_name(name, area)
+
+    def delete_collect_data_before_date(self, date: str):
+        return self._collectData.delete_before_date(date)
+
+    def delete_process_data_before_date(self, date: str):
+        return self._processData.delete_before_date(date)
+
+    def delete_process_result_data_before_date(self, date: str):
+        return self._processResultData.delete_before_date(date)

+ 147 - 1
SourceCode/TenderCrawler/app/utils/__init__.py

@@ -1,3 +1,149 @@
+"""
+utils/__init__.py
+
+该模块初始化文件,导入了多个辅助工具类,并定义了一系列便捷函数,用于日志记录、配置管理、文件操作、字符串处理和邮件发送等功能。
+"""
+
+import json
+
+from utils.ai_helper import AiHelper
 from utils.config_helper import ConfigHelper
+from utils.email_helper import EmailHelper
+from utils.file_helper import FileHelper
+from utils.logger_helper import LoggerHelper
+from utils.string_helper import StringHelper
+
+
+def get_logger():
+    """
+    获取日志记录器实例。
+
+    该函数通过调用LoggerHelper类的静态方法get_logger()来获取一个日志记录器实例。
+    主要用于需要记录日志的位置,通过该函数获取日志记录器实例,然后进行日志记录。
+    这样做可以保持日志记录的一致性和集中管理。
+
+    :return: Logger实例,用于记录日志。
+    """
+    return LoggerHelper.get_logger()
+
+
+def clean_log_file(day: int):
+    """
+    清理指定天数之前的日志文件。
+
+    :param day: 整数,表示清理多少天前的日志文件。
+    """
+    LoggerHelper.clean_log_file(day)
+
+
+def get_config():
+    """
+    获取配置管理器实例。
+
+    该函数返回一个ConfigHelper实例,用于读取和管理应用程序的配置信息。
+
+    :return: ConfigHelper实例,用于配置管理。
+    """
+    return ConfigHelper()
+
+
+def reload_config():
+    """
+    重新加载配置文件。
+
+    该函数会重新加载配置文件中的内容,适用于配置文件发生更改后需要重新加载的情况。
+    """
+    get_config().load_config()
+
+
+def get_config_value(key: str, default: str = None):
+    """
+    获取配置项的值。
+
+    :param key: 字符串,配置项的键。
+    :param default: 字符串,默认值(可选)。
+    :return: 配置项的值,如果不存在则返回默认值。
+    """
+    return get_config().get(key, default)
+
+
+def get_config_int(key: str, default: int = None):
+    """
+    获取配置项的整数值。
+
+    :param key: 字符串,配置项的键。
+    :param default: 整数,默认值(可选)。
+    :return: 配置项的整数值,如果不存在则返回默认值。
+    """
+    return get_config().get_int(key, default)
+
+
+def get_config_bool(key: str):
+    """
+    获取配置项的布尔值。
+
+    :param key: 字符串,配置项的键。
+    :return: 配置项的布尔值。
+    """
+    return get_config().get_bool(key)
+
+
+def download_remote_file(file_url: str, file_name: str) -> str:
+    """
+    下载远程文件并保存到本地。
+
+    :param file_url: 字符串,远程文件的URL。
+    :param file_name: 字符串,保存到本地的文件名。
+    :return: 字符串,下载后的文件路径。
+    """
+    return FileHelper().download_remote_file(file_url, file_name)
+
+
+def clean_attach_file(day: int):
+    """
+    清理指定天数之前的附件文件。
+
+    :param day: 整数,表示清理多少天前的附件文件。
+    """
+    FileHelper().clean_attach_file(day)
+
+
+def to_array(s: str, split: str = ",") -> list[str]:
+    """
+    将字符串按指定分隔符拆分为数组。
+
+    :param s: 字符串,待拆分的字符串。
+    :param split: 字符串,分隔符。
+    :return: 列表,拆分后的数组。
+    """
+    return StringHelper.to_array(s, split)
+
+
+def call_openai(system_prompt: str, user_prompt: str) -> json:
+    """
+    调用OpenAI API进行对话。
+
+    :param system_prompt: 字符串,系统提示信息。
+    :param user_prompt: 字符串,用户输入的提示信息。
+    :return: JSON对象,API返回的结果。
+    """
+    return AiHelper().call_openai(system_prompt, user_prompt)
+
+
+def send_email(
+    to_addr: str,
+    subject: str,
+    body: str,
+    body_is_html: bool = True,
+    attachment_paths: str = None,
+):
+    """
+    发送电子邮件。
 
-ConfigHelper().load_config()
+    :param to_addr: 字符串,收件人地址。
+    :param subject: 字符串,邮件主题。
+    :param body: 字符串,邮件正文。
+    :param body_is_html: 布尔值,是否为HTML格式,默认为True。
+    :param attachment_paths: 字符串,附件路径(可选)。
+    """
+    EmailHelper().send_email(to_addr, subject, body, body_is_html, attachment_paths)

+ 51 - 101
SourceCode/TenderCrawler/app/utils/ai_helper.py

@@ -1,44 +1,27 @@
+import json
 import re
-import requests
+
 from openai import OpenAI
-import json
 
-from utils.logger_helper import LoggerHelper
-from utils.config_helper import ConfigHelper
-from models.process_data import ProcessData
+import utils
 
 
 class AiHelper:
 
-    logger = LoggerHelper.get_logger()
-    config = ConfigHelper()
-
     _ai_api_key = None
     _ai_api_url = None
     _ai_max_tokens = 150
-    DEFAULT_AI_SYSTEM_PROMPT = "请帮我分析以下文字,提取出关键信息,并以json格式字符串返回,如果部分信息为空,则该字段返回为空。"
-    DEFAULT_AI_PROMPT_TEMPLATE = """在以上内容中提取信息:编号(no) 、标题(title)、在哪个城市招标(area)、开标的时间(date)、
-    开标的地点(address)、发布时间(release_date)、150字左右的招标条件要求及联系方式等内容摘要(summary),相关采购设备的名称信息,多个设备以逗号分割(device)。
-    返回包含no,title,area,date,address,release_date,summary,device字段的json格式字符串,没有找到或未提供的信息json字段为空。
-"""
 
     def __init__(self):
-        self._ai_api_key = self.config.get("ai.key")
-        self._ai_api_url = self.config.get("ai.url")
-        self._api_model = self.config.get("ai.model")
-        max_tokens = self.config.get("ai.max_tokens")
+        self._ai_api_key = utils.get_config_value("ai.key")
+        self._ai_api_url = utils.get_config_value("ai.url")
+        self._api_model = utils.get_config_value("ai.model")
+        max_tokens = utils.get_config_value("ai.max_tokens")
         if max_tokens:
             self._ai_max_tokens = int(max_tokens)
-        self._ai_system_prompt = self.config.get("ai.system_prompt",
-                                                 self.DEFAULT_AI_SYSTEM_PROMPT)
-        self._ai_prompt_template = self.config.get(
-            "ai.prompt_template", self.DEFAULT_AI_PROMPT_TEMPLATE)
-
 
-    def call_ai(self, content: str) -> ProcessData:
-        # 截取前100个字符进行日志记录
-        # truncated_content = content[:100]
-        self.logger.info("调用AI API")
+    def call_openai(self, system_prompt: str, user_prompt: str) -> json:
+        utils.get_logger().info("调用AI API")
         if self._ai_api_key is None:
             raise Exception("AI API key 没有配置")
         if self._ai_api_url is None:
@@ -48,23 +31,34 @@ class AiHelper:
         client = OpenAI(api_key=self._ai_api_key, base_url=self._ai_api_url)
         completion = client.chat.completions.create(
             model=self._api_model,
-            messages=[{
-                "role": "system",
-                "content": self._ai_system_prompt,
-            }, {
-                "role": "user",
-                "content": f"{content}  {self._ai_prompt_template}",
-            }],
+            messages=[
+                {
+                    "role": "system",
+                    "content": system_prompt,
+                },
+                {
+                    "role": "user",
+                    "content": user_prompt,
+                },
+            ],
             stream=False,
             temperature=0.7,
         )
-
-        self.logger.info(f"AI Response: {completion.model_dump_json()}")
-        response = json.loads(completion.model_dump_json())
-        #self.logger.info(f"AI Response: {response}")
         try:
-            res_str = self._extract_message_content(response)
-            return self._parse_response(res_str, True)
+            response = completion.model_dump_json()
+
+            response_json = json.loads(response)
+            res_str = self._extract_message_content(response_json)
+            result = self._parse_response(res_str, True)
+            if result:
+                usage = response_json["usage"]
+                result["completion_tokens"] = usage.get("completion_tokens", 0)
+                result["prompt_tokens"] = usage.get("prompt_tokens", 0)
+                result["total_tokens"] = usage.get("total_tokens", 0)
+                # utils.get_logger().info(f"AI Process JSON: {result}")
+            else:
+                utils.get_logger().info(f"AI Response: {response}")
+            return result
         except Exception as e:
             raise Exception(f"解析 AI 响应错误: {e}")
 
@@ -79,18 +73,19 @@ class AiHelper:
             raise Exception("AI 响应中未找到有效的 choices 或 message 数据")
 
         # 移除多余的 ```json 和 ```
-        if message_content.startswith("```json") and message_content.endswith(
-                "```"):
+        if message_content.startswith("```json") and message_content.endswith("```"):
             message_content = message_content[6:-3]
 
         # 去除开头的 'n' 字符
-        if message_content.startswith('n'):
+        if message_content.startswith("n"):
             message_content = message_content[1:]
         # 移除无效的转义字符和时间戳前缀
-        message_content = re.sub(r'\\[0-9]{2}', '',
-                                 message_content)  # 移除 \32 等无效转义字符
-        message_content = re.sub(r'\d{4}-\d{2}-\dT\d{2}:\d{2}:\d{2}\.\d+Z', '',
-                                 message_content)  # 移除时间戳
+        message_content = re.sub(
+            r"\\[0-9]{2}", "", message_content
+        )  # 移除 \32 等无效转义字符
+        message_content = re.sub(
+            r"\d{4}-\d{2}-\dT\d{2}:\d{2}:\d{2}\.\d+Z", "", message_content
+        )  # 移除时间戳
         message_content = message_content.strip()  # 去除首尾空白字符
 
         # 替换所有的反斜杠
@@ -98,65 +93,20 @@ class AiHelper:
 
         return message_content
 
-    def _parse_response(self, response: str, first=True) -> ProcessData:
-        self.logger.info(f"AI Response JSON STR: {response}")
+    def _parse_response(self, response: str, first=True) -> json:
+        # utils.get_logger().info(f"AI Response JSON STR: {response}")
         try:
             data = json.loads(response)
-            return ProcessData(no=data.get("no"),
-                               title=data.get("title"),
-                               date=data.get("date"),
-                               area=data.get("area"),
-                               address=data.get("address"),
-                               devices=data.get("device"),
-                               summary=data.get("summary"),
-                               release_date=data.get("release_date"))
+            return data
+
         except json.JSONDecodeError as e:
             if first:
-                self.logger.error(f"JSON 解析错误,去除部分特殊字符重新解析一次: {e}")
+                utils.get_logger().error(
+                    f"JSON 解析错误,去除部分特殊字符重新解析一次: {e}"
+                )
                 # 替换中文引号为空
-                message_content = re.sub(r'[“”]', "", response)  # 替换双引号
-                message_content = re.sub(r'[‘’]', "", message_content)  # 替换单引号
+                message_content = re.sub(r"[“”]", "", response)  # 替换双引号
+                message_content = re.sub(r"[‘’]", "", message_content)  # 替换单引号
                 return self._parse_response(message_content, False)
             else:
-                raise Exception(f"解析 AI 响应错误: {e}")
-
-
-    def call_ai_1(self, content: str) -> ProcessData:
-        # 截取前100个字符进行日志记录
-        # truncated_content = content[:100]
-        self.logger.info("调用AI API")
-        if self._ai_api_key is None:
-            raise Exception("AI API key 没有配置")
-        if self._ai_api_url is None:
-            raise Exception("AI API url 没有配置")
-        if self._api_model is None:
-            raise Exception("AI API model 没有配置")
-        headers = {
-            "Content-Type": "application/json",
-            "Authorization": f"Bearer {self._ai_api_key}"
-        }
-        messages = [{
-            "role": "system",
-            "content": self._ai_system_prompt
-        }, {
-            "role": "user",
-            "content": f"{content} {self._ai_prompt_template}"
-        }]
-
-        data = {
-            "model": self._api_model,
-            "messages": messages,
-            "stream": False,
-            "max_tokens": self._ai_max_tokens
-        }
-        response = requests.post(self._ai_api_url, headers=headers, json=data)
-        if response.status_code == 200:
-            try:
-                self.logger.info(f"AI Response: {response.text}")
-                res_str = self._extract_message_content(response.json())
-                return self._parse_response(res_str, True)
-            except Exception as e:
-                raise Exception(f"解析 AI 响应错误: {e}")
-        else:
-            raise Exception(
-                f"调用 AI 错误: {response.status_code} - {response.text}")
+                raise Exception(f"解析 AI 响应错误: {response} {e}")

+ 11 - 13
SourceCode/TenderCrawler/app/utils/config_helper.py

@@ -1,14 +1,13 @@
 import os
-import yaml
 
+import yaml
 
 
 class ConfigHelper:
     _instance = None
 
     # 默认配置文件路径
-    default_config_path = os.path.join(os.path.dirname(__file__), '..',
-                                       'config.yml')
+    default_config_path = os.path.join(os.path.dirname(__file__), "..", "config.yml")
 
     # 类变量存储加载的配置
     _config = None
@@ -28,7 +27,7 @@ class ConfigHelper:
                 self._path = path
             if not os.path.exists(self._path):
                 raise FileNotFoundError(f"没有找到文件或目录:'{self._path}'")
-        with open(self._path, 'r', encoding='utf-8') as file:
+        with open(self._path, "r", encoding="utf-8") as file:
             self._config = yaml.safe_load(file)
         # 合并环境变量配置
         self._merge_env_vars()
@@ -38,9 +37,8 @@ class ConfigHelper:
     def _merge_env_vars(self, env_prefix="APP_"):  # 环境变量前缀为 APP_
         for key, value in os.environ.items():
             if key.startswith(env_prefix):
-                config_key = key[len(env_prefix):].lower()
-                self._set_nested_key(self._config, config_key.split('__'),
-                                     value)
+                config_key = key[len(env_prefix) :].lower()
+                self._set_nested_key(self._config, config_key.split("__"), value)
 
     def _set_nested_key(self, config, keys, value):
         if len(keys) > 1:
@@ -50,10 +48,10 @@ class ConfigHelper:
         else:
             config[keys[0]] = value
 
-    def get(self, key:str, default:str=None):
+    def get(self, key: str, default: str = None):
         if self._config is None:
             self.load_config(self._path)
-        keys = key.split('.')
+        keys = key.split(".")
         config = self._config
         for k in keys:
             if isinstance(config, dict) and k in config:
@@ -62,15 +60,15 @@ class ConfigHelper:
                 return default
         return config
 
-    def get_bool(self, key:str)->bool:
-        val = str(self.get(key,"0"))
+    def get_bool(self, key: str) -> bool:
+        val = str(self.get(key, "0"))
         return True if val.lower() == "true" or val == "1" else False
 
-    def get_int(self, key:str, default:int=0)->int:
+    def get_int(self, key: str, default: int = 0) -> int:
         val = self.get(key)
         if not val:
             return default
-        try :
+        try:
             return int(val)
         except ValueError:
             return default

+ 45 - 55
SourceCode/TenderCrawler/app/utils/email_helper.py

@@ -1,94 +1,86 @@
-import smtplib
-import os
-import mimetypes
+import os, mimetypes, smtplib, utils
+from email import encoders
+from email.mime.base import MIMEBase
 from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText
-from email.mime.base import MIMEBase
-from email import encoders
-
-
-from utils.config_helper import ConfigHelper
-from utils.logger_helper import LoggerHelper
-from utils.string_helper import StringHelper
-
-
-
 
 
 class EmailHelper:
 
-    logger = LoggerHelper.get_logger()
-    config = ConfigHelper()
-
     def __init__(self):
-        self.smtp_server = self.config.get("email.smtp_server")
-        self.port = self.config.get("email.smtp_port")
-        self.username = self.config.get("email.smtp_user")
-        self.password = self.config.get("email.smtp_password")
-        self.from_email = self.config.get("email.from_email")
+        self.smtp_server = utils.get_config_value("email.smtp_server")
+        self.port = utils.get_config_value("email.smtp_port")
+        self.username = utils.get_config_value("email.smtp_user")
+        self.password = utils.get_config_value("email.smtp_password")
+        self.from_email = utils.get_config_value("email.from_email")
         # print(
         #     f"server:{self.smtp_server},port:{self.port},username:{self.username},password:{self.password},from_email:{self.from_email}"
         # )
 
-    def send_email(self,
-                   to_addr: str,
-                   subject: str,
-                   body: str,
-                   body_is_html: bool = True,
-                   attachment_paths: str = None):
+    def send_email(
+        self,
+        to_addr: str,
+        subject: str,
+        body: str,
+        body_is_html: bool = True,
+        attachment_paths: str = None,
+    ):
         msg = MIMEMultipart()
-        msg['From'] = self.from_email
-        msg['To'] = ', '.join(to_addr.split(','))
-        msg['Subject'] = subject
+        msg["From"] = self.from_email
+        msg["To"] = ", ".join(to_addr.split(","))
+        msg["Subject"] = subject
 
         # 根据 body_is_html 参数设置 MIMEText 类型
         if body_is_html:
-            msg.attach(MIMEText(body, 'html', 'utf-8'))
+            msg.attach(MIMEText(body, "html", "utf-8"))
         else:
-            msg.attach(MIMEText(body, 'plain', 'utf-8'))
+            msg.attach(MIMEText(body, "plain", "utf-8"))
 
         if attachment_paths:
-            attachment_arr = StringHelper.to_array(attachment_paths)
+            attachment_arr = utils.to_array(attachment_paths)
             for attachment_path in attachment_arr:
                 self._attach_file(msg, attachment_path)
 
         try:
-            with smtplib.SMTP_SSL(self.smtp_server,port=self.port, timeout=10) as server:
+            with smtplib.SMTP_SSL(
+                self.smtp_server, port=self.port, timeout=10
+            ) as server:
                 # server.starttls()
                 server.login(self.username, self.password)
                 # 将 to_addr 字符串通过 split(',') 分割成列表,传递给 sendmail
-                server.sendmail(self.from_email, to_addr.split(','),
-                                msg.as_string())
-            self.logger.info(f"邮件发送成功:{to_addr}")
+                server.sendmail(self.from_email, to_addr.split(","), msg.as_string())
+            utils.get_logger().info(f"邮件发送成功:{to_addr}")
             return True
         except smtplib.SMTPAuthenticationError:
-            self.logger.error("SMTP 认证失败")
+            utils.get_logger().error("SMTP 认证失败")
         except smtplib.SMTPServerDisconnected:
-            self.logger.error("SMTP 服务器断开连接")
+            utils.get_logger().error("SMTP 服务器断开连接")
         except smtplib.SMTPException as e:
-            self.logger.error(f"SMTP 异常: {e}")
+            utils.get_logger().error(f"SMTP 异常: {e}")
         except Exception as e:
-            self.logger.error(f"邮件发送失败:{to_addr} {e}")
+            utils.get_logger().error(f"邮件发送失败:{to_addr} {e}")
             return False
 
-
-    def _attach_file(self, msg: MIMEMultipart, attachment_path: str):
+    @staticmethod
+    def _attach_file(msg: MIMEMultipart, attachment_path: str):
         if not os.path.isfile(attachment_path):
-            self.logger.error(f"文件 {attachment_path} 不存在。")
+            utils.get_logger().error(f"文件 {attachment_path} 不存在。")
             return
 
         file_size = os.path.getsize(attachment_path)
         max_size = 1024 * 8192  # 8MB
 
         if file_size > max_size:
-            self.logger.error(f"文件 {attachment_path} 大小超过限制 ({file_size} bytes > {max_size} bytes),不添加附件。")
+            utils.get_logger().error(
+                f"文件 {attachment_path} 大小超过限制 ({file_size} bytes > {max_size} bytes),不添加附件。"
+            )
             return
 
         # 根据文件名后缀获取 MIME 类型
         content_type, _ = mimetypes.guess_type(attachment_path)
         if content_type is None:
-            content_type = 'application/octet-stream'  # 默认类型
-        main_type, sub_type = content_type.split('/', 1)
+            content_type = "application/octet-stream"  # 默认类型
+        main_type, sub_type = content_type.split("/", 1)
 
         with open(attachment_path, "rb") as attachment:
             # part = MIMEBase('application', 'octet-stream')
@@ -96,14 +88,12 @@ class EmailHelper:
             part.set_payload(attachment.read(max_size))
             # 获取文件名并去除第一个 @ 字符前面的部分
             name = os.path.basename(attachment_path)
-            at_index = name.find('@')
+            at_index = name.find("@")
             if at_index != -1:
-                name = name[at_index + 1:]
-            part.add_header(
-                'Content-Disposition',
-                f"attachment; filename= {name}")
-            part.add_header('Content-ID', '<0>')
-            part.add_header('X-Attachment-Id', '0')
+                name = name[at_index + 1 :]
+            part.add_header("Content-Disposition", f"attachment; filename= {name}")
+            part.add_header("Content-ID", "<0>")
+            part.add_header("X-Attachment-Id", "0")
             encoders.encode_base64(part)
             msg.attach(part)
-            self.logger.info(f"添加附件 {name} {attachment_path} 到邮件中。")
+            utils.get_logger().info(f"添加附件 {name} {attachment_path} 到邮件中。")

+ 61 - 22
SourceCode/TenderCrawler/app/utils/file_helper.py

@@ -1,26 +1,29 @@
-import os
-import requests
-from datetime import datetime
+import os, shutil,utils
+from datetime import datetime, timedelta
 from urllib.parse import urlparse
 
-from utils.logger_helper import LoggerHelper
-from utils.config_helper import ConfigHelper
+import requests
+
+
+
 
 class FileHelper:
-    logger = LoggerHelper.get_logger()
-    config = ConfigHelper()
+
     DEFAULT_ATTACH_PATH = "./attaches/"
+
     def __init__(self):
-        path = self.config.get("save.attach_file_path", self.DEFAULT_ATTACH_PATH)
+        path = utils.get_config_value("save.attach_file_path", self.DEFAULT_ATTACH_PATH)
         path = path.replace("\\", "/")
         path = path.replace("//", "/")
         self._attach_file_path = path
 
-    def download_remote_file(self, file_url, file_name) -> str | None:
-        self.logger.info(f"下载远程文件: {file_url}  文件名:{file_name}")
+    def download_remote_file(self, file_url: str, file_name: str) -> str | None:
+        utils.get_logger().info(f"下载远程文件: {file_url}  文件名:{file_name}")
         current_timestamp = datetime.now().strftime("%H%M%S%f")[:-3]  # 取前三位毫秒
         file_name = f"{current_timestamp}@{file_name}"
-        file_path = os.path.join(self._attach_file_path, f'{datetime.now().strftime("%Y-%m-%d")}')
+        file_path = os.path.join(
+            self._attach_file_path, f'{datetime.now().strftime("%Y-%m-%d")}'
+        )
         if not os.path.exists(file_path):
             os.makedirs(file_path)
         path = os.path.join(file_path, file_name)
@@ -37,31 +40,67 @@ class FileHelper:
             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
             "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1",
             "Mozilla/5.0 (iPad; CPU OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1",
-            "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36"
+            "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36",
         ]
 
         # 根据文件名长度选择一个 User-Agent
         ua_index = len(file_name) % len(user_agents)
         # 解析 file_url 获取 Referer
         parsed_url = urlparse(file_url)
-        referer = f"{parsed_url.scheme}://{parsed_url.netloc}/".replace("//download.", "//www.")
+        referer = f"{parsed_url.scheme}://{parsed_url.netloc}/".replace(
+            "//download.", "//www."
+        )
         headers = {
-            'User-Agent': user_agents[ua_index],
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
-            'Accept-Encoding': 'gzip, deflate, br',
-            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
-            'Referer': referer
+            "User-Agent": user_agents[ua_index],
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "Accept-Encoding": "gzip, deflate, br",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7",
+            "Referer": referer,
         }
 
         try:
             response = requests.get(file_url, headers=headers, allow_redirects=True)
             response.raise_for_status()
-            with open(path, 'wb') as f:
+            with open(path, "wb") as f:
                 f.write(response.content)
-            self.logger.info(f"文件下载成功: {file_name}")
+            utils.get_logger().info(f"文件下载成功: {file_name}")
             return path
         except requests.exceptions.HTTPError as http_err:
-            self.logger.error(f"HTTP 错误: {http_err}")
+            utils.get_logger().error(f"HTTP 错误: {http_err}")
         except Exception as e:
-            self.logger.error(f"文件下载失败: {file_name}。Exception: {e}")
+            utils.get_logger().error(f"文件下载失败: {file_name}。Exception: {e}")
             return None
+
+    def clean_attach_file(self, day: int) -> None:
+        try:
+            current_time = datetime.now()
+            cutoff_time = current_time - timedelta(days=day)
+            for root, dirs, _ in os.walk(self._attach_file_path):
+                for dir_name in dirs:
+                    path = os.path.join(root, dir_name)
+                    dir_path = (
+                        str(path).replace(self._attach_file_path, "").replace("\\", "/")
+                    )
+                    if dir_path.count("/") > 0:
+                        continue
+                    try:
+                        dir_date = datetime.strptime(dir_path, "%Y-%m-%d")
+                        if dir_date < cutoff_time:
+                            try:
+                                shutil.rmtree(path)
+                                utils.get_logger().info(
+                                    f"  删除目录及其内容: {dir_path}"
+                                )
+                            except PermissionError:
+                                utils.get_logger().error(
+                                    f"  权限错误,无法删除目录: {dir_path}"
+                                )
+                            except Exception as e:
+                                utils.get_logger().error(
+                                    f"  删除目录失败: {dir_path}。Exception: {e}"
+                                )
+                    except ValueError:
+                        # 如果目录名称不符合 %Y-%m/%d 格式,跳过
+                        continue
+        except Exception as e:
+            utils.get_logger().error(f"文件清理失败。Exception: {e}")

+ 41 - 15
SourceCode/TenderCrawler/app/utils/logger_helper.py

@@ -1,16 +1,23 @@
-import os
 import logging
+import os
+from datetime import datetime
 from logging.handlers import TimedRotatingFileHandler
 
 from utils.config_helper import ConfigHelper
 
+
 class LoggerHelper:
     """
     日志辅助类,用于创建和提供日志记录器实例
     该类实现了单例模式,确保在整个应用程序中只有一个日志记录器实例被创建和使用
     """
+
     _instance = None
     config = ConfigHelper()
+    _log_file_name = f"{config.get("logger.file_name", "crawler")}.log"
+    _log_file_path = config.get("logger.file_path", "./logs")
+    _log_level_string = config.get("logger.level", "INFO")
+    _log_level = logging.getLevelName(_log_level_string)
 
     def __new__(cls, *args, **kwargs):
         """
@@ -18,8 +25,7 @@ class LoggerHelper:
         如果尚未创建实例,则创建并初始化日志记录器
         """
         if not cls._instance:
-            cls._instance = super(LoggerHelper,
-                                  cls).__new__(cls, *args, **kwargs)
+            cls._instance = super(LoggerHelper, cls).__new__(cls, *args, **kwargs)
             try:
                 cls._instance._initialize_logger()
             except Exception as e:
@@ -34,18 +40,20 @@ class LoggerHelper:
         """
         初始化日志记录器,包括设置日志级别、创建处理器和格式化器,并将它们组合起来
         """
-        self._logger = logging.getLogger('app_logger')
-        self._logger.setLevel(logging.INFO)
-        log_file_path = self.config.get("logger.file_path", "./logs")
-        if not os.path.exists(log_file_path):
-            os.makedirs(log_file_path)
+        self._logger = logging.getLogger("app_logger")
+        self._logger.setLevel(self._log_level)
+
+        if not os.path.exists(self._log_file_path):
+            os.makedirs(self._log_file_path)
 
         # 创建按日期分割的文件处理器
-        file_handler = TimedRotatingFileHandler(os.path.join(log_file_path, 'crawler.log'),
-                                                when='midnight',
-                                                interval=1,
-                                                backupCount=7,
-                                                encoding='utf-8')
+        file_handler = TimedRotatingFileHandler(
+            os.path.join(self._log_file_path, self._log_file_name),
+            when="midnight",
+            interval=1,
+            backupCount=7,
+            encoding="utf-8",
+        )
         file_handler.setLevel(logging.INFO)
 
         # 创建控制台处理器
@@ -53,8 +61,7 @@ class LoggerHelper:
         console_handler.setLevel(logging.INFO)
 
         # 创建格式化器
-        formatter = logging.Formatter(
-            '%(asctime)s - %(levelname)s - %(message)s')
+        formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
 
         # 将格式化器添加到处理器
         file_handler.setFormatter(formatter)
@@ -73,3 +80,22 @@ class LoggerHelper:
         if not cls._instance:
             cls._instance = cls()
         return cls._instance._logger
+
+    @classmethod
+    def clean_log_file(cls, day: int):
+        if not os.path.exists(cls._log_file_path):
+            return
+        for filename in os.listdir(cls._log_file_path):
+            if filename != cls._log_file_name and filename.startswith(
+                cls._log_file_name
+            ):
+                try:
+                    file_path = os.path.join(cls._log_file_path, filename)
+                    file_time = datetime.strptime(
+                        filename.replace(f"{cls._log_file_name}.", ""), "%Y-%m-%d"
+                    )
+                    if (datetime.now() - file_time).days > day:
+                        os.remove(file_path)
+                        cls.get_logger().info(f"  删除日志文件: {file_path}")
+                except Exception as e:
+                    cls.get_logger().error(f"删除日志文件出错: {file_path} {e}")

+ 30 - 33
SourceCode/TenderCrawler/app/utils/mysql_helper.py

@@ -1,45 +1,44 @@
-import pymysql
+import pymysql, utils
 from pymysql.cursors import DictCursor
-from utils.config_helper import ConfigHelper
-from utils.logger_helper import LoggerHelper
 
 
 class MySQLHelper:
 
-    logger = LoggerHelper.get_logger()
-    config = ConfigHelper()
-
     def __init__(self):
         try:
-            self.host = self.config.get('mysql.host')
-            self.user = self.config.get('mysql.user')
-            self.password = self.config.get('mysql.password')
-            self.db = self.config.get('mysql.db')
-            self.port = int(self.config.get('mysql.port'))
-            self.charset = self.config.get('mysql.charset')
+            self.host = utils.get_config_value("mysql.host")
+            self.user = utils.get_config_value("mysql.user")
+            self.password = utils.get_config_value("mysql.password")
+            self.db = utils.get_config_value("mysql.db")
+            self.port = int(utils.get_config_value("mysql.port"))
+            self.charset = utils.get_config_value("mysql.charset")
             self.connection = None
         except Exception as e:
-            self.logger.error(f"加载数据库配置文件失败: {e}")
+            utils.get_logger().error(f"加载数据库配置文件失败: {e}")
 
     def connect(self):
         try:
-            self.connection = pymysql.connect(host=self.host,
-                                              user=self.user,
-                                              password=self.password,
-                                              db=self.db,
-                                              port=self.port,
-                                              charset=self.charset,
-                                              cursorclass=DictCursor)
-            # self.logger.info(f"成功连接到数据库:{self.db}。")
+            self.connection = pymysql.connect(
+                host=self.host,
+                user=self.user,
+                password=self.password,
+                db=self.db,
+                port=self.port,
+                charset=self.charset,
+                cursorclass=DictCursor,
+            )
+            # utils.get_logger().info(f"成功连接到数据库:{self.db}。")
         except pymysql.MySQLError as e:
-            self.logger.error(f"数据库连接失败: {self.host}:{self.port} {self.db}")
+            utils.get_logger().error(
+                f"数据库连接失败: {self.host}:{self.port} {self.db}"
+            )
             self.connection = None  # 确保连接失败时设置为 None
             raise Exception(f"连接数据库失败: {e}")
 
     def disconnect(self):
         if self.connection and self.connection.open:
             self.connection.close()
-            # self.logger.info("数据库连接已关闭。")
+            # utils.get_logger().info("数据库连接已关闭。")
 
     def execute_query(self, query, params=None):
         try:
@@ -48,17 +47,16 @@ class MySQLHelper:
                 result = cursor.fetchall()
                 return result
         except pymysql.MySQLError as e:
-            self.logger.error(f"执行查询时出错:{e}")
+            utils.get_logger().error(f"执行查询时出错:{e}")
             return None
 
     def execute_non_query(self, query, params=None):
-        if isinstance(params, list) and all(
-                isinstance(p, tuple) for p in params):
+        if isinstance(params, list) and all(isinstance(p, tuple) for p in params):
             self.execute_many(query, params)
         elif isinstance(params, tuple):
             self.execute(query, params)
         else:
-            self.execute(query, (params, ))
+            self.execute(query, (params,))
 
     def execute(self, query, params=None):
         try:
@@ -66,18 +64,17 @@ class MySQLHelper:
                 cursor.execute(query, params)
                 self.connection.commit()
         except pymysql.MySQLError as e:
-            self.logger.error(f"执行非查询时出错:{e}")
+            utils.get_logger().error(f"执行非查询时出错:{e}")
             self.connection.rollback()
 
     def execute_many(self, query, params: list):
-        if isinstance(params, list) and all(
-                isinstance(p, tuple) for p in params):
+        if isinstance(params, list) and all(isinstance(p, tuple) for p in params):
             try:
                 with self.connection.cursor() as cursor:
                     cursor.executemany(query, params)
                     self.connection.commit()
             except pymysql.MySQLError as e:
-                self.logger.error(f"执行非查询时出错:{e}")
+                utils.get_logger().error(f"执行非查询时出错:{e}")
                 self.connection.rollback()
         else:
             raise ValueError("参数必须是元组列表")
@@ -89,7 +86,7 @@ class MySQLHelper:
                 result = cursor.fetchone()
                 return result
         except pymysql.MySQLError as e:
-            self.logger.error(f"获取一条记录时出错:{e}")
+            utils.get_logger().error(f"获取一条记录时出错:{e}")
             return None
 
     def __enter__(self):
@@ -113,7 +110,7 @@ class MySQLHelper:
         :param traceback: 异常的traceback对象, 如果没有异常则为None。
         """
         if exc_type:
-            self.logger.error(
+            utils.get_logger().error(
                 f"数据库发生异常,断开连接。异常类型:{exc_type}, 异常值:{exc_value} traceback: {traceback}"
             )
         self.disconnect()  # 断开连接

+ 7 - 9
SourceCode/TenderCrawler/app/utils/string_helper.py

@@ -1,7 +1,7 @@
 class StringHelper:
 
     @staticmethod
-    def check_empty(s: str,default:str) -> str:
+    def check_empty(s: str, default: str) -> str:
         """
         检查字符串是否为空
         """
@@ -9,10 +9,8 @@ class StringHelper:
             return s
         return default
 
-
-
     @staticmethod
-    def to_array(s: str, sep: str=",") -> list[str]:
+    def to_array(s: str, sep: str = ",") -> list[str]:
         """
         将字符串按指定分隔符分割成数组。
 
@@ -27,7 +25,7 @@ class StringHelper:
         return s.split(sep)
 
     @staticmethod
-    def startswith(s: str, prefix: str) -> str:
+    def e_startswith(s: str, prefix: str) -> str:
         """
         检查字符串是否以特定前缀开头,如果没有则补全。
 
@@ -40,7 +38,7 @@ class StringHelper:
         return s
 
     @staticmethod
-    def endswith(s: str, suffix: str) -> str:
+    def e_endswith(s: str, suffix: str) -> str:
         """
         检查字符串是否以特定后缀结尾,如果没有则补全。
 
@@ -53,7 +51,7 @@ class StringHelper:
         return s
 
     @staticmethod
-    def split_and_clean(s: str, sep: str=",") -> list[str]:
+    def split_and_clean(s: str, sep: str = ",") -> list[str]:
         """
         将字符串按指定分隔符分割并去除空字符串。
 
@@ -63,7 +61,7 @@ class StringHelper:
         """
         if not s:
             return []
-        parts = StringHelper.to_array(s,sep)
+        parts = StringHelper.to_array(s, sep)
         return [part.strip() for part in parts if part.strip()]
 
     @staticmethod
@@ -74,4 +72,4 @@ class StringHelper:
         :param s: 要处理的字符串。
         :return: 替换后的字符串。
         """
-        return ' '.join(s.split())
+        return " ".join(s.split())

+ 21 - 24
SourceCode/TenderCrawler/docker-compose.yml

@@ -12,16 +12,15 @@ services:
       - TZ=Asia/Shanghai
       # - MYSQL_DEFAULT_AUTHENTICATION_PLUGIN=mysql_native_password
     volumes:
-       - /home/docker/tender-crawler/mysql/log:/var/log/mysql
-       - /home/docker/tender-crawler/mysql/data:/var/lib/mysql
-       - /home/docker/tender-crawler/mysql/conf.d:/etc/mysql/conf.d
-       - /etc/localtime:/etc/localtime:ro
-       - /home/docker/tender-crawler/app/init.sql:/docker-entrypoint-initdb.d/init.sql # 挂载 init.sql 文件
+      - /home/docker/tender-crawler_v2/mysql/log:/var/log/mysql
+      - /home/docker/tender-crawler_v2/mysql/data:/var/lib/mysql
+      - /etc/localtime:/etc/localtime:ro
+      - /home/docker/tender-crawler_v2/app/init.sql:/docker-entrypoint-initdb.d/init.sql # 挂载 init.sql 文件
       # - ./.dev/mysql5.7/log:/var/log/mysql
       # - ./.dev/mysql5.7/data:/var/lib/mysql
-#      - ./.dev/mysql8.0.39/log:/var/log/mysql
-#      - ./.dev/mysql8.0.39/data:/var/lib/mysql
-#      - ./init.sql:/docker-entrypoint-initdb.d/init.sql
+      # - ./.dev/mysql8.0.39/log:/var/log/mysql
+      # - ./.dev/mysql8.0.39/data:/var/lib/mysql
+      # - ./init.sql:/docker-entrypoint-initdb.d/init.sql
     ports:
       - '${MYSQL_PORT}:3306'
     networks:
@@ -42,7 +41,7 @@ services:
 
   crawler-app:
     build: .
-    image: y_tender-crawler-app:1.0.0
+    image: y_tender-crawler-app:2.0.1
     container_name: y_tender-crawler-app
     depends_on:
       - crawler-mysql
@@ -54,23 +53,21 @@ services:
       - APP_MYSQL__DB=${MYSQL_DATABASE}
       - APP_MYSQL__USER=${MYSQL_USER}
       - APP_MYSQL__PASSWORD=${MYSQL_PASSWORD}
-#      - APP_AI__KEY=
-#      - APP_AI__URL=http://192.168.0.109:7580/api/chat
-#      - APP_AI__MODEL=qwen2.5:7b
-      - APP_AI__MAX_TOKENS=1024
-      - APP_SCHEDULE__SLEEP_INTERVAL=600 #单位:秒 10分钟检查一次
-      - APP_SCHEDULE__COLLECT=20:00,12:00
-      - APP_SCHEDULE__PROCESS=23:00,4:00,13:00
-      - APP_SCHEDULE__SEND_EMAIL=08:20,14:00
-      - APP_SCHEDULE__RUN_NOW=1
+      #      - APP_AI__KEY=
+      #      - APP_AI__URL=http://192.168.0.109:7580/api/chat
+      #      - APP_AI__MODEL=qwen2.5:7b
+      - APP_JOB__COLLECT=20:00,12:00
+      - APP_JOB__PROCESS=23:00,4:00,13:00
+      - APP_JOB__SEND_EMAIL=08:20,14:00
+      - APP_JOB__RUN_NOW=1
       - APP_SELENIUM__REMOTE_DRIVER_URL=http://y_selenium:4444/wd/hub
     volumes:
-      - /home/docker/tender-crawler/app/config.yml:/app/config.yml
-      - /home/docker/tender-crawler/app/logs:/app/logs
-      - /home/docker/tender-crawler/app/attaches:/app/attaches
-#      - ./.dev/app/config.yml:/app/config.yml
-#      - ./.dev/app/logs:/app/logs
-#      - ./.dev/app/attaches:/app/attaches
+      - /home/docker/tender-crawler_v2/app/config.yml:/app/config.yml
+      - /home/docker/tender-crawler_v2/app/logs:/app/logs
+      - /home/docker/tender-crawler_v2/app/attaches:/app/attaches
+    #      - ./.dev/app/config.yml:/app/config.yml
+    #      - ./.dev/app/logs:/app/logs
+    #      - ./.dev/app/attaches:/app/attaches
     networks:
       - crawler-net
     # 如果需要暴露端口

+ 88 - 27
SourceCode/TenderCrawler/init.sql

@@ -1,3 +1,4 @@
+# noinspection SpellCheckingInspectionForFile
 
 
 SET NAMES utf8mb4;
@@ -19,8 +20,8 @@ CREATE TABLE `t_urls`  (
   PRIMARY KEY (`url`) USING BTREE
 ) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
 
-INSERT INTO `t_urls` (`url`, `adapter_type`, `username`, `password`, `keywords`, `is_active`, `sort`, `remark`) VALUES ('https://www.ccgp.gov.cn/index.shtml', 'ccgp', '', '', '红外,红外显微镜,傅里叶红外,红外光谱,显微红外,拉曼,激光共聚焦拉曼,拉曼显微镜,拉曼光谱,显微拉曼,气体分析\'', 1, 100, '中国政府采购网 https://www.ccgp.gov.cn/index.shtml');
-INSERT INTO `t_urls` (`url`, `adapter_type`, `username`, `password`, `keywords`, `is_active`, `sort`, `remark`) VALUES ('https://www.chinabidding.com/', 'chinabidding', 'brukernano2011', '695765FqX', '红外光谱仪', 1,0, '中国国际招标网 (www.chinabidding.com 必联网)');
+INSERT INTO `t_urls` (`url`, `adapter_type`, `username`, `password`, `keywords`, `is_active`, `sort`, `remark`) VALUES ('https://www.ccgp.gov.cn/index.shtml', 'ccgp', '', '', '红外光谱仪,红外显微镜,傅里叶红外,红外光谱,显微红外,拉曼,激光共聚焦拉曼,拉曼显微镜,拉曼光谱,显微拉曼,红外,气体分析', 1, 100, '中国政府采购网 https://www.ccgp.gov.cn/index.shtml');
+INSERT INTO `t_urls` (`url`, `adapter_type`, `username`, `password`, `keywords`, `is_active`, `sort`, `remark`) VALUES ('https://www.chinabidding.com/', 'chinabidding', 'brukernano2011', '695765FqX', '红外光谱仪,红外显微镜,傅里叶红外,红外光谱,显微红外,拉曼,激光共聚焦拉曼,拉曼显微镜,拉曼光谱,显微拉曼,红外,气体分析', 1,0, '中国国际招标网 (www.chinabidding.com 必联网)');
 
 
 -- ----------------------------
@@ -31,31 +32,63 @@ CREATE TABLE `t_area_email`  (
   `name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '名称',
  `area` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '地区 多个以","分隔',
  `email` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '邮箱 多个以","分隔',
-  `is_active` int(4) NULL DEFAULT NULL COMMENT '激活状态 1:激活 0:失活',
+  `is_virtual` int(4) NULL DEFAULT 1 COMMENT '是否虚拟的,没有email的',
+  `is_active` int(4) NULL DEFAULT 0 COMMENT '激活状态 1:激活 0:失活',
   `remark` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '备注',
   PRIMARY KEY (`name`) USING BTREE
 ) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
 
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('全国', '全国', 'chancelot@foxmail.com', 1, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('宁波', '浙江省宁波市,浙江宁波,宁波市,宁波', '349977741@qq.com', 1, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('武汉', '武汉市,武汉,中国武汉,中国武汉市', 'chancelot@foxmail.com,349977741@qq.com', 1, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('济南', '江苏省济南市,江苏济南,济南市,济南', '349977741@qq.com', 1, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('张志琼', '黑龙江,吉林,辽宁', 'zhiqiong.zhang@bruker.com', 0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('王双', '河北,山东济南,山东德州', 'shuang.wang@bruker.com', 0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('尚祖俭', '天津市,天津,中国天津,中国天津市', 'zujian.shang@bruker.com', 0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('赵跃', '北京', 'yue.zhao@bruker.com', 0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('张景灿', '陕西,新疆,宁夏,青海', 'jingcan.zhang@bruker.com', 0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('穆彦竹', '山西,河南,甘肃', 'yanzhu.mu@bruker.com', 0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('廖然', '内蒙古', 'ran.liao@bruker.com', 0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('吕小勇', '江苏', 'xiaoyong.lv@bruker.com', 0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('张潇', '浙江,福建', 'xiao.zhang@bruker.com', 0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('吴雪美', '上海', 'xuemei.wu@bruker.com', 0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('邬歆', '安徽,香港,澳门', 'xin.wu@bruker.com', 0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('冯新宝', '湖北,湖南', 'xinbao.feng@bruker.com', 0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('耿朝曦', '江西,贵州', 'zhaoxi.geng@bruker.com', 0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('李华斌', '广西,广东深圳', 'huabin.li@bruker.com', 0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('吕万明', '海南,广东广州,广东中山', 'wanming.lv@bruker.com', 0, NULL);
-INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('许建光', '西藏,云南,广东', 'jianguang.xu@bruker.com', 0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('master', 'master', 'chancelot@foxmail.com,349977741@qq.com', 0,1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('all', '全国', 'chancelot@foxmail.com,349977741@qq.com', 0,1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('张志琼', '黑龙江,吉林,辽宁', 'zhiqiong.zhang@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('王双', '河北,济南,山东德州', 'shuang.wang@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('范国春', '山东', 'guochun.fan@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('尚祖俭', '天津', 'zujian.shang@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('赵跃', '北京', 'yue.zhao@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('张景灿', '陕西,新疆,宁夏,青海', 'jingcan.zhang@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('穆彦竹', '山西,河南,甘肃', 'yanzhu.mu@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('廖然', '内蒙古', 'ran.liao@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('吕小勇', '江苏', 'xiaoyong.lv@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('张潇', '浙江,福建', 'xiao.zhang@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('吴雪美', '上海', 'xuemei.wu@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('邬歆', '安徽,香港,澳门', 'xin.wu@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('冯新宝', '湖北,湖南', 'xinbao.feng@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('耿朝曦', '江西,贵州', 'zhaoxi.geng@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('李华斌', '广西,深圳', 'huabin.li@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('吕万明', '海南,广州,中山', 'wanming.lv@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('许建光', '西藏,云南,广东', 'jianguang.xu@bruker.com', 0,0, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('全国', '北京,天津,上海,重庆,河北,山西,黑龙江,吉林,辽宁,江苏,浙江,安徽,福建,江西,山东,河南,湖北,湖南,广东,海南,四川,贵州,云南,陕西,甘肃,青海,台湾,内蒙古,广西,西藏,宁夏,新疆,香港,澳门', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('北京', '东城区,西城区,朝阳区,丰台区,石景山区,海淀区,门头沟区,房山区,通州区,顺义区,大兴区,昌平区,平谷区,怀柔区,密云区,延庆区', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('上海', '黄浦区,徐汇区,长宁区,静安区,普陀区,虹口区,杨浦区,宝山区,闵行区,嘉定区,浦东新区,金山区,松江区,青浦区,奉贤区,崇明区', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('天津', '和平区,河东区,河西区,南开区,河北区,红桥区,东丽区,西青区,北辰区,武清区,宝坻区,滨海新区,宁河区,静海区,蓟州区', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('重庆', '渝中区,大渡口区,沙坪坝区,九龙坡区,南岸区,北碚区,渝北区,巴南区,长寿区,江北区,永川区,合川区,南川区,璧山区,江津区', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('江苏', '南京,苏州,无锡,常州,镇江,南通,扬州,盐城,连云港,淮安,宿迁,泰州,徐州', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('黑龙江','哈尔滨,齐齐哈尔,鸡西,鹤岗,双鸭山,大庆,伊春,佳木斯,七台河,牡丹江,黑河,绥化,大兴安岭地区', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('吉林', '长春,吉林,白山,延边朝鲜族自治州', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('辽宁', '沈阳,大连,鞍山,抚顺,本溪,丹东,锦州,营口,阜新,辽阳,盘锦,铁岭,朝阳,葫芦岛', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('陕西', '西安,铜川,宝鸡,咸阳,渭南,延安,汉中,榆林,安康,商洛', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('新疆', '乌鲁木齐,克拉玛依,吐鲁番,哈密,昌吉回族自治州,博尔塔拉蒙古自治州,巴音郭楞蒙古自治州,克孜勒苏柯尔克孜自治州,阿克苏地区,喀什地区,和田地区,伊犁哈萨克自治州,塔城地区,阿勒泰地区,石河子,阿拉尔,图木舒克,五家渠', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('宁夏', '银川,石嘴山,吴忠,固原,中卫', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('青海', '西宁,海东,海北藏族自治州,黄南藏族自治州,海南藏族自治州,果洛藏族自治州,玉树藏族自治州,海西蒙古族藏族自治州', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('山西', '太原,大同,阳泉,长治,晋城,朔州,晋中,运城,忻州,临汾,吕梁', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('河北', '石家庄,唐山,秦皇岛,邯郸,邢台,保定,张家口,承德,沧州,廊坊,衡水', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('河南', '郑州,开封,洛阳,平顶山,安阳,鹤壁,新乡,焦作,濮阳,许昌,漯河,三门峡,南阳,商丘,周口,驻马店,济源示范区', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('甘肃', '兰州,嘉峪关,金昌,白银,天水,武威,张掖,平凉,酒泉,庆阳,定西,陇南,临夏回族自治州,甘南藏族自治州', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('山东', '青岛,淄博,枣庄,东营,烟台,潍坊,济宁,泰安,威海,日照,莱芜,临沂,聊城,滨州,菏泽', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('内蒙古', '呼和浩特,包头,乌海,赤峰,通辽,鄂尔多斯,呼伦贝尔,巴彦淖尔,乌兰察布,兴安盟,锡林郭勒盟,阿拉善盟', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('浙江', '杭州,宁波,温州,嘉兴,湖州,绍兴,金华,衢州,舟山,台州,丽水', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('福建', '福州,厦门,莆田,三明,泉州,漳州,南平,龙岩,宁德', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('安徽', '合肥,芜湖,蚌埠,淮南,马鞍山,淮北,铜陵,安庆,黄山,滁州,阜阳,宿州,六安,亳州,池州,宣城', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('湖北', '武汉,黄石,十堰,宜昌,襄阳,鄂州,荆门,孝感,荆州,黄冈,咸宁,随州,恩施土家族苗族自治州,仙桃,潜江,天门,神农架林区', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('湖南', '长沙,株洲,湘潭,衡阳,邵阳,岳阳,常德,张家界,益阳,郴州,永州,怀化,娄底,湘西土家族苗族自治州', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('江西', '南昌,景德镇,萍乡,九江,新余,鹰潭,赣州,吉安,宜春,抚州,上饶', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('贵州', '贵阳,六盘水,遵义,安顺,毕节,铜仁,黔东南苗族侗族自治州,黔南布依族苗族自治州,黔西南布依族苗族自治州', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('广西', '南宁,柳州,桂林,梧州,北海,防城港,钦州,贵港,玉林,百色,贺州,河池,来宾,崇左', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('海南', '海口,三亚,三沙,儋州,琼海,文昌,万宁,东方,澄迈,定安,屯昌,临高,白沙黎族自治县,昌江黎族自治县,乐东黎族自治县,陵水黎族自治县,保亭黎族苗族自治县,琼中黎族苗族自治县', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('广东', '珠海,汕头,佛山,韶关,湛江,肇庆,江门,茂名,惠州,梅州,汕尾,河源,阳江,清远,东莞,潮州,揭阳,云浮', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('西藏', '拉萨,日喀则,昌都,林芝,山南,那曲,阿里地区', '', 1, 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_virtual`, `is_active`, `remark`) VALUES ('云南', '昆明,曲靖,玉溪,保山,昭通,丽江,普洱,临沧,红河哈尼族彝族自治州,文山壮族苗族自治州,西双版纳傣族自治州,大理白族自治州,德宏傣族景颇族自治州,怒江傈僳族自治州,迪庆藏族自治州', '', 1, 1, NULL);
+
 
 
 -- ----------------------------
@@ -66,7 +99,8 @@ CREATE TABLE `t_collect_data`  (
   `url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '页面详情URL',
   `keyword` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '检索到页面的关键字',
   `content` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL COMMENT '页面详情',
-  `attach_path` varchar(1000) NULL DEFAULT NULL COMMENT '附件路径',
+  `data_type` int(4) NOT NULL DEFAULT 0 COMMENT '数据类型 0:招标 1:中标',
+  `attach_path` varchar(2000) NULL DEFAULT NULL COMMENT '附件路径',
   `status` int(4) NOT NULL DEFAULT 0 COMMENT '状态 0:未处理 1:已处理',
   `create_time` datetime NULL DEFAULT NULL COMMENT '创建时间',
   `process_time` datetime NULL DEFAULT NULL COMMENT '处理时间',
@@ -81,7 +115,8 @@ CREATE TABLE `t_collect_data_history`  (
   `url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '页面详情URL',
   `keyword` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '检索到页面的关键字',
   `content` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL COMMENT '页面详情',
-  `attach_path` varchar(1000) NULL DEFAULT NULL COMMENT '附件路径',
+  `data_type` int(4) NOT NULL DEFAULT 0 COMMENT '数据类型 0:招标 1:中标',
+  `attach_path` varchar(2000) NULL DEFAULT NULL COMMENT '附件路径',
   `status` int(4) NOT NULL DEFAULT 0 COMMENT '状态 0:未处理 1:已处理',
   `create_time` datetime NULL DEFAULT NULL COMMENT '创建时间',
   `process_time` datetime NULL DEFAULT NULL COMMENT '处理时间',
@@ -104,14 +139,40 @@ CREATE TABLE `t_data`  (
   `release_date` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '发布时间',
   `devices` varchar(1000) NULL DEFAULT NULL COMMENT '相关设备',
   `attach_path` varchar(2000) NULL DEFAULT NULL COMMENT '附件路径',
-  `status` int(4) NULL DEFAULT NULL COMMENT '状态 0:未推送 1:已推送',
+  `status` int(4) NULL DEFAULT 0 COMMENT '状态 0:未推送 1:已推送',
   `create_time` datetime NULL DEFAULT NULL COMMENT '创建时间',
   `send_time` datetime NULL DEFAULT NULL COMMENT '推送时间',
   `other_urls` varchar(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '其他连接(招标编号相同的多个链接)',
+  `prompt_tokens` int NULL DEFAULT 0 COMMENT '输入token数量',
+  `completion_tokens` int NULL DEFAULT 0 COMMENT '输出token数量',
+  `total_tokens` int NULL DEFAULT 0 COMMENT '总token数量',
   `remark` varchar(500) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '备注',
   PRIMARY KEY (`url`) USING BTREE
 ) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
 
-
+-- ----------------------------
+-- Table structure for t_data_result
+-- ----------------------------
+DROP TABLE IF EXISTS `t_data_result`;
+CREATE TABLE `t_data_result`  (
+  `url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '详情链接',
+  `keyword` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '搜索关键字',
+  `no` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标编号',
+  `title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标标题',
+  `date` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '公告时间',
+  `price` varchar(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '中标金额',
+  `bidder` varchar(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '中标人',
+  `summary` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL COMMENT '招标摘要',
+  `attach_path` varchar(2000) NULL DEFAULT NULL COMMENT '附件路径',
+  `status` int(4) NULL DEFAULT 0 COMMENT '状态 0:未推送 1:已推送',
+  `create_time` datetime NULL DEFAULT NULL COMMENT '创建时间',
+  `send_time` datetime NULL DEFAULT NULL COMMENT '推送时间',
+  `other_urls` varchar(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '其他连接(招标编号相同的多个链接)',
+  `prompt_tokens` int NULL DEFAULT 0 COMMENT '输入token数量',
+  `completion_tokens` int NULL DEFAULT 0 COMMENT '输出token数量',
+  `total_tokens` int NULL DEFAULT 0 COMMENT '总token数量',
+  `remark` varchar(500) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '备注',
+  PRIMARY KEY (`url`) USING BTREE
+) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
 
 SET FOREIGN_KEY_CHECKS = 1;