Ver código fonte

Merge branch 'dev' of Crawler/TenderCrawler into main

YueYunyun 6 meses atrás
pai
commit
ef0ad1eb52

+ 1 - 0
.gitignore

@@ -162,3 +162,4 @@ cython_debug/
 .dev/
 logs/
 attaches/
+temp_files/

+ 2 - 2
SourceCode/TenderCrawler/app/adapters/__init__.py

@@ -2,8 +2,8 @@ from adapters.data_collection_adapter_interface import IDataCollectionAdapter
 from stores.data_store_interface import IDataStore
 
 
-def collect(adapter: IDataCollectionAdapter, keyword: str, store: IDataStore = None):
-    adapter.collect(keyword, store)
+def collect(adapter: IDataCollectionAdapter, keywords: str, store: IDataStore = None):
+    adapter.collect(keywords, store)
 
 
 def teardown(adapter: IDataCollectionAdapter):

+ 21 - 23
SourceCode/TenderCrawler/app/adapters/ccgp_data_collection_adapter.py

@@ -3,7 +3,7 @@ from time import sleep
 from selenium.common.exceptions import TimeoutException, NoSuchElementException
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as ec
-from selenium.webdriver.support.wait import WebDriverWait
+
 
 import utils
 from adapters.data_collection_adapter_interface import IDataCollectionAdapter
@@ -21,14 +21,12 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
         self._driver = None
         self._keyword = None
         self._adapter_type = "ccgp"
+        self._next_count = 0
 
     def login(self, username: str, password: str) -> None:
         pass
 
-    def collect(self, keyword: str, store: IDataStore):
-        if store:
-            self._store = store
-        self._keyword = keyword
+    def _collect(self, keyword: str):
         items = self._search(keyword)
         self._process_list(items)
         if utils.get_config_bool(self.batch_save_key):
@@ -38,8 +36,7 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
         try:
             if not keyword:
                 raise Exception("搜索关键字不能为空")
-            wait = WebDriverWait(self.driver, 10, 1)
-            wait.until(ec.presence_of_element_located((By.ID, "searchForm")))
+            self._wait_until(ec.presence_of_element_located((By.ID, "searchForm")))
             search_el = self.driver.find_element(By.ID, "kw")
             sleep(2)
             search_el.clear()
@@ -49,12 +46,13 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
             )
             sleep(1)
             search_btn.click()
-            wait.until(
+            self._next_count = 0
+            self._wait_until(
                 ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result"))
             )
             default_search_txt = "近1周"
             search_txt = utils.get_config_value(self.search_day_key, default_search_txt)
-            utils.get_logger().info(f"搜索关键字: {keyword},搜索条件: {search_txt}")
+            utils.get_logger().debug(f"搜索日期条件: {search_txt}")
             if search_txt != default_search_txt:
                 last_els = self.driver.find_elements(By.XPATH, "//ul[@id='datesel']/li")
                 for last_el in last_els:
@@ -62,7 +60,7 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
                         sleep(1)
                         last_el.click()
                         break
-                wait.until(
+                self._wait_until(
                     ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result"))
                 )
             else:
@@ -72,7 +70,7 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
                     By.XPATH, "//body/div[@class='vT_z']/div/div/p"
                 )
                 if len(p_els) > 0:
-                    utils.get_logger().info(f" {p_els[0].text}")
+                    utils.get_logger().debug(f" {p_els[0].text}")
                 else:
                     a_links = self.driver.find_elements(
                         By.XPATH, "//div[@class='vT-srch-result-list']/p/a"
@@ -80,7 +78,7 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
                     count = len(a_links)
                     if count > 1:
                         count = count - 1
-                    utils.get_logger().info(f"共查询到 {count} 页,每页 20 条")
+                    utils.get_logger().debug(f"共查询到 {count} 页,每页 20 条")
             except Exception as e:
                 utils.get_logger().error(f"搜索失败[尝试查询页数]: {e}")
             items = self.driver.find_elements(
@@ -103,17 +101,19 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
 
     def _next_page(self) -> list:
         try:
-            wait = WebDriverWait(self.driver, 10, 1)
             next_path = "//div[@class='vT-srch-result-list']/p/a[@class='next']"
             try:
                 btn = self.driver.find_element(By.XPATH, next_path)
             except NoSuchElementException:
-                utils.get_logger().info(f"翻页结束 [{self._adapter_type}]")
+                utils.get_logger().debug(f"翻页结束 [{self._adapter_type}]")
                 return []
             btn.click()
-            utils.get_logger().info(f"跳转到下页: {self.driver.current_url}")
-            sleep(5)
-            wait.until(
+            self._next_count += 1
+            utils.get_logger().debug(
+                f"下一页[{self._next_count+1}]: {self.driver.current_url}"
+            )
+            sleep(1)
+            self._wait_until(
                 ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result"))
             )
             items = self.driver.find_elements(
@@ -123,28 +123,26 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
         except NoSuchElementException as e:
             raise Exception(f"翻页失败 [{self._adapter_type}] [找不到元素]: {e}")
         except TimeoutException as e:
-            raise Exception(f"翻页结束 [{self._adapter_type}] [超时]: {e}")
+            raise Exception(f"翻页失败 [{self._adapter_type}] [超时]: {e}")
 
     def _process_item(self, item):
         main_handle = self.driver.current_window_handle
-        wait = WebDriverWait(self.driver, 10, 1)
         close = True
         try:
             url = item.get_attribute("href")
             if self._check_is_collect_by_url(url):
                 close = False
                 return
-            # utils.get_logger().info(f"跳转详情")
-            print(".", end="")
+            utils.get_logger().debug(f"跳转详情")
             sleep(1)
             item.click()
-            wait.until(ec.number_of_windows_to_be(2))
+            self._wait_until(ec.number_of_windows_to_be(2))
             handles = self.driver.window_handles
             for handle in handles:
                 if handle != main_handle:
                     self.driver.switch_to.window(handle)
                     break
-            wait.until(ec.presence_of_element_located((By.TAG_NAME, "body")))
+            self._wait_until(ec.presence_of_element_located((By.TAG_NAME, "body")))
 
             content = self.driver.find_element(
                 By.XPATH, "//div[@class='vF_deail_maincontent']"

+ 21 - 26
SourceCode/TenderCrawler/app/adapters/chinabidding_data_collection_adapter.py

@@ -3,7 +3,6 @@ from time import sleep
 from selenium.common.exceptions import TimeoutException, NoSuchElementException
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as ec
-from selenium.webdriver.support.wait import WebDriverWait
 
 import utils
 from adapters.data_collection_adapter_interface import IDataCollectionAdapter
@@ -21,6 +20,7 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
         self._driver = None
         self._keyword = None
         self._adapter_type = "chinabidding"
+        self._next_count = 0
 
     def login(self, username: str, password: str) -> None:
         try:
@@ -28,24 +28,20 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
                 By.XPATH, "//div[@id='loginRight']/a[@class='login']"
             )
             login_el.click()
-            wait = WebDriverWait(self.driver, 10, 1)
-            wait.until(ec.presence_of_element_located((By.ID, "userpass")))
+            self._wait_until(ec.presence_of_element_located((By.ID, "userpass")))
             un_el = self.driver.find_element(By.ID, "username")
             un_el.send_keys(username)
             pass_el = self.driver.find_element(By.ID, "userpass")
             pass_el.send_keys(password)
             login_btn = self.driver.find_element(By.ID, "login-button")
             login_btn.click()
-            wait.until(ec.presence_of_element_located((By.ID, "site-content")))
+            self._wait_until(ec.presence_of_element_located((By.ID, "site-content")))
         except TimeoutException as e:
             raise Exception(f"登录失败 [{self._adapter_type}] [超时]: {e}")
         except NoSuchElementException as e:
             raise Exception(f"登录失败 [{self._adapter_type}] [找不到元素]: {e}")
 
-    def collect(self, keyword: str, store: IDataStore):
-        if store:
-            self._store = store
-        self._keyword = keyword
+    def _collect(self, keyword: str):
         items = self._search_by_type(keyword, 0)
         self._process_list(items, 0)
         sleep(2)
@@ -68,8 +64,7 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
                     By.XPATH, "//div[@id='z-b-jg-gg']/h2/a[@class='more']"
                 )
             el.click()
-            wait = WebDriverWait(self.driver, 10, 1)
-            wait.until(ec.number_of_windows_to_be(2))
+            self._wait_until(ec.number_of_windows_to_be(2))
             self.driver.close()
             self.driver.switch_to.window(self.driver.window_handles[0])
             return self._search(keyword)
@@ -79,8 +74,7 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
             raise Exception(f"搜索失败 [{self._adapter_type}] [找不到元素]: {e}")
 
     def _search(self, keyword: str) -> list:
-        wait = WebDriverWait(self.driver, 10, 1)
-        wait.until(ec.presence_of_element_located((By.ID, "searchBidProjForm")))
+        self._wait_until(ec.presence_of_element_located((By.ID, "searchBidProjForm")))
         search_el = self.driver.find_element(
             By.XPATH, "//form[@id='searchBidProjForm']/ul/li/input[@id='fullText']"
         )
@@ -90,15 +84,16 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
             By.XPATH, "//form[@id='searchBidProjForm']/ul/li/button"
         )
         search_btn.click()
-        wait.until(ec.presence_of_element_located((By.ID, "site-content")))
+        self._next_count = 0
+        self._wait_until(ec.presence_of_element_located((By.ID, "site-content")))
         default_search_txt = "全部"
         search_txt = utils.get_config_value(self.search_day_key, default_search_txt)
-        utils.get_logger().info(f"搜索关键字: {keyword},搜索条件: {search_txt}")
+        utils.get_logger().debug(f"搜索日期条件: {search_txt}")
         if search_txt != default_search_txt:
             last_el = self.driver.find_element(By.LINK_TEXT, search_txt)
             sleep(1)
             last_el.click()
-            wait.until(ec.presence_of_element_located((By.ID, "site-content")))
+            self._wait_until(ec.presence_of_element_located((By.ID, "site-content")))
         else:
             sleep(1)
         try:
@@ -108,7 +103,7 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
             count = len(a_links)
             if count > 1:
                 count = count - 1
-            utils.get_logger().info(f"共查询到 {count} 页,每页 10 条")
+            utils.get_logger().debug(f"共查询到 {count} 页,每页 10 条")
         except Exception as e:
             utils.get_logger().error(f"搜索失败[尝试查询页数]: {e}")
         items = self.driver.find_elements(By.XPATH, "//ul[@class='as-pager-body']/li/a")
@@ -125,17 +120,19 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
 
     def _next_page(self) -> list:
         try:
-            wait = WebDriverWait(self.driver, 10, 1)
             try:
                 btn = self.driver.find_element(
                     By.XPATH, "//form[@id='pagerSubmitForm']/a[@class='next']"
                 )
             except NoSuchElementException:
-                utils.get_logger().info(f"翻页结束 [{self._adapter_type}]")
+                utils.get_logger().debug(f"翻页结束 [{self._adapter_type}]")
                 return []
             btn.click()
-            utils.get_logger().info(f"跳转到下页: {self.driver.current_url}")
-            wait.until(ec.presence_of_element_located((By.ID, "site-content")))
+            self._next_count += 1
+            utils.get_logger().debug(
+                f"下一页[{self._next_count+1}]: {self.driver.current_url}"
+            )
+            self._wait_until(ec.presence_of_element_located((By.ID, "site-content")))
             items = self.driver.find_elements(
                 By.XPATH, "//ul[@class='as-pager-body']/li/a"
             )
@@ -143,7 +140,7 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
         except NoSuchElementException as e:
             raise Exception(f"翻页失败 [{self._adapter_type}] [找不到元素]: {e}")
         except TimeoutException as e:
-            raise Exception(f"翻页结束 [{self._adapter_type}] [超时]: {e}")
+            raise Exception(f"翻页失败 [{self._adapter_type}] [超时]: {e}")
 
     def _process_item(self, item, data_type):
         main_handle = self.driver.current_window_handle
@@ -154,17 +151,15 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
                 close = False
                 return
             item.click()
-            wait = WebDriverWait(self.driver, 10, 1)
-            wait.until(ec.number_of_windows_to_be(2))
+            self._wait_until(ec.number_of_windows_to_be(2))
             handles = self.driver.window_handles
             for handle in handles:
                 if handle != main_handle:
                     self.driver.switch_to.window(handle)
                     break
             url = self.driver.current_url
-            # utils.get_logger().info(f"跳转详情")
-            print(".", end="")
-            wait.until(ec.presence_of_element_located((By.CLASS_NAME, "content")))
+            utils.get_logger().debug(f"跳转详情")
+            self._wait_until(ec.presence_of_element_located((By.CLASS_NAME, "content")))
             content = self.driver.find_element(By.CLASS_NAME, "content").text
             if self._check_content(content):
                 self._save_db(url, content, data_type)

+ 78 - 41
SourceCode/TenderCrawler/app/adapters/data_collection_adapter_interface.py

@@ -1,6 +1,9 @@
 from abc import ABC, abstractmethod
+from typing import Callable, Union, Literal
 
 from selenium import webdriver
+from selenium.common.exceptions import TimeoutException
+from selenium.webdriver.support.wait import WebDriverWait, D, T
 
 import drivers
 import utils
@@ -16,8 +19,12 @@ class IDataCollectionAdapter(ABC):
     _url = ""
     _store = None
     _driver = None
-    _keyword = None
     _adapter_type = ""
+    _cur_keyword = None
+    _keywords = None
+    _keyword_array = None
+    _error_count = 0
+    _max_error_count = utils.get_config_int("adapter.max_error_count", 5)
 
     @property
     def search_day_key(self) -> str:
@@ -36,20 +43,56 @@ class IDataCollectionAdapter(ABC):
         return self._url
 
     @property
-    def keyword(self):
-        return self._keyword
+    def cur_keyword(self):
+        return self._cur_keyword
+
+    @property
+    def keywords(self):
+        return self._keywords
+
+    @property
+    def keyword_array(self):
+        return self._keyword_array
 
     @property
     def driver(self) -> webdriver:
         if not self._driver:
-            self._driver = self._create_driver()
+            try:
+                self._driver = drivers.gen_driver(self.url)
+            except Exception as e:
+                raise Exception(f"创建驱动器失败: {e}")
         return self._driver
 
-    def _create_driver(self) -> webdriver:
-        try:
-            return drivers.gen_driver(self.url)
-        except Exception as e:
-            raise Exception(f"创建驱动器失败: {e}")
+    def collect(self, keywords: str, store: IDataStore) -> None:
+        """
+        按逗号分隔的关键字逐个采集数据
+
+        :param keywords: 逗号分隔的搜索关键字
+        :param store: 数据储存库
+        :type keywords: str
+        :return: None
+        :raises Exception: 如果采集失败,抛出异常
+        """
+        if store:
+            self._store = store
+        if not keywords:
+            raise Exception("未指定搜索关键字")
+        utils.get_logger().info(f"开始采集: {keywords}")
+        self._error_count = 0
+        self._keyword_array = keywords.split(",")
+        count = 0
+        for keyword in self._keyword_array:
+            if not keyword:
+                continue
+            try:
+                count += 1
+                self._cur_keyword = keyword
+                utils.get_logger().info(f"采集关键字[{count}]: {keyword}")
+                self._error_count = 0
+                self._collect(keyword)
+            except Exception as e:
+                raise Exception(f"采集数据失败: {e}")
 
     @abstractmethod
     def login(self, username: str, password: str) -> None:
@@ -68,42 +111,34 @@ class IDataCollectionAdapter(ABC):
         except Exception as e:
             raise Exception(f"登录失败: {e}")
 
-    @abstractmethod
-    def _search(self, keyword: str) -> list:
-        """
-        根据关键字搜索,返回搜索结果列表
+    def _wait(self, timeout=20, poll_frequency=1):
+        return WebDriverWait(self.driver, timeout, poll_frequency)
 
-        :param keyword: 搜索关键字
-        :type keyword: str
-        :return: 搜索结果列表
-        :rtype: list
-        :raises Exception: 如果搜索失败,应抛出异常
-        """
+    def _wait_until(
+        self,
+        method: Callable[[D], Union[Literal[False], T]],
+        timeout=20,
+        poll_frequency=1,
+    ):
         try:
-            results = []
-            # 实现搜索逻辑
-            return results if results else []
-        except Exception as e:
-            raise Exception(f"搜索失败: {e}")
+            self._wait(timeout, poll_frequency).until(method)
+        except TimeoutException as e:
+            self._error_count += 1
+            utils.get_logger().error(
+                f"采集数据 超时 [{self._error_count}/{self._max_error_count}]"
+            )
+            if self._error_count > self._max_error_count:
+                raise e
+            self._wait_until(method)
 
     @abstractmethod
-    def collect(self, keyword: str, store: IDataStore) -> None:
+    def _collect(self, keyword: str) -> None:
         """
-        处理搜索结果列表,返回处理后的数据列表
-
-        :param keyword: 搜索结果列表
-        :param store: 数据储存库
+        根据关键字采集
+        :param keyword: 搜索关键字
         :type keyword: str
-        :return: 处理后的数据列表
-        :rtype: list
-        :raises Exception: 如果处理失败,应抛出异常
         """
-        try:
-            if keyword:
-                # 实现处理逻辑
-                pass
-        except Exception as e:
-            raise Exception(f"处理失败: {e}")
+        pass
 
     def teardown(self) -> None:
         """
@@ -120,7 +155,7 @@ class IDataCollectionAdapter(ABC):
     def _check_is_collect_by_url(self, url: str) -> bool:
         old = self.store.query_one_collect_url(url)
         if old:
-            utils.get_logger().info(f"已采集过: {url}")
+            utils.get_logger().debug(f"已采集过: {url}")
             return True
         return False
 
@@ -144,13 +179,15 @@ class IDataCollectionAdapter(ABC):
 
     def _save_db(self, url, content, data_type=0, attach_str=None, is_invalid=False):
         if not self.store:
-            utils.get_logger().info(f"DataStore 未指定: {url},关键字{self.keyword}")
+            utils.get_logger().info(
+                f"DataStore 未指定: {url},关键字{self.cur_keyword}"
+            )
             return False
         else:
             status = 2 if is_invalid else 0
             data = CollectData(
                 url=url,
-                keyword=self.keyword,
+                keyword=self.cur_keyword,
                 content=content,
                 data_type=data_type,
                 attach_path=attach_str,

+ 5 - 1
SourceCode/TenderCrawler/app/config.yml

@@ -1,5 +1,6 @@
 #file: noinspection SpellCheckingInspection,SpellCheckingInspection,SpellCheckingInspection
 adapter:
+  max_error_count: 5
   chinabidding:
     #search_day: '今天'
     search_day: '近一周'
@@ -15,11 +16,13 @@ adapter:
 default_area: '全国'
 logger:
   file-path: './logs/'
+  level: 'debug'
 save:
   collect_data_key: '红外光谱仪,拉曼光谱仪'
   collect_batch_size: 100
   process_batch_size: 1 #AI处理一条插入一条
-  attach_file_path: './attaches/'
+  attach_file_path: './temp_files/attaches/'
+  report_file_path: './temp_files/report/'
 mysql:
   host: 192.168.0.81
   port: 3307
@@ -72,3 +75,4 @@ clean:
   collect_data: 30 # 清理多少天前的采集数据 0不清理
   process_data: 30 # 清理多少天前的处理数据[招标] 0不清理
   process_result_data: 60 # 清理多少天前的处理数据[中标] 0不清理 小于45会强制设为45
+  report: 90 # 清理多少天前的报表 0不清理  小于60会强制设为60

+ 12 - 0
SourceCode/TenderCrawler/app/jobs/data_clean.py

@@ -8,6 +8,9 @@ class DataClean:
     def __init__(self):
         self._clean_day = utils.get_config_int("clean.day", 30)
         self._clean_attach_day = utils.get_config_int("clean.attach", self._clean_day)
+        self._clean_report_day = utils.get_config_int("clean.report", self._clean_day)
+        if self._clean_report_day < 60:
+            self._clean_report_day = 60
         self._clean_log_day = utils.get_config_int("clean.log", self._clean_day)
         self._clean_collect_data_day = utils.get_config_int(
             "clean.collect_data", self._clean_day
@@ -30,6 +33,7 @@ class DataClean:
             self._clean_collect_data()
             self._clean_process_data()
             self._clean_process_result_data()
+            self._clean_report()
             utils.get_logger().info("清除历史文件数据 完成")
         except Exception as e:
             utils.get_logger().error(e)
@@ -42,6 +46,14 @@ class DataClean:
         utils.clean_attach_file(self._clean_attach_day)
         utils.get_logger().info("清除历史附件数据 完成")
 
+    def _clean_report(self):
+        if self._clean_report_day == 0:
+            utils.get_logger().info("跳过 清除中标报告报表")
+            return
+        utils.get_logger().info("开始 清除中标报告报表")
+        utils.clean_report_file(self._clean_report_day)
+        utils.get_logger().info("清除中标报告报表 完成")
+
     def _clean_log(self):
         if self._clean_log_day == 0:
             utils.get_logger().info("跳过 清除历史日志数据")

+ 4 - 2
SourceCode/TenderCrawler/app/jobs/data_collector.py

@@ -43,8 +43,10 @@ class DataCollector:
     def set_store(self, store: IDataStore) -> None:
         self._store = store
 
-    def collect(self, keyword: str):
-        adapters.collect(self.adapter, keyword, self.store)
+    def collect(self, keywords: str):
+        if not self.store:
+            raise Exception("未设置存储器")
+        adapters.collect(self.adapter, keywords, self.store)
 
     def close(self):
         utils.get_logger().info(f"关闭浏览器驱动,URL: {self.adapter.url}")

+ 70 - 57
SourceCode/TenderCrawler/app/jobs/data_send.py

@@ -52,7 +52,8 @@ class DataSend:
         title_prev = utils.get_config_value("email.report_title_prev", "【中标报告】")
         title = f"{start_date.month}月中标结果报告"
         body = self._build_report_email_html(title, items)
-        flag = utils.send_email(email, f"{title_prev} {title}", body, True)
+        attach_path = self._gen_report_exlecl(title, items)
+        flag = utils.send_email(email, f"{title_prev} {title}", body, True, attach_path)
         if flag:
             utils.get_logger().info("发送中标报告邮件成功")
 
@@ -177,10 +178,8 @@ class DataSend:
         """
         return html_body
 
-    def _build_report_email_html(self, title, items) -> str:
-        body = ""
-        for item in items:
-            body += self._build_report_email_body(item)
+    def _build_report_email_html(self, title, items: list[ProcessResultData]) -> str:
+        body = self._build_report_email_body(items)
         html = f"""
         <html>
         <head>
@@ -196,63 +195,52 @@ class DataSend:
                     color: #333;
                 }}
                 .container {{
-                    max-width: 600px;
+                    max-width: 1000px;
                     margin: 0 auto;
                     background-color: #fff;
                     padding: 20px;
                     border-radius: 8px;
                     box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
                 }}
-                .button-container {{
-                    text-align: center;
-                    margin-top: 20px;
-                }}
-                .button {{
-                    display: inline-block;
-                    padding: 10px 20px;
-                    font-size: 16px;
-                    color: #fff!important;
-                    background-color: #007bff;
-                    text-decoration: none;
-                    border-radius: 5px;
-                    transition: background-color 0.3s;
-                }}
-                .button:hover {{
-                    background-color: #0056b3;
-                }}
                 .system {{
                     color: #aaa;
+                    font-size: 80%;
                 }}
-                .card {{
+                .table-container {{
+                    overflow-x: auto;
+                    width: 100%;
+                }}
+                .table {{
+                    width: 1000px;
                     background-color: #ffffff;
                     border: 1px solid #dddddd;
                     border-radius: 8px;
                     margin-bottom: 20px;
                     padding: 20px;
                     box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+                    border-collapse: collapse;
                 }}
-                .card h2 {{
-                    margin-top: 0;
+                .table th, .table td {{
+                    padding: 5px;
+                    border-bottom: 1px solid #dddddd;
+                    word-wrap: break-word;
+                    text-align: center;
+                    font-size:12px;
                 }}
-                .card p {{
-                    margin: 0;
+                .table th:not(:first-child), .table td:not(:first-child) {{
+                    border-left: 1px solid #dddddd;
                 }}
-                .button-container {{
-                    text-align: center;
-                    margin-top: 15px;
+                .table th {{
+                    padding: 10px;
+                    background-color: #f8f9fa;
+                    font-weight: bold;
+                    font-size:14px;
                 }}
-                .button {{
-                    display: inline-block;
-                    padding: 6px 15px;
-                    font-size: 14px;
-                    color: #fff!important;
-                    background-color: #007bff;
-                    text-decoration: none;
-                    border-radius: 3px;
-                    transition: background-color 0.3s;
+                .table tr:last-child td {{
+                    border-bottom: none;
                 }}
-                .button:hover {{
-                    background-color: #0056b3;
+                .table td a{{
+                    color: #007bff;
                 }}
             </style>
         </head>
@@ -268,23 +256,48 @@ class DataSend:
         return html
 
     @staticmethod
-    def _build_report_email_body(item: ProcessResultData) -> str:
-        body = f"""
-           <div class="card">
-               <h2>{item.title}</h2>
-               <p><strong>项目编号:</strong> {item.no}</p>
-               <p><strong>公告日期:</strong> {item.date}</p>
-               <p><strong>关键词:</strong> {item.keyword}</p>
-               <p><strong>价格:</strong> {item.price}</p>
-               <p><strong>中标人:</strong> {item.bidder}</p>
-               <p><strong>摘要:</strong> {item.summary}</p>
-               <div class="button-container">
-                <a href="{item.url}" class="button">查看详情</a>
-               </div>
-           </div>
-           """
+    def _build_report_email_body(items: list[ProcessResultData]) -> str:
+        if not items:
+            return ""
+
+        body = """
+        <div class="table-container">
+            <table class="table">
+                <tr>
+                    <th style="width:200px">项目名称</th>
+                    <th style="width:150px">公告日期</th>
+                    <th style="width:120px">价格</th>
+                    <th>中标人</th>
+                </tr>
+        """
+        for item in items:
+            body += f"""
+            <tr>
+                <td><a  title="点击查看详情" href="{item.url}">{item.title}</a></td>
+                <td>{item.date}</td>
+                <td>{item.price}</td>
+                <td>{item.bidder}</td>
+            </tr>
+            """
+        body += "</table></div>"
         return body
 
+    @staticmethod
+    def _gen_report_exlecl(title, items: list[ProcessResultData]) -> str:
+        if not items:
+            return ""
+        # 将 list 数据转换为 DataFrame
+        data = {
+            "项目编号": [item.no for item in items],
+            "项目名称": [item.title for item in items],
+            "公告日期": [item.date for item in items],
+            "价格": [item.price for item in items],
+            "中标人": [item.bidder for item in items],
+            "公告摘要": [item.summary for item in items],
+            "URL": [item.url for item in items],
+        }
+        return utils.save_reort_excel(data, title)
+
     def _send_email_no_found(self) -> None:
         email = utils.get_config_value("email.error_email")
         utils.get_logger().info(f"开始发送区域邮箱未匹配邮件: {email}")

+ 1 - 4
SourceCode/TenderCrawler/app/jobs/job_runner.py

@@ -113,10 +113,7 @@ class JobRunner:
                         url_setting.password,
                         self.store,
                     )
-                    keywords = url_setting.keywords
-                    keyword_array = keywords.split(",")
-                    for keyword in keyword_array:
-                        data_collector.collect(keyword)
+                    data_collector.collect(url_setting.keywords)
                     utils.get_logger().info(f"采集完成: {url_setting.url}")
                 except Exception as e:
                     self._send_error_email(

+ 20 - 0
SourceCode/TenderCrawler/app/utils/__init__.py

@@ -108,6 +108,26 @@ def clean_attach_file(day: int):
     FileHelper().clean_attach_file(day)
 
 
+def save_reort_excel(data, file_name: str = None) -> str:
+    """
+    保存报表数据到Excel文件。
+
+    :param data: 列表,报表数据。
+    :param file_name: 字符串,保存的文件名(可选)。
+    :return: 字符串,保存的文件路径。
+    """
+    return FileHelper().save_report_excel(data, file_name)
+
+
+def clean_report_file(day: int):
+    """
+    清理指定天数之前的报表文件。
+
+    :param day: 整数,表示清理多少天前的报表文件。
+    """
+    FileHelper().clean_report_file(day)
+
+
 def to_array(s: str, split: str = ",") -> list[str]:
     """
     将字符串按指定分隔符拆分为数组。

+ 70 - 9
SourceCode/TenderCrawler/app/utils/file_helper.py

@@ -1,21 +1,29 @@
-import os, shutil,utils
+import os, shutil, utils
+import pandas as pd
 from datetime import datetime, timedelta
 from urllib.parse import urlparse
 
 import requests
 
 
-
-
 class FileHelper:
 
-    DEFAULT_ATTACH_PATH = "./attaches/"
+    DEFAULT_ATTACH_PATH = "./temp_files/attaches/"
+    DEFAULT_REPORT_PATH = "./temp_files/report/"
 
     def __init__(self):
-        path = utils.get_config_value("save.attach_file_path", self.DEFAULT_ATTACH_PATH)
-        path = path.replace("\\", "/")
-        path = path.replace("//", "/")
-        self._attach_file_path = path
+        attach_path = utils.get_config_value(
+            "save.attach_file_path", self.DEFAULT_ATTACH_PATH
+        )
+        attach_path = attach_path.replace("\\", "/")
+        attach_path = attach_path.replace("//", "/")
+        self._attach_file_path = attach_path
+        report_path = utils.get_config_value(
+            "save.report_file_path", self.DEFAULT_REPORT_PATH
+        )
+        report_path = report_path.replace("\\", "/")
+        report_path = report_path.replace("//", "/")
+        self._report_file_path = report_path
 
     def download_remote_file(self, file_url: str, file_name: str) -> str | None:
         utils.get_logger().info(f"下载远程文件: {file_url}  文件名:{file_name}")
@@ -103,4 +111,57 @@ class FileHelper:
                         # 如果目录名称不符合 %Y-%m/%d 格式,跳过
                         continue
         except Exception as e:
-            utils.get_logger().error(f"文件清理失败。Exception: {e}")
+            utils.get_logger().error(f"attach 文件清理失败。Exception: {e}")
+
+    def save_report_excel(self, data, file_name: str = None) -> str:
+        try:
+            df = pd.DataFrame(data)
+            file_path = os.path.join(
+                self._report_file_path, f'{datetime.now().strftime("%Y-%m-%d")}'
+            )
+            if not os.path.exists(file_path):
+                os.makedirs(file_path)
+            file_name = f"{file_name}_{datetime.now().strftime('%H%M%S')}.xlsx"
+            path = os.path.join(file_path, file_name)
+            path = path.replace("\\", "/")
+            path = path.replace("//", "/")
+            df.to_excel(path, index=False)
+            utils.get_logger().debug(f"Report保存成功: {file_name}")
+            return path
+        except Exception as e:
+            utils.get_logger().error(f"保存 Report Excel 文件失败。Exception: {e}")
+            return ""
+
+    def clean_report_file(self, day: int) -> None:
+        try:
+            current_time = datetime.now()
+            cutoff_time = current_time - timedelta(days=day)
+            for root, dirs, _ in os.walk(self._report_file_path):
+                for dir_name in dirs:
+                    path = os.path.join(root, dir_name)
+                    dir_path = (
+                        str(path).replace(self._report_file_path, "").replace("\\", "/")
+                    )
+                    if dir_path.count("/") > 0:
+                        continue
+                    try:
+                        dir_date = datetime.strptime(dir_path, "%Y-%m-%d")
+                        if dir_date < cutoff_time:
+                            try:
+                                shutil.rmtree(path)
+                                utils.get_logger().info(
+                                    f"  Report 删除目录及其内容: {dir_path}"
+                                )
+                            except PermissionError:
+                                utils.get_logger().error(
+                                    f"  Report 权限错误,无法删除目录: {dir_path}"
+                                )
+                            except Exception as e:
+                                utils.get_logger().error(
+                                    f"  Report 删除目录失败: {dir_path}。Exception: {e}"
+                                )
+                    except ValueError:
+                        # 如果目录名称不符合 %Y-%m-%d 格式,跳过
+                        continue
+        except Exception as e:
+            utils.get_logger().error(f"Report 文件清理失败。Exception: {e}")

+ 16 - 4
SourceCode/TenderCrawler/app/utils/logger_helper.py

@@ -16,7 +16,6 @@ class LoggerHelper:
 _log_file_name = f"{config.get('logger.file_name', 'crawler')}.log"
     _log_file_path = config.get("logger.file_path", "./logs")
     _log_level_string = config.get("logger.level", "INFO")
-    _log_level = logging.getLevelName(_log_level_string)
 
     def __new__(cls, *args, **kwargs):
         """
@@ -39,8 +38,9 @@ class LoggerHelper:
         """
         初始化日志记录器,包括设置日志级别、创建处理器和格式化器,并将它们组合起来
         """
+        log_level = self._get_log_level()
         self._logger = logging.getLogger("app_logger")
-        self._logger.setLevel(self._log_level)
+        self._logger.setLevel(log_level)
 
         if not os.path.exists(self._log_file_path):
             os.makedirs(self._log_file_path)
@@ -53,11 +53,11 @@ class LoggerHelper:
             backupCount=7,
             encoding="utf-8",
         )
-        file_handler.setLevel(logging.INFO)
+        file_handler.setLevel(log_level)
 
         # 创建控制台处理器
         console_handler = logging.StreamHandler()
-        console_handler.setLevel(logging.INFO)
+        console_handler.setLevel(logging.DEBUG)
 
         # 创建格式化器
         formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
@@ -70,6 +70,18 @@ class LoggerHelper:
         self._logger.addHandler(file_handler)
         self._logger.addHandler(console_handler)
 
+    def _get_log_level(self):
+        try:
+            # 尝试将字符串转换为 logging 模块中的日志级别常量
+            log_level = getattr(logging, self._log_level_string.upper())
+            if not isinstance(log_level, int):
+                raise ValueError
+            return log_level
+        except (AttributeError, ValueError):
+            raise ValueError(
+                f"配置logger出错: Unknown level: '{self._log_level_string}'"
+            )
+
     @classmethod
     def get_logger(cls):
         """

+ 2 - 1
SourceCode/TenderCrawler/docker-compose.yml

@@ -56,6 +56,7 @@ services:
       #      - APP_AI__KEY=
       #      - APP_AI__URL=http://192.168.0.109:7580/api/chat
       #      - APP_AI__MODEL=qwen2.5:7b
+      - APP_LOGGER__LEVEL=INFO
       - APP_JOB__COLLECT=20:00,12:00
       - APP_JOB__PROCESS=23:00,4:00,13:00
       - APP_JOB__SEND_EMAIL=08:20,14:00
@@ -64,7 +65,7 @@ services:
     volumes:
       - /home/docker/tender-crawler_v2/app/config.yml:/app/config.yml
       - /home/docker/tender-crawler_v2/app/logs:/app/logs
-      - /home/docker/tender-crawler_v2/app/attaches:/app/attaches
+      - /home/docker/tender-crawler_v2/app/temp_files:/app/temp_files
     #      - ./.dev/app/config.yml:/app/config.yml
     #      - ./.dev/app/logs:/app/logs
     #      - ./.dev/app/attaches:/app/attaches

+ 2 - 1
SourceCode/TenderCrawler/requirements.txt

@@ -1,9 +1,10 @@
 PyMySQL==1.1.1
 python_dateutil==2.9.0.post0
 PyYAML==6.0.2
-PyYAML==6.0.2
 Requests==2.32.3
 schedule==1.2.2
 selenium==4.27.1
 cryptography==41.0.4
 openai==1.58.1
+pandas~=2.2.3
+openpyxl==3.1.5