
Add historical data cleanup; optimize package names and code

YueYunyun 6 months ago
commit 1930b27a53
24 changed files with 371 additions and 91 deletions
  1. +13 -0   SourceCode/TenderCrawler/.script/Build_Dockerfile.run.xml
  2. +11 -0   SourceCode/TenderCrawler/.script/Run_TenderCrawler.run.xml
  3. +20 -15  SourceCode/TenderCrawler/app/adapters/ccgp_data_collection_adapter.py
  4. +1 -1    SourceCode/TenderCrawler/app/adapters/chinabidding_data_collection_adapter.py
  5. +16 -8   SourceCode/TenderCrawler/app/config.yml
  6. +0 -0    SourceCode/TenderCrawler/app/jobs/__init__.py
  7. +82 -0   SourceCode/TenderCrawler/app/jobs/data_clean.py
  8. +0 -0    SourceCode/TenderCrawler/app/jobs/data_collector.py
  9. +2 -2    SourceCode/TenderCrawler/app/jobs/data_process.py
  10. +7 -4   SourceCode/TenderCrawler/app/jobs/data_send.py
  11. +61 -27 SourceCode/TenderCrawler/app/jobs/job_runner.py
  12. +15 -7  SourceCode/TenderCrawler/app/main.py
  13. +19 -1  SourceCode/TenderCrawler/app/models/collect_data.py
  14. +15 -0  SourceCode/TenderCrawler/app/models/process_data.py
  15. +16 -1  SourceCode/TenderCrawler/app/models/process_result_data.py
  16. +1 -2   SourceCode/TenderCrawler/app/models/url_setting.py
  17. +5 -1   SourceCode/TenderCrawler/app/stores/data_store_interface.py
  18. +4 -1   SourceCode/TenderCrawler/app/stores/default_data_store.py
  19. +11 -0  SourceCode/TenderCrawler/app/stores/mysql_data_store.py
  20. +9 -6   SourceCode/TenderCrawler/app/utils/ai_helper.py
  21. +30 -2  SourceCode/TenderCrawler/app/utils/file_helper.py
  22. +26 -6  SourceCode/TenderCrawler/app/utils/logger_helper.py
  23. +5 -5   SourceCode/TenderCrawler/docker-compose.yml
  24. +2 -2   SourceCode/TenderCrawler/init.sql

+ 13 - 0
SourceCode/TenderCrawler/.script/Build_Dockerfile.run.xml

@@ -0,0 +1,13 @@
+<component name="ProjectRunConfigurationManager">
+  <configuration default="false" name="Build_Dockerfile" type="docker-deploy" factoryName="dockerfile" activateToolWindowBeforeRun="false" server-name="81">
+    <deployment type="dockerfile">
+      <settings>
+        <option name="imageTag" value="y_tender-crawler-app:2.0.1" />
+        <option name="buildOnly" value="true" />
+        <option name="containerName" value="" />
+        <option name="sourceFilePath" value="Dockerfile" />
+      </settings>
+    </deployment>
+    <method v="2" />
+  </configuration>
+</component>

+ 11 - 0
SourceCode/TenderCrawler/.script/Run_TenderCrawler.run.xml

@@ -0,0 +1,11 @@
+<component name="ProjectRunConfigurationManager">
+  <configuration default="false" name="Run_TenderCrawler" type="docker-deploy" factoryName="docker-compose.yml" server-name="81">
+    <deployment type="docker-compose.yml">
+      <settings>
+        <option name="envFilePath" value="" />
+        <option name="sourceFilePath" value="docker-compose.yml" />
+      </settings>
+    </deployment>
+    <method v="2" />
+  </configuration>
+</component>

+ 20 - 15
SourceCode/TenderCrawler/app/adapters/ccgp_data_collection_adapter.py

@@ -52,7 +52,7 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
             sleep(1)
             search_btn.click()
             wait.until(ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result")))
-            default_search_txt = "近周"
+            default_search_txt = "近1周"
             search_txt = self.config.get(self.search_day_key, default_search_txt)
             self.logger.info(f"搜索关键字: {keyword},搜索条件: {search_txt}")
             if search_txt != default_search_txt:
@@ -65,17 +65,19 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
                 wait.until(ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result")))
             else:
                 sleep(1)
-
-
-            # try:
-            #     a_links = self.driver.find_elements(
-            #         By.XPATH, "//form[@id='pagerSubmitForm']/a")
-            #     count = len(a_links)
-            #     if count > 1:
-            #         count = count - 1
-            #     self.logger.info(f"共查询到 {count} 页")
-            # except Exception as e:
-            #     self.logger.error(f"搜索失败[尝试查询页数]: {e}")
+            try:
+                p_els = self.driver.find_elements(By.XPATH, "//body/div[@class='vT_z']/div/div/p")
+                if len(p_els) > 0:
+                    self.logger.info(f" {p_els[0].text}")
+                else:
+                    a_links = self.driver.find_elements(
+                        By.XPATH, "//div[@class='vT-srch-result-list']/p/a")
+                    count = len(a_links)
+                    if count > 1:
+                        count = count - 1
+                    self.logger.info(f"共查询到 {count} 页,每页 20 条")
+            except Exception as e:
+                self.logger.error(f"搜索失败[尝试查询页数]: {e}")
             items = self.driver.find_elements(By.XPATH,
                                          "//ul[@class='vT-srch-result-list-bid']/li/a")
             return items
@@ -138,10 +140,13 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
             wait.until(ec.presence_of_element_located((By.TAG_NAME, "body")))
 
             content = self.driver.find_element(By.XPATH, "//div[@class='vF_deail_maincontent']").text
+            # 排除其他公告
+            if self._check_type("其他公告"):
+                self._save_db(url, content, 3, is_invalid=True)
+                return
             # 判断是否为投标公告
-            data_type = 0 if self._check_type("中标公告") or self._check_type("成交公告") or self._check_type(
-                    "终止公告") or self._check_type("其他公告") else 1
-
+            data_type = 1 if self._check_type("中标公告") or self._check_type("成交公告") or self._check_type(
+                    "终止公告")  else 0
             if self._check_content(content):
                 attach_str = self._attach_download()
                 self._save_db(url, content, data_type, attach_str)
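
The flipped data_type now matches the t_collect_data schema at the end of this commit (0 = 招标/tender notice, 1 = 中标/award notice), and "其他公告" pages are saved as invalid (type 3) instead of being mixed into either bucket. A condensed sketch of the classification, assuming only that check_type(label) tests the page's notice-type tag:

    # Sketch, not part of the diff: 1 = award notice, 0 = tender notice.
    AWARD_LABELS = ("中标公告", "成交公告", "终止公告")

    def classify(check_type) -> int:
        return 1 if any(check_type(label) for label in AWARD_LABELS) else 0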

+ 1 - 1
SourceCode/TenderCrawler/app/adapters/chinabidding_data_collection_adapter.py

@@ -102,7 +102,7 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
             count = len(a_links)
             if count > 1:
                 count = count - 1
-            self.logger.info(f"共查询到 {count} 页")
+            self.logger.info(f"共查询到 {count} 页,每页 10 条")
         except Exception as e:
             self.logger.error(f"搜索失败[尝试查询页数]: {e}")
         items = self.driver.find_elements(By.XPATH,

+ 16 - 8
SourceCode/TenderCrawler/app/config.yml

@@ -1,8 +1,7 @@
 adapter:
   chinabidding:
     #search_day: '今天'
-    #search_day: '近一周'
-    search_day: '近三天'
+    search_day: '近一周'
     model_name: 'chinabidding_data_collection_adapter'
     class_name: 'ChinabiddingDataCollectionAdapter'
     batch_save: True
@@ -34,9 +33,9 @@ ai:
   url: https://dashscope.aliyuncs.com/compatible-mode/v1
   model: qwen-plus
   max_tokens: 1024
-  system_prompt: 请帮我分析以下文字,提取出关键信息,并以json格式字符串返回,如果部分信息为空,则该字段返回为空。
-  prompt_template_1: 在以上内容中提取信息:编号(no) 、标题(title)、在哪个城市招标(area)、开标的时间(date)、开标的地点(address)、发布时间(release_date)、150字左右的招标条件要求及联系方式等内容摘要(summary),设备(devices)。提取出相关设备的名称信息,多个设备以逗号分割。返回包含no,title,area,date,address,release_date,summary,devices字段的json格式字符串,没有找到或未提供的信息json字段为空。
-  prompt_template_2: 在以上内容中提取信息:编号(no) 、标题(title)、公告时间(date)、标中的总价格(price)、标中的公司,多个以逗号分割(bidder)、150-300字的标的物说明,标的物价格,公司的明细等内容摘要(summary),设备(devices)。提取出相关设备的名称信息,多个设备以逗号分割。返回包含no,title,date,price,bidder,summary字段的json格式字符串,没有找到或未提供的信息json字段为空。
+  system_prompt: 分析文本,提取出关键信息,并以json格式字符串返回,如果部分信息为空,则该字段返回为空,返回的一定是可以解析的json对象
+  prompt_template_1: 在以上内容中提取信息:编号(no) 、标题(title)、在哪个城市招标(area)、开标的时间(date)、开标的地点(address)、发布时间(release_date)、150字左右的招标条件要求及联系方式等内容摘要(summary),设备(devices)。提取出相关设备的名称信息,多个设备以逗号分割。返回包含no,title,area,date,address,release_date,summary,devices字段的json格式字符串,没有找到或未提供的信息json字段为空,返回的一定是可以解析的json字符串
+  prompt_template_2: 在以上内容中提取信息:编号(no) 、标题(title)、公告时间(date)、标中的总价格,返回带单位的字符串(price)、标中的公司,多个以逗号分割(bidder)、150-300字的标的物说明,标的物价格,公司的明细等内容摘要(summary),设备(devices)。提取出相关设备的名称信息,多个设备以逗号分割。返回包含no,title,date,price,bidder,summary,devices字段的json格式字符串,没有找到或未提供的信息json字段为空,返回的一定是可以解析的json字符串
 email:
 #  smtp_server: smtp.exmail.qq.com
 #  smtp_port: 465
@@ -48,9 +47,9 @@ email:
   smtp_user: yueyunyun88@163.com
   smtp_password: FWRwBZKHTLHjHT5F
   from_email: yueyunyun88@163.com
-
-  #error_email: yueyy@iwbnet.com
-schedule:
+  error_email: yueyy@iwbnet.com
+job:
+  event_id: 1 # 改变这个值,整点会检测重新加载任务
   sleep_interval: 10
   #sleep_interval: 600 #单位:秒 10分钟检查一次
   collect: 06:00,22:00 # 每天采集数据时间
@@ -60,6 +59,15 @@ schedule:
   send_current_month_report_time: 08:20 # 每月几号记点发送本月中标报告
   send_prev_month_report_day: 1 # 每月几号发送上月中标报告
   send_prev_month_report_time: 08:20 # 每月几号记点发送上月中标报告
+  clean_data: 00:05 # 每日清理数据时间
   run_now: false
 selenium:
   remote_driver_url: http://127.0.0.1:3534/wd/hub
+clean:
+  day: 30 # 清理多少天前的数据 0不清理
+  # 下面的没有配置 默认使用 day 的配置
+  attach: 30 # 清理多少天前的附件 0不清理
+  log: 30 # 清理多少天前的日志 0不清理
+  collect_data: 30 # 清理多少天前的采集数据 0不清理
+  process_data: 30 # 清理多少天前的处理数据[招标] 0不清理
+  process_result_data: 60 # 清理多少天前的处理数据[中标] 0不清理 小于45会强制设为45
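
The new clean: block is consumed by jobs/data_clean.py below; a minimal sketch, assuming only ConfigHelper.get_int(key, default) as that file uses it, of how the per-type keys fall back to clean.day:

    # Sketch only (not part of this commit): the fallback chain implied by the clean: block.
    from utils.config_helper import ConfigHelper

    config = ConfigHelper()
    day = config.get_int("clean.day", 30)                      # global retention in days; 0 disables cleanup
    attach = config.get_int("clean.attach", day)                # unset keys inherit clean.day
    collect = config.get_int("clean.collect_data", day)
    result = config.get_int("clean.process_result_data", day)
    if result < 45:                                             # award records are kept at least 45 days
        result = 45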

+ 0 - 0
SourceCode/TenderCrawler/app/main/__init__.py → SourceCode/TenderCrawler/app/jobs/__init__.py


+ 82 - 0
SourceCode/TenderCrawler/app/jobs/data_clean.py

@@ -0,0 +1,82 @@
+from utils.config_helper import ConfigHelper
+from utils.logger_helper import LoggerHelper
+from utils.file_helper import FileHelper
+from stores.mysql_data_store import MysqlDataStore
+
+class DataClean:
+    _store = None
+    config = ConfigHelper()
+    logger = LoggerHelper.get_logger()
+
+
+    def __init__(self):
+        self._clean_day = self.config.get_int("clean.day", 30)
+        self._clean_attach_day = self.config.get_int("clean.attach", self._clean_day)
+        self._clean_log_day = self.config.get_int("clean.log", self._clean_day)
+        self._clean_collect_data_day = self.config.get_int("clean.collect_data", self._clean_day)
+        self._clean_process_data_day = self.config.get_int("clean.process_data", self._clean_day)
+        self._clean_process_result_data_day = self.config.get_int("clean.process_result_data", self._clean_day)
+        if self._clean_process_result_data_day < 45:
+            self._clean_process_result_data_day = 45
+        self._store = MysqlDataStore()
+
+    def clean(self):
+        try:
+            self.logger.info("开始 清除历史文件数据")
+            self._clean_attach()
+            self._clean_log()
+            self._clean_collect_data()
+            self._clean_process_data()
+            self._clean_process_result_data()
+            self.logger.info("清除历史文件数据 完成")
+        except Exception as e:
+            self.logger.error(e)
+
+    def _clean_attach(self):
+        if self._clean_attach_day == 0:
+            self.logger.info("跳过 清除历史附件数据")
+            return
+        self.logger.info("开始 清除历史附件数据")
+        FileHelper().clean_attach_file(self._clean_attach_day)
+        self.logger.info("清除历史附件数据 完成")
+
+    def _clean_log(self):
+        if self._clean_log_day == 0:
+            self.logger.info("跳过 清除历史日志数据")
+            return
+        self.logger.info("开始 清除历史日志数据")
+        LoggerHelper.delete_log(self._clean_log_day)
+        self.logger.info("清除历史日志数据 完成")
+
+    def _clean_collect_data(self):
+        if self._clean_collect_data_day == 0:
+            self.logger.info("跳过 清除历史采集数据")
+            return
+        self.logger.info("开始 清除历史采集数据")
+        date = self._get_before_date(self._clean_collect_data_day)
+        self._store.delete_collect_data_before_date(date)
+        self.logger.info("清除历史采集数据 完成")
+
+    def _clean_process_data(self):
+        if self._clean_process_data_day == 0:
+            self.logger.info("跳过 清除历史处理数据[招标]")
+            return
+        self.logger.info("开始 清除历史处理数据[招标]")
+        date = self._get_before_date(self._clean_process_data_day)
+        self._store.delete_process_data_before_date(date)
+        self.logger.info("清除历史处理数据[招标] 完成")
+
+    def _clean_process_result_data(self):
+        if self._clean_process_result_data_day == 0:
+            self.logger.info("跳过 清除历史处理数据[中标]")
+            return
+        self.logger.info("开始 清除历史处理数据[中标]")
+        date = self._get_before_date(self._clean_process_result_data_day)
+        self._store.delete_process_result_data_before_date(date)
+        self.logger.info("清除历史处理数据[中标] 完成")
+
+    @staticmethod
+    def  _get_before_date(day:int) -> str:
+        from datetime import datetime, timedelta
+        return (datetime.now() - timedelta(days=day)).strftime("%Y-%m-%d")
+
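
All delete_*_before_date calls receive a plain YYYY-MM-DD string; a quick worked example of the cutoff _get_before_date produces (dates illustrative):

    # Sketch: a 30-day retention evaluated on 2025-01-31 drops everything before 2025-01-01.
    from datetime import datetime, timedelta
    cutoff = (datetime(2025, 1, 31) - timedelta(days=30)).strftime("%Y-%m-%d")
    print(cutoff)  # 2025-01-01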

+ 0 - 0
SourceCode/TenderCrawler/app/main/data_collector.py → SourceCode/TenderCrawler/app/jobs/data_collector.py


+ 2 - 2
SourceCode/TenderCrawler/app/main/data_process.py → SourceCode/TenderCrawler/app/jobs/data_process.py

@@ -92,6 +92,7 @@ class DataProcess:
                         else:
                             self.store.set_process_result_other_urls(
                                 data.url, old.other_urls)
+                    self.store.set_collect_process(old.url)
                     self.logger.info(
                         f"ALREADY 编号: {data.no} URL:{old.other_urls}")
 
@@ -112,8 +113,7 @@ class DataProcess:
                     area_str_arr) > 1 else area_str_arr[0]
             if "市" in area_str:
                 area_str_arr = area_str.split("市")
-                area_str = area_str_arr[1] if len(
-                    area_str_arr) > 1 else area_str_arr[0]
+                area_str = area_str_arr[0]
 
             return ProcessData(
                 no=data.get("no"),

+ 7 - 4
SourceCode/TenderCrawler/app/main/data_send.py → SourceCode/TenderCrawler/app/jobs/data_send.py

@@ -2,6 +2,7 @@ from datetime import datetime
 import calendar
 
 from utils.logger_helper import LoggerHelper
+from utils.config_helper import ConfigHelper
 from utils.email_helper import EmailHelper
 from stores.data_store_interface import IDataStore
 from models.process_data import ProcessData
@@ -10,6 +11,7 @@ from models.process_result_data import ProcessResultData
 
 class DataSend:
     logger = LoggerHelper.get_logger()
+    config = ConfigHelper()
     _error_arr = []
     _email_area_arr = []
     _email_area_virtual_arr = []
@@ -49,9 +51,10 @@ class DataSend:
             self.logger.error("没有找到master email")
             return
         items = self.store.query_to_report_by_date(start_date, end_date)
+        title_prev = self.config.get("email.report_title_prev", "【中标报告】")
         title = f"{start_date.month}月中标结果报告"
         body = self._build_report_email_html(title, items)
-        flag = EmailHelper().send_email(email, title, body, True)
+        flag = EmailHelper().send_email(email, f"{title_prev} {title}",body, True)
         if flag:
             self.logger.info("发送中标报告邮件成功")
 
@@ -63,8 +66,9 @@ class DataSend:
             if item.area not in self._error_arr:
                 self._error_arr.append(item.area)
             return
+        title_prev = self.config.get("email.title_prev", "【招标信息】")
         body = self._build_email_html(item)
-        flag = EmailHelper().send_email(email, item.title, body, True, item.attach_path)
+        flag = EmailHelper().send_email(email, f"{title_prev} {item.title}", body, True, item.attach_path)
         if flag:
             self.store.set_send(item.no)
 
@@ -76,7 +80,7 @@ class DataSend:
         #     area_str = area_str_arr[1] if len(area_str) > 1 else area_str_arr[0]
         # if "市" in area:
         #     area_str_arr = area.split("市")
-        #     area_str = area_str_arr[1] if len(area_str) > 1 else area_str_arr[0]
+        #     area_str = area_str_arr[0]
         for area_item in self._email_area_arr:
             if area_str in area_item.area:
                 email = area_item.email
@@ -150,7 +154,6 @@ class DataSend:
         <body>
             <div class="container">
                 <h1>{item.title}</h1>
-                <p><strong>搜索关键字:</strong> {item.keyword}</p>
                 <p><strong>发布日期:</strong> {item.release_date}</p>
                 <p><strong>招标编号:</strong> {item.no}</p>
                 <p><strong>开标时间:</strong> {item.date}</p>

+ 61 - 27
SourceCode/TenderCrawler/app/main/runner.py → SourceCode/TenderCrawler/app/jobs/job_runner.py

@@ -1,34 +1,32 @@
-import schedule
+import schedule,threading
 from dateutil import parser
 from datetime import datetime
 
 from utils.logger_helper import LoggerHelper
 from utils.config_helper import ConfigHelper
+from utils.email_helper import EmailHelper
 from stores.mysql_data_store import MysqlDataStore
 from models.url_setting import UrlSetting
-from main.data_collector import DataCollector
-from main.data_process import DataProcess
-from main.data_send import DataSend
-from utils.email_helper import EmailHelper
+from jobs.data_collector import DataCollector
+from jobs.data_process import DataProcess
+from jobs.data_send import DataSend
+from jobs.data_clean import DataClean
 
 
-class Runner:
+class JobRunner:
     logger = LoggerHelper.get_logger()
     config = ConfigHelper()
     store = MysqlDataStore()  # 复用 store 对象
 
-    def run(self):
+    def run_job(self, is_run_now=True):
         try:
-            self.logger.info("应用程序已启动!")
-            urls = UrlSetting().fetch_all()
-            if not urls or len(urls) == 0:
-                self.logger.error("未找到任何 URL 设置")
-                return
-            self.logger.info(f"共找到 {len(urls)} 个 URL 设置")
+            self.logger.info("加载任务")
+
 
-            collect_time = self.config.get("schedule.collect")
-            process_time = self.config.get("schedule.process")
-            send_email_time = self.config.get("schedule.send_email")
+            collect_time = self.config.get("job.collect")
+            process_time = self.config.get("job.process")
+            send_email_time = self.config.get("job.send_email")
+            clean_data_time = self.config.get("job.clean_data")
 
             collect_times = self._validate_and_format_time(
                 collect_time, ["06:00"])
@@ -48,31 +46,53 @@ class Runner:
                 self.logger.info(f"{time} 执行  发送邮件   任务")
                 schedule.every().day.at(time).do(self._send_job)
 
-            if self.config.get_int("schedule.send_current_month_report_day")>0:
-                report_time = self.config.get("schedule.send_current_month_report_time")
+            if self.config.get_int("job.send_current_month_report_day")>0:
+                report_time = self.config.get("job.send_current_month_report_time")
                 times = self._validate_and_format_time(report_time,["08:20"])
                 for time in times:
-                    self.logger.info(f"每月{self._get_current_month_report_day()}日 {time} 执行  发送当月报告   任务")
+                    self.logger.info(f"每月{str(self._get_current_month_report_day()).rjust(2,"0")}日 {time} 执行  发送当月报告   任务")
                     schedule.every().day.at(time).do(self._send_prev_month_report_job)
 
-            if self.config.get_int("schedule.send_prev_month_report_day")>0:
-                report_time = self.config.get("schedule.send_prev_month_report_time")
+            if self.config.get_int("job.send_prev_month_report_day")>0:
+                report_time = self.config.get("job.send_prev_month_report_time")
                 times = self._validate_and_format_time(report_time, ["08:20"])
                 for time in times:
-                    self.logger.info(f"每月{self._get_prev_month_report_day()}日 {time} 执行  发送上月报告   任务")
+                    self.logger.info(f"每月{str(self._get_prev_month_report_day()).rjust(2,"0")}日 {time} 执行  发送上月报告   任务")
                     schedule.every().day.at(time).do(self._send_prev_month_report_job)
 
-            if self.config.get_bool("schedule.run_now"):
-                self.logger.info("立即执行任务")
+            clean_data_times = self._validate_and_format_time(
+                clean_data_time, ["00:05"])
+            self.logger.info(f"{clean_data_times[0]} 执行 清理数据 任务")
+            schedule.every().day.at(clean_data_times[0]).do(self._clean_job)
+
+            urls = UrlSetting().fetch_all()
+            if not urls or len(urls) == 0:
+                self.logger.error("未找到任何 URL 设置")
+                return
+            self.logger.info(f"共找到 {len(urls)} 个 URL 设置")
+            for url in urls:
+                self.logger.info(f"{url}")
+
+            if is_run_now and self.config.get_bool("job.run_now"):
+                self.logger.info("立即执行采集任务")
                 self._collect_process_job()
-                self._send_job()
+                # self._clean_job()
                 # self._process_job()
+                # self._send_job()
 
         except Exception as e:
             self.logger.error(f"应用程序停止: {e}")
             raise e
 
+    def restart_job(self):
+        schedule.clear()
+        self.logger.info("定时配置更新,重启任务")
+        self.run_job(False)
+
     def _collect_process_job(self):
+        threading.Thread(target=self._collect_process).start()
+
+    def _collect_process(self):
         try:
             self.logger.info("开始执行 数据采集处理 任务")
             url_setting = UrlSetting()
@@ -116,6 +136,9 @@ class Runner:
             self.logger.error(f"数据采集处理 任务执行失败: {e}")
 
     def _process_job(self):
+        threading.Thread(target=self._process).start()
+
+    def _process(self):
         try:
             self.logger.info("开始执行 AI处理数据 任务")
             data_process = DataProcess(self.store)
@@ -145,7 +168,7 @@ class Runner:
             self.logger.error(f"邮件发送当月报告 任务执行失败: {e}")
 
     def _get_current_month_report_day(self):
-        day = self.config.get_int("schedule.send_current_month_report_day",30)
+        day = self.config.get_int("job.send_current_month_report_day",30)
         if datetime.today().month==2 and day > 28 :
             day = 28
         if datetime.today().month in [4,6,9,11] and day > 30:
@@ -165,7 +188,7 @@ class Runner:
             self.logger.error(f"邮件发送上月报告 任务执行失败: {e}")
 
     def _get_prev_month_report_day(self):
-        day = self.config.get_int("schedule.send_prev_month_report_day",1)
+        day = self.config.get_int("job.send_prev_month_report_day",1)
         if datetime.today().month == 2 and day > 28:
             day = 28
         if datetime.today().month in [4, 6, 9, 11] and day > 30:
@@ -173,6 +196,16 @@ class Runner:
         if day > 31:
             day = 31
         return day
+
+    def _clean_job(self):
+        try:
+            self.logger.info("开始执行 清理数据 任务")
+            DataClean().clean()
+            self.logger.info("清理数据 任务执行完毕")
+        except Exception as e:
+            self._send_error_email("清理数据", f"\n    错误: {str(e)}")
+            self.logger.error(f"清理数据 任务执行失败: {e}")
+
     def _validate_and_format_time(self, time_str, default_time: list):
         """验证并格式化时间字符串"""
         if not time_str:
@@ -210,3 +243,4 @@ class Runner:
         title = f"{title}异常"
         content = f"{title},请及时处理。\n\n异常信息:{error}"
         email_helper.send_email(email, title, content, False, None)
+
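
The runner builds on the schedule package plus one thread per job, so a long crawl cannot block the polling loop. A minimal sketch of that pattern, independent of the repo's classes:

    # Sketch of the schedule + threading pattern JobRunner relies on.
    import schedule
    import threading
    import time

    def collect_job():
        # run the real work off the scheduler thread so run_pending() returns quickly
        threading.Thread(target=lambda: print("collecting...")).start()

    schedule.every().day.at("06:00").do(collect_job)  # run_job() makes this call for each configured time
    # restart_job() calls schedule.clear() and then registers everything again
    while True:
        schedule.run_pending()  # fires any job whose time has passed
        time.sleep(10)          # job.sleep_interval seconds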

+ 15 - 7
SourceCode/TenderCrawler/app/main.py

@@ -4,20 +4,28 @@ import schedule
 
 from utils.config_helper import ConfigHelper
 from utils.logger_helper import LoggerHelper
-from main.runner import Runner
+from jobs.job_runner import JobRunner
 
 logger = LoggerHelper.get_logger()
+config =  ConfigHelper()
 DEFAULT_USER_SLEEP_INTERVAL = 10  # 配置默认时间间隔10秒
+logger.info("应用程序启动...")
 
-runner = Runner()
-runner.run()
+job = JobRunner()
+job.run_job()
 
-interval = ConfigHelper().get_int("schedule.sleep_interval",DEFAULT_USER_SLEEP_INTERVAL)
+interval = config.get_int("job.sleep_interval",DEFAULT_USER_SLEEP_INTERVAL)
 
 if __name__ == '__main__':
     while True:
-        now = datetime.datetime.now()
-        if now.minute == 0:
-            logger.info("程序运行中...")
         schedule.run_pending()
+        now = datetime.datetime.now()
         time.sleep(interval)
+        # 重新加载配置及任务
+        if now.minute == 0 and now.second <= interval:
+            job_id = config.get("job.event_id")
+            config.load_config()
+            interval = config.get_int("job.sleep_interval", DEFAULT_USER_SLEEP_INTERVAL)
+            if job_id != config.get("job.event_id"):
+                job.restart_job()
+
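
Together with the job.event_id key added to config.yml, this loop gives a simple hot reload: edit the mounted config.yml and bump event_id, and at the top of the next hour the tasks are re-registered. Condensed from the code above:

    # Hourly check (sketch): re-read config.yml, restart jobs only when event_id changed.
    old_id = config.get("job.event_id")
    config.load_config()
    if old_id != config.get("job.event_id"):
        job.restart_job()  # schedule.clear() + run_job(False)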

+ 19 - 1
SourceCode/TenderCrawler/app/models/collect_data.py

@@ -153,7 +153,7 @@ class CollectData:
     def fetch_one_collect_by_url(self, url: str):
         with MySQLHelper() as db_helper:
             query = """
-                SELECT url,keyword,content,data_type,status FROM t_collect_data WHERE url = %s  LIMIT 1
+                SELECT url,keyword,content,data_type,attach_path,status FROM t_collect_data WHERE url = %s  LIMIT 1
             """
             result = db_helper.fetch_one(query, (url, ))
             if not result:
@@ -162,6 +162,7 @@ class CollectData:
                                keyword=result["keyword"],
                                content=result["content"],
                                data_type=result["data_type"],
+                               attach_path=result["attach_path"],
                                status=result["status"])
             return data
 
@@ -234,3 +235,20 @@ class CollectData:
             """
             params = (collect_data.status, collect_data.url)
             db_helper.execute_non_query(query, params)
+
+    def delete_before_date(self, date: str):
+        with MySQLHelper() as db_helper:
+            query1 = """
+                     DELETE FROM t_collect_data_history WHERE create_time < %s ;
+                     """
+            query2 = """
+                     DELETE FROM t_collect_data WHERE create_time < %s ;
+                     """
+            params = (date, )
+            db_helper.execute_non_query(query1, params)
+            affected_rows = db_helper.connection.affected_rows()
+            db_helper.execute_non_query(query2, params)
+            affected_rows += db_helper.connection.affected_rows()
+            self.logger.info(
+                f"删除 {date} 之前共 {affected_rows} 条 采集记录。")
+            return affected_rows

+ 15 - 0
SourceCode/TenderCrawler/app/models/process_data.py

@@ -195,3 +195,18 @@ class ProcessData:
             params = (url, )
             results = db_helper.execute_query(query, params)
             return True if results else False
+
+    def delete_before_date(self, date: str):
+        """
+        删除指定日期之前的数据
+        :param date: 日期字符串,格式为 YYYY-MM-DD
+        :return: 删除的行数
+        """
+        with MySQLHelper() as db_helper:
+            query = "DELETE FROM t_data WHERE date < %s"
+            params = (date, )
+            db_helper.execute_non_query(query, params)
+            affected_rows = db_helper.connection.affected_rows()
+            self.logger.info(
+                f"删除 {date} 之前共 {affected_rows} 条 招标处理记录。")
+            return affected_rows

+ 16 - 1
SourceCode/TenderCrawler/app/models/process_result_data.py

@@ -198,4 +198,19 @@ class ProcessResultData:
             params = (start_date, end_date)
             results = db_helper.execute_query(self._query_report, params)
             data = [ProcessResultData(**result) for result in results]
-            return data
+            return data
+
+    def delete_before_date(self, date: str):
+        """
+        删除指定日期之前的数据
+        :param date:
+        :return:
+        """
+        with MySQLHelper() as db_helper:
+            query = "DELETE FROM t_data_result WHERE create_time < %s"
+            params = (date, )
+            db_helper.execute_non_query(query, params)
+            affected_rows = db_helper.connection.affected_rows()
+            self.logger.info(
+                f"删除 {date} 之前共 {affected_rows} 条 中标处理记录。")
+            return affected_rows

+ 1 - 2
SourceCode/TenderCrawler/app/models/url_setting.py

@@ -24,8 +24,7 @@ class UrlSetting:
 
     def __repr__(self):
         return (
-            f"<UrlSetting(url={self.url}, type={self.adapter_type}, "
-            f"username={self.username}, keywords={self.keywords}, is_active={self.is_active})>"
+            f"URL配置[ url: {self.url}  type: {self.adapter_type} keywords: {self.keywords}]"
         )
 
     def to_dict(self):

+ 5 - 1
SourceCode/TenderCrawler/app/stores/data_store_interface.py

@@ -23,6 +23,10 @@ class IDataStore(ABC):
     def save_collect_data(self, is_force=False):
         raise NotImplementedError("save 应由子类重写。")
 
+    @abstractmethod
+    def set_collect_process(self, url):
+        raise NotImplementedError("set_collect_process 应由子类重写。")
+
     @abstractmethod
     def query_urls_to_process(self):
         raise NotImplementedError("query_to_process 应由子类重写。")
@@ -90,7 +94,7 @@ class IDataStore(ABC):
         raise NotImplementedError("get_emails 应由子类重写。")
 
     @abstractmethod
-    def query_all_virtual_emails(self, area: str):
+    def query_all_virtual_emails(self):
         raise NotImplementedError("get_email_by_area 应由子类重写。")
 
     @abstractmethod

+ 4 - 1
SourceCode/TenderCrawler/app/stores/default_data_store.py

@@ -17,7 +17,10 @@ class DefaultDataStore(IDataStore):
         self.logger.info("Default: INSERT_COLLECT_DATA")
 
     def save_collect_data(self, is_force=False):
-        self.logger.info("Default: SAVE")
+        self.logger.info("Default: SAVE_COLLECT_DATA")
+
+    def set_collect_process(self, url):
+        self.logger.info("Default: SET_COLLECT_PROCESS")
 
     def query_urls_to_process(self):
         self.logger.info("Default: QUERY_TO_PROCESS")

+ 11 - 0
SourceCode/TenderCrawler/app/stores/mysql_data_store.py

@@ -74,6 +74,9 @@ class MysqlDataStore(IDataStore):
             self._collectData.set_process_list(urls)
             self._process_list = []
 
+    def set_collect_process(self, url):
+        return self._collectData.set_process(url)
+
     def set_process_other_urls(self, url, other_urls: str):
         return self._processData.set_other_urls(url, other_urls)
 
@@ -134,3 +137,11 @@ class MysqlDataStore(IDataStore):
 
     def update_area_email_area_by_name(self, name: str, area: str):
         return self._areaEmail.update_area_email_area_by_name(name, area)
+
+    def delete_collect_data_before_date(self, date: str):
+        return self._collectData.delete_before_date(date)
+
+    def delete_process_data_before_date(self, date: str):
+        return self._processData.delete_before_date(date)
+    def delete_process_result_data_before_date(self, date: str):
+        return self._processResultData.delete_before_date(date)
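
These three pass-throughs complete the cleanup path added in this commit. A sketch, using only names introduced here, of exercising it directly against an illustrative cutoff:

    # Sketch (not in the commit): run the new deletes by hand.
    from stores.mysql_data_store import MysqlDataStore

    store = MysqlDataStore()
    cutoff = "2025-01-01"
    store.delete_collect_data_before_date(cutoff)         # t_collect_data + t_collect_data_history
    store.delete_process_data_before_date(cutoff)         # t_data (招标)
    store.delete_process_result_data_before_date(cutoff)  # t_data_result (中标)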

+ 9 - 6
SourceCode/TenderCrawler/app/utils/ai_helper.py

@@ -46,15 +46,18 @@ class AiHelper:
         )
         try:
             response = completion.model_dump_json()
-            self.logger.info(f"AI Response: {response}")
+
             response_json = json.loads(response)
             res_str = self._extract_message_content(response_json)
             result = self._parse_response(res_str, True)
-            usage = response_json["usage"]
-            result["completion_tokens"] = usage.get("completion_tokens", 0)
-            result["prompt_tokens"] = usage.get("prompt_tokens", 0)
-            result["total_tokens"] = usage.get("total_tokens", 0)
-            self.logger.info(f"AI Process JSON: {result}")
+            if result:
+                usage = response_json["usage"]
+                result["completion_tokens"] = usage.get("completion_tokens", 0)
+                result["prompt_tokens"] = usage.get("prompt_tokens", 0)
+                result["total_tokens"] = usage.get("total_tokens", 0)
+                # self.logger.info(f"AI Process JSON: {result}")
+            else:
+                self.logger.info(f"AI Response: {response}")
             return result
         except Exception as e:
             raise Exception(f"解析 AI 响应错误: {e}")

+ 30 - 2
SourceCode/TenderCrawler/app/utils/file_helper.py

@@ -1,6 +1,7 @@
 import os
+import shutil
 import requests
-from datetime import datetime
+from datetime import datetime, timedelta
 from urllib.parse import urlparse
 
 from utils.logger_helper import LoggerHelper
@@ -20,7 +21,7 @@ class FileHelper:
         self.logger.info(f"下载远程文件: {file_url}  文件名:{file_name}")
         current_timestamp = datetime.now().strftime("%H%M%S%f")[:-3]  # 取前三位毫秒
         file_name = f"{current_timestamp}@{file_name}"
-        file_path = os.path.join(self._attach_file_path, f'{datetime.now().strftime("%Y-%m/%d")}')
+        file_path = os.path.join(self._attach_file_path, f'{datetime.now().strftime("%Y-%m-%d")}')
         if not os.path.exists(file_path):
             os.makedirs(file_path)
         path = os.path.join(file_path, file_name)
@@ -65,3 +66,30 @@ class FileHelper:
         except Exception as e:
             self.logger.error(f"文件下载失败: {file_name}。Exception: {e}")
             return None
+
+
+    def clean_attach_file(self, day:int) -> None:
+        try:
+            current_time = datetime.now()
+            cutoff_time = current_time - timedelta(days=day)
+            for root, dirs, _ in os.walk(self._attach_file_path):
+                for dir_name in dirs:
+                    path = os.path.join(root, dir_name)
+                    dir_path = str(path).replace(self._attach_file_path,"").replace("\\","/")
+                    if dir_path.count("/") > 0:
+                        continue
+                    try:
+                        dir_date = datetime.strptime(dir_path, "%Y-%m-%d")
+                        if dir_date < cutoff_time:
+                            try:
+                                shutil.rmtree(path)
+                                self.logger.info(f"  删除目录及其内容: {dir_path}")
+                            except PermissionError:
+                                self.logger.error(f"  权限错误,无法删除目录: {dir_path}")
+                            except Exception as e:
+                                self.logger.error(f"  删除目录失败: {dir_path}。Exception: {e}")
+                    except ValueError:
+                        # 如果目录名称不符合 %Y-%m-%d 格式,跳过
+                        continue
+        except Exception as e:
+            self.logger.error(f"文件清理失败。Exception: {e}")

+ 26 - 6
SourceCode/TenderCrawler/app/utils/logger_helper.py

@@ -1,6 +1,7 @@
 import os
 import logging
 from logging.handlers import TimedRotatingFileHandler
+from datetime import datetime
 
 from utils.config_helper import ConfigHelper
 
@@ -11,7 +12,10 @@ class LoggerHelper:
     """
     _instance = None
     config = ConfigHelper()
-
+    _log_file_name = f"{config.get("logger.file_name", "crawler")}.log"
+    _log_file_path = config.get("logger.file_path", "./logs")
+    _log_level_string = config.get("logger.level", "INFO")
+    _log_level = logging.getLevelName(_log_level_string)
     def __new__(cls, *args, **kwargs):
         """
         实现单例模式,确保日志记录器仅被创建一次
@@ -26,6 +30,7 @@ class LoggerHelper:
                 raise Exception(f"配置logger出错: {e}")
         return cls._instance
 
+
     @property
     def logger(self):
         return self._logger
@@ -35,13 +40,13 @@ class LoggerHelper:
         初始化日志记录器,包括设置日志级别、创建处理器和格式化器,并将它们组合起来
         """
         self._logger = logging.getLogger('app_logger')
-        self._logger.setLevel(logging.INFO)
-        log_file_path = self.config.get("logger.file_path", "./logs")
-        if not os.path.exists(log_file_path):
-            os.makedirs(log_file_path)
+        self._logger.setLevel(self._log_level)
+
+        if not os.path.exists(self._log_file_path):
+            os.makedirs(self._log_file_path)
 
         # 创建按日期分割的文件处理器
-        file_handler = TimedRotatingFileHandler(os.path.join(log_file_path, 'crawler.log'),
+        file_handler = TimedRotatingFileHandler(os.path.join(self._log_file_path, self._log_file_name),
                                                 when='midnight',
                                                 interval=1,
                                                 backupCount=7,
@@ -73,3 +78,18 @@ class LoggerHelper:
         if not cls._instance:
             cls._instance = cls()
         return cls._instance._logger
+
+    @classmethod
+    def delete_log(cls, day:int) :
+        if not os.path.exists(cls._log_file_path):
+            return
+        for filename in os.listdir(cls._log_file_path):
+            if  filename!=cls._log_file_name and filename.startswith(cls._log_file_name):
+               try:
+                   file_path = os.path.join(cls._log_file_path, filename)
+                   file_time = datetime.strptime(filename.replace(f"{cls._log_file_name}.",""), "%Y-%m-%d")
+                   if (datetime.now() - file_time).days > day:
+                       os.remove(file_path)
+                       cls.get_logger().info(f"  删除日志文件: {file_path}")
+               except Exception as e:
+                   cls.get_logger().error(f"删除日志文件出错: {filename} {e}")
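
delete_log leans on the naming scheme of TimedRotatingFileHandler, which for when='midnight' rotates the active file to "<file_name>.<YYYY-MM-DD>". A small sketch with illustrative names:

    # Sketch of the filename check above.
    from datetime import datetime
    log_file_name = "crawler.log"
    rotated = "crawler.log.2025-01-10"                # what TimedRotatingFileHandler leaves behind
    stamp = rotated.replace(f"{log_file_name}.", "")  # "2025-01-10"
    age_days = (datetime.now() - datetime.strptime(stamp, "%Y-%m-%d")).days
    # rotated files older than clean.log days are removed; the active crawler.log is kept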

+ 5 - 5
SourceCode/TenderCrawler/docker-compose.yml

@@ -41,7 +41,7 @@ services:
 
   crawler-app:
     build: .
-    image: y_tender-crawler-app:2.0.0
+    image: y_tender-crawler-app:2.0.1
     container_name: y_tender-crawler-app
     depends_on:
       - crawler-mysql
@@ -56,10 +56,10 @@ services:
       #      - APP_AI__KEY=
       #      - APP_AI__URL=http://192.168.0.109:7580/api/chat
       #      - APP_AI__MODEL=qwen2.5:7b
-      - APP_SCHEDULE__COLLECT=20:00,12:00
-      - APP_SCHEDULE__PROCESS=23:00,4:00,13:00
-      - APP_SCHEDULE__SEND_EMAIL=08:20,14:00
-      - APP_SCHEDULE__RUN_NOW=1
+      - APP_JOB__COLLECT=20:00,12:00
+      - APP_JOB__PROCESS=23:00,4:00,13:00
+      - APP_JOB__SEND_EMAIL=08:20,14:00
+      - APP_JOB__RUN_NOW=1
       - APP_SELENIUM__REMOTE_DRIVER_URL=http://y_selenium:4444/wd/hub
     volumes:
       - /home/docker/tender-crawler_v2/app/config.yml:/app/config.yml

+ 2 - 2
SourceCode/TenderCrawler/init.sql

@@ -99,7 +99,7 @@ CREATE TABLE `t_collect_data`  (
   `keyword` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '检索到页面的关键字',
   `content` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL COMMENT '页面详情',
   `data_type` int(4) NOT NULL DEFAULT 0 COMMENT '数据类型 0:招标 1:中标',
-  `attach_path` varchar(1000) NULL DEFAULT NULL COMMENT '附件路径',
+  `attach_path` varchar(2000) NULL DEFAULT NULL COMMENT '附件路径',
   `status` int(4) NOT NULL DEFAULT 0 COMMENT '状态 0:未处理 1:已处理',
   `create_time` datetime NULL DEFAULT NULL COMMENT '创建时间',
   `process_time` datetime NULL DEFAULT NULL COMMENT '处理时间',
@@ -115,7 +115,7 @@ CREATE TABLE `t_collect_data_history`  (
   `keyword` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '检索到页面的关键字',
   `content` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL COMMENT '页面详情',
   `data_type` int(4) NOT NULL DEFAULT 0 COMMENT '数据类型 0:招标 1:中标',
-  `attach_path` varchar(1000) NULL DEFAULT NULL COMMENT '附件路径',
+  `attach_path` varchar(2000) NULL DEFAULT NULL COMMENT '附件路径',
   `status` int(4) NOT NULL DEFAULT 0 COMMENT '状态 0:未处理 1:已处理',
   `create_time` datetime NULL DEFAULT NULL COMMENT '创建时间',
   `process_time` datetime NULL DEFAULT NULL COMMENT '处理时间',