YueYunyun 8 сар өмнө
commit
e554683521
38 өөрчлөгдсөн 2374 нэмэгдсэн , 0 устгасан
  1. 2 0
      .gitattributes
  2. 162 0
      .gitignore
  3. BIN
      Doc/招标信息爬取推送需求分析.docx
  4. 6 0
      SourceCode/TenderCrawler/.dockerignore
  5. 6 0
      SourceCode/TenderCrawler/.env
  6. 3 0
      SourceCode/TenderCrawler/.script/cmd
  7. 28 0
      SourceCode/TenderCrawler/.vscode/launch.json
  8. 35 0
      SourceCode/TenderCrawler/Dockerfile
  9. 0 0
      SourceCode/TenderCrawler/app/__init__.py
  10. 0 0
      SourceCode/TenderCrawler/app/adapters/__init__.py
  11. 189 0
      SourceCode/TenderCrawler/app/adapters/chinabidding_data_collection_adapter.py
  12. 119 0
      SourceCode/TenderCrawler/app/adapters/data_collection_adapter_interface.py
  13. 41 0
      SourceCode/TenderCrawler/app/config.yml
  14. 0 0
      SourceCode/TenderCrawler/app/drivers/__init__.py
  15. 97 0
      SourceCode/TenderCrawler/app/drivers/driver_creator.py
  16. 27 0
      SourceCode/TenderCrawler/app/main.py
  17. 0 0
      SourceCode/TenderCrawler/app/main/__init__.py
  18. 87 0
      SourceCode/TenderCrawler/app/main/data_collector.py
  19. 71 0
      SourceCode/TenderCrawler/app/main/data_process.py
  20. 118 0
      SourceCode/TenderCrawler/app/main/data_send.py
  21. 150 0
      SourceCode/TenderCrawler/app/main/runner.py
  22. 51 0
      SourceCode/TenderCrawler/app/models/area_email.py
  23. 148 0
      SourceCode/TenderCrawler/app/models/collect_data.py
  24. 166 0
      SourceCode/TenderCrawler/app/models/process_data.py
  25. 55 0
      SourceCode/TenderCrawler/app/models/url_setting.py
  26. 0 0
      SourceCode/TenderCrawler/app/stores/__init__.py
  27. 56 0
      SourceCode/TenderCrawler/app/stores/data_store_interface.py
  28. 40 0
      SourceCode/TenderCrawler/app/stores/default_data_store.py
  29. 83 0
      SourceCode/TenderCrawler/app/stores/mysql_data_store.py
  30. 3 0
      SourceCode/TenderCrawler/app/utils/__init__.py
  31. 126 0
      SourceCode/TenderCrawler/app/utils/ai_helper.py
  32. 70 0
      SourceCode/TenderCrawler/app/utils/config_helper.py
  33. 73 0
      SourceCode/TenderCrawler/app/utils/email_helper.py
  34. 74 0
      SourceCode/TenderCrawler/app/utils/logger_helper.py
  35. 119 0
      SourceCode/TenderCrawler/app/utils/mysql_helper.py
  36. 81 0
      SourceCode/TenderCrawler/docker-compose.yml
  37. 80 0
      SourceCode/TenderCrawler/init.sql
  38. 8 0
      SourceCode/TenderCrawler/requirements.txt

+ 2 - 0
.gitattributes

@@ -0,0 +1,2 @@
+# Auto detect text files and perform LF normalization
+* text=auto

+ 162 - 0
.gitignore

@@ -0,0 +1,162 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+.dev/
+logs/

BIN
Doc/招标信息爬取推送需求分析.docx


+ 6 - 0
SourceCode/TenderCrawler/.dockerignore

@@ -0,0 +1,6 @@
+# .dockerignore
+.git
+logs
+__pycache__
+*.log
+*.pyc

+ 6 - 0
SourceCode/TenderCrawler/.env

@@ -0,0 +1,6 @@
+SELENIUM_CHROME_PORT=3534
+MYSQL_ROOT_PASSWORD=123456qwertyu
+MYSQL_DATABASE=iwb_data_collect_v1.0
+MYSQL_USER=iwb_data
+MYSQL_PASSWORD=123456iwb
+MYSQL_PORT=3535

+ 3 - 0
SourceCode/TenderCrawler/.script/cmd

@@ -0,0 +1,3 @@
+# 更新 requirements.txt
+# mysql8.0.39 需要添加依赖  cryptography==41.0.4
+pipreqs . --encoding=utf8 --force

+ 28 - 0
SourceCode/TenderCrawler/.vscode/launch.json

@@ -0,0 +1,28 @@
+{
+	// 使用 IntelliSense 了解相关属性。
+	// 悬停以查看现有属性的描述。
+	// 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
+	"version": "0.2.0",
+	"configurations": [
+		{
+			"name": "Python: DataCollection",
+			"type": "python",
+			"request": "launch",
+			"program": "app/main.py",
+			"console": "integratedTerminal",
+			"justMyCode": true,
+			"env": {
+				// "APP_MYSQL__HOST": "localhost",
+				// "APP_MYSQL__PORT": "3535",
+				// "APP_MYSQL__DB": "iwb_data_collect_v1.0",
+				// "APP_MYSQL__USER": "root",
+				// "APP_MYSQL__PASSWORD": "123456qwertyu",
+				"APP_SAVE__COLLECT_BATCH_SIZE": "20",
+				"APP_SAVE__PROCESS_BATCH_SIZE": "1",
+				"APP_SCHEDULE__COLLECT": "12:53",
+				"APP_SCHEDULE__SEND_EMAIL": "22:48",
+				"APP_SCHEDULE__RUN_NOW": "1"
+			}
+		}
+	]
+}

+ 35 - 0
SourceCode/TenderCrawler/Dockerfile

@@ -0,0 +1,35 @@
+# 第一阶段:构建
+# 使用官方的 Python 基础镜像
+FROM python:3.13-slim AS builder
+
+RUN mkdir /app
+
+WORKDIR /app
+# 明确指定 requirements.txt 的路径
+COPY requirements.txt .
+# 安装项目依赖
+RUN pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+# 在 builder 阶段添加调试命令
+# RUN pip freeze > installed-packages.txt
+
+# 复制项目文件到工作目录
+COPY ./app /app
+
+# 将/etc/localtime链接到上海时区文件
+RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
+
+# 第二阶段:运行
+FROM python:3.13-slim
+
+WORKDIR /app
+COPY --from=builder /usr/local/lib/python3.13/site-packages /usr/local/lib/python3.13/site-packages
+COPY --from=builder /app /app
+
+# 暴露端口(如果有需要)
+# EXPOSE 8080
+
+# 设置环境变量(如果有需要)
+# ENV MY_VARIABLE=value
+
+# 运行项目
+CMD ["python", "main.py"]

+ 0 - 0
SourceCode/TenderCrawler/app/__init__.py


+ 0 - 0
SourceCode/TenderCrawler/app/adapters/__init__.py


+ 189 - 0
SourceCode/TenderCrawler/app/adapters/chinabidding_data_collection_adapter.py

@@ -0,0 +1,189 @@
+from time import sleep
+
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import TimeoutException, NoSuchElementException
+
+from drivers.driver_creator import DriverCreator
+from stores.data_store_interface import IDataStore
+from adapters.data_collection_adapter_interface import IDataCollectionAdapter
+from utils.logger_helper import LoggerHelper
+from utils.config_helper import ConfigHelper
+
+
+class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
+    """
+    中国招标网数据采集适配器
+    """
+
+    logger = LoggerHelper.get_logger()
+
+    def __init__(self, url: str):
+        self._url = url
+        self._store = None
+        self._driver = None
+        self._keyword = None
+
+    @property
+    def store(self) -> IDataStore:
+        return self._store
+
+    @property
+    def url(self):
+        return self._url
+
+    @property
+    def keyword(self):
+        return self._keyword
+
+    @property
+    def driver(self):
+        if not self._driver:
+            self._driver = self.createDriver()
+        return self._driver
+
+    def createDriver(self) -> webdriver:
+        try:
+            return DriverCreator().GenRemoteDriver(self.url)
+        except Exception as e:
+            raise Exception(f"创建驱动器失败: {e}")
+
+    def login(self, driver, username: str, password: str) -> None:
+        try:
+            loginEl = driver.find_element(
+                By.XPATH, "//div[@id='loginRight']/a[@class='login']")
+            loginEl.click()
+            wait = WebDriverWait(driver, 10, 1)
+            wait.until(EC.presence_of_element_located((By.ID, "userpass")))
+            unEl = driver.find_element(By.ID, "username")
+            unEl.send_keys(username)
+            passEl = driver.find_element(By.ID, "userpass")
+            passEl.send_keys(password)
+            loginBtn = driver.find_element(By.ID, "login-button")
+            loginBtn.click()
+            wait.until(EC.presence_of_element_located((By.ID, "site-content")))
+        except TimeoutException as e:
+            raise Exception(f"登录失败 [超时]: {e}")
+        except NoSuchElementException as e:
+            raise Exception(f"登录失败 [找不到元素]: {e}")
+
+    def search(self, driver, keyword: str) -> list:
+        try:
+            self._keyword = keyword
+            wait = WebDriverWait(driver, 10, 1)
+            wait.until(
+                EC.presence_of_element_located((By.ID, "projSearchForm")))
+            searchEl = driver.find_element(By.ID, "fullText")
+            searchEl.send_keys(keyword)
+            searchBtn = driver.find_element(
+                By.XPATH, "//form[@id='projSearchForm']/button")
+            searchBtn.click()
+            wait.until(EC.presence_of_element_located((By.ID, "site-content")))
+            # 查询3天内的数据
+            search_txt = ConfigHelper().get("adapter.chinabidding.search_day")
+            if not search_txt:
+                search_txt = "近三天"
+            self.logger.info(f"搜索关键字: {keyword},搜索条件: {search_txt}")
+            lastEl = driver.find_element(By.LINK_TEXT, search_txt)
+            lastEl.click()
+            wait.until(EC.presence_of_element_located((By.ID, "site-content")))
+            try:
+                aLinks = driver.find_elements(
+                    By.XPATH, "//form[@id='pagerSubmitForm']/a")
+                count = len(aLinks)
+                if count > 1:
+                    count = count - 1
+                self.logger.info(f"共查询到 {count} 页")
+            except Exception as e:
+                self.logger.error(f"搜索失败[尝试查询页数]: {e}")
+            items = driver.find_elements(By.XPATH,
+                                         "//ul[@class='as-pager-body']/li/a")
+            return items
+        except TimeoutException as e:
+            raise Exception(f"搜索失败 [超时]: {e}")
+        except NoSuchElementException as e:
+            raise Exception(f"搜索失败 [找不到元素]: {e}")
+
+    def collect(self, driver, items: list, store: IDataStore) -> list:
+        if store:
+            self._store = store
+        self._process_list(driver, items)
+        self.store.save_collect_data(True)
+
+    def _next_page(self, driver) -> list:
+        try:
+            wait = WebDriverWait(driver, 10, 1)
+            nextPath = "//form[@id='pagerSubmitForm']/a[@class='next']"
+            wait.until(EC.presence_of_element_located((By.XPATH, nextPath)))
+            btn = driver.find_element(By.XPATH, nextPath)
+            btn.click()
+            self.logger.info(f"跳转到下页: {driver.current_url}")
+            wait.until(EC.presence_of_element_located((By.ID, "site-content")))
+            items = driver.find_elements(By.XPATH,
+                                         "//ul[@class='as-pager-body']/li/a")
+            return items
+        except NoSuchElementException as e:
+            raise Exception(f"翻页失败 [找不到元素]: {e}")
+        except TimeoutException:
+            self.logger.info("翻页结束")
+            return []
+
+    def _process_item(self, driver, item):
+        try:
+            currentHandle = driver.current_window_handle
+            url = item.get_attribute('href')
+            old = self.store.query_one_collect_by_url(url)
+            if old:
+                self.logger.info(f"已采集过: {url}")
+                return
+            item.click()
+            wait = WebDriverWait(driver, 10, 1)
+            wait.until(EC.number_of_windows_to_be(2))
+            handles = driver.window_handles
+            for handle in handles:
+                if handle != currentHandle:
+                    driver.switch_to.window(handle)
+                    break
+            url = driver.current_url
+            self.logger.info(f"跳转详情: {driver.current_url}")
+            wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
+            content = driver.find_element(By.TAG_NAME, "body").text
+            self._save(url, content)
+            sleep(1)
+            driver.close()
+            sleep(2)
+        except TimeoutException as e:
+            self.logger.error(
+                f"采集发生异常 Timeout: {driver.current_url}。Exception: {e}")
+            # raise Exception(f"采集失败 [超时]: {e}")
+        except NoSuchElementException as e:
+            self.logger.error(
+                f"采集发生异常 NoSuchElement: {driver.current_url}。Exception: {e}")
+            raise Exception(f"采集失败 [找不到元素]: {e}")
+        finally:
+            driver.switch_to.window(currentHandle)
+
+    def _save(self, url, content):
+        # self.logger.info(f"保存数据: {url},关键字{self.keyword}")
+        if not self.store:
+            self.logger.info(f"DataStore 未指定: {url},关键字{self.keyword}")
+        else:
+            self.store.insert_collect_data(url, self.keyword, content, True)
+
+    def _process_list(self, driver, items: list) -> list:
+        if not items:
+            return []
+        for item in items:
+            self._process_item(driver, item)
+        sleep(2)
+        next_items = self._next_page(driver)
+        return self._process_list(driver, next_items)
+
+    def teardown(self, driver) -> None:
+        try:
+            if driver:
+                driver.quit()
+        except Exception as e:
+            raise Exception(f"关闭驱动器失败: {e}")

+ 119 - 0
SourceCode/TenderCrawler/app/adapters/data_collection_adapter_interface.py

@@ -0,0 +1,119 @@
+from abc import ABC, abstractmethod
+from selenium import webdriver
+
+from stores.data_store_interface import IDataStore
+
+
+class IDataCollectionAdapter(ABC):
+    """
+    数据收集适配器抽象类
+    """
+
+    @property
+    @abstractmethod
+    def url(self):
+        """
+        驱动器初始打开的URL
+
+        :return:  驱动器初始打开的URL
+        :rtype: str
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def driver(self):
+        """
+        创建的驱动器
+
+        :return:  驱动器
+        :rtype: webdriver
+        """
+        pass
+
+    @abstractmethod
+    def createDriver(self) -> webdriver:
+        """
+        根据URL创建一个浏览器驱动器
+
+        :return: 创建的驱动器
+        :rtype: webdriver
+        :raises Exception: 如果创建驱动器失败,应抛出异常
+        """
+        try:
+            # 实现创建驱动器的逻辑
+            pass
+        except Exception as e:
+            raise Exception(f"创建驱动器失败: {e}")
+
+    @abstractmethod
+    def login(self, driver, username: str, password: str) -> None:
+        """
+        如果需要登录,则登录后跳转到搜索页面(不自动跳转的需要手动执行)
+
+        :param driver: 浏览器驱动器实例
+        :param username: 用户名
+        :type username: str
+        :param password: 密码
+        :type password: str
+        :raises Exception: 如果登录失败,应抛出异常
+        """
+        try:
+            # 实现登录逻辑
+            pass
+        except Exception as e:
+            raise Exception(f"登录失败: {e}")
+
+    @abstractmethod
+    def search(self, driver, keyword: str) -> list:
+        """
+        根据关键字搜索,返回搜索结果列表
+
+        :param driver: 浏览器驱动器实例
+        :param keyword: 搜索关键字
+        :type keyword: str
+        :return: 搜索结果列表
+        :rtype: list
+        :raises Exception: 如果搜索失败,应抛出异常
+        """
+        try:
+            results = []
+            # 实现搜索逻辑
+            return results if results else []
+        except Exception as e:
+            raise Exception(f"搜索失败: {e}")
+
+    @abstractmethod
+    def collect(self, driver, items: list, store: IDataStore) -> list:
+        """
+        处理搜索结果列表,返回处理后的数据列表
+
+        :param driver: 浏览器驱动器实例
+        :param items: 搜索结果列表
+        :type items: list
+        :return: 处理后的数据列表
+        :rtype: list
+        :raises Exception: 如果处理失败,应抛出异常
+        """
+        try:
+            processed_items = []
+            if items:
+                # 实现处理逻辑
+                pass
+            return processed_items
+        except Exception as e:
+            raise Exception(f"处理失败: {e}")
+
+    @abstractmethod
+    def teardown(self, driver) -> None:
+        """
+        关闭浏览器驱动器
+
+        :param driver: 浏览器驱动器实例
+        :raises Exception: 如果关闭驱动器失败,应抛出异常
+        """
+        try:
+            if driver:
+                driver.quit()
+        except Exception as e:
+            raise Exception(f"关闭驱动器失败: {e}")

+ 41 - 0
SourceCode/TenderCrawler/app/config.yml

@@ -0,0 +1,41 @@
+adapter:
+  chinabidding:
+    #search_day: '今天'
+    search_day: '近一月'
+    model_name: 'chinabidding_data_collection_adapter'
+    class_name: 'ChinabiddingDataCollectionAdapter'
+default_area: '全国'
+save:
+  collect_batch_size: 100
+  process_batch_size: 1 #AI处理一条插入一条
+mysql:
+  host: 192.168.0.81
+  port: 3307
+  db: iwb_data_collect_dev
+  user: root
+  password: Iwb-2024
+  charset: utf8mb4
+ai:
+  key: 1
+  url: http://192.168.0.109:7580/api/chat
+  # url: https://api.qwen.aliyun.com/v1/models/qwen/completions
+  model: qwen2.5:7b
+  max_tokens: 1024
+  system_prompt: 请帮我分析以下文字,提取出关键信息,并以json格式字符串返回,如果部分信息为空,则该字段返回为空。
+  prompt_template: 在以上内容中提取信息:编号(no) 、标题(title)、在哪个城市招标(area)、开标的时间(date)、开标的地点(address)、发布时间(release_date)、150字左右的招标条件要求及联系方式等内容摘要(summary)。编号一般在“招标编号:”的后面,例如 (招标编号:xxxxxxx...), “xxxxxxx...”就是编号(no)。返回包含no,title,area,date,address,release_date,summary字段的json格式字符串,没有找到或未提供的信息json字段为空。
+email:
+  smtp_server: smtp.exmail.qq.com
+  smtp_port: 587
+  smtp_user: yueyy@iwbnet.com
+  smtp_password: EXN38AtT97FX635c
+  from_email: yueyy@iwbnet.com
+  error_email: yueyy@iwbnet.com
+schedule:
+  sleep_interval: 10
+  #sleep_interval: 600 #单位:秒 10分钟检查一次
+  collect: 06:00,22:00 # 每天采集数据时间
+  process: 07:00,10:00 # 每天采集数据时间
+  send_email: 8:20,14:00 # 每天发送邮件时间
+  run_now: false
+selenium:
+  remote_driver_url: http://127.0.0.1:3534/wd/hub

+ 0 - 0
SourceCode/TenderCrawler/app/drivers/__init__.py


+ 97 - 0
SourceCode/TenderCrawler/app/drivers/driver_creator.py

@@ -0,0 +1,97 @@
+from selenium import webdriver
+
+from utils.logger_helper import LoggerHelper
+from utils.config_helper import ConfigHelper
+
+
+class DriverCreator:
+
+    logger = LoggerHelper.get_logger()
+
+    default_remote_driver_url = "http://127.0.0.1:4444/wd/hub"
+
+    def GenRemoteDriver(self, url):
+        # 设置Chrome选项
+        options = webdriver.ChromeOptions()
+
+        options.add_argument('--headless')  # 无头模式运行
+        options.add_argument('--no-sandbox')
+        options.add_argument('--disable-dev-shm-usage')
+        # 无痕浏览模式
+        options.add_argument('--incognito')
+
+        remote_driver_url = ConfigHelper().get('selenium.remote_driver_url')
+        if not remote_driver_url:
+            remote_driver_url = self.default_remote_driver_url
+            self.logger.error(
+                f"未配置远程驱动地址,使用默认地址{self.default_remote_driver_url}")
+        self.logger.info(f"远程驱动地址{remote_driver_url}")
+
+        # 创建远程浏览器驱动实例
+        driver = webdriver.Remote(command_executor=remote_driver_url,
+                                  options=options)
+        return self._genDriver(driver, url)
+
+    def GenChromeDriver(self, url):
+        # 设置Chrome选项,包括隐藏Selenium特征、设置代理IP和排除或关闭一些Selenium相关开关
+        options = webdriver.ChromeOptions()
+        options.add_experimental_option('excludeSwitches',
+                                        ['enable-automation'])
+        options.add_argument('--disable-blink-features=AutomationControlled')
+        options.add_argument('--disable-extensions')
+        # options.add_argument('--disable-gpu')
+        # options.add_argument('--disable-infobars')
+        options.add_argument('--disable-notifications')
+        # options.add_argument('--disable-popup-blocking')
+        # options.add_argument('--disable-web-security')
+        # options.add_argument('--ignore-certificate-errors')
+        # options.add_argument('--no-sandbox')
+        # 最大化窗口
+        options.add_argument('--start-maximized')
+        # 无痕浏览模式
+        options.add_argument('--incognito')
+        # options.add_argument('--user-data-dir=/dev/null')
+        # options.add_argument('--proxy-server={}'.format(proxy_address + ':' + proxy_port))
+        # options.add_argument('--proxy-auth={}:{}'.format(proxy_username, proxy_password))
+        # options.add_experimental_option('excludeSwitches', ['enable-automation', 'useAutomationExtension'])
+        # 阻止浏览器窗口自动关闭
+        # options.add_experimental_option('detach', True)
+        driver = webdriver.Chrome(options=options)  # 创建Chrome浏览器驱动实例
+        return self._genDriver(driver, url)
+
+    def _genDriver(self, driver, url):
+        # 检查是否为 ChromeDriver 或 FirefoxDriver
+        if isinstance(driver, (webdriver.Chrome, webdriver.Firefox)):
+            # 隐藏navigator.webdriver标志,将其值修改为false或undefined
+            driver.execute_cdp_cmd(
+                'Page.addScriptToEvaluateOnNewDocument', {
+                    'source':
+                    'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
+                })
+
+            user_agents = [
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
+                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/98.0",
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/98.0.1108.62 Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.62",
+                "Mozilla/5.0 (iPad; CPU OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1",
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 OPR/84.0.4140.129",
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/98.0.1108.48 Chrome/98.0.4758.102 Safari/537.36 EdgA/98.0.1108.48",
+            ]
+
+            user_agent = user_agents[len(url) % len(user_agents)]
+            # 设置user-agent,改变user-agent的值
+            driver.execute_cdp_cmd("Network.setUserAgentOverride",
+                                   {"userAgent": user_agent})
+        else:
+            self.logger.warning("当前驱动不支持 execute_cdp_cmd 方法")
+        # url 去除空字符串
+        url = url.strip()
+        driver.get(url)
+
+        # 设置隐式等待 5s
+        driver.implicitly_wait(5)
+        self.logger.info(f"创建浏览器驱动,URL: {url}")
+        return driver
+
+    def ShutdownDriver(driver):
+        driver.quit()

+ 27 - 0
SourceCode/TenderCrawler/app/main.py

@@ -0,0 +1,27 @@
+import time
+import schedule
+
+from utils.config_helper import ConfigHelper
+from utils.logger_helper import LoggerHelper
+from main.runner import Runner
+
+logger = LoggerHelper.get_logger()
+DEFAUlT_SLEEP_INTERVAL = 60 * 30  # 配置默认时间间隔30分钟
+
+runner = Runner()
+runner.run()
+
+try:
+    intervalStr = ConfigHelper().get("schedule.sleep_interval")
+    interval = int(intervalStr)
+except Exception:
+    interval = DEFAUlT_SLEEP_INTERVAL
+    logger.warning(
+        f"schedule.sleep_interval {intervalStr} 配置不正确, 使用默认配置: {DEFAUlT_SLEEP_INTERVAL}秒"
+    )
+
+if __name__ == '__main__':
+    while True:
+        logger.info(f"等待下次检查执行... {interval}秒后")
+        schedule.run_pending()
+        time.sleep(interval)

+ 0 - 0
SourceCode/TenderCrawler/app/main/__init__.py


+ 87 - 0
SourceCode/TenderCrawler/app/main/data_collector.py

@@ -0,0 +1,87 @@
+import importlib
+from selenium import webdriver
+
+from stores.data_store_interface import IDataStore
+from stores.default_data_store import DefaultDataStore
+from adapters.data_collection_adapter_interface import IDataCollectionAdapter
+from utils.logger_helper import LoggerHelper
+from utils.config_helper import ConfigHelper
+
+
+class DataCollector:
+
+    logger = LoggerHelper.get_logger()
+    config = ConfigHelper()
+    _adapter = None
+    _driver = None
+    _store = None
+
+    # 使用字典映射域名和适配器类
+    # _adapterModelMap = {"chinabidding": "chinabidding_data_collection_adapter"}
+
+    # _adapterClassMap = {"chinabidding": "ChinabiddingDataCollectionAdapter"}
+
+    def __init__(self,
+                 type: str,
+                 url: str,
+                 un: str,
+                 up: str,
+                 store: IDataStore = None):
+        self._adapter = self._genAdapter(type, url)
+        self._driver = self.adapter.createDriver()
+        # if type == "chinabidding":
+        #     return
+        self.adapter.login(self.driver, un, up)
+        if store:
+            self._store = store
+        else:
+            self._store = DefaultDataStore()
+
+    @property
+    def driver(self) -> webdriver:
+        return self._driver
+
+    @property
+    def store(self) -> IDataStore:
+        return self._store
+
+    @property
+    def adapter(self) -> IDataCollectionAdapter:
+        return self._adapter
+
+    def setStore(self, store: IDataStore) -> None:
+        self._store = store
+
+    def collect(self, keyword: str):
+        items = self.adapter.search(self.driver, keyword)
+        self.adapter.collect(self.driver, items, self.store)
+
+    def close(self):
+        self.logger.info(f"关闭浏览器驱动,URL: {self.adapter.url}")
+        self.adapter.teardown(self.driver)
+
+    def collectWithStore(self, keyword: str, store: IDataStore):
+        self.setStore(store)
+        self.collect(keyword)
+
+    def _genAdapter(self, type: str, url: str):
+        adapterModelName = self.config.get(f"adapter.{type}.model_name")
+        adapterClassName = self.config.get(f"adapter.{type}.class_name")
+        if adapterClassName:
+            try:
+                self.logger.info(
+                    f"生成适配器 TYPE:{type},适配器: {adapterClassName},URL:{url}")
+                # 使用 importlib 动态导入模块
+                adapterModule = importlib.import_module(
+                    f"adapters.{adapterModelName}")
+                adapterClass = getattr(adapterModule, adapterClassName)
+                adapter = adapterClass(url)
+            except ImportError as e:
+                raise ImportError(f"无法导入适配器模块 {adapterModelName}") from e
+            except AttributeError as e:
+                raise AttributeError(
+                    f"适配器模块 {adapterModelName} 中找不到类 {adapterClassName}"
+                ) from e
+        else:
+            raise Exception("不支持的适配器类型")
+        return adapter

+ 71 - 0
SourceCode/TenderCrawler/app/main/data_process.py

@@ -0,0 +1,71 @@
+from utils.logger_helper import LoggerHelper
+from utils.ai_helper import AiHelper
+from stores.data_store_interface import IDataStore
+from models.collect_data import CollectData
+from models.process_data import ProcessData
+
+
+class DataProcess:
+    logger = LoggerHelper.get_logger()
+
+    _store = None
+
+    def __init__(self, store: IDataStore):
+        self._store = store
+
+    @property
+    def store(self) -> IDataStore:
+        return self._store
+
+    def process(self):
+        try:
+            urls = self.store.query_urls_to_process()
+            for item in urls:
+                self._process_item(item)
+            self.store.save_process_data(True)
+        except Exception as e:
+            self.logger.error(f"数据处理过程中发生异常: {e}")
+
+    def _process_item(self, url: str) -> None:
+        self.logger.info("START ==>" + url)
+        item = self.store.query_one_collect_by_url(url)
+        if not item:
+            self.logger.info("END: NOT FOUND URL==>" + url)
+            return
+        if item.status == 1:
+            self.logger.info("ALREADY URL==>" + url)
+            return
+        data = self._ai_process(item)
+        if data:
+            old = None
+            if data.no:
+                old = self.store.query_one_process_by_no(data.no)
+            if not old:
+                data.url = url
+                data.keyword = item.keyword
+                self.store.insert_process_data(data)
+            else:
+                if old.url != url:
+                    if old.other_urls:
+                        old.other_urls += f",{url}"
+                    else:
+                        old.other_urls = url
+                    self.store.set_process_other_urls(data.url, old.other_urls)
+                self.logger.info(f"ALREADY 编号: {data.no} URL:{old.other_urls}")
+
+        self.logger.info("END   ==>" + url)
+
+    def _ai_process(self, item: CollectData) -> ProcessData:
+        try:
+            data = AiHelper().call_ai(item.content)
+            return data
+        except Exception as e:
+            self.logger.error(f"AI 提取数据失败: {item.url} {e}")
+            return None
+
+    # def _generate_unique_id(self) -> str:
+    #     from datetime import datetime
+    #     current_time = datetime.now().strftime("%Y%m%d%H%M%S%f")
+    #     thread_id = threading.current_thread().ident
+    #     unique_id = f"{current_time}-{thread_id}"
+    #     return unique_id

+ 118 - 0
SourceCode/TenderCrawler/app/main/data_send.py

@@ -0,0 +1,118 @@
+from utils.logger_helper import LoggerHelper
+from utils.email_helper import EmailHelper
+from stores.data_store_interface import IDataStore
+from models.process_data import ProcessData
+
+
+class DataSend:
+    logger = LoggerHelper.get_logger()
+    _error_arr = []
+
+    @property
+    def store(self) -> IDataStore:
+        return self._store
+
+    def __init__(self, store: IDataStore):
+        self._store = store
+
+    def send(self) -> None:
+        self._error_arr = []
+        list = self.store.query_to_send()
+        self.logger.info(f"开始发送邮件,数量为 {len(list)}")
+        for item in list:
+            self._send_item(item)
+        if len(self._error_arr) > 0:
+            self._send_email_no_found()
+
+    def _send_item(self, item: ProcessData) -> None:
+        self.logger.info(f"开始发送邮件,地区为:{item.area} ,URL为 {item.url}")
+        email = self.store.get_email_by_area(item.area)
+        if not email:
+            self.logger.error(f"{item.area} 下没有找到email")
+            if (item.area not in self._error_arr):
+                self._error_arr.append(item.area)
+            return
+        body = self._build_email_content(item)
+        flag = EmailHelper().send_email(email, item.title, body, True, None)
+        if flag:
+            self.store.set_send(item.no)
+
+    def _build_email_content(self, item: ProcessData, other: str = "") -> str:
+        html_body = f"""
+        <html>
+        <head>
+            <style>
+                body {{
+                    background-color: #f4f4f9;
+                    font-family: Arial, sans-serif;
+                    margin: 0;
+                    padding: 20px;
+                }}
+                h1 {{
+                    text-align: center;
+                    color: #333;
+                }}
+                .container {{
+                    max-width: 600px;
+                    margin: 0 auto;
+                    background-color: #fff;
+                    padding: 20px;
+                    border-radius: 8px;
+                    box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
+                }}
+                .button-container {{
+                    text-align: center;
+                    margin-top: 20px;
+                }}
+                .button {{
+                    display: inline-block;
+                    padding: 10px 20px;
+                    font-size: 16px;
+                    color: #fff!important;
+                    background-color: #007bff;
+                    text-decoration: none;
+                    border-radius: 5px;
+                    transition: background-color 0.3s;
+                }}
+                .button:hover {{
+                    background-color: #0056b3;
+                }}
+                .system {{
+                    color: #aaa;
+                }}
+
+            </style>
+        </head>
+        <body>
+            <div class="container">
+                <h1>{item.title}</h1>
+                <p><strong>搜索关键字:</strong> {item.keyword}</p>
+                <p><strong>发布日期:</strong> {item.release_date}</p>
+                <p><strong>招标编号:</strong> {item.no}</p>
+                <p><strong>开标时间:</strong> {item.date}</p>
+                <p><strong>开标地点:</strong> {item.address}</p>
+                <p><strong>标书摘要:</strong> {item.summary}</p>
+                <div class="button-container">
+                    <a href="{item.url}" class="button">查看详情</a>
+                </div>
+                <div>
+                    <h3>{other}</h3>
+                </div>
+                <p class="system">本邮件由系统自动发送,请勿回复。</p>
+
+            </div>
+        </body>
+        </html>
+        """
+        return html_body
+
+    def _send_email_no_found(self) -> None:
+        email = EmailHelper().config.get("email.error_email")
+        self.logger.info(f"开始发送区域邮箱未匹配邮件: {email}")
+        if not email:
+            return
+        title = "Warning: 相关地区没有匹配到邮箱,请及时添加相关配置"
+        content = "以下区域中没有配置邮箱:\n\n    "
+        content += "、".join(self._error_arr)
+        content += "\n\n请及时添加相关配置。"
+        EmailHelper().send_email(email, title, content, False, None)

+ 150 - 0
SourceCode/TenderCrawler/app/main/runner.py

@@ -0,0 +1,150 @@
+from dateutil import parser
+import schedule
+
+from utils.logger_helper import LoggerHelper
+from utils.config_helper import ConfigHelper
+from stores.mysql_data_store import MysqlDataStore
+from models.url_setting import UrlSetting
+from main.data_collector import DataCollector
+from main.data_process import DataProcess
+from main.data_send import DataSend
+from utils.email_helper import EmailHelper
+
+
+class Runner:
+    logger = LoggerHelper.get_logger()
+    config = ConfigHelper()
+    store = MysqlDataStore()  # 复用 store 对象
+
+    def run(self):
+        self.logger.info("应用程序已启动!")
+        urls = UrlSetting().fetch_all()
+        if not urls or len(urls) == 0:
+            self.logger.error("未找到任何 URL 设置")
+            return
+        self.logger.info(f"共找到 {len(urls)} 个 URL 设置")
+
+        collect_time = self.config.get("schedule.collect")
+        process_time = self.config.get("schedule.process")
+        send_email_time = self.config.get("schedule.send_email")
+
+        collect_times = self._validate_and_format_time(collect_time, ["06:00"])
+        for time in collect_times:
+            self.logger.info(f"{time} 执行 采集处理数据 任务")
+            schedule.every().day.at(time).do(self._collect_process_job)
+
+        process_times = self._validate_and_format_time(
+            process_time, ["10:00", "15:00", "19:00"])
+        for time in process_times:
+            self.logger.info(f"{time} 执行  AI处理数据  任务")
+            schedule.every().day.at(time).do(self._process_job)
+
+        send_email_times = self._validate_and_format_time(
+            send_email_time, ["08:20", "14:00"])
+        for time in send_email_times:
+            self.logger.info(f"{time} 执行   发送邮件   任务")
+            schedule.every().day.at(time).do(self._send_job)
+        run_now = self.config.get("schedule.run_now")
+        if run_now and (str(run_now).lower() == 'true' or str(run_now) == '1'):
+            self.logger.info("立即执行任务")
+            self._collect_process_job()
+            self._send_job()
+            # self._process_job()
+
+    def _collect_process_job(self):
+        try:
+            self.logger.info("开始执行数据采集处理任务")
+            urlSetting = UrlSetting()
+            for url_setting in urlSetting.fetch_all():
+                try:
+                    self.logger.info(f"开始采集: {url_setting.url}")
+                    dataCollector = DataCollector(url_setting.type,
+                                                  url_setting.url,
+                                                  url_setting.username,
+                                                  url_setting.password,
+                                                  self.store)
+                    keywords = url_setting.keywords
+                    keywordArray = keywords.split(',')
+                    for keyword in keywordArray:
+                        dataCollector.collect(keyword)
+                    self.logger.info(f"采集完成: {url_setting.url}")
+                except Exception as e:
+                    self._send_error_email(
+                        "数据采集",
+                        f"\n    Type: {url_setting.type} \n    Url: {url_setting.url}\n    错误: {str(e)}"
+                    )
+                    self.logger.error(f"采集发生异常: {e}")
+                finally:
+                    dataCollector.close()
+
+                try:
+                    self.logger.info(f"开始AI处理: {url_setting.url}")
+                    dataProcess = DataProcess(self.store)
+                    dataProcess.process()
+                except Exception as e:
+                    self._send_error_email(
+                        "AI数据处理",
+                        f"\n    Type: {url_setting.type} \n    Url: {url_setting.url}\n    错误: {str(e)}"
+                    )
+                    self.logger.error(f"AI处理发生异常: {e}")
+                    break  # 中断当前 URL 设置的处理
+            self.logger.info("数据采集处理任务执行完毕")
+        except Exception as e:
+            self.logger.error(f"数据采集处理任务执行失败: {e}")
+
+    def _process_job(self):
+        try:
+            self.logger.info("开始AI处理数据执行任务")
+            dataProcess = DataProcess(self.store)
+            dataProcess.process()
+            self.logger.info("AI处理数据任务执行完毕")
+        except Exception as e:
+            self._send_error_email("AI数据处理", f"\n    错误: {str(e)}")
+            self.logger.error(f"AI任务执行失败: {e}")
+
+    def _send_job(self):
+        try:
+            self.logger.info("开始邮件发送执行任务")
+            DataSend(self.store).send()
+            self.logger.info("邮件发送任务执行完毕")
+        except Exception as e:
+            self._send_error_email("邮件发送", f"\n    错误: {str(e)}")
+            self.logger.error(f"邮件发送任务执行失败: {e}")
+
+    def _validate_and_format_time(self, time_str, default_time: list):
+        """验证并格式化时间字符串"""
+        if not time_str:
+            return default_time
+        time_str = time_str.strip().replace(',', ',')
+        # 分割字符串为列表
+        items = [
+            item.strip().strip("'").strip('"') for item in time_str.split(',')
+        ]
+
+        # 初始化结果列表
+        formatted_times = []
+
+        for item in items:
+            if not item:
+                continue  # 跳过空字符串
+            try:
+                item = item.replace(':', ':')
+                # 使用 dateutil.parser 解析时间字符串
+                parsed_time = parser.parse(item).time().strftime('%H:%M:%S')
+                formatted_times.append(parsed_time)
+            except Exception as e:
+                self.logger.error(f"配置时间解析错误: {item},: {e} ")
+        if len(formatted_times) == 0:
+            self.logger.error(f"解析时间失败,使用默认时间 {default_time}")
+            return default_time
+        return formatted_times
+
+    def _send_error_email(self, title: str, error: str) -> None:
+        email_helper = EmailHelper()
+        email = self.config.get("email.error_email")
+        self.logger.info(f"发送错误邮件: {email}")
+        if not email:
+            return
+        title = f"{title}异常"
+        content = f"{title},请及时处理。\n\n异常信息:{error}"
+        email_helper.send_email(email, title, content, False, None)

+ 51 - 0
SourceCode/TenderCrawler/app/models/area_email.py

@@ -0,0 +1,51 @@
+from utils.mysql_helper import MySQLHelper
+
+
+class AreaEmail:
+
+    def __init__(self, name=None, area=None, email=None):
+        self.name = name
+        self.area = area
+        if email is None:
+            email = ""
+        self.email = email.replace(",", ",")
+
+    def __repr__(self):
+        return (
+            f"<AreaEmail(name={self.name},area={self.area}, email={self.email}, "
+            f"is_active={self.is_active}, remark={self.remark})>")
+
+    def to_dict(self):
+        return {
+            'area': self.area,
+            'email': self.email,
+        }
+
+    # # 插入 AreaEmail 数据
+    # def insert(self, area_email):
+    #     if not isinstance(area_email, AreaEmail):
+    #         raise TypeError("area_email 必须是 AreaEmail 的实例")
+    #     with MySQLHelper() as db_helper:
+    #         query = (
+    #             "INSERT INTO t_area_email (area, email, is_active, remark) "
+    #             "VALUES (%s, %s, %s, %s)")
+    #         params = (area_email.area, area_email.email, area_email.is_active,
+    #                   area_email.remark)
+    #         db_helper.execute_non_query(query, params)
+
+    # 查询 AreaEmail 数据
+    def fetch_all(self):
+        with MySQLHelper() as db_helper:
+            query = "SELECT name,area,email FROM t_area_email WHERE is_active = 1"
+            results = db_helper.execute_query(query)
+            data = [AreaEmail(**result) for result in results]
+            return data
+
+    def fetch_one_by_area(self, area: str):
+        with MySQLHelper() as db_helper:
+            query = "SELECT email FROM t_area_email WHERE CONCAT(area,',') like %s AND is_active = 1"
+            params = ('%' + area + ',%', )
+            result = db_helper.fetch_one(query, params)
+            if result is None:
+                return None
+            return result["email"]

+ 148 - 0
SourceCode/TenderCrawler/app/models/collect_data.py

@@ -0,0 +1,148 @@
+from datetime import datetime
+from utils.mysql_helper import MySQLHelper
+from utils.logger_helper import LoggerHelper
+
+
+class CollectData:
+
+    logger = LoggerHelper.get_logger()
+
+    def __init__(self,
+                 url=None,
+                 keyword=None,
+                 content=None,
+                 status=None,
+                 create_time=None,
+                 process_time=None):
+        self.url = url
+        self.keyword = keyword
+        self.content = content
+        self.status = status
+        self.create_time = create_time or datetime.now()
+        self.process_time = process_time
+
+    def __repr__(self):
+        return (
+            f"CollectData(url={self.url}, keyword={self.keyword}, "
+            f"content={self.content}, status={self.status}, "
+            f"create_time={self.create_time}, process_time={self.process_time})"
+        )
+
+    def insert(self, collect_data):
+        if not isinstance(collect_data, self.__class__):
+            raise TypeError("collect_data 不是 CollectData 的实例")
+        with MySQLHelper() as db_helper:
+            query = """
+                INSERT IGNORE INTO t_collect_data (url, keyword, content, status, create_time)
+                VALUES (%s, %s, %s, %s, %s)
+                """
+            params = (collect_data.url, collect_data.keyword,
+                      collect_data.content, 0, datetime.now())
+            db_helper.execute_non_query(query, params)
+
+    def insert_batch(self, collect_data_list):
+        if not all(
+                isinstance(collect_data, self.__class__)
+                for collect_data in collect_data_list):
+            raise TypeError("collect_data_list 中的所有元素必须是 CollectData 的实例")
+
+        query = """
+            INSERT IGNORE INTO t_collect_data (url, keyword, content, status, create_time)
+            VALUES (%s, %s, %s, %s, %s)
+            """
+        params = [
+            (
+                collect_data.url,
+                collect_data.keyword,
+                collect_data.content,
+                collect_data.status,
+                datetime.now()  # 每次调用 datetime.now() 获取当前时间
+            ) for collect_data in collect_data_list
+        ]
+        with MySQLHelper() as db_helper:
+            db_helper.execute_non_query(query, params)
+            # 获取受影响的行数
+            affected_rows = db_helper.connection.affected_rows()
+            self.logger.info(f"成功插入 {affected_rows} 条数据")
+            return affected_rows
+
+    def insert_url(self, url: str, keyword: str, content: str):
+        with MySQLHelper() as db_helper:
+            query = """
+                INSERT IGNORE INTO t_collect_data (url, keyword, content, status, create_time, process_time)
+                VALUES (%s, %s, %s, %s, %s, %s)
+                """
+            params = (url, keyword, content, 0, datetime.now, None)
+            db_helper.execute_non_query(query, params)
+
+    # def fetch_all():
+    #     with MySQLHelper() as db_helper:
+    #         query = "SELECT * FROM t_collect_data"
+    #         results = db_helper.execute_query(query)
+    #         data = [CollectData(**result) for result in results]
+    #         return data
+
+    def fetch_all_urls(self) -> list[str]:
+        with MySQLHelper() as db_helper:
+            query = "SELECT url FROM t_collect_data"
+            results = db_helper.execute_query(query)
+            # 使用列表推导式一次性提取所有 'url' 值
+            data = [result['url'] for result in results]
+            return data
+
+    def fetch_urls_to_process(self) -> list[str]:
+        with MySQLHelper() as db_helper:
+            query = """
+            SELECT url
+            FROM t_collect_data
+            WHERE status = 0
+            """
+            results = db_helper.execute_query(query)
+            data = [result['url'] for result in results]
+            return data
+
+    def fetch_one_collect_by_url(self, url: str):
+        with MySQLHelper() as db_helper:
+            query = """
+                SELECT url,keyword,content,status FROM t_collect_data WHERE url = %s  LIMIT 1
+            """
+            result = db_helper.fetch_one(query, (url, ))
+            if not result:
+                return None
+            data = CollectData(url=result["url"],
+                               keyword=result["keyword"],
+                               content=result["content"],
+                               status=result["status"])
+            return data
+
+    def set_process(self, url: str):
+        with MySQLHelper() as db_helper:
+            query = """
+            UPDATE t_collect_data
+            SET status = 1
+            WHERE url = %s
+            """
+            db_helper.execute_non_query(query, (url))
+
+    def fetch_by_status(self, status=0):
+        with MySQLHelper() as db_helper:
+            query = """
+            SELECT url, keyword, content, status, create_time, process_time
+            FROM t_collect_data
+            WHERE status = %s
+            """
+            results = db_helper.execute_query(query, (status, ))
+            data = [CollectData(**result) for result in results]
+            return data
+
+    def set_status(self, collect_data):
+        if not isinstance(collect_data, self):
+            raise TypeError("collect_data 不是 CollectData 的实例")
+        with MySQLHelper() as db_helper:
+            query = """
+            UPDATE t_collect_data
+            SET status = %s
+            WHERE url = %s
+            """
+            params = (collect_data.status, collect_data.url)
+            db_helper.execute_non_query(query, params)

+ 166 - 0
SourceCode/TenderCrawler/app/models/process_data.py

@@ -0,0 +1,166 @@
+from datetime import datetime
+from utils.mysql_helper import MySQLHelper
+from utils.config_helper import ConfigHelper
+from utils.logger_helper import LoggerHelper
+
+
+class ProcessData:
+    """A tender record extracted by AI from collected content (table t_data)."""
+
+    logger = LoggerHelper.get_logger()
+
+    def __init__(self,
+                 no=None,
+                 title=None,
+                 url=None,
+                 keyword=None,
+                 date=None,
+                 area=None,
+                 address=None,
+                 summary=None,
+                 release_date=None,
+                 status=None,
+                 create_time=None,
+                 send_time=None,
+                 other_urls=None,
+                 remark=None):
+        self.no = no
+        self.title = title
+        self.url = url
+        self.date = date
+        # Area falls back to the configured default, then to "全国" (nationwide).
+        if not area:
+            area = ConfigHelper().get("default_area")
+        if not area:
+            area = "全国"
+        self.area = area.replace(" ", "")
+        self.keyword = keyword
+        self.address = address
+        self.summary = summary
+        self.release_date = release_date
+        # 0 = not yet emailed, 1 = sent (see fetch_no_send / set_send).
+        self.status = status
+        self.create_time = create_time or datetime.now()
+        self.send_time = send_time
+        # Comma-separated duplicate URLs that share the same tender number.
+        self.other_urls = other_urls
+        self.remark = remark
+
+    def __repr__(self):
+        return (
+            f"ProcessData(no={self.no}, title={self.title}, date={self.date}, "
+            f"area={self.area}, address={self.address}, summary={self.summary}, "
+            f"status={self.status}, create_time={self.create_time}, "
+            f"send_time={self.send_time}, remark={self.remark})")
+
+    def insert(self, process_data):
+        """Insert one processed row and mark its source collect row processed."""
+        if not isinstance(process_data, self.__class__):
+            raise TypeError("process_data 不是 ProcessData 的实例")
+
+        insert_query = """
+            INSERT IGNORE INTO t_data (no, title, url, keyword, date, area, address, summary, release_date, status, create_time)
+            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+        """
+
+        update_query = """
+            UPDATE t_collect_data SET status = 1 WHERE url = %s;
+        """
+
+        insert_params = (process_data.no, process_data.title, process_data.url,
+                         process_data.keyword, process_data.date,
+                         process_data.area, process_data.address,
+                         process_data.summary, process_data.release_date, 0,
+                         datetime.now())
+
+        update_params = (process_data.url, )
+
+        with MySQLHelper() as db_helper:
+            db_helper.execute_non_query(insert_query, insert_params)
+            db_helper.execute_non_query(update_query, update_params)
+
+    def insert_batch(self, process_data_list):
+        """Batch-insert processed rows; returns the number of inserted rows."""
+        if not all(
+                isinstance(process_data, self.__class__)
+                for process_data in process_data_list):
+            raise TypeError("process_data_list 中的所有元素必须是 ProcessData 的实例")
+
+        insert_query = """
+            INSERT IGNORE INTO t_data (no, title, url, keyword, date, area, address, summary, release_date, status, create_time)
+            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+        """
+
+        update_query = """
+            UPDATE t_collect_data SET status = 1 WHERE url = %s;
+        """
+
+        insert_params = [(
+            process_data.no,
+            process_data.title,
+            process_data.url,
+            process_data.keyword,
+            process_data.date,
+            process_data.area,
+            process_data.address,
+            process_data.summary,
+            process_data.release_date,
+            0,
+            datetime.now(),
+        ) for process_data in process_data_list]
+
+        update_params = [(process_data.url, )
+                         for process_data in process_data_list]
+
+        with MySQLHelper() as db_helper:
+            # NOTE(review): passing a list of tuples presumably triggers an
+            # executemany-style call — confirm against MySQLHelper.
+            db_helper.execute_non_query(insert_query, insert_params)
+            affected_rows = db_helper.connection.affected_rows()
+            self.logger.info(f"成功插入 {affected_rows} 条数据")
+            # Mark every source collect row as processed, one statement each.
+            for param in update_params:
+                db_helper.execute_non_query(update_query, param)
+            return affected_rows
+
+    def fetch_one_process_by_no(self, no: str):
+        """Return a minimal row (url, no, other_urls) for a tender no, or None."""
+        with MySQLHelper() as db_helper:
+            query = """
+                SELECT url,no,other_urls FROM t_data WHERE no = %s  LIMIT 1
+            """
+            result = db_helper.fetch_one(query, (no, ))
+            if not result:
+                return None
+            data = ProcessData(url=result["url"],
+                               no=result["no"],
+                               other_urls=result["other_urls"])
+            return data
+
+    def fetch_no_send(self):
+        """Return all rows that have not been emailed yet (status = 0)."""
+        with MySQLHelper() as db_helper:
+            query = "SELECT no, title, url, keyword, date, area, address, summary, release_date FROM t_data WHERE status = 0"
+            results = db_helper.execute_query(query)
+            data = [ProcessData(**result) for result in results]
+            return data
+
+    def set_send(self, no):
+        """Mark the row with this tender number as emailed, recording the time."""
+        with MySQLHelper() as db_helper:
+            query = """
+            UPDATE t_data
+            SET status = 1, send_time = %s
+            WHERE no = %s
+            """
+            params = (datetime.now(), no)
+            db_helper.execute_non_query(query, params)
+
+    def set_other_urls(self, url, other_urls):
+        """Store duplicate URLs on the t_data row and mark the collect row processed."""
+        with MySQLHelper() as db_helper:
+            query = """
+            UPDATE t_data
+            SET other_urls = %s
+            WHERE url = %s
+            """
+            update_query = """
+            UPDATE t_collect_data SET status = 1 WHERE url = %s;
+            """
+            params = (other_urls, url)
+            db_helper.execute_non_query(query, params)
+            db_helper.execute_non_query(update_query, (url, ))
+
+    def check_is_process_by_url(self, url):
+        """Return True when a t_data row already exists for this URL."""
+        with MySQLHelper() as db_helper:
+            query = "SELECT * FROM t_data WHERE url = %s"
+            params = (url, )
+            results = db_helper.execute_query(query, params)
+            return True if results else False

+ 55 - 0
SourceCode/TenderCrawler/app/models/url_setting.py

@@ -0,0 +1,55 @@
+from utils.mysql_helper import MySQLHelper
+
+
+class UrlSetting:
+
+    def __init__(self,
+                 url=None,
+                 type=None,
+                 username=None,
+                 password=None,
+                 keywords=None):
+        self.url = url
+        self.type = type
+        self.username = username
+        self.password = password
+        if not keywords:
+            keywords = ""
+        self.keywords = keywords.replace(",", ",")
+
+    def __repr__(self):
+        return (
+            f"<UrlSetting(url={self.url}, type={self.type}, "
+            f"username={self.username}, keywords={self.keywords}, is_active={self.is_active})>"
+        )
+
+    def to_dict(self):
+        return {
+            'url': self.url,
+            'type': self.type,
+            'username': self.username,
+            'password': self.password,
+            'keywords': self.keywords,
+            'is_active': self.is_active
+        }
+
+    # # 插入 URL 设置数据
+    # def insert(self, url_setting):
+    #     if not isinstance(url_setting, UrlSetting):
+    #         raise TypeError("url_setting必须是 UrlSetting 的实例")
+    #     with MySQLHelper() as db_helper:
+    #         query = (
+    #             "INSERT INTO t_urls (url, type, username, password, keywords, is_active) "
+    #             "VALUES (%s, %s, %s, %s, %s, %s)")
+    #         params = (url_setting.url, url_setting.type, url_setting.username,
+    #                   url_setting.password, url_setting.keywords,
+    #                   url_setting.is_active)
+    #         db_helper.execute_non_query(query, params)
+
+    # 查询 URL 设置数据
+    def fetch_all(self):
+        with MySQLHelper() as db_helper:
+            query = "SELECT  url, type, username, password, keywords FROM t_urls WHERE is_active = 1"
+            results = db_helper.execute_query(query)
+            data = [UrlSetting(**result) for result in results]
+            return data

+ 0 - 0
SourceCode/TenderCrawler/app/stores/__init__.py


+ 56 - 0
SourceCode/TenderCrawler/app/stores/data_store_interface.py

@@ -0,0 +1,56 @@
+from abc import ABC, abstractmethod
+from models.process_data import ProcessData
+
+
+class IDataStore(ABC):
+    """
+    定义数据保存接口
+    """
+
+    @abstractmethod
+    def insert_collect_data(self,
+                            url: str,
+                            keyword: str,
+                            content: str,
+                            is_batch=True) -> None:
+        raise NotImplementedError("insert 应由子类重写。")
+
+    @abstractmethod
+    def save_collect_data(self, is_force=False):
+        raise NotImplementedError("save 应由子类重写。")
+
+    @abstractmethod
+    def query_urls_to_process(self):
+        raise NotImplementedError("query_to_process 应由子类重写。")
+
+    @abstractmethod
+    def query_one_collect_by_url(self, url):
+        raise NotImplementedError("query_one_collect_by_url 应由子类重写。")
+
+    @abstractmethod
+    def query_one_process_by_no(self, no):
+        raise NotImplementedError("query_one_process_by_no 应由子类重写。")
+
+    @abstractmethod
+    def insert_process_data(self, data: ProcessData):
+        raise NotImplementedError("insert_process_data 应由子类重写。")
+
+    @abstractmethod
+    def save_process_data(self, is_force=False):
+        raise NotImplementedError("save_process_data 应由子类重写。")
+
+    @abstractmethod
+    def set_process_other_urls(self, url, other_urls: str):
+        raise NotImplementedError("save_process_data 应由子类重写。")
+
+    @abstractmethod
+    def query_to_send(self):
+        raise NotImplementedError("query_to_send 应由子类重写。")
+
+    @abstractmethod
+    def set_send(self, no: str):
+        raise NotImplementedError("set_send 应由子类重写。")
+
+    @abstractmethod
+    def get_email_by_area(self, area: str):
+        raise NotImplementedError("get_email_by_area 应由子类重写。")

+ 40 - 0
SourceCode/TenderCrawler/app/stores/default_data_store.py

@@ -0,0 +1,40 @@
+from utils.logger_helper import LoggerHelper
+from stores.data_store_interface import IDataStore
+
+
+class DefaultDataStore(IDataStore):
+
+    logger = LoggerHelper.get_logger()
+
+    def __init__(self):
+        pass
+
+    def insert_collect_data(self, url, keyword, content):
+        self.logger.info(f"Default: INSERT {url},关键字:{keyword}")
+
+    def save_collect_data(self, is_force=False):
+        self.logger.info("Default: SAVE")
+
+    def query_urls_to_process(self):
+        self.logger.info("Default: QUERY_TO_PROCESS")
+
+    def query_one_collect_by_url(self, url):
+        self.logger.info("Default: QUERY_ONE_PROCESS")
+
+    def insert_process_data(self, data):
+        self.logger.info("Default: INSERT_PROCESS_DATA")
+
+    def save_process_data(self, is_force=False):
+        self.logger.info("Default: SAVE_PROCESS_DATA")
+
+    def set_process_other_urls(self, url, other_urls: str):
+        self.logger.info("Default: SET_PROCESS_OTHER_URLS")
+
+    def query_to_send(self):
+        self.logger.info("Default: QUERY_TO_SEND")
+
+    def set_send(self, no: str):
+        self.logger.info("Default: SET_SEND")
+
+    def get_email_by_area(self, area: str):
+        self.logger.info("Default: GET_EMAIL_BY_AREA")

+ 83 - 0
SourceCode/TenderCrawler/app/stores/mysql_data_store.py

@@ -0,0 +1,83 @@
+from utils.logger_helper import LoggerHelper
+from utils.config_helper import ConfigHelper
+from stores.data_store_interface import IDataStore
+from models.collect_data import CollectData
+from models.process_data import ProcessData
+from models.area_email import AreaEmail
+
+
+class MysqlDataStore(IDataStore):
+
+    logger = LoggerHelper.get_logger()
+    config = ConfigHelper()
+    _collectData = CollectData()
+    _processData = ProcessData()
+    _areaEmail = AreaEmail()
+
+    def __init__(self):
+        size = self.config.get('save.collect_batch_size')
+        if not size:
+            size = 1
+        self._collect_size = int(size)
+        self._collect_list = []
+        size = self.config.get('save.process_batch_size')
+        if not size:
+            size = 1
+        self._process_size = int(size)
+        self._process_list = []
+
+    def insert_collect_data(self,
+                            url: str,
+                            keyword: str,
+                            content: str,
+                            is_batch=True):
+        data = CollectData(url, keyword, content, 0)
+        if not is_batch:
+            self._collectData.insert(data)
+        else:
+            self._collect_list.append(data)
+            self.save_collect_data()
+
+    def save_collect_data(self, is_force=False):
+        if (is_force or len(self._collect_list) >= self._collect_size):
+            self.logger.info("批量保存到数据库,数量: " + str(len(self._collect_list)))
+            self._collectData.insert_batch(self._collect_list)
+            self._collect_list = []
+
+    def query_urls_to_process(self):
+        return self._collectData.fetch_urls_to_process()
+
+    def query_one_collect_by_url(self, url):
+        return self._collectData.fetch_one_collect_by_url(url)
+
+    def query_one_process_by_no(self, no):
+        return self._processData.fetch_one_process_by_no(no)
+
+    def insert_process_data(self, data: ProcessData, is_batch=True):
+        if not is_batch:
+            self._processData.insert(data)
+        else:
+            self._process_list.append(data)
+            self.save_process_data()
+
+    # 插入到数据库时会把CollectData设为已处理
+    def save_process_data(self, is_force=False):
+        if (is_force or len(self._process_list) >= self._process_size):
+            self.logger.info("批量保存到数据库,数量: " + str(len(self._process_list)))
+            self._processData.insert_batch(self._process_list)
+            self._process_list = []
+
+    def set_process_other_urls(self, url, other_urls: str):
+        return self._processData.set_other_urls(url, other_urls)
+
+    def check_url_is_process(self, url: str) -> bool:
+        return self._processData.check_is_process_by_url(url)
+
+    def query_to_send(self):
+        return self._processData.fetch_no_send()
+
+    def set_send(self, no: str):
+        self._processData.set_send(no)
+
+    def get_email_by_area(self, area: str) -> str:
+        return self._areaEmail.fetch_one_by_area(area)

+ 3 - 0
SourceCode/TenderCrawler/app/utils/__init__.py

@@ -0,0 +1,3 @@
+# Package initializer: eagerly load the application configuration so that
+# every later ConfigHelper() call (it is a singleton) sees a populated config.
+from utils.config_helper import ConfigHelper
+
+ConfigHelper().load_config()

+ 126 - 0
SourceCode/TenderCrawler/app/utils/ai_helper.py

@@ -0,0 +1,126 @@
+import re
+import requests
+
+from utils.logger_helper import LoggerHelper
+from utils.config_helper import ConfigHelper
+from models.process_data import ProcessData
+
+
+class AiHelper:
+
+    logger = LoggerHelper.get_logger()
+    config = ConfigHelper()
+
+    _ai_api_key = None
+    _ai_api_url = None
+    _ai_max_tokens = 150
+    _ai_system_prompt = "请帮我分析以下文字,提取出关键信息,并以json格式字符串返回,如果部分信息为空,则该字段返回为空。"
+    _ai_prompt_template = """在以上内容中提取信息:编号(no) 、标题(title)、在哪个城市招标(area)、开标的时间(date)、
+    开标的地点(address)、发布时间(release_date)、150字左右的招标条件要求及联系方式等内容摘要(summary)。
+    编号一般在“招标编号:”的后面,例如 (招标编号:xxxxxxx...), “xxxxxxx...”就是编号(no)。"
+    返回包含no,title,area,date,address,release_date,summary字段的json格式字符串,没有找到的信息json字段为空。"""
+
+    def __init__(self):
+        self._ai_api_key = self.config.get("ai.key")
+        self._ai_api_url = self.config.get("ai.url")
+        self._api_model = self.config.get("ai.model")
+        max_tokens = self.config.get("ai.max_tokens")
+        if max_tokens:
+            self._ai_max_tokens = int(max_tokens)
+        system_prompt = self.config.get("ai.system_prompt")
+        if system_prompt:
+            self._ai_system_prompt = system_prompt
+        prompt_template = self.config.get("ai.prompt_template")
+        if prompt_template:
+            self._ai_prompt_template = prompt_template
+
+    def call_ai(self, content: str) -> ProcessData:
+        # 截取前100个字符进行日志记录
+        # truncated_content = content[:100]
+        self.logger.info("调用AI API")
+        if self._ai_api_key is None:
+            raise Exception("AI API key 没有配置")
+        if self._ai_api_url is None:
+            raise Exception("AI API url 没有配置")
+        if self._api_model is None:
+            raise Exception("AI API model 没有配置")
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self._ai_api_key}"
+        }
+        messages = [{
+            "role": "system",
+            "content": self._ai_system_prompt
+        }, {
+            "role": "user",
+            "content": f"{content} {self._ai_prompt_template}"
+        }]
+
+        data = {
+            "model": self._api_model,
+            "messages": messages,
+            "stream": False,
+            "max_tokens": self._ai_max_tokens
+        }
+        response = requests.post(self._ai_api_url, headers=headers, json=data)
+        if response.status_code == 200:
+            try:
+                self.logger.info(f"AI Response: {response.text}")
+                resStr = self._extract_message_content(response.json())
+                return self._parse_response(resStr, True)
+            except Exception as e:
+                raise Exception(f"解析 AI 响应错误: {e}")
+        else:
+            raise Exception(
+                f"调用 AI 错误: {response.status_code} - {response.text}")
+
+    def _extract_message_content(self, response_json: dict) -> str:
+        if "choices" in response_json and len(response_json["choices"]) > 0:
+            choice = response_json["choices"][0]
+            message_content = choice.get("message", {}).get("content", "")
+        elif "message" in response_json:
+            message_content = response_json["message"].get("content", "")
+        else:
+            raise Exception("AI 响应中未找到有效的 choices 或 message 数据")
+
+        # 移除多余的 ```json 和 ```
+        if message_content.startswith("```json") and message_content.endswith(
+                "```"):
+            message_content = message_content[6:-3]
+
+        # 去除开头的 'n' 字符
+        if message_content.startswith('n'):
+            message_content = message_content[1:]
+        # 移除无效的转义字符和时间戳前缀
+        message_content = re.sub(r'\\[0-9]{2}', '',
+                                 message_content)  # 移除 \32 等无效转义字符
+        message_content = re.sub(r'\d{4}-\d{2}-\dT\d{2}:\d{2}:\d{2}\.\d+Z', '',
+                                 message_content)  # 移除时间戳
+        message_content = message_content.strip()  # 去除首尾空白字符
+
+        # 替换所有的反斜杠
+        message_content = message_content.replace("\\", "")
+
+        return message_content
+
+    def _parse_response(self, response: str, first=True) -> ProcessData:
+        import json
+        self.logger.info(f"AI Response JSON STR: {response}")
+        try:
+            data = json.loads(response)
+            return ProcessData(no=data.get("no"),
+                               title=data.get("title"),
+                               date=data.get("date"),
+                               area=data.get("area"),
+                               address=data.get("address"),
+                               summary=data.get("summary"),
+                               release_date=data.get("release_date"))
+        except json.JSONDecodeError as e:
+            if first:
+                self.logger.error(f"JSON 解析错误,去除部分特殊字符重新解析一次: {e}")
+                # 替换中文引号为空
+                message_content = re.sub(r'[“”]', "", response)  # 替换双引号
+                message_content = re.sub(r'[‘’]', "", message_content)  # 替换单引号
+                return self._parse_response(message_content, False)
+            else:
+                raise Exception(f"解析 AI 响应错误: {e}")

+ 70 - 0
SourceCode/TenderCrawler/app/utils/config_helper.py

@@ -0,0 +1,70 @@
+import os
+import yaml
+
+from utils.logger_helper import LoggerHelper
+
+
+class ConfigHelper:
+    """Singleton YAML configuration loader with APP_-prefixed environment
+    variable overrides (double underscore ``__`` separates nesting levels,
+    e.g. APP_MYSQL__HOST -> mysql.host).
+    """
+    _instance = None
+    logger = LoggerHelper.get_logger()
+
+    # Default configuration file path: ../config.yml relative to this module.
+    default_config_path = os.path.join(os.path.dirname(__file__), '..',
+                                       'config.yml')
+
+    # Class-level storage for the loaded configuration and its source path.
+    _config = None
+    _path = None
+
+    def __new__(cls, *args, **kwargs):
+        # Singleton: every ConfigHelper() returns the same instance.
+        if not cls._instance:
+            cls._instance = super(ConfigHelper, cls).__new__(cls)
+        return cls._instance
+
+    def load_config(self, path=None):
+        """Load the YAML config (choosing the path only on first load), merge
+        environment overrides, and return the resulting dict.
+
+        NOTE(review): the open() below sits OUTSIDE the ``if self._config is
+        None`` guard, so every explicit load_config() call re-reads the file
+        and re-merges env vars, and a new `path` argument is ignored once a
+        config is loaded — confirm whether this reload-on-call is intended.
+        """
+        if self._config is None:
+            if not path:
+                # self.logger.info(f"使用默认配置文件:{self.default_config_path}")
+                self._path = self.default_config_path
+            else:
+                self._path = path
+            if not os.path.exists(self._path):
+                raise FileNotFoundError(f"没有找到文件或目录:'{self._path}'")
+        with open(self._path, 'r', encoding='utf-8') as file:
+            self._config = yaml.safe_load(file)
+        # Merge environment-variable overrides on top of the file contents.
+        self._merge_env_vars()
+        # self.logger.info(f"加载的配置文件内容:{self._config}")
+        return self._config
+
+    def _merge_env_vars(self, env_prefix="APP_"):  # env var prefix is APP_
+        # Override config entries from matching environment variables.
+        # NOTE(review): env-derived values are always strings; numeric config
+        # consumers must cast (as MySQLHelper does with int(...)).
+        for key, value in os.environ.items():
+            if key.startswith(env_prefix):
+                config_key = key[len(env_prefix):].lower()
+                self._set_nested_key(self._config, config_key.split('__'),
+                                     value)
+
+    def _set_nested_key(self, config, keys, value):
+        # Recursively walk/create nested dicts and set the leaf value.
+        if len(keys) > 1:
+            if keys[0] not in config or not isinstance(config[keys[0]], dict):
+                config[keys[0]] = {}
+            self._set_nested_key(config[keys[0]], keys[1:], value)
+        else:
+            config[keys[0]] = value
+
+    def get(self, key):
+        """Return the config value at dotted `key` (e.g. "mysql.host"), or
+        None when any path segment is missing. Lazy-loads the config.
+        """
+        if self._config is None:
+            self.load_config(self._path)
+        keys = key.split('.')
+        config = self._config
+        for k in keys:
+            if isinstance(config, dict) and k in config:
+                config = config[k]
+            else:
+                return None
+        return config
+
+    def get_all(self):
+        """Return the entire configuration dict, lazy-loading if needed."""
+        if self._config is None:
+            self.load_config(self._path)
+        return self._config

+ 73 - 0
SourceCode/TenderCrawler/app/utils/email_helper.py

@@ -0,0 +1,73 @@
+import smtplib
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
+from email.mime.base import MIMEBase
+from email import encoders
+import os
+
+from utils.config_helper import ConfigHelper
+from utils.logger_helper import LoggerHelper
+
+
+class EmailHelper:
+    """Sends notification emails (HTML or plain text, optional attachment)
+    via SMTP over SSL, using credentials from the application config.
+    """
+
+    logger = LoggerHelper.get_logger()
+    config = ConfigHelper()
+
+    def __init__(self):
+        # SMTP connection settings from config (may be None if unset).
+        self.smtp_server = self.config.get("email.smtp_server")
+        self.port = self.config.get("email.smtp_port")
+        self.username = self.config.get("email.smtp_user")
+        self.password = self.config.get("email.smtp_password")
+        self.from_email = self.config.get("email.from_email")
+        # print(
+        #     f"server:{self.smtp_server},port:{self.port},username:{self.username},password:{self.password},from_email:{self.from_email}"
+        # )
+
+    def send_email(self,
+                   to_addr: str,
+                   subject: str,
+                   body: str,
+                   body_is_html: bool = True,
+                   attachment_path: str = None):
+        """Send one email to comma-separated recipients in `to_addr`.
+
+        Returns True on success, False on any failure (failure is logged,
+        never raised).
+        """
+        msg = MIMEMultipart()
+        msg['From'] = self.from_email
+        msg['To'] = ', '.join(to_addr.split(','))
+        msg['Subject'] = subject
+
+        # Choose the MIMEText subtype according to body_is_html.
+        if body_is_html:
+            msg.attach(MIMEText(body, 'html', 'utf-8'))
+        else:
+            msg.attach(MIMEText(body, 'plain', 'utf-8'))
+
+        if attachment_path:
+            self._attach_file(msg, attachment_path)
+
+        try:
+            # with smtplib.SMTP(self.smtp_server, self.port, timeout=10) as server:
+            # NOTE(review): self.port is loaded from config but not passed to
+            # SMTP_SSL, so the default SSL port (465) is always used — confirm
+            # this is intended after the switch from SMTP+starttls.
+            with smtplib.SMTP_SSL(self.smtp_server, timeout=10) as server:
+                # server.starttls()
+                server.login(self.username, self.password)
+                # Split the comma-separated to_addr into a list for sendmail.
+                server.sendmail(self.from_email, to_addr.split(','),
+                                msg.as_string())
+            self.logger.info(f"邮件发送成功:{to_addr}")
+            return True
+        except Exception as e:
+            self.logger.error(f"邮件发送失败:{to_addr} {e}")
+            return False
+
+    def _attach_file(self, msg: MIMEMultipart, attachment_path: str):
+        """Attach the file at `attachment_path` to `msg` (base64-encoded).
+
+        :raises FileNotFoundError: when the path does not point to a file.
+        """
+        if not os.path.isfile(attachment_path):
+            raise FileNotFoundError(
+                f"The file {attachment_path} does not exist.")
+
+        with open(attachment_path, "rb") as attachment:
+            part = MIMEBase('application', 'octet-stream')
+            part.set_payload(attachment.read())
+            encoders.encode_base64(part)
+            part.add_header(
+                'Content-Disposition',
+                f"attachment; filename= {os.path.basename(attachment_path)}")
+            msg.attach(part)

+ 74 - 0
SourceCode/TenderCrawler/app/utils/logger_helper.py

@@ -0,0 +1,74 @@
+import os
+import logging
+from logging.handlers import TimedRotatingFileHandler
+
+
+class LoggerHelper:
+    """
+    Logging helper that creates and serves a single shared logger instance.
+    Implements the singleton pattern so the whole application uses exactly
+    one logger (file handler rotating at midnight, 7 backups, plus console).
+    """
+    _instance = None
+
+    def __new__(self, *args, **kwargs):
+        """
+        Singleton: create and initialize the logger only once.
+
+        NOTE(review): the first parameter is conventionally named `cls`, and
+        ``super().__new__(self, *args, **kwargs)`` would raise a TypeError if
+        positional args were ever passed (object.__new__ takes none); harmless
+        for the current no-argument usage.
+        """
+        if not self._instance:
+            self._instance = super(LoggerHelper,
+                                   self).__new__(self, *args, **kwargs)
+            try:
+                self._instance._initialize_logger()
+            except Exception as e:
+                raise Exception(f"配置logger出错: {e}")
+        return self._instance
+
+    @property
+    def logger(self):
+        # The configured logging.Logger instance.
+        return self._logger
+
+    def _initialize_logger(self):
+        """
+        Initialize the logger: set the level, create handlers and a
+        formatter, and wire them together. Creates ./logs if missing.
+        """
+        self._logger = logging.getLogger('app_logger')
+        self._logger.setLevel(logging.INFO)
+        log_folder = './logs'
+        if not os.path.exists(log_folder):
+            os.makedirs(log_folder)
+
+        # File handler rotating daily at midnight, keeping 7 backups.
+        file_handler = TimedRotatingFileHandler(os.path.join(
+            log_folder, 'data_collector.log'),
+                                                when='midnight',
+                                                interval=1,
+                                                backupCount=7,
+                                                encoding='utf-8')
+        file_handler.setLevel(logging.INFO)
+
+        # Console handler mirrors the same records to stdout/stderr.
+        console_handler = logging.StreamHandler()
+        console_handler.setLevel(logging.INFO)
+
+        # Shared record format: timestamp - level - message.
+        formatter = logging.Formatter(
+            '%(asctime)s - %(levelname)s - %(message)s')
+
+        # Attach the formatter to both handlers.
+        file_handler.setFormatter(formatter)
+        console_handler.setFormatter(formatter)
+
+        # Register both handlers on the logger.
+        self._logger.addHandler(file_handler)
+        self._logger.addHandler(console_handler)
+
+    @classmethod
+    def get_logger(self):
+        """
+        Return the initialized shared logger instance, creating the
+        singleton on first use.
+        """
+        if not self._instance:
+            self._instance = self()
+        return self._instance._logger

+ 119 - 0
SourceCode/TenderCrawler/app/utils/mysql_helper.py

@@ -0,0 +1,119 @@
+import pymysql
+from pymysql.cursors import DictCursor
+from utils.config_helper import ConfigHelper
+from utils.logger_helper import LoggerHelper
+
+
+class MySQLHelper:
+    """Thin pymysql wrapper configured from ConfigHelper; usable as a
+    context manager (``with MySQLHelper() as db:``) which connects on enter
+    and disconnects on exit. Query errors are logged and swallowed: read
+    helpers return None and write helpers roll back without re-raising.
+    """
+
+    logger = LoggerHelper.get_logger()
+    config = ConfigHelper()
+
+    def __init__(self):
+        # Read connection settings from config; a missing/invalid setting is
+        # logged here and will surface later when connect() is attempted.
+        try:
+            self.host = self.config.get('mysql.host')
+            self.user = self.config.get('mysql.user')
+            self.password = self.config.get('mysql.password')
+            self.db = self.config.get('mysql.db')
+            self.port = int(self.config.get('mysql.port'))
+            self.charset = self.config.get('mysql.charset')
+            self.connection = None
+        except Exception as e:
+            self.logger.error(f"加载数据库配置文件失败: {e}")
+
+    def connect(self):
+        """Open a pymysql connection with DictCursor rows.
+
+        :raises Exception: when the connection cannot be established.
+        """
+        try:
+            self.connection = pymysql.connect(host=self.host,
+                                              user=self.user,
+                                              password=self.password,
+                                              db=self.db,
+                                              port=self.port,
+                                              charset=self.charset,
+                                              cursorclass=DictCursor)
+            # self.logger.info(f"成功连接到数据库:{self.db}。")
+        except pymysql.MySQLError as e:
+            self.logger.error(f"数据库连接失败: {self.host}:{self.port} {self.db}")
+            self.connection = None  # ensure the handle is None on failure
+            raise Exception(f"连接数据库失败: {e}")
+
+    def disconnect(self):
+        # Close the connection if it is open; safe to call repeatedly.
+        if self.connection and self.connection.open:
+            self.connection.close()
+            # self.logger.info("数据库连接已关闭。")
+
+    def execute_query(self, query, params=None):
+        """Run a SELECT and return all rows as dicts, or None on error."""
+        try:
+            with self.connection.cursor() as cursor:
+                cursor.execute(query, params)
+                result = cursor.fetchall()
+                return result
+        except pymysql.MySQLError as e:
+            self.logger.error(f"执行查询时出错:{e}")
+            return None
+
+    def execute_non_query(self, query, params=None):
+        """Dispatch a write: list-of-tuples -> executemany, tuple -> execute,
+        anything else is wrapped in a 1-tuple.
+
+        NOTE(review): when params is None this calls execute(query, (None,));
+        a statement with no placeholder will then fail at format time —
+        confirm all callers pass placeholder-matching params.
+        """
+        if isinstance(params, list) and all(
+                isinstance(p, tuple) for p in params):
+            self.execute_many(query, params)
+        elif isinstance(params, tuple):
+            self.execute(query, params)
+        else:
+            self.execute(query, (params, ))
+
+    def execute(self, query, params=None):
+        """Execute a single write statement; commit on success, roll back and
+        log (without re-raising) on error."""
+        try:
+            with self.connection.cursor() as cursor:
+                cursor.execute(query, params)
+                self.connection.commit()
+        except pymysql.MySQLError as e:
+            self.logger.error(f"执行非查询时出错:{e}")
+            self.connection.rollback()
+
+    def execute_many(self, query, params: list):
+        """Execute a batched write with executemany; commit on success, roll
+        back and log (without re-raising) on error.
+
+        :raises ValueError: when params is not a list of tuples.
+        """
+        if isinstance(params, list) and all(
+                isinstance(p, tuple) for p in params):
+            try:
+                with self.connection.cursor() as cursor:
+                    cursor.executemany(query, params)
+                    self.connection.commit()
+            except pymysql.MySQLError as e:
+                self.logger.error(f"执行非查询时出错:{e}")
+                self.connection.rollback()
+        else:
+            raise ValueError("参数必须是元组列表")
+
+    def fetch_one(self, query, params=None):
+        """Run a SELECT and return the first row as a dict, or None on error
+        (indistinguishable from an empty result)."""
+        try:
+            with self.connection.cursor() as cursor:
+                cursor.execute(query, params)
+                result = cursor.fetchone()
+                return result
+        except pymysql.MySQLError as e:
+            self.logger.error(f"获取一条记录时出错:{e}")
+            return None
+
+    def __enter__(self):
+        """
+        Called automatically when entering the context: establishes the
+        connection and returns this instance for use inside the block.
+        """
+
+        self.connect()  # open the connection
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        """
+        Called automatically when leaving the context, whether the block
+        finished normally or raised; always disconnects.
+
+        :param exc_type: exception class, or None when no exception occurred.
+        :param exc_value: exception instance, or None.
+        :param traceback: traceback object, or None.
+        """
+        if exc_type:
+            self.logger.error(
+                f"数据库发生异常,断开连接。异常类型:{exc_type}, 异常值:{exc_value} traceback: {traceback}"
+            )
+        self.disconnect()  # close the connection

+ 81 - 0
SourceCode/TenderCrawler/docker-compose.yml

@@ -0,0 +1,81 @@
+version: '3.8'
+
+services:
+  dc-mysql:
+    image: mysql:8.0.39
+    container_name: y_data-collect-mysql
+    environment:
+      - MYSQL_ROOT_PASSWORD=${MYSQL_ROOT_PASSWORD}
+      - MYSQL_DATABASE=${MYSQL_DATABASE}
+      - MYSQL_USER=${MYSQL_USER}
+      - MYSQL_PASSWORD=${MYSQL_PASSWORD}
+      - TZ=Asia/Shanghai
+      # - MYSQL_DEFAULT_AUTHENTICATION_PLUGIN=mysql_native_password
+    volumes:
+      - /home/docker/data-collect/mysql/log:/var/log/mysql
+      - /home/docker/data-collect/mysql/data:/var/lib/mysql
+      - /home/docker/data-collect/mysql/conf.d:/etc/mysql/conf.d
+      - /etc/localtime:/etc/localtime:ro
+      - /home/docker/data-collect/mysql/init/init.sql:/docker-entrypoint-initdb.d/init.sql # 挂载 init.sql 文件
+      # - ./.dev/mysql5.7/log:/var/log/mysql
+      # - ./.dev/mysql5.7/data:/var/lib/mysql
+      # - ./.dev/mysql8.0.39/log:/var/log/mysql
+      # - ./.dev/mysql8.0.39/data:/var/lib/mysql
+      # - ./init.sql:/docker-entrypoint-initdb.d/init.sql
+    ports:
+      - '${MYSQL_PORT}:3306'
+    networks:
+      - dc-net
+    restart: always
+
+  dc-selenium:
+    image: selenium/standalone-chrome:latest
+    container_name: y_selenium
+    # shm_size is a service-level compose key, not an environment variable;
+    # the previous `environment: - shm-size:"2g"` entry had no effect.
+    shm_size: '2g'
+    ports:
+      - '${SELENIUM_CHROME_PORT}:4444'
+      - '5900:5900'
+    networks:
+      - dc-net
+    restart: always
+
+  dc-app:
+    build: .
+    image: y_data-collect-app:1.0.0
+    container_name: y_data-collect-app
+    depends_on:
+      - dc-mysql
+      - dc-selenium
+    environment:
+      - TZ=Asia/Shanghai
+      - APP_MYSQL__HOST=y_data-collect-mysql
+      - APP_MYSQL__PORT=3306
+      - APP_MYSQL__DB=${MYSQL_DATABASE}
+      - APP_MYSQL__USER=${MYSQL_USER}
+      - APP_MYSQL__PASSWORD=${MYSQL_PASSWORD}
+      - APP_AI__KEY=
+      - APP_AI__URL=http://192.168.0.109:7580/api/chat
+      - APP_AI__MODEL=qwen2.5:7b
+      - APP_AI__MAX_TOKENS=1024
+      - APP_SCHEDULE__SLEEP_INTERVAL=600 #单位:秒 10分钟检查一次
+      - APP_SCHEDULE__COLLECT=20:00,12:00
+      - APP_SCHEDULE__PROCESS=23:00,4:00,13:00
+      - APP_SCHEDULE__SEND_EMAIL=08:20,14:00
+      - APP_SCHEDULE__RUN_NOW=1
+      - APP_SELENIUM__REMOTE_DRIVER_URL=http://y_selenium:4444/wd/hub
+    volumes:
+      - /home/docker/data-collect/app/config.yml:/app/config.yml
+      - /home/docker/data-collect/app/logs:/app/logs
+      # - ./.dev/app/config.yml:/app/config.yml
+      # - ./.dev/app/logs:/app/logs
+    networks:
+      - dc-net
+    # 如果需要暴露端口
+    # ports:
+    #   - "8080:8080"
+    restart: always
+
+networks:
+  dc-net:
+    driver: bridge

+ 80 - 0
SourceCode/TenderCrawler/init.sql

@@ -0,0 +1,80 @@
+
+
+SET NAMES utf8mb4;
+SET FOREIGN_KEY_CHECKS = 0;
+
+-- ----------------------------
+-- Table structure for t_area_email
+-- ----------------------------
+DROP TABLE IF EXISTS `t_area_email`;
+CREATE TABLE `t_area_email`  (
+  `name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '名称',
+  `area` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '地区 多个以”,\"分隔',
+  `email` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '邮箱 多个以 ”,\" 分隔',
+  `is_active` int(4) NULL DEFAULT NULL COMMENT '激活状态 1:激活 0:失活',
+  `remark` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '备注',
+  PRIMARY KEY (`name`) USING BTREE
+) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
+
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('盐城', '江苏省盐城市,江苏盐城,盐城市,盐城', '349977741@qq.com', 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('南京', '江苏省南京市,江苏南京,南京市,南京', '349977741@qq.com', 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('济南', '山东省济南市,山东济南,济南市,济南', '349977741@qq.com', 1, NULL);
+INSERT INTO `t_area_email` (`name`, `area`, `email`, `is_active`, `remark`) VALUES ('全国', '全国', 'yueyy@iwbnet.com', 1, NULL);
+
+
+-- ----------------------------
+-- Table structure for t_collect_data
+-- ----------------------------
+DROP TABLE IF EXISTS `t_collect_data`;
+CREATE TABLE `t_collect_data`  (
+  `url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '页面详情URL',
+  `keyword` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '检索到页面的关键字',
+  `content` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL COMMENT '页面详情',
+  `status` int(4) NOT NULL DEFAULT 0 COMMENT '状态 0:未处理 1:已处理',
+  `create_time` datetime NULL DEFAULT NULL COMMENT '创建时间',
+  `process_time` datetime NULL DEFAULT NULL COMMENT '处理时间',
+  PRIMARY KEY (`url`) USING BTREE
+) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
+
+-- ----------------------------
+-- Table structure for t_data
+-- ----------------------------
+DROP TABLE IF EXISTS `t_data`;
+CREATE TABLE `t_data`  (
+  `url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '详情链接',
+  `keyword` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '搜索关键字',
+  `no` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标编号',
+  `title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标标题',
+  `date` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标时间',
+  `area` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '招标地区',
+  `address` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '详细地点',
+  `summary` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL COMMENT '招标摘要',
+  `release_date` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '发布时间',
+  `status` int(4) NULL DEFAULT NULL COMMENT '状态 0:未推送 1:已推送',
+  `create_time` datetime NULL DEFAULT NULL COMMENT '创建时间',
+  `send_time` datetime NULL DEFAULT NULL COMMENT '推送时间',
+  `other_urls` varchar(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '其他连接(招标编号相同的多个链接)',
+  `remark` varchar(500) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '备注',
+  PRIMARY KEY (`url`) USING BTREE
+) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
+
+INSERT INTO `t_data` (`url`, `no`, `title`, `date`, `area`, `address`, `summary`, `release_date`, `status`, `create_time`, `send_time`, `remark`) VALUES ('https://www.chinabidding.com/bidDetail/260794529.html', 'NWZ241216-2103-049601', '中石化华东油气分公司2024年度210306填料塔框架协议招标采购', '2024年12月27日9时0分', '全国', '中国石化物资电子招投标交易平台(https://bidding.epec.com)', '本招标项目为中国石油化工股份有限公司华东油气分公司2024年度210306填料塔框架协议招标采购,招标编号为NWZ241216-2103-049601。投标人须具备工业管道安装资质或压力管道元件制造资质,并在有效期内;具备A级压力容器制造证书且在有效期内。招标文件于2024年12月16日11时0分开始售卖,截止时间为2024年12月23日9时0分,电子投标文件需在2024年12月27日9时0分前递交。', '2024-12-17', 0, '2024-12-19 15:26:54', NULL, NULL);
+
+-- ----------------------------
+-- Table structure for t_urls
+-- ----------------------------
+DROP TABLE IF EXISTS `t_urls`;
+CREATE TABLE `t_urls`  (
+  `url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL COMMENT '需访问的URL链接',
+  `type` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '适配器类型',
+  `username` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '用户名',
+  `password` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '密码',
+  `keywords` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '关键字,多个以”,“分隔',
+  `is_active` int(4) NULL DEFAULT NULL COMMENT '激活状态 1:激活 0:失活',
+  `remark` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NULL DEFAULT NULL COMMENT '备注',
+  PRIMARY KEY (`url`) USING BTREE
+) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_bin ROW_FORMAT = Dynamic;
+
+-- WARNING: real account credentials are committed in this seed row; rotate them and move secrets out of version control before production use.
+INSERT INTO `t_urls` (`url`, `type`, `username`, `password`, `keywords`, `is_active`, `remark`) VALUES ('https://www.chinabidding.com/', 'chinabidding', 'brukernano2011', '695765FqX', '红外光谱仪', 1, '中国国际招标网 (www.chinabidding.com 必联网)\r\nBruker Beijing	 用户名:brukernano2011               密码:695765FqX');
+
+SET FOREIGN_KEY_CHECKS = 1;

+ 8 - 0
SourceCode/TenderCrawler/requirements.txt

@@ -0,0 +1,8 @@
+PyMySQL==1.1.1
+python_dateutil==2.9.0.post0
+PyYAML==6.0.2
+Requests==2.32.3
+schedule==1.2.2
+selenium==4.27.1
+cryptography==41.0.4