|
@@ -1,172 +1,213 @@
|
|
from time import sleep
|
|
from time import sleep
|
|
|
|
+from typing import List, Optional
|
|
|
|
|
|
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
|
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support import expected_conditions as ec
|
|
from selenium.webdriver.support import expected_conditions as ec
|
|
|
|
|
|
-
|
|
|
|
import utils
|
|
import utils
|
|
from adapters.data_collection_adapter_interface import IDataCollectionAdapter
|
|
from adapters.data_collection_adapter_interface import IDataCollectionAdapter
|
|
-from stores.data_store_interface import IDataStore
|
|
|
|
|
|
|
|
|
|
|
|
-class CcgpDataCollectionAdapter(IDataCollectionAdapter):
|
|
|
|
- """
|
|
|
|
- 中国政府采购网数据采集适配器
|
|
|
|
- """
|
|
|
|
|
|
+class CCGPDataCollectionAdapter(IDataCollectionAdapter):
|
|
|
|
+ """中国政府采购网数据采集适配器"""
|
|
|
|
|
|
def __init__(self, url: str):
    """Create the adapter for the China Government Procurement site.

    Delegates all state setup to the base-class initializer, passing the
    target URL together with the two fixed strings "ccgp" and "近1周".
    NOTE(review): the base class presumably stores these as the adapter
    type and the default search date range ("last week") — confirm against
    IDataCollectionAdapter.

    Args:
        url: URL of the target website.
    """
    adapter_type = "ccgp"
    default_search_range = "近1周"
    super().__init__(url, adapter_type, default_search_range)
|
|
|
|
|
|
def login(self, username: str, password: str) -> None:
    """Do nothing: this site is scraped without logging in.

    Args:
        username: Ignored.
        password: Ignored.
    """
    return None
|
|
|
|
|
|
def _collect(self, keyword: str) -> None:
    """Collect every search hit for one keyword.

    Logs the keyword and the configured date range, runs the search, and
    hands a non-empty result list to ``_process_list``. An empty result
    ends the run quietly.

    Args:
        keyword: A single search keyword.

    Raises:
        Exception: Anything raised while searching or processing is logged
            and then re-raised unchanged.
    """
    try:
        self.logger.info(f"开始采集关键词: {keyword}, 时间范围: {self._search_txt}")
        found = self._search(keyword)
        if found:
            self._process_list(found)
    except Exception as exc:
        self.logger.error(f"采集失败: {exc}")
        raise
|
|
|
|
+
|
|
|
|
def _search(self, keyword: str) -> List:
    """Submit a keyword search and return the links on the first result page.

    Steps, in order: load ``self.url``, wait for the ``searchForm`` element,
    type the keyword into the ``kw`` input, click the ``doSearch2`` button,
    reset the page counter, wait for the ``vT-srch-result`` container, apply
    the date filter via ``_set_search_date``, then read the result anchors.

    NOTE(review): ``_wait_for`` is defined outside this block; it presumably
    raises when the condition is not met in time — confirm.

    Args:
        keyword: Search keyword typed into the form.

    Returns:
        The elements matched by the result-list XPath (may be empty).
    """
    self.driver.get(self.url)

    form_ready = ec.presence_of_element_located((By.ID, "searchForm"))
    self._wait_for(form_ready, message="搜索框加载超时")

    # Fixed pauses are kept as in the original pacing of the interaction.
    keyword_box = self.driver.find_element(By.ID, "kw")
    sleep(2)
    keyword_box.clear()
    keyword_box.send_keys(keyword)

    submit_xpath = "//form[@id='searchForm']/input[@id='doSearch2']"
    submit_button = self.driver.find_element(By.XPATH, submit_xpath)
    sleep(1)
    submit_button.click()

    # A new search starts again from the first page.
    self._next_count = 0
    results_ready = ec.presence_of_element_located(
        (By.CLASS_NAME, "vT-srch-result")
    )
    self._wait_for(results_ready, message="搜索结果加载超时")

    self._set_search_date()

    return self.driver.find_elements(
        By.XPATH, "//ul[@class='vT-srch-result-list-bid']/li/a"
    )
|
|
|
|
+
|
|
|
|
def _set_search_date(self) -> None:
    """Select the configured date range on the results page (best effort).

    When ``self._search_txt`` equals ``self._default_search_txt`` nothing is
    clicked and the method only pauses for one second. Otherwise the entries
    of the ``datesel`` list are scanned for one whose text equals the
    configured range; the first match is clicked. The method then waits for
    the ``vT-srch-result`` container again.

    If no entry matches, a warning is logged so that a misconfigured range
    does not silently fall back to whatever the page currently shows.

    Any exception is logged and swallowed: a failure to set the range does
    not abort the search.
    """
    try:
        if self._search_txt == self._default_search_txt:
            sleep(1)
            return

        options = self.driver.find_elements(By.XPATH, "//ul[@id='datesel']/li")
        for option in options:
            if self._search_txt == option.text:
                sleep(1)
                option.click()
                break
        else:
            # Loop finished without a click: the requested range is not offered.
            self.logger.warning(f"未找到时间范围选项: {self._search_txt}")

        self._wait_for(
            ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result")),
            message="设置时间范围后页面加载超时",
        )
    except Exception as e:
        self.logger.error(f"设置时间范围失败: {e}")
|
|
|
|
+
|
|
|
|
def _process_list(self, items: List) -> None:
    """Process a page of result items, then every following page.

    Each item is passed to ``_process_item`` followed by a two-second pause.
    After a page is done, ``_next_page`` is asked for the next one; the walk
    stops as soon as a page is empty or ``None``.

    Pagination is a loop rather than a recursive call so that the number of
    result pages is not limited by the interpreter's recursion depth.

    Args:
        items: Items of the current page; an empty or ``None`` value is a
            no-op.
    """
    page = items
    while page:
        for item in page:
            self._process_item(item)
            sleep(2)
        page = self._next_page()
|
|
|
|
|
|
def _next_page(self) -> Optional[List]:
    """Advance to the next result page and return its item links.

    Looks for the ``next`` anchor in the pager. When it is absent the last
    page has been reached and ``None`` is returned. Otherwise the anchor is
    clicked, the page counter is incremented, and after a one-second pause
    and a wait for the ``vT-srch-result`` container the result anchors of
    the new page are returned.

    Returns:
        The elements matched by the result-list XPath, or ``None`` when
        there is no next-page button.

    Raises:
        Exception: If a ``NoSuchElementException`` escapes the steps after
            the button lookup; the original exception is attached as the
            cause.
    """
    try:
        next_path = "//div[@class='vT-srch-result-list']/p/a[@class='next']"
        try:
            btn = self.driver.find_element(By.XPATH, next_path)
        except NoSuchElementException:
            self.logger.debug("已到最后一页")
            return None

        btn.click()
        self._next_count += 1
        # _next_count counts clicks, so the page now shown is count + 1.
        self.logger.debug(f"下一页[{self._next_count+1}]")
        sleep(1)

        self._wait_for(
            ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result")),
            message="下一页加载超时",
        )

        return self.driver.find_elements(
            By.XPATH, "//ul[@class='vT-srch-result-list-bid']/li/a"
        )
    except NoSuchElementException as e:
        raise Exception(f"页面元素未找到: {e}") from e
|
|
|
|
|
|
- def _process_item(self, item):
|
|
|
|
|
|
+ def _process_item(self, item) -> None:
|
|
|
|
+ """处理单条数据"""
|
|
main_handle = self.driver.current_window_handle
|
|
main_handle = self.driver.current_window_handle
|
|
close = True
|
|
close = True
|
|
|
|
+
|
|
try:
|
|
try:
|
|
|
|
+ # 检查URL是否已采集
|
|
url = item.get_attribute("href")
|
|
url = item.get_attribute("href")
|
|
if self._check_is_collect_by_url(url):
|
|
if self._check_is_collect_by_url(url):
|
|
close = False
|
|
close = False
|
|
return
|
|
return
|
|
- utils.get_logger().debug(f"跳转详情")
|
|
|
|
|
|
+
|
|
|
|
+ # 打开详情页
|
|
|
|
+ self.logger.debug("打开详情页")
|
|
sleep(1)
|
|
sleep(1)
|
|
item.click()
|
|
item.click()
|
|
- if not self._wait_until(ec.number_of_windows_to_be(2)):
|
|
|
|
- return
|
|
|
|
|
|
+
|
|
|
|
+ # 切换窗口
|
|
|
|
+ self._wait_for(ec.number_of_windows_to_be(2), message="新窗口打开超时")
|
|
|
|
+
|
|
handles = self.driver.window_handles
|
|
handles = self.driver.window_handles
|
|
for handle in handles:
|
|
for handle in handles:
|
|
if handle != main_handle:
|
|
if handle != main_handle:
|
|
self.driver.switch_to.window(handle)
|
|
self.driver.switch_to.window(handle)
|
|
break
|
|
break
|
|
- if not self._wait_until(
|
|
|
|
- ec.presence_of_element_located((By.TAG_NAME, "body"))
|
|
|
|
- ):
|
|
|
|
- return
|
|
|
|
|
|
|
|
|
|
+ # 等待页面加载
|
|
|
|
+ self._wait_for(
|
|
|
|
+ ec.presence_of_element_located((By.TAG_NAME, "body")),
|
|
|
|
+ message="详情页加载超时",
|
|
|
|
+ )
|
|
|
|
+
|
|
|
|
+ # 获取内容
|
|
content = self.driver.find_element(
|
|
content = self.driver.find_element(
|
|
By.XPATH, "//div[@class='vF_deail_maincontent']"
|
|
By.XPATH, "//div[@class='vF_deail_maincontent']"
|
|
).text
|
|
).text
|
|
- # 排除其他公告
|
|
|
|
|
|
+
|
|
|
|
+ # 判断公告类型
|
|
if self._check_type("其他公告"):
|
|
if self._check_type("其他公告"):
|
|
self._save_db(url, content, 3, is_invalid=True)
|
|
self._save_db(url, content, 3, is_invalid=True)
|
|
return
|
|
return
|
|
- # 判断是否为投标公告
|
|
|
|
|
|
+
|
|
data_type = (
|
|
data_type = (
|
|
1
|
|
1
|
|
if self._check_type("中标公告")
|
|
if self._check_type("中标公告")
|
|
@@ -174,20 +215,19 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
|
|
or self._check_type("终止公告")
|
|
or self._check_type("终止公告")
|
|
else 0
|
|
else 0
|
|
)
|
|
)
|
|
|
|
+
|
|
|
|
+ # 检查关键词并保存
|
|
if self._check_content(content):
|
|
if self._check_content(content):
|
|
attach_str = self._attach_download()
|
|
attach_str = self._attach_download()
|
|
self._save_db(url, content, data_type, attach_str)
|
|
self._save_db(url, content, data_type, attach_str)
|
|
else:
|
|
else:
|
|
self._save_db(url, content, data_type, is_invalid=True)
|
|
self._save_db(url, content, data_type, is_invalid=True)
|
|
|
|
+
|
|
except TimeoutException as e:
|
|
except TimeoutException as e:
|
|
- utils.get_logger().error(
|
|
|
|
- f"采集发生异常 [{self._adapter_type}] Timeout: {self.driver.current_url}。Exception: {e}"
|
|
|
|
- )
|
|
|
|
|
|
+ self.logger.error(f"处理数据超时: {e}")
|
|
except NoSuchElementException as e:
|
|
except NoSuchElementException as e:
|
|
- utils.get_logger().error(
|
|
|
|
- f"采集发生异常 [{self._adapter_type}] NoSuchElement: {self.driver.current_url}。Exception: {e}"
|
|
|
|
- )
|
|
|
|
- raise Exception(f"采集失败 [{self._adapter_type}] [找不到元素]: {e}")
|
|
|
|
|
|
+ self.logger.error(f"页面元素未找到: {e}")
|
|
|
|
+ raise
|
|
finally:
|
|
finally:
|
|
if close:
|
|
if close:
|
|
sleep(1)
|
|
sleep(1)
|
|
@@ -195,56 +235,75 @@ class CcgpDataCollectionAdapter(IDataCollectionAdapter):
|
|
self.driver.switch_to.window(main_handle)
|
|
self.driver.switch_to.window(main_handle)
|
|
|
|
|
|
def _check_type(self, type_str: str) -> bool:
    """Report whether the current page has a link with exactly this text.

    A match is logged at info level.

    Args:
        type_str: Link text to look for (an announcement type label).

    Returns:
        True when at least one such link exists, otherwise False.
    """
    matches = self.driver.find_elements(By.LINK_TEXT, type_str)
    if not matches:
        return False
    self.logger.info(f"公告类型: {type_str}")
    return True
|
|
|
|
|
|
def _attach_download(self) -> Optional[str]:
    """Download the attachments linked from the current detail page.

    Attachment anchors are gathered from two places: anchors inside
    ``td.bid_attachtab_content`` and anchors carrying ``ignore='1'``. Each
    distinct ``href`` is downloaded once through
    ``utils.download_remote_file``. The file name is taken from the anchor
    text, else its ``download`` attribute, else the last URL segment; names
    without a dot are rejected.

    Anchors without an ``href`` are skipped explicitly instead of being
    recorded as a seen URL and passed on to the download helper.

    A failure on one attachment is logged and does not stop the others.

    Returns:
        The downloaded file paths joined with commas; an empty string when
        nothing was downloaded.
    """
    tab_links = self.driver.find_elements(
        By.XPATH, "//td[@class='bid_attachtab_content']/a"
    )
    flagged_links = self.driver.find_elements(By.XPATH, "//a[@ignore='1']")
    candidates = tab_links + flagged_links

    self.logger.debug(
        f"附件数量: {len(tab_links)}/{len(flagged_links)}/{len(candidates)}"
    )

    paths = []
    seen_urls = set()
    for attach_el in candidates:
        try:
            attach_url = attach_el.get_attribute("href")
            if not attach_url:
                self.logger.warning(f"无效附件链接: {attach_url}")
                continue
            if attach_url in seen_urls:
                self.logger.info(f"重复附件: {attach_url}")
                continue
            seen_urls.add(attach_url)

            file_name = (
                attach_el.text
                or attach_el.get_attribute("download")
                or attach_url.split("/")[-1]
            )
            if not file_name or "." not in file_name:
                self.logger.warning(f"无效文件名: {file_name}")
                continue

            self.logger.debug(f"下载附件: {file_name}")
            path = utils.download_remote_file(attach_url, file_name)
            if path:
                self.logger.debug(f"下载成功: {path}")
                paths.append(path)
            else:
                self.logger.warning(f"下载失败: {file_name}")
        except Exception as e:
            # Best effort: one broken attachment must not lose the rest.
            self.logger.error(f"处理附件失败: {e}")
            continue

    attach_str = ",".join(paths)
    if attach_str:
        self.logger.info(f"附件下载完成: {attach_str}")
    return attach_str
|