ccgp_data_collection_adapter.py 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309
  1. from time import sleep
  2. from typing import List, Optional
  3. from selenium.common.exceptions import TimeoutException, NoSuchElementException
  4. from selenium.webdriver.common.by import By
  5. from selenium.webdriver.support import expected_conditions as ec
  6. import utils
  7. from adapters.data_collection_adapter_interface import IDataCollectionAdapter
  8. class CCGPDataCollectionAdapter(IDataCollectionAdapter):
  9. """中国政府采购网数据采集适配器"""
  10. def __init__(self, url: str):
  11. """初始化适配器
  12. Args:
  13. url: 目标网站URL
  14. """
  15. super().__init__(url, "ccgp", "近1周")
  16. def login(self, username: str, password: str) -> None:
  17. """登录网站(CCGP无需登录)"""
  18. pass
  19. def _collect(self, keyword: str) -> None:
  20. """执行数据采集
  21. Args:
  22. keyword: 单个搜索关键词
  23. """
  24. try:
  25. # 获取搜索时间范围
  26. self.logger.info(f"开始采集关键词: {keyword}, 时间范围: {self._search_txt}")
  27. # 搜索数据
  28. items = self._search(keyword)
  29. if not items:
  30. return
  31. # 处理数据列表
  32. self._process_list(items)
  33. except Exception as e:
  34. self.logger.error(f"采集失败: {e}")
  35. raise
  36. def _search(self, keyword: str) -> List:
  37. """搜索数据
  38. Args:
  39. keyword: 搜索关键词
  40. Returns:
  41. List: 搜索结果列表
  42. """
  43. # 打开搜索页面
  44. self.driver.get(self.url)
  45. # 等待搜索框
  46. self._wait_for(
  47. ec.presence_of_element_located((By.ID, "searchForm")),
  48. message="搜索框加载超时",
  49. )
  50. # 输入关键词
  51. search_el = self.driver.find_element(By.ID, "kw")
  52. sleep(2)
  53. search_el.clear()
  54. search_el.send_keys(keyword)
  55. # 点击搜索
  56. search_btn = self.driver.find_element(
  57. By.XPATH, "//form[@id='searchForm']/input[@id='doSearch2']"
  58. )
  59. sleep(1)
  60. search_btn.click()
  61. # 等待结果加载
  62. self._next_count = 0
  63. self._wait_for(
  64. ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result")),
  65. message="搜索结果加载超时",
  66. )
  67. # 设置时间范围
  68. self._set_search_date()
  69. # 获取结果列表
  70. items = self.driver.find_elements(
  71. By.XPATH, "//ul[@class='vT-srch-result-list-bid']/li/a"
  72. )
  73. return items
  74. def _set_search_date(self) -> None:
  75. """设置搜索时间范围"""
  76. try:
  77. if self._search_txt != self._default_search_txt:
  78. last_els = self.driver.find_elements(By.XPATH, "//ul[@id='datesel']/li")
  79. for last_el in last_els:
  80. if self._search_txt == last_el.text:
  81. sleep(1)
  82. last_el.click()
  83. break
  84. self._wait_for(
  85. ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result")),
  86. message="设置时间范围后页面加载超时",
  87. )
  88. else:
  89. sleep(1)
  90. except Exception as e:
  91. self.logger.error(f"设置时间范围失败: {e}")
  92. def _process_list(self, items: List) -> None:
  93. """处理数据列表
  94. Args:
  95. items: 数据列表
  96. """
  97. if not items:
  98. return
  99. # 处理当前页
  100. for item in items:
  101. self._process_item(item)
  102. sleep(2)
  103. # 处理下一页
  104. next_items = self._next_page()
  105. if next_items:
  106. self._process_list(next_items)
  107. def _next_page(self) -> Optional[List]:
  108. """获取下一页数据"""
  109. try:
  110. # 查找下一页按钮
  111. next_path = "//div[@class='vT-srch-result-list']/p/a[@class='next']"
  112. try:
  113. btn = self.driver.find_element(By.XPATH, next_path)
  114. except NoSuchElementException:
  115. self.logger.debug("已到最后一页")
  116. return None
  117. # 点击下一页
  118. btn.click()
  119. self._next_count += 1
  120. self.logger.debug(f"下一页[{self._next_count+1}]")
  121. sleep(1)
  122. # 等待页面加载
  123. self._wait_for(
  124. ec.presence_of_element_located((By.CLASS_NAME, "vT-srch-result")),
  125. message="下一页加载超时",
  126. )
  127. # 获取数据列表
  128. items = self.driver.find_elements(
  129. By.XPATH, "//ul[@class='vT-srch-result-list-bid']/li/a"
  130. )
  131. return items
  132. except NoSuchElementException as e:
  133. raise Exception(f"页面元素未找到: {e}")
  134. def _process_item(self, item) -> None:
  135. """处理单条数据"""
  136. main_handle = self.driver.current_window_handle
  137. close = True
  138. try:
  139. # 检查URL是否已采集
  140. url = item.get_attribute("href")
  141. if self._check_is_collect_by_url(url):
  142. close = False
  143. return
  144. # 打开详情页
  145. self.logger.debug("打开详情页")
  146. sleep(1)
  147. item.click()
  148. # 切换窗口
  149. self._wait_for(ec.number_of_windows_to_be(2), message="新窗口打开超时")
  150. handles = self.driver.window_handles
  151. for handle in handles:
  152. if handle != main_handle:
  153. self.driver.switch_to.window(handle)
  154. break
  155. # 等待页面加载
  156. self._wait_for(
  157. ec.presence_of_element_located((By.TAG_NAME, "body")),
  158. message="详情页加载超时",
  159. )
  160. # 获取内容
  161. content = self.driver.find_element(
  162. By.XPATH, "//div[@class='vF_deail_maincontent']"
  163. ).text
  164. # 判断公告类型
  165. if self._check_type("其他公告"):
  166. self._save_db(url, content, 3, is_invalid=True)
  167. return
  168. data_type = (
  169. 1
  170. if self._check_type("中标公告")
  171. or self._check_type("成交公告")
  172. or self._check_type("终止公告")
  173. else 0
  174. )
  175. # 检查关键词并保存
  176. if self._check_content(content):
  177. attach_str = self._attach_download()
  178. self._save_db(url, content, data_type, attach_str)
  179. else:
  180. self._save_db(url, content, data_type, is_invalid=True)
  181. except TimeoutException as e:
  182. self.logger.error(f"处理数据超时: {e}")
  183. except NoSuchElementException as e:
  184. self.logger.error(f"页面元素未找到: {e}")
  185. raise
  186. finally:
  187. if close:
  188. sleep(1)
  189. self.driver.close()
  190. self.driver.switch_to.window(main_handle)
  191. def _check_type(self, type_str: str) -> bool:
  192. """检查公告类型
  193. Args:
  194. type_str: 类型文本
  195. Returns:
  196. bool: 是否匹配
  197. """
  198. links = self.driver.find_elements(By.LINK_TEXT, type_str)
  199. if links:
  200. self.logger.info(f"公告类型: {type_str}")
  201. return True
  202. return False
  203. def _attach_download(self) -> Optional[str]:
  204. """下载附件
  205. Returns:
  206. str: 附件路径
  207. """
  208. paths = []
  209. # 查找附件链接
  210. attach_els = self.driver.find_elements(
  211. By.XPATH, "//td[@class='bid_attachtab_content']/a"
  212. )
  213. attach_2_els = self.driver.find_elements(By.XPATH, "//a[@ignore='1']")
  214. all_attachments = attach_els + attach_2_els
  215. self.logger.debug(
  216. f"附件数量: {len(attach_els)}/{len(attach_2_els)}/{len(all_attachments)}"
  217. )
  218. # 下载附件
  219. attach_urls = []
  220. for attach_el in all_attachments:
  221. try:
  222. # 获取附件信息
  223. attach_url = attach_el.get_attribute("href")
  224. if attach_url in attach_urls:
  225. self.logger.info(f"重复附件: {attach_url}")
  226. continue
  227. attach_urls.append(attach_url)
  228. # 获取文件名
  229. file_name = (
  230. attach_el.text
  231. or attach_el.get_attribute("download")
  232. or attach_url.split("/")[-1]
  233. )
  234. if not file_name or "." not in file_name:
  235. self.logger.warning(f"无效文件名: {file_name}")
  236. continue
  237. # 下载文件
  238. self.logger.debug(f"下载附件: {file_name}")
  239. path = utils.download_remote_file(attach_url, file_name)
  240. if path:
  241. self.logger.debug(f"下载成功: {path}")
  242. paths.append(path)
  243. else:
  244. self.logger.warning(f"下载失败: {file_name}")
  245. except Exception as e:
  246. self.logger.error(f"处理附件失败: {e}")
  247. continue
  248. # 返回附件路径
  249. attach_str = ",".join(paths)
  250. if attach_str:
  251. self.logger.info(f"附件下载完成: {attach_str}")
  252. return attach_str