chinabidding_data_collection_adapter.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
  1. from time import sleep
  2. from selenium.common.exceptions import TimeoutException, NoSuchElementException
  3. from selenium.webdriver.common.by import By
  4. from selenium.webdriver.support import expected_conditions as ec
  5. from selenium.webdriver.support.wait import WebDriverWait
  6. import utils
  7. from adapters.data_collection_adapter_interface import IDataCollectionAdapter
  8. from stores.data_store_interface import IDataStore
  9. class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
  10. """
  11. 中国招标网数据采集适配器
  12. """
  def __init__(self, url: str, store: IDataStore = None):
      """Set up the adapter's private state.

      Args:
          url: Address kept in ``_url``; ``_search_by_type`` loads it in the
              browser before each search.
          store: Optional data store kept in ``_store``.
      """
      # Tag embedded in this adapter's log and error messages.
      self._adapter_type = "chinabidding"
      self._url, self._store = url, store
      # No browser driver or keyword is attached at construction time.
      self._driver = self._keyword = None
      # Incremented by ``_next_page`` and reset to 0 by ``_search``.
      self._next_count = 0
  def login(self, username: str, password: str) -> None:
      """Log in through the site's login form.

      Clicks the login link, waits for the ``userpass`` field to be present,
      types the credentials, submits the form, and then waits for the element
      with id ``site-content`` to be present.

      Args:
          username: Text typed into the element with id ``username``.
          password: Text typed into the element with id ``userpass``.

      Raises:
          Exception: If a wait times out or an element cannot be located.
              The triggering Selenium exception is chained as ``__cause__``.
      """
      try:
          login_el = self.driver.find_element(
              By.XPATH, "//div[@id='loginRight']/a[@class='login']"
          )
          login_el.click()
          # WebDriverWait(driver, timeout, poll_frequency): 10 s, polled every 1 s.
          wait = WebDriverWait(self.driver, 10, 1)
          wait.until(ec.presence_of_element_located((By.ID, "userpass")))
          un_el = self.driver.find_element(By.ID, "username")
          un_el.send_keys(username)
          pass_el = self.driver.find_element(By.ID, "userpass")
          pass_el.send_keys(password)
          login_btn = self.driver.find_element(By.ID, "login-button")
          login_btn.click()
          # Presence of "site-content" is treated as the post-login signal.
          wait.until(ec.presence_of_element_located((By.ID, "site-content")))
      except TimeoutException as e:
          # Chain explicitly so the Selenium traceback stays attached.
          raise Exception(f"登录失败 [{self._adapter_type}] [超时]: {e}") from e
      except NoSuchElementException as e:
          raise Exception(
              f"登录失败 [{self._adapter_type}] [找不到元素]: {e}"
          ) from e
  def _collect(self, keyword: str):
      """Run the search-and-process cycle for data types 0 and 1 in turn.

      A two-second pause separates the two passes. Afterwards, if the config
      flag named by ``self.batch_save_key`` is true,
      ``self.store.save_collect_data(True)`` is called.

      Args:
          keyword: Search keyword forwarded to ``_search_by_type``.
      """
      for data_type in (0, 1):
          if data_type:
              # Pause between the first and the second pass.
              sleep(2)
          found = self._search_by_type(keyword, data_type)
          self._process_list(found, data_type)

      if utils.get_config_bool(self.batch_save_key):
          self.store.save_collect_data(True)
  def _search_by_type(self, keyword: str, data_type):
      """Open the listing page for one announcement type and search it.

      Loads ``self._url``, clicks the "more" link of the selected section,
      waits until two windows exist, closes the current window, switches to
      the first remaining handle and delegates to ``_search``.

      Args:
          keyword: Search keyword forwarded to ``_search``.
          data_type: ``0`` selects the section with id ``z-b-g-g``; any other
              value selects the section with id ``z-b-jg-gg``.

      Returns:
          The list returned by ``_search``, or ``[]`` when the wait for the
          second window fails.

      Raises:
          Exception: On a Selenium timeout or a missing element; the original
              exception is chained as ``__cause__``.
      """
      try:
          self.driver.get(self._url)
          if data_type == 0:
              utils.get_logger().info(f"开始采集 招标公告")
              el = self.driver.find_element(
                  By.XPATH, "//div[@id='z-b-g-g']/h2/a[@class='more']"
              )
          else:
              utils.get_logger().info(f"开始采集 中标结果公告")
              el = self.driver.find_element(
                  By.XPATH, "//div[@id='z-b-jg-gg']/h2/a[@class='more']"
              )
          el.click()
          if not self._wait_until(ec.number_of_windows_to_be(2)):
              return []
          # Close the window still in focus, then continue in the first
          # remaining handle (presumably the newly opened listing window).
          self.driver.close()
          self.driver.switch_to.window(self.driver.window_handles[0])
          return self._search(keyword)
      except TimeoutException as e:
          # Chain explicitly so the Selenium traceback stays attached.
          raise Exception(f"搜索失败 [{self._adapter_type}] [超时]: {e}") from e
      except NoSuchElementException as e:
          raise Exception(
              f"搜索失败 [{self._adapter_type}] [找不到元素]: {e}"
          ) from e
  def _search(self, keyword: str) -> list:
      """Submit the keyword search and return the result links of page one.

      Fills the ``fullText`` input of the ``searchBidProjForm`` form, submits
      it, optionally clicks the date-filter link configured under
      ``self.search_day_key``, logs the pager size, and collects the result
      anchors.

      Args:
          keyword: Text typed into the search box.

      Returns:
          The elements matching ``//ul[@class='as-pager-body']/li/a``, or
          ``[]`` when any of the waits fails.
      """
      content_locator = (By.ID, "site-content")

      form_present = ec.presence_of_element_located((By.ID, "searchBidProjForm"))
      if not self._wait_until(form_present):
          return []

      keyword_box = self.driver.find_element(
          By.XPATH, "//form[@id='searchBidProjForm']/ul/li/input[@id='fullText']"
      )
      keyword_box.clear()
      keyword_box.send_keys(keyword)
      self.driver.find_element(
          By.XPATH, "//form[@id='searchBidProjForm']/ul/li/button"
      ).click()
      # A new search starts again from the first page.
      self._next_count = 0
      if not self._wait_until(ec.presence_of_element_located(content_locator)):
          return []

      all_days = "全部"
      day_filter = utils.get_config_value(self.search_day_key, all_days)
      utils.get_logger().debug(f"搜索日期条件: {day_filter}")
      if day_filter == all_days:
          sleep(1)
      else:
          # Locate the filter link first, pause, then click it.
          day_link = self.driver.find_element(By.LINK_TEXT, day_filter)
          sleep(1)
          day_link.click()
          if not self._wait_until(
              ec.presence_of_element_located(content_locator)
          ):
              return []

      # The page count is informational only; a failure here is logged and
      # does not stop the collection.
      try:
          pager_links = self.driver.find_elements(
              By.XPATH, "//form[@id='pagerSubmitForm']/a"
          )
          pages = len(pager_links)
          if pages > 1:
              pages -= 1
          utils.get_logger().debug(f"共查询到 {pages} 页,每页 10 条")
      except Exception as err:
          utils.get_logger().error(f"搜索失败[尝试查询页数]: {err}")

      return self.driver.find_elements(
          By.XPATH, "//ul[@class='as-pager-body']/li/a"
      )
  def _process_list(self, items: list, data_type) -> list:
      """Process every result link on the current page and all later pages.

      Each item is passed to ``_process_item``. After a page is done the
      method pauses two seconds and asks ``_next_page`` for the next batch,
      stopping when that batch is empty.

      Pages are walked with a loop rather than one recursive call per page,
      so a long result set cannot exhaust the interpreter's recursion limit.

      Args:
          items: Result link elements of the current page.
          data_type: Passed through unchanged to ``_process_item``.

      Returns:
          Always an empty list, reached once no items remain.
      """
      while items:
          for item in items:
              self._process_item(item, data_type)
          sleep(2)
          items = self._next_page()
      return []
  def _next_page(self) -> list:
      """Click the pager's "next" link and return the new page's result links.

      Returns:
          The elements matching ``//ul[@class='as-pager-body']/li/a`` on the
          next page, or ``[]`` when there is no "next" link or the wait for
          ``site-content`` fails.

      Raises:
          Exception: On a missing element or timeout after the "next" link
              was found; the original exception is chained as ``__cause__``.
      """
      try:
          try:
              btn = self.driver.find_element(
                  By.XPATH, "//form[@id='pagerSubmitForm']/a[@class='next']"
              )
          except NoSuchElementException:
              # No "next" link: treated as the normal end of paging.
              utils.get_logger().debug(f"翻页结束 [{self._adapter_type}]")
              return []
          btn.click()
          self._next_count += 1
          utils.get_logger().debug(
              f"下一页[{self._next_count+1}]: {self.driver.current_url}"
          )
          if not self._wait_until(
              ec.presence_of_element_located((By.ID, "site-content"))
          ):
              return []
          items = self.driver.find_elements(
              By.XPATH, "//ul[@class='as-pager-body']/li/a"
          )
          return items
      except NoSuchElementException as e:
          # Chain explicitly so the Selenium traceback stays attached.
          raise Exception(
              f"翻页失败 [{self._adapter_type}] [找不到元素]: {e}"
          ) from e
      except TimeoutException as e:
          raise Exception(f"翻页失败 [{self._adapter_type}] [超时]: {e}") from e
  def _process_item(self, item, data_type):
      """Open one result link in its detail window and store its content.

      Links whose ``href`` is already recorded (per
      ``_check_is_collect_by_url``) are skipped. Otherwise the link is
      clicked, the driver switches to the newly opened window, the text of
      the element with class ``content`` is read, and ``_save_db`` is called
      with ``is_invalid=True`` when ``_check_content`` rejects the text.

      The detail window is closed, and focus returned to the listing window,
      only if the driver actually switched to a detail window. This keeps
      the listing window open when the detail window never appears or an
      error occurs before the switch.

      Args:
          item: Result link element from the listing page.
          data_type: Passed through unchanged to ``_save_db``.

      Raises:
          Exception: When an element cannot be found; the original
              ``NoSuchElementException`` is chained as ``__cause__``.
              Timeouts are logged and not re-raised.
      """
      main_handle = self.driver.current_window_handle
      # Becomes True only once focus has moved to the detail window.
      detail_opened = False
      try:
          url = item.get_attribute("href")
          if self._check_is_collect_by_url(url):
              return
          item.click()
          if not self._wait_until(ec.number_of_windows_to_be(2)):
              return
          for handle in self.driver.window_handles:
              if handle != main_handle:
                  self.driver.switch_to.window(handle)
                  detail_opened = True
                  break
          # Use the detail window's URL from here on.
          url = self.driver.current_url
          utils.get_logger().debug(f"跳转详情")
          if not self._wait_until(
              ec.presence_of_element_located((By.CLASS_NAME, "content"))
          ):
              return
          content = self.driver.find_element(By.CLASS_NAME, "content").text
          if self._check_content(content):
              self._save_db(url, content, data_type)
          else:
              self._save_db(url, content, data_type, is_invalid=True)
      except TimeoutException as e:
          # Best effort: a timeout on one item is logged and the item skipped.
          utils.get_logger().error(
              f"采集发生异常 [{self._adapter_type}] Timeout: {self.driver.current_url}。Exception: {e}"
          )
      except NoSuchElementException as e:
          utils.get_logger().error(
              f"采集发生异常 [{self._adapter_type}] NoSuchElement: {self.driver.current_url}。Exception: {e}"
          )
          raise Exception(
              f"采集失败 [{self._adapter_type}] [找不到元素]: {e}"
          ) from e
      finally:
          if detail_opened:
              sleep(2)
              self.driver.close()
              self.driver.switch_to.window(main_handle)