# chinabidding_data_collection_adapter.py
  1. from time import sleep
  2. from selenium import webdriver
  3. from selenium.webdriver.common.by import By
  4. from selenium.webdriver.support.wait import WebDriverWait
  5. from selenium.webdriver.support import expected_conditions as ec
  6. from selenium.common.exceptions import TimeoutException, NoSuchElementException
  7. from drivers.driver_creator import DriverCreator
  8. from stores.data_store_interface import IDataStore
  9. from adapters.data_collection_adapter_interface import IDataCollectionAdapter
  10. from utils.logger_helper import LoggerHelper
  11. from utils.config_helper import ConfigHelper
  12. class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
  13. """
  14. 中国招标网数据采集适配器
  15. """
  16. logger = LoggerHelper.get_logger()
  17. def __init__(self, url: str):
  18. self._url = url
  19. self._store = None
  20. self._driver = None
  21. self._keyword = None
  22. @property
  23. def store(self) -> IDataStore:
  24. return self._store
  25. @property
  26. def url(self):
  27. return self._url
  28. @property
  29. def keyword(self):
  30. return self._keyword
  31. @property
  32. def driver(self):
  33. if not self._driver:
  34. self._driver = self.create_driver()
  35. return self._driver
  36. def create_driver(self) -> webdriver:
  37. try:
  38. return DriverCreator().gen_remote_driver(self.url)
  39. except Exception as e:
  40. raise Exception(f"创建驱动器失败: {e}")
  41. def login(self, driver, username: str, password: str) -> None:
  42. try:
  43. login_el = driver.find_element(
  44. By.XPATH, "//div[@id='loginRight']/a[@class='login']")
  45. login_el.click()
  46. wait = WebDriverWait(driver, 10, 1)
  47. wait.until(ec.presence_of_element_located((By.ID, "userpass")))
  48. un_el = driver.find_element(By.ID, "username")
  49. un_el.send_keys(username)
  50. pass_el = driver.find_element(By.ID, "userpass")
  51. pass_el.send_keys(password)
  52. login_btn = driver.find_element(By.ID, "login-button")
  53. login_btn.click()
  54. wait.until(ec.presence_of_element_located((By.ID, "site-content")))
  55. except TimeoutException as e:
  56. raise Exception(f"登录失败 [超时]: {e}")
  57. except NoSuchElementException as e:
  58. raise Exception(f"登录失败 [找不到元素]: {e}")
  59. def search(self, driver, keyword: str) -> list:
  60. try:
  61. self._keyword = keyword
  62. wait = WebDriverWait(driver, 10, 1)
  63. wait.until(
  64. ec.presence_of_element_located((By.ID, "projSearchForm")))
  65. search_el = driver.find_element(By.ID, "fullText")
  66. search_el.send_keys(keyword)
  67. search_btn = driver.find_element(
  68. By.XPATH, "//form[@id='projSearchForm']/button")
  69. search_btn.click()
  70. wait.until(ec.presence_of_element_located((By.ID, "site-content")))
  71. # 查询3天内的数据
  72. search_txt = ConfigHelper().get("adapter.chinabidding.search_day")
  73. if not search_txt:
  74. search_txt = "近三天"
  75. self.logger.info(f"搜索关键字: {keyword},搜索条件: {search_txt}")
  76. last_el = driver.find_element(By.LINK_TEXT, search_txt)
  77. last_el.click()
  78. wait.until(ec.presence_of_element_located((By.ID, "site-content")))
  79. try:
  80. a_links = driver.find_elements(
  81. By.XPATH, "//form[@id='pagerSubmitForm']/a")
  82. count = len(a_links)
  83. if count > 1:
  84. count = count - 1
  85. self.logger.info(f"共查询到 {count} 页")
  86. except Exception as e:
  87. self.logger.error(f"搜索失败[尝试查询页数]: {e}")
  88. items = driver.find_elements(By.XPATH,
  89. "//ul[@class='as-pager-body']/li/a")
  90. return items
  91. except TimeoutException as e:
  92. raise Exception(f"搜索失败 [超时]: {e}")
  93. except NoSuchElementException as e:
  94. raise Exception(f"搜索失败 [找不到元素]: {e}")
  95. def collect(self, driver, items: list, store: IDataStore) :
  96. if store:
  97. self._store = store
  98. self._process_list(driver, items)
  99. self.store.save_collect_data(True)
  100. def _next_page(self, driver) -> list:
  101. try:
  102. wait = WebDriverWait(driver, 10, 1)
  103. next_path = "//form[@id='pagerSubmitForm']/a[@class='next']"
  104. wait.until(ec.presence_of_element_located((By.XPATH, next_path)))
  105. btn = driver.find_element(By.XPATH, next_path)
  106. btn.click()
  107. self.logger.info(f"跳转到下页: {driver.current_url}")
  108. wait.until(ec.presence_of_element_located((By.ID, "site-content")))
  109. items = driver.find_elements(By.XPATH,
  110. "//ul[@class='as-pager-body']/li/a")
  111. return items
  112. except NoSuchElementException as e:
  113. raise Exception(f"翻页失败 [找不到元素]: {e}")
  114. except TimeoutException:
  115. self.logger.info("翻页结束")
  116. return []
  117. def _process_item(self, driver, item):
  118. current_handle = driver.current_window_handle
  119. try:
  120. url = item.get_attribute('href')
  121. old = self.store.query_one_collect_by_url(url)
  122. if old:
  123. self.logger.info(f"已采集过: {url}")
  124. return
  125. item.click()
  126. wait = WebDriverWait(driver, 10, 1)
  127. wait.until(ec.number_of_windows_to_be(2))
  128. handles = driver.window_handles
  129. for handle in handles:
  130. if handle != current_handle:
  131. driver.switch_to.window(handle)
  132. break
  133. url = driver.current_url
  134. self.logger.info(f"跳转详情: {driver.current_url}")
  135. wait.until(ec.presence_of_element_located((By.TAG_NAME, "body")))
  136. content = driver.find_element(By.TAG_NAME, "body").text
  137. self._save(url, content)
  138. sleep(1)
  139. driver.close()
  140. sleep(2)
  141. except TimeoutException as e:
  142. self.logger.error(
  143. f"采集发生异常 Timeout: {driver.current_url}。Exception: {e}")
  144. # raise Exception(f"采集失败 [超时]: {e}")
  145. except NoSuchElementException as e:
  146. self.logger.error(
  147. f"采集发生异常 NoSuchElement: {driver.current_url}。Exception: {e}")
  148. raise Exception(f"采集失败 [找不到元素]: {e}")
  149. finally:
  150. driver.switch_to.window(current_handle)
  151. def _save(self, url, content):
  152. # self.logger.info(f"保存数据: {url},关键字{self.keyword}")
  153. if not self.store:
  154. self.logger.info(f"DataStore 未指定: {url},关键字{self.keyword}")
  155. else:
  156. self.store.insert_collect_data(url, self.keyword, content, True)
  157. def _process_list(self, driver, items: list) -> list:
  158. if not items:
  159. return []
  160. for item in items:
  161. self._process_item(driver, item)
  162. sleep(2)
  163. next_items = self._next_page(driver)
  164. return self._process_list(driver, next_items)
  165. def teardown(self, driver) -> None:
  166. try:
  167. if driver:
  168. driver.quit()
  169. except Exception as e:
  170. raise Exception(f"关闭驱动器失败: {e}")