|
@@ -15,7 +15,6 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
|
|
|
中国招标网数据采集适配器
|
|
|
"""
|
|
|
|
|
|
-
|
|
|
def __init__(self, url: str,store:IDataStore=None):
|
|
|
self._url = url
|
|
|
self._store = store
|
|
@@ -23,25 +22,6 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
|
|
|
self._keyword = None
|
|
|
self._adapter_type = "chinabidding"
|
|
|
|
|
|
- # @property
|
|
|
- # def store(self) -> IDataStore:
|
|
|
- # return self._store
|
|
|
- #
|
|
|
- # @property
|
|
|
- # def url(self):
|
|
|
- # return self._url
|
|
|
- #
|
|
|
- # @property
|
|
|
- # def keyword(self):
|
|
|
- # return self._keyword
|
|
|
- #
|
|
|
- # @property
|
|
|
- # def driver(self)->webdriver:
|
|
|
- # if not self._driver:
|
|
|
- # self._driver = self._create_driver()
|
|
|
- # return self._driver
|
|
|
-
|
|
|
-
|
|
|
def login(self, username: str, password: str) -> None:
|
|
|
try:
|
|
|
login_el = self.driver.find_element(
|
|
@@ -57,73 +37,95 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
|
|
|
login_btn.click()
|
|
|
wait.until(ec.presence_of_element_located((By.ID, "site-content")))
|
|
|
except TimeoutException as e:
|
|
|
- raise Exception(f"登录失败 [超时]: {e}")
|
|
|
+ raise Exception(f"登录失败 [{self._adapter_type}] [超时]: {e}")
|
|
|
except NoSuchElementException as e:
|
|
|
- raise Exception(f"登录失败 [找不到元素]: {e}")
|
|
|
+ raise Exception(f"登录失败 [{self._adapter_type}] [找不到元素]: {e}")
|
|
|
|
|
|
|
|
|
def collect(self, keyword: str, store: IDataStore):
    """Run one collection pass for *keyword* over both data types.

    Data type 0 is searched and processed first; after a 2-second pause
    the same is done for data type 1. When the ``batch_save_key`` config
    flag is true, the store is asked to save the collected data.

    Args:
        keyword: Search term handed to ``_search_by_type``.
        store: Data store to use for this run; when truthy it replaces the
            store currently held in ``self._store``.
    """
    if store:
        self._store = store
    self._keyword = keyword

    for position, data_type in enumerate((0, 1)):
        if position:
            # Pause between the two passes, exactly once.
            sleep(2)
        found = self._search_by_type(keyword, data_type)
        self._process_list(found, data_type)

    batch_mode = self.config.get_bool(self.batch_save_key)
    if batch_mode:
        self.store.save_collect_data(True)
|
|
|
|
|
|
- def _search(self, keyword: str) -> list:
|
|
|
def _search_by_type(self, keyword: str, data_type):
    """Open the listing for one data type and run the keyword search on it.

    Loads ``self._url``, clicks the "more" link of the section chosen by
    *data_type*, waits until two windows exist, closes the current window,
    switches to the first remaining window handle and delegates to
    ``_search``.

    Args:
        keyword: Search term forwarded to ``_search``.
        data_type: ``0`` selects the ``z-b-g-g`` section; any other value
            selects the ``z-b-jg-gg`` section.

    Returns:
        Whatever ``self._search(keyword)`` returns.

    Raises:
        Exception: Wraps a ``TimeoutException`` or ``NoSuchElementException``
            raised along the way; the original error is attached as the
            cause.
    """
    try:
        self.driver.get(self._url)
        if data_type == 0:
            label, section_id = "招标公告", "z-b-g-g"
        else:
            label, section_id = "中标结果公告", "z-b-jg-gg"
        self.logger.info(f"开始采集 {label}")
        more_link = self.driver.find_element(
            By.XPATH, f"//div[@id='{section_id}']/h2/a[@class='more']")
        more_link.click()

        # The click opens a second window; wait for it, then drop the
        # current one and continue in the window that remains.
        wait = WebDriverWait(self.driver, 10, 1)
        wait.until(ec.number_of_windows_to_be(2))
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])
        return self._search(keyword)
    except TimeoutException as e:
        raise Exception(f"搜索失败 [{self._adapter_type}] [超时]: {e}") from e
    except NoSuchElementException as e:
        raise Exception(f"搜索失败 [{self._adapter_type}] [找不到元素]: {e}") from e
|
|
|
|
|
|
- def _process_list(self, items: list) -> list:
|
|
|
+
|
|
|
def _search(self, keyword: str) -> list:
    """Submit *keyword* through the search form and return the result links.

    Waits for the ``searchBidProjForm`` form, types the keyword, submits,
    optionally clicks the time-range link named by the ``search_day_key``
    config value (skipped when it equals the default "全部"), logs a page
    count and returns the anchors found under ``ul.as-pager-body``.

    Args:
        keyword: Text typed into the ``fullText`` input.

    Returns:
        The elements matching ``//ul[@class='as-pager-body']/li/a`` on the
        first result page.
    """
    form_items = "//form[@id='searchBidProjForm']/ul/li"
    waiter = WebDriverWait(self.driver, 10, 1)
    waiter.until(ec.presence_of_element_located((By.ID, "searchBidProjForm")))

    keyword_box = self.driver.find_element(
        By.XPATH, f"{form_items}/input[@id='fullText']")
    keyword_box.clear()
    keyword_box.send_keys(keyword)
    submit = self.driver.find_element(By.XPATH, f"{form_items}/button")
    submit.click()
    waiter.until(ec.presence_of_element_located((By.ID, "site-content")))

    default_range = "全部"
    chosen_range = self.config.get(self.search_day_key, default_range)
    self.logger.info(f"搜索关键字: {keyword},搜索条件: {chosen_range}")
    if chosen_range == default_range:
        sleep(1)
    else:
        range_link = self.driver.find_element(By.LINK_TEXT, chosen_range)
        sleep(1)
        range_link.click()
        waiter.until(ec.presence_of_element_located((By.ID, "site-content")))

    # Best-effort page count for the log only; a failure here is logged and
    # does not abort the search.
    try:
        pager_links = self.driver.find_elements(
            By.XPATH, "//form[@id='pagerSubmitForm']/a")
        total = len(pager_links)
        # One anchor is not counted when there are several — presumably the
        # "next" link; verify against the page markup.
        pages = total - 1 if total > 1 else total
        self.logger.info(f"共查询到 {pages} 页")
    except Exception as e:
        self.logger.error(f"搜索失败[尝试查询页数]: {e}")

    return self.driver.find_elements(
        By.XPATH, "//ul[@class='as-pager-body']/li/a")
|
|
|
+
|
|
|
+ def _process_list(self, items: list,data_type) -> list:
|
|
|
if not items:
|
|
|
return []
|
|
|
for item in items:
|
|
|
- self._process_item(item)
|
|
|
+ self._process_item(item,data_type)
|
|
|
sleep(2)
|
|
|
next_items = self._next_page()
|
|
|
- return self._process_list(next_items)
|
|
|
+ return self._process_list(next_items,data_type)
|
|
|
|
|
|
def _next_page(self) -> list:
    """Click the pager's "next" link and return the new page's item links.

    Returns:
        The elements matching ``//ul[@class='as-pager-body']/li/a`` after
        the page change, or ``[]`` when the pager has no "next" link.

    Raises:
        Exception: Wraps a ``NoSuchElementException`` or
            ``TimeoutException`` raised after the "next" link was found;
            the original error is attached as the cause.
    """
    try:
        wait = WebDriverWait(self.driver, 10, 1)
        try:
            btn = self.driver.find_element(
                By.XPATH, "//form[@id='pagerSubmitForm']/a[@class='next']")
        except NoSuchElementException:
            # No "next" link: this is the normal end of pagination.
            self.logger.info(f"翻页结束 [{self._adapter_type}]")
            return []
        btn.click()
        self.logger.info(f"跳转到下页: {self.driver.current_url}")
        wait.until(ec.presence_of_element_located((By.ID, "site-content")))
        # NOTE(review): the opening line of this call is elided by the diff
        # hunk boundary; reconstructed from the identical lookup in
        # _search — confirm against the full file.
        items = self.driver.find_elements(By.XPATH,
                                          "//ul[@class='as-pager-body']/li/a")
        return items
    except NoSuchElementException as e:
        raise Exception(f"翻页失败 [{self._adapter_type}] [找不到元素]: {e}") from e
    except TimeoutException as e:
        # Bind the exception: the message formats {e}, which was previously
        # unbound here and raised UnboundLocalError instead.
        raise Exception(f"翻页结束 [{self._adapter_type}] [超时]: {e}") from e
|
|
|
|
|
|
- def _process_item(self, item):
|
|
|
+ def _process_item(self, item,data_type):
|
|
|
main_handle = self.driver.current_window_handle
|
|
|
close = True
|
|
|
try:
|
|
@@ -153,22 +154,23 @@ class ChinabiddingDataCollectionAdapter(IDataCollectionAdapter):
|
|
|
self.driver.switch_to.window(handle)
|
|
|
break
|
|
|
url = self.driver.current_url
|
|
|
- self.logger.info(f"跳转详情")
|
|
|
- wait.until(ec.presence_of_element_located((By.TAG_NAME, "body")))
|
|
|
- content = self.driver.find_element(By.TAG_NAME, "body").text
|
|
|
+ # self.logger.info(f"跳转详情")
|
|
|
+ print(".",end="")
|
|
|
+ wait.until(ec.presence_of_element_located((By.CLASS_NAME, "content")))
|
|
|
+ content = self.driver.find_element(By.CLASS_NAME, "content").text
|
|
|
if self._check_content(content):
|
|
|
- self._save_db(url, content)
|
|
|
+ self._save_db(url, content, data_type)
|
|
|
else:
|
|
|
- self._save_db(url, content, is_invalid=True)
|
|
|
+ self._save_db(url, content, data_type, is_invalid=True)
|
|
|
|
|
|
except TimeoutException as e:
|
|
|
self.logger.error(
|
|
|
- f"采集发生异常 Timeout: {self.driver.current_url}。Exception: {e}")
|
|
|
+ f"采集发生异常 [{self._adapter_type}] Timeout: {self.driver.current_url}。Exception: {e}")
|
|
|
# raise Exception(f"采集失败 [超时]: {e}")
|
|
|
except NoSuchElementException as e:
|
|
|
self.logger.error(
|
|
|
- f"采集发生异常 NoSuchElement: {self.driver.current_url}。Exception: {e}")
|
|
|
- raise Exception(f"采集失败 [找不到元素]: {e}")
|
|
|
+ f"采集发生异常 [{self._adapter_type}] NoSuchElement: {self.driver.current_url}。Exception: {e}")
|
|
|
+ raise Exception(f"采集失败 [{self._adapter_type}] [找不到元素]: {e}")
|
|
|
finally:
|
|
|
if close:
|
|
|
sleep(2)
|