request.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. from concurrent.futures import ThreadPoolExecutor
  2. from time import time
  3. from tqdm.asyncio import tqdm_asyncio
  4. import utils.constants as constants
  5. from utils.channel import (
  6. format_channel_name,
  7. get_results_from_soup,
  8. get_results_from_soup_requests,
  9. )
  10. from utils.config import config
  11. from utils.driver.setup import setup_driver
  12. from utils.driver.tools import search_submit
  13. from utils.requests.tools import get_soup_requests, close_session
  14. from utils.retry import (
  15. retry_func,
  16. find_clickable_element_with_retry,
  17. )
  18. from utils.tools import (
  19. get_pbar_remaining,
  20. get_soup
  21. )
  22. if config.open_driver:
  23. try:
  24. from selenium.webdriver.common.by import By
  25. except:
  26. pass
  def _quit_driver(driver):
      """Call ``driver.close()`` and then ``driver.quit()``.

      ``quit()`` is still called when ``close()`` raises, so a failure in the
      first call does not skip the second. Any exception is re-raised.
      """
      try:
          driver.close()
      finally:
          driver.quit()


  async def get_channels_by_online_search(names, callback=None):
      """
      Get the channels by online search.

      Every name in *names* is looked up on ``constants.foodie_url`` by a pool
      of three worker threads, either through a browser driver (when
      ``config.open_driver`` is truthy) or through ``get_soup_requests``. Up to
      ``config.online_search_page_num`` result pages are read per name.

      NOTE(review): the function is declared ``async`` but never awaits; it
      blocks until every worker has finished.

      :param names: sized iterable of channel names to search for.
      :param callback: optional callable invoked as ``callback(message, percent)``
          once before the search starts and once after each name is processed
          (the latter from a worker thread).
      :return: dict mapping ``format_channel_name(name)`` to a list of
          ``{"url", "date", "resolution"}`` dicts. Empty when no search URL is
          configured.
      """
      channels = {}
      page_url = constants.foodie_url
      if not page_url:
          return channels
      open_driver = config.open_driver
      page_num = config.online_search_page_num
      start_time = time()

      def search_channel(name, info_list):
          """Search one channel name and append every hit to *info_list*.

          *info_list* is filled in place (instead of being returned) so that
          results gathered before an exception are kept by the caller.
          """
          driver = None
          page_soup = None
          try:
              if open_driver:
                  driver = setup_driver()
                  try:
                      retry_func(
                          lambda: driver.get(page_url), name=f"online search:{name}"
                      )
                  except Exception:
                      # retry_func raised: replace the driver and load the page
                      # once more without the retry wrapper.
                      _quit_driver(driver)
                      driver = setup_driver()
                      driver.get(page_url)
                  search_submit(driver, name)
              else:
                  request_url = f"{page_url}?s={name}"
                  try:
                      page_soup = retry_func(
                          lambda: get_soup_requests(request_url),
                          name=f"online search:{name}",
                      )
                  except Exception:
                      # One last direct attempt after retry_func raised.
                      page_soup = get_soup_requests(request_url)
                  if not page_soup:
                      print(f"{name}:Request fail.")
                      return
              retry_limit = 3
              for page in range(1, page_num + 1):
                  # In requests mode page 1 was already fetched above, so it
                  # only gets a single pass through the retry loop.
                  retries = 2 if (not open_driver and page == 1) else 0
                  while retries < retry_limit:
                      try:
                          if page > 1:
                              if open_driver:
                                  page_link = find_clickable_element_with_retry(
                                      driver,
                                      (
                                          By.XPATH,
                                          f'//a[contains(@href, "={page}") and contains(@href, "{name}")]',
                                      ),
                                  )
                                  if not page_link:
                                      # No link to this page: give up on it.
                                      break
                                  driver.execute_script(
                                      "arguments[0].click();", page_link
                                  )
                              else:
                                  request_url = f"{page_url}?s={name}&page={page}"
                                  page_soup = retry_func(
                                      lambda: get_soup_requests(request_url),
                                      name=f"online search:{name}, page:{page}",
                                  )
                          soup = (
                              get_soup(driver.page_source) if open_driver else page_soup
                          )
                          if not soup:
                              print(
                                  f"{name}:No page soup found, refreshing page and retrying..."
                              )
                              if open_driver:
                                  driver.refresh()
                              retries += 1
                              continue
                          if "About 0 results" in soup.text:
                              retries += 1
                              continue
                          results = (
                              get_results_from_soup(soup, name)
                              if open_driver
                              else get_results_from_soup_requests(soup, name)
                          )
                          result_count = len(results)
                          print(name, "page:", page, "results num:", result_count)
                          if result_count == 0:
                              print(
                                  f"{name}:No results found, refreshing page and retrying..."
                              )
                              if open_driver:
                                  driver.refresh()
                              retries += 1
                              continue
                          if result_count <= 3:
                              # NOTE(review): nesting reconstructed from a
                              # flattened source; confirm that the retry below
                              # applies to every "<= 3 results" case and not
                              # only when a next-page link exists.
                              if open_driver:
                                  next_page_link = find_clickable_element_with_retry(
                                      driver,
                                      (
                                          By.XPATH,
                                          f'//a[contains(@href, "={page + 1}") and contains(@href, "{name}")]',
                                      ),
                                      retries=1,
                                  )
                                  if next_page_link:
                                      # Few results although a next page is
                                      # linked: redo the search in a new driver.
                                      _quit_driver(driver)
                                      driver = setup_driver()
                                      search_submit(driver, name)
                              retries += 1
                              continue
                          for result in results:
                              url = result["url"]
                              if url:
                                  info_list.append({
                                      "url": url,
                                      "date": result["date"],
                                      "resolution": result["resolution"],
                                  })
                          break
                      except Exception as e:
                          print(f"{name}:Error on page {page}: {e}")
                          break
                  if retries == retry_limit:
                      print(f"{name}:Reached retry limit, moving to next page")
          finally:
              if driver:
                  _quit_driver(driver)

      def process_channel_by_online_search(name):
          """Worker entry point: search *name*, report progress, return the result.

          Always returns ``{"name": ..., "data": [...]}``. The return statement
          lives after the ``try`` statement (not inside ``finally``) so that it
          cannot swallow non-``Exception`` errors such as ``KeyboardInterrupt``.
          """
          info_list = []
          try:
              search_channel(name, info_list)
          except Exception as e:
              # Also covers a failure while shutting the driver down, so one
              # misbehaving browser does not abort the remaining channels.
              print(f"{name}:Error on search: {e}")
          finally:
              pbar.update()
              if callback:
                  # Message (Chinese): "online search in progress, N channels
                  # remaining, estimated time left: ...".
                  callback(
                      f"正在进行线上查询, 剩余{names_len - pbar.n}个频道待查询, 预计剩余时间: {get_pbar_remaining(n=pbar.n, total=pbar.total, start_time=start_time)}",
                      int((pbar.n / names_len) * 100),
                  )
          return {"name": format_channel_name(name), "data": info_list}

      names_len = len(names)
      pbar = tqdm_asyncio(total=names_len, desc="Online search")
      if callback:
          # Message (Chinese): "online search in progress, N channels in total".
          callback(f"正在进行线上查询, 共{names_len}个频道", 0)
      try:
          with ThreadPoolExecutor(max_workers=3) as executor:
              futures = [
                  executor.submit(process_channel_by_online_search, name)
                  for name in names
              ]
              for future in futures:
                  result = future.result()
                  name = result.get("name")
                  data = result.get("data", [])
                  if name:
                      channels[name] = data
      finally:
          # Release the shared session and the progress bar even when a worker
          # raised.
          if not open_driver:
              close_session()
          pbar.close()
      return channels