request.py

from asyncio import Semaphore
from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio

from utils.config import config
from utils.driver.tools import get_soup_driver
from utils.requests.tools import get_soup_requests, close_session
from utils.retry import retry_func
from utils.speed import get_delay_requests
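
# NOTE: retry_func comes from the project-internal utils.retry module, and its
# exact behavior is not shown in this file. The calls below assume a contract
# roughly like this hypothetical sketch: run a zero-argument callable, retry on
# failure, and re-raise after the final attempt ("name" mirrors the keyword
# passed at the call sites, presumably for logging).
def _retry_sketch(func, name="", attempts=3, wait=1):
    import time

    for attempt in range(attempts):
        try:
            return func()
        except Exception:
            if attempt == attempts - 1:
                raise  # surface the last failure to the caller
            time.sleep(wait)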


def get_proxy_list(page_count=1):
    """
    Get the proxy list; page_count is the number of pages fetched from each source
    """
    url_pattern = [
        "https://www.zdaye.com/free/{}/",
        "https://www.kuaidaili.com/free/inha/{}/",
        "https://www.kuaidaili.com/free/intr/{}/",
    ]
    proxy_list = []
    urls = []
    open_driver = config.open_driver
    # Expand every source pattern once per requested page
    for page_index in range(1, page_count + 1):
        for pattern in url_pattern:
            urls.append(pattern.format(page_index))
    pbar = tqdm(total=len(urls), desc="Getting proxy list")

    def get_proxy(url):
        proxys = []
        try:
            if open_driver:
                soup = retry_func(lambda: get_soup_driver(url), name=url)
            else:
                try:
                    soup = retry_func(lambda: get_soup_requests(url), name=url)
                except Exception:
                    # Last resort: one plain request after the retries give up
                    soup = get_soup_requests(url)
            table = soup.find("table")
            trs = table.find_all("tr") if table else []
            for tr in trs[1:]:  # skip the header row
                tds = tr.find_all("td")
                ip = tds[0].get_text().strip()
                port = tds[1].get_text().strip()
                proxys.append(f"http://{ip}:{port}")
        finally:
            pbar.update()
        return proxys

    # Selenium page loads are heavy, so throttle harder when the driver is on
    max_workers = 3 if open_driver else 10
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(get_proxy, url) for url in urls]
        for future in futures:
            proxy_list.extend(future.result())
    if not open_driver:
        close_session()
    pbar.close()
    return proxy_list
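

# A self-contained sketch of the table-parsing step in get_proxy, runnable
# without the project utilities. The two-column ip/port layout is an
# assumption mirrored from the loop above; the HTML literal is made up.
def _parse_proxy_table_demo():
    from bs4 import BeautifulSoup

    html = (
        "<table>"
        "<tr><th>IP</th><th>Port</th></tr>"
        "<tr><td>1.2.3.4</td><td>8080</td></tr>"
        "</table>"
    )
    trs = BeautifulSoup(html, "html.parser").find("table").find_all("tr")
    return [
        f"http://{tr.find_all('td')[0].get_text().strip()}:{tr.find_all('td')[1].get_text().strip()}"
        for tr in trs[1:]  # skip the header row, as get_proxy does
    ]  # -> ["http://1.2.3.4:8080"]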


async def get_proxy_list_with_test(base_url, proxy_list):
    """
    Test each proxy against base_url and return the working ones, fastest first
    """
    if not proxy_list:
        print("No valid proxy found")
        return []
    # Cap the number of concurrent speed tests
    semaphore = Semaphore(100)

    async def get_speed_task(url, timeout, proxy):
        async with semaphore:
            return await get_delay_requests(url, timeout=timeout, proxy=proxy)

    response_times = await tqdm_asyncio.gather(
        *(get_speed_task(base_url, timeout=30, proxy=url) for url in proxy_list),
        desc="Testing proxy speed",
    )
    # Keep only proxies that responded (unreachable ones report infinity)
    proxy_list_with_test = [
        (proxy, response_time)
        for proxy, response_time in zip(proxy_list, response_times)
        if response_time != float("inf")
    ]
    if not proxy_list_with_test:
        print("No valid proxy found")
        return []
    proxy_list_with_test.sort(key=lambda x: x[1])  # fastest first
    proxy_urls = [url for url, _ in proxy_list_with_test]
    print(f"Valid proxy found: {len(proxy_urls)}")
    return proxy_urls
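

# Usage sketch (assumptions: run from the project root so the "utils" imports
# resolve, and "https://example.com" stands in for the real target URL).
if __name__ == "__main__":
    import asyncio

    proxies = get_proxy_list(page_count=1)
    fastest = asyncio.run(get_proxy_list_with_test("https://example.com", proxies))
    print(fastest[:5])  # the five fastest working proxies, if any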