/* ScrapingHttp.c - libcurl helpers for scraping requests: response
 * buffering, User-Agent rotation, URL and header construction, and
 * request pacing. */
  1. #include "../Proxy/Proxy.h"
  2. #include "Config.h"
  3. #include "Scraping.h"
  4. #include <curl/curl.h>
  5. #include <stdio.h>
  6. #include <stdlib.h>
  7. #include <string.h>
  8. #include <unistd.h>
  /* Bounds of the randomized pause taken by http_delay(): a fixed minimum
   * plus a random amount strictly below the range, both in microseconds. */
  #define HTTP_DELAY_MIN_US 100000
  #define HTTP_DELAY_RANGE_US 100000

  /* Pool of browser User-Agent strings that get_random_user_agent() draws
   * from. Each entry is one string; the literals are only split for width. */
  static const char *USER_AGENTS[] = {
      /* Chrome 120 on Windows */
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
      "AppleWebKit/537.36 (KHTML, like Gecko) "
      "Chrome/120.0.0.0 Safari/537.36",
      /* Chrome 119 on macOS */
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
      "AppleWebKit/537.36 (KHTML, like Gecko) "
      "Chrome/119.0.0.0 Safari/537.36",
      /* Chrome 120 on Linux */
      "Mozilla/5.0 (X11; Linux x86_64) "
      "AppleWebKit/537.36 (KHTML, like Gecko) "
      "Chrome/120.0.0.0 Safari/537.36",
      /* Firefox 121 on Windows */
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) "
      "Gecko/20100101 Firefox/121.0",
      /* Safari 17.2 on macOS */
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
      "AppleWebKit/605.1.15 (KHTML, like Gecko) "
      "Version/17.2 Safari/605.1.15",
  };

  /* Number of entries in USER_AGENTS, computed at compile time. */
  #define USER_AGENT_COUNT (sizeof(USER_AGENTS) / sizeof(USER_AGENTS[0]))
  23. size_t write_memory_callback(void *contents, size_t size, size_t nmemb,
  24. void *userp) {
  25. size_t realsize = size * nmemb;
  26. MemoryBuffer *mem = (MemoryBuffer *)userp;
  27. if (mem->size + realsize + 1 > mem->capacity) {
  28. size_t new_cap =
  29. mem->capacity == 0 ? INITIAL_BUFFER_SIZE : mem->capacity * 2;
  30. while (new_cap < mem->size + realsize + 1)
  31. new_cap *= 2;
  32. char *ptr = (char *)realloc(mem->memory, new_cap);
  33. if (!ptr)
  34. return 0;
  35. mem->memory = ptr;
  36. mem->capacity = new_cap;
  37. }
  38. memcpy(&(mem->memory[mem->size]), contents, realsize);
  39. mem->size += realsize;
  40. mem->memory[mem->size] = 0;
  41. return realsize;
  42. }
  43. const char *get_random_user_agent(void) {
  44. return USER_AGENTS[rand() % USER_AGENT_COUNT];
  45. }
  46. void configure_curl_handle(CURL *curl, const char *full_url,
  47. MemoryBuffer *chunk, struct curl_slist *headers) {
  48. curl_easy_setopt(curl, CURLOPT_URL, full_url);
  49. curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
  50. curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_memory_callback);
  51. curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk);
  52. curl_easy_setopt(curl, CURLOPT_USERAGENT, get_random_user_agent());
  53. curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
  54. curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
  55. curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, CURL_DNS_TIMEOUT_SECS);
  56. curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
  57. curl_easy_setopt(curl, CURLOPT_TIMEOUT, CURL_TIMEOUT_SECS);
  58. curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
  59. curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
  60. apply_proxy_settings(curl);
  61. }
  /**
   * Builds a paginated search URL of the form
   * "<base_url><encoded_query>&<page_param>=<page_value>".
   *
   * @param base_url         URL prefix that the query is appended to directly.
   * @param page_param       Name of the pagination query parameter.
   * @param page_multiplier  Step added to the parameter value per page.
   * @param page_base        Parameter value that corresponds to the first page.
   * @param encoded_query    Query text, already URL-encoded by the caller.
   * @param page             1-based page number; values below 1 are treated
   *                         as the first page.
   * @return A newly allocated, NUL-terminated URL that the caller must
   *         free(), or NULL if a string argument is NULL, formatting fails,
   *         or memory cannot be allocated.
   *
   * The buffer is sized from the formatted length, so long queries are never
   * truncated.
   */
  char *build_search_url(const char *base_url, const char *page_param,
                         int page_multiplier, int page_base,
                         const char *encoded_query, int page) {
    if (!base_url || !page_param || !encoded_query)
      return NULL;

    /* Zero-based page index; out-of-range pages clamp to the first page. */
    int page_index = page < 1 ? 0 : page - 1;
    int page_value = page_index * page_multiplier + page_base;

    /* First pass only measures the formatted length. */
    int needed = snprintf(NULL, 0, "%s%s&%s=%d", base_url, encoded_query,
                          page_param, page_value);
    if (needed < 0)
      return NULL;

    size_t url_size = (size_t)needed + 1;
    char *url = malloc(url_size);
    if (!url)
      return NULL;

    snprintf(url, url_size, "%s%s&%s=%d", base_url, encoded_query, page_param,
             page_value);
    return url;
  }
  73. struct curl_slist *build_request_headers(const char *host_header,
  74. const char *referer) {
  75. struct curl_slist *headers = NULL;
  76. char host_buf[BUFFER_SIZE_MEDIUM], ref_buf[BUFFER_SIZE_MEDIUM];
  77. snprintf(host_buf, sizeof(host_buf), "Host: %s", host_header);
  78. snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", referer);
  79. headers = curl_slist_append(headers, host_buf);
  80. headers = curl_slist_append(headers, ref_buf);
  81. headers = curl_slist_append(
  82. headers,
  83. "Accept: "
  84. "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
  85. headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5");
  86. headers = curl_slist_append(headers, "DNT: 1");
  87. return headers;
  88. }
  89. void http_delay(void) {
  90. usleep(HTTP_DELAY_MIN_US + (rand() % HTTP_DELAY_RANGE_US));
  91. }