Scraping.c

#include "Scraping.h"
#include "../Cache/Cache.h"
#include "../Proxy/Proxy.h"
#include "Config.h"
#include <curl/curl.h>
#include <libxml/HTMLparser.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
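
/*
 * Serve a job from the search cache when a fresh entry exists.
 * Returns 1 on a cache hit (the cached HTML is parsed into
 * job->out_results) and 0 when caching is disabled, the key cannot be
 * computed, or no entry is found; a hit skips the network request.
 */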
int check_cache_for_job(ScrapeJob *job) {
  if (get_cache_ttl_search() <= 0)
    return 0;
  char *key = cache_compute_key(job->query, job->page, job->engine->name);
  if (!key)
    return 0;
  char *cached_data = NULL;
  size_t cached_size = 0;
  if (cache_get(key, (time_t)get_cache_ttl_search(), &cached_data,
                &cached_size) == 0 &&
      cached_data && cached_size > 0) {
    xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL,
                                   HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
                                       HTML_PARSE_NOWARNING);
    if (doc) {
      job->results_count = job->engine->parser(
          job->engine->name, doc, job->out_results, job->max_results);
      xmlFreeDoc(doc);
    }
    free(cached_data);
    free(key);
    return 1;
  }
  free(key);
  return 0;
}
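
/*
 * Store the raw HTTP response in the cache (when caching is enabled),
 * then parse it with libxml2's HTML parser and hand the document to the
 * engine-specific parser callback. An empty or unparseable response
 * yields zero results.
 */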
void parse_and_cache_response(ScrapeJob *job) {
  if (job->response.size == 0) {
    job->results_count = 0;
    return;
  }
  char *key = cache_compute_key(job->query, job->page, job->engine->name);
  if (key && get_cache_ttl_search() > 0)
    cache_set(key, job->response.memory, job->response.size);
  free(key);
  xmlDocPtr doc = htmlReadMemory(
      job->response.memory, job->response.size, NULL, NULL,
      HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
  if (doc) {
    job->results_count = job->engine->parser(
        job->engine->name, doc, job->out_results, job->max_results);
    xmlFreeDoc(doc);
  } else {
    job->results_count = 0;
  }
}
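
/*
 * Release per-request resources: the header list stashed in the easy
 * handle via CURLOPT_PRIVATE and the response buffer. The easy handle
 * itself is cleaned up by the caller.
 */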
void cleanup_job_handle(ScrapeJob *job, CURL *handle) {
  struct curl_slist *headers = NULL;
  curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers);
  if (headers)
    curl_slist_free_all(headers);
  free(job->response.memory);
  job->response.memory = NULL;
}
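
/*
 * Handle one completed transfer: parse and cache the body on success,
 * record zero results on failure, then free the request's resources.
 */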
void process_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) {
  if (msg->data.result == CURLE_OK)
    parse_and_cache_response(job);
  else
    job->results_count = 0;
  cleanup_job_handle(job, handle);
}
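
/*
 * Prepare one scrape job: reset any leftover state, short-circuit on a
 * cache hit, otherwise build the search URL, allocate the response
 * buffer, configure an easy handle with per-engine headers, and add it
 * to the multi handle. Returns 0 on success (or cache hit), -1 on error.
 */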
int setup_job(ScrapeJob *job, CURLM *multi_handle) {
  if (job->handle) {
    curl_easy_cleanup(job->handle);
    job->handle = NULL;
  }
  if (job->response.memory) {
    free(job->response.memory);
    job->response.memory = NULL; /* avoid a double free on retry passes */
  }
  if (check_cache_for_job(job)) {
    job->results_count = job->results_count > 0 ? job->results_count : 0;
    return 0;
  }
  char *encoded_query = curl_easy_escape(NULL, job->query, 0);
  if (!encoded_query)
    return -1;
  char *full_url =
      build_search_url(job->engine->base_url, job->engine->page_param,
                       job->engine->page_multiplier, job->engine->page_base,
                       encoded_query, job->page);
  /* curl_easy_escape() output must be released with curl_free() */
  curl_free(encoded_query);
  if (!full_url)
    return -1;
  job->handle = curl_easy_init();
  if (!job->handle) {
    free(full_url);
    return -1;
  }
  job->response.memory = (char *)malloc(INITIAL_BUFFER_SIZE);
  if (!job->response.memory) {
    free(full_url);
    curl_easy_cleanup(job->handle);
    job->handle = NULL;
    return -1;
  }
  job->response.size = 0;
  job->response.capacity = INITIAL_BUFFER_SIZE;
  struct curl_slist *headers =
      build_request_headers(job->engine->host_header, job->engine->referer);
  configure_curl_handle(job->handle, full_url, &job->response, headers);
  curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers);
  free(full_url);
  curl_multi_add_handle(multi_handle, job->handle);
  return 0;
}
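
/*
 * Drain the multi handle's message queue, match each finished easy
 * handle back to its job, process the response, and release the handle.
 */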
int handle_responses(CURLM *multi_handle, ScrapeJob *jobs, int num_jobs) {
  CURLMsg *msg;
  int msgs_left;
  while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) {
    if (msg->msg != CURLMSG_DONE)
      continue;
    CURL *handle = msg->easy_handle;
    for (int i = 0; i < num_jobs; i++) {
      if (jobs[i].handle && jobs[i].handle == handle) {
        process_response(&jobs[i], handle, msg);
        curl_multi_remove_handle(multi_handle, handle);
        curl_easy_cleanup(handle);
        jobs[i].handle = NULL;
        break;
      }
    }
  }
  return 0;
}
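
/*
 * Decide whether another pass is worthwhile: retry only if proxies are
 * configured and at least one job produced neither results nor a
 * response body.
 */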
int should_retry(ScrapeJob *jobs, int num_jobs) {
  if (proxy_count <= 0)
    return 0;
  for (int i = 0; i < num_jobs; i++) {
    if (jobs[i].results_count == 0 && jobs[i].response.size == 0)
      return 1;
  }
  return 0;
}
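
/*
 * Run all jobs concurrently through a curl multi handle, polling with
 * curl_multi_wait() until every transfer completes, then retry failed
 * jobs up to max_proxy_retries times when proxies are available.
 */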
int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
  int retries = 0;
retry:; /* null statement: a label may not directly precede a declaration in C17 */
  CURLM *multi_handle = curl_multi_init();
  if (!multi_handle)
    return -1;
  for (int i = 0; i < num_jobs; i++) {
    if (setup_job(&jobs[i], multi_handle) != 0 && jobs[i].handle) {
      curl_multi_remove_handle(multi_handle, jobs[i].handle);
      curl_easy_cleanup(jobs[i].handle);
      jobs[i].handle = NULL;
    }
  }
  http_delay();
  int still_running = 0;
  curl_multi_perform(multi_handle, &still_running);
  do {
    int numfds = 0;
    CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
    if (mc != CURLM_OK)
      break;
    curl_multi_perform(multi_handle, &still_running);
  } while (still_running);
  handle_responses(multi_handle, jobs, num_jobs);
  curl_multi_cleanup(multi_handle);
  if (retries < max_proxy_retries && should_retry(jobs, num_jobs)) {
    retries++;
    goto retry;
  }
  return 0;
}
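
/*
 * Convenience wrapper that scrapes a single engine for page 1 of a
 * query and returns the number of results parsed.
 */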
int scrape_engine(const SearchEngine *engine, const char *query,
                  SearchResult **out_results, int max_results) {
  ScrapeJob job = {.engine = engine,
                   .query = (char *)query,
                   .out_results = out_results,
                   .max_results = max_results,
                   .results_count = 0,
                   .page = 1};
  scrape_engines_parallel(&job, 1);
  return job.results_count;
}