- #include "Scraping.h"
- #include "../Cache/Cache.h"
- #include "../Proxy/Proxy.h"
- #include "Config.h"
- #include <curl/curl.h>
- #include <libxml/HTMLparser.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <time.h>
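// Try to satisfy a job from the cache. Returns 1 on a cache hit (with
// job->results_count populated from the cached HTML), 0 otherwise.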
int check_cache_for_job(ScrapeJob *job) {
  if (get_cache_ttl_search() <= 0)
    return 0;
  char *key = cache_compute_key(job->query, job->page, job->engine->name);
  if (!key)
    return 0;
  char *cached_data = NULL;
  size_t cached_size = 0;
  if (cache_get(key, (time_t)get_cache_ttl_search(), &cached_data,
                &cached_size) == 0 &&
      cached_data && cached_size > 0) {
    xmlDocPtr doc = htmlReadMemory(cached_data, (int)cached_size, NULL, NULL,
                                   HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
                                       HTML_PARSE_NOWARNING);
    if (doc) {
      job->results_count = job->engine->parser(
          job->engine->name, doc, job->out_results, job->max_results);
      xmlFreeDoc(doc);
    } else {
      job->results_count = 0; // cached HTML failed to parse
    }
    free(cached_data);
    free(key);
    return 1;
  }
  free(key);
  return 0;
}
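// Store the raw HTTP response in the cache (when caching is enabled) and
// parse it into job->out_results via the engine-specific parser.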
void parse_and_cache_response(ScrapeJob *job) {
  if (job->response.size == 0) {
    job->results_count = 0;
    return;
  }
  char *key = cache_compute_key(job->query, job->page, job->engine->name);
  if (key && get_cache_ttl_search() > 0)
    cache_set(key, job->response.memory, job->response.size);
  free(key);
  xmlDocPtr doc = htmlReadMemory(
      job->response.memory, (int)job->response.size, NULL, NULL,
      HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
  if (doc) {
    job->results_count = job->engine->parser(
        job->engine->name, doc, job->out_results, job->max_results);
    xmlFreeDoc(doc);
  } else {
    job->results_count = 0;
  }
}
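// Release per-request resources: the header list stashed in the handle via
// CURLOPT_PRIVATE and the response buffer.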
void cleanup_job_handle(ScrapeJob *job, CURL *handle) {
  struct curl_slist *headers = NULL;
  curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers);
  if (headers)
    curl_slist_free_all(headers);
  free(job->response.memory);
  job->response.memory = NULL;
}
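// Completion handler for one finished transfer: parse on success, zero the
// result count on failure, then free the request's resources.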
void process_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) {
  if (msg->data.result == CURLE_OK)
    parse_and_cache_response(job);
  else
    job->results_count = 0;
  cleanup_job_handle(job, handle);
}
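// Prepare one job: serve it from cache if possible, otherwise build the
// search URL, allocate a response buffer, and add an easy handle to the
// multi handle. Returns 0 on success (including cache hits), -1 on error.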
int setup_job(ScrapeJob *job, CURLM *multi_handle) {
  if (job->handle) {
    curl_easy_cleanup(job->handle);
    job->handle = NULL; // avoid a dangling handle on the cache-hit path
  }
  if (job->response.memory) {
    free(job->response.memory);
    job->response.memory = NULL;
  }
  if (check_cache_for_job(job)) {
    if (job->results_count < 0)
      job->results_count = 0; // clamp a failed parse to zero results
    return 0;
  }
  char *encoded_query = curl_easy_escape(NULL, job->query, 0);
  if (!encoded_query)
    return -1;
  char *full_url =
      build_search_url(job->engine->base_url, job->engine->page_param,
                       job->engine->page_multiplier, job->engine->page_base,
                       encoded_query, job->page);
  curl_free(encoded_query); // curl_easy_escape output must go to curl_free
  if (!full_url)
    return -1;
  job->handle = curl_easy_init();
  if (!job->handle) {
    free(full_url);
    return -1;
  }
  job->response.memory = (char *)malloc(INITIAL_BUFFER_SIZE);
  if (!job->response.memory) {
    curl_easy_cleanup(job->handle);
    job->handle = NULL;
    free(full_url);
    return -1;
  }
  job->response.size = 0;
  job->response.capacity = INITIAL_BUFFER_SIZE;
  struct curl_slist *headers =
      build_request_headers(job->engine->host_header, job->engine->referer);
  configure_curl_handle(job->handle, full_url, &job->response, headers);
  // Stash the header list so cleanup_job_handle can retrieve and free it.
  curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers);
  free(full_url);
  curl_multi_add_handle(multi_handle, job->handle);
  return 0;
}
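// Drain completed transfers from the multi handle and dispatch each one to
// the job that owns it.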
int handle_responses(CURLM *multi_handle, ScrapeJob *jobs, int num_jobs) {
  CURLMsg *msg;
  int msgs_left;
  while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) {
    if (msg->msg != CURLMSG_DONE)
      continue;
    CURL *handle = msg->easy_handle;
    for (int i = 0; i < num_jobs; i++) {
      if (jobs[i].handle && jobs[i].handle == handle) {
        process_response(&jobs[i], handle, msg);
        curl_multi_remove_handle(multi_handle, handle);
        curl_easy_cleanup(handle);
        jobs[i].handle = NULL;
        break;
      }
    }
  }
  return 0;
}
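// A retry is worthwhile only when proxies are configured and at least one
// job finished with neither results nor response data.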
int should_retry(ScrapeJob *jobs, int num_jobs) {
  if (proxy_count <= 0)
    return 0;
  for (int i = 0; i < num_jobs; i++) {
    if (jobs[i].results_count == 0 && jobs[i].response.size == 0)
      return 1;
  }
  return 0;
}
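// Run all jobs concurrently on one curl multi handle, then retry empty jobs
// (up to max_proxy_retries) when proxies are available.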
int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
  int retries = 0;
retry:; // empty statement: a label may not directly precede a declaration
  CURLM *multi_handle = curl_multi_init();
  if (!multi_handle)
    return -1;
  for (int i = 0; i < num_jobs; i++) {
    if (setup_job(&jobs[i], multi_handle) != 0 && jobs[i].handle) {
      curl_multi_remove_handle(multi_handle, jobs[i].handle);
      curl_easy_cleanup(jobs[i].handle);
      jobs[i].handle = NULL;
    }
  }
  http_delay();
  int still_running = 0;
  curl_multi_perform(multi_handle, &still_running);
  do {
    int numfds = 0;
    CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
    if (mc != CURLM_OK)
      break;
    curl_multi_perform(multi_handle, &still_running);
  } while (still_running);
  handle_responses(multi_handle, jobs, num_jobs);
  curl_multi_cleanup(multi_handle);
  if (retries < max_proxy_retries && should_retry(jobs, num_jobs)) {
    retries++;
    goto retry;
  }
  return 0;
}
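// Convenience wrapper: scrape a single engine for page 1 of a query and
// return the number of results parsed.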
int scrape_engine(const SearchEngine *engine, const char *query,
                  SearchResult **out_results, int max_results) {
  ScrapeJob job = {.engine = engine,
                   .query = (char *)query,
                   .out_results = out_results,
                   .max_results = max_results,
                   .results_count = 0,
                   .page = 1};
  scrape_engines_parallel(&job, 1);
  return job.results_count;
}