#include "Scraping.h" #include "../Cache/Cache.h" #include "../Proxy/Proxy.h" #include "Config.h" #include #include #include #include #include int check_cache_for_job(ScrapeJob *job) { if (get_cache_ttl_search() <= 0) return 0; char *key = cache_compute_key(job->query, job->page, job->engine->name); if (!key) return 0; char *cached_data = NULL; size_t cached_size = 0; if (cache_get(key, (time_t)get_cache_ttl_search(), &cached_data, &cached_size) == 0 && cached_data && cached_size > 0) { xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); if (doc) { job->results_count = job->engine->parser( job->engine->name, doc, job->out_results, job->max_results); xmlFreeDoc(doc); } free(cached_data); free(key); return 1; } free(key); return 0; } void parse_and_cache_response(ScrapeJob *job) { if (job->response.size == 0) { job->results_count = 0; return; } char *key = cache_compute_key(job->query, job->page, job->engine->name); if (key && get_cache_ttl_search() > 0) cache_set(key, job->response.memory, job->response.size); free(key); xmlDocPtr doc = htmlReadMemory( job->response.memory, job->response.size, NULL, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); if (doc) { job->results_count = job->engine->parser( job->engine->name, doc, job->out_results, job->max_results); xmlFreeDoc(doc); } else { job->results_count = 0; } } void cleanup_job_handle(ScrapeJob *job, CURL *handle) { struct curl_slist *headers = NULL; curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers); if (headers) curl_slist_free_all(headers); free(job->response.memory); job->response.memory = NULL; } void process_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) { if (msg->data.result == CURLE_OK) parse_and_cache_response(job); else job->results_count = 0; cleanup_job_handle(job, handle); } int setup_job(ScrapeJob *job, CURLM *multi_handle) { if (job->handle) curl_easy_cleanup(job->handle); if (job->response.memory) free(job->response.memory); if (check_cache_for_job(job)) { job->results_count = job->results_count > 0 ? 
/* Prepare one job: try the cache first; otherwise build the request URL,
 * allocate the response buffer, and add an easy handle to the multi handle.
 * Returns 0 on success (including a cache hit), -1 on failure. */
int setup_job(ScrapeJob *job, CURLM *multi_handle) {
    /* Release state left over from a previous attempt (proxy-retry path). */
    if (job->handle)
        curl_easy_cleanup(job->handle);
    if (job->response.memory) {
        free(job->response.memory);
        job->response.memory = NULL;
        job->response.size = 0;
    }

    if (check_cache_for_job(job)) {
        /* The parser may return a negative error code; clamp it to zero. */
        job->results_count = job->results_count > 0 ? job->results_count : 0;
        return 0;
    }

    char *encoded_query = curl_easy_escape(NULL, job->query, 0);
    if (!encoded_query)
        return -1;
    char *full_url = build_search_url(
        job->engine->base_url, job->engine->page_param,
        job->engine->page_multiplier, job->engine->page_base,
        encoded_query, job->page);
    curl_free(encoded_query); /* curl_easy_escape() results need curl_free() */
    if (!full_url)
        return -1;

    job->handle = curl_easy_init();
    if (!job->handle) {
        free(full_url);
        return -1;
    }

    job->response.memory = (char *)malloc(INITIAL_BUFFER_SIZE);
    if (!job->response.memory) {
        curl_easy_cleanup(job->handle);
        job->handle = NULL;
        free(full_url);
        return -1;
    }
    job->response.size = 0;
    job->response.capacity = INITIAL_BUFFER_SIZE;

    struct curl_slist *headers =
        build_request_headers(job->engine->host_header, job->engine->referer);
    configure_curl_handle(job->handle, full_url, &job->response, headers);
    /* Stash the header list on the handle so cleanup_job_handle() can
     * retrieve and free it once the transfer completes. */
    curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers);
    free(full_url);
    curl_multi_add_handle(multi_handle, job->handle);
    return 0;
}

/* Drain completed transfers from the multi handle and match each finished
 * easy handle back to its job. */
int handle_responses(CURLM *multi_handle, ScrapeJob *jobs, int num_jobs) {
    CURLMsg *msg;
    int msgs_left;
    while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) {
        if (msg->msg != CURLMSG_DONE)
            continue;
        CURL *handle = msg->easy_handle;
        for (int i = 0; i < num_jobs; i++) {
            if (jobs[i].handle && jobs[i].handle == handle) {
                process_response(&jobs[i], handle, msg);
                curl_multi_remove_handle(multi_handle, handle);
                curl_easy_cleanup(handle);
                jobs[i].handle = NULL;
                break;
            }
        }
    }
    return 0;
}

/* A retry only makes sense when proxies are configured and at least one job
 * produced neither results nor a response body, i.e. the request itself
 * failed (typically through a dead proxy). */
int should_retry(ScrapeJob *jobs, int num_jobs) {
    if (proxy_count <= 0)
        return 0;
    for (int i = 0; i < num_jobs; i++) {
        if (jobs[i].results_count == 0 && jobs[i].response.size == 0)
            return 1;
    }
    return 0;
}

/* Run all jobs concurrently on one libcurl multi handle, retrying up to
 * max_proxy_retries times while should_retry() reports failed jobs. */
int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
    int retries = 0;

retry:; /* the empty statement keeps the label legal before a declaration */
    CURLM *multi_handle = curl_multi_init();
    if (!multi_handle)
        return -1;

    for (int i = 0; i < num_jobs; i++) {
        if (setup_job(&jobs[i], multi_handle) != 0 && jobs[i].handle) {
            curl_multi_remove_handle(multi_handle, jobs[i].handle);
            curl_easy_cleanup(jobs[i].handle);
            jobs[i].handle = NULL;
        }
    }

    http_delay();

    int still_running = 0;
    curl_multi_perform(multi_handle, &still_running);
    do {
        int numfds = 0;
        CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
        if (mc != CURLM_OK)
            break;
        curl_multi_perform(multi_handle, &still_running);
    } while (still_running);

    handle_responses(multi_handle, jobs, num_jobs);
    curl_multi_cleanup(multi_handle);

    if (retries < max_proxy_retries && should_retry(jobs, num_jobs)) {
        retries++;
        goto retry;
    }
    return 0;
}

/* Convenience wrapper: scrape a single engine for page 1 of a query. */
int scrape_engine(const SearchEngine *engine, const char *query,
                  SearchResult **out_results, int max_results) {
    ScrapeJob job = {.engine = engine,
                     .query = (char *)query,
                     .out_results = out_results,
                     .max_results = max_results,
                     .results_count = 0,
                     .page = 1};
    scrape_engines_parallel(&job, 1);
    return job.results_count;
}
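
#ifdef SCRAPING_DEMO
#include <stdio.h>
/*
 * Minimal usage sketch, not part of the original module (compile with
 * -DSCRAPING_DEMO). It assumes the caller owns libcurl's global
 * init/cleanup, and `demo_engine` stands in for a real entry from the
 * engine table; both the extern below and the ownership of the returned
 * SearchResult array are hypothetical placeholders -- the real contract
 * is defined by Scraping.h and the engine's parser.
 */
int main(void) {
    curl_global_init(CURL_GLOBAL_DEFAULT);

    extern const SearchEngine demo_engine; /* hypothetical engine entry */
    SearchResult *results = NULL;
    int n = scrape_engine(&demo_engine, "libcurl multi interface",
                          &results, 10);
    printf("scrape_engine() returned %d results\n", n);

    curl_global_cleanup();
    return 0;
}
#endif /* SCRAPING_DEMO */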