|
@@ -1,395 +1,20 @@
|
|
|
#include "Scraping.h"
|
|
#include "Scraping.h"
|
|
|
#include "../Cache/Cache.h"
|
|
#include "../Cache/Cache.h"
|
|
|
#include "../Proxy/Proxy.h"
|
|
#include "../Proxy/Proxy.h"
|
|
|
-#include "../Utility/Unescape.h"
|
|
|
|
|
-#include "../Utility/XmlHelper.h"
|
|
|
|
|
#include "Config.h"
|
|
#include "Config.h"
|
|
|
#include <curl/curl.h>
|
|
#include <curl/curl.h>
|
|
|
#include <libxml/HTMLparser.h>
|
|
#include <libxml/HTMLparser.h>
|
|
|
-#include <libxml/xpath.h>
|
|
|
|
|
#include <stdio.h>
|
|
#include <stdio.h>
|
|
|
#include <stdlib.h>
|
|
#include <stdlib.h>
|
|
|
-#include <string.h>
|
|
|
|
|
#include <time.h>
|
|
#include <time.h>
|
|
|
-#include <unistd.h>
|
|
|
|
|
-
|
|
|
|
|
-static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb,
|
|
|
|
|
- void *userp) {
|
|
|
|
|
- size_t realsize = size * nmemb;
|
|
|
|
|
- MemoryBuffer *mem = (MemoryBuffer *)userp;
|
|
|
|
|
-
|
|
|
|
|
- if (mem->size + realsize + 1 > mem->capacity) {
|
|
|
|
|
- size_t new_cap =
|
|
|
|
|
- mem->capacity == 0 ? INITIAL_BUFFER_SIZE : mem->capacity * 2;
|
|
|
|
|
- while (new_cap < mem->size + realsize + 1)
|
|
|
|
|
- new_cap *= 2;
|
|
|
|
|
-
|
|
|
|
|
- char *ptr = (char *)realloc(mem->memory, new_cap);
|
|
|
|
|
- if (!ptr) {
|
|
|
|
|
- return 0;
|
|
|
|
|
- }
|
|
|
|
|
- mem->memory = ptr;
|
|
|
|
|
- mem->capacity = new_cap;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- memcpy(&(mem->memory[mem->size]), contents, realsize);
|
|
|
|
|
- mem->size += realsize;
|
|
|
|
|
- mem->memory[mem->size] = 0;
|
|
|
|
|
-
|
|
|
|
|
- return realsize;
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
/*
 * Picks one of the built-in browser User-Agent strings at random.
 *
 * Selection uses rand(), so the sequence depends on whatever seed the
 * program installed with srand(). The returned pointer refers to a string
 * literal and must not be freed or modified.
 */
static const char *get_random_user_agent(void) {
  static const char *const agents[] = {
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
      "like Gecko) Chrome/120.0.0.0 Safari/537.36",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
      /* The original entry carried a stray backtick after the Chrome
       * version, which produced a malformed User-Agent header. */
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like "
      "Gecko) "
      "Chrome/120.0.0.0 Safari/537.36",
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 "
      "Firefox/121.0",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
      "(KHTML, like Gecko) Version/17.2 Safari/605.1.15",
  };

  /* Derive the count from the table so adding an entry cannot desync it. */
  const size_t agent_count = sizeof agents / sizeof agents[0];

  return agents[(size_t)rand() % agent_count];
}
|
|
|
|
|
-
|
|
|
|
|
-static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
|
|
|
|
|
- SearchResult **out_results, int max_results) {
|
|
|
|
|
- (void)engine_name;
|
|
|
|
|
- int found_count = 0;
|
|
|
|
|
-
|
|
|
|
|
- xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
|
|
|
|
|
- if (!xpathCtx) {
|
|
|
|
|
- return 0;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- xmlXPathObjectPtr xpathObj = xml_xpath_eval(
|
|
|
|
|
- xpathCtx, "//tr[not(contains(@class, "
|
|
|
|
|
- "'result-sponsored'))]//a[@class='result-link']");
|
|
|
|
|
-
|
|
|
|
|
- if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
|
|
|
|
|
- if (xpathObj)
|
|
|
|
|
- xmlXPathFreeObject(xpathObj);
|
|
|
|
|
- xmlXPathFreeContext(xpathCtx);
|
|
|
|
|
- return 0;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- int num_links = xpathObj->nodesetval->nodeNr;
|
|
|
|
|
- *out_results = xml_result_alloc(num_links, max_results);
|
|
|
|
|
- if (!*out_results) {
|
|
|
|
|
- xmlXPathFreeObject(xpathObj);
|
|
|
|
|
- xmlXPathFreeContext(xpathCtx);
|
|
|
|
|
- return 0;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- for (int i = 0; i < num_links && found_count < max_results; i++) {
|
|
|
|
|
- xmlNodePtr linkNode = xpathObj->nodesetval->nodeTab[i];
|
|
|
|
|
- char *title = xml_node_content(linkNode);
|
|
|
|
|
- char *url = (char *)xmlGetProp(linkNode, (xmlChar *)"href");
|
|
|
|
|
- char *snippet_text = NULL;
|
|
|
|
|
-
|
|
|
|
|
- xmlNodePtr current = linkNode->parent;
|
|
|
|
|
- while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0)
|
|
|
|
|
- current = current->parent;
|
|
|
|
|
-
|
|
|
|
|
- if (current && current->next) {
|
|
|
|
|
- xmlNodePtr snippetRow = current->next;
|
|
|
|
|
- while (snippetRow &&
|
|
|
|
|
- xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0)
|
|
|
|
|
- snippetRow = snippetRow->next;
|
|
|
|
|
- if (snippetRow) {
|
|
|
|
|
- xpathCtx->node = snippetRow;
|
|
|
|
|
- xmlXPathObjectPtr sObj =
|
|
|
|
|
- xml_xpath_eval(xpathCtx, ".//td[@class='result-snippet']");
|
|
|
|
|
- if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) {
|
|
|
|
|
- snippet_text = xml_node_content(sObj->nodesetval->nodeTab[0]);
|
|
|
|
|
- }
|
|
|
|
|
- if (sObj)
|
|
|
|
|
- xmlXPathFreeObject(sObj);
|
|
|
|
|
- xpathCtx->node = NULL;
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- (*out_results)[found_count].url = unescape_search_url(url);
|
|
|
|
|
- (*out_results)[found_count].title = strdup(title ? title : "No Title");
|
|
|
|
|
- (*out_results)[found_count].snippet =
|
|
|
|
|
- strdup(snippet_text ? snippet_text : "");
|
|
|
|
|
- found_count++;
|
|
|
|
|
-
|
|
|
|
|
- if (title)
|
|
|
|
|
- xmlFree(title);
|
|
|
|
|
- if (url)
|
|
|
|
|
- xmlFree(url);
|
|
|
|
|
- if (snippet_text)
|
|
|
|
|
- xmlFree(snippet_text);
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- xmlXPathFreeObject(xpathObj);
|
|
|
|
|
- xmlXPathFreeContext(xpathCtx);
|
|
|
|
|
- return found_count;
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-static int parse_startpage(const char *engine_name, xmlDocPtr doc,
|
|
|
|
|
- SearchResult **out_results, int max_results) {
|
|
|
|
|
- (void)engine_name;
|
|
|
|
|
- int found_count = 0;
|
|
|
|
|
-
|
|
|
|
|
- xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
|
|
|
|
|
- if (!xpathCtx) {
|
|
|
|
|
- return 0;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- xmlXPathObjectPtr xpathObj =
|
|
|
|
|
- xml_xpath_eval(xpathCtx, "//div[contains(@class, 'result')]");
|
|
|
|
|
-
|
|
|
|
|
- if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
|
|
|
|
|
- if (xpathObj)
|
|
|
|
|
- xmlXPathFreeObject(xpathObj);
|
|
|
|
|
- xmlXPathFreeContext(xpathCtx);
|
|
|
|
|
- return 0;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- int num_results = xpathObj->nodesetval->nodeNr;
|
|
|
|
|
- *out_results = xml_result_alloc(num_results, max_results);
|
|
|
|
|
- if (!*out_results) {
|
|
|
|
|
- xmlXPathFreeObject(xpathObj);
|
|
|
|
|
- xmlXPathFreeContext(xpathCtx);
|
|
|
|
|
- return 0;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- for (int i = 0; i < num_results && found_count < max_results; i++) {
|
|
|
|
|
- xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
|
|
|
|
|
- xpathCtx->node = resultNode;
|
|
|
|
|
-
|
|
|
|
|
- xmlXPathObjectPtr linkObj =
|
|
|
|
|
- xml_xpath_eval(xpathCtx, ".//a[contains(@class, 'result-link')]");
|
|
|
|
|
- char *url =
|
|
|
|
|
- (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
|
|
|
|
|
- ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
|
|
|
|
|
- (xmlChar *)"href")
|
|
|
|
|
- : NULL;
|
|
|
|
|
-
|
|
|
|
|
- xmlXPathObjectPtr titleObj =
|
|
|
|
|
- xml_xpath_eval(xpathCtx, ".//h2[contains(@class, 'wgl-title')]");
|
|
|
|
|
- char *title =
|
|
|
|
|
- (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
|
|
|
|
|
- ? xml_node_content(titleObj->nodesetval->nodeTab[0])
|
|
|
|
|
- : NULL;
|
|
|
|
|
-
|
|
|
|
|
- xmlXPathObjectPtr snippetObj =
|
|
|
|
|
- xml_xpath_eval(xpathCtx, ".//p[contains(@class, 'description')]");
|
|
|
|
|
- char *snippet_text =
|
|
|
|
|
- (snippetObj && snippetObj->nodesetval &&
|
|
|
|
|
- snippetObj->nodesetval->nodeNr > 0)
|
|
|
|
|
- ? xml_node_content(snippetObj->nodesetval->nodeTab[0])
|
|
|
|
|
- : NULL;
|
|
|
|
|
-
|
|
|
|
|
- if (url && title) {
|
|
|
|
|
- (*out_results)[found_count].url = strdup(url);
|
|
|
|
|
- (*out_results)[found_count].title = strdup(title);
|
|
|
|
|
- (*out_results)[found_count].snippet =
|
|
|
|
|
- strdup(snippet_text ? snippet_text : "");
|
|
|
|
|
- found_count++;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- if (title)
|
|
|
|
|
- xmlFree(title);
|
|
|
|
|
- if (url)
|
|
|
|
|
- xmlFree(url);
|
|
|
|
|
- if (snippet_text)
|
|
|
|
|
- xmlFree(snippet_text);
|
|
|
|
|
- if (linkObj)
|
|
|
|
|
- xmlXPathFreeObject(linkObj);
|
|
|
|
|
- if (titleObj)
|
|
|
|
|
- xmlXPathFreeObject(titleObj);
|
|
|
|
|
- if (snippetObj)
|
|
|
|
|
- xmlXPathFreeObject(snippetObj);
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- xpathCtx->node = NULL;
|
|
|
|
|
- xmlXPathFreeObject(xpathObj);
|
|
|
|
|
- xmlXPathFreeContext(xpathCtx);
|
|
|
|
|
- return found_count;
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
|
|
|
|
|
- SearchResult **out_results, int max_results) {
|
|
|
|
|
- (void)engine_name;
|
|
|
|
|
- int found_count = 0;
|
|
|
|
|
-
|
|
|
|
|
- xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
|
|
|
|
|
- if (!xpathCtx) {
|
|
|
|
|
- return 0;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- xmlXPathObjectPtr xpathObj =
|
|
|
|
|
- xml_xpath_eval(xpathCtx, "//div[contains(@class, 'algo-sr')]");
|
|
|
|
|
-
|
|
|
|
|
- if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
|
|
|
|
|
- if (xpathObj)
|
|
|
|
|
- xmlXPathFreeObject(xpathObj);
|
|
|
|
|
- xmlXPathFreeContext(xpathCtx);
|
|
|
|
|
- return 0;
|
|
|
|
|
- }
|
|
|
|
|
|
|
|
|
|
- int num_results = xpathObj->nodesetval->nodeNr;
|
|
|
|
|
- *out_results = xml_result_alloc(num_results, max_results);
|
|
|
|
|
- if (!*out_results) {
|
|
|
|
|
- xmlXPathFreeObject(xpathObj);
|
|
|
|
|
- xmlXPathFreeContext(xpathCtx);
|
|
|
|
|
|
|
+int check_cache_for_job(ScrapeJob *job) {
|
|
|
|
|
+ if (get_cache_ttl_search() <= 0)
|
|
|
return 0;
|
|
return 0;
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- for (int i = 0; i < num_results && found_count < max_results; i++) {
|
|
|
|
|
- xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
|
|
|
|
|
- xpathCtx->node = resultNode;
|
|
|
|
|
-
|
|
|
|
|
- xmlXPathObjectPtr linkObj = xml_xpath_eval(
|
|
|
|
|
- xpathCtx, ".//div[contains(@class, 'compTitle')]//a[@target='_blank']");
|
|
|
|
|
- char *url =
|
|
|
|
|
- (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
|
|
|
|
|
- ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
|
|
|
|
|
- (xmlChar *)"href")
|
|
|
|
|
- : NULL;
|
|
|
|
|
-
|
|
|
|
|
- xmlXPathObjectPtr titleObj =
|
|
|
|
|
- xml_xpath_eval(xpathCtx, ".//h3[contains(@class, 'title')]");
|
|
|
|
|
- char *title =
|
|
|
|
|
- (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
|
|
|
|
|
- ? xml_node_content(titleObj->nodesetval->nodeTab[0])
|
|
|
|
|
- : NULL;
|
|
|
|
|
-
|
|
|
|
|
- xmlXPathObjectPtr snippetObj =
|
|
|
|
|
- xml_xpath_eval(xpathCtx, ".//div[contains(@class, 'compText')]//p");
|
|
|
|
|
- char *snippet_text =
|
|
|
|
|
- (snippetObj && snippetObj->nodesetval &&
|
|
|
|
|
- snippetObj->nodesetval->nodeNr > 0)
|
|
|
|
|
- ? xml_node_content(snippetObj->nodesetval->nodeTab[0])
|
|
|
|
|
- : NULL;
|
|
|
|
|
-
|
|
|
|
|
- if (url && title) {
|
|
|
|
|
- (*out_results)[found_count].url = unescape_search_url(url);
|
|
|
|
|
- (*out_results)[found_count].title = strdup(title);
|
|
|
|
|
- (*out_results)[found_count].snippet =
|
|
|
|
|
- strdup(snippet_text ? snippet_text : "");
|
|
|
|
|
- found_count++;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- if (title)
|
|
|
|
|
- xmlFree(title);
|
|
|
|
|
- if (url)
|
|
|
|
|
- xmlFree(url);
|
|
|
|
|
- if (snippet_text)
|
|
|
|
|
- xmlFree(snippet_text);
|
|
|
|
|
- if (linkObj)
|
|
|
|
|
- xmlXPathFreeObject(linkObj);
|
|
|
|
|
- if (titleObj)
|
|
|
|
|
- xmlXPathFreeObject(titleObj);
|
|
|
|
|
- if (snippetObj)
|
|
|
|
|
- xmlXPathFreeObject(snippetObj);
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- xpathCtx->node = NULL;
|
|
|
|
|
- xmlXPathFreeObject(xpathObj);
|
|
|
|
|
- xmlXPathFreeContext(xpathCtx);
|
|
|
|
|
- return found_count;
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-const SearchEngine ENGINE_REGISTRY[] = {
|
|
|
|
|
- {.name = "DuckDuckGo Lite",
|
|
|
|
|
- .base_url = "https://lite.duckduckgo.com/lite/?q=",
|
|
|
|
|
- .host_header = "lite.duckduckgo.com",
|
|
|
|
|
- .referer = "https://lite.duckduckgo.com/",
|
|
|
|
|
- .page_param = "s",
|
|
|
|
|
- .page_multiplier = 30,
|
|
|
|
|
- .page_base = 0,
|
|
|
|
|
- .parser = parse_ddg_lite},
|
|
|
|
|
- {.name = "Startpage",
|
|
|
|
|
- .base_url = "https://www.startpage.com/sp/search?q=",
|
|
|
|
|
- .host_header = "www.startpage.com",
|
|
|
|
|
- .referer = "https://www.startpage.com/",
|
|
|
|
|
- .page_param = "page",
|
|
|
|
|
- .page_multiplier = 1,
|
|
|
|
|
- .page_base = 1,
|
|
|
|
|
- .parser = parse_startpage},
|
|
|
|
|
- {.name = "Yahoo",
|
|
|
|
|
- .base_url = "https://search.yahoo.com/search?p=",
|
|
|
|
|
- .host_header = "search.yahoo.com",
|
|
|
|
|
- .referer = "https://search.yahoo.com/",
|
|
|
|
|
- .page_param = "b",
|
|
|
|
|
- .page_multiplier = 10,
|
|
|
|
|
- .page_base = 1,
|
|
|
|
|
- .parser = parse_yahoo}};
|
|
|
|
|
-
|
|
|
|
|
-const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine);
|
|
|
|
|
-
|
|
|
|
|
-#define CURL_TIMEOUT 15L
|
|
|
|
|
-#define CURL_DNS_TIMEOUT 300L
|
|
|
|
|
-
|
|
|
|
|
-static void configure_curl_handle(CURL *curl, const char *full_url,
|
|
|
|
|
- MemoryBuffer *chunk,
|
|
|
|
|
- struct curl_slist *headers) {
|
|
|
|
|
- curl_easy_setopt(curl, CURLOPT_URL, full_url);
|
|
|
|
|
- curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
|
|
|
|
|
- curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
|
|
|
|
|
- curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk);
|
|
|
|
|
- curl_easy_setopt(curl, CURLOPT_USERAGENT, get_random_user_agent());
|
|
|
|
|
-
|
|
|
|
|
- curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
|
|
|
|
|
- curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
|
|
|
|
|
- curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, CURL_DNS_TIMEOUT);
|
|
|
|
|
- curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
|
|
|
|
- curl_easy_setopt(curl, CURLOPT_TIMEOUT, CURL_TIMEOUT);
|
|
|
|
|
- curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
|
|
|
|
|
- curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
|
|
|
|
|
-
|
|
|
|
|
- apply_proxy_settings(curl);
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
/*
 * Builds the request URL for one results page:
 *   <base_url><encoded_query>&<page_param>=<page_value>
 * where page_value = (page - 1) * page_multiplier + page_base.
 *
 * base_url         - URL prefix that ends with the query parameter
 * page_param       - name of the engine's paging parameter
 * page_multiplier  - step of the paging parameter per page
 * page_base        - paging parameter value of the first page
 * encoded_query    - query string, already URL-encoded
 * page             - 1-based page number; values below 1 mean page 1
 *
 * Returns a malloc()ed string the caller must free(), or NULL when the
 * URL could not be formatted or allocated.
 */
static char *build_search_url(const char *base_url, const char *page_param,
                              int page_multiplier, int page_base,
                              const char *encoded_query, int page) {
  /* Clamp first, then convert to a 0-based index. The previous expression
   * `(page < 1 ? 1 : page - 1)` mapped page <= 0 to index 1, i.e. to the
   * second page instead of the first. */
  int page_index = (page < 1 ? 1 : page) - 1;
  int page_value = page_index * page_multiplier + page_base;

  /* Measure first so a long query is never silently truncated. */
  int needed = snprintf(NULL, 0, "%s%s&%s=%d", base_url, encoded_query,
                        page_param, page_value);
  if (needed < 0) {
    return NULL;
  }

  size_t url_size = (size_t)needed + 1;
  char *url = malloc(url_size);
  if (!url) {
    return NULL;
  }

  snprintf(url, url_size, "%s%s&%s=%d", base_url, encoded_query, page_param,
           page_value);
  return url;
}
|
|
|
|
|
-
|
|
|
|
|
-static struct curl_slist *build_request_headers(const char *host_header,
|
|
|
|
|
- const char *referer) {
|
|
|
|
|
- struct curl_slist *headers = NULL;
|
|
|
|
|
- char host_buf[BUFFER_SIZE_MEDIUM], ref_buf[BUFFER_SIZE_MEDIUM];
|
|
|
|
|
-
|
|
|
|
|
- snprintf(host_buf, sizeof(host_buf), "Host: %s", host_header);
|
|
|
|
|
- snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", referer);
|
|
|
|
|
-
|
|
|
|
|
- headers = curl_slist_append(headers, host_buf);
|
|
|
|
|
- headers = curl_slist_append(headers, ref_buf);
|
|
|
|
|
- headers = curl_slist_append(
|
|
|
|
|
- headers,
|
|
|
|
|
- "Accept: "
|
|
|
|
|
- "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
|
|
|
|
|
- headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5");
|
|
|
|
|
- headers = curl_slist_append(headers, "DNT: 1");
|
|
|
|
|
-
|
|
|
|
|
- return headers;
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-static int check_cache_for_job(ScrapeJob *job) {
|
|
|
|
|
- if (get_cache_ttl_search() <= 0) {
|
|
|
|
|
- return 0;
|
|
|
|
|
- }
|
|
|
|
|
|
|
|
|
|
char *key = cache_compute_key(job->query, job->page, job->engine->name);
|
|
char *key = cache_compute_key(job->query, job->page, job->engine->name);
|
|
|
- if (!key) {
|
|
|
|
|
|
|
+ if (!key)
|
|
|
return 0;
|
|
return 0;
|
|
|
- }
|
|
|
|
|
|
|
|
|
|
char *cached_data = NULL;
|
|
char *cached_data = NULL;
|
|
|
size_t cached_size = 0;
|
|
size_t cached_size = 0;
|
|
@@ -414,27 +39,31 @@ static int check_cache_for_job(ScrapeJob *job) {
|
|
|
return 0;
|
|
return 0;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-static void process_job_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) {
|
|
|
|
|
- if (msg->data.result == CURLE_OK && job->response.size > 0) {
|
|
|
|
|
- char *key = cache_compute_key(job->query, job->page, job->engine->name);
|
|
|
|
|
- if (key && get_cache_ttl_search() > 0) {
|
|
|
|
|
- cache_set(key, job->response.memory, job->response.size);
|
|
|
|
|
- free(key);
|
|
|
|
|
- }
|
|
|
|
|
|
|
+void parse_and_cache_response(ScrapeJob *job) {
|
|
|
|
|
+ if (job->response.size == 0) {
|
|
|
|
|
+ job->results_count = 0;
|
|
|
|
|
+ return;
|
|
|
|
|
+ }
|
|
|
|
|
|
|
|
- xmlDocPtr doc = htmlReadMemory(
|
|
|
|
|
- job->response.memory, job->response.size, NULL, NULL,
|
|
|
|
|
- HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
|
|
|
|
|
|
+ char *key = cache_compute_key(job->query, job->page, job->engine->name);
|
|
|
|
|
+ if (key && get_cache_ttl_search() > 0)
|
|
|
|
|
+ cache_set(key, job->response.memory, job->response.size);
|
|
|
|
|
+ free(key);
|
|
|
|
|
|
|
|
- if (doc) {
|
|
|
|
|
- job->results_count = job->engine->parser(
|
|
|
|
|
- job->engine->name, doc, job->out_results, job->max_results);
|
|
|
|
|
- xmlFreeDoc(doc);
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ xmlDocPtr doc = htmlReadMemory(
|
|
|
|
|
+ job->response.memory, job->response.size, NULL, NULL,
|
|
|
|
|
+ HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
|
|
|
|
+
|
|
|
|
|
+ if (doc) {
|
|
|
|
|
+ job->results_count = job->engine->parser(
|
|
|
|
|
+ job->engine->name, doc, job->out_results, job->max_results);
|
|
|
|
|
+ xmlFreeDoc(doc);
|
|
|
} else {
|
|
} else {
|
|
|
job->results_count = 0;
|
|
job->results_count = 0;
|
|
|
}
|
|
}
|
|
|
|
|
+}
|
|
|
|
|
|
|
|
|
|
+void cleanup_job_handle(ScrapeJob *job, CURL *handle) {
|
|
|
struct curl_slist *headers = NULL;
|
|
struct curl_slist *headers = NULL;
|
|
|
curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers);
|
|
curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers);
|
|
|
if (headers)
|
|
if (headers)
|
|
@@ -444,67 +73,112 @@ static void process_job_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) {
|
|
|
job->response.memory = NULL;
|
|
job->response.memory = NULL;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
|
|
|
|
|
- int retries = 0;
|
|
|
|
|
|
|
+void process_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) {
|
|
|
|
|
+ if (msg->data.result == CURLE_OK)
|
|
|
|
|
+ parse_and_cache_response(job);
|
|
|
|
|
+ else
|
|
|
|
|
+ job->results_count = 0;
|
|
|
|
|
|
|
|
-retry:
|
|
|
|
|
- CURLM *multi_handle = curl_multi_init();
|
|
|
|
|
- if (!multi_handle) {
|
|
|
|
|
|
|
+ cleanup_job_handle(job, handle);
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+int setup_job(ScrapeJob *job, CURLM *multi_handle) {
|
|
|
|
|
+ if (job->handle)
|
|
|
|
|
+ curl_easy_cleanup(job->handle);
|
|
|
|
|
+ if (job->response.memory)
|
|
|
|
|
+ free(job->response.memory);
|
|
|
|
|
+
|
|
|
|
|
+ if (check_cache_for_job(job)) {
|
|
|
|
|
+ job->results_count = job->results_count > 0 ? job->results_count : 0;
|
|
|
|
|
+ return 0;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ char *encoded_query = curl_easy_escape(NULL, job->query, 0);
|
|
|
|
|
+ if (!encoded_query)
|
|
|
|
|
+ return -1;
|
|
|
|
|
+
|
|
|
|
|
+ char *full_url =
|
|
|
|
|
+ build_search_url(job->engine->base_url, job->engine->page_param,
|
|
|
|
|
+ job->engine->page_multiplier, job->engine->page_base,
|
|
|
|
|
+ encoded_query, job->page);
|
|
|
|
|
+ free(encoded_query);
|
|
|
|
|
+
|
|
|
|
|
+ if (!full_url)
|
|
|
|
|
+ return -1;
|
|
|
|
|
+
|
|
|
|
|
+ job->handle = curl_easy_init();
|
|
|
|
|
+ if (!job->handle) {
|
|
|
|
|
+ free(full_url);
|
|
|
return -1;
|
|
return -1;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- for (int i = 0; i < num_jobs; i++) {
|
|
|
|
|
- ScrapeJob *job = &jobs[i];
|
|
|
|
|
|
|
+ job->response.memory = (char *)malloc(INITIAL_BUFFER_SIZE);
|
|
|
|
|
+ job->response.size = 0;
|
|
|
|
|
+ job->response.capacity = INITIAL_BUFFER_SIZE;
|
|
|
|
|
|
|
|
- if (job->handle) {
|
|
|
|
|
- curl_easy_cleanup(job->handle);
|
|
|
|
|
- job->handle = NULL;
|
|
|
|
|
- }
|
|
|
|
|
- if (job->response.memory) {
|
|
|
|
|
- free(job->response.memory);
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ struct curl_slist *headers =
|
|
|
|
|
+ build_request_headers(job->engine->host_header, job->engine->referer);
|
|
|
|
|
|
|
|
- if (check_cache_for_job(job)) {
|
|
|
|
|
- job->results_count = job->results_count > 0 ? job->results_count : 0;
|
|
|
|
|
- continue;
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ configure_curl_handle(job->handle, full_url, &job->response, headers);
|
|
|
|
|
+ curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers);
|
|
|
|
|
|
|
|
- char *encoded_query = curl_easy_escape(NULL, job->query, 0);
|
|
|
|
|
- if (!encoded_query) {
|
|
|
|
|
- continue;
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ free(full_url);
|
|
|
|
|
+ curl_multi_add_handle(multi_handle, job->handle);
|
|
|
|
|
+ return 0;
|
|
|
|
|
+}
|
|
|
|
|
|
|
|
- char *full_url =
|
|
|
|
|
- build_search_url(job->engine->base_url, job->engine->page_param,
|
|
|
|
|
- job->engine->page_multiplier, job->engine->page_base,
|
|
|
|
|
- encoded_query, job->page);
|
|
|
|
|
- free(encoded_query);
|
|
|
|
|
|
|
+int handle_responses(CURLM *multi_handle, ScrapeJob *jobs, int num_jobs) {
|
|
|
|
|
+ CURLMsg *msg;
|
|
|
|
|
+ int msgs_left;
|
|
|
|
|
|
|
|
- if (!full_url) {
|
|
|
|
|
|
|
+ while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) {
|
|
|
|
|
+ if (msg->msg != CURLMSG_DONE)
|
|
|
continue;
|
|
continue;
|
|
|
- }
|
|
|
|
|
|
|
|
|
|
- job->handle = curl_easy_init();
|
|
|
|
|
- if (!job->handle) {
|
|
|
|
|
- free(full_url);
|
|
|
|
|
- continue;
|
|
|
|
|
|
|
+ CURL *handle = msg->easy_handle;
|
|
|
|
|
+
|
|
|
|
|
+ for (int i = 0; i < num_jobs; i++) {
|
|
|
|
|
+ if (jobs[i].handle && jobs[i].handle == handle) {
|
|
|
|
|
+ process_response(&jobs[i], handle, msg);
|
|
|
|
|
+ curl_multi_remove_handle(multi_handle, handle);
|
|
|
|
|
+ curl_easy_cleanup(handle);
|
|
|
|
|
+ jobs[i].handle = NULL;
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
|
|
+ }
|
|
|
|
|
|
|
|
- job->response.memory = (char *)malloc(INITIAL_BUFFER_SIZE);
|
|
|
|
|
- job->response.size = 0;
|
|
|
|
|
- job->response.capacity = INITIAL_BUFFER_SIZE;
|
|
|
|
|
|
|
+ return 0;
|
|
|
|
|
+}
|
|
|
|
|
|
|
|
- struct curl_slist *headers =
|
|
|
|
|
- build_request_headers(job->engine->host_header, job->engine->referer);
|
|
|
|
|
|
|
+int should_retry(ScrapeJob *jobs, int num_jobs) {
|
|
|
|
|
+ if (proxy_count <= 0)
|
|
|
|
|
+ return 0;
|
|
|
|
|
|
|
|
- configure_curl_handle(job->handle, full_url, &job->response, headers);
|
|
|
|
|
- curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers);
|
|
|
|
|
|
|
+ for (int i = 0; i < num_jobs; i++) {
|
|
|
|
|
+ if (jobs[i].results_count == 0 && jobs[i].response.size == 0)
|
|
|
|
|
+ return 1;
|
|
|
|
|
+ }
|
|
|
|
|
+ return 0;
|
|
|
|
|
+}
|
|
|
|
|
|
|
|
- free(full_url);
|
|
|
|
|
- curl_multi_add_handle(multi_handle, job->handle);
|
|
|
|
|
|
|
+int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
|
|
|
|
|
+ int retries = 0;
|
|
|
|
|
+
|
|
|
|
|
+retry:
|
|
|
|
|
+ CURLM *multi_handle = curl_multi_init();
|
|
|
|
|
+ if (!multi_handle)
|
|
|
|
|
+ return -1;
|
|
|
|
|
+
|
|
|
|
|
+ for (int i = 0; i < num_jobs; i++) {
|
|
|
|
|
+ if (setup_job(&jobs[i], multi_handle) != 0 && jobs[i].handle) {
|
|
|
|
|
+ curl_multi_remove_handle(multi_handle, jobs[i].handle);
|
|
|
|
|
+ curl_easy_cleanup(jobs[i].handle);
|
|
|
|
|
+ jobs[i].handle = NULL;
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- usleep(100000 + (rand() % 100000));
|
|
|
|
|
|
|
+ http_delay();
|
|
|
|
|
|
|
|
int still_running = 0;
|
|
int still_running = 0;
|
|
|
curl_multi_perform(multi_handle, &still_running);
|
|
curl_multi_perform(multi_handle, &still_running);
|
|
@@ -512,50 +186,17 @@ retry:
|
|
|
do {
|
|
do {
|
|
|
int numfds = 0;
|
|
int numfds = 0;
|
|
|
CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
|
|
CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
|
|
|
-
|
|
|
|
|
- if (mc != CURLM_OK) {
|
|
|
|
|
|
|
+ if (mc != CURLM_OK)
|
|
|
break;
|
|
break;
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
curl_multi_perform(multi_handle, &still_running);
|
|
curl_multi_perform(multi_handle, &still_running);
|
|
|
} while (still_running);
|
|
} while (still_running);
|
|
|
|
|
|
|
|
- CURLMsg *msg;
|
|
|
|
|
- int msgs_left;
|
|
|
|
|
- while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) {
|
|
|
|
|
- if (msg->msg == CURLMSG_DONE) {
|
|
|
|
|
- CURL *handle = msg->easy_handle;
|
|
|
|
|
-
|
|
|
|
|
- for (int i = 0; i < num_jobs; i++) {
|
|
|
|
|
- if (jobs[i].handle && jobs[i].handle == handle) {
|
|
|
|
|
- ScrapeJob *job = &jobs[i];
|
|
|
|
|
-
|
|
|
|
|
- process_job_response(job, handle, msg);
|
|
|
|
|
-
|
|
|
|
|
- curl_multi_remove_handle(multi_handle, handle);
|
|
|
|
|
- if (handle)
|
|
|
|
|
- curl_easy_cleanup(handle);
|
|
|
|
|
- job->handle = NULL;
|
|
|
|
|
- break;
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
|
|
+ handle_responses(multi_handle, jobs, num_jobs);
|
|
|
curl_multi_cleanup(multi_handle);
|
|
curl_multi_cleanup(multi_handle);
|
|
|
|
|
|
|
|
- if (retries < max_proxy_retries && proxy_count > 0) {
|
|
|
|
|
- int any_failed = 0;
|
|
|
|
|
- for (int i = 0; i < num_jobs; i++) {
|
|
|
|
|
- if (jobs[i].results_count == 0 && jobs[i].response.size == 0) {
|
|
|
|
|
- any_failed = 1;
|
|
|
|
|
- break;
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- if (any_failed) {
|
|
|
|
|
- retries++;
|
|
|
|
|
- goto retry;
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ if (retries < max_proxy_retries && should_retry(jobs, num_jobs)) {
|
|
|
|
|
+ retries++;
|
|
|
|
|
+ goto retry;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
return 0;
|