Scraping.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459
  1. #include "Scraping.h"
  2. #include "../Utility/Unescape.h"
  3. #include <curl/curl.h>
  4. #include <libxml/HTMLparser.h>
  5. #include <libxml/xpath.h>
  6. #include <stdio.h>
  7. #include <stdlib.h>
  8. #include <string.h>
  9. #include <time.h>
  10. #include <unistd.h>
  11. static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb,
  12. void *userp) {
  13. size_t realsize = size * nmemb;
  14. MemoryBuffer *mem = (MemoryBuffer *)userp;
  15. if (mem->size + realsize + 1 > mem->capacity) {
  16. size_t new_cap = mem->capacity == 0 ? 16384 : mem->capacity * 2;
  17. while (new_cap < mem->size + realsize + 1) new_cap *= 2;
  18. char *ptr = (char *)realloc(mem->memory, new_cap);
  19. if (!ptr) {
  20. return 0;
  21. }
  22. mem->memory = ptr;
  23. mem->capacity = new_cap;
  24. }
  25. memcpy(&(mem->memory[mem->size]), contents, realsize);
  26. mem->size += realsize;
  27. mem->memory[mem->size] = 0;
  28. return realsize;
  29. }
  30. static const char *get_random_user_agent() {
  31. static const char *agents[] = {
  32. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
  33. "like Gecko) Chrome/120.0.0.0 Safari/537.36",
  34. "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
  35. "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
  36. "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like "
  37. "Gecko) "
  38. "Chrome/120.0.0.0` Safari/537.36",
  39. "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 "
  40. "Firefox/121.0",
  41. "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
  42. "(KHTML, like Gecko) Version/17.2 Safari/605.1.15"};
  43. return agents[rand() % 5];
  44. }
  45. static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
  46. SearchResult **out_results, int max_results) {
  47. (void)engine_name;
  48. int found_count = 0;
  49. xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
  50. if (!xpathCtx) {
  51. return 0;
  52. }
  53. const char *link_xpath = "//tr[not(contains(@class, 'result-sponsored'))]//a[@class='result-link']";
  54. xmlXPathObjectPtr xpathObj =
  55. xmlXPathEvalExpression((xmlChar *)link_xpath, xpathCtx);
  56. if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
  57. if (xpathObj) xmlXPathFreeObject(xpathObj);
  58. xmlXPathFreeContext(xpathCtx);
  59. return 0;
  60. }
  61. int num_links = xpathObj->nodesetval->nodeNr;
  62. int actual_alloc = (num_links < max_results) ? num_links : max_results;
  63. *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
  64. if (!*out_results) {
  65. xmlXPathFreeObject(xpathObj);
  66. xmlXPathFreeContext(xpathCtx);
  67. return 0;
  68. }
  69. for (int i = 0; i < num_links && found_count < max_results; i++) {
  70. xmlNodePtr linkNode = xpathObj->nodesetval->nodeTab[i];
  71. char *title = (char *)xmlNodeGetContent(linkNode);
  72. char *url = (char *)xmlGetProp(linkNode, (xmlChar *)"href");
  73. char *snippet_text = NULL;
  74. xmlNodePtr current = linkNode->parent;
  75. while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0)
  76. current = current->parent;
  77. if (current && current->next) {
  78. xmlNodePtr snippetRow = current->next;
  79. while (snippetRow &&
  80. xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0)
  81. snippetRow = snippetRow->next;
  82. if (snippetRow) {
  83. xpathCtx->node = snippetRow;
  84. xmlXPathObjectPtr sObj = xmlXPathEvalExpression(
  85. (xmlChar *)".//td[@class='result-snippet']", xpathCtx);
  86. if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) {
  87. snippet_text = (char *)xmlNodeGetContent(sObj->nodesetval->nodeTab[0]);
  88. }
  89. if (sObj) xmlXPathFreeObject(sObj);
  90. xpathCtx->node = NULL;
  91. }
  92. }
  93. (*out_results)[found_count].url = unescape_search_url(url);
  94. (*out_results)[found_count].title = strdup(title ? title : "No Title");
  95. (*out_results)[found_count].snippet = strdup(snippet_text ? snippet_text : "");
  96. found_count++;
  97. if (title) xmlFree(title);
  98. if (url) xmlFree(url);
  99. if (snippet_text) xmlFree(snippet_text);
  100. }
  101. xmlXPathFreeObject(xpathObj);
  102. xmlXPathFreeContext(xpathCtx);
  103. return found_count;
  104. }
  105. static int parse_startpage(const char *engine_name, xmlDocPtr doc,
  106. SearchResult **out_results, int max_results) {
  107. (void)engine_name;
  108. int found_count = 0;
  109. xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
  110. if (!xpathCtx) {
  111. return 0;
  112. }
  113. const char *container_xpath = "//div[contains(@class, 'result')]";
  114. xmlXPathObjectPtr xpathObj =
  115. xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx);
  116. if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
  117. if (xpathObj) xmlXPathFreeObject(xpathObj);
  118. xmlXPathFreeContext(xpathCtx);
  119. return 0;
  120. }
  121. int num_results = xpathObj->nodesetval->nodeNr;
  122. int actual_alloc = (num_results < max_results) ? num_results : max_results;
  123. *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
  124. if (!*out_results) {
  125. xmlXPathFreeObject(xpathObj);
  126. xmlXPathFreeContext(xpathCtx);
  127. return 0;
  128. }
  129. for (int i = 0; i < num_results && found_count < max_results; i++) {
  130. xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
  131. xpathCtx->node = resultNode;
  132. xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
  133. (xmlChar *)".//a[contains(@class, 'result-link')]", xpathCtx);
  134. char *url =
  135. (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
  136. ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
  137. (xmlChar *)"href")
  138. : NULL;
  139. xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
  140. (xmlChar *)".//h2[contains(@class, 'wgl-title')]", xpathCtx);
  141. char *title =
  142. (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
  143. ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
  144. : NULL;
  145. xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
  146. (xmlChar *)".//p[contains(@class, 'description')]", xpathCtx);
  147. char *snippet_text =
  148. (snippetObj && snippetObj->nodesetval &&
  149. snippetObj->nodesetval->nodeNr > 0)
  150. ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0])
  151. : NULL;
  152. if (url && title) {
  153. (*out_results)[found_count].url = strdup(url);
  154. (*out_results)[found_count].title = strdup(title);
  155. (*out_results)[found_count].snippet =
  156. strdup(snippet_text ? snippet_text : "");
  157. found_count++;
  158. }
  159. if (title) xmlFree(title);
  160. if (url) xmlFree(url);
  161. if (snippet_text) xmlFree(snippet_text);
  162. if (linkObj) xmlXPathFreeObject(linkObj);
  163. if (titleObj) xmlXPathFreeObject(titleObj);
  164. if (snippetObj) xmlXPathFreeObject(snippetObj);
  165. }
  166. xpathCtx->node = NULL;
  167. xmlXPathFreeObject(xpathObj);
  168. xmlXPathFreeContext(xpathCtx);
  169. return found_count;
  170. }
  171. static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
  172. SearchResult **out_results, int max_results) {
  173. (void)engine_name;
  174. int found_count = 0;
  175. xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
  176. if (!xpathCtx) {
  177. return 0;
  178. }
  179. const char *container_xpath = "//div[contains(@class, 'algo-sr')]";
  180. xmlXPathObjectPtr xpathObj =
  181. xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx);
  182. if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
  183. if (xpathObj) xmlXPathFreeObject(xpathObj);
  184. xmlXPathFreeContext(xpathCtx);
  185. return 0;
  186. }
  187. int num_results = xpathObj->nodesetval->nodeNr;
  188. int actual_alloc = (num_results < max_results) ? num_results : max_results;
  189. *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
  190. if (!*out_results) {
  191. xmlXPathFreeObject(xpathObj);
  192. xmlXPathFreeContext(xpathCtx);
  193. return 0;
  194. }
  195. for (int i = 0; i < num_results && found_count < max_results; i++) {
  196. xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
  197. xpathCtx->node = resultNode;
  198. xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
  199. (xmlChar *)".//div[contains(@class, 'compTitle')]//a[@target='_blank']",
  200. xpathCtx);
  201. char *url =
  202. (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
  203. ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
  204. (xmlChar *)"href")
  205. : NULL;
  206. xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
  207. (xmlChar *)".//h3[contains(@class, 'title')]", xpathCtx);
  208. char *title =
  209. (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
  210. ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
  211. : NULL;
  212. xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
  213. (xmlChar *)".//div[contains(@class, 'compText')]//p", xpathCtx);
  214. char *snippet_text =
  215. (snippetObj && snippetObj->nodesetval &&
  216. snippetObj->nodesetval->nodeNr > 0)
  217. ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0])
  218. : NULL;
  219. if (url && title) {
  220. (*out_results)[found_count].url = unescape_search_url(url);
  221. (*out_results)[found_count].title = strdup(title);
  222. (*out_results)[found_count].snippet =
  223. strdup(snippet_text ? snippet_text : "");
  224. found_count++;
  225. }
  226. if (title) xmlFree(title);
  227. if (url) xmlFree(url);
  228. if (snippet_text) xmlFree(snippet_text);
  229. if (linkObj) xmlXPathFreeObject(linkObj);
  230. if (titleObj) xmlXPathFreeObject(titleObj);
  231. if (snippetObj) xmlXPathFreeObject(snippetObj);
  232. }
  233. xpathCtx->node = NULL;
  234. xmlXPathFreeObject(xpathObj);
  235. xmlXPathFreeContext(xpathCtx);
  236. return found_count;
  237. }
/*
 * Registry of supported search engines. Each entry bundles the request
 * parameters (base URL, spoofed Host/Referer headers) with the engine's
 * pagination scheme and the parser for its result markup.
 *
 * Pagination: the request URL gets "&<page_param>=<value>" appended where
 *   value = (page - 1) * page_multiplier + page_base
 * so DDG Lite offsets results in steps of 30 from 0, Yahoo in steps of 10
 * from 1, and Startpage uses a plain 1-based page number.
 */
const SearchEngine ENGINE_REGISTRY[] = {
    {.name = "DuckDuckGo Lite",
     .base_url = "https://lite.duckduckgo.com/lite/?q=",
     .host_header = "lite.duckduckgo.com",
     .referer = "https://lite.duckduckgo.com/",
     .page_param = "s",
     .page_multiplier = 30,
     .page_base = 0,
     .parser = parse_ddg_lite},
    {.name = "Startpage",
     .base_url = "https://www.startpage.com/sp/search?q=",
     .host_header = "www.startpage.com",
     .referer = "https://www.startpage.com/",
     .page_param = "page",
     .page_multiplier = 1,
     .page_base = 1,
     .parser = parse_startpage},
    {.name = "Yahoo",
     .base_url = "https://search.yahoo.com/search?p=",
     .host_header = "search.yahoo.com",
     .referer = "https://search.yahoo.com/",
     .page_param = "b",
     .page_multiplier = 10,
     .page_base = 1,
     .parser = parse_yahoo}};
/* Number of entries in ENGINE_REGISTRY (computed, cannot go stale). */
const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine);
  264. static void configure_curl_handle(CURL *curl, const char *full_url,
  265. MemoryBuffer *chunk,
  266. struct curl_slist *headers) {
  267. curl_easy_setopt(curl, CURLOPT_URL, full_url);
  268. curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
  269. curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
  270. curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk);
  271. curl_easy_setopt(curl, CURLOPT_USERAGENT, get_random_user_agent());
  272. curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
  273. curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
  274. curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, 300L);
  275. curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
  276. curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L);
  277. curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
  278. curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
  279. }
  280. int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
  281. CURLM *multi_handle = curl_multi_init();
  282. if (!multi_handle) {
  283. return -1;
  284. }
  285. for (int i = 0; i < num_jobs; i++) {
  286. ScrapeJob *job = &jobs[i];
  287. job->handle = curl_easy_init();
  288. if (!job->handle) {
  289. continue;
  290. }
  291. job->response.memory = (char *)malloc(16384);
  292. job->response.size = 0;
  293. job->response.capacity = 16384;
  294. char full_url[1024];
  295. char *encoded_query = curl_easy_escape(job->handle, job->query, 0);
  296. if (!encoded_query) {
  297. curl_easy_cleanup(job->handle);
  298. job->handle = NULL;
  299. continue;
  300. }
  301. int page = (job->page < 1) ? 1 : job->page;
  302. int page_value = (page - 1) * job->engine->page_multiplier + job->engine->page_base;
  303. snprintf(full_url, sizeof(full_url), "%s%s&%s=%d",
  304. job->engine->base_url,
  305. encoded_query,
  306. job->engine->page_param,
  307. page_value);
  308. curl_free(encoded_query);
  309. struct curl_slist *headers = NULL;
  310. char host_buf[256], ref_buf[256];
  311. snprintf(host_buf, sizeof(host_buf), "Host: %s", job->engine->host_header);
  312. snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", job->engine->referer);
  313. headers = curl_slist_append(headers, host_buf);
  314. headers = curl_slist_append(headers, ref_buf);
  315. headers = curl_slist_append(headers, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
  316. headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5");
  317. headers = curl_slist_append(headers, "DNT: 1");
  318. configure_curl_handle(job->handle, full_url, &job->response, headers);
  319. curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers);
  320. curl_multi_add_handle(multi_handle, job->handle);
  321. }
  322. usleep(100000 + (rand() % 100000));
  323. int still_running = 0;
  324. curl_multi_perform(multi_handle, &still_running);
  325. do {
  326. int numfds = 0;
  327. CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
  328. if (mc != CURLM_OK) {
  329. break;
  330. }
  331. curl_multi_perform(multi_handle, &still_running);
  332. } while (still_running);
  333. CURLMsg *msg;
  334. int msgs_left;
  335. while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) {
  336. if (msg->msg == CURLMSG_DONE) {
  337. CURL *handle = msg->easy_handle;
  338. for (int i = 0; i < num_jobs; i++) {
  339. if (jobs[i].handle == handle) {
  340. ScrapeJob *job = &jobs[i];
  341. long response_code;
  342. curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code);
  343. if (msg->data.result == CURLE_OK && job->response.size > 0) {
  344. xmlDocPtr doc = htmlReadMemory(
  345. job->response.memory, job->response.size, NULL, NULL,
  346. HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
  347. if (doc) {
  348. job->results_count = job->engine->parser(
  349. job->engine->name, doc, job->out_results, job->max_results);
  350. xmlFreeDoc(doc);
  351. }
  352. } else {
  353. job->results_count = 0;
  354. }
  355. struct curl_slist *headers;
  356. curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers);
  357. if (headers) curl_slist_free_all(headers);
  358. free(job->response.memory);
  359. curl_multi_remove_handle(multi_handle, handle);
  360. curl_easy_cleanup(handle);
  361. break;
  362. }
  363. }
  364. }
  365. }
  366. curl_multi_cleanup(multi_handle);
  367. return 0;
  368. }
  369. int scrape_engine(const SearchEngine *engine, const char *query,
  370. SearchResult **out_results, int max_results) {
  371. ScrapeJob job = {
  372. .engine = engine,
  373. .query = (char *)query,
  374. .out_results = out_results,
  375. .max_results = max_results,
  376. .results_count = 0,
  377. .page = 1
  378. };
  379. scrape_engines_parallel(&job, 1);
  380. return job.results_count;
  381. }