/* Scraping.h - shared types and function declarations for the search-engine scraper. */
  1. #ifndef SCRAPING_H
  2. #define SCRAPING_H
#include <stddef.h>

#include <curl/curl.h>
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
  6. typedef struct {
  7. char *url;
  8. char *title;
  9. char *snippet;
  10. } SearchResult;
  11. typedef int (*ParserFunc)(const char *engine_name, xmlDocPtr doc,
  12. SearchResult **out_results, int max_results);
  13. typedef struct {
  14. const char *name;
  15. const char *base_url;
  16. const char *host_header;
  17. const char *referer;
  18. const char *page_param;
  19. int page_multiplier;
  20. int page_base;
  21. ParserFunc parser;
  22. } SearchEngine;
  23. typedef struct {
  24. char *memory;
  25. size_t size;
  26. size_t capacity;
  27. } MemoryBuffer;
  28. typedef struct {
  29. const SearchEngine *engine;
  30. char *query;
  31. SearchResult **out_results;
  32. int max_results;
  33. int page;
  34. CURL *handle;
  35. MemoryBuffer response;
  36. int results_count;
  37. } ScrapeJob;
  38. extern const SearchEngine ENGINE_REGISTRY[];
  39. extern const int ENGINE_COUNT;
  40. size_t write_memory_callback(void *contents, size_t size, size_t nmemb,
  41. void *userp);
  42. const char *get_random_user_agent(void);
  43. void configure_curl_handle(CURL *curl, const char *full_url,
  44. MemoryBuffer *chunk, struct curl_slist *headers);
  45. char *build_search_url(const char *base_url, const char *page_param,
  46. int page_multiplier, int page_base,
  47. const char *encoded_query, int page);
  48. struct curl_slist *build_request_headers(const char *host_header,
  49. const char *referer);
  50. void http_delay(void);
  51. xmlXPathContextPtr create_xpath_context(xmlDocPtr doc);
  52. void free_xpath_objects(xmlXPathContextPtr ctx, xmlXPathObjectPtr obj);
  53. SearchResult *alloc_results_array(int capacity, int max_results);
  54. void assign_result(SearchResult *result, char *url, char *title, char *snippet,
  55. int unescape);
  56. void free_xml_node_list(char *title, char *url, char *snippet);
  57. int scrape_engine(const SearchEngine *engine, const char *query,
  58. SearchResult **out_results, int max_results);
  59. int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs);
  60. #endif