소스 검색

feature: added caching

frosty 1주 전
부모
커밋
e33310f263
8개의 파일 변경, 185줄 추가, 18줄 삭제
  1. 1 1
      Makefile
  2. 11 1
      example-config.ini
  3. 9 0
      src/Config.c
  4. 3 0
      src/Config.h
  5. 49 0
      src/Infobox/Dictionary.c
  6. 34 1
      src/Infobox/Wikipedia.c
  7. 19 1
      src/Main.c
  8. 59 14
      src/Scraping/Scraping.c

+ 1 - 1
Makefile

@@ -10,7 +10,7 @@ else
     LDFLAGS :=
 endif
 
-LIBS    := -lbeaker -lcurl -lxml2 -lpthread -lm
+LIBS    := -lbeaker -lcurl -lxml2 -lpthread -lm -lssl -lcrypto
 
 SRC_DIR := src
 BIN_DIR := bin

+ 11 - 1
example-config.ini

@@ -1,7 +1,7 @@
 [server]
 host = 0.0.0.0
 port = 8000
-     
+
 [proxy]
 # Single proxy (comment out to use list_file instead)
 #proxy = "socks5://127.0.0.1:9050"
@@ -14,3 +14,13 @@ port = 8000
 # Randomize proxy credentials for each request
 #randomize_username = true
 #randomize_password = true
+
+[cache]
+# Directory to store cached responses
+#dir = /tmp/omnisearch_cache
+
+# Cache TTL for search results in seconds (default: 3600 = 1 hour)
+#ttl_search = 3600
+
+# Cache TTL for infobox data in seconds (default: 86400 = 24 hours)
+#ttl_infobox = 86400

+ 9 - 0
src/Config.c

@@ -80,6 +80,15 @@ int load_config(const char *filename, Config *config) {
         } else if (strcmp(key, "randomize_password") == 0) {
           config->randomize_password = atoi(value);
         }
+      } else if (strcmp(section, "cache") == 0) {
+        if (strcmp(key, "dir") == 0) {
+          strncpy(config->cache_dir, value, sizeof(config->cache_dir) - 1);
+          config->cache_dir[sizeof(config->cache_dir) - 1] = '\0';
+        } else if (strcmp(key, "ttl_search") == 0) {
+          config->cache_ttl_search = atoi(value);
+        } else if (strcmp(key, "ttl_infobox") == 0) {
+          config->cache_ttl_infobox = atoi(value);
+        }
       }
     }
   }

+ 3 - 0
src/Config.h

@@ -9,6 +9,9 @@ typedef struct {
   int max_proxy_retries;
   int randomize_username;
   int randomize_password;
+  char cache_dir[512];
+  int cache_ttl_search;
+  int cache_ttl_infobox;
 } Config;
 
 int load_config(const char *filename, Config *config);

+ 49 - 0
src/Infobox/Dictionary.c

@@ -1,4 +1,5 @@
 #include "Dictionary.h"
+#include "../Cache/Cache.h"
 #include "../Proxy/Proxy.h"
 #include "../Scraping/Scraping.h"
 #include <ctype.h>
@@ -266,6 +267,48 @@ InfoBox fetch_dictionary_data(const char *query) {
   if (!url)
     return info;
 
+  char *cache_key = cache_compute_key(url, 0, "dictionary");
+  if (cache_key && get_cache_ttl_infobox() > 0) {
+    char *cached_data = NULL;
+    size_t cached_size = 0;
+    if (cache_get(cache_key, (time_t)get_cache_ttl_infobox(), &cached_data,
+                  &cached_size) == 0 &&
+        cached_data && cached_size > 0) {
+      htmlDocPtr doc = htmlReadMemory(cached_data, cached_size, url, NULL,
+                                      HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
+                                          HTML_PARSE_NOWARNING);
+      if (doc) {
+        char *word = xpath_text(doc, "//span[@class='hw dhw']");
+        char *pron = xpath_text(
+            doc,
+            "//span[@class='us dpron-i']//span[@class='ipa dipa lpr-2 lpl-1']");
+        char *pos = xpath_text(doc, "//span[@class='pos dpos']");
+        char *def = xpath_text(doc, "(//div[@class='def ddef_d db'])[1]");
+        char *ex = xpath_text(doc, "(//span[@class='eg deg'])[1]");
+
+        if (word && def) {
+          info.title = strdup("Dictionary");
+          info.extract = build_html(word, pron, pos, def, ex);
+          info.thumbnail_url = strdup("/static/dictionary.jpg");
+          info.url = strdup(url);
+        }
+
+        free(word);
+        free(pron);
+        free(pos);
+        free(def);
+        free(ex);
+        xmlFreeDoc(doc);
+      }
+      free(cached_data);
+      free(cache_key);
+      free(url);
+      return info;
+    }
+    free(cached_data);
+  }
+  free(cache_key);
+
   CURL *curl = curl_easy_init();
   if (!curl) {
     free(url);
@@ -281,6 +324,12 @@ InfoBox fetch_dictionary_data(const char *query) {
   apply_proxy_settings(curl);
 
   if (curl_easy_perform(curl) == CURLE_OK && chunk.size > 0) {
+    cache_key = cache_compute_key(url, 0, "dictionary");
+    if (cache_key && get_cache_ttl_infobox() > 0) {
+      cache_set(cache_key, chunk.memory, chunk.size);
+    }
+    free(cache_key);
+
     htmlDocPtr doc = htmlReadMemory(chunk.memory, chunk.size, url, NULL,
                                     HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
                                         HTML_PARSE_NOWARNING);

+ 34 - 1
src/Infobox/Wikipedia.c

@@ -1,4 +1,5 @@
 #include "Wikipedia.h"
+#include "../Cache/Cache.h"
 #include "../Proxy/Proxy.h"
 #include "../Scraping/Scraping.h"
 #include <curl/curl.h>
@@ -117,6 +118,32 @@ InfoBox fetch_wiki_data(char *api_url) {
   struct WikiMemoryStruct chunk;
   InfoBox info = {NULL, NULL, NULL, NULL};
 
+  if (!api_url) {
+    return info;
+  }
+
+  char *cache_key = cache_compute_key(api_url, 0, "wikipedia");
+  if (cache_key && get_cache_ttl_infobox() > 0) {
+    char *cached_data = NULL;
+    size_t cached_size = 0;
+    if (cache_get(cache_key, get_cache_ttl_infobox(), &cached_data,
+                  &cached_size) == 0 &&
+        cached_data && cached_size > 0) {
+      xmlDocPtr doc =
+          xmlReadMemory(cached_data, cached_size, "noname.xml", NULL, 0);
+      if (doc != NULL) {
+        xmlNode *root_element = xmlDocGetRootElement(doc);
+        extract_wiki_info(root_element, &info);
+        xmlFreeDoc(doc);
+      }
+      free(cached_data);
+      free(cache_key);
+      return info;
+    }
+    free(cached_data);
+  }
+  free(cache_key);
+
   chunk.memory = malloc(1);
   chunk.size = 0;
 
@@ -132,7 +159,13 @@ InfoBox fetch_wiki_data(char *api_url) {
 
     res = curl_easy_perform(curl_handle);
 
-    if (res == CURLE_OK) {
+    if (res == CURLE_OK && chunk.size > 0) {
+      cache_key = cache_compute_key(api_url, 0, "wikipedia");
+      if (cache_key && get_cache_ttl_infobox() > 0) {
+        cache_set(cache_key, chunk.memory, chunk.size);
+      }
+      free(cache_key);
+
       xmlDocPtr doc =
           xmlReadMemory(chunk.memory, chunk.size, "noname.xml", NULL, 0);
       if (doc != NULL) {

+ 19 - 1
src/Main.c

@@ -5,7 +5,9 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#include "Cache/Cache.h"
 #include "Config.h"
+#include "Infobox/Wikipedia.h"
 #include "Proxy/Proxy.h"
 #include "Routes/Home.h"
 #include "Routes/ImageProxy.h"
@@ -37,12 +39,27 @@ int main() {
                    .proxy_list_file = "",
                    .max_proxy_retries = 3,
                    .randomize_username = 0,
-                   .randomize_password = 0};
+                   .randomize_password = 0,
+                   .cache_dir = "/tmp/omnisearch_cache",
+                   .cache_ttl_search = 3600,
+                   .cache_ttl_infobox = 86400};
 
   if (load_config("config.ini", &config) != 0) {
     fprintf(stderr, "Warning: Could not load config file, using defaults\n");
   }
 
+  if (cache_init(config.cache_dir) != 0) {
+    fprintf(
+        stderr,
+        "Warning: Failed to initialize cache, continuing without caching\n");
+  } else {
+    fprintf(stderr, "Cache initialized at %s\n", config.cache_dir);
+    cache_cleanup(config.cache_ttl_search);
+  }
+
+  set_cache_ttl_search(config.cache_ttl_search);
+  set_cache_ttl_infobox(config.cache_ttl_infobox);
+
   if (config.proxy_list_file[0] != '\0') {
     if (load_proxy_list(config.proxy_list_file) < 0) {
       fprintf(
@@ -82,5 +99,6 @@ int main() {
   curl_global_cleanup();
   xmlCleanupParser();
   free_proxy_list();
+  cache_shutdown();
   return EXIT_SUCCESS;
 }

+ 59 - 14
src/Scraping/Scraping.c

@@ -1,4 +1,5 @@
 #include "Scraping.h"
+#include "../Cache/Cache.h"
 #include "../Proxy/Proxy.h"
 #include "../Utility/Unescape.h"
 #include <curl/curl.h>
@@ -368,6 +369,10 @@ retry:
   for (int i = 0; i < num_jobs; i++) {
     ScrapeJob *job = &jobs[i];
 
+    char cache_key[64];
+    char full_url[1024];
+    char *encoded_query = NULL;
+
     if (job->handle) {
       curl_easy_cleanup(job->handle);
       job->handle = NULL;
@@ -376,20 +381,8 @@ retry:
       free(job->response.memory);
     }
 
-    job->handle = curl_easy_init();
-    if (!job->handle) {
-      continue;
-    }
-
-    job->response.memory = (char *)malloc(16384);
-    job->response.size = 0;
-    job->response.capacity = 16384;
-
-    char full_url[1024];
-    char *encoded_query = curl_easy_escape(job->handle, job->query, 0);
+    encoded_query = curl_easy_escape(NULL, job->query, 0);
     if (!encoded_query) {
-      curl_easy_cleanup(job->handle);
-      job->handle = NULL;
       continue;
     }
 
@@ -399,7 +392,52 @@ retry:
 
     snprintf(full_url, sizeof(full_url), "%s%s&%s=%d", job->engine->base_url,
              encoded_query, job->engine->page_param, page_value);
-    curl_free(encoded_query);
+
+    char *key = cache_compute_key(job->query, job->page, job->engine->name);
+    if (key) {
+      strncpy(cache_key, key, sizeof(cache_key) - 1);
+      cache_key[sizeof(cache_key) - 1] = '\0';
+      free(key);
+    } else {
+      snprintf(cache_key, sizeof(cache_key), "uncached_%d_%s", i,
+               job->engine->name);
+    }
+
+    char *cached_data = NULL;
+    size_t cached_size = 0;
+    int cache_hit = 0;
+
+    if (get_cache_ttl_search() > 0 &&
+        cache_get(cache_key, (time_t)get_cache_ttl_search(), &cached_data,
+                  &cached_size) == 0 &&
+        cached_data && cached_size > 0) {
+      xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL,
+                                     HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
+                                         HTML_PARSE_NOWARNING);
+      if (doc) {
+        job->results_count = job->engine->parser(
+            job->engine->name, doc, job->out_results, job->max_results);
+        xmlFreeDoc(doc);
+        cache_hit = 1;
+      }
+      free(cached_data);
+    }
+
+    if (cache_hit) {
+      free(encoded_query);
+      job->results_count = job->results_count > 0 ? job->results_count : 0;
+      continue;
+    }
+
+    job->handle = curl_easy_init();
+    if (!job->handle) {
+      free(encoded_query);
+      continue;
+    }
+
+    job->response.memory = (char *)malloc(16384);
+    job->response.size = 0;
+    job->response.capacity = 16384;
 
     struct curl_slist *headers = NULL;
     char host_buf[256], ref_buf[256];
@@ -451,6 +489,13 @@ retry:
           curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code);
 
           if (msg->data.result == CURLE_OK && job->response.size > 0) {
+            char *key =
+                cache_compute_key(job->query, job->page, job->engine->name);
+            if (key && get_cache_ttl_search() > 0) {
+              cache_set(key, job->response.memory, job->response.size);
+              free(key);
+            }
+
             xmlDocPtr doc = htmlReadMemory(
                 job->response.memory, job->response.size, NULL, NULL,
                 HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);