소스 검색

feature: added caching

frosty 1주 전
부모
커밋
e33310f263
8개의 파일 변경, 185줄 추가, 18줄 삭제
  1. 1 1
      Makefile
  2. 11 1
      example-config.ini
  3. 9 0
      src/Config.c
  4. 3 0
      src/Config.h
  5. 49 0
      src/Infobox/Dictionary.c
  6. 34 1
      src/Infobox/Wikipedia.c
  7. 19 1
      src/Main.c
  8. 59 14
      src/Scraping/Scraping.c

+ 1 - 1
Makefile

@@ -10,7 +10,7 @@ else
     LDFLAGS :=
 endif
 
-LIBS    := -lbeaker -lcurl -lxml2 -lpthread -lm
+LIBS    := -lbeaker -lcurl -lxml2 -lpthread -lm -lssl -lcrypto
 
 SRC_DIR := src
 BIN_DIR := bin

+ 11 - 1
example-config.ini

@@ -1,7 +1,7 @@
 [server]
 host = 0.0.0.0
 port = 8000
-     
+
 [proxy]
 # Single proxy (comment out to use list_file instead)
 #proxy = "socks5://127.0.0.1:9050"
@@ -14,3 +14,13 @@ port = 8000
 # Randomize proxy credentials for each request
 #randomize_username = true
 #randomize_password = true
+
+[cache]
+# Directory to store cached responses
+#dir = /tmp/omnisearch_cache
+
+# Cache TTL for search results in seconds (default: 3600 = 1 hour)
+#ttl_search = 3600
+
+# Cache TTL for infobox data in seconds (default: 86400 = 24 hours)
+#ttl_infobox = 86400

+ 9 - 0
src/Config.c

@@ -80,6 +80,15 @@ int load_config(const char *filename, Config *config) {
         } else if (strcmp(key, "randomize_password") == 0) {
           config->randomize_password = atoi(value);
         }
+      } else if (strcmp(section, "cache") == 0) {
+        if (strcmp(key, "dir") == 0) {
+          strncpy(config->cache_dir, value, sizeof(config->cache_dir) - 1);
+          config->cache_dir[sizeof(config->cache_dir) - 1] = '\0';
+        } else if (strcmp(key, "ttl_search") == 0) {
+          config->cache_ttl_search = atoi(value);
+        } else if (strcmp(key, "ttl_infobox") == 0) {
+          config->cache_ttl_infobox = atoi(value);
+        }
       }
     }
   }

+ 3 - 0
src/Config.h

@@ -9,6 +9,9 @@ typedef struct {
   int max_proxy_retries;
   int randomize_username;
   int randomize_password;
+  char cache_dir[512];
+  int cache_ttl_search;
+  int cache_ttl_infobox;
 } Config;
 
 int load_config(const char *filename, Config *config);

+ 49 - 0
src/Infobox/Dictionary.c

@@ -1,4 +1,5 @@
 #include "Dictionary.h"
+#include "../Cache/Cache.h"
 #include "../Proxy/Proxy.h"
 #include "../Scraping/Scraping.h"
 #include <ctype.h>
@@ -266,6 +267,48 @@ InfoBox fetch_dictionary_data(const char *query) {
   if (!url)
     return info;
 
+  char *cache_key = cache_compute_key(url, 0, "dictionary");
+  if (cache_key && get_cache_ttl_infobox() > 0) {
+    char *cached_data = NULL;
+    size_t cached_size = 0;
+    if (cache_get(cache_key, (time_t)get_cache_ttl_infobox(), &cached_data,
+                  &cached_size) == 0 &&
+        cached_data && cached_size > 0) {
+      htmlDocPtr doc = htmlReadMemory(cached_data, cached_size, url, NULL,
+                                      HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
+                                          HTML_PARSE_NOWARNING);
+      if (doc) {
+        char *word = xpath_text(doc, "//span[@class='hw dhw']");
+        char *pron = xpath_text(
+            doc,
+            "//span[@class='us dpron-i']//span[@class='ipa dipa lpr-2 lpl-1']");
+        char *pos = xpath_text(doc, "//span[@class='pos dpos']");
+        char *def = xpath_text(doc, "(//div[@class='def ddef_d db'])[1]");
+        char *ex = xpath_text(doc, "(//span[@class='eg deg'])[1]");
+
+        if (word && def) {
+          info.title = strdup("Dictionary");
+          info.extract = build_html(word, pron, pos, def, ex);
+          info.thumbnail_url = strdup("/static/dictionary.jpg");
+          info.url = strdup(url);
+        }
+
+        free(word);
+        free(pron);
+        free(pos);
+        free(def);
+        free(ex);
+        xmlFreeDoc(doc);
+      }
+      free(cached_data);
+      free(cache_key);
+      free(url);
+      return info;
+    }
+    free(cached_data);
+  }
+  free(cache_key);
+
   CURL *curl = curl_easy_init();
   if (!curl) {
     free(url);
@@ -281,6 +324,12 @@ InfoBox fetch_dictionary_data(const char *query) {
   apply_proxy_settings(curl);
 
   if (curl_easy_perform(curl) == CURLE_OK && chunk.size > 0) {
+    cache_key = cache_compute_key(url, 0, "dictionary");
+    if (cache_key && get_cache_ttl_infobox() > 0) {
+      cache_set(cache_key, chunk.memory, chunk.size);
+    }
+    free(cache_key);
+
     htmlDocPtr doc = htmlReadMemory(chunk.memory, chunk.size, url, NULL,
                                     HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
                                         HTML_PARSE_NOWARNING);

+ 34 - 1
src/Infobox/Wikipedia.c

@@ -1,4 +1,5 @@
 #include "Wikipedia.h"
+#include "../Cache/Cache.h"
 #include "../Proxy/Proxy.h"
 #include "../Scraping/Scraping.h"
 #include <curl/curl.h>
@@ -117,6 +118,32 @@ InfoBox fetch_wiki_data(char *api_url) {
   struct WikiMemoryStruct chunk;
   InfoBox info = {NULL, NULL, NULL, NULL};
 
+  if (!api_url) {
+    return info;
+  }
+
+  char *cache_key = cache_compute_key(api_url, 0, "wikipedia");
+  if (cache_key && get_cache_ttl_infobox() > 0) {
+    char *cached_data = NULL;
+    size_t cached_size = 0;
+    if (cache_get(cache_key, get_cache_ttl_infobox(), &cached_data,
+                  &cached_size) == 0 &&
+        cached_data && cached_size > 0) {
+      xmlDocPtr doc =
+          xmlReadMemory(cached_data, cached_size, "noname.xml", NULL, 0);
+      if (doc != NULL) {
+        xmlNode *root_element = xmlDocGetRootElement(doc);
+        extract_wiki_info(root_element, &info);
+        xmlFreeDoc(doc);
+      }
+      free(cached_data);
+      free(cache_key);
+      return info;
+    }
+    free(cached_data);
+  }
+  free(cache_key);
+
   chunk.memory = malloc(1);
   chunk.size = 0;
 
@@ -132,7 +159,13 @@ InfoBox fetch_wiki_data(char *api_url) {
 
     res = curl_easy_perform(curl_handle);
 
-    if (res == CURLE_OK) {
+    if (res == CURLE_OK && chunk.size > 0) {
+      cache_key = cache_compute_key(api_url, 0, "wikipedia");
+      if (cache_key && get_cache_ttl_infobox() > 0) {
+        cache_set(cache_key, chunk.memory, chunk.size);
+      }
+      free(cache_key);
+
       xmlDocPtr doc =
           xmlReadMemory(chunk.memory, chunk.size, "noname.xml", NULL, 0);
       if (doc != NULL) {

+ 19 - 1
src/Main.c

@@ -5,7 +5,9 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#include "Cache/Cache.h"
 #include "Config.h"
+#include "Infobox/Wikipedia.h"
 #include "Proxy/Proxy.h"
 #include "Routes/Home.h"
 #include "Routes/ImageProxy.h"
@@ -37,12 +39,27 @@ int main() {
                    .proxy_list_file = "",
                    .max_proxy_retries = 3,
                    .randomize_username = 0,
-                   .randomize_password = 0};
+                   .randomize_password = 0,
+                   .cache_dir = "/tmp/omnisearch_cache",
+                   .cache_ttl_search = 3600,
+                   .cache_ttl_infobox = 86400};
 
   if (load_config("config.ini", &config) != 0) {
     fprintf(stderr, "Warning: Could not load config file, using defaults\n");
   }
 
+  if (cache_init(config.cache_dir) != 0) {
+    fprintf(
+        stderr,
+        "Warning: Failed to initialize cache, continuing without caching\n");
+  } else {
+    fprintf(stderr, "Cache initialized at %s\n", config.cache_dir);
+    cache_cleanup(config.cache_ttl_search);
+  }
+
+  set_cache_ttl_search(config.cache_ttl_search);
+  set_cache_ttl_infobox(config.cache_ttl_infobox);
+
   if (config.proxy_list_file[0] != '\0') {
     if (load_proxy_list(config.proxy_list_file) < 0) {
       fprintf(
@@ -82,5 +99,6 @@ int main() {
   curl_global_cleanup();
   xmlCleanupParser();
   free_proxy_list();
+  cache_shutdown();
   return EXIT_SUCCESS;
 }

+ 59 - 14
src/Scraping/Scraping.c

@@ -1,4 +1,5 @@
 #include "Scraping.h"
+#include "../Cache/Cache.h"
 #include "../Proxy/Proxy.h"
 #include "../Utility/Unescape.h"
 #include <curl/curl.h>
@@ -368,6 +369,10 @@ retry:
   for (int i = 0; i < num_jobs; i++) {
     ScrapeJob *job = &jobs[i];
 
+    char cache_key[64];
+    char full_url[1024];
+    char *encoded_query = NULL;
+
     if (job->handle) {
       curl_easy_cleanup(job->handle);
       job->handle = NULL;
@@ -376,20 +381,8 @@ retry:
       free(job->response.memory);
     }
 
-    job->handle = curl_easy_init();
-    if (!job->handle) {
-      continue;
-    }
-
-    job->response.memory = (char *)malloc(16384);
-    job->response.size = 0;
-    job->response.capacity = 16384;
-
-    char full_url[1024];
-    char *encoded_query = curl_easy_escape(job->handle, job->query, 0);
+    encoded_query = curl_easy_escape(NULL, job->query, 0);
     if (!encoded_query) {
-      curl_easy_cleanup(job->handle);
-      job->handle = NULL;
       continue;
     }
 
@@ -399,7 +392,52 @@ retry:
 
     snprintf(full_url, sizeof(full_url), "%s%s&%s=%d", job->engine->base_url,
              encoded_query, job->engine->page_param, page_value);
-    curl_free(encoded_query);
+
+    char *key = cache_compute_key(job->query, job->page, job->engine->name);
+    if (key) {
+      strncpy(cache_key, key, sizeof(cache_key) - 1);
+      cache_key[sizeof(cache_key) - 1] = '\0';
+      free(key);
+    } else {
+      snprintf(cache_key, sizeof(cache_key), "uncached_%d_%s", i,
+               job->engine->name);
+    }
+
+    char *cached_data = NULL;
+    size_t cached_size = 0;
+    int cache_hit = 0;
+
+    if (get_cache_ttl_search() > 0 &&
+        cache_get(cache_key, (time_t)get_cache_ttl_search(), &cached_data,
+                  &cached_size) == 0 &&
+        cached_data && cached_size > 0) {
+      xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL,
+                                     HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
+                                         HTML_PARSE_NOWARNING);
+      if (doc) {
+        job->results_count = job->engine->parser(
+            job->engine->name, doc, job->out_results, job->max_results);
+        xmlFreeDoc(doc);
+        cache_hit = 1;
+      }
+      free(cached_data);
+    }
+
+    if (cache_hit) {
+      free(encoded_query);
+      job->results_count = job->results_count > 0 ? job->results_count : 0;
+      continue;
+    }
+
+    job->handle = curl_easy_init();
+    if (!job->handle) {
+      free(encoded_query);
+      continue;
+    }
+
+    job->response.memory = (char *)malloc(16384);
+    job->response.size = 0;
+    job->response.capacity = 16384;
 
     struct curl_slist *headers = NULL;
     char host_buf[256], ref_buf[256];
@@ -451,6 +489,13 @@ retry:
           curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code);
 
           if (msg->data.result == CURLE_OK && job->response.size > 0) {
+            char *key =
+                cache_compute_key(job->query, job->page, job->engine->name);
+            if (key && get_cache_ttl_search() > 0) {
+              cache_set(key, job->response.memory, job->response.size);
+              free(key);
+            }
+
             xmlDocPtr doc = htmlReadMemory(
                 job->response.memory, job->response.size, NULL, NULL,
                 HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);