summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Bnyro <bnyro@tutanota.com> 2023-09-14 13:31:54 +0200
committer: Markus Heiser <markus.heiser@darmarIT.de> 2023-11-27 14:16:42 +0100
commit: 23582aac5c599279461f2740928c33cffc944083 (patch)
tree: 9c2db3ef19cfba026cf1b28b7906d4c29df2cc5e
parent: 99fb565b391b04d10bd2f41fe6ecebb438fc4240 (diff)
[feat] implementation of presearch engine
-rw-r--r-- searx/engines/presearch.py | 102
-rw-r--r-- searx/settings.yml | 30
2 files changed, 132 insertions, 0 deletions
diff --git a/searx/engines/presearch.py b/searx/engines/presearch.py
new file mode 100644
index 00000000..c41cf3b3
--- /dev/null
+++ b/searx/engines/presearch.py
@@ -0,0 +1,102 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Presearch (general, images, videos, news)
+"""
+
+from urllib.parse import urlencode
+from searx.network import get
+from searx.utils import gen_useragent, html_to_text
+
+about = {
+ "website": "https://presearch.io",
+ "wikidiata_id": "Q7240905",
+ "official_api_documentation": "https://docs.presearch.io/nodes/api",
+ "use_official_api": False,
+ "require_api_key": False,
+ "results": "JSON",
+}
+paging = True
+time_range_support = True
+categories = ["general", "web"] # general, images, videos, news
+
+search_type = "search" # must be any of "search", "images", "videos", "news"
+
+base_url = "https://presearch.com"
+safesearch_map = {0: 'false', 1: 'true', 2: 'true'}
+
+
+def _get_request_id(query, page, time_range, safesearch):
+ args = {
+ "q": query,
+ "page": page,
+ }
+ if time_range:
+ args["time_range"] = time_range
+
+ url = f"{base_url}/{search_type}?{urlencode(args)}"
+ headers = {
+ 'User-Agent': gen_useragent(),
+ 'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch]}",
+ }
+ resp_text = get(url, headers=headers).text
+
+ for line in resp_text.split("\n"):
+ if "window.searchId = " in line:
+ return line.split("= ")[1][:-1].replace('"', "")
+
+ return None
+
+
+def _is_valid_img_src(url):
+ # in some cases, the image url is a base64 encoded string, which has to be skipped
+ return "https://" in url
+
+
+def request(query, params):
+ request_id = _get_request_id(query, params["pageno"], params["time_range"], params["safesearch"])
+
+ params["headers"]["Accept"] = "application/json"
+ params["url"] = f"{base_url}/results?id={request_id}"
+
+ return params
+
+
+def response(resp):
+ results = []
+
+ json = resp.json()
+
+ json_results = []
+ if search_type == "search":
+ json_results = json['results'].get('standardResults', [])
+ else:
+ json_results = json.get(search_type, [])
+
+ for json_result in json_results:
+ result = {
+ 'url': json_result['link'],
+ 'title': json_result['title'],
+ 'content': html_to_text(json_result.get('description', '')),
+ }
+ if search_type == "images":
+ result['template'] = 'images.html'
+
+ if not _is_valid_img_src(json_result['image']):
+ continue
+
+ result['img_src'] = json_result['image']
+ if _is_valid_img_src(json_result['thumbnail']):
+ result['thumbnail'] = json_result['thumbnail']
+
+ elif search_type == "videos":
+ result['template'] = 'videos.html'
+
+ if _is_valid_img_src(json_result['image']):
+ result['thumbnail'] = json_result['image']
+
+ result['duration'] = json_result['duration']
+ result['length'] = json_result['duration']
+
+ results.append(result)
+
+ return results
diff --git a/searx/settings.yml b/searx/settings.yml
index 4437fb3a..0edf0176 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -1290,6 +1290,36 @@ engines:
# query_str: 'SELECT * from my_table WHERE my_column = %(query)s'
# shortcut : psql
+ - name: presearch
+ engine: presearch
+ search_type: search
+ categories: [general, web]
+ shortcut: ps
+
+ - name: presearch images
+ engine: presearch
+ search_type: images
+ categories: [images, web]
+ timeout: 4.0
+ shortcut: psimg
+ disabled: true
+
+ - name: presearch videos
+ engine: presearch
+ search_type: videos
+ categories: [videos, web]
+ timeout: 4.0
+ shortcut: psvid
+ disabled: true
+
+ - name: presearch news
+ engine: presearch
+ search_type: news
+ categories: [news, web]
+ timeout: 4.0
+ shortcut: psnews
+ disabled: true
+
- name: pub.dev
engine: xpath
shortcut: pd