summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarkus Heiser <markus.heiser@darmarit.de>2024-01-13 14:06:26 +0100
committerMarkus Heiser <markus.heiser@darmarIT.de>2024-01-15 19:23:26 +0100
commite560d7e373a2d083590bb75014f6b1e801775410 (patch)
treedf7c243d2649db290f7b2c6e3db5cca0965d4f2b
parenta2c269bbac95879be02423cb834833654e22521b (diff)
[mod] presearch: add language & region support
In Presearch there are languages for the UI and regions for narrowing down the search. With this change the SearXNG engine supports a search by region. The details can be found in the documentation of the source code. To test, you can search terms like:: !presearch bmw :zh-TW !presearch bmw :en-CA 1. You should get results corresponding to the region (Taiwan, Canada) 2. and in the language (Chinese, English). 3. The context in info box content is in the same language. Exceptions: 1. Region or language is not supported by Presearch or 2. SearXNG user did not select a region tag, example:: !presearch bmw :en Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
-rw-r--r--docs/dev/engines/online/presearch.rst13
-rw-r--r--searx/engines/presearch.py117
2 files changed, 113 insertions, 17 deletions
diff --git a/docs/dev/engines/online/presearch.rst b/docs/dev/engines/online/presearch.rst
new file mode 100644
index 00000000..59332c35
--- /dev/null
+++ b/docs/dev/engines/online/presearch.rst
@@ -0,0 +1,13 @@
+.. _engine presearch:
+
+================
+Presearch Engine
+================
+
+.. contents::
+ :depth: 2
+ :local:
+ :backlinks: entry
+
+.. automodule:: searx.engines.presearch
+ :members:
diff --git a/searx/engines/presearch.py b/searx/engines/presearch.py
index 74739382..baf692d6 100644
--- a/searx/engines/presearch.py
+++ b/searx/engines/presearch.py
@@ -1,23 +1,72 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
-"""Presearch (general, images, videos, news)
+"""Presearch supports the search types listed in :py:obj:`search_type` (general,
+images, videos, news).
-.. hint::
+Configured ``presearch`` engines:
+
+.. code:: yaml
+
+ - name: presearch
+ engine: presearch
+ search_type: search
+ categories: [general, web]
+
+ - name: presearch images
+ ...
+ search_type: images
+ categories: [images, web]
- The results in the video category are most often links to pages that contain
- a video, for instance many links from preasearch's video category link
- content from facebook (aka Meta) or Twitter (aka X). Since these are not
- real links to video streams SearXNG can't use the video template for this and
- if SearXNG can't use this template, then the user doesn't want to see these
- hits in the videos category.
+ - name: presearch videos
+ ...
+ search_type: videos
+ categories: [general, web]
+
+ - name: presearch news
+ ...
+ search_type: news
+ categories: [news, web]
+
+.. hint::
- TL;DR; by default presearch's video category is placed into categories::
+ By default Presearch's video category is intentionally placed into::
categories: [general, web]
+
+Search type ``videos``
+======================
+
+The results in the video category are most often links to pages that contain a
+video, for instance many links from Presearch's video category link content
+from facebook (aka Meta) or Twitter (aka X). Since these are not real links to
+video streams SearXNG can't use the video template for this and if SearXNG can't
+use this template, then the user doesn't want to see these hits in the videos
+category.
+
+
+Languages & Regions
+===================
+
+In Presearch there are languages for the UI and regions for narrowing down the
+search. If we set "auto" for the region in the WEB-UI of Presearch and cookie
+``use_local_search_results=false``, then the defaults are set for both (the
+language and the region) from the ``Accept-Language`` header.
+
+Since the region is already "auto" by default, we only need to set the
+``use_local_search_results`` cookie and send the ``Accept-Language`` header. We
+have to set these values in both requests we send to Presearch; in the first
+request to get the request-ID from Presearch and in the final request to get the
+result list (see ``send_accept_language_header``).
+
+
+Implementations
+===============
+
"""
from urllib.parse import urlencode
+from searx import locales
from searx.network import get
from searx.utils import gen_useragent, html_to_text
@@ -32,6 +81,7 @@ about = {
paging = True
safesearch = True
time_range_support = True
+send_accept_language_header = True
categories = ["general", "web"] # general, images, videos, news
search_type = "search"
@@ -46,19 +96,43 @@ def init(_):
raise ValueError(f'presearch search_type: {search_type}')
-def _get_request_id(query, page, time_range, safesearch_param):
+def _get_request_id(query, params):
+
args = {
"q": query,
- "page": page,
+ "page": params["pageno"],
}
- if time_range:
- args["time"] = time_range
+
+ if params["time_range"]:
+ args["time"] = params["time_range"]
url = f"{base_url}/{search_type}?{urlencode(args)}"
+
headers = {
'User-Agent': gen_useragent(),
- 'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch_param]}",
+ 'Cookie': (
+ f"b=1;"
+ f" presearch_session=;"
+ f" use_local_search_results=false;"
+ f" use_safe_search={safesearch_map[params['safesearch']]}"
+ ),
}
+ if params['searxng_locale'] != 'all':
+ l = locales.get_locale(params['searxng_locale'])
+
+ # Presearch narrows down the search by region. In SearXNG when the user
+ # does not set a region (e.g. 'en-CA' / canada) we cannot hand over a
+ # region.
+
+ # We could possibly use searx.locales.get_official_locales to determine
+ # in which regions this language is an official one, but then we still
+ # wouldn't know which region should be given more weight / Presearch
+ # performs an IP-based geolocation of the user, we don't want that in
+ # SearXNG ;-)
+
+ if l.territory:
+ headers['Accept-Language'] = f"{l.language}-{l.territory},{l.language};" "q=0.9,*;" "q=0.5"
+
resp_text = get(url, headers=headers).text # type: ignore
for line in resp_text.split("\n"):
@@ -69,8 +143,7 @@ def _get_request_id(query, page, time_range, safesearch_param):
def request(query, params):
- request_id = _get_request_id(query, params["pageno"], params["time_range"], params["safesearch"])
-
+ request_id = _get_request_id(query, params)
params["headers"]["Accept"] = "application/json"
params["url"] = f"{base_url}/results?id={request_id}"
@@ -109,7 +182,17 @@ def parse_search_query(json_results):
if info:
attributes = []
for item in info.get('about', []):
- label, value = html_to_text(item).split(':', 1)
+
+ text = html_to_text(item)
+ if ':' in text:
+ # split text into key / value
+ label, value = text.split(':', 1)
+ else:
+ # In other languages (tested with zh-TW) a colon is represented
+ # by a different symbol --> then we split at the first space.
+ label, value = text.split(' ', 1)
+ label = label[:-1]
+
value = _strip_leading_strings(value)
attributes.append({'label': label, 'value': value})
content = []