author    Alexandre Flament <alex@al-f.net>    2021-09-22 13:36:34 +0200
committer Alexandre Flament <alex@al-f.net>    2021-09-28 15:26:02 +0200
commit    2eab89b4ca12a404390690210f885664fa26c173 (patch)
tree      32c86842f7b0a2b4c678417a8d2e23fbbe11e8af /searx/search
parent    ca67f1555aa3c7dbb8dbb7ace7d4a0be9b65c717 (diff)
[fix] checker: fix memory usage
* download images using the "image_proxy" network (HTTP/1 instead of HTTP/2)
* don't cache data: URLs (reduces memory usage)
* after each test: purge the image URL cache, then call the garbage collector
* download only the first 64kB of images
Diffstat (limited to 'searx/search')
-rw-r--r--  searx/search/checker/impl.py  59
1 file changed, 40 insertions(+), 19 deletions(-)
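The memory win comes from streaming: with a streamed HTTP request the headers arrive before any of the body, so the Content-Type can be inspected after reading at most the first chunk. A minimal sketch of that idea using plain httpx rather than the searx.network wrapper shown in the diff below (the function name is illustrative):

import httpx

def content_type_is_image(url: str) -> bool:
    # A streamed response does not read the body eagerly: entering the
    # context sends the request and parses only the headers; closing
    # the context discards the unread body.
    with httpx.stream('GET', url, timeout=10.0, follow_redirects=True) as response:
        return response.headers.get('content-type', '').startswith('image/')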
diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py
index dd090c51..990fd1f6 100644
--- a/searx/search/checker/impl.py
+++ b/searx/search/checker/impl.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
+import gc
 import typing
 import types
 import functools
@@ -14,6 +15,7 @@ from langdetect.lang_detect_exception import LangDetectException
 import httpx
 from searx import network, logger
+from searx.utils import gen_useragent
 from searx.results import ResultContainer
 from searx.search.models import SearchQuery, EngineRef
 from searx.search.processors import EngineProcessor
@@ -58,27 +60,20 @@ def _is_url(url):
 @functools.lru_cache(maxsize=8192)
-def _is_url_image(image_url):
-    if not isinstance(image_url, str):
-        return False
-
-    if image_url.startswith('//'):
-        image_url = 'https:' + image_url
-
-    if image_url.startswith('data:'):
-        return image_url.startswith('data:image/')
-
-    if not _is_url(image_url):
-        return False
-
+def _download_and_check_if_image(image_url: str) -> bool:
+    """Download a URL and check whether its Content-Type starts with "image/".
+    Do not call this function directly: use _is_url_image instead, otherwise
+    the functools.lru_cache cache would contain data: URLs, which can be huge.
+    """
     retry = 2
     while retry > 0:
         a = time()
         try:
-            network.set_timeout_for_thread(10.0, time())
-            r = network.get(image_url, timeout=10.0, allow_redirects=True, headers={
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
+            # use "image_proxy" (avoid HTTP/2)
+            network.set_context_network_name('image_proxy')
+            stream = network.stream('GET', image_url, timeout=10.0, allow_redirects=True, headers={
+                'User-Agent': gen_useragent(),
                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                 'Accept-Language': 'en-US;q=0.5,en;q=0.3',
                 'Accept-Encoding': 'gzip, deflate, br',
@@ -88,15 +83,37 @@ def _is_url_image(image_url):
                 'Sec-GPC': '1',
                 'Cache-Control': 'max-age=0'
             })
-            if r.headers["content-type"].startswith('image/'):
-                return True
-            return False
+            r = next(stream)
+            r.close()
+            is_image = r.headers["content-type"].startswith('image/')
+            del r
+            del stream
+            return is_image
         except httpx.TimeoutException:
             logger.error('Timeout for %s: %i', image_url, int(time() - a))
             retry -= 1
         except httpx.HTTPError:
             logger.exception('Exception for %s', image_url)
             return False
+    return False
+
+
+def _is_url_image(image_url) -> bool:
+ """Normalize image_url
+ """
+    if not isinstance(image_url, str):
+        return False
+
+    if image_url.startswith('//'):
+        image_url = 'https:' + image_url
+
+    if image_url.startswith('data:'):
+        return image_url.startswith('data:image/')
+
+    if not _is_url(image_url):
+        return False
+
+    return _download_and_check_if_image(image_url)
 def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]:
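The point of splitting _is_url_image in two: functools.lru_cache keys the cache on the argument string, so a data: URL, which embeds the whole base64-encoded image, would be pinned in memory by the cache. The new code answers data: URLs inline and only lets compact http(s) URLs reach the cached helper. A hypothetical stand-alone sketch of the pattern (the names and the stub body are illustrative):

import functools

@functools.lru_cache(maxsize=8192)
def _cached_network_check(url: str) -> bool:
    # stand-in for the real HTTP request: only short http(s) URLs
    # ever become cache keys here
    return True

def is_image_url(url: str) -> bool:
    if url.startswith('data:'):
        # answered inline, never cached: the URL string itself can be megabytes
        return url.startswith('data:image/')
    return _cached_network_check(url)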
@@ -414,3 +431,7 @@ class Checker:
     def run(self):
         for test_name in self.tests:
             self.run_test(test_name)
+            # purge the image URL cache
+            _download_and_check_if_image.cache_clear()
+            # force a garbage collection
+            gc.collect()
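Functions wrapped by functools.lru_cache expose a cache_clear() method, which is what Checker.run relies on here; cache_clear() only drops the cached references, and the explicit gc.collect() then forces a full collection pass so the memory is returned promptly. A self-contained sketch of the purge-then-collect pattern (the harness names are illustrative):

import functools
import gc

@functools.lru_cache(maxsize=8192)
def expensive_check(key: str) -> bool:
    return len(key) % 2 == 0  # placeholder for real work

def run_all(tests):
    for test in tests:
        test()
        expensive_check.cache_clear()  # drop all cached entries...
        gc.collect()                   # ...and reclaim the memory now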