| author | Alexandre Flament <alex@al-f.net> | 2021-09-22 13:36:34 +0200 |
|---|---|---|
| committer | Alexandre Flament <alex@al-f.net> | 2021-09-28 15:26:02 +0200 |
| commit | 2eab89b4ca12a404390690210f885664fa26c173 (patch) | |
| tree | 32c86842f7b0a2b4c678417a8d2e23fbbe11e8af /searx | |
| parent | ca67f1555aa3c7dbb8dbb7ace7d4a0be9b65c717 (diff) | |
[fix] checker: fix memory usage
* download images using the "image_proxy" network (HTTP/1 instead of HTTP/2)
* don't cache data: URLs (reduces memory usage)
* after each test: purge the image URL cache, then call the garbage collector
* download only the first 64kb of images
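
A minimal sketch (not the searx code itself) of the caching pattern these points describe: the `lru_cache`'d check only ever receives http(s) URLs because `data:` URLs are answered in an uncached wrapper, and each test run ends with a cache purge plus an explicit garbage collection. Function names and the body of the cached check are illustrative placeholders.

```python
import functools
import gc


@functools.lru_cache(maxsize=8192)
def _cached_remote_check(image_url: str) -> bool:
    # Hypothetical stand-in for the real network check; only http(s) URLs
    # ever reach this function, so the cache keys stay small.
    return image_url.startswith(('http://', 'https://'))


def is_url_image(image_url) -> bool:
    if not isinstance(image_url, str):
        return False
    if image_url.startswith('data:'):
        # Answered here, before the cached function: a data: URL can embed
        # the whole image and would otherwise become a huge lru_cache key.
        return image_url.startswith('data:image/')
    return _cached_remote_check(image_url)


def after_each_test() -> None:
    _cached_remote_check.cache_clear()  # purge the image URL cache ...
    gc.collect()                        # ... then force a collection
```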
Diffstat (limited to 'searx')
-rw-r--r-- | searx/search/checker/impl.py | 59 |
1 file changed, 40 insertions, 19 deletions
diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py
index dd090c51..990fd1f6 100644
--- a/searx/search/checker/impl.py
+++ b/searx/search/checker/impl.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 
+import gc
 import typing
 import types
 import functools
@@ -14,6 +15,7 @@ from langdetect.lang_detect_exception import LangDetectException
 import httpx
 
 from searx import network, logger
+from searx.utils import gen_useragent
 from searx.results import ResultContainer
 from searx.search.models import SearchQuery, EngineRef
 from searx.search.processors import EngineProcessor
@@ -58,27 +60,20 @@ def _is_url(url):
 
 
 @functools.lru_cache(maxsize=8192)
-def _is_url_image(image_url):
-    if not isinstance(image_url, str):
-        return False
-
-    if image_url.startswith('//'):
-        image_url = 'https:' + image_url
-
-    if image_url.startswith('data:'):
-        return image_url.startswith('data:image/')
-
-    if not _is_url(image_url):
-        return False
-
+def _download_and_check_if_image(image_url: str) -> bool:
+    """Download an URL and check if the Content-Type starts with "image/"
+    This function should not be called directly: use _is_url_image
+    otherwise the cache of functools.lru_cache contains data: URL which might be huge.
+    """
     retry = 2
 
     while retry > 0:
         a = time()
         try:
-            network.set_timeout_for_thread(10.0, time())
-            r = network.get(image_url, timeout=10.0, allow_redirects=True, headers={
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
+            # use "image_proxy" (avoid HTTP/2)
+            network.set_context_network_name('image_proxy')
+            stream = network.stream('GET', image_url, timeout=10.0, allow_redirects=True, headers={
+                'User-Agent': gen_useragent(),
                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                 'Accept-Language': 'en-US;q=0.5,en;q=0.3',
                 'Accept-Encoding': 'gzip, deflate, br',
@@ -88,15 +83,37 @@ def _is_url_image(image_url):
                 'Sec-GPC': '1',
                 'Cache-Control': 'max-age=0'
             })
-            if r.headers["content-type"].startswith('image/'):
-                return True
-            return False
+            r = next(stream)
+            r.close()
+            is_image = r.headers["content-type"].startswith('image/')
+            del r
+            del stream
+            return is_image
         except httpx.TimeoutException:
            logger.error('Timeout for %s: %i', image_url, int(time() - a))
             retry -= 1
         except httpx.HTTPError:
             logger.exception('Exception for %s', image_url)
             return False
+    return False
+
+
+def _is_url_image(image_url) -> bool:
+    """Normalize image_url
+    """
+    if not isinstance(image_url, str):
+        return False
+
+    if image_url.startswith('//'):
+        image_url = 'https:' + image_url
+
+    if image_url.startswith('data:'):
+        return image_url.startswith('data:image/')
+
+    if not _is_url(image_url):
+        return False
+
+    return _download_and_check_if_image(image_url)
 
 
 def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]:
@@ -414,3 +431,7 @@ class Checker:
     def run(self):
         for test_name in self.tests:
             self.run_test(test_name)
+        # clear cache
+        _download_and_check_if_image.cache_clear()
+        # force a garbage collector
+        gc.collect()
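
The hunk above closes the stream right after the first response object arrives, so only the headers are inspected; the "first 64kb" limit mentioned in the commit message is not visible in this excerpt. As a hedged illustration with plain httpx (not the searx `network` wrapper; the function name and exact limit are assumptions based on the commit message), capping a streamed image download could look like this:

```python
import httpx

MAX_BYTES = 64 * 1024  # assumed cap, taken from the commit message


def looks_like_image(url: str) -> bool:
    # Stream the response so the body is never loaded into memory at once.
    with httpx.stream('GET', url, timeout=10.0, follow_redirects=True) as response:
        if not response.headers.get('content-type', '').startswith('image/'):
            return False
        read = 0
        for chunk in response.iter_bytes():
            read += len(chunk)
            if read >= MAX_BYTES:
                break  # stop after roughly the first 64 KiB
    return True
```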