diff options
Diffstat (limited to 'searx/engines/bing_news.py')
-rw-r--r-- | searx/engines/bing_news.py | 114 |
1 files changed, 63 insertions, 51 deletions
diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 81c8df0f..3585a1cd 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -1,6 +1,11 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint """Bing-News: description see :py:obj:`searx.engines.bing`. + +.. hint:: + + Bing News is *different* in some ways! + """ # pylint: disable=invalid-name @@ -10,11 +15,9 @@ from urllib.parse import urlencode from lxml import html +from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex from searx.enginelib.traits import EngineTraits -from searx.engines.bing import ( - set_bing_cookies, - _fetch_traits, -) +from searx.engines.bing import set_bing_cookies if TYPE_CHECKING: import logging @@ -37,57 +40,48 @@ about = { # engine dependent config categories = ['news'] paging = True +"""If you go through the pages and there are actually no new results for another +page, then bing returns the results from the last page again.""" + time_range_support = True time_map = { - 'day': '4', - 'week': '8', - 'month': '9', + 'day': 'interval="4"', + 'week': 'interval="7"', + 'month': 'interval="9"', } -"""A string '4' means *last hour*. We use *last hour* for ``day`` here since the +"""A string '4' means *last hour*. We use *last hour* for ``day`` here since the difference of *last day* and *last week* in the result list is just marginally. -""" +Bing does not have news range ``year`` / we use ``month`` instead.""" base_url = 'https://www.bing.com/news/infinitescrollajax' """Bing (News) search URL""" -bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-news-search/reference/market-codes' -"""Bing (News) search API description""" - -mkt_alias = { - 'zh': 'en-WW', - 'zh-CN': 'en-WW', -} -"""Bing News has an official market code 'zh-CN' but we won't get a result with -this market code. For 'zh' and 'zh-CN' we better use the *Worldwide aggregate* -market code (en-WW). 
-""" - def request(query, params): """Assemble a Bing-News request.""" - sxng_locale = params['searxng_locale'] - engine_region = traits.get_region(mkt_alias.get(sxng_locale, sxng_locale), traits.all_locale) - engine_language = traits.get_language(sxng_locale, 'en-us') - + engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore + engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore set_bing_cookies(params, engine_language, engine_region) # build URL query # # example: https://www.bing.com/news/infinitescrollajax?q=london&first=1 + page = int(params.get('pageno', 1)) - 1 query_params = { - # fmt: off 'q': query, 'InfiniteScroll': 1, # to simplify the page count lets use the default of 10 images per page - 'first' : (int(params.get('pageno', 1)) - 1) * 10 + 1, - # fmt: on + 'first': page * 10 + 1, + 'SFX': page, + 'form': 'PTFTNR', + 'setlang': engine_region.split('-')[0], + 'cc': engine_region.split('-')[-1], } if params['time_range']: - # qft=interval:"7" - query_params['qft'] = 'qft=interval="%s"' % time_map.get(params['time_range'], '9') + query_params['qft'] = time_map.get(params['time_range'], 'interval="9"') params['url'] = base_url + '?' 
+ urlencode(query_params) @@ -103,18 +97,34 @@ def response(resp): dom = html.fromstring(resp.text) - for newsitem in dom.xpath('//div[contains(@class, "newsitem")]'): + for newsitem in eval_xpath_list(dom, '//div[contains(@class, "newsitem")]'): + + link = eval_xpath_getindex(newsitem, './/a[@class="title"]', 0, None) + if link is None: + continue + url = link.attrib.get('href') + title = extract_text(link) + content = extract_text(eval_xpath(newsitem, './/div[@class="snippet"]')) + + metadata = [] + source = eval_xpath_getindex(newsitem, './/div[contains(@class, "source")]', 0, None) + if source is not None: + for item in ( + eval_xpath_getindex(source, './/span[@aria-label]/@aria-label', 0, None), + # eval_xpath_getindex(source, './/a', 0, None), + # eval_xpath_getindex(source, './div/span', 3, None), + link.attrib.get('data-author'), + ): + if item is not None: + t = extract_text(item) + if t and t.strip(): + metadata.append(t.strip()) + metadata = ' | '.join(metadata) - url = newsitem.xpath('./@url')[0] - title = ' '.join(newsitem.xpath('.//div[@class="caption"]//a[@class="title"]/text()')).strip() - content = ' '.join(newsitem.xpath('.//div[@class="snippet"]/text()')).strip() thumbnail = None - author = newsitem.xpath('./@data-author')[0] - metadata = ' '.join(newsitem.xpath('.//div[@class="source"]/span/text()')).strip() - - img_src = newsitem.xpath('.//a[@class="imagelink"]//img/@src') - if img_src: - thumbnail = 'https://www.bing.com/' + img_src[0] + imagelink = eval_xpath_getindex(newsitem, './/a[@class="imagelink"]//img', 0, None) + if imagelink is not None: + thumbnail = 'https://www.bing.com/' + imagelink.attrib.get('src') results.append( { @@ -122,7 +132,6 @@ def response(resp): 'title': title, 'content': content, 'img_src': thumbnail, - 'author': author, 'metadata': metadata, } ) @@ -131,17 +140,20 @@ def response(resp): def fetch_traits(engine_traits: EngineTraits): - """Fetch languages and regions from Bing-News. 
+ """Fetch languages and regions from Bing-News.""" + # pylint: disable=import-outside-toplevel + + from searx.engines.bing import fetch_traits as _f - The :py:obj:`description <searx.engines.bing_news.bing_traits_url>` of the - first table says *"query parameter when calling the Video Search API."* - .. that's why I use the 4. table "News Category API markets" for the - ``xpath_market_codes``. + _f(engine_traits) - """ + # fix market codes not known by bing news: - xpath_market_codes = '//table[4]/tbody/tr/td[3]' - # xpath_country_codes = '//table[2]/tbody/tr/td[2]' - xpath_language_codes = '//table[3]/tbody/tr/td[2]' + # In bing the market code 'zh-cn' exists, but there is no 'news' category in + # bing for this market. Alternatively we use the market code from Hong + # Kong. Even if this is not correct, it is better than having no hits at + # all, or sending false queries to bing that could raise the suspicion of a + # bot. - _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes) + # HINT: 'en-hk' is the region code; it does not indicate the language en!! + engine_traits.regions['zh-CN'] = 'en-hk' |