google_images.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """This is the implementation of the Google Images engine using the internal
  3. Google API used by the Google Go Android app.
  4. This internal API offers results in
  5. - JSON (``_fmt:json``)
  6. - Protobuf_ (``_fmt:pb``)
  7. - Protobuf_ compressed? (``_fmt:pc``)
  8. - HTML (``_fmt:html``)
  9. - Protobuf_ encoded in JSON (``_fmt:jspb``).
  10. .. _Protobuf: https://en.wikipedia.org/wiki/Protocol_Buffers
  11. """
  12. from typing import TYPE_CHECKING
  13. from urllib.parse import urlencode
  14. from json import loads
  15. from searx.engines.google import fetch_traits # pylint: disable=unused-import
  16. from searx.engines.google import (
  17. get_google_info,
  18. time_range_dict,
  19. detect_google_sorry,
  20. )
  21. if TYPE_CHECKING:
  22. import logging
  23. from searx.enginelib.traits import EngineTraits
  24. logger: logging.Logger
  25. traits: EngineTraits
  26. # about
  27. about = {
  28. "website": 'https://images.google.com',
  29. "wikidata_id": 'Q521550',
  30. "official_api_documentation": 'https://developers.google.com/custom-search',
  31. "use_official_api": False,
  32. "require_api_key": False,
  33. "results": 'JSON',
  34. }
  35. # engine dependent config
  36. categories = ['images', 'web']
  37. paging = True
  38. max_page = 50
  39. time_range_support = True
  40. safesearch = True
  41. send_accept_language_header = True
  42. filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
  43. def request(query, params):
  44. """Google-Image search request"""
  45. google_info = get_google_info(params, traits)
  46. query_url = (
  47. 'https://'
  48. + google_info['subdomain']
  49. + '/search'
  50. + '?'
  51. + urlencode({'q': query, 'tbm': "isch", **google_info['params'], 'asearch': 'isch'})
  52. # don't urlencode this because wildly different AND bad results
  53. # pagination uses Zero-based numbering
  54. + f'&async=_fmt:json,p:1,ijn:{params["pageno"] - 1}'
  55. )
  56. if params['time_range'] in time_range_dict:
  57. query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
  58. if params['safesearch']:
  59. query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
  60. params['url'] = query_url
  61. params['cookies'] = google_info['cookies']
  62. params['headers'].update(google_info['headers'])
  63. # this ua will allow getting ~50 results instead of 10. #1641
  64. params['headers']['User-Agent'] = (
  65. 'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12;' f' {google_info.get("country", "US")}) gzip'
  66. )
  67. return params
  68. def response(resp):
  69. """Get response from google's search request"""
  70. results = []
  71. detect_google_sorry(resp)
  72. json_start = resp.text.find('{"ischj":')
  73. json_data = loads(resp.text[json_start:])
  74. for item in json_data["ischj"].get("metadata", []):
  75. result_item = {
  76. 'url': item["result"]["referrer_url"],
  77. 'title': item["result"]["page_title"],
  78. 'content': item["text_in_grid"]["snippet"],
  79. 'source': item["result"]["site_title"],
  80. 'resolution': f'{item["original_image"]["width"]} x {item["original_image"]["height"]}',
  81. 'img_src': item["original_image"]["url"],
  82. 'thumbnail_src': item["thumbnail"]["url"],
  83. 'template': 'images.html',
  84. }
  85. author = item["result"].get('iptc', {}).get('creator')
  86. if author:
  87. result_item['author'] = ', '.join(author)
  88. copyright_notice = item["result"].get('iptc', {}).get('copyright_notice')
  89. if copyright_notice:
  90. result_item['source'] += ' | ' + copyright_notice
  91. freshness_date = item["result"].get("freshness_date")
  92. if freshness_date:
  93. result_item['source'] += ' | ' + freshness_date
  94. file_size = item.get('gsa', {}).get('file_size')
  95. if file_size:
  96. result_item['source'] += ' (%s)' % file_size
  97. results.append(result_item)
  98. return results