# SPDX-License-Identifier: AGPL-3.0-or-later
"""This is the implementation of the Google Scholar engine.

Compared to other Google services the Scholar engine has a simple GET REST-API
and there is no `async` API.  Even though the API is slightly vintage, we can
make use of the :ref:`google API` to assemble the arguments of the GET request.
"""

from typing import TYPE_CHECKING
from typing import Optional

from urllib.parse import urlencode
from datetime import datetime
from lxml import html

from searx.utils import (
    eval_xpath,
    eval_xpath_getindex,
    eval_xpath_list,
    extract_text,
)

from searx.exceptions import SearxEngineCaptchaException

from searx.engines.google import fetch_traits  # pylint: disable=unused-import
from searx.engines.google import (
    get_google_info,
    time_range_dict,
)
from searx.enginelib.traits import EngineTraits

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits

# about
about = {
    "website": 'https://scholar.google.com',
    "wikidata_id": 'Q494817',
    "official_api_documentation": 'https://developers.google.com/custom-search',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['science', 'scientific publications']
paging = True
max_page = 50
"""`Google: max 50 pages`_

.. _Google: max 50 pages: https://github.com/searxng/searxng/issues/2982
"""
language_support = True
time_range_support = True
safesearch = False
send_accept_language_header = True


def time_range_args(params):
    """Returns a dictionary with time range arguments based on
    ``params['time_range']``.

    Google Scholar supports a detailed search by year.  Searching by *last
    month* or *last week* (as offered by SearXNG) is uncommon for scientific
    publications and is not supported by Google Scholar.

    To limit the result list when the user selects a range, all the SearXNG
    ranges (*day*, *week*, *month*, *year*) are mapped to *year*.  If no range
    is set, an empty dictionary of arguments is returned.  Example: when the
    user selects a time range (current year minus one in 2022):

    .. code:: python

        { 'as_ylo' : 2021 }

    """
    ret_val = {}
    if params['time_range'] in time_range_dict:
        ret_val['as_ylo'] = datetime.now().year - 1
    return ret_val
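
# Illustrative sketch, not part of the upstream engine: assuming the current
# year is 2024 and ``time_range_dict`` holds the usual SearXNG ranges
# (day/week/month/year), every selected range collapses to the previous year:
#
#     time_range_args({'time_range': 'month'})   # -> {'as_ylo': 2023}
#     time_range_args({'time_range': None})      # -> {}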


def detect_google_captcha(dom):
    """In case of a CAPTCHA, Google Scholar opens its own *not a Robot* dialog
    and does not redirect to ``sorry.google.com``.
    """
    if eval_xpath(dom, "//form[@id='gs_captcha_f']"):
        raise SearxEngineCaptchaException()
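
# Minimal illustration (an assumption, not upstream code): the XPath above keys
# on the ``gs_captcha_f`` form id, so a page containing that form raises the
# CAPTCHA exception:
#
#     dom = html.fromstring('<html><body><form id="gs_captcha_f"/></body></html>')
#     detect_google_captcha(dom)  # raises SearxEngineCaptchaException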


def request(query, params):
    """Google-Scholar search request"""

    google_info = get_google_info(params, traits)
    # subdomain is: scholar.google.xy
    google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.")

    args = {
        'q': query,
        **google_info['params'],
        'start': (params['pageno'] - 1) * 10,
        'as_sdt': '2007',  # include patents / to disable set '0,5'
        'as_vis': '0',  # include citations / to disable set '1'
    }
    args.update(time_range_args(params))

    params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args)
    params['cookies'] = google_info['cookies']
    params['headers'].update(google_info['headers'])
    return params
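
# Rough sketch of the URL this builds for page 2 of the query "quantum
# computing" (the ``hl`` parameter and anything else contributed by
# ``get_google_info`` are assumptions for illustration):
#
#     https://scholar.google.com/scholar?q=quantum+computing&hl=en&start=10&as_sdt=2007&as_vis=0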


def parse_gs_a(text: Optional[str]):
    """Parse the text written in green.

    Possible formats:
    * "{authors} - {journal}, {year} - {publisher}"
    * "{authors} - {year} - {publisher}"
    * "{authors} - {publisher}"
    """
    if text is None or text == "":
        return None, None, None, None

    s_text = text.split(' - ')
    authors = s_text[0].split(', ')
    publisher = s_text[-1]
    if len(s_text) != 3:
        return authors, None, publisher, None

    # the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}"
    # get journal and year
    journal_year = s_text[1].split(', ')
    # journal is optional and may contain commas
    if len(journal_year) > 1:
        journal = ', '.join(journal_year[0:-1])
        if journal == '…':
            journal = None
    else:
        journal = None
    # year
    year = journal_year[-1]
    try:
        publishedDate = datetime.strptime(year.strip(), '%Y')
    except ValueError:
        publishedDate = None
    return authors, journal, publisher, publishedDate
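
# Hedged examples of what the parser above yields for typical ``gs_a`` strings
# (the author, journal and publisher names are invented for illustration):
#
#     parse_gs_a("J Doe, A Roe - Nature, 2021 - nature.com")
#     #   -> (['J Doe', 'A Roe'], 'Nature', 'nature.com', datetime(2021, 1, 1))
#     parse_gs_a("J Doe - 2021 - example.org")
#     #   -> (['J Doe'], None, 'example.org', datetime(2021, 1, 1))
#     parse_gs_a("J Doe - example.org")
#     #   -> (['J Doe'], None, 'example.org', None)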


def response(resp):  # pylint: disable=too-many-locals
    """Parse response from Google Scholar"""
    results = []

    # convert the text to dom
    dom = html.fromstring(resp.text)
    detect_google_captcha(dom)

    # parse results
    for result in eval_xpath_list(dom, '//div[@data-rp]'):

        title = extract_text(eval_xpath(result, './/h3[1]//a'))
        if not title:
            # this is a [ZITATION] block
            continue

        pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
        if pub_type:
            pub_type = pub_type[1:-1].lower()

        url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0)
        content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]'))
        authors, journal, publisher, publishedDate = parse_gs_a(
            extract_text(eval_xpath(result, './/div[@class="gs_a"]'))
        )
        # guard against ``publisher`` being None when the gs_a block is missing
        if publisher and publisher in url:
            publisher = None
        # cited by
        comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]'))

        # link to the html or pdf document
        html_url = None
        pdf_url = None
        doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None)
        doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
        if doc_type == "[PDF]":
            pdf_url = doc_url
        else:
            html_url = doc_url

        results.append(
            {
                'template': 'paper.html',
                'type': pub_type,
                'url': url,
                'title': title,
                'authors': authors,
                'publisher': publisher,
                'journal': journal,
                'publishedDate': publishedDate,
                'content': content,
                'comments': comments,
                'html_url': html_url,
                'pdf_url': pdf_url,
            }
        )

    # parse suggestion
    for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath(dom, '//div[@class="gs_r gs_pda"]/a'):
        results.append({'correction': extract_text(correction)})

    return results