google_scholar.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""This is the implementation of the Google Scholar engine.

Compared to other Google services the Scholar engine has a simple GET REST-API
and there is no `async` API.  Even though the API is slightly vintage, we can
make use of the :ref:`google API` to assemble the arguments of the GET request.
"""

from typing import TYPE_CHECKING
from typing import Optional
from urllib.parse import urlencode
from datetime import datetime
from lxml import html

from searx.utils import (
    eval_xpath,
    eval_xpath_getindex,
    eval_xpath_list,
    extract_text,
)

from searx.exceptions import SearxEngineCaptchaException
from searx.engines.google import fetch_traits  # pylint: disable=unused-import
from searx.engines.google import (
    get_google_info,
    time_range_dict,
)
from searx.enginelib.traits import EngineTraits

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits

# about
about = {
    "website": 'https://scholar.google.com',
    "wikidata_id": 'Q494817',
    "official_api_documentation": 'https://developers.google.com/custom-search',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['science', 'scientific publications']
paging = True
max_page = 50
language_support = True
time_range_support = True
safesearch = False
send_accept_language_header = True


def time_range_args(params):
    """Returns a dictionary with time range arguments based on
    ``params['time_range']``.

    Google Scholar supports a detailed search by year.  Searching by *last
    month* or *last week* (as offered by SearXNG) is uncommon for scientific
    publications and is not supported by Google Scholar.

    To limit the result list when the user selects a range, all the SearXNG
    ranges (*day*, *week*, *month*, *year*) are mapped to *year*.  If no range
    is set, an empty dictionary of arguments is returned.  Example: when the
    user selects a time range (current year minus one in 2022):

    .. code:: python

        { 'as_ylo' : 2021 }

    """
    ret_val = {}
    if params['time_range'] in time_range_dict:
        ret_val['as_ylo'] = datetime.now().year - 1
    return ret_val
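
# Illustrative sketch (not part of the upstream code): every SearXNG range
# collapses to "previous year", anything else yields no extra argument.
# Assuming the current year is 2024:
#
#     time_range_args({'time_range': 'month'})  # -> {'as_ylo': 2023}
#     time_range_args({'time_range': ''})       # -> {}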


def detect_google_captcha(dom):
    """In case of a CAPTCHA, Google Scholar opens its own *not a Robot* dialog
    and is not redirected to ``sorry.google.com``.
    """
    if eval_xpath(dom, "//form[@id='gs_captcha_f']"):
        raise SearxEngineCaptchaException()


def request(query, params):
    """Google-Scholar search request"""

    google_info = get_google_info(params, traits)
    # subdomain is: scholar.google.xy
    google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.")

    args = {
        'q': query,
        **google_info['params'],
        'start': (params['pageno'] - 1) * 10,
        'as_sdt': '2007',  # include patents / to disable set '0,5'
        'as_vis': '0',  # include citations / to disable set '1'
    }
    args.update(time_range_args(params))

    params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args)
    params['cookies'] = google_info['cookies']
    params['headers'].update(google_info['headers'])
    return params
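
# Rough sketch (illustrative, not authoritative): for page 1 of the query
# "machine learning" with any time range selected in 2024, and ignoring the
# locale parameters that ``get_google_info()`` adds, ``params['url']`` looks
# roughly like:
#
#     https://scholar.google.com/scholar?q=machine+learning&start=0&as_sdt=2007&as_vis=0&as_ylo=2023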


def parse_gs_a(text: Optional[str]):
    """Parse the text written in green.

    Possible formats:
    * "{authors} - {journal}, {year} - {publisher}"
    * "{authors} - {year} - {publisher}"
    * "{authors} - {publisher}"
    """
    if text is None or text == "":
        return None, None, None, None

    s_text = text.split(' - ')
    authors = s_text[0].split(', ')
    publisher = s_text[-1]
    if len(s_text) != 3:
        return authors, None, publisher, None

    # the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}"
    # get journal and year
    journal_year = s_text[1].split(', ')
    # journal is optional and may contain commas
    if len(journal_year) > 1:
        journal = ', '.join(journal_year[0:-1])
        if journal == '…':
            journal = None
    else:
        journal = None
    # year
    year = journal_year[-1]
    try:
        publishedDate = datetime.strptime(year.strip(), '%Y')
    except ValueError:
        publishedDate = None
    return authors, journal, publisher, publishedDate
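
# Illustrative examples of the parsing above (inputs are made up, the return
# values follow from the code):
#
#     parse_gs_a("J Doe, A Roe - Nature, 2021 - nature.com")
#     # -> (['J Doe', 'A Roe'], 'Nature', 'nature.com', datetime(2021, 1, 1))
#
#     parse_gs_a("J Doe - 2021 - nature.com")
#     # -> (['J Doe'], None, 'nature.com', datetime(2021, 1, 1))
#
#     parse_gs_a("J Doe - nature.com")
#     # -> (['J Doe'], None, 'nature.com', None)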


def response(resp):  # pylint: disable=too-many-locals
    """Parse response from Google Scholar"""
    results = []

    # convert the text to dom
    dom = html.fromstring(resp.text)
    detect_google_captcha(dom)

    # parse results
    for result in eval_xpath_list(dom, '//div[@data-rp]'):

        title = extract_text(eval_xpath(result, './/h3[1]//a'))
        if not title:
            # this is a [ZITATION] block
            continue

        pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
        if pub_type:
            pub_type = pub_type[1:-1].lower()

        url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0)
        content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]'))
        authors, journal, publisher, publishedDate = parse_gs_a(
            extract_text(eval_xpath(result, './/div[@class="gs_a"]'))
        )
        if publisher and publisher in url:
            publisher = None

        # cited by
        comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]'))

        # link to the html or pdf document
        html_url = None
        pdf_url = None
        doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None)
        doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
        if doc_type == "[PDF]":
            pdf_url = doc_url
        else:
            html_url = doc_url

        results.append(
            {
                'template': 'paper.html',
                'type': pub_type,
                'url': url,
                'title': title,
                'authors': authors,
                'publisher': publisher,
                'journal': journal,
                'publishedDate': publishedDate,
                'content': content,
                'comments': comments,
                'html_url': html_url,
                'pdf_url': pdf_url,
            }
        )

    # parse suggestion
    for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath(dom, '//div[@class="gs_r gs_pda"]/a'):
        results.append({'correction': extract_text(correction)})

    return results
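
# Illustrative result entry (hypothetical values, assuming a [PDF] hit whose
# green line reads "J Doe, A Roe - 2021 - Springer"):
#
#     {
#         'template': 'paper.html',
#         'type': 'pdf',
#         'url': 'https://link.springer.com/article/10.1000/xyz',
#         'title': 'Some paper title',
#         'authors': ['J Doe', 'A Roe'],
#         'publisher': 'Springer',
#         'journal': None,
#         'publishedDate': datetime(2021, 1, 1),
#         'content': 'Snippet shown by Google Scholar ...',
#         'comments': 'Cited by 42',
#         'html_url': None,
#         'pdf_url': 'https://link.springer.com/content/pdf/10.1000/xyz.pdf',
#     }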