  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """
  3. BASE (Scholar publications)
  4. """
  5. from urllib.parse import urlencode
  6. from lxml import etree
  7. from datetime import datetime
  8. import re
  9. from searx.utils import searx_useragent
# about: engine metadata shown in the searx preferences / about UI
about = {
    "website": 'https://base-search.net',
    "wikidata_id": 'Q448335',
    "official_api_documentation": 'https://api.base-search.net/',
    "use_official_api": True,
    "require_api_key": False,
    # the API answers with XML (parsed in response() below)
    "results": 'XML',
}
  19. categories = ['science']
  20. base_url = 'https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi'\
  21. + '?func=PerformSearch&{query}&boost=oa&hits={hits}&offset={offset}'
  22. # engine dependent config
  23. paging = True
  24. number_of_results = 10
# shortcuts for advanced search: maps user-friendly query prefixes to the
# Dublin-Core field names the BASE API expects; applied via re.sub() in
# request() below (none of the keys contain regex metacharacters).
# NOTE(review): name has a typo ("shorcut"); kept as-is because request()
# references it by this name.
shorcut_dict = {
    # user-friendly keywords
    'format:': 'dcformat:',
    'author:': 'dccreator:',
    'collection:': 'dccollection:',
    'hdate:': 'dchdate:',
    'contributor:': 'dccontributor:',
    'coverage:': 'dccoverage:',
    'date:': 'dcdate:',
    'abstract:': 'dcdescription:',
    'urls:': 'dcidentifier:',
    'language:': 'dclanguage:',
    'publisher:': 'dcpublisher:',
    'relation:': 'dcrelation:',
    'rights:': 'dcrights:',
    'source:': 'dcsource:',
    'subject:': 'dcsubject:',
    'title:': 'dctitle:',
    'type:': 'dcdctype:'
}
  46. def request(query, params):
  47. # replace shortcuts with API advanced search keywords
  48. for key in shorcut_dict.keys():
  49. query = re.sub(key, shorcut_dict[key], query)
  50. # basic search
  51. offset = (params['pageno'] - 1) * number_of_results
  52. string_args = dict(query=urlencode({'query': query}),
  53. offset=offset,
  54. hits=number_of_results)
  55. params['url'] = base_url.format(**string_args)
  56. params['headers']['User-Agent'] = searx_useragent()
  57. return params
  58. def response(resp):
  59. results = []
  60. search_results = etree.XML(resp.content)
  61. for entry in search_results.xpath('./result/doc'):
  62. content = "No description available"
  63. date = datetime.now() # needed in case no dcdate is available for an item
  64. for item in entry:
  65. if item.attrib["name"] == "dcdate":
  66. date = item.text
  67. elif item.attrib["name"] == "dctitle":
  68. title = item.text
  69. elif item.attrib["name"] == "dclink":
  70. url = item.text
  71. elif item.attrib["name"] == "dcdescription":
  72. content = item.text[:300]
  73. if len(item.text) > 300:
  74. content += "..."
  75. # dates returned by the BASE API are not several formats
  76. publishedDate = None
  77. for date_format in ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d', '%Y-%m', '%Y']:
  78. try:
  79. publishedDate = datetime.strptime(date, date_format)
  80. break
  81. except:
  82. pass
  83. if publishedDate is not None:
  84. res_dict = {'url': url,
  85. 'title': title,
  86. 'publishedDate': publishedDate,
  87. 'content': content}
  88. else:
  89. res_dict = {'url': url,
  90. 'title': title,
  91. 'content': content}
  92. results.append(res_dict)
  93. return results