pubmed.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""
 PubMed (Scholar publications)
"""

from datetime import datetime
from urllib.parse import urlencode

from flask_babel import gettext
from lxml import etree

from searx.poolrequests import get

# about
about = {
    "website": 'https://www.ncbi.nlm.nih.gov/pubmed/',
    "wikidata_id": 'Q1540899',
    "official_api_documentation": {
        'url': 'https://www.ncbi.nlm.nih.gov/home/develop/api/',
        'comment': 'More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/'
    },
    "use_official_api": True,
    "require_api_key": False,
    "results": 'XML',
}

categories = ['science']

base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' \
           + '?db=pubmed&{query}&retstart={offset}&retmax={hits}'

# engine dependent config
number_of_results = 10
pubmed_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'
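

# The engine talks to NCBI's E-utilities in two steps: request() builds an
# ESearch URL that returns the PMIDs matching the query, and response()
# fetches the corresponding records from EFetch and maps them to searx results.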
def request(query, params):
    # basic search
    offset = (params['pageno'] - 1) * number_of_results

    string_args = dict(query=urlencode({'term': query}),
                       offset=offset,
                       hits=number_of_results)

    params['url'] = base_url.format(**string_args)

    return params


def response(resp):
    results = []

    # First retrieve the notice of each result
    pubmed_retrieve_api_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' \
                              + 'db=pubmed&retmode=xml&id={pmids_string}'

    # the ESearch response lists the PMIDs matching the query
    pmids_results = etree.XML(resp.content)
    pmids = pmids_results.xpath('//eSearchResult/IdList/Id')
    pmids_string = ','.join(item.text for item in pmids)

    retrieve_notice_args = dict(pmids_string=pmids_string)
    retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args)

    search_results_xml = get(retrieve_url_encoded).content
    search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation')

    for entry in search_results:
        title = entry.xpath('.//Article/ArticleTitle')[0].text

        pmid = entry.xpath('.//PMID')[0].text
        url = pubmed_url + pmid

        # some records carry no abstract at all
        try:
            content = entry.xpath('.//Abstract/AbstractText')[0].text
        except IndexError:
            content = None
        if not content:
            content = gettext('No abstract is available for this publication.')

        # If a doi is available, add it to the snippet
        try:
            doi = entry.xpath('.//ELocationID[@EIdType="doi"]')[0].text
            content = 'DOI: {doi} Abstract: {content}'.format(doi=doi, content=content)
        except IndexError:
            pass

        if len(content) > 300:
            content = content[0:300] + "..."
            # TODO: center snippet on query term

        res_dict = {'url': url,
                    'title': title,
                    'content': content}

        try:
            publishedDate = datetime.strptime(entry.xpath('.//DateCreated/Year')[0].text
                                              + '-' + entry.xpath('.//DateCreated/Month')[0].text
                                              + '-' + entry.xpath('.//DateCreated/Day')[0].text,
                                              '%Y-%m-%d')
            res_dict['publishedDate'] = publishedDate
        except (IndexError, ValueError):
            pass

        results.append(res_dict)

    return results
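

# Minimal usage sketch, not part of the engine: searx normally builds the
# `params` dict and performs the HTTP round trip itself, so the query and
# the hand-rolled loop below are assumptions for illustration only.
#
#     params = request('crispr cas9', {'pageno': 1})
#     resp = get(params['url'])
#     for result in response(resp):
#         print(result['title'], result['url'])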