  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """PubMed (Scholar publications)
  3. """
  4. from datetime import datetime
  5. from urllib.parse import urlencode
  6. from lxml import etree
  7. from searx.network import get
  8. from searx.utils import (
  9. eval_xpath_getindex,
  10. eval_xpath_list,
  11. extract_text,
  12. )
  13. # about
  14. about = {
  15. "website": 'https://www.ncbi.nlm.nih.gov/pubmed/',
  16. "wikidata_id": 'Q1540899',
  17. "official_api_documentation": {
  18. 'url': 'https://www.ncbi.nlm.nih.gov/home/develop/api/',
  19. 'comment': 'More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/',
  20. },
  21. "use_official_api": True,
  22. "require_api_key": False,
  23. "results": 'XML',
  24. }
  25. categories = ['science', 'scientific publications']
  26. base_url = (
  27. 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}'
  28. )
  29. # engine dependent config
  30. number_of_results = 10
  31. pubmed_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'
  32. def request(query, params):
  33. # basic search
  34. offset = (params['pageno'] - 1) * number_of_results
  35. string_args = {
  36. 'query': urlencode({'term': query}),
  37. 'offset': offset,
  38. 'hits': number_of_results,
  39. }
  40. params['url'] = base_url.format(**string_args)
  41. return params
  42. def response(resp): # pylint: disable=too-many-locals
  43. results = []
  44. # First retrieve notice of each result
  45. pubmed_retrieve_api_url = (
  46. 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' + 'db=pubmed&retmode=xml&id={pmids_string}'
  47. )
  48. pmids_results = etree.XML(resp.content)
  49. pmids = pmids_results.xpath('//eSearchResult/IdList/Id')
  50. pmids_string = ''
  51. for item in pmids:
  52. pmids_string += item.text + ','
  53. retrieve_notice_args = {'pmids_string': pmids_string}
  54. retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args)
  55. search_results_response = get(retrieve_url_encoded).content
  56. search_results = etree.XML(search_results_response)
  57. for entry in eval_xpath_list(search_results, '//PubmedArticle'):
  58. medline = eval_xpath_getindex(entry, './MedlineCitation', 0)
  59. title = eval_xpath_getindex(medline, './/Article/ArticleTitle', 0).text
  60. pmid = eval_xpath_getindex(medline, './/PMID', 0).text
  61. url = pubmed_url + pmid
  62. content = extract_text(
  63. eval_xpath_getindex(medline, './/Abstract/AbstractText//text()', 0, default=None), allow_none=True
  64. )
  65. doi = extract_text(
  66. eval_xpath_getindex(medline, './/ELocationID[@EIdType="doi"]/text()', 0, default=None), allow_none=True
  67. )
  68. journal = extract_text(
  69. eval_xpath_getindex(medline, './Article/Journal/Title/text()', 0, default=None), allow_none=True
  70. )
  71. issn = extract_text(
  72. eval_xpath_getindex(medline, './Article/Journal/ISSN/text()', 0, default=None), allow_none=True
  73. )
  74. authors = []
  75. for author in eval_xpath_list(medline, './Article/AuthorList/Author'):
  76. f = eval_xpath_getindex(author, './ForeName', 0, default=None)
  77. l = eval_xpath_getindex(author, './LastName', 0, default=None)
  78. f = '' if f is None else f.text
  79. l = '' if l is None else l.text
  80. authors.append((f + ' ' + l).strip())
  81. res_dict = {
  82. 'template': 'paper.html',
  83. 'url': url,
  84. 'title': title,
  85. 'content': content or "",
  86. 'journal': journal,
  87. 'issn': [issn],
  88. 'authors': authors,
  89. 'doi': doi,
  90. }
  91. accepted_date = eval_xpath_getindex(
  92. entry, './PubmedData/History//PubMedPubDate[@PubStatus="accepted"]', 0, default=None
  93. )
  94. if accepted_date is not None:
  95. year = eval_xpath_getindex(accepted_date, './Year', 0)
  96. month = eval_xpath_getindex(accepted_date, './Month', 0)
  97. day = eval_xpath_getindex(accepted_date, './Day', 0)
  98. try:
  99. publishedDate = datetime.strptime(
  100. year.text + '-' + month.text + '-' + day.text,
  101. '%Y-%m-%d',
  102. )
  103. res_dict['publishedDate'] = publishedDate
  104. except Exception as e: # pylint: disable=broad-exception-caught
  105. print(e)
  106. results.append(res_dict)
  107. return results