# xpath.py — generic XPath-based search engine module
  1. from lxml import html
  2. from urllib import urlencode, unquote
  3. from urlparse import urlparse, urljoin
  4. from lxml.etree import _ElementStringResult, _ElementUnicodeResult
  5. from searx.utils import html_to_text
# Engine configuration — not assigned anywhere in this file; presumably
# injected by the engine loader from external settings (TODO confirm).
# None / '' means "not configured".
search_url = None        # URL template with a '{query}' placeholder
url_xpath = None         # XPath selecting a result's link
content_xpath = None     # XPath selecting a result's content snippet
title_xpath = None       # XPath selecting a result's title
suggestion_xpath = ''    # optional XPath for query suggestions
results_xpath = ''       # optional XPath selecting per-result container nodes
  12. '''
  13. if xpath_results is list, extract the text from each result and concat the list
  14. if xpath_results is a xml element, extract all the text node from it
  15. ( text_content() method from lxml )
  16. if xpath_results is a string element, then it's already done
  17. '''
  18. def extract_text(xpath_results):
  19. if type(xpath_results) == list:
  20. # it's list of result : concat everything using recursive call
  21. if not xpath_results:
  22. raise Exception('Empty url resultset')
  23. result = ''
  24. for e in xpath_results:
  25. result = result + extract_text(e)
  26. return result.strip()
  27. elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]:
  28. # it's a string
  29. return ''.join(xpath_results)
  30. else:
  31. # it's a element
  32. return html_to_text(xpath_results.text_content()).strip()
  33. def extract_url(xpath_results, search_url):
  34. url = extract_text(xpath_results)
  35. if url.startswith('//'):
  36. # add http or https to this kind of url //example.com/
  37. parsed_search_url = urlparse(search_url)
  38. url = parsed_search_url.scheme + url
  39. elif url.startswith('/'):
  40. # fix relative url to the search engine
  41. url = urljoin(search_url, url)
  42. # normalize url
  43. url = normalize_url(url)
  44. return url
  45. def normalize_url(url):
  46. parsed_url = urlparse(url)
  47. # add a / at this end of the url if there is no path
  48. if not parsed_url.netloc:
  49. raise Exception('Cannot parse url')
  50. if not parsed_url.path:
  51. url += '/'
  52. # FIXME : hack for yahoo
  53. if parsed_url.hostname == 'search.yahoo.com'\
  54. and parsed_url.path.startswith('/r'):
  55. p = parsed_url.path
  56. mark = p.find('/**')
  57. if mark != -1:
  58. return unquote(p[mark + 3:]).decode('utf-8')
  59. return url
  60. def request(query, params):
  61. query = urlencode({'q': query})[2:]
  62. params['url'] = search_url.format(query=query)
  63. params['query'] = query
  64. return params
  65. def response(resp):
  66. results = []
  67. dom = html.fromstring(resp.text)
  68. if results_xpath:
  69. for result in dom.xpath(results_xpath):
  70. url = extract_url(result.xpath(url_xpath), search_url)
  71. title = extract_text(result.xpath(title_xpath)[0])
  72. content = extract_text(result.xpath(content_xpath)[0])
  73. results.append({'url': url, 'title': title, 'content': content})
  74. else:
  75. for url, title, content in zip(
  76. (extract_url(x, search_url) for
  77. x in dom.xpath(url_xpath)),
  78. map(extract_text, dom.xpath(title_xpath)),
  79. map(extract_text, dom.xpath(content_xpath))
  80. ):
  81. results.append({'url': url, 'title': title, 'content': content})
  82. if not suggestion_xpath:
  83. return results
  84. for suggestion in dom.xpath(suggestion_xpath):
  85. results.append({'suggestion': extract_text(suggestion)})
  86. return results