soundcloud.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. """
  2. Soundcloud (Music)
  3. @website https://soundcloud.com
  4. @provide-api yes (https://developers.soundcloud.com/)
  5. @using-api yes
  6. @results JSON
  7. @stable yes
  8. @parse url, title, content, publishedDate, embedded
  9. """
  10. import re
  11. from StringIO import StringIO
  12. from json import loads
  13. from lxml import etree
  14. from urllib import urlencode, quote_plus
  15. from dateutil import parser
  16. from searx import logger
  17. from searx.poolrequests import get as http_get
  # engine dependent config
  categories = ['music']
  paging = True

  # search-url
  # Template for the search endpoint; the {query}, {offset} and {client_id}
  # placeholders are substituted with str.format when a request is built.
  url = 'https://api.soundcloud.com/'
  search_url = url + (
      'search?{query}'
      '&facet=model'
      '&limit=20'
      '&offset={offset}'
      '&linked_partitioning=1'
      '&client_id={client_id}'
  )

  # HTML snippet for the embedded player; {uri} receives the URL-quoted
  # resource URI of a single result.
  embedded_url = (
      '<iframe width="100%" height="166" '
      'scrolling="no" frameborder="no" '
      'data-src="https://w.soundcloud.com/player/?url={uri}"></iframe>'
  )
  def get_client_id():
      """Scrape a guest ``client_id`` from the SoundCloud web site.

      Fetches https://soundcloud.com, collects every ``<script>`` element whose
      ``src`` matches the pattern ``(.*app.*js)``, downloads each of those
      scripts in document order and returns the first value captured by
      ``client_id:"..."``.

      Returns an empty string, after logging a warning, when the landing page
      or every candidate script fails to yield a client id.
      """
      landing_page = http_get("https://soundcloud.com")

      if landing_page.ok:
          # EXSLT namespace that makes re:match() available inside the XPath
          exslt_ns = {"re": "http://exslt.org/regular-expressions"}
          document = etree.parse(StringIO(landing_page.content), etree.HTMLParser())
          script_nodes = document.xpath("//script[re:match(@src, '(.*app.*js)')]",
                                        namespaces=exslt_ns)

          for node in script_nodes:
              if node is None:
                  continue

              # download the candidate app script and look for the client id
              app_js = http_get(node.get('src'))
              if not app_js.ok:
                  continue

              found = re.search(r'client_id:"([^"]*)"', app_js.content, re.M | re.I)
              if found is None or not found.groups():
                  continue
              return found.group(1)

      logger.warning("Unable to fetch guest client_id from SoundCloud, check parser!")
      return ""
  # api-key
  # Resolved once, as a side effect of loading this module; an empty string
  # means get_client_id() could not find a client id.
  guest_client_id = get_client_id()
  51. # do search-request
  def request(query, params):
      """Store the SoundCloud search URL for *query* in ``params['url']``.

      ``params['pageno']`` selects the page: page 1 maps to offset 0 and each
      further page advances the offset by 20. The same (mutated) ``params``
      dict is returned.
      """
      page_index = params['pageno'] - 1
      url_fields = {
          'query': urlencode({'q': query}),
          'offset': page_index * 20,
          'client_id': guest_client_id,
      }
      params['url'] = search_url.format(**url_fields)
      return params
  58. # get response from search-request
  def response(resp):
      """Turn a SoundCloud search response into a list of result dicts.

      ``resp.text`` is parsed as JSON and its ``collection`` list (empty when
      the key is absent) is scanned. Entries whose ``kind`` is ``'track'`` or
      ``'playlist'`` each produce a dict with the keys ``url``, ``title``,
      ``publishedDate``, ``embedded`` and ``content``; all other entries are
      skipped.

      Returns the list of result dicts, in collection order.
      """
      results = []
      search_res = loads(resp.text)

      # parse results
      for result in search_res.get('collection', []):
          if result['kind'] not in ('track', 'playlist'):
              continue

          title = result['title']
          # A JSON null (or absent) description would otherwise surface as
          # content=None / a KeyError; normalise it so 'content' is always text.
          content = result.get('description') or ''
          publishedDate = parser.parse(result['last_modified'])
          # URL-quote the resource URI before placing it in the iframe markup
          uri = quote_plus(result['uri'])
          embedded = embedded_url.format(uri=uri)

          # append result
          results.append({'url': result['permalink_url'],
                          'title': title,
                          'publishedDate': publishedDate,
                          'embedded': embedded,
                          'content': content})

      # return results
      return results