tokyotoshokan.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Tokyo Toshokan (A BitTorrent Library for Japanese Media)
"""

import re
from datetime import datetime
from urllib.parse import urlencode

from lxml import html

from searx.utils import extract_text, get_torrent_size, int_or_zero

# about
about = {
    "website": 'https://www.tokyotosho.info/',
    "wikidata_id": None,
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['files', 'videos', 'music']
paging = True

# search-url
base_url = 'https://www.tokyotosho.info/'
search_url = base_url + 'search.php?{query}'


# do search-request
def request(query, params):
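    """Build the search request URL for one results page.

    ``params['pageno']`` is the 1-based page number supplied by the searx
    core; the resulting URL ends in e.g. ``search.php?page=1&terms=...``.
    """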
    query = urlencode({'page': params['pageno'], 'terms': query})
    params['url'] = search_url.format(query=query)
    return params


# get response from search-request
def response(resp):
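    """Parse the HTML result listing into searx 'torrent.html' results."""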
    results = []

    dom = html.fromstring(resp.text)
    rows = dom.xpath('//table[@class="listing"]//tr[contains(@class, "category_0")]')

    # check if there are no results, or if the page layout has changed so that
    # we can no longer parse it; currently there are two rows for each result,
    # so the total count must be even
    if len(rows) == 0 or len(rows) % 2 != 0:
        return []

    # regular expression for parsing torrent size strings
    size_re = re.compile(r'Size:\s*([\d.]+)(TB|GB|MB|B)', re.IGNORECASE)
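    # e.g. 'Size: 1.228GB' -> groups ('1.228', 'GB')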

    # processing the results, two rows at a time
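    # the first row of each pair carries the title and the torrent/magnet
    # links; the second carries the description text and the seed/leech stats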
    for i in range(0, len(rows), 2):
        # parse the first row
        name_row = rows[i]

        links = name_row.xpath('./td[@class="desc-top"]/a')
        params = {
            'template': 'torrent.html',
            'url': links[-1].attrib.get('href'),
            'title': extract_text(links[-1]),
        }
        # I have not yet seen any torrents without magnet links, but
        # it's better to be prepared to stumble upon one some day
        if len(links) == 2:
            magnet = links[0].attrib.get('href')
            if magnet.startswith('magnet'):
                # okay, we have a valid magnet link, let's add it to the result
                params['magnetlink'] = magnet

        # no more info in the first row, start parsing the second one
        info_row = rows[i + 1]
        desc = extract_text(info_row.xpath('./td[@class="desc-bot"]')[0])
        for item in desc.split('|'):
            item = item.strip()
            if item.startswith('Size:'):
                try:
                    # ('1.228', 'GB')
                    groups = size_re.match(item).groups()
                    params['filesize'] = get_torrent_size(groups[0], groups[1])
                except (AttributeError, ValueError):
                    # no size match, or a value we cannot convert
                    pass
            elif item.startswith('Date:'):
                try:
                    # Date: 2016-02-21 21:44 UTC
                    date = datetime.strptime(item, 'Date: %Y-%m-%d %H:%M UTC')
                    params['publishedDate'] = date
                except ValueError:
                    # date string in an unexpected format
                    pass
            elif item.startswith('Comment:'):
                params['content'] = item

        stats = info_row.xpath('./td[@class="stats"]/span')

        # has the layout not changed yet?
        if len(stats) == 3:
            params['seed'] = int_or_zero(extract_text(stats[0]))
            params['leech'] = int_or_zero(extract_text(stats[1]))

        results.append(params)

    return results
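

# A minimal manual-test sketch (hypothetical, not part of searx): it drives
# request()/response() the way the searx core would, assuming the `requests`
# package is installed and tokyotosho.info is reachable.
if __name__ == '__main__':
    import requests

    # build the search URL for page 1 of an example query
    search_params = request('madoka', {'pageno': 1})
    # fetch the page and parse it into result dicts
    page = requests.get(search_params['url'], timeout=10)
    for result in response(page):
        print(result['title'], result.get('filesize'), result.get('magnetlink'))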