tokyotoshokan.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Tokyo Toshokan (A BitTorrent Library for Japanese Media)"""

import re
from datetime import datetime
from urllib.parse import urlencode

from lxml import html

from searx.utils import extract_text, int_or_zero

# about
about = {
    "website": 'https://www.tokyotosho.info/',
    "wikidata_id": None,
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['files']
paging = True

# search-url
base_url = 'https://www.tokyotosho.info/'
search_url = base_url + 'search.php?{query}'


# do search-request
def request(query, params):
    query = urlencode({'page': params['pageno'], 'terms': query})
    params['url'] = search_url.format(query=query)
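    # for example (an illustrative query, not from the source):
    # request('monogatari', {'pageno': 2}) sets params['url'] to
    # 'https://www.tokyotosho.info/search.php?page=2&terms=monogatari'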
    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    rows = dom.xpath('//table[@class="listing"]//tr[contains(@class, "category_0")]')

    # check whether there are no results, or the page layout has changed so we cannot parse it;
    # currently there are two rows per result, so the total row count must be even
    if len(rows) == 0 or len(rows) % 2 != 0:
        return []

    # regular expression for parsing torrent size strings,
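    # matching strings such as '150.3MB', '1.2GB', or '700B' (case-insensitive)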
    size_re = re.compile(r'[\d.]+(T|G|M)?B', re.IGNORECASE)

    # process the results, two rows at a time
    for i in range(0, len(rows), 2):
        # parse the first row
        name_row = rows[i]

        links = name_row.xpath('./td[@class="desc-top"]/a')
        params = {
            'template': 'torrent.html',
            'url': links[-1].attrib.get('href'),
            'title': extract_text(links[-1]),
        }

        # I have not yet seen any torrents without magnet links, but
        # it's better to be prepared to stumble upon one some day
        if len(links) == 2:
            magnet = links[0].attrib.get('href')
            if magnet.startswith('magnet'):
                # okay, we have a valid magnet link, let's add it to the result
                params['magnetlink'] = magnet

        # no more info in the first row, start parsing the second one
        info_row = rows[i + 1]
        desc = extract_text(info_row.xpath('./td[@class="desc-bot"]')[0])
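        # the description cell reads like (illustrative values; only the field
        # prefixes are guaranteed by the startswith checks below):
        #   'Comment: demo | Size: 150.3MB | Date: 2016-02-21 21:44 UTC'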
        for item in desc.split('|'):
            item = item.strip()
            if item.startswith('Size:'):
                try:
                    params['filesize'] = size_re.search(item).group()
                except AttributeError:
                    # size_re found no match, so search() returned None
                    pass
            elif item.startswith('Date:'):
                try:
                    # Date: 2016-02-21 21:44 UTC
                    date = datetime.strptime(item, 'Date: %Y-%m-%d %H:%M UTC')
                    params['publishedDate'] = date
                except ValueError:
                    # the date string did not match the expected format
                    pass
            elif item.startswith('Comment:'):
                params['content'] = item

        stats = info_row.xpath('./td[@class="stats"]/span')
        # has the layout not changed yet?
        if len(stats) == 3:
            params['seed'] = int_or_zero(extract_text(stats[0]))
            params['leech'] = int_or_zero(extract_text(stats[1]))

        results.append(params)

    return results
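

# --------------------------------------------------------------------------
# Usage sketch (not part of the upstream engine): a minimal round trip
# through request() and response(), assuming a stub with a `text` attribute
# in place of the HTTP response object searx normally supplies. SAMPLE_HTML
# is a hypothetical, pared-down imitation of the site's listing table,
# included only so the sketch can run stand-alone (searx and lxml must
# still be importable).
if __name__ == '__main__':
    SAMPLE_HTML = """
    <table class="listing">
      <tr class="category_0">
        <td class="desc-top">
          <a href="magnet:?xt=urn:btih:0000000000000000000000000000000000000000">magnet</a>
          <a href="https://example.org/details/1">Example Torrent</a>
        </td>
      </tr>
      <tr class="category_0">
        <td class="desc-bot">Comment: demo | Size: 150.3MB | Date: 2016-02-21 21:44 UTC</td>
        <td class="stats"><span>12</span><span>3</span><span>45</span></td>
      </tr>
    </table>
    """

    class FakeResponse:  # stand-in for the real response object
        text = SAMPLE_HTML

    print(request('monogatari', {'pageno': 1})['url'])
    # -> https://www.tokyotosho.info/search.php?page=1&terms=monogatari
    for result in response(FakeResponse()):
        print(result['title'], result.get('filesize'), result.get('seed'))
    # -> Example Torrent 150.3MB 12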