filecrop.py

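# Search-engine module for filecrop.com, written against a searx-style
# request/response plugin interface (Python 2 imports: urllib, HTMLParser).
# Results are scraped out of the site's HTML search pages rather than an API.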
from urllib import urlencode
from HTMLParser import HTMLParser

url = 'http://www.filecrop.com/'
search_url = url + '/search.php?{query}&size_i=0&size_f=100000000&engine_r=1&engine_d=1&engine_e=1&engine_4=1&engine_m=1&pos={index}'  # noqa

paging = True
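

# HTMLParser subclass that walks the search-result table: each result spans
# two <tr> rows with known background colours, so parsing is switched on when
# such a row opens and the collected fields are flushed after every second </tr>.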
class FilecropResultParser(HTMLParser):

    def __init__(self):
        HTMLParser.__init__(self)
        self.__start_processing = False

        self.results = []
        self.result = {}

        self.tr_counter = 0
        self.data_counter = 0
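
    # Collect fields from opening tags: <label title="..."> carries the file
    # name, <a rel="nofollow" class="sourcelink"> the hosting source, and any
    # other <a href="..."> the site-relative link to the result page.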
    def handle_starttag(self, tag, attrs):

        if tag == 'tr':
            if ('bgcolor', '#edeff5') in attrs or\
               ('bgcolor', '#ffffff') in attrs:
                self.__start_processing = True

        if not self.__start_processing:
            return

        if tag == 'label':
            self.result['title'] = [attr[1] for attr in attrs
                                    if attr[0] == 'title'][0]
        elif tag == 'a' and ('rel', 'nofollow') in attrs\
                and ('class', 'sourcelink') in attrs:
            if 'content' in self.result:
                self.result['content'] += [attr[1] for attr in attrs
                                           if attr[0] == 'title'][0]
            else:
                self.result['content'] = [attr[1] for attr in attrs
                                          if attr[0] == 'title'][0]
            self.result['content'] += ' '
        elif tag == 'a':
            self.result['url'] = url + [attr[1] for attr in attrs
                                        if attr[0] == 'href'][0]
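
    # A result record is closed after the second </tr> of its row pair.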
    def handle_endtag(self, tag):
        if self.__start_processing is False:
            return

        if tag == 'tr':
            self.tr_counter += 1

            if self.tr_counter == 2:
                self.__start_processing = False
                self.tr_counter = 0
                self.data_counter = 0
                self.results.append(self.result)
                self.result = {}
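
    # Text nodes inside an active row are appended to the result's 'content'.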
    def handle_data(self, data):
        if not self.__start_processing:
            return

        if 'content' in self.result:
            self.result['content'] += data + ' '
        else:
            self.result['content'] = data + ' '

        self.data_counter += 1
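

# Build the search URL for the given query; the engine assumes 30 results per
# page, so 'pageno' is translated into the 'pos' offset expected by the site.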
def request(query, params):
    index = 1 + (params['pageno'] - 1) * 30
    params['url'] = search_url.format(query=urlencode({'w': query}),
                                      index=index)

    return params
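

# Parse the fetched HTML page and return the scraped result dictionaries.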
def response(resp):
    parser = FilecropResultParser()
    parser.feed(resp.text)

    return parser.results
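

# --- Usage sketch (not part of the original module) --------------------------
# A minimal offline illustration of the request/response hooks. The sample HTML
# below is a hypothetical, trimmed-down imitation of a filecrop result row, and
# FakeResponse is a stand-in for the HTTP response object (only '.text' is used).
if __name__ == '__main__':
    params = request('example archive', {'pageno': 1})
    print(params['url'])

    sample_row = (
        '<tr bgcolor="#edeff5">'
        '<td><label title="example_file.zip">example_file.zip</label></td>'
        '<td><a href="download_page.php?id=1">example_file.zip</a></td>'
        '<td><a rel="nofollow" class="sourcelink" title="some-host" '
        'href="/download/1">some-host</a></td>'
        '</tr><tr bgcolor="#ffffff"></tr>'
    )

    class FakeResponse(object):
        text = sample_row

    for result in response(FakeResponse()):
        print(result)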