lastscrape.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Lastscrape -- recovers data from libre.fm
# Copyright (C) 2009 Free Software Foundation, Inc
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

"""usage: lastscrape.py USER [OUTPUT_FILE]"""

import sys
import time
import codecs
import urllib2

from BeautifulSoup import BeautifulSoup

# Wrap stdout so unicode artist/track names are encoded as UTF-8.
sys.stdout = codecs.lookup('utf-8')[-1](sys.stdout)


def parse_page(page):
    """Parse a page of recently listened tracks and yield track tuples."""
    soup = BeautifulSoup(urllib2.urlopen(page),
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    for row in soup.find('table', 'candyStriped tracklist').findAll('tr'):
        artist, track, timestamp = parse_track(row)
        # Tracks submitted before 2005 have no timestamp
        if artist and track:
            yield (artist, track, timestamp)


def parse_track(row):
    """Return an (artist, track, timestamp) tuple for one table row."""
    try:
        track_info = row.find('td', 'subjectCell')
        artist, track = track_info.findAll('a')
        timestamp = row.find('abbr')['title']
        artist = artist.contents[0].strip()
        track = track.contents[0].strip()
        return (artist, track, timestamp)
    except (AttributeError, KeyError, TypeError, ValueError):
        # Parsing failed; report on stderr so the message does not end
        # up mixed into the scraped data on stdout.
        print >> sys.stderr, 'parsing failed'
        return (None, None, None)
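
# For reference, the row markup these selectors expect looks roughly
# like the sketch below (reconstructed from the selectors themselves,
# so treat it as illustrative rather than exact):
#
#   <tr>
#     <td class="subjectCell"><a>Artist</a> <a>Track title</a></td>
#     ...
#     <abbr title="2009-06-01 12:34">...</abbr>
#   </tr>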


def fetch_tracks(user, request_delay=0.5):
    """Fetch all tracks from a profile page and yield them as tuples."""
    url = 'http://last.fm/user/%s/tracks' % user
    try:
        f = urllib2.urlopen(url)
    except urllib2.HTTPError:
        raise Exception("Username probably does not exist.")
    soup = BeautifulSoup(f, convertEntities=BeautifulSoup.HTML_ENTITIES)
    try:
        num_pages = int(soup.find('a', 'lastpage').contents[0])
    except (AttributeError, ValueError):
        # No "last page" link: the listening history fits on one page.
        num_pages = 1
    for cur_page in range(1, num_pages + 1):
        try:
            # Materialize the page so fetch errors surface here rather
            # than later, when the generator is consumed.
            tracks = list(parse_page(url + '?page=' + str(cur_page)))
        except Exception:
            # Retry once after a short pause on a transient failure.
            time.sleep(1)
            tracks = list(parse_page(url + '?page=' + str(cur_page)))
        for artist, track, timestamp in tracks:
            yield (artist, track, timestamp)
        if cur_page < num_pages:
            # Be polite between page requests.
            time.sleep(request_delay)


def main(*args):
    if len(args) == 2:
        # Print to stdout
        for artist, track, timestamp in fetch_tracks(args[1]):
            print u'%s\t%s\t%s' % (artist, track, timestamp)
    elif len(args) == 3:
        # Write to file
        f = codecs.open(args[2], 'w', 'utf-8')
        for artist, track, timestamp in fetch_tracks(args[1]):
            f.write(u'%s\t%s\t%s\n' % (artist, track, timestamp))
            print u'%s\t%s\t%s' % (artist, track, timestamp)
        f.close()
    else:
        print __doc__


if __name__ == '__main__':
    sys.exit(main(*sys.argv))
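
# Illustrative sketch (not part of the original script): fetch_tracks()
# also works as a library function, e.g. to tally plays per artist.
# It assumes this file is importable as `lastscrape`; the username is
# hypothetical.
#
#   from collections import defaultdict
#   import lastscrape
#
#   counts = defaultdict(int)
#   for artist, track, timestamp in lastscrape.fetch_tracks('someuser'):
#       counts[artist] += 1
#   for artist, plays in sorted(counts.items(), key=lambda x: -x[1])[:5]:
#       print u'%5d  %s' % (plays, artist)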