123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899 |
- #!/usr/bin/env python
- #-*- coding: utf-8 -*-
- #
- # Lastscrape -- recovers data from libre.fm
- # Copyright (C) 2009 Free Software Foundation, Inc
- #
- # This program is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
- #
- """usage: lastscrape.py USER [OUTPUT_FILE]"""
- import sys
- import time
- import codecs
- import urllib2
- from BeautifulSoup import BeautifulSoup
# Replace stdout with the UTF-8 codec's stream writer wrapped around the
# original stream, so unicode text written by print is encoded as UTF-8.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
def parse_page(page):
    """Yield (artist, track, timestamp) tuples scraped from one tracks page.

    ``page`` is the URL to download.  This is a generator: nothing is
    fetched until the first item is requested, so download or parsing
    errors surface during iteration, not when the function is called.

    Rows for which parse_track() returns no artist or no track are skipped.
    The timestamp is passed through as returned by parse_track() and is
    not required to be set.
    """
    response = urllib2.urlopen(page)
    try:
        # Read the whole body first so the connection can be closed
        # before any parsing starts.
        html = response.read()
    finally:
        response.close()

    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    track_table = soup.find('table', 'candyStriped tracklist')
    for row in track_table.findAll('tr'):
        artist, track, timestamp = parse_track(row)
        # Tracks submitted before 2005 have no timestamp, so only the
        # artist and the track name are required.
        if artist and track:
            yield (artist, track, timestamp)
def parse_track(row):
    """Return an (artist, track, timestamp) tuple extracted from a table row.

    ``row`` is expected to offer the BeautifulSoup lookups used below: a
    'td' cell of class 'subjectCell' holding exactly two links (artist,
    then track) and, optionally, an 'abbr' element whose 'title'
    attribute carries the timestamp.

    The timestamp is None when the row has no 'abbr' element or the
    element has no 'title'.  If the artist or track cannot be extracted,
    'parsing failed' is printed and (None, None, None) is returned.
    """
    try:
        track_info = row.find('td', 'subjectCell')
        artist, track = track_info.findAll('a')
        artist = artist.contents[0].strip()
        track = track.contents[0].strip()
    except Exception:
        # Best effort: any row that does not have the expected shape
        # (e.g. a missing cell or an unexpected number of links) is
        # reported and skipped.  KeyboardInterrupt/SystemExit still
        # propagate.
        print('parsing failed')
        return (None, None, None)

    # A missing timestamp must not discard an otherwise valid track.
    timestamp = None
    abbr = row.find('abbr')
    if abbr is not None:
        timestamp = abbr.get('title')
    return (artist, track, timestamp)
def fetch_tracks(user, request_delay=0.5):
    """Yield (artist, track, timestamp) for every track of a user's profile.

    ``user`` is inserted into the profile tracks URL.  ``request_delay``
    is the number of seconds to sleep between consecutive page requests.

    This is a generator, so all work (including the first request)
    happens during iteration.  Raises Exception if the first request
    fails with an HTTP error.  Each page is attempted twice; an error on
    the second attempt propagates to the caller.
    """
    url = 'http://last.fm/user/%s/tracks' % user
    try:
        response = urllib2.urlopen(url)
    except urllib2.HTTPError:
        raise Exception("Username probably does not exist.")
    try:
        # Reuse this single response for the page-count lookup so the
        # page is not downloaded twice.
        html = response.read()
    finally:
        response.close()

    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    try:
        num_pages = int(soup.find('a', 'lastpage').contents[0])
    except (AttributeError, IndexError, TypeError, ValueError):
        # No usable 'lastpage' link: treat the profile as a single page.
        num_pages = 1

    for cur_page in range(1, num_pages + 1):
        page_url = url + '?page=' + str(cur_page)
        # parse_page() is a lazy generator, so it must be consumed inside
        # the try block for a failure to be caught and retried here.
        try:
            tracks = list(parse_page(page_url))
        except Exception:
            time.sleep(1)
            tracks = list(parse_page(page_url))

        for artist, track, timestamp in tracks:
            yield (artist, track, timestamp)

        if cur_page < num_pages:
            time.sleep(request_delay)
def main(*args):
    """Command-line entry point; ``args`` mirrors sys.argv.

    args[0] is the program name, args[1] the user name and the optional
    args[2] an output file path.  Every track is printed to stdout as a
    tab-separated line; when an output file is given, the same line is
    also written to it, encoded as UTF-8.  With any other number of
    arguments the module usage string is printed instead.
    """
    if len(args) not in (2, 3):
        print(__doc__)
        return

    out_file = None
    if len(args) == 3:
        out_file = codecs.open(args[2], 'w', 'utf-8')
    try:
        for artist, track, timestamp in fetch_tracks(args[1]):
            line = u'%s\t%s\t%s' % (artist, track, timestamp)
            if out_file is not None:
                out_file.write(line + u'\n')
            print(line)
    finally:
        # Close the output file even if fetching or writing fails midway.
        if out_file is not None:
            out_file.close()
if __name__ == '__main__':
    # Hand the complete argv (program name included) to main() and use
    # whatever it returns as the process exit status.
    exit_status = main(*sys.argv)
    sys.exit(exit_status)
|