import-jamendo.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445
  1. #!/usr/bin/env python
  2. import xml.etree.cElementTree as ElementTree
  3. import sys, gzip, time, httplib
  4. import psycopg2 as ordbms
  5. JAMENDO_MIRROR = "gigue.rrbone.net"
  6. genremap = {
  7. 0 : "Blues",
  8. 1 : "Classic Rock",
  9. 2 : "Country",
  10. 3 : "Dance",
  11. 4 : "Disco",
  12. 5 : "Funk",
  13. 6 : "Grunge",
  14. 7 : "Hip-Hop",
  15. 8 : "Jazz",
  16. 9 : "Metal",
  17. 10 : "New Age",
  18. 11 : "Oldies",
  19. 12 : "Other",
  20. 13 : "Pop",
  21. 14 : "R&B",
  22. 15 : "Rap",
  23. 16 : "Reggae",
  24. 17 : "Rock",
  25. 18 : "Techno",
  26. 19 : "Industrial",
  27. 20 : "Alternative",
  28. 21 : "Ska",
  29. 22 : "Death Metal",
  30. 23 : "Pranks",
  31. 24 : "Soundtrack",
  32. 25 : "Euro-Techno",
  33. 26 : "Ambient",
  34. 27 : "Trip-Hop",
  35. 28 : "Vocal",
  36. 29 : "Jazz+Funk",
  37. 30 : "Fusion",
  38. 31 : "Trance",
  39. 32 : "Classical",
  40. 33 : "Instrumental",
  41. 34 : "Acid",
  42. 35 : "House",
  43. 36 : "Game",
  44. 37 : "Sound Clip",
  45. 38 : "Gospel",
  46. 39 : "Noise",
  47. 40 : "Alternative Rock",
  48. 41 : "Bass",
  49. 42 : "Soul",
  50. 43 : "Punk",
  51. 44 : "Space",
  52. 45 : "Meditative",
  53. 46 : "Instrumental Pop",
  54. 47 : "Instrumental Rock",
  55. 48 : "Ethnic",
  56. 49 : "Gothic",
  57. 50 : "Darkwave",
  58. 51 : "Techno-Industrial",
  59. 52 : "Electronic",
  60. 53 : "Pop-Folk",
  61. 54 : "Eurodance",
  62. 55 : "Dream",
  63. 56 : "Southern Rock",
  64. 57 : "Comedy",
  65. 58 : "Cult",
  66. 59 : "Gangsta",
  67. 60 : "Top 40",
  68. 61 : "Christian Rap",
  69. 62 : "Pop/Funk",
  70. 63 : "Jungle",
  71. 64 : "Native American",
  72. 65 : "Cabaret",
  73. 66 : "New Wave",
  74. 67 : "Psychadelic",
  75. 68 : "Rave",
  76. 69 : "Showtunes",
  77. 70 : "Trailer",
  78. 71 : "Lo-Fi",
  79. 72 : "Tribal",
  80. 73 : "Acid Punk",
  81. 74 : "Acid Jazz",
  82. 75 : "Polka",
  83. 76 : "Retro",
  84. 77 : "Musical",
  85. 78 : "Rock & Roll",
  86. 79 : "Hard Rock",
  87. 80 : "Folk",
  88. 81 : "Folk-Rock",
  89. 82 : "National Folk",
  90. 83 : "Swing",
  91. 84 : "Fast Fusion",
  92. 85 : "Bebop",
  93. 86 : "Latin",
  94. 87 : "Revival",
  95. 88 : "Celtic",
  96. 89 : "Bluegrass",
  97. 90 : "Avantgarde",
  98. 91 : "Gothic Rock",
  99. 92 : "Progressive Rock",
  100. 93 : "Psychedelic Rock",
  101. 94 : "Symphonic Rock",
  102. 95 : "Slow Rock",
  103. 96 : "Big Band",
  104. 97 : "Chorus",
  105. 98 : "Easy Listening",
  106. 99 : "Acoustic",
  107. 100 : "Humour",
  108. 101 : "Speech",
  109. 102 : "Chanson",
  110. 103 : "Opera",
  111. 104 : "Chamber Music",
  112. 105 : "Sonata",
  113. 106 : "Symphony",
  114. 107 : "Booty Bass",
  115. 108 : "Primus",
  116. 109 : "Porn Groove",
  117. 110 : "Satire",
  118. 111 : "Slow Jam",
  119. 112 : "Club",
  120. 113 : "Tango",
  121. 114 : "Samba",
  122. 115 : "Folklore",
  123. 116 : "Ballad",
  124. 117 : "Power Ballad",
  125. 118 : "Rhythmic Soul",
  126. 119 : "Freestyle",
  127. 120 : "Duet",
  128. 121 : "Punk Rock",
  129. 122 : "Drum Solo",
  130. 123 : "A capella",
  131. 124 : "Euro-House",
  132. 125 : "Dance Hall",
  133. }
  134. class JamendoImport:
  135. def __init__(self, username='librefm', database='librefm', updates=False):
  136. self.conn = ordbms.connect ("dbname='%s' user='%s'" % (database, username))
  137. self.perform_updates = updates
  138. self.cursor = self.conn.cursor ()
  139. def parse(self, dump):
  140. for event, elem in ElementTree.iterparse(dump):
  141. if elem.tag == "artist":
  142. artist = self.proc_artist(elem)
  143. if self.artist_exists(artist["name"]):
  144. if self.perform_updates:
  145. try:
  146. self.cursor.execute("UPDATE Artist SET image_small = %s, homepage = %s, mbid = %s WHERE name = %s", (artist["image"], artist["url"], artist["mbid"], artist["name"]))
  147. self.conn.commit()
  148. except Exception, e:
  149. self.conn.rollback()
  150. print 'ua', e
  151. else:
  152. try:
  153. self.cursor.execute("INSERT INTO Artist (name, image_small, mbid, homepage) VALUES (%s, %s, %s, %s)", (artist["name"], artist["image"], artist["mbid"], artist["url"]))
  154. self.conn.commit()
  155. except Exception, e:
  156. self.conn.rollback()
  157. print 'ia', e
  158. any_streamable_tracks = 0
  159. for album in artist["albums"]:
  160. if self.album_exists(artist["name"], album["name"]):
  161. if self.perform_updates:
  162. try:
  163. self.cursor.execute("UPDATE Album SET albumurl = %s, image = %s, artwork_license = %s, mbid = %s, releasedate = %s, downloadurl = %s WHERE name = %s AND artist_name = %s",
  164. (album["url"], album["image"], album["license_artwork"], album["mbid"], album["releasedate"], album["downloadurl"],
  165. album["name"], artist["name"]))
  166. self.conn.commit()
  167. except Exception, e:
  168. self.conn.rollback()
  169. print 'ub', e
  170. else:
  171. try:
  172. self.cursor.execute("INSERT INTO Album (name, artist_name, albumurl, image, artwork_license, mbid, releasedate, downloadurl) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
  173. (album["name"], artist["name"], album["url"], album["image"], album["license_artwork"], album["mbid"], album["releasedate"], album["downloadurl"]))
  174. self.conn.commit()
  175. except Exception, e:
  176. self.conn.rollback()
  177. print 'ib', e
  178. for tag in album["tags"]:
  179. if not self.tag_exists(tag, artist["name"], album["name"]):
  180. try:
  181. self.cursor.execute("INSERT INTO Tags (tag, artist, album) VALUES (%s, %s, %s)",
  182. (tag, artist["name"], album["name"]))
  183. self.conn.commit()
  184. except Exception, e:
  185. self.conn.rollback()
  186. print 'ig', e
  187. for track in album["tracks"]:
  188. if self.free_license(track["license"]) and self.is_in_mirror(track["id"]):
  189. streamable = 1
  190. any_streamable_tracks = 1
  191. else:
  192. streamable = 0
  193. try:
  194. duration = int(track["duration"])
  195. except:
  196. duration = None
  197. otherid = "jm:"
  198. try:
  199. otherid += str(int(track["id"]))
  200. except:
  201. otherid += "unknown"
  202. if self.track_exists(artist["name"], album["name"], track["name"]):
  203. try:
  204. self.cursor.execute("UPDATE Track SET downloadurl = %s, streamurl = %s, mbid = %s, license = %s, duration = %s, otherid = %s, streamable = %s WHERE name = %s AND artist_name = %s AND album_name = %s", (track["downloadurl"], track["streamurl"], track["mbid"], track["license"], duration, otherid, streamable, track["name"], artist["name"], album["name"]))
  205. self.conn.commit()
  206. except Exception, e:
  207. self.conn.rollback()
  208. print 'ut', e
  209. else:
  210. try:
  211. self.cursor.execute("INSERT INTO Track (name, artist_name, album_name, mbid, downloadurl, streamurl, license, duration, otherid, streamable) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", (track["name"], artist["name"], album["name"], track["mbid"], track["downloadurl"], track["streamurl"], track["license"], duration, otherid, streamable))
  212. self.conn.commit()
  213. except Exception, e:
  214. self.conn.rollback()
  215. print 'it', e
  216. for tag in track["tags"]:
  217. if not self.tag_exists(tag, artist["name"], album["name"], track["name"]):
  218. try:
  219. self.cursor.execute("INSERT INTO Tags (tag, artist, album, track) VALUES (%s, %s, %s, %s)",
  220. (tag, artist["name"], album["name"], track["name"]))
  221. self.conn.commit()
  222. except Exception, e:
  223. self.conn.rollback()
  224. print 'ig2', e
  225. if any_streamable_tracks:
  226. try:
  227. self.cursor.execute("UPDATE Artist SET streamable = 1 WHERE name = %s", (artist["name"],))
  228. self.conn.commit()
  229. except Exception, e:
  230. self.conn.rollback()
  231. print 'ua', e
  232. def close(self):
  233. self.cursor.close()
  234. self.conn.commit()
  235. self.conn.close()
  236. def proc_artist(self, elem):
  237. artist = {}
  238. artist["albums"] = []
  239. artist["image"] = None
  240. for artist_e in elem.getchildren():
  241. if artist_e.tag == "name":
  242. artist["name"] = artist_e.text
  243. if artist_e.tag == "id":
  244. artist["id"] = int(artist_e.text)
  245. if artist_e.tag == "image":
  246. artist["image"] = artist_e.text
  247. if artist_e.tag == "mbgid":
  248. if artist_e.text is None or len(artist_e.text) == 36:
  249. artist["mbid"] = artist_e.text
  250. else:
  251. print "Artist mbgid wrong length (%d): %s" % (len(artist_e.text),artist_e.text)
  252. if artist_e.tag == "url":
  253. artist["url"] = artist_e.text
  254. if artist_e.tag == "Albums":
  255. for album_e in artist_e.getchildren():
  256. artist["albums"].append(self.proc_album(album_e))
  257. return artist
  258. def proc_album(self, elem):
  259. album = {}
  260. album["tracks"] = []
  261. album["tags"] = []
  262. album["name"] = None
  263. for album_e in elem.getchildren():
  264. if album_e.tag == "name":
  265. album["name"] = album_e.text
  266. if album_e.tag == "id":
  267. album["id"] = int(album_e.text)
  268. album["url"] = "jamendo://album/%d" % album["id"]
  269. album["downloadurl"] = "jamendo://album/download/%d" % album["id"]
  270. album["image"] = "jamendo://album/art/%d" % album["id"]
  271. if album_e.tag == "id3genre":
  272. genre = genremap[int(album_e.text)]
  273. album["tags"].append(genre)
  274. if album_e.tag == "mbgid":
  275. if album_e.text is None or len(album_e.text) == 36:
  276. album["mbid"] = album_e.text
  277. else:
  278. print "Album mbgid wrong length (%d): %s" % (len(album_e.text),album_e.text)
  279. if album_e.tag == "license_artwork":
  280. album["license_artwork"] = album_e.text
  281. if album_e.tag == "releasedate":
  282. album["releasedate"] = time.mktime(time.strptime(album_e.text, "%Y-%m-%dT%H:%M:%S+01:00"))
  283. if album_e.tag == "Tracks":
  284. for track_e in album_e.getchildren():
  285. album["tracks"].append(self.proc_track(track_e))
  286. return album
  287. def proc_track(self, elem):
  288. track = {}
  289. track["tags"] = []
  290. track["mbid"] = None
  291. track["downloadurl"] = None
  292. for track_e in elem.getchildren():
  293. if track_e.tag == "id":
  294. track["id"] = int(track_e.text)
  295. track["streamurl"] = "jamendo://track/stream/%d" % track["id"]
  296. if track_e.tag == "name":
  297. track["name"] = track_e.text
  298. if track_e.tag == "id3genre":
  299. genre = genremap[int(track_e.text)]
  300. track["tags"].append(genre)
  301. if track_e.tag == "license":
  302. track["license"] = track_e.text
  303. if track_e.tag == "duration":
  304. track["duration"] = track_e.text
  305. if track_e.tag == "mbgid":
  306. if track_e.text is None or len(track_e.text) == 36:
  307. track["mbid"] = track_e.text
  308. else:
  309. print "Track mbgid wrong length (%d): %s" % (len(track_e.text),track_e.text)
  310. if track_e.tag == "Tags":
  311. for tag_e in track_e.getchildren():
  312. track["tags"].append(self.proc_tag(tag_e))
  313. return track
  314. def proc_tag(self, elem):
  315. for track_e in elem.getchildren():
  316. if track_e.tag == "idstr":
  317. return track_e.text
  318. def artist_exists(self, artist):
  319. try:
  320. self.cursor.execute("SELECT name FROM Artist WHERE name = %s ", (artist,))
  321. return self.cursor.rowcount != 0
  322. except:
  323. return False
  324. def album_exists(self, artist, album):
  325. try:
  326. self.cursor.execute("SELECT name FROM Album WHERE artist_name = %s AND name = %s", (artist, album))
  327. return self.cursor.rowcount != 0
  328. except:
  329. return False
  330. def track_exists(self, artist, album, track):
  331. try:
  332. self.cursor.execute("SELECT name FROM Track WHERE artist_name = %s AND album_name = %s AND name = %s", (artist, album, track))
  333. return self.cursor.rowcount != 0
  334. except:
  335. return False
  336. def tag_exists(self, tag, artist, album, track=None):
  337. try:
  338. if track:
  339. self.cursor.execute("SELECT tag FROM Tags WHERE tag = %s AND artist = %s AND album = %s AND track = %s", (tag, artist, album, track))
  340. else:
  341. self.cursor.execute("SELECT tag FROM Tags WHERE tag = %s AND artist = %s AND album = %s AND track = ''", (tag, artist, album))
  342. return self.cursor.rowcount != 0
  343. except:
  344. return False
  345. def free_license(self, license):
  346. return ("http://creativecommons.org/licenses/by-sa" in license or "http://creativecommons.org/licenses/by/" in license or "http://artlibre.org/licence.php/lal.html" in license)
  347. def is_in_mirror(self, id):
  348. try:
  349. trackfile = "/" + str(id) + ".ogg2"
  350. connection = httplib.HTTPConnection(JAMENDO_MIRROR)
  351. connection.request("HEAD", trackfile)
  352. response = connection.getresponse()
  353. connection.close()
  354. except:
  355. return False
  356. else:
  357. return response.status == 200 and response.getheader('Content-Type') == 'audio/ogg'
  358. if __name__ == "__main__":
  359. if len(sys.argv) != 2 and len(sys.argv) != 4:
  360. print "Usage: import-jamendo.py <database dump>"
  361. print "or"
  362. print "import-jamendo.py <database dump> <username> <database>"
  363. sys.exit(1)
  364. if sys.argv[1][-2:] == "gz":
  365. dump = gzip.open(sys.argv[1], "r")
  366. else:
  367. dump = open(sys.argv[1], "r")
  368. if len(sys.argv) == 2:
  369. importer = JamendoImport()
  370. else:
  371. importer = JamendoImport(sys.argv[2], sys.argv[3])
  372. importer.parse(dump)
  373. importer.close()