py7zlib.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595
  1. #!/usr/bin/python -u
  2. #
  3. # Python Bindings for LZMA
  4. #
  5. # Copyright (c) 2004-2006 by Joachim Bauch, mail@joachim-bauch.de
  6. # 7-Zip Copyright (C) 1999-2005 Igor Pavlov
  7. # LZMA SDK Copyright (C) 1999-2005 Igor Pavlov
  8. #
  9. # This library is free software; you can redistribute it and/or
  10. # modify it under the terms of the GNU Lesser General Public
  11. # License as published by the Free Software Foundation; either
  12. # version 2.1 of the License, or (at your option) any later version.
  13. #
  14. # This library is distributed in the hope that it will be useful,
  15. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. # Lesser General Public License for more details.
  18. #
  19. # You should have received a copy of the GNU Lesser General Public
  20. # License along with this library; if not, write to the Free Software
  21. # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  22. #
  23. # $Id: py7zlib.py 115 2006-06-18 10:53:56Z jojo $
  24. #
  25. """Read from and write to 7zip format archives.
  26. """
  27. import pylzma
  28. from struct import pack, unpack
  29. from zlib import crc32
  30. from cStringIO import StringIO
  31. MAGIC_7Z = '7z\xbc\xaf\x27\x1c'
  32. PROPERTY_END = '\x00'
  33. PROPERTY_HEADER = '\x01'
  34. PROPERTY_ARCHIVE_PROPERTIES = '\x02'
  35. PROPERTY_ADDITIONAL_STREAMS_INFO = '\x03'
  36. PROPERTY_MAIN_STREAMS_INFO = '\x04'
  37. PROPERTY_FILES_INFO = '\x05'
  38. PROPERTY_PACK_INFO = '\x06'
  39. PROPERTY_UNPACK_INFO = '\x07'
  40. PROPERTY_SUBSTREAMS_INFO = '\x08'
  41. PROPERTY_SIZE = '\x09'
  42. PROPERTY_CRC = '\x0a'
  43. PROPERTY_FOLDER = '\x0b'
  44. PROPERTY_CODERS_UNPACK_SIZE = '\x0c'
  45. PROPERTY_NUM_UNPACK_STREAM = '\x0d'
  46. PROPERTY_EMPTY_STREAM = '\x0e'
  47. PROPERTY_EMPTY_FILE = '\x0f'
  48. PROPERTY_ANTI = '\x10'
  49. PROPERTY_NAME = '\x11'
  50. PROPERTY_CREATION_TIME = '\x12'
  51. PROPERTY_LAST_ACCESS_TIME = '\x13'
  52. PROPERTY_LAST_WRITE_TIME = '\x14'
  53. PROPERTY_ATTRIBUTES = '\x15'
  54. PROPERTY_COMMENT = '\x16'
  55. PROPERTY_ENCODED_HEADER = '\x17'
  56. class FormatError(Exception):
  57. pass
  58. class Base:
  59. """ base class with support for various basic read/write functions """
  60. def _readReal64Bit(self, file):
  61. res = file.read(8)
  62. a, b = unpack('<LL', res)
  63. return b << 32 | a, res
  64. def _read64Bit(self, file):
  65. b = ord(file.read(1))
  66. mask = 0x80
  67. for i in xrange(8):
  68. if b & mask == 0:
  69. bytes = list(unpack('%dB' % i, file.read(i)))
  70. bytes.reverse()
  71. value = (bytes and reduce(lambda x, y: long(x) << 8 | y, bytes)) or 0L
  72. highpart = b & (mask - 1)
  73. return value + (long(highpart) << (i * 8))
  74. mask >>= 1
  75. def _readBoolean(self, file, count, checkall=0):
  76. if checkall:
  77. alldefined = file.read(1)
  78. if alldefined != '\x00':
  79. return [True] * count
  80. result = []
  81. b = 0
  82. mask = 0
  83. for i in xrange(count):
  84. if mask == 0:
  85. b = ord(file.read(1))
  86. mask = 0x80
  87. result.append(b & mask != 0)
  88. mask >>= 1
  89. return result
  90. class PackInfo(Base):
  91. """ informations about packed streams """
  92. def __init__(self, file):
  93. self.packpos = self._read64Bit(file)
  94. self.numstreams = self._read64Bit(file)
  95. id = file.read(1)
  96. if id == PROPERTY_SIZE:
  97. self.packsizes = [self._read64Bit(file) for x in xrange(self.numstreams)]
  98. id = file.read(1)
  99. if id == PROPERTY_CRC:
  100. self.crcs = [self._read64Bit(file) for x in xrange(self.numstreams)]
  101. id = file.read(1)
  102. if id != PROPERTY_END:
  103. raise FormatError, 'end id expected but %s found' % repr(id)
  104. class Folder(Base):
  105. """ a "Folder" represents a stream of compressed data """
  106. def __init__(self, file):
  107. numcoders = self._read64Bit(file)
  108. self.coders = []
  109. self.digestdefined = False
  110. totalin = 0
  111. self.totalout = 0
  112. for i in xrange(numcoders):
  113. while True:
  114. b = ord(file.read(1))
  115. methodsize = b & 0xf
  116. issimple = b & 0x10 == 0
  117. noattributes = b & 0x20 == 0
  118. last_alternative = b & 0x80 == 0
  119. c = {}
  120. c['method'] = file.read(methodsize)
  121. if not issimple:
  122. c['numinstreams'] = self._read64Bit(file)
  123. c['numoutstreams'] = self._read64Bit(file)
  124. else:
  125. c['numinstreams'] = 1
  126. c['numoutstreams'] = 1
  127. totalin += c['numinstreams']
  128. self.totalout += c['numoutstreams']
  129. if c['method'][0] != '\x00':
  130. c['properties'] = file.read(self._read64Bit(file))
  131. self.coders.append(c)
  132. if last_alternative:
  133. break
  134. numbindpairs = self.totalout - 1
  135. self.bindpairs = []
  136. for i in xrange(numbindpairs):
  137. self.bindpairs.append((self._read64Bit(file), self._read64Bit(file), ))
  138. numpackedstreams = totalin - numbindpairs
  139. self.packed_indexes = []
  140. if numpackedstreams == 1:
  141. for i in xrange(totalin):
  142. if self.findInBindPair(i) < 0:
  143. self.packed_indexes.append(i)
  144. elif numpackedstreams > 1:
  145. for i in xrange(numpackedstreams):
  146. self.packed_indexes.append(self._read64Bit(file))
  147. def getUnpackSize(self):
  148. if not self.unpacksizes:
  149. return 0
  150. r = range(len(self.unpacksizes))
  151. r.reverse()
  152. for i in r:
  153. if self.findOutBindPair(i):
  154. return self.unpacksizes[i]
  155. raise 'not found'
  156. def findInBindPair(self, index):
  157. for idx in xrange(len(self.bindpairs)):
  158. a, b = self.bindpairs[idx]
  159. if a == index:
  160. return idx
  161. return -1
  162. def findOutBindPair(self, index):
  163. for idx in xrange(len(self.bindpairs)):
  164. a, b = self.bindpairs[idx]
  165. if b == index:
  166. return idx
  167. return -1
  168. class Digests(Base):
  169. """ holds a list of checksums """
  170. def __init__(self, file, count):
  171. self.defined = self._readBoolean(file, count, checkall=1)
  172. self.crcs = [unpack('<l', file.read(4))[0] for x in xrange(count)]
  173. UnpackDigests = Digests
  174. class UnpackInfo(Base):
  175. """ combines multiple folders """
  176. def __init__(self, file):
  177. id = file.read(1)
  178. if id != PROPERTY_FOLDER:
  179. raise FormatError, 'folder id expected but %s found' % repr(id)
  180. self.numfolders = self._read64Bit(file)
  181. self.folders = []
  182. external = file.read(1)
  183. if external == '\x00':
  184. self.folders = [Folder(file) for x in xrange(self.numfolders)]
  185. elif external == '\x01':
  186. self.datastreamidx = self._read64Bit(file)
  187. else:
  188. raise FormatError, '0x00 or 0x01 expected but %s found' % repr(external)
  189. id = file.read(1)
  190. if id != PROPERTY_CODERS_UNPACK_SIZE:
  191. raise FormatError, 'coders unpack size id expected but %s found' % repr(id)
  192. for folder in self.folders:
  193. folder.unpacksizes = [self._read64Bit(file) for x in xrange(folder.totalout)]
  194. id = file.read(1)
  195. if id == PROPERTY_CRC:
  196. digests = UnpackDigests(file, self.numfolders)
  197. for idx in xrange(self.numfolders):
  198. folder = self.folders[idx]
  199. folder.digestdefined = digests.defined[idx]
  200. folder.crc = digests.crcs[idx]
  201. id = file.read(1)
  202. if id != PROPERTY_END:
  203. raise FormatError, 'end id expected but %s found' % repr(id)
  204. class SubstreamsInfo(Base):
  205. """ defines the substreams of a folder """
  206. def __init__(self, file, numfolders, folders):
  207. self.digests = []
  208. self.digestsdefined = []
  209. id = file.read(1)
  210. if id == PROPERTY_NUM_UNPACK_STREAM:
  211. self.numunpackstreams = [self._read64Bit(file) for x in xrange(numfolders)]
  212. id = file.read(1)
  213. else:
  214. self.numunpackstreams = []
  215. for idx in xrange(numfolders):
  216. self.numunpackstreams.append(1)
  217. if id == PROPERTY_SIZE:
  218. sum = 0
  219. self.unpacksizes = []
  220. for i in xrange(len(self.numunpackstreams)):
  221. for j in xrange(1, self.numunpackstreams[i]):
  222. size = self._read64Bit(file)
  223. self.unpacksizes.append(size)
  224. sum += size
  225. self.unpacksizes.append(folders[i].getUnpackSize() - sum)
  226. id = file.read(1)
  227. if id == PROPERTY_CRC:
  228. numdigests = 0
  229. numdigeststotal = 0
  230. for i in xrange(numfolders):
  231. numsubstreams = self.numunpackstreams[i]
  232. if numsubstreams != 1 or not folders[i].digestdefined:
  233. numdigests += numsubstreams
  234. numdigeststotal += numsubstreams
  235. digests = Digests(file, numdigests)
  236. didx = 0
  237. for i in xrange(numfolders):
  238. folder = folders[i]
  239. numsubstreams = self.numunpackstreams[i]
  240. if numsubstreams == 1 and folder.digestdefined:
  241. self.digestsdefined.append(True)
  242. self.digests.append(folder.crc)
  243. else:
  244. for j in xrange(numsubstreams):
  245. self.digestsdefined.append(digests.defined[didx])
  246. self.digests.append(digests.crcs[didx])
  247. didx += 1
  248. id = file.read(1)
  249. if id != PROPERTY_END:
  250. raise FormatError, 'end id expected but %s found' % repr(id)
  251. if not self.digestsdefined:
  252. self.digestsdefined = [False] * numdigeststotal
  253. self.digests = [0] * numdigeststotal
  254. class StreamsInfo(Base):
  255. """ informations about compressed streams """
  256. def __init__(self, file):
  257. id = file.read(1)
  258. if id == PROPERTY_PACK_INFO:
  259. self.packinfo = PackInfo(file)
  260. id = file.read(1)
  261. if id == PROPERTY_UNPACK_INFO:
  262. self.unpackinfo = UnpackInfo(file)
  263. id = file.read(1)
  264. if id == PROPERTY_SUBSTREAMS_INFO:
  265. self.substreamsinfo = SubstreamsInfo(file, self.unpackinfo.numfolders, self.unpackinfo.folders)
  266. id = file.read(1)
  267. if id != PROPERTY_END:
  268. raise FormatError, 'end id expected but %s found' % repr(id)
  269. class FilesInfo(Base):
  270. """ holds file properties """
  271. def _readTimes(self, file, files, name):
  272. defined = self._readBoolean(file, len(files), checkall=1)
  273. for i in xrange(len(files)):
  274. if defined[i]:
  275. files[i][name] = self._readReal64Bit(file)[0] #unpack('<L', file.read(4))[0]
  276. else:
  277. files[i][name] = None
  278. def __init__(self, file):
  279. self.numfiles = self._read64Bit(file)
  280. self.files = [{'emptystream': False} for x in xrange(self.numfiles)]
  281. numemptystreams = 0
  282. while True:
  283. typ = self._read64Bit(file)
  284. if typ > 255:
  285. raise FormatError, 'invalid type, must be below 256, is %d' % typ
  286. typ = chr(typ)
  287. if typ == PROPERTY_END:
  288. break
  289. size = self._read64Bit(file)
  290. buffer = StringIO(file.read(size))
  291. if typ == PROPERTY_EMPTY_STREAM:
  292. isempty = self._readBoolean(buffer, self.numfiles)
  293. map(lambda x, y: x.update({'emptystream': y}), self.files, isempty)
  294. for x in isempty:
  295. if x: numemptystreams += 1
  296. emptyfiles = [False] * numemptystreams
  297. antifiles = [False] * numemptystreams
  298. elif typ == PROPERTY_EMPTY_FILE:
  299. emptyfiles = self._readBoolean(buffer, numemptystreams)
  300. elif typ == PROPERTY_ANTI:
  301. antifiles = self._readBoolean(buffer, numemptystreams)
  302. elif typ == PROPERTY_NAME:
  303. external = buffer.read(1)
  304. if external != '\x00':
  305. self.dataindex = self._read64Bit(buffer)
  306. # XXX: evaluate external
  307. raise NotImplementedError
  308. for f in self.files:
  309. name = ''
  310. while True:
  311. ch = buffer.read(2)
  312. if ch == '\0\0':
  313. f['filename'] = unicode(name, 'utf-16')
  314. break
  315. name += ch
  316. elif typ == PROPERTY_CREATION_TIME:
  317. self._readTimes(buffer, self.files, 'creationtime')
  318. elif typ == PROPERTY_LAST_ACCESS_TIME:
  319. self._readTimes(buffer, self.files, 'lastaccesstime')
  320. elif typ == PROPERTY_LAST_WRITE_TIME:
  321. self._readTimes(buffer, self.files, 'lastwritetime')
  322. elif typ == PROPERTY_ATTRIBUTES:
  323. defined = self._readBoolean(buffer, self.numfiles, checkall=1)
  324. for i in xrange(self.numfiles):
  325. f = self.files[i]
  326. if defined[i]:
  327. f['attributes'] = unpack('<L', buffer.read(4))[0]
  328. else:
  329. f['attributes'] = None
  330. else:
  331. raise FormatError, 'invalid type %s' % repr(typ)
  332. class Header(Base):
  333. """ the archive header """
  334. def __init__(self, file):
  335. id = file.read(1)
  336. if id == PROPERTY_ARCHIVE_PROPERTIES:
  337. self.properties = ArchiveProperties(file)
  338. id = file.read(1)
  339. if id == PROPERTY_ADDITIONAL_STREAMS_INFO:
  340. self.additional_streams = StreamsInfo(file)
  341. id = file.read(1)
  342. if id == PROPERTY_MAIN_STREAMS_INFO:
  343. self.main_streams = StreamsInfo(file)
  344. id = file.read(1)
  345. if id == PROPERTY_FILES_INFO:
  346. self.files = FilesInfo(file)
  347. id = file.read(1)
  348. if id != PROPERTY_END:
  349. raise FormatError, 'end id expected but %s found' % (repr(id))
  350. class ArchiveFile:
  351. """ wrapper around a file in the archive """
  352. def __init__(self, info, start, src_start, size, folder, archive, maxsize=None):
  353. self.digest = None
  354. self._archive = archive
  355. self._file = archive._file
  356. self._start = start
  357. self._src_start = src_start
  358. self._folder = folder
  359. self.size = size
  360. # maxsize is only valid for solid archives
  361. self._maxsize = maxsize
  362. for k, v in info.items():
  363. setattr(self, k, v)
  364. self.reset()
  365. def reset(self):
  366. self.pos = 0
  367. def read(self):
  368. data = ''
  369. idx = 0
  370. cnt = 0
  371. dec = pylzma.decompressobj(maxlength=self._start+self.size)
  372. self._file.seek(self._src_start)
  373. dec.decompress(self._folder.coders[0]['properties'])
  374. total = self.compressed
  375. if total is None:
  376. remaining = self._start+self.size
  377. out = StringIO()
  378. while remaining > 0:
  379. data = self._file.read(1024)
  380. tmp = dec.decompress(data, remaining)
  381. out.write(tmp)
  382. remaining -= len(tmp)
  383. data = out.getvalue()
  384. else:
  385. data = dec.decompress(self._file.read(total), self._start+self.size)
  386. return data[self._start:self._start+self.size]
  387. def checkcrc(self):
  388. if self.digest is None:
  389. return True
  390. self.reset()
  391. data = self.read()
  392. crc = crc32(data)
  393. # make crc unsigned
  394. # XXX: better way to do this?
  395. crc = unpack('<l', pack('<L', crc))[0]
  396. return crc == self.digest
  397. class Archive7z(Base):
  398. """ the archive itself """
  399. def __init__(self, file):
  400. self._file = file
  401. self.header = file.read(len(MAGIC_7Z))
  402. if self.header != MAGIC_7Z:
  403. raise FormatError, 'not a 7z file'
  404. self.version = unpack('BB', file.read(2))
  405. self.startheadercrc = unpack('<l', file.read(4))[0]
  406. self.nextheaderofs, data = self._readReal64Bit(file)
  407. crc = crc32(data)
  408. self.nextheadersize, data = self._readReal64Bit(file)
  409. crc = crc32(data, crc)
  410. data = file.read(4)
  411. self.nextheadercrc = unpack('<l', data)[0]
  412. crc = crc32(data, crc)
  413. if crc != self.startheadercrc:
  414. raise FormatError, 'invalid header data'
  415. self.afterheader = file.tell()
  416. file.seek(self.nextheaderofs, 1)
  417. buffer = StringIO(file.read(self.nextheadersize))
  418. if crc32(buffer.getvalue()) != self.nextheadercrc:
  419. raise FormatError, 'invalid header data'
  420. while True:
  421. id = buffer.read(1)
  422. if id == PROPERTY_HEADER:
  423. break
  424. if id != PROPERTY_ENCODED_HEADER:
  425. raise 'Unknown field:', repr(id)
  426. streams = StreamsInfo(buffer)
  427. file.seek(self.afterheader + 0)
  428. data = ''
  429. idx = 0
  430. for folder in streams.unpackinfo.folders:
  431. file.seek(streams.packinfo.packpos, 1)
  432. props = folder.coders[0]['properties']
  433. for idx in xrange(len(streams.packinfo.packsizes)):
  434. tmp = file.read(streams.packinfo.packsizes[idx])
  435. data += pylzma.decompress(props+tmp, maxlength=folder.unpacksizes[idx])
  436. if folder.digestdefined:
  437. if folder.crc != crc32(data):
  438. raise FormatError, 'invalid block data'
  439. buffer = StringIO(data)
  440. self.header = Header(buffer)
  441. self.files = []
  442. files = self.header.files
  443. folders = self.header.main_streams.unpackinfo.folders
  444. packinfo = self.header.main_streams.packinfo
  445. subinfo = self.header.main_streams.substreamsinfo
  446. packsizes = packinfo.packsizes
  447. self.solid = packinfo.numstreams == 1
  448. if self.solid:
  449. # the files are stored in substreams
  450. if hasattr(subinfo, 'unpacksizes'):
  451. unpacksizes = subinfo.unpacksizes
  452. else:
  453. unpacksizes = [x.unpacksizes[0] for x in folders]
  454. else:
  455. # every file has it's own folder with compressed data
  456. unpacksizes = [x.unpacksizes[0] for x in folders]
  457. fidx = 0
  458. obidx = 0
  459. src_pos = self.afterheader
  460. pos = 0
  461. maxsize = (self.solid and packinfo.packsizes[0]) or None
  462. for idx in xrange(files.numfiles):
  463. info = files.files[idx]
  464. folder = folders[fidx]
  465. if not info['emptystream']:
  466. info['compressed'] = (not self.solid and packsizes[obidx]) or None
  467. info['uncompressed'] = unpacksizes[obidx]
  468. file = ArchiveFile(info, pos, src_pos, unpacksizes[obidx], folder, self, maxsize=maxsize)
  469. if subinfo.digestsdefined[obidx]:
  470. file.digest = subinfo.digests[obidx]
  471. self.files.append(file)
  472. if self.solid:
  473. pos += unpacksizes[obidx]
  474. else:
  475. src_pos += packsizes[obidx]
  476. obidx += 1
  477. if not self.solid:
  478. fidx += 1
  479. self.numfiles = len(self.files)
  480. self.filenames = map(lambda x: x.filename, self.files)
  481. # interface like TarFile
  482. def getmember(self, name):
  483. # XXX: store files in dictionary
  484. for f in self.files:
  485. if f.filename == name:
  486. return f
  487. return None
  488. def getmembers(self):
  489. return self.files
  490. def getnames(self):
  491. return self.filenames
  492. def list(self, verbose=True):
  493. print 'total %d files in %sarchive' % (self.numfiles, (self.solid and 'solid ') or '')
  494. if not verbose:
  495. print '\n'.join(self.filenames)
  496. return
  497. for f in self.files:
  498. extra = (f.compressed and '%10d ' % (f.compressed)) or ' '
  499. print '%10d%s%s %s' % (f.size, extra, hex(f.digest)[2:-1], f.filename)
  500. if __name__ == '__main__':
  501. f = Archive7z(open('test.7z', 'rb'))
  502. #f = Archive7z(open('pylzma.7z', 'rb'))
  503. f.list()