DumpFiles.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# COPYRIGHT: Openmoko Inc. 2010
# LICENSE: GPL Version 3 or later
# DESCRIPTION: Dump various files
# AUTHORS: Christopher Hall <hsw@openmoko.com>
  7. import sys, os
  8. import struct
  9. import os.path
  10. import pylzma
  11. import getopt
  12. import PrintLog
  13. import pylzma
  14. import locale
  15. locale.setlocale(locale.LC_ALL, '')
  16. verbose = False
  17. def usage(message):
  18. if None != message:
  19. print('error: {0:s}'.format(message))
  20. print('usage: {0:s} <options> <indices...>'.format(os.path.basename(__file__)))
  21. print(' --help This message')
  22. print(' --verbose Enable verbose output')
  23. print(' --dir=<dir> Directory containing files [image/enpedia]')
  24. print(' --extract=<prefix> Prefix for extracted data (no extraction)')
  25. exit(1)
  26. def main():
  27. global verbose
  28. try:
  29. opts, args = getopt.getopt(sys.argv[1:],
  30. 'hvd:e:',
  31. ['help',
  32. 'verbose',
  33. 'dir=',
  34. 'extract=',
  35. ])
  36. except getopt.GetoptError, err:
  37. usage(err)
  38. verbose = False
  39. dir = 'image/enpedia'
  40. extract = None
  41. for opt, arg in opts:
  42. if opt in ('-v', '--verbose'):
  43. verbose = True
  44. elif opt in ('-h', '--help'):
  45. usage(None)
  46. elif opt in ('-d', '--dir'):
  47. dir = arg
  48. elif opt in ('-e', '--extract'):
  49. extract = arg
  50. else:
  51. usage('unhandled option: ' + opt)
  52. if not os.path.isdir(dir):
  53. usage('{0:s} is not a directory'.format(dir))
  54. idx_file = open(os.path.join(dir, "wiki.idx"), "rb")
  55. fnd_file = open(os.path.join(dir, "wiki.fnd"), "rb")
  56. dat_format = os.path.join(dir, "wiki{0:d}.dat")
  57. index_min = 1
  58. index_max = struct.unpack('<I', idx_file.read(4))[0]
  59. PrintLog.message('Total index entries = {0:d}'.format(index_max))
  60. PrintLog.message('')
  61. for item in args:
  62. try:
  63. index_number = int(item, 0)
  64. except ValueError:
  65. usage('"{0:s}" is not numeric'.format(item))
  66. if index_number < index_min or index_number > index_max:
  67. usage('index: {0:d} is outdide [{1:d} .. {2:d}]'.format(index_number, index_min, index_max))
  68. process(index_number, idx_file, fnd_file, dat_format, extract)
  69. idx_file.close()
  70. fnd_file.close()
  71. def process(index_number, idx_file, fnd_file, dat_format, extract):
  72. """dump the index and fnd file entries"""
  73. PrintLog.message('Index number = {0:13n} [0x{0:08x}]'.format(index_number))
  74. PrintLog.message('')
  75. uint32_size = 4
  76. index_entry_size = 2 * uint32_size + 1
  77. index_offset = uint32_size + index_entry_size * (index_number - 1)
  78. idx_file.seek(index_offset)
  79. offset_dat, offset_fnd, file_id = struct.unpack('<2IB', idx_file.read(index_entry_size))
  80. data_file_name = dat_format.format(file_id)
  81. PrintLog.message('Index offset = {0:13n} [0x{0:08x}]'.format(index_offset))
  82. PrintLog.message('Data offset = {0:13n} [0x{0:08x}]'.format(offset_dat))
  83. PrintLog.message('FND offset = {0:13n} [0x{0:08x}]'.format(offset_fnd))
  84. PrintLog.message('File ID = {0:13n} [0x{0:08x}] => "{1:s}"'.format(file_id, data_file_name))
  85. fnd_file.seek(offset_fnd)
  86. article_index_check = struct.unpack('<I', fnd_file.read(uint32_size))[0]
  87. index_match = '(Matches)' if article_index_check == index_number else '(**MISMATCHED INDEX**)'
  88. PrintLog.message('FND index = {0:13n} [0x{0:08x}] {1:s}'.format(article_index_check, index_match))
  89. ignored = fnd_file.read(1) # skip nul byte
  90. titles = fnd_file.read(1024).split('\0') # >= 2 * MAX_TITLE_SEARCH
  91. PrintLog.message('FND title = "{0!r:s}"'.format(titles[1])) # actual title
  92. dat_file = open(data_file_name, 'rb')
  93. dat_file.seek(offset_dat)
  94. number_of_pages = struct.unpack('<B', dat_file.read(1))[0]
  95. PrintLog.message('Data Pages = {0:13n} [0x{0:08x}]'.format(number_of_pages))
  96. PrintLog.message('')
  97. total_article_bytes = 0
  98. PrintLog.message('{0:>29s}{1:>25s}{2:>25s}'.format('Article Number', 'Article Offset', 'Uncompressed Length'))
  99. for i in range(0, number_of_pages):
  100. page_id, page_offset, page_length = struct.unpack('<3I', dat_file.read(3 * uint32_size))
  101. restricted = 'Restricted' if (0 != page_offset & 0x80000000) else ''
  102. page_offset = page_offset & 0x7fffffff
  103. total_article_bytes += page_length
  104. PrintLog.message('{0:3d}: {1:10n} [0x{1:08x}] {2:10n} [0x{2:08x}] {3:10n} [0x{3:08x}] {4:s}'
  105. .format(i, page_id, page_offset, page_length, restricted))
  106. PrintLog.message('{0:<{1}s}{2:10n} [0x{2:08x}]'.format('Total bytes: ', 3+3+10+4+8+3+10+4+8+3, total_article_bytes))
  107. PrintLog.message('')
  108. data_length = struct.unpack('<I', dat_file.read(4))[0]
  109. PrintLog.message('DataLength = {0:13n} [0x{0:08x}]'.format(data_length))
  110. article_data = dat_file.read(data_length)
  111. dat_file.close()
  112. if extract is not None:
  113. output_file_name = extract + '-I' + str(index_number) + '-b' + str(data_length) + '.articles'
  114. PrintLog.message('Extracting uncompressed articles to: {0:s}'.format(output_file_name))
  115. out = open(output_file_name, 'wb')
  116. out.write(pylzma.decompress(article_data))
  117. out.close()
  118. PrintLog.message('')
  119. # run the program
  120. if __name__ == "__main__":
  121. main()