DumpFnd.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. #! /usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # COPYRIGHT: Openmoko Inc. 2010
  4. # LICENSE: GPL Version 3 or later
  5. # DESCRIPTION: dump the contents of an fnd file - just a test
  6. # AUTHORS: Christopher Hall <hsw@openmoko.com>
  7. import sys, os
  8. import struct
  9. import os.path
  10. import pylzma
  11. import getopt
  12. import PrintLog
  13. import pylzma
  14. import locale
# Adopt the user's locale so that the 'n' number format used in main()
# prints locale-aware digit grouping.
locale.setlocale(locale.LC_ALL, '')

# Module-level flag; main() assigns it from the -v/--verbose option.
verbose = False
  17. def usage(message):
  18. if None != message:
  19. print('error: {0:s}'.format(message))
  20. print('usage: {0:s} <options> <fnd-file>'.format(os.path.basename(__file__)))
  21. print(' --help This message')
  22. print(' --verbose Enable verbose output')
  23. exit(1)
  24. def main():
  25. global verbose
  26. try:
  27. opts, args = getopt.getopt(sys.argv[1:],
  28. 'hv',
  29. ['help',
  30. 'verbose',
  31. ])
  32. except getopt.GetoptError, err:
  33. usage(err)
  34. verbose = False
  35. uint32_size = 4
  36. for opt, arg in opts:
  37. if opt in ('-v', '--verbose'):
  38. verbose = True
  39. elif opt in ('-h', '--help'):
  40. usage(None)
  41. else:
  42. usage('unhandled option: ' + opt)
  43. if len(args) != 1:
  44. usage('missing argument')
  45. fnd_file = open(args[0], "rb")
  46. total_entries = 0
  47. bigram_table = {}
  48. for i in range(128,256):
  49. bigram_table[i] = fnd_file.read(2)
  50. previous_title1 = ''
  51. previous_title2 = ''
  52. while True:
  53. fnd_offset = fnd_file.tell()
  54. header = fnd_file.read(uint32_size + 1)
  55. if 0 == len(header):
  56. break
  57. article_number, nul_byte = struct.unpack('<IB', header)
  58. title1 = get_title(fnd_file)
  59. title2 = get_title(fnd_file)
  60. total_entries += 1
  61. length1 = len(title1)
  62. length2 = len(title2)
  63. if 0 != length1 and title1[0] < ' ':
  64. prefix_length = ord(title1[0]) + 1
  65. title1 = previous_title1[:prefix_length] + title1[1:]
  66. if 0 != length2 and title2[0] < ' ':
  67. prefix_length = ord(title2[0]) + 1
  68. title2 = previous_title2[:prefix_length] + title2[1:]
  69. full_length1 = len(title1)
  70. full_length2 = len(title2)
  71. decoded_title1 = ''
  72. for c in title1:
  73. i = ord(c)
  74. if i in bigram_table:
  75. decoded_title1 += bigram_table[i]
  76. else:
  77. decoded_title1 += c
  78. PrintLog.message(u'Index: {an:13n} @ Offset: {of:13n} [0x{of:08x}]\n'
  79. u'{pad1:s}[{l1:3d}/{fl1:3d}]:{t1!r:s}\n'
  80. u'{pad1:s}{pad2}{dt1!r:s}\n'
  81. u'{pad1:s}[{fl1:3d}/{fl2:3d}]:"{t2:s}"\n'
  82. .format(of = fnd_offset, an = article_number,
  83. l1 = length1, fl1 = full_length1, t1 = title1, dt1 = decoded_title1,
  84. pad1 = ' ' * 2, pad2 = ' ' * (2 * 3 + 4),
  85. l2 = length2, fl2 = full_length2, t2 = truncated_utf8(title2)))
  86. previous_title1 = title1
  87. previous_title2 = title2
  88. fnd_file.close()
  89. PrintLog.message(u'Total entries = {0:13n}'.format(total_entries))
  90. def truncated_utf8(text):
  91. """converted text to unicode even if the string is truncated"""
  92. while len(text) > 0:
  93. try:
  94. return unicode(text, 'utf-8')
  95. except UnicodeDecodeError:
  96. pass
  97. text = text[:-1]
  98. return u''
  99. def get_title(f):
  100. c = 'X'
  101. title = ''
  102. while '\0' != c:
  103. c = f.read(1)
  104. title += c
  105. return title[:-1]
  106. # run the program
  107. if __name__ == "__main__":
  108. main()