#! /usr/bin/env python
# -*- coding: utf-8 -*-
# COPYRIGHT: Openmoko Inc. 2010
# LICENSE: GPL Version 3 or later
# DESCRIPTION: Count the articles in different size ranges
# AUTHORS: Christopher Hall <hsw@openmoko.com>

import sys, os
import struct
import os.path
import pylzma
import getopt
import PrintLog
import locale
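
# select the user's default locale so that the '{:n}' formats used
# below print numbers with locale-appropriate digit grouping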
locale.setlocale(locale.LC_ALL, '')

verbose = False


def usage(message):
    if None != message:
        print('error: {0:s}'.format(message))
    print('usage: {0:s} <options>'.format(os.path.basename(__file__)))
    print('       --help         This message')
    print('       --verbose      Enable verbose output')
    print('       --dir=<dir>    Directory containing files [image/enpedia]')
    exit(1)


def main():
    global verbose
    global sizes
    global distribution
    global dist_list
    global total
    global byte_count

    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   'hvd:',
                                   ['help',
                                    'verbose',
                                    'dir='])
    except getopt.GetoptError, err:
        usage(err)

    verbose = False
    dir = 'image/enpedia'

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-d', '--dir'):
            dir = arg
        else:
            usage('unhandled option: ' + opt)

    if not os.path.isdir(dir):
        usage('{0:s} is not a directory'.format(dir))

    idx_file = open(os.path.join(dir, "wiki.idx"), "rb")
    fnd_file = open(os.path.join(dir, "wiki.fnd"), "rb")
    dat_format = os.path.join(dir, "wiki{0:d}.dat")

    index_min = 1
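    # wiki.idx begins with a little-endian uint32 giving the number of
    # index entries; the fixed-size entries follow (see process() below)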
    index_max = struct.unpack('<I', idx_file.read(4))[0]
    PrintLog.message('Total index entries = {0:d}'.format(index_max))

    total = 0
    sizes = {}
    distribution = {}
    byte_count = {}
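    # upper bounds, in bytes, of the article size buckets; the final
    # entry is a catch-all for anything larger than 500000 bytes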
    dist_list = [100, 200, 300, 400, 500, 600, 700, 800, 900,
                 1000, 2000, 3000, 5000, 7500,
                 10000, 20000, 50000,
                 100000, 200000, 500000,
                 99999999]
    for d in dist_list:
        distribution[d] = 0
        byte_count[d] = 0
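    # index numbers are 1-based; process() records every page in a data
    # group, so skip indexes already counted via an earlier group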
    for item in range(index_max):
        index_number = 1 + item
        if index_number not in sizes:
            process(index_number, idx_file, fnd_file, dat_format)

    PrintLog.message('{0:>10s} {1:>20s} {2:>20s} {3:>14s}'.format('Size(<=)', 'Articles', 'Accumulated', 'Bytes'))
    sum = 0
    for i in dist_list:
        sum += distribution[i]
        PrintLog.message('{0:10n} = {1:10n} {2:7.1f} % {3:10n} {4:7.1f} % {5:14n}'
                         .format(i,
                                 distribution[i],
                                 100.0 * distribution[i] / index_max,
                                 sum,
                                 100.0 * sum / index_max,
                                 byte_count[i]))
    PrintLog.message('summed = {0:10n}'.format(sum))
    PrintLog.message('sizes = {0:10n}'.format(len(sizes)))
    PrintLog.message('total = {0:10n}'.format(total))

    idx_file.close()
    fnd_file.close()


def process(index_number, idx_file, fnd_file, dat_format):
    """record the size of every page in the data group for one index entry"""

    global verbose
    global sizes
    global distribution
    global dist_list
    global total
    global byte_count

    if verbose:
        PrintLog.message('Index number = {0:10n} 0x{0:08x}'.format(index_number))

    uint32_size = 4
    index_entry_size = 2 * uint32_size + 1
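    # each wiki.idx entry is 9 bytes: two little-endian uint32 values
    # (offset into the data file and offset into wiki.fnd) plus a
    # one-byte data file number; entries follow the 4-byte count header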
    index_offset = uint32_size + index_entry_size * (index_number - 1)
    idx_file.seek(index_offset)
    offset_dat, offset_fnd, file_id = struct.unpack('<2IB', idx_file.read(index_entry_size))

    data_file_name = dat_format.format(file_id)
    dat_file = open(data_file_name, 'rb')
    dat_file.seek(offset_dat)
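    # a data group starts with a one-byte page count followed by one
    # 12-byte header (id, offset, length) for each page in the group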
    number_of_pages = struct.unpack('B', dat_file.read(1))[0]
    for i in range(0, number_of_pages):
        page_id, page_offset, page_length = struct.unpack('<3I', dat_file.read(12))
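        # the top bit of page_offset marks a restricted article; mask it
        # off (neither value is used further by this size count)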
        restricted = 'Restricted' if (0 != page_offset & 0x80000000) else ''
        page_offset = page_offset & 0x7fffffff
        if page_id in sizes:
            PrintLog.message('DUP: {0:10n}'.format(page_id))
        sizes[page_id] = page_length
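        # count the page in the first bucket whose upper bound holds it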
        for d in dist_list:
            if page_length <= d:
                distribution[d] += 1
                byte_count[d] += page_length
                total += 1
                break
    dat_file.close()


# run the program
if __name__ == "__main__":
    main()