// mdict-parser.js
//define(['pako', 'lzo', 'ripemd128', 'bluebird', 'mdict-parseXml', 'mdict-MCommon']
//pako, lzo, ripemd128, Promise, parseXml, MCommon
var parseXml = function (str) {
  return (new DOMParser()).parseFromString(str, 'text/xml');
};

var MParser = (function () {
  // Value of undefined.
  var UNDEFINED = void 0;

  // A shared UTF-16LE text decoder used to read the dictionary header string.
  var UTF_16LE = new TextDecoder('utf-16le');

  /**
   * Return the first argument as result.
   * This function is used to simulate a sequence of operations, i.e. read data and return it, then forward to a new position.
   * @param any data or function call
   * @return the first argument
   */
  function conseq(/* args... */) { return arguments[0]; }

  /*
   * Decrypt an encrypted data block of the keyword index (attrs.Encrypted = "2").
   * @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#keyword-index-encryption
   * @param buf a Uint8Array containing source data
   * @param key a Uint8Array holding the decryption key, which will be passed through ripemd128() before decryption
   * @return the same Uint8Array, decrypted in place
   */
  function decrypt(buf, key) {
    key = ripemd128(key);
    var byte, keylen = key.length, prev = 0x36, i = 0, len = buf.length;
    for (; i < len; i++) {
      byte = buf[i];
      byte = ((byte >> 4) | (byte << 4));                 // & 0xFF; <-- it's already a byte
      byte = byte ^ prev ^ (i & 0xFF) ^ key[i % keylen];
      prev = buf[i];
      buf[i] = byte;
    }
    return buf;
  }
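
  // Illustrative sketch only (the real call site is Scanner#readBlock() further below): the keyword
  // index is decrypted with an 8-byte key built from the block's own checksum plus fixed bytes.
  // checksumBytes and cipherBytes are hypothetical names for the Uint8Array views created in readBlock().
  //   var passkey = new Uint8Array(8);
  //   passkey.set(checksumBytes);                  // 4 checksum bytes preceding the encrypted data
  //   passkey.set([0x95, 0x36, 0x00, 0x00], 4);    // fixed trailing bytes
  //   var plain = decrypt(cipherBytes, passkey);   // cipherBytes is modified in place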

  /**
   * For sliceThen(..).exec(proc, ..), mark that what the proc function returns is multiple values
   * to be passed to a subsequent Promise#spread(..) call.
   */
  function spreadus() {
    var args = Array.prototype.slice.apply(arguments);
    args._spreadus_ = true;
    return args;
  }

  /**
   * Slice part of a File/Blob object and return a promise which resolves to an ArrayBuffer to feed subsequent processing.
   * The returned promise is extended with an exec(proc, args...) method which can be chained with further processing.
   * @param file File or Blob object
   * @param offset start position of the slice
   * @param len length to slice
   * @return a promise which resolves to an ArrayBuffer containing the data read
   */
  function sliceThen(file, offset, len) {
    var p = new Promise(function(_resolve) {
      var reader = new FileReader();
      reader.onload = function() { _resolve(reader.result); };
      reader.readAsArrayBuffer(file.slice(offset, offset + len));
    });

    /**
     * Call proc with the specified arguments, prepended with the sliced file/blob data (ArrayBuffer) that was read.
     * @param the first argument is the function to be executed
     * @param other optional arguments are passed to the function after the automatically supplied input ArrayBuffer
     * @return a promise which can be chained with further processing through its spread() method
     */
    p.exec = function(proc /*, args... */) {
      var args = Array.prototype.slice.call(arguments, 1);
      return p.then(function(data) {
        args.unshift(data);
        var ret = proc.apply(null, args);
        return resolve(ret !== UNDEFINED && ret._spreadus_ ? ret : [ret]);
      });
    };

    return p;
  }

  /**
   * Wrap value as a resolved promise.
   */
  function resolve(value) { return Promise.resolve(value); }

  /**
   * Wrap value as a rejected promise.
   */
  function reject(reason) { return Promise.reject(reason); }

  /**
   * Harvest any resolved promises; if all failed then return the reasons.
   */
  function harvest(outcomes) {
    return Promise.settle(outcomes).then(function(results) {
      if (results.length === 0) {
        return reject("** NOT FOUND **");
      }

      var solved = [], failed = [];
      for (var i = 0; i < results.length; i++) {
        if (results[i].isResolved()) {
          solved.push(results[i].value());
        } else {
          failed.push(results[i].reason());
        }
      }
      return solved.length ? solved : failed;
    });
  }

  /*
   * Create a record block table object to load record block info from the record section of an mdx/mdd file.
   * Retrieved data is stored in a Uint32Array which contains N pairs of (offset_comp, offset_decomp) values,
   * where N is the number of record blocks.
   *
   * When looking up a given key for its definition:
   *   1. Search KEY_INDEX to locate the keyword block containing the given key.
   *   2. Scan the found keyword block to get the key's record offset and size.
   *   3. Search RECORD_BLOCK_TABLE to get the record block containing the record.
   *   4. Load the found record block, then use the record's offset and size to retrieve its content.
   *
   * @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#record-section
   */
  function createRecordBlockTable() {
    var pos = 0, // current position
        arr;     // backing Uint32Array
    return {
      // Allocate the Uint32Array for storing the record block table, where len is the number of
      // offset pairs to store (number of record blocks + 1 terminating pair, see read_record_block()).
      alloc:  function(len) {
        arr = new Uint32Array(len * 2);
      },
      // Store an offset pair (compressed & decompressed) for a record block.
      // NOTE: offset_comp is an absolute offset counted from the start of the mdx/mdd file.
      put:    function(offset_comp, offset_decomp) {
        arr[pos++] = offset_comp; arr[pos++] = offset_decomp;
      },
      // Given the offset of a keyword's record after decompression, return info of the record block containing it, or undefined if not found.
      find:   function(keyAt) {
        var hi = (arr.length >> 1) - 1, lo = 0, i = (lo + hi) >> 1, val = arr[(i << 1) + 1];

        if (keyAt > arr[(hi << 1) + 1] || keyAt < 0) {
          return;
        }

        while (true) {
          if (hi - lo <= 1) {
            if (i < hi) {
              return {
                block_no:      i,
                comp_offset:   arr[i <<= 1],
                comp_size:     arr[i + 2] - arr[i],
                decomp_offset: arr[i + 1],
                decomp_size:   arr[i + 3] - arr[i + 1]
              };
            } else {
              return;
            }
          }

          (keyAt < val) ? hi = i : lo = i;
          i = (lo + hi) >> 1;
          val = arr[(i << 1) + 1];
        }
      },
    };
  }
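
  // Usage sketch with made-up offsets, assuming 2 record blocks (the extra terminating pair lets
  // comp_size/decomp_size of the last block be computed):
  //   var table = createRecordBlockTable();
  //   table.alloc(3);
  //   table.put(1024, 0);        // block #0 starts at file offset 1024, decompressed offset 0
  //   table.put(5120, 8192);     // block #1 starts at file offset 5120, decompressed offset 8192
  //   table.put(9216, 16384);    // terminating pair, marks the end of block #1
  //   table.find(10000);         // -> { block_no: 1, comp_offset: 5120, comp_size: 4096,
  //                              //      decomp_offset: 8192, decomp_size: 8192 }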

  /**
   * Test whether a value of a dictionary attribute is true or not.
   */
  function isTrue(v) {
    v = ((v || false) + '').toLowerCase();
    return v === 'yes' || v === 'true';
  }

  /**
   * Parse a MDict dictionary/resource file (mdx/mdd).
   * @param file a File/Blob object
   * @param ext file extension, mdx/mdd
   * @return a Promise object which will resolve to a lookup function.
   */
  function parse_mdict(file, ext) {

    var KEY_INDEX,                                      // keyword index array
        RECORD_BLOCK_TABLE = createRecordBlockTable();  // record block table

    var attrs = {},      // storing dictionary attributes
        _v2,             // true if engine version >= 2
        _bpu,            // bytes per unit when converting text size to byte length for text data
        _tail,           // need to skip extra tail bytes after decoding text
        _decoder,        // text decoder

        _decryptors  = [false, false],
                         // [keyword_header_decryptor, keyword_index_decryptor], only keyword_index_decryptor is supported

        _searchTextLen,  // search for NUL to get text length

        _readShort   = function(scanner) { return scanner.readUint8(); },
                         // read a "short" number representing keyword text size, 8-bit for version < 2, 16-bit for version >= 2

        _readNum     = function(scanner) { return scanner.readInt(); },
                         // read a number representing offset or data block size, 32-bit for version < 2, 64-bit for version >= 2 (only lower 32-bit used)

        _checksum_v2 = function() {},
                         // checksum, version >= 2.0 only

        _adaptKey    = function(key) { return key; },
                         // adapt a key by converting to lower case or stripping punctuation according to dictionary attributes (KeyCaseSensitive, StripKey)

        _slice = sliceThen.bind(null, file);
                         // bind sliceThen() with the file argument

    /**
     * Config scanner according to dictionary attributes.
     */
    function config() {
      attrs.Encoding = attrs.Encoding || 'UTF-16';

      _searchTextLen = (attrs.Encoding === 'UTF-16')
        ? function(dv, offset) {
            var mark = offset;
            while (dv.getUint16(offset)) { offset += _bpu; /* scan for \u0000 */ }
            return offset - mark;
          }
        : function(dv, offset) {
            var mark = offset;
            while (dv.getUint8(offset++)) { /* scan for NUL */ }
            return offset - mark - 1;
          };

      _decoder = new TextDecoder(attrs.Encoding || 'UTF-16LE');

      _bpu = (attrs.Encoding === 'UTF-16') ? 2 : 1;

      if (parseInt(attrs.GeneratedByEngineVersion, 10) >= 2.0) {
        _v2 = true;
        _tail = _bpu;

        // HUGE dictionary files (>4G) are not supported, take only the lower 32-bit
        _readNum     = function(scanner) { return scanner.forward(4), scanner.readInt(); };
        _readShort   = function(scanner) { return scanner.readUint16(); };
        _checksum_v2 = function(scanner) { return scanner.checksum(); };
      } else {
        _tail = 0;
      }

      // keyword index encrypted?
      if (attrs.Encrypted & 0x02) {
        _decryptors[1] = decrypt;
      }

      var regexp = MCommon.REGEXP_STRIPKEY[ext];
      if (isTrue(attrs.KeyCaseSensitive)) {
        _adaptKey = isTrue(attrs.StripKey)
          ? function(key) { return key.replace(regexp, '$1'); }
          : function(key) { return key; };
      } else {
        _adaptKey = isTrue(attrs.StripKey || (_v2 ? '' : 'yes'))
          ? function(key) { return key.toLowerCase().replace(regexp, '$1'); }
          : function(key) { return key.toLowerCase(); };
      }
    }

    // Read data from the target ArrayBuffer at the current offset
    function Scanner(buf, len) {
      var offset = 0, dv = new DataView(buf);

      var methods = {
        // target data size in bytes
        size:       function() { return len || buf.byteLength; },
        // update offset to a new position
        forward:    function(len) { return offset += len; },
        // return current offset
        offset:     function() { return offset; },

        // The MDict file format stores numbers in big endian.
        // 32-bit unsigned int
        readInt:    function() { return conseq(dv.getUint32(offset, false), this.forward(4)); },
        readUint16: function() { return conseq(dv.getUint16(offset, false), this.forward(2)); },
        readUint8:  function() { return conseq(dv.getUint8(offset, false), this.forward(1)); },

        // Read a "short" number representing keyword text size, 8-bit for version < 2, 16-bit for version >= 2
        readShort:  function() { return _readShort(this); },

        // Read a number representing offset or data block size, 32-bit for version < 2, 64-bit for version >= 2 (only lower 32-bit used)
        readNum:    function() { return _readNum(this); },

        readUTF16:  function(len) { return conseq(UTF_16LE.decode(new Uint8Array(buf, offset, len)), this.forward(len)); },

        // Read data into a Uint8Array and decode it to text with the specified encoding.
        // Text length in bytes is determined by searching for the terminating NUL.
        // NOTE: After decoding the text, extra "tail" bytes have to be skipped according to the specified encoding.
        readText:   function() {
          var len = _searchTextLen(dv, offset);
          return conseq(_decoder.decode(new Uint8Array(buf, offset, len)), this.forward(len + _bpu));
        },
        // Read data into a Uint8Array and decode it to text with the specified encoding.
        // @param len length in basic units, multiplied by bytes per unit to get length in bytes
        // NOTE: After decoding the text, extra "tail" bytes have to be skipped according to the specified encoding.
        readTextSized: function(len) {
          len *= _bpu;
          return conseq(_decoder.decode(new Uint8Array(buf, offset, len)), this.forward(len + _tail));
        },

        // Skip checksum, just ignore it anyway.
        checksum:   function() { this.forward(4); },
        // Version >= 2.0 only
        checksum_v2: function() { return _checksum_v2(this); },

        // Read a data block of keyword index, key block or record content.
        // These data blocks may be compressed (gzip or lzo), and the keyword index may also be encrypted.
        // @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#compression (note: it contains a typo)
        readBlock:  function(len, expectedBufSize, decryptor) {
          var comp_type = dv.getUint8(offset, false);  // compression type, 0 = none, 1 = lzo, 2 = gzip
          if (comp_type === 0) {
            if (_v2) {
              this.forward(8);  // for version >= 2, skip comp_type (4 bytes with trailing \x00) and checksum (4 bytes)
            }
            return this;
          } else {
            // skip comp_type (4 bytes with trailing \x00) and checksum (4 bytes)
            offset += 8; len -= 8;
            var tmp = new Uint8Array(buf, offset, len);
            if (decryptor) {
              var passkey = new Uint8Array(8);
              passkey.set(new Uint8Array(buf, offset - 4, 4));  // key part 1: checksum
              passkey.set([0x95, 0x36, 0x00, 0x00], 4);         // key part 2: fixed data
              tmp = decryptor(tmp, passkey);
            }

            tmp = comp_type === 2 ? pako.inflate(tmp) : lzo.decompress(tmp, expectedBufSize, 1308672);
            this.forward(len);
            return Scanner(tmp.buffer, tmp.length);
          }
        },

        // Read raw data as a Uint8Array from the current offset with the specified length in bytes
        readRaw:    function(len) {
          return conseq(new Uint8Array(buf, offset, len), this.forward(len === UNDEFINED ? buf.byteLength - offset : len));
        },
      };

      return Object.create(methods);
    }

    /**
     * Read the first 4 bytes of an mdx/mdd file to get the length of header_str.
     * @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#file-structure
     * @param input sliced file (start = 0, length = 4)
     * @return length of header_str
     */
    function read_file_head(input) {
      return Scanner(input).readInt();
    }

    /**
     * Read the header section, parse dictionary attributes and config the scanner according to the engine version attribute.
     * @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#header-section
     * @param input sliced file (start = 4, length = len + 48), header string plus the following checksum and keyword summary
     * @param len length of header_str
     * @return [remaining length of the header section (header_str and checksum, = len + 4), original input]
     */
    function read_header_sect(input, len) {
      var scanner = Scanner(input),
          header_str = scanner.readUTF16(len).replace(/\0$/, '');  // need to remove the trailing NUL

      // parse dictionary attributes
      var xml = parseXml(header_str).querySelector('Dictionary, Library_Data').attributes;

      for (var i = 0, item; i < xml.length; i++) {
        item = xml.item(i);
        attrs[item.nodeName] = item.nodeValue;
      }

      attrs.Encrypted = parseInt(attrs.Encrypted, 10) || 0;

      MCommon.log('dictionary attributes: ', attrs);
      config();
      return spreadus(len + 4, input);
    }

    /**
     * Read the keyword summary at the beginning of the keyword section.
     * @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#keyword-section
     * @param input sliced file, same as the input passed to read_header_sect()
     * @param offset start position of the keyword section in the sliced file, equal to the length of the header string plus checksum
     * @return keyword_sect object
     */
    function read_keyword_summary(input, offset) {
      var scanner = Scanner(input);
      scanner.forward(offset);
      return {
        num_blocks:           scanner.readNum(),
        num_entries:          scanner.readNum(),
        key_index_decomp_len: _v2 && scanner.readNum(),  // Ver >= 2.0 only
        key_index_comp_len:   scanner.readNum(),
        key_blocks_len:       scanner.readNum(),
        chksum:               scanner.checksum_v2(),
        // extra field
        len:                  scanner.offset() - offset,  // actual length of the keyword summary, varying with the engine version attribute
      };
    }

    /**
     * Read the keyword index part of the keyword section.
     * @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#keyword-header-encryption
     * @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#keyword-index
     * @param input sliced file, the remaining part of the keyword section after the keyword summary, which can also be used to read the following key blocks
     * @param keyword_summary
     * @return [keyword_summary, array of keyword index]
     */
    function read_keyword_index(input, keyword_summary) {
      var scanner = Scanner(input).readBlock(keyword_summary.key_index_comp_len, keyword_summary.key_index_decomp_len, _decryptors[1]),
          keyword_index = Array(keyword_summary.num_blocks),
          offset = 0;

      for (var i = 0, size; i < keyword_summary.num_blocks; i++) {
        keyword_index[i] = {
          num_entries: conseq(scanner.readNum(), size = scanner.readShort()),
          // UNUSED, can be ignored
          //          first_size:  size = scanner.readShort(),
          first_word:  conseq(scanner.readTextSized(size), size = scanner.readShort()),
          // UNUSED, can be ignored
          //          last_size:   size = scanner.readShort(),
          last_word:   scanner.readTextSized(size),
          comp_size:   size = scanner.readNum(),
          decomp_size: scanner.readNum(),
          // extra fields
          offset:      offset,  // offset of the first byte of the target key block in the mdx/mdd file
          index:       i        // index of this key index, used to search the previous/next block
        };
        offset += size;
      }
      return spreadus(keyword_summary, keyword_index);
    }

    /**
     * Read keyword entries inside a keyword block and fill KEY_TABLE.
     * @param scanner scanner object to read key entries, positioned at the beginning of the target key block
     * @param kdx corresponding keyword index object
     * NOTE: no need to read keyword blocks any more, for debug only.
     */
    function read_key_block(scanner, kdx) {
      scanner = scanner.readBlock(kdx.comp_size, kdx.decomp_size);
      for (var i = 0; i < kdx.num_entries; i++) {
        // scanner.readNum(); scanner.readText();
        var kk = [scanner.readNum(), scanner.readText()];
        // console.log(scanner.readNum(), scanner.readText());
      }
    }

    /**
     * Delayed scan of the key table, for debug only.
     * @param slicedKeyBlock a promise which resolves to an ArrayBuffer containing the keyword blocks
     *                       sliced from the mdx/mdd file
     * @param num_entries number of keyword entries
     * @param keyword_index array of keyword index
     * @param delay time to delay before scanning the key table
     */
    function willScanKeyTable(slicedKeyBlock, num_entries, keyword_index, delay) {
      slicedKeyBlock.delay(delay).then(function (input) {
        MCommon.log('scan key table...');
        var scanner = Scanner(input);
        for (var i = 0, size = keyword_index.length; i < size; i++) {
          read_key_block(scanner, keyword_index[i]);
        }

        MCommon.log('KEY_TABLE loaded.');
      });
    }

    /**
     * Read the record summary at the beginning of the record section.
     * @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#record-section
     * @param input sliced file, start = beginning of the record section, length = 32 (max length of the record summary)
     * @param pos beginning of the record section
     * @return record summary object
     */
    function read_record_summary(input, pos) {
      var scanner = Scanner(input),
          record_summary = {
            num_blocks:  scanner.readNum(),
            num_entries: scanner.readNum(),
            index_len:   scanner.readNum(),
            blocks_len:  scanner.readNum(),
            // extra field
            len:         scanner.offset(),  // actual length of the record summary (excluding the record block index), varying with the engine version attribute
          };

      // start position of the record blocks, counted from the head of the mdx/mdd file
      record_summary.block_pos = pos + record_summary.index_len + record_summary.len;

      return record_summary;
    }

    /**
     * Read the record block index part of the record section, and fill RECORD_BLOCK_TABLE.
     * @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#record-section
     * @param input sliced file, start = beginning of the record block index, length = record_summary.index_len
     * @param record_summary record summary object
     */
    function read_record_block(input, record_summary) {
      var scanner = Scanner(input),
          size = record_summary.num_blocks,
          record_index = Array(size),
          p0 = record_summary.block_pos,
          p1 = 0;

      RECORD_BLOCK_TABLE.alloc(size + 1);
      for (var i = 0, rdx; i < size; i++) {
        record_index[i] = rdx = {
          comp_size:   scanner.readNum(),
          decomp_size: scanner.readNum()
        };
        RECORD_BLOCK_TABLE.put(p0, p1);

        p0 += rdx.comp_size;
        p1 += rdx.decomp_size;
      }
      RECORD_BLOCK_TABLE.put(p0, p1);
    }

    /**
     * Read the definition text for a given keyinfo object.
     * @param input record block sliced from the file
     * @param block record block info
     * @param keyinfo an object with the record's offset and optional size for the given keyword
     * @return definition text
     */
    function read_definition(input, block, keyinfo) {
      var scanner = Scanner(input).readBlock(block.comp_size, block.decomp_size);
      scanner.forward(keyinfo.offset - block.decomp_offset);
      return scanner.readText();
    }

    /**
     * Follow a link to find the actual definition of a keyword.
     * @param definition definition text, which may start with "@@@LINK=" to link to another keyword
     * @param lookup search function
     * @return resolved actual definition
     */
    function followLink(definition, lookup) {
      return (definition.substring(0, 8) !== '@@@LINK=')
                ? definition
                : lookup(definition.substring(8));
    }
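
    // For example (illustrative keyword only): followLink('@@@LINK=colour', LOOKUP.mdx) delegates
    // to LOOKUP.mdx('colour') and returns its promise, while followLink('<p>definition</p>', LOOKUP.mdx)
    // returns the definition text unchanged.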

    /**
     * Read content as raw data for a given keyinfo object.
     * @param input record block sliced from the file
     * @param block record block info
     * @param keyinfo an object with the record's offset and optional size for the given keyword
     * @return a Uint8Array containing the resource: image/audio/css/font etc.
     */
    function read_object(input, block, keyinfo) {
      if (input.byteLength > 0) {
        var scanner = Scanner(input).readBlock(block.comp_size, block.decomp_size);
        scanner.forward(keyinfo.offset - block.decomp_offset);
        return scanner.readRaw(keyinfo.size);
      } else {
        throw '* OUT OF FILE RANGE * ' + keyinfo + ' @offset=' + block.comp_offset;
      }
    }

    /**
     * Find the word definition for a given keyinfo object.
     * @param keyinfo an object with the record's offset and optional size for the given keyword
     * @return a promise which resolves to the definition text. A link to another keyword is followed to get the actual definition.
     */
    function findWord(keyinfo) {
      var block = RECORD_BLOCK_TABLE.find(keyinfo.offset);
      return _slice(block.comp_offset, block.comp_size)
              .exec(read_definition, block, keyinfo)
              .spread(function (definition) { return resolve(followLink(definition, LOOKUP.mdx)); });
    }

    /**
     * Find a resource (image, sound etc.) for a given keyinfo object.
     * @param keyinfo an object with the record's offset and optional size for the given keyword
     * @return a promise which resolves to the raw data of the resource: image/audio/css/font etc.
     * TODO: Follow links? Maybe it's too expensive and a rarely used feature.
     */
    function findResource(keyinfo) {
      var block = RECORD_BLOCK_TABLE.find(keyinfo.offset);
      return _slice(block.comp_offset, block.comp_size)
              .exec(read_object, block, keyinfo)
              .spread(function (blob) { return resolve(blob); });
    }

    //------------------------------------------------------------------------------------------------
    // Implementation of look-up
    //------------------------------------------------------------------------------------------------
    var slicedKeyBlock,
        _cached_keys,      // cache of the latest loaded keys
        _trail,            // stores the latest visited record block & position when searching for candidate keys
        mutual_ticket = 0; // a one-way increasing ticket used to cancel unfinished pattern matches

    /**
     * Reduce the keyword index array to the single element which contains, or is the nearest one matching, a given phrase.
     */
    function reduce(arr, phrase) {
      var len = arr.length;
      if (len > 1) {
        len = len >> 1;
        return phrase > _adaptKey(arr[len - 1].last_word)
                  ? reduce(arr.slice(len), phrase)
                  : reduce(arr.slice(0, len), phrase);
      } else {
        return arr[0];
      }
    }

    /**
     * Reduce a keyword list to the index of the element which contains, or is the nearest one matching, a given phrase.
     */
    function shrink(arr, phrase) {
      var len = arr.length, sub;
      if (len > 1) {
        len = len >> 1;
        var key = _adaptKey(arr[len]);
        if (phrase < key) {
          sub = arr.slice(0, len);
          sub.pos = arr.pos;
        } else {
          sub = arr.slice(len);
          sub.pos = (arr.pos || 0) + len;
        }
        return shrink(sub, phrase);
      } else {
        return (arr.pos || 0) + (phrase <= _adaptKey(arr[0]) ? 0 : 1);
      }
    }
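
    // Binary-search sketch with a hypothetical index: given KEY_INDEX blocks whose last_word values
    // are ['cab', 'dim', 'fog'], reduce(KEY_INDEX, 'dab') halves the array until one element is left
    // and returns the block ending with 'dim', i.e. the first block whose last_word is >= the phrase.
    // shrink() works the same way on a loaded key list, but returns the index of the nearest entry
    // instead of the element itself.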

    /**
     * Load keys for a keyword index object from the mdx/mdd file.
     * @param kdx keyword index object
     */
    function loadKeys(kdx) {
      if (_cached_keys && _cached_keys.pilot === kdx.first_word) {
        return resolve(_cached_keys.list);
      } else {
        return slicedKeyBlock.then(function(input) {
          var scanner = Scanner(input), list = Array(kdx.num_entries);
          scanner.forward(kdx.offset);
          scanner = scanner.readBlock(kdx.comp_size, kdx.decomp_size);

          for (var i = 0; i < kdx.num_entries; i++) {
            var offset = scanner.readNum();
            list[i] = new Object(scanner.readText());
            list[i].offset = offset;
            if (i > 0) {
              list[i - 1].size = offset - list[i - 1].offset;
            }
          }

          _cached_keys = {list: list, pilot: kdx.first_word};
          return list;
        });
      }
    }

    /**
     * Search for the first keyword matching a given phrase.
     */
    function seekVanguard(phrase) {
      phrase = _adaptKey(phrase);
      var kdx = reduce(KEY_INDEX, phrase);

      // look back for the first keyword block which may contain keywords matching the specified phrase
      if (phrase <= _adaptKey(kdx.last_word)) {
        var index = kdx.index - 1, prev;
        while (prev = KEY_INDEX[index]) {
          if (_adaptKey(prev.last_word) !== _adaptKey(kdx.last_word)) {
            break;
          }
          kdx = prev;
          index--;
        }
      }

      return loadKeys(kdx).then(function (list) {
        var idx = shrink(list, phrase);
        // look back for the position of the first matched keyword
        while (idx > 0) {
          if (_adaptKey(list[--idx]) !== _adaptKey(phrase)) {
            idx++;
            break;
          }
        }
        return [kdx, Math.min(idx, list.length - 1), list];
      });
    }

    // TODO: have to restrict max count to improve response
    /**
     * Append more entries to the word list according to a filter or an expected size.
     */
    function appendMore(word, list, nextKdx, expectedSize, filter, ticket) {
      if (ticket !== mutual_ticket) {
        throw 'force terminated';
      }

      if (filter) {
        if (_trail.count < expectedSize && nextKdx && nextKdx.first_word.substr(0, word.length) === word) {
          return loadKeys(nextKdx).delay(30).then(function(more) {
            MCommon.log(nextKdx);
            _trail.offset = 0;
            _trail.block = nextKdx.index;
            Array.prototype.push.apply(list, more.filter(filter, _trail));
            return appendMore(word, list, KEY_INDEX[nextKdx.index + 1], expectedSize, filter, ticket);
          });
        } else {
          if (list.length === 0) {
            _trail.exhausted = true;
          }
          return resolve(list);
        }
      } else {
        var shortage = expectedSize - list.length;
        if (shortage > 0 && nextKdx) {
          console.log('go next', nextKdx);
          _trail.block = nextKdx.index;
          return loadKeys(nextKdx).then(function(more) {
            _trail.offset = 0;
            _trail.pos = Math.min(shortage, more.length);
            Array.prototype.push.apply(list, more.slice(0, shortage));
            console.log('$$ ' + more[shortage - 1], shortage);
            return appendMore(word, list, KEY_INDEX[nextKdx.index + 1], expectedSize, filter, ticket);
          });
        } else {
          if (_trail.pos > expectedSize) {
            _trail.pos = expectedSize;
          }
          list = list.slice(0, expectedSize);
          _trail.count = list.length;
          _trail.total += _trail.count;
          return resolve(list);
        }
      }
    }

    function followUp() {
      var kdx = KEY_INDEX[_trail.block];
      return loadKeys(kdx).then(function (list) {
        return [kdx, Math.min(_trail.offset + _trail.pos, list.length - 1), list];
      });
    }

    function matchKeys(phrase, expectedSize, follow) {
      expectedSize = Math.max(expectedSize || 0, 10);

      var str = phrase.trim().toLowerCase(),
          m = /([^?*]+)[?*]+/.exec(str),
          word;
      if (m) {
        word = m[1];

        var wildcard = new RegExp('^' + str.replace(/([\.\\\+\[\^\]\$\(\)])/g, '\\$1').replace(/\*+/g, '.*').replace(/\?/g, '.') + '$'),
            tester = phrase[phrase.length - 1] === ' '
                      ? function(s) { return wildcard.test(s); }
                      : function(s) { return wildcard.test(s) && !/ /.test(s); },
            filter = function (s, i) {
              if (_trail.count < expectedSize && tester(s)) {
                _trail.count++;
                _trail.total++;
                _trail.pos = i + 1;
                return true;
              }
              return false;
            };
      } else {
        word = phrase.trim();
      }

      if (_trail && _trail.phrase !== phrase) {
        follow = false;
      }

      if (follow && _trail && _trail.exhausted) {
        return resolve([]);
      }

      var startFrom = follow && _trail ? followUp() : seekVanguard(word);

      return startFrom.spread(function(kdx, idx, list) {
        console.log('start ', kdx);
        list = list.slice(idx);
        _trail = {phrase: phrase,
                  block:  kdx.index,
                  offset: idx,
                  pos:    list.length,
                  count:  0,
                  total:  follow ? _trail && _trail.total || 0 : 0
                 };

        if (filter) {
          list = list.filter(filter, _trail);
        }

        return appendMore(word, list, KEY_INDEX[kdx.index + 1], expectedSize, filter, ++mutual_ticket)
                .then(function(result) {
                  if (_trail.block === KEY_INDEX.length - 1) {
                    if (_trail.offset + _trail.pos >= KEY_INDEX[_trail.block].num_entries) {
                      _trail.exhausted = true;
                      console.log('EXHAUSTED!!!!');
                    }
                  }
                  console.log('trail: ', _trail);
                  return result;
                });
      });
    }

    /**
     * Keep only the element in the list whose offset matches the given offset.
     */
    function matchOffset(list, offset) {
      return list.some(function(el) { return el.offset === offset ? list = [el] : false; }) ? list : [];
    }

    // Lookup functions
    var LOOKUP = {
      /**
       * @param query either a plain String for an exact word lookup,
       *              or a {phrase: .., max: .., follow: true} object for pattern matching
       */
      mdx: function(query) {
        if (typeof query === 'string' || query instanceof String) {
          _trail = null;
          var word = query.trim().toLowerCase(), offset = query.offset;

          return seekVanguard(word).spread(function(kdx, idx, list) {
            list = list.slice(idx);
            if (offset !== UNDEFINED) {
              list = matchOffset(list, offset);
            } else {
              list = list.filter(function(el) { return el.toLowerCase() === word; });
            }
            return harvest(list.map(findWord));
          });
        } else {
          return matchKeys(query.phrase, query.max, query.follow);
        }
      },

      // TODO: chain multiple mdd files
      mdd: function(phrase) {
        var word = phrase.trim().toLowerCase();
        word = '\\' + word.replace(/(^[/\\])|([/]$)/, '');
        word = word.replace(/\//g, '\\');
        return seekVanguard(word).spread(function(kdx, idx, list) {
          return list.slice(idx).filter(function(one) {
            return one.toLowerCase() === word;
          });
        }).then(function(candidates) {
          if (candidates.length === 0) {
            throw '*RESOURCE NOT FOUND* ' + phrase;
          } else {
            return findResource(candidates[0]);
          }
        });
      }
    };
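
    // Usage sketch (hypothetical keywords; the promise returned by parse_mdict() resolves to
    // LOOKUP.mdx or LOOKUP.mdd depending on the file extension):
    //   lookup('doughnut').then(function(definitions) { ... });   // exact match, array of definition texts
    //   lookup({phrase: 'dough*', max: 50, follow: true})         // wildcard search
    //     .then(function(candidates) { ... });                    // array of matching keywords
    //   lookup('/style.css').then(function(data) { ... });        // mdd: raw resource data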

    // ------------------------------------------
    // start to load the mdx/mdd file
    // ------------------------------------------
    MCommon.log('start to load ' + file.name);

    var pos = 0;

    // read the first 4 bytes to get the header length
    return _slice(pos, 4).exec(read_file_head).spread(function(len) {
      pos += 4;  // start of the header string in the header section
      return _slice(pos, len + 48)
               .exec(read_header_sect, len);

    }).spread(function(header_remain_len, input) {
      pos += header_remain_len;  // start of the keyword section
      return read_keyword_summary(input, header_remain_len);

    }).then(function(keyword_summary) { MCommon.log(keyword_summary);
      pos += keyword_summary.len;  // start of the key index in the keyword section
      return _slice(pos, keyword_summary.key_index_comp_len)
               .exec(read_keyword_index, keyword_summary);

    }).spread(function (keyword_summary, keyword_index) {
      pos += keyword_summary.key_index_comp_len;  // start of the key blocks in the keyword section
      slicedKeyBlock = _slice(pos, keyword_summary.key_blocks_len);

      /*
      // It is now fast enough to look up a word without the key table, scanning keywords from the relevant key blocks on demand.
      // No need to scan the whole key table ahead of time.
      willScanKeyTable(slicedKeyBlock, keyword_summary.num_entries, keyword_index, 00);
      // */

      pos += keyword_summary.key_blocks_len;  // start of the record section
      KEY_INDEX = keyword_index;

    }).then(function () {
      return _slice(pos, 32)
               .exec(read_record_summary, pos);

    }).spread(function (record_summary) { MCommon.log(record_summary);
      pos += record_summary.len;  // start of the record block index in the record section
      return _slice(pos, record_summary.index_len)
               .exec(read_record_block, record_summary);

    }).spread(function() { MCommon.log('-- parse done --', file.name);
      // resolve and return the lookup() function according to the file extension (mdx/mdd)
      LOOKUP[ext].description = attrs.Description;
      return resolve(LOOKUP[ext]);
    });
  }
  // -------------------------
  // END OF parse_mdict()
  // -------------------------

  /**
   * Load a set of files to be parsed as MDict dictionary & resource files (mdx/mdd).
   */
  return function load(files) {
    var resources = [];
    Array.prototype.forEach.call(files, function(f) {
      var ext = MCommon.getExtension(f.name, 'mdx');
      resources.push(resources[ext] = parse_mdict(f, ext));
    });

    return Promise.all(resources)
             .then(function() { return resolve(resources); });
  };

}());
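
// Usage sketch (hypothetical File objects, e.g. from an <input type="file"> element):
//   MParser([mdxFile, mddFile]).then(function(resources) {
//     // each entry is still a promise resolving to the lookup function for that file
//     resources.mdx.then(function(lookup) { return lookup('doughnut'); })
//                  .then(function(definitions) { /* render definitions */ });
//   });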