lib.cpp 56 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926
  1. #ifdef HAVE_CONFIG_H
  2. # include "config.h"
  3. #endif
  4. #include <algorithm>
  5. #include <cstring>
  6. #include <cctype>
  7. #include <sys/stat.h>
  8. #include <zlib.h>
  9. #include <glib/gstdio.h>
  10. #include "distance.h"
  11. #include "file.hpp"
  12. #include "mapfile.hpp"
  13. #include "lib.h"
  14. // Notice: read src/tools/DICTFILE_FORMAT for the dictionary
  15. // file's format information!
  16. static inline bool bIsVowel(gchar inputchar)
  17. {
  18. gchar ch = g_ascii_toupper(inputchar);
  19. return ( ch == 'A' || ch == 'E' || ch == 'I' || ch == 'O' || ch == 'U' );
  20. }
  21. static bool bIsPureEnglish(const gchar *str)
  22. {
  23. // i think this should work even when it is UTF8 string :).
  24. for (int i = 0; str[i] != 0; i++)
  25. //if(str[i]<0)
  26. //if(str[i]<32 || str[i]>126) // tab equal 9,so this is not OK.
  27. // Better use isascii() but not str[i]<0 while char is default unsigned in arm
  28. if (!isascii(str[i]))
  29. return false;
  30. return true;
  31. }
  32. static inline gint stardict_strcmp(const gchar *s1, const gchar *s2)
  33. {
  34. gint a = g_ascii_strcasecmp(s1, s2);
  35. if (a == 0)
  36. return strcmp(s1, s2);
  37. else
  38. return a;
  39. }
  40. bool DictInfo::load_from_ifo_file(const std::string& ifofilename,
  41. bool istreedict)
  42. {
  43. ifo_file_name = ifofilename;
  44. gchar *buffer;
  45. if (!g_file_get_contents(ifofilename.c_str(), &buffer, NULL, NULL))
  46. return false;
  47. #define TREEDICT_MAGIC_DATA "StarDict's treedict ifo file\nversion=2.4.2\n"
  48. #define DICT_MAGIC_DATA "StarDict's dict ifo file\nversion=2.4.2\n"
  49. const gchar *magic_data = istreedict ? TREEDICT_MAGIC_DATA : DICT_MAGIC_DATA;
  50. if (!g_str_has_prefix(buffer, magic_data))
  51. {
  52. g_free(buffer);
  53. return false;
  54. }
  55. gchar *p1, *p2, *p3;
  56. p1 = buffer + strlen(magic_data) - 1;
  57. p2 = strstr(p1, "\nwordcount=");
  58. if (!p2)
  59. {
  60. g_free(buffer);
  61. return false;
  62. }
  63. p3 = strchr(p2 + sizeof("\nwordcount=") - 1, '\n');
  64. gchar *tmpstr = (gchar *)g_memdup(p2 + sizeof("\nwordcount=") - 1, p3 - (p2 + sizeof("\nwordcount=") - 1) + 1);
  65. tmpstr[p3 - (p2 + sizeof("\nwordcount=") - 1)] = '\0';
  66. wordcount = atol(tmpstr);
  67. g_free(tmpstr);
  68. if (istreedict)
  69. {
  70. p2 = strstr(p1, "\ntdxfilesize=");
  71. if (!p2)
  72. {
  73. g_free(buffer);
  74. return false;
  75. }
  76. p3 = strchr(p2 + sizeof("\ntdxfilesize=") - 1, '\n');
  77. tmpstr = (gchar *)g_memdup(p2 + sizeof("\ntdxfilesize=") - 1, p3 - (p2 + sizeof("\ntdxfilesize=") - 1) + 1);
  78. tmpstr[p3 - (p2 + sizeof("\ntdxfilesize=") - 1)] = '\0';
  79. index_file_size = atol(tmpstr);
  80. g_free(tmpstr);
  81. }
  82. else
  83. {
  84. p2 = strstr(p1, "\nidxfilesize=");
  85. if (!p2)
  86. {
  87. g_free(buffer);
  88. return false;
  89. }
  90. p3 = strchr(p2 + sizeof("\nidxfilesize=") - 1, '\n');
  91. tmpstr = (gchar *)g_memdup(p2 + sizeof("\nidxfilesize=") - 1, p3 - (p2 + sizeof("\nidxfilesize=") - 1) + 1);
  92. tmpstr[p3 - (p2 + sizeof("\nidxfilesize=") - 1)] = '\0';
  93. index_file_size = atol(tmpstr);
  94. g_free(tmpstr);
  95. }
  96. p2 = strstr(p1, "\nbookname=");
  97. if (!p2)
  98. {
  99. g_free(buffer);
  100. return false;
  101. }
  102. p2 = p2 + sizeof("\nbookname=") - 1;
  103. p3 = strchr(p2, '\n');
  104. bookname.assign(p2, p3 - p2);
  105. p2 = strstr(p1, "\nauthor=");
  106. if (p2)
  107. {
  108. p2 = p2 + sizeof("\nauthor=") - 1;
  109. p3 = strchr(p2, '\n');
  110. author.assign(p2, p3 - p2);
  111. }
  112. p2 = strstr(p1, "\nemail=");
  113. if (p2)
  114. {
  115. p2 = p2 + sizeof("\nemail=") - 1;
  116. p3 = strchr(p2, '\n');
  117. email.assign(p2, p3 - p2);
  118. }
  119. p2 = strstr(p1, "\nwebsite=");
  120. if (p2)
  121. {
  122. p2 = p2 + sizeof("\nwebsite=") - 1;
  123. p3 = strchr(p2, '\n');
  124. website.assign(p2, p3 - p2);
  125. }
  126. p2 = strstr(p1, "\ndate=");
  127. if (p2)
  128. {
  129. p2 = p2 + sizeof("\ndate=") - 1;
  130. p3 = strchr(p2, '\n');
  131. date.assign(p2, p3 - p2);
  132. }
  133. p2 = strstr(p1, "\ndescription=");
  134. if (p2)
  135. {
  136. p2 = p2 + sizeof("\ndescription=") - 1;
  137. p3 = strchr(p2, '\n');
  138. description.assign(p2, p3 - p2);
  139. }
  140. p2 = strstr(p1, "\nsametypesequence=");
  141. if (p2)
  142. {
  143. p2 += sizeof("\nsametypesequence=") - 1;
  144. p3 = strchr(p2, '\n');
  145. sametypesequence.assign(p2, p3 - p2);
  146. }
  147. g_free(buffer);
  148. return true;
  149. }
  150. //===================================================================
  151. DictBase::DictBase()
  152. {
  153. dictfile = NULL;
  154. cache_cur = 0;
  155. }
  156. DictBase::~DictBase()
  157. {
  158. if (dictfile)
  159. fclose(dictfile);
  160. }
  161. gchar* DictBase::GetWordData(guint32 idxitem_offset, guint32 idxitem_size)
  162. {
  163. for (int i = 0; i < WORDDATA_CACHE_NUM; i++)
  164. if (cache[i].data && cache[i].offset == idxitem_offset)
  165. return cache[i].data;
  166. if (dictfile)
  167. fseek(dictfile, idxitem_offset, SEEK_SET);
  168. gchar *data;
  169. if (!sametypesequence.empty())
  170. {
  171. gchar *origin_data = (gchar *)g_malloc(idxitem_size);
  172. if (dictfile)
  173. fread(origin_data, idxitem_size, 1, dictfile);
  174. else
  175. dictdzfile->read(origin_data, idxitem_offset, idxitem_size);
  176. guint32 data_size;
  177. gint sametypesequence_len = sametypesequence.length();
  178. //there have sametypesequence_len char being omitted.
  179. data_size = idxitem_size + sizeof(guint32) + sametypesequence_len;
  180. //if the last item's size is determined by the end up '\0',then +=sizeof(gchar);
  181. //if the last item's size is determined by the head guint32 type data,then +=sizeof(guint32);
  182. switch (sametypesequence[sametypesequence_len - 1])
  183. {
  184. case 'm':
  185. case 't':
  186. case 'y':
  187. case 'l':
  188. case 'g':
  189. case 'x':
  190. data_size += sizeof(gchar);
  191. break;
  192. case 'W':
  193. case 'P':
  194. data_size += sizeof(guint32);
  195. break;
  196. default:
  197. if (g_ascii_isupper(sametypesequence[sametypesequence_len - 1]))
  198. data_size += sizeof(guint32);
  199. else
  200. data_size += sizeof(gchar);
  201. break;
  202. }
  203. data = (gchar *)g_malloc(data_size);
  204. gchar *p1, *p2;
  205. p1 = data + sizeof(guint32);
  206. p2 = origin_data;
  207. guint32 sec_size;
  208. //copy the head items.
  209. for (int i = 0; i < sametypesequence_len - 1; i++)
  210. {
  211. *p1 = sametypesequence[i];
  212. p1 += sizeof(gchar);
  213. switch (sametypesequence[i])
  214. {
  215. case 'm':
  216. case 't':
  217. case 'y':
  218. case 'l':
  219. case 'g':
  220. case 'x':
  221. sec_size = strlen(p2) + 1;
  222. memcpy(p1, p2, sec_size);
  223. p1 += sec_size;
  224. p2 += sec_size;
  225. break;
  226. case 'W':
  227. case 'P':
  228. sec_size = *reinterpret_cast<guint32 *>(p2);
  229. sec_size += sizeof(guint32);
  230. memcpy(p1, p2, sec_size);
  231. p1 += sec_size;
  232. p2 += sec_size;
  233. break;
  234. default:
  235. if (g_ascii_isupper(sametypesequence[i]))
  236. {
  237. sec_size = *reinterpret_cast<guint32 *>(p2);
  238. sec_size += sizeof(guint32);
  239. }
  240. else
  241. {
  242. sec_size = strlen(p2) + 1;
  243. }
  244. memcpy(p1, p2, sec_size);
  245. p1 += sec_size;
  246. p2 += sec_size;
  247. break;
  248. }
  249. }
  250. //calculate the last item 's size.
  251. sec_size = idxitem_size - (p2 - origin_data);
  252. *p1 = sametypesequence[sametypesequence_len - 1];
  253. p1 += sizeof(gchar);
  254. switch (sametypesequence[sametypesequence_len - 1])
  255. {
  256. case 'm':
  257. case 't':
  258. case 'y':
  259. case 'l':
  260. case 'g':
  261. case 'x':
  262. memcpy(p1, p2, sec_size);
  263. p1 += sec_size;
  264. *p1 = '\0'; //add the end up '\0';
  265. break;
  266. case 'W':
  267. case 'P':
  268. *reinterpret_cast<guint32 *>(p1) = sec_size;
  269. p1 += sizeof(guint32);
  270. memcpy(p1, p2, sec_size);
  271. break;
  272. default:
  273. if (g_ascii_isupper(sametypesequence[sametypesequence_len - 1]))
  274. {
  275. *reinterpret_cast<guint32 *>(p1) = sec_size;
  276. p1 += sizeof(guint32);
  277. memcpy(p1, p2, sec_size);
  278. }
  279. else
  280. {
  281. memcpy(p1, p2, sec_size);
  282. p1 += sec_size;
  283. *p1 = '\0';
  284. }
  285. break;
  286. }
  287. g_free(origin_data);
  288. *reinterpret_cast<guint32 *>(data) = data_size;
  289. }
  290. else
  291. {
  292. data = (gchar *)g_malloc(idxitem_size + sizeof(guint32));
  293. if (dictfile)
  294. fread(data + sizeof(guint32), idxitem_size, 1, dictfile);
  295. else
  296. dictdzfile->read(data + sizeof(guint32), idxitem_offset, idxitem_size);
  297. *reinterpret_cast<guint32 *>(data) = idxitem_size + sizeof(guint32);
  298. }
  299. g_free(cache[cache_cur].data);
  300. cache[cache_cur].data = data;
  301. cache[cache_cur].offset = idxitem_offset;
  302. cache_cur++;
  303. if (cache_cur == WORDDATA_CACHE_NUM)
  304. cache_cur = 0;
  305. return data;
  306. }
  307. inline bool DictBase::containSearchData()
  308. {
  309. if (sametypesequence.empty())
  310. return true;
  311. return sametypesequence.find_first_of("mlgxty") != std::string::npos;
  312. }
  313. bool DictBase::SearchData(std::vector<std::string> &SearchWords, guint32 idxitem_offset, guint32 idxitem_size, gchar *origin_data)
  314. {
  315. int nWord = SearchWords.size();
  316. std::vector<bool> WordFind(nWord, false);
  317. int nfound = 0;
  318. if (dictfile)
  319. fseek(dictfile, idxitem_offset, SEEK_SET);
  320. if (dictfile)
  321. fread(origin_data, idxitem_size, 1, dictfile);
  322. else
  323. dictdzfile->read(origin_data, idxitem_offset, idxitem_size);
  324. gchar *p = origin_data;
  325. guint32 sec_size;
  326. int j;
  327. if (!sametypesequence.empty())
  328. {
  329. gint sametypesequence_len = sametypesequence.length();
  330. for (int i = 0; i < sametypesequence_len - 1; i++)
  331. {
  332. switch (sametypesequence[i])
  333. {
  334. case 'm':
  335. case 't':
  336. case 'y':
  337. case 'l':
  338. case 'g':
  339. case 'x':
  340. for (j = 0; j < nWord; j++)
  341. if (!WordFind[j] && strstr(p, SearchWords[j].c_str()))
  342. {
  343. WordFind[j] = true;
  344. ++nfound;
  345. }
  346. if (nfound == nWord)
  347. return true;
  348. sec_size = strlen(p) + 1;
  349. p += sec_size;
  350. break;
  351. default:
  352. if (g_ascii_isupper(sametypesequence[i]))
  353. {
  354. sec_size = *reinterpret_cast<guint32 *>(p);
  355. sec_size += sizeof(guint32);
  356. }
  357. else
  358. {
  359. sec_size = strlen(p) + 1;
  360. }
  361. p += sec_size;
  362. }
  363. }
  364. switch (sametypesequence[sametypesequence_len - 1])
  365. {
  366. case 'm':
  367. case 't':
  368. case 'y':
  369. case 'l':
  370. case 'g':
  371. case 'x':
  372. sec_size = idxitem_size - (p - origin_data);
  373. for (j = 0; j < nWord; j++)
  374. if (!WordFind[j] &&
  375. g_strstr_len(p, sec_size, SearchWords[j].c_str()))
  376. {
  377. WordFind[j] = true;
  378. ++nfound;
  379. }
  380. if (nfound == nWord)
  381. return true;
  382. break;
  383. }
  384. }
  385. else
  386. {
  387. while (guint32(p - origin_data) < idxitem_size)
  388. {
  389. switch (*p)
  390. {
  391. case 'm':
  392. case 't':
  393. case 'y':
  394. case 'l':
  395. case 'g':
  396. case 'x':
  397. for (j = 0; j < nWord; j++)
  398. if (!WordFind[j] && strstr(p, SearchWords[j].c_str()))
  399. {
  400. WordFind[j] = true;
  401. ++nfound;
  402. }
  403. if (nfound == nWord)
  404. return true;
  405. sec_size = strlen(p) + 1;
  406. p += sec_size;
  407. break;
  408. default:
  409. if (g_ascii_isupper(*p))
  410. {
  411. sec_size = *reinterpret_cast<guint32 *>(p);
  412. sec_size += sizeof(guint32);
  413. }
  414. else
  415. {
  416. sec_size = strlen(p) + 1;
  417. }
  418. p += sec_size;
  419. }
  420. }
  421. }
  422. return false;
  423. }
  424. class offset_index : public index_file
  425. {
  426. public:
  427. offset_index() : idxfile(NULL)
  428. {}
  429. ~offset_index();
  430. bool load(const std::string& url, gulong wc, gulong fsize);
  431. const gchar *get_key(glong idx);
  432. void get_data(glong idx);
  433. const gchar *get_key_and_data(glong idx);
  434. bool lookup(const char *str, glong &idx);
  435. private:
  436. static const gint ENTR_PER_PAGE = 32;
  437. static const char *CACHE_MAGIC;
  438. std::vector<guint32> wordoffset;
  439. FILE *idxfile;
  440. gulong wordcount;
  441. gchar wordentry_buf[256 + sizeof(guint32)*2]; // The length of "word_str" should be less than 256. See src/tools/DICTFILE_FORMAT.
  442. struct index_entry
  443. {
  444. glong idx;
  445. std::string keystr;
  446. void assign(glong i, const std::string& str)
  447. {
  448. idx = i;
  449. keystr.assign(str);
  450. }
  451. };
  452. index_entry first, last, middle, real_last;
  453. struct page_entry
  454. {
  455. gchar *keystr;
  456. guint32 off, size;
  457. };
  458. std::vector<gchar> page_data;
  459. struct page_t
  460. {
  461. glong idx;
  462. page_entry entries[ENTR_PER_PAGE];
  463. page_t(): idx( -1)
  464. {}
  465. void fill(gchar *data, gint nent, glong idx_);
  466. }
  467. page;
  468. gulong load_page(glong page_idx);
  469. const gchar *read_first_on_page_key(glong page_idx);
  470. const gchar *get_first_on_page_key(glong page_idx);
  471. bool load_cache(const std::string& url);
  472. bool save_cache(const std::string& url);
  473. static strlist_t get_cache_variant(const std::string& url);
  474. };
  475. const char *offset_index::CACHE_MAGIC = "StarDict's Cache, Version: 0.1";
  476. class wordlist_index : public index_file
  477. {
  478. public:
  479. wordlist_index() : idxdatabuf(NULL)
  480. {}
  481. ~wordlist_index();
  482. bool load(const std::string& url, gulong wc, gulong fsize);
  483. const gchar *get_key(glong idx);
  484. void get_data(glong idx);
  485. const gchar *get_key_and_data(glong idx);
  486. bool lookup(const char *str, glong &idx);
  487. private:
  488. gchar *idxdatabuf;
  489. std::vector<gchar *> wordlist;
  490. };
  491. void offset_index::page_t::fill(gchar *data, gint nent, glong idx_)
  492. {
  493. idx = idx_;
  494. gchar *p = data;
  495. glong len;
  496. for (gint i = 0; i < nent; ++i)
  497. {
  498. entries[i].keystr = p;
  499. len = strlen(p);
  500. p += len + 1;
  501. entries[i].off = g_ntohl(*reinterpret_cast<guint32 *>(p));
  502. p += sizeof(guint32);
  503. entries[i].size = g_ntohl(*reinterpret_cast<guint32 *>(p));
  504. p += sizeof(guint32);
  505. }
  506. }
  507. offset_index::~offset_index()
  508. {
  509. if (idxfile)
  510. fclose(idxfile);
  511. }
  512. inline const gchar *offset_index::read_first_on_page_key(glong page_idx)
  513. {
  514. fseek(idxfile, wordoffset[page_idx], SEEK_SET);
  515. guint page_size = wordoffset[page_idx + 1] - wordoffset[page_idx];
  516. fread(wordentry_buf, std::min<guint>(sizeof(wordentry_buf), page_size), 1, idxfile); //TODO: check returned values, deal with word entry that strlen>255.
  517. return wordentry_buf;
  518. }
  519. inline const gchar *offset_index::get_first_on_page_key(glong page_idx)
  520. {
  521. if (page_idx < middle.idx)
  522. {
  523. if (page_idx == first.idx)
  524. return first.keystr.c_str();
  525. return read_first_on_page_key(page_idx);
  526. }
  527. else if (page_idx > middle.idx)
  528. {
  529. if (page_idx == last.idx)
  530. return last.keystr.c_str();
  531. return read_first_on_page_key(page_idx);
  532. }
  533. else
  534. return middle.keystr.c_str();
  535. }
  536. bool offset_index::load_cache(const std::string& url)
  537. {
  538. strlist_t vars = get_cache_variant(url);
  539. for (strlist_t::const_iterator it = vars.begin(); it != vars.end(); ++it)
  540. {
  541. struct stat idxstat, cachestat;
  542. if (g_stat(url.c_str(), &idxstat) != 0 ||
  543. g_stat(it->c_str(), &cachestat) != 0)
  544. continue;
  545. if (cachestat.st_mtime < idxstat.st_mtime)
  546. continue;
  547. MapFile mf;
  548. if (!mf.open(it->c_str(), cachestat.st_size))
  549. continue;
  550. if (strncmp(mf.begin(), CACHE_MAGIC, strlen(CACHE_MAGIC)) != 0)
  551. continue;
  552. memcpy(&wordoffset[0], mf.begin() + strlen(CACHE_MAGIC), wordoffset.size()*sizeof(wordoffset[0]));
  553. return true;
  554. }
  555. return false;
  556. }
  557. strlist_t offset_index::get_cache_variant(const std::string& url)
  558. {
  559. strlist_t res;
  560. res.push_back(url + ".oft");
  561. if (!g_file_test(g_get_user_cache_dir(), G_FILE_TEST_EXISTS) &&
  562. g_mkdir(g_get_user_cache_dir(), 0700) == -1)
  563. return res;
  564. std::string cache_dir = std::string(g_get_user_cache_dir()) + G_DIR_SEPARATOR_S + "sdcv";
  565. if (!g_file_test(cache_dir.c_str(), G_FILE_TEST_EXISTS))
  566. {
  567. if (g_mkdir(cache_dir.c_str(), 0700) == -1)
  568. return res;
  569. }
  570. else if (!g_file_test(cache_dir.c_str(), G_FILE_TEST_IS_DIR))
  571. return res;
  572. gchar *base = g_path_get_basename(url.c_str());
  573. res.push_back(cache_dir + G_DIR_SEPARATOR_S + base + ".oft");
  574. g_free(base);
  575. return res;
  576. }
  577. bool offset_index::save_cache(const std::string& url)
  578. {
  579. strlist_t vars = get_cache_variant(url);
  580. for (strlist_t::const_iterator it = vars.begin(); it != vars.end(); ++it)
  581. {
  582. FILE *out = fopen(it->c_str(), "wb");
  583. if (!out)
  584. continue;
  585. if (fwrite(CACHE_MAGIC, 1, strlen(CACHE_MAGIC), out) != strlen(CACHE_MAGIC))
  586. continue;
  587. if (fwrite(&wordoffset[0], sizeof(wordoffset[0]), wordoffset.size(), out) != wordoffset.size())
  588. continue;
  589. fclose(out);
  590. printf("save to cache %s\n", url.c_str());
  591. return true;
  592. }
  593. return false;
  594. }
  595. bool offset_index::load(const std::string& url, gulong wc, gulong fsize)
  596. {
  597. wordcount = wc;
  598. gulong npages = (wc - 1) / ENTR_PER_PAGE + 2;
  599. wordoffset.resize(npages);
  600. if (!load_cache(url))
  601. { //map file will close after finish of block
  602. MapFile map_file;
  603. if (!map_file.open(url.c_str(), fsize))
  604. return false;
  605. const gchar *idxdatabuffer = map_file.begin();
  606. const gchar *p1 = idxdatabuffer;
  607. gulong index_size;
  608. guint32 j = 0;
  609. for (guint32 i = 0; i < wc; i++)
  610. {
  611. index_size = strlen(p1) + 1 + 2 * sizeof(guint32);
  612. if (i % ENTR_PER_PAGE == 0)
  613. {
  614. wordoffset[j] = p1 - idxdatabuffer;
  615. ++j;
  616. }
  617. p1 += index_size;
  618. }
  619. wordoffset[j] = p1 - idxdatabuffer;
  620. if (!save_cache(url))
  621. fprintf(stderr, "cache update failed\n");
  622. }
  623. if (!(idxfile = fopen(url.c_str(), "rb")))
  624. {
  625. wordoffset.resize(0);
  626. return false;
  627. }
  628. first.assign(0, read_first_on_page_key(0));
  629. last.assign(wordoffset.size() - 2, read_first_on_page_key(wordoffset.size() - 2));
  630. middle.assign((wordoffset.size() - 2) / 2, read_first_on_page_key((wordoffset.size() - 2) / 2));
  631. real_last.assign(wc - 1, get_key(wc - 1));
  632. return true;
  633. }
  634. inline gulong offset_index::load_page(glong page_idx)
  635. {
  636. gulong nentr = ENTR_PER_PAGE;
  637. if (page_idx == glong(wordoffset.size() - 2))
  638. if ((nentr = wordcount % ENTR_PER_PAGE) == 0)
  639. nentr = ENTR_PER_PAGE;
  640. if (page_idx != page.idx)
  641. {
  642. page_data.resize(wordoffset[page_idx + 1] - wordoffset[page_idx]);
  643. fseek(idxfile, wordoffset[page_idx], SEEK_SET);
  644. fread(&page_data[0], 1, page_data.size(), idxfile);
  645. page.fill(&page_data[0], nentr, page_idx);
  646. }
  647. return nentr;
  648. }
  649. const gchar *offset_index::get_key(glong idx)
  650. {
  651. load_page(idx / ENTR_PER_PAGE);
  652. glong idx_in_page = idx % ENTR_PER_PAGE;
  653. wordentry_offset = page.entries[idx_in_page].off;
  654. wordentry_size = page.entries[idx_in_page].size;
  655. return page.entries[idx_in_page].keystr;
  656. }
  657. void offset_index::get_data(glong idx)
  658. {
  659. get_key(idx);
  660. }
  661. const gchar *offset_index::get_key_and_data(glong idx)
  662. {
  663. return get_key(idx);
  664. }
  665. bool offset_index::lookup(const char *str, glong &idx)
  666. {
  667. bool bFound = false;
  668. glong iFrom;
  669. glong iTo = wordoffset.size() - 2;
  670. gint cmpint;
  671. glong iThisIndex;
  672. if (stardict_strcmp(str, first.keystr.c_str()) < 0)
  673. {
  674. idx = 0;
  675. return false;
  676. }
  677. else if (stardict_strcmp(str, real_last.keystr.c_str()) > 0)
  678. {
  679. idx = INVALID_INDEX;
  680. return false;
  681. }
  682. else
  683. {
  684. iFrom = 0;
  685. iThisIndex = 0;
  686. while (iFrom <= iTo)
  687. {
  688. iThisIndex = (iFrom + iTo) / 2;
  689. cmpint = stardict_strcmp(str, get_first_on_page_key(iThisIndex));
  690. if (cmpint > 0)
  691. iFrom = iThisIndex + 1;
  692. else if (cmpint < 0)
  693. iTo = iThisIndex - 1;
  694. else
  695. {
  696. bFound = true;
  697. break;
  698. }
  699. }
  700. if (!bFound)
  701. idx = iTo; //prev
  702. else
  703. idx = iThisIndex;
  704. }
  705. if (!bFound)
  706. {
  707. gulong netr = load_page(idx);
  708. iFrom = 1; // Needn't search the first word anymore.
  709. iTo = netr - 1;
  710. iThisIndex = 0;
  711. while (iFrom <= iTo)
  712. {
  713. iThisIndex = (iFrom + iTo) / 2;
  714. cmpint = stardict_strcmp(str, page.entries[iThisIndex].keystr);
  715. if (cmpint > 0)
  716. iFrom = iThisIndex + 1;
  717. else if (cmpint < 0)
  718. iTo = iThisIndex - 1;
  719. else
  720. {
  721. bFound = true;
  722. break;
  723. }
  724. }
  725. idx *= ENTR_PER_PAGE;
  726. if (!bFound)
  727. idx += iFrom; //next
  728. else
  729. idx += iThisIndex;
  730. }
  731. else
  732. {
  733. idx *= ENTR_PER_PAGE;
  734. }
  735. return bFound;
  736. }
  737. wordlist_index::~wordlist_index()
  738. {
  739. g_free(idxdatabuf);
  740. }
  741. bool wordlist_index::load(const std::string& url, gulong wc, gulong fsize)
  742. {
  743. gzFile in = gzopen(url.c_str(), "rb");
  744. if (in == NULL)
  745. return false;
  746. idxdatabuf = (gchar *)g_malloc(fsize);
  747. gulong len = gzread(in, idxdatabuf, fsize);
  748. gzclose(in);
  749. if (len != fsize)
  750. return false;
  751. wordlist.resize(wc + 1);
  752. gchar *p1 = idxdatabuf;
  753. guint32 i;
  754. for (i = 0; i < wc; i++)
  755. {
  756. wordlist[i] = p1;
  757. p1 += strlen(p1) + 1 + 2 * sizeof(guint32);
  758. }
  759. wordlist[wc] = p1;
  760. return true;
  761. }
  762. const gchar *wordlist_index::get_key(glong idx)
  763. {
  764. return wordlist[idx];
  765. }
  766. void wordlist_index::get_data(glong idx)
  767. {
  768. gchar *p1 = wordlist[idx] + strlen(wordlist[idx]) + sizeof(gchar);
  769. wordentry_offset = g_ntohl(*reinterpret_cast<guint32 *>(p1));
  770. p1 += sizeof(guint32);
  771. wordentry_size = g_ntohl(*reinterpret_cast<guint32 *>(p1));
  772. }
  773. const gchar *wordlist_index::get_key_and_data(glong idx)
  774. {
  775. get_data(idx);
  776. return get_key(idx);
  777. }
  778. bool wordlist_index::lookup(const char *str, glong &idx)
  779. {
  780. bool bFound = false;
  781. glong iTo = wordlist.size() - 2;
  782. if (stardict_strcmp(str, get_key(0)) < 0)
  783. {
  784. idx = 0;
  785. }
  786. else if (stardict_strcmp(str, get_key(iTo)) > 0)
  787. {
  788. idx = INVALID_INDEX;
  789. }
  790. else
  791. {
  792. glong iThisIndex = 0;
  793. glong iFrom = 0;
  794. gint cmpint;
  795. while (iFrom <= iTo)
  796. {
  797. iThisIndex = (iFrom + iTo) / 2;
  798. cmpint = stardict_strcmp(str, get_key(iThisIndex));
  799. if (cmpint > 0)
  800. iFrom = iThisIndex + 1;
  801. else if (cmpint < 0)
  802. iTo = iThisIndex - 1;
  803. else
  804. {
  805. bFound = true;
  806. break;
  807. }
  808. }
  809. if (!bFound)
  810. idx = iFrom; //next
  811. else
  812. idx = iThisIndex;
  813. }
  814. return bFound;
  815. }
  816. //===================================================================
  817. bool Dict::load(const std::string& ifofilename)
  818. {
  819. gulong idxfilesize;
  820. if (!load_ifofile(ifofilename, idxfilesize))
  821. return false;
  822. std::string fullfilename(ifofilename);
  823. fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "dict.dz");
  824. if (g_file_test(fullfilename.c_str(), G_FILE_TEST_EXISTS))
  825. {
  826. dictdzfile.reset(new dictData);
  827. if (!dictdzfile->open(fullfilename, 0))
  828. {
  829. //g_print("open file %s failed!\n",fullfilename);
  830. return false;
  831. }
  832. }
  833. else
  834. {
  835. fullfilename.erase(fullfilename.length() - sizeof(".dz") + 1, sizeof(".dz") - 1);
  836. dictfile = fopen(fullfilename.c_str(), "rb");
  837. if (!dictfile)
  838. {
  839. //g_print("open file %s failed!\n",fullfilename);
  840. return false;
  841. }
  842. }
  843. fullfilename = ifofilename;
  844. fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "idx.gz");
  845. if (g_file_test(fullfilename.c_str(), G_FILE_TEST_EXISTS))
  846. {
  847. idx_file.reset(new wordlist_index);
  848. }
  849. else
  850. {
  851. fullfilename.erase(fullfilename.length() - sizeof(".gz") + 1, sizeof(".gz") - 1);
  852. idx_file.reset(new offset_index);
  853. }
  854. if (!idx_file->load(fullfilename, wordcount, idxfilesize))
  855. return false;
  856. //g_print("bookname: %s , wordcount %lu\n", bookname.c_str(), narticles());
  857. return true;
  858. }
  859. bool Dict::load_ifofile(const std::string& ifofilename, gulong &idxfilesize)
  860. {
  861. DictInfo dict_info;
  862. if (!dict_info.load_from_ifo_file(ifofilename, false))
  863. return false;
  864. if (dict_info.wordcount == 0)
  865. return false;
  866. ifo_file_name = dict_info.ifo_file_name;
  867. wordcount = dict_info.wordcount;
  868. bookname = dict_info.bookname;
  869. idxfilesize = dict_info.index_file_size;
  870. sametypesequence = dict_info.sametypesequence;
  871. return true;
  872. }
  873. bool Dict::LookupWithRule(GPatternSpec *pspec, glong *aIndex, int iBuffLen)
  874. {
  875. int iIndexCount = 0;
  876. for (guint32 i = 0; i < narticles() && iIndexCount < iBuffLen - 1; i++)
  877. if (g_pattern_match_string(pspec, get_key(i)))
  878. aIndex[iIndexCount++] = i;
  879. aIndex[iIndexCount] = -1; // -1 is the end.
  880. return (iIndexCount > 0);
  881. }
  882. //===================================================================
  883. Libs::Libs(progress_func_t f)
  884. {
  885. progress_func = f;
  886. iMaxFuzzyDistance = MAX_FUZZY_DISTANCE; //need to read from cfg.
  887. }
  888. Libs::~Libs()
  889. {
  890. for (std::vector<Dict *>::iterator p = oLib.begin(); p != oLib.end(); ++p)
  891. delete *p;
  892. }
  893. void Libs::load_dict(const std::string& url)
  894. {
  895. Dict *lib = new Dict;
  896. if (lib->load(url))
  897. oLib.push_back(lib);
  898. else
  899. delete lib;
  900. }
  901. class DictLoader
  902. {
  903. public:
  904. DictLoader(Libs& lib_): lib(lib_)
  905. {}
  906. void operator()(const std::string& url, bool disable)
  907. {
  908. if (!disable)
  909. lib.load_dict(url);
  910. }
  911. private:
  912. Libs& lib;
  913. };
  914. void Libs::load(const strlist_t& dicts_dirs,
  915. const strlist_t& order_list,
  916. const strlist_t& disable_list)
  917. {
  918. for_each_file(dicts_dirs, ".ifo", order_list, disable_list,
  919. DictLoader(*this));
  920. }
  921. class DictReLoader
  922. {
  923. public:
  924. DictReLoader(std::vector<Dict *> &p, std::vector<Dict *> &f,
  925. Libs& lib_) : prev(p), future(f), lib(lib_)
  926. {}
  927. void operator()(const std::string& url, bool disable)
  928. {
  929. if (!disable)
  930. {
  931. Dict *dict = find(url);
  932. if (dict)
  933. future.push_back(dict);
  934. else
  935. lib.load_dict(url);
  936. }
  937. }
  938. private:
  939. std::vector<Dict *> &prev;
  940. std::vector<Dict *> &future;
  941. Libs& lib;
  942. Dict *find(const std::string& url)
  943. {
  944. std::vector<Dict *>::iterator it;
  945. for (it = prev.begin(); it != prev.end(); ++it)
  946. if ((*it)->ifofilename() == url)
  947. break;
  948. if (it != prev.end())
  949. {
  950. Dict *res = *it;
  951. prev.erase(it);
  952. return res;
  953. }
  954. return NULL;
  955. }
  956. };
  957. void Libs::reload(const strlist_t& dicts_dirs,
  958. const strlist_t& order_list,
  959. const strlist_t& disable_list)
  960. {
  961. std::vector<Dict *> prev(oLib);
  962. oLib.clear();
  963. for_each_file(dicts_dirs, ".ifo", order_list, disable_list,
  964. DictReLoader(prev, oLib, *this));
  965. for (std::vector<Dict *>::iterator it = prev.begin(); it != prev.end(); ++it)
  966. delete *it;
  967. }
  968. const gchar *Libs::poGetCurrentWord(glong * iCurrent)
  969. {
  970. const gchar *poCurrentWord = NULL;
  971. const gchar *word;
  972. for (std::vector<Dict *>::size_type iLib = 0; iLib<oLib.size(); iLib++)
  973. {
  974. if (iCurrent[iLib] == INVALID_INDEX)
  975. continue;
  976. if ( iCurrent[iLib] >= narticles(iLib) || iCurrent[iLib] < 0)
  977. continue;
  978. if ( poCurrentWord == NULL )
  979. {
  980. poCurrentWord = poGetWord(iCurrent[iLib], iLib);
  981. }
  982. else
  983. {
  984. word = poGetWord(iCurrent[iLib], iLib);
  985. if (stardict_strcmp(poCurrentWord, word) > 0 )
  986. poCurrentWord = word;
  987. }
  988. }
  989. return poCurrentWord;
  990. }
  991. const gchar *
  992. Libs::poGetNextWord(const gchar *sWord, glong *iCurrent)
  993. {
  994. // the input can be:
  995. // (word,iCurrent),read word,write iNext to iCurrent,and return next word. used by TopWin::NextCallback();
  996. // (NULL,iCurrent),read iCurrent,write iNext to iCurrent,and return next word. used by AppCore::ListWords();
  997. const gchar *poCurrentWord = NULL;
  998. std::vector<Dict *>::size_type iCurrentLib = 0;
  999. const gchar *word;
  1000. for (std::vector<Dict *>::size_type iLib = 0;iLib<oLib.size();iLib++)
  1001. {
  1002. if (sWord)
  1003. oLib[iLib]->Lookup(sWord, iCurrent[iLib]);
  1004. if (iCurrent[iLib] == INVALID_INDEX)
  1005. continue;
  1006. if (iCurrent[iLib] >= narticles(iLib) || iCurrent[iLib] < 0)
  1007. continue;
  1008. if (poCurrentWord == NULL )
  1009. {
  1010. poCurrentWord = poGetWord(iCurrent[iLib], iLib);
  1011. iCurrentLib = iLib;
  1012. }
  1013. else
  1014. {
  1015. word = poGetWord(iCurrent[iLib], iLib);
  1016. if (stardict_strcmp(poCurrentWord, word) > 0 )
  1017. {
  1018. poCurrentWord = word;
  1019. iCurrentLib = iLib;
  1020. }
  1021. }
  1022. }
  1023. if (poCurrentWord)
  1024. {
  1025. iCurrent[iCurrentLib]
  1026. ++;
  1027. for (std::vector<Dict *>::size_type iLib = 0;iLib<oLib.size();iLib++)
  1028. {
  1029. if (iLib == iCurrentLib)
  1030. continue;
  1031. if (iCurrent[iLib] == INVALID_INDEX)
  1032. continue;
  1033. if ( iCurrent[iLib] >= narticles(iLib) || iCurrent[iLib] < 0)
  1034. continue;
  1035. if (strcmp(poCurrentWord, poGetWord(iCurrent[iLib], iLib)) == 0 )
  1036. iCurrent[iLib]++;
  1037. }
  1038. poCurrentWord = poGetCurrentWord(iCurrent);
  1039. }
  1040. return poCurrentWord;
  1041. }
  1042. const gchar *
  1043. Libs::poGetPreWord(glong * iCurrent)
  1044. {
  1045. // used by TopWin::PreviousCallback(); the iCurrent is cached by AppCore::TopWinWordChange();
  1046. const gchar *poCurrentWord = NULL;
  1047. std::vector<Dict *>::size_type iCurrentLib = 0;
  1048. const gchar *word;
  1049. for (std::vector<Dict *>::size_type iLib = 0;iLib<oLib.size();iLib++)
  1050. {
  1051. if (iCurrent[iLib] == INVALID_INDEX)
  1052. iCurrent[iLib] = narticles(iLib);
  1053. else
  1054. {
  1055. if ( iCurrent[iLib] > narticles(iLib) || iCurrent[iLib] <= 0)
  1056. continue;
  1057. }
  1058. if ( poCurrentWord == NULL )
  1059. {
  1060. poCurrentWord = poGetWord(iCurrent[iLib] - 1, iLib);
  1061. iCurrentLib = iLib;
  1062. }
  1063. else
  1064. {
  1065. word = poGetWord(iCurrent[iLib] - 1, iLib);
  1066. if (stardict_strcmp(poCurrentWord, word) < 0 )
  1067. {
  1068. poCurrentWord = word;
  1069. iCurrentLib = iLib;
  1070. }
  1071. }
  1072. }
  1073. if (poCurrentWord)
  1074. {
  1075. iCurrent[iCurrentLib]
  1076. --;
  1077. for (std::vector<Dict *>::size_type iLib = 0;iLib<oLib.size();iLib++)
  1078. {
  1079. if (iLib == iCurrentLib)
  1080. continue;
  1081. if (iCurrent[iLib] > narticles(iLib) || iCurrent[iLib] <= 0)
  1082. continue;
  1083. if (strcmp(poCurrentWord, poGetWord(iCurrent[iLib] - 1, iLib)) == 0)
  1084. {
  1085. iCurrent[iLib]--;
  1086. }
  1087. else
  1088. {
  1089. if (iCurrent[iLib] == narticles(iLib))
  1090. iCurrent[iLib] = INVALID_INDEX;
  1091. }
  1092. }
  1093. }
  1094. return poCurrentWord;
  1095. }
  1096. bool Libs::LookupSimilarWord(const gchar* sWord, glong & iWordIndex, int iLib)
  1097. {
  1098. glong iIndex;
  1099. bool bFound = false;
  1100. gchar *casestr;
  1101. if (!bFound)
  1102. {
  1103. // to lower case.
  1104. casestr = g_utf8_strdown(sWord, -1);
  1105. if (strcmp(casestr, sWord))
  1106. {
  1107. if (oLib[iLib]->Lookup(casestr, iIndex))
  1108. bFound = true;
  1109. }
  1110. g_free(casestr);
  1111. // to upper case.
  1112. if (!bFound)
  1113. {
  1114. casestr = g_utf8_strup(sWord, -1);
  1115. if (strcmp(casestr, sWord))
  1116. {
  1117. if (oLib[iLib]->Lookup(casestr, iIndex))
  1118. bFound = true;
  1119. }
  1120. g_free(casestr);
  1121. }
  1122. // Upper the first character and lower others.
  1123. if (!bFound)
  1124. {
  1125. gchar *nextchar = g_utf8_next_char(sWord);
  1126. gchar *firstchar = g_utf8_strup(sWord, nextchar - sWord);
  1127. nextchar = g_utf8_strdown(nextchar, -1);
  1128. casestr = g_strdup_printf("%s%s", firstchar, nextchar);
  1129. g_free(firstchar);
  1130. g_free(nextchar);
  1131. if (strcmp(casestr, sWord))
  1132. {
  1133. if (oLib[iLib]->Lookup(casestr, iIndex))
  1134. bFound = true;
  1135. }
  1136. g_free(casestr);
  1137. }
  1138. }
  1139. if (bIsPureEnglish(sWord))
  1140. {
  1141. // If not Found , try other status of sWord.
  1142. int iWordLen = strlen(sWord);
  1143. bool isupcase;
  1144. gchar *sNewWord = (gchar *)g_malloc(iWordLen + 1);
  1145. //cut one char "s" or "d"
  1146. if (!bFound && iWordLen > 1)
  1147. {
  1148. isupcase = sWord[iWordLen - 1] == 'S' || !strncmp(&sWord[iWordLen - 2], "ED", 2);
  1149. if (isupcase || sWord[iWordLen - 1] == 's' || !strncmp(&sWord[iWordLen - 2], "ed", 2))
  1150. {
  1151. strcpy(sNewWord, sWord);
  1152. sNewWord[iWordLen - 1] = '\0'; // cut "s" or "d"
  1153. if (oLib[iLib]->Lookup(sNewWord, iIndex))
  1154. bFound = true;
  1155. else if (isupcase || g_ascii_isupper(sWord[0]))
  1156. {
  1157. casestr = g_ascii_strdown(sNewWord, -1);
  1158. if (strcmp(casestr, sNewWord))
  1159. {
  1160. if (oLib[iLib]->Lookup(casestr, iIndex))
  1161. bFound = true;
  1162. }
  1163. g_free(casestr);
  1164. }
  1165. }
  1166. }
  1167. //cut "ly"
  1168. if (!bFound && iWordLen > 2)
  1169. {
  1170. isupcase = !strncmp(&sWord[iWordLen - 2], "LY", 2);
  1171. if (isupcase || (!strncmp(&sWord[iWordLen - 2], "ly", 2)))
  1172. {
  1173. strcpy(sNewWord, sWord);
  1174. sNewWord[iWordLen - 2] = '\0'; // cut "ly"
  1175. if (iWordLen > 5 && sNewWord[iWordLen - 3] == sNewWord[iWordLen - 4]
  1176. && !bIsVowel(sNewWord[iWordLen - 4]) &&
  1177. bIsVowel(sNewWord[iWordLen - 5]))
  1178. { //doubled
  1179. sNewWord[iWordLen - 3] = '\0';
  1180. if ( oLib[iLib]->Lookup(sNewWord, iIndex) )
  1181. bFound = true;
  1182. else
  1183. {
  1184. if (isupcase || g_ascii_isupper(sWord[0]))
  1185. {
  1186. casestr = g_ascii_strdown(sNewWord, -1);
  1187. if (strcmp(casestr, sNewWord))
  1188. {
  1189. if (oLib[iLib]->Lookup(casestr, iIndex))
  1190. bFound = true;
  1191. }
  1192. g_free(casestr);
  1193. }
  1194. if (!bFound)
  1195. sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4]; //restore
  1196. }
  1197. }
  1198. if (!bFound)
  1199. {
  1200. if (oLib[iLib]->Lookup(sNewWord, iIndex))
  1201. bFound = true;
  1202. else if (isupcase || g_ascii_isupper(sWord[0]))
  1203. {
  1204. casestr = g_ascii_strdown(sNewWord, -1);
  1205. if (strcmp(casestr, sNewWord))
  1206. {
  1207. if (oLib[iLib]->Lookup(casestr, iIndex))
  1208. bFound = true;
  1209. }
  1210. g_free(casestr);
  1211. }
  1212. }
  1213. }
  1214. }
  1215. //cut "ing"
  1216. if (!bFound && iWordLen > 3)
  1217. {
  1218. isupcase = !strncmp(&sWord[iWordLen - 3], "ING", 3);
  1219. if (isupcase || !strncmp(&sWord[iWordLen - 3], "ing", 3) )
  1220. {
  1221. strcpy(sNewWord, sWord);
  1222. sNewWord[iWordLen - 3] = '\0';
  1223. if ( iWordLen > 6 && (sNewWord[iWordLen - 4] == sNewWord[iWordLen - 5])
  1224. && !bIsVowel(sNewWord[iWordLen - 5]) &&
  1225. bIsVowel(sNewWord[iWordLen - 6]))
  1226. { //doubled
  1227. sNewWord[iWordLen - 4] = '\0';
  1228. if (oLib[iLib]->Lookup(sNewWord, iIndex))
  1229. bFound = true;
  1230. else
  1231. {
  1232. if (isupcase || g_ascii_isupper(sWord[0]))
  1233. {
  1234. casestr = g_ascii_strdown(sNewWord, -1);
  1235. if (strcmp(casestr, sNewWord))
  1236. {
  1237. if (oLib[iLib]->Lookup(casestr, iIndex))
  1238. bFound = true;
  1239. }
  1240. g_free(casestr);
  1241. }
  1242. if (!bFound)
  1243. sNewWord[iWordLen - 4] = sNewWord[iWordLen - 5]; //restore
  1244. }
  1245. }
  1246. if ( !bFound )
  1247. {
  1248. if (oLib[iLib]->Lookup(sNewWord, iIndex))
  1249. bFound = true;
  1250. else if (isupcase || g_ascii_isupper(sWord[0]))
  1251. {
  1252. casestr = g_ascii_strdown(sNewWord, -1);
  1253. if (strcmp(casestr, sNewWord))
  1254. {
  1255. if (oLib[iLib]->Lookup(casestr, iIndex))
  1256. bFound = true;
  1257. }
  1258. g_free(casestr);
  1259. }
  1260. }
  1261. if (!bFound)
  1262. {
  1263. if (isupcase)
  1264. strcat(sNewWord, "E"); // add a char "E"
  1265. else
  1266. strcat(sNewWord, "e"); // add a char "e"
  1267. if (oLib[iLib]->Lookup(sNewWord, iIndex))
  1268. bFound = true;
  1269. else if (isupcase || g_ascii_isupper(sWord[0]))
  1270. {
  1271. casestr = g_ascii_strdown(sNewWord, -1);
  1272. if (strcmp(casestr, sNewWord))
  1273. {
  1274. if (oLib[iLib]->Lookup(casestr, iIndex))
  1275. bFound = true;
  1276. }
  1277. g_free(casestr);
  1278. }
  1279. }
  1280. }
  1281. }
  1282. //cut two char "es"
  1283. if (!bFound && iWordLen > 3)
  1284. {
  1285. isupcase = (!strncmp(&sWord[iWordLen - 2], "ES", 2) &&
  1286. (sWord[iWordLen - 3] == 'S' ||
  1287. sWord[iWordLen - 3] == 'X' ||
  1288. sWord[iWordLen - 3] == 'O' ||
  1289. (iWordLen > 4 && sWord[iWordLen - 3] == 'H' &&
  1290. (sWord[iWordLen - 4] == 'C' ||
  1291. sWord[iWordLen - 4] == 'S'))));
  1292. if (isupcase ||
  1293. (!strncmp(&sWord[iWordLen - 2], "es", 2) &&
  1294. (sWord[iWordLen - 3] == 's' || sWord[iWordLen - 3] == 'x' ||
  1295. sWord[iWordLen - 3] == 'o' ||
  1296. (iWordLen > 4 && sWord[iWordLen - 3] == 'h' &&
  1297. (sWord[iWordLen - 4] == 'c' || sWord[iWordLen - 4] == 's')))))
  1298. {
  1299. strcpy(sNewWord, sWord);
  1300. sNewWord[iWordLen - 2] = '\0';
  1301. if (oLib[iLib]->Lookup(sNewWord, iIndex))
  1302. bFound = true;
  1303. else if (isupcase || g_ascii_isupper(sWord[0]))
  1304. {
  1305. casestr = g_ascii_strdown(sNewWord, -1);
  1306. if (strcmp(casestr, sNewWord))
  1307. {
  1308. if (oLib[iLib]->Lookup(casestr, iIndex))
  1309. bFound = true;
  1310. }
  1311. g_free(casestr);
  1312. }
  1313. }
  1314. }
  1315. //cut "ed"
  1316. if (!bFound && iWordLen > 3)
  1317. {
  1318. isupcase = !strncmp(&sWord[iWordLen - 2], "ED", 2);
  1319. if (isupcase || !strncmp(&sWord[iWordLen - 2], "ed", 2))
  1320. {
  1321. strcpy(sNewWord, sWord);
  1322. sNewWord[iWordLen - 2] = '\0';
  1323. if (iWordLen > 5 && (sNewWord[iWordLen - 3] == sNewWord[iWordLen - 4])
  1324. && !bIsVowel(sNewWord[iWordLen - 4]) &&
  1325. bIsVowel(sNewWord[iWordLen - 5]))
  1326. { //doubled
  1327. sNewWord[iWordLen - 3] = '\0';
  1328. if (oLib[iLib]->Lookup(sNewWord, iIndex))
  1329. bFound = true;
  1330. else
  1331. {
  1332. if (isupcase || g_ascii_isupper(sWord[0]))
  1333. {
  1334. casestr = g_ascii_strdown(sNewWord, -1);
  1335. if (strcmp(casestr, sNewWord))
  1336. {
  1337. if (oLib[iLib]->Lookup(casestr, iIndex))
  1338. bFound = true;
  1339. }
  1340. g_free(casestr);
  1341. }
  1342. if (!bFound)
  1343. sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4]; //restore
  1344. }
  1345. }
  1346. if (!bFound)
  1347. {
  1348. if (oLib[iLib]->Lookup(sNewWord, iIndex))
  1349. bFound = true;
  1350. else if (isupcase || g_ascii_isupper(sWord[0]))
  1351. {
  1352. casestr = g_ascii_strdown(sNewWord, -1);
  1353. if (strcmp(casestr, sNewWord))
  1354. {
  1355. if (oLib[iLib]->Lookup(casestr, iIndex))
  1356. bFound = true;
  1357. }
  1358. g_free(casestr);
  1359. }
  1360. }
  1361. }
  1362. }
  1363. // cut "ied" , add "y".
  1364. if (!bFound && iWordLen > 3)
  1365. {
  1366. isupcase = !strncmp(&sWord[iWordLen - 3], "IED", 3);
  1367. if (isupcase || (!strncmp(&sWord[iWordLen - 3], "ied", 3)))
  1368. {
  1369. strcpy(sNewWord, sWord);
  1370. sNewWord[iWordLen - 3] = '\0';
  1371. if (isupcase)
  1372. strcat(sNewWord, "Y"); // add a char "Y"
  1373. else
  1374. strcat(sNewWord, "y"); // add a char "y"
  1375. if (oLib[iLib]->Lookup(sNewWord, iIndex))
  1376. bFound = true;
  1377. else if (isupcase || g_ascii_isupper(sWord[0]))
  1378. {
  1379. casestr = g_ascii_strdown(sNewWord, -1);
  1380. if (strcmp(casestr, sNewWord))
  1381. {
  1382. if (oLib[iLib]->Lookup(casestr, iIndex))
  1383. bFound = true;
  1384. }
  1385. g_free(casestr);
  1386. }
  1387. }
  1388. }
  1389. // cut "ies" , add "y".
  1390. if (!bFound && iWordLen > 3)
  1391. {
  1392. isupcase = !strncmp(&sWord[iWordLen - 3], "IES", 3);
  1393. if (isupcase || (!strncmp(&sWord[iWordLen - 3], "ies", 3)))
  1394. {
  1395. strcpy(sNewWord, sWord);
  1396. sNewWord[iWordLen - 3] = '\0';
  1397. if (isupcase)
  1398. strcat(sNewWord, "Y"); // add a char "Y"
  1399. else
  1400. strcat(sNewWord, "y"); // add a char "y"
  1401. if (oLib[iLib]->Lookup(sNewWord, iIndex))
  1402. bFound = true;
  1403. else if (isupcase || g_ascii_isupper(sWord[0]))
  1404. {
  1405. casestr = g_ascii_strdown(sNewWord, -1);
  1406. if (strcmp(casestr, sNewWord))
  1407. {
  1408. if (oLib[iLib]->Lookup(casestr, iIndex))
  1409. bFound = true;
  1410. }
  1411. g_free(casestr);
  1412. }
  1413. }
  1414. }
  1415. // cut "er".
  1416. if (!bFound && iWordLen > 2)
  1417. {
  1418. isupcase = !strncmp(&sWord[iWordLen - 2], "ER", 2);
  1419. if (isupcase || (!strncmp(&sWord[iWordLen - 2], "er", 2)))
  1420. {
  1421. strcpy(sNewWord, sWord);
  1422. sNewWord[iWordLen - 2] = '\0';
  1423. if (oLib[iLib]->Lookup(sNewWord, iIndex))
  1424. bFound = true;
  1425. else if (isupcase || g_ascii_isupper(sWord[0]))
  1426. {
  1427. casestr = g_ascii_strdown(sNewWord, -1);
  1428. if (strcmp(casestr, sNewWord))
  1429. {
  1430. if (oLib[iLib]->Lookup(casestr, iIndex))
  1431. bFound = true;
  1432. }
  1433. g_free(casestr);
  1434. }
  1435. }
  1436. }
  1437. // cut "est".
  1438. if (!bFound && iWordLen > 3)
  1439. {
  1440. isupcase = !strncmp(&sWord[iWordLen - 3], "EST", 3);
  1441. if (isupcase || (!strncmp(&sWord[iWordLen - 3], "est", 3)))
  1442. {
  1443. strcpy(sNewWord, sWord);
  1444. sNewWord[iWordLen - 3] = '\0';
  1445. if (oLib[iLib]->Lookup(sNewWord, iIndex))
  1446. bFound = true;
  1447. else if (isupcase || g_ascii_isupper(sWord[0]))
  1448. {
  1449. casestr = g_ascii_strdown(sNewWord, -1);
  1450. if (strcmp(casestr, sNewWord))
  1451. {
  1452. if (oLib[iLib]->Lookup(casestr, iIndex))
  1453. bFound = true;
  1454. }
  1455. g_free(casestr);
  1456. }
  1457. }
  1458. }
  1459. g_free(sNewWord);
  1460. }
  1461. if (bFound)
  1462. iWordIndex = iIndex;
  1463. #if 0
  1464. else
  1465. {
  1466. //don't change iWordIndex here.
  1467. //when LookupSimilarWord all failed too, we want to use the old LookupWord index to list words.
  1468. //iWordIndex = INVALID_INDEX;
  1469. }
  1470. #endif
  1471. return bFound;
  1472. }
  1473. bool Libs::SimpleLookupWord(const gchar* sWord, glong & iWordIndex, int iLib)
  1474. {
  1475. bool bFound = oLib[iLib]->Lookup(sWord, iWordIndex);
  1476. if (!bFound)
  1477. bFound = LookupSimilarWord(sWord, iWordIndex, iLib);
  1478. return bFound;
  1479. }
  // One candidate in a fuzzy-search result list.
  struct Fuzzystruct
  {
      char *pMatchWord;       // matched dictionary word, or NULL while the slot is unused
      int iMatchWordDistance; // edit distance recorded for this slot
  };
  1485. inline bool operator<(const Fuzzystruct & lh, const Fuzzystruct & rh)
  1486. {
  1487. if (lh.iMatchWordDistance != rh.iMatchWordDistance)
  1488. return lh.iMatchWordDistance < rh.iMatchWordDistance;
  1489. if (lh.pMatchWord && rh.pMatchWord)
  1490. return stardict_strcmp(lh.pMatchWord, rh.pMatchWord) < 0;
  1491. return false;
  1492. }
  1493. static inline void unicode_strdown(gunichar *str)
  1494. {
  1495. while (*str)
  1496. {
  1497. *str = g_unichar_tolower(*str);
  1498. ++str;
  1499. }
  1500. }
  1501. bool Libs::LookupWithFuzzy(const gchar *sWord, gchar *reslist[], gint reslist_size, gint iLib)
  1502. {
  1503. if (sWord[0] == '\0')
  1504. return false;
  1505. Fuzzystruct *oFuzzystruct = new Fuzzystruct[reslist_size];
  1506. for (int i = 0; i < reslist_size; i++)
  1507. {
  1508. oFuzzystruct[i].pMatchWord = NULL;
  1509. oFuzzystruct[i].iMatchWordDistance = iMaxFuzzyDistance;
  1510. }
  1511. int iMaxDistance = iMaxFuzzyDistance;
  1512. int iDistance;
  1513. bool Found = false;
  1514. EditDistance oEditDistance;
  1515. glong iCheckWordLen;
  1516. const char *sCheck;
  1517. gunichar *ucs4_str1, *ucs4_str2;
  1518. glong ucs4_str2_len;
  1519. ucs4_str2 = g_utf8_to_ucs4_fast(sWord, -1, &ucs4_str2_len);
  1520. unicode_strdown(ucs4_str2);
  1521. // for (std::vector<Dict *>::size_type iLib = 0; iLib<oLib.size(); iLib++)
  1522. // {
  1523. if (progress_func)
  1524. progress_func();
  1525. //if (stardict_strcmp(sWord, poGetWord(0,iLib))>=0 && stardict_strcmp(sWord, poGetWord(narticles(iLib)-1,iLib))<=0) {
  1526. //there are Chinese dicts and English dicts...
  1527. if (TRUE)
  1528. {
  1529. const int iwords = narticles(iLib);
  1530. for (int index = 0; index < iwords; index++)
  1531. {
  1532. sCheck = poGetWord(index, iLib);
  1533. // tolower and skip too long or too short words
  1534. iCheckWordLen = g_utf8_strlen(sCheck, -1);
  1535. if (iCheckWordLen - ucs4_str2_len >= iMaxDistance ||
  1536. ucs4_str2_len - iCheckWordLen >= iMaxDistance)
  1537. continue;
  1538. ucs4_str1 = g_utf8_to_ucs4_fast(sCheck, -1, NULL);
  1539. if (iCheckWordLen > ucs4_str2_len)
  1540. ucs4_str1[ucs4_str2_len] = 0;
  1541. unicode_strdown(ucs4_str1);
  1542. iDistance = oEditDistance.CalEditDistance(ucs4_str1, ucs4_str2, iMaxDistance);
  1543. g_free(ucs4_str1);
  1544. if (iDistance < iMaxDistance && iDistance < ucs4_str2_len)
  1545. {
  1546. // when ucs4_str2_len=1,2 we need less fuzzy.
  1547. Found = true;
  1548. bool bAlreadyInList = false;
  1549. int iMaxDistanceAt = 0;
  1550. for (int j = 0; j < reslist_size; j++)
  1551. {
  1552. if (oFuzzystruct[j].pMatchWord &&
  1553. strcmp(oFuzzystruct[j].pMatchWord, sCheck) == 0 )
  1554. { //already in list
  1555. bAlreadyInList = true;
  1556. break;
  1557. }
  1558. //find the position,it will certainly be found (include the first time) as iMaxDistance is set by last time.
  1559. if (oFuzzystruct[j].iMatchWordDistance == iMaxDistance )
  1560. {
  1561. iMaxDistanceAt = j;
  1562. }
  1563. }
  1564. if (!bAlreadyInList)
  1565. {
  1566. if (oFuzzystruct[iMaxDistanceAt].pMatchWord)
  1567. g_free(oFuzzystruct[iMaxDistanceAt].pMatchWord);
  1568. oFuzzystruct[iMaxDistanceAt].pMatchWord = g_strdup(sCheck);
  1569. oFuzzystruct[iMaxDistanceAt].iMatchWordDistance = iDistance;
  1570. // calc new iMaxDistance
  1571. iMaxDistance = iDistance;
  1572. for (int j = 0; j < reslist_size; j++)
  1573. {
  1574. if (oFuzzystruct[j].iMatchWordDistance > iMaxDistance)
  1575. iMaxDistance = oFuzzystruct[j].iMatchWordDistance;
  1576. } // calc new iMaxDistance
  1577. } // add to list
  1578. } // find one
  1579. } // each word
  1580. } // ok for search
  1581. // } // each lib
  1582. g_free(ucs4_str2);
  1583. if (Found) // sort with distance
  1584. std::sort(oFuzzystruct, oFuzzystruct + reslist_size);
  1585. for (gint i = 0; i < reslist_size; ++i)
  1586. reslist[i] = oFuzzystruct[i].pMatchWord;
  1587. delete[] oFuzzystruct;
  1588. return Found;
  1589. }
  1590. inline bool less_for_compare(const char *lh, const char *rh)
  1591. {
  1592. return stardict_strcmp(lh, rh) < 0;
  1593. }
  1594. gint Libs::LookupWithRule(const gchar *word, gchar **ppMatchWord)
  1595. {
  1596. glong aiIndex[MAX_MATCH_ITEM_PER_LIB + 1];
  1597. gint iMatchCount = 0;
  1598. GPatternSpec *pspec = g_pattern_spec_new(word);
  1599. for (std::vector<Dict *>::size_type iLib = 0; iLib<oLib.size(); iLib++)
  1600. {
  1601. //if(oLibs.LookdupWordsWithRule(pspec,aiIndex,MAX_MATCH_ITEM_PER_LIB+1-iMatchCount,iLib))
  1602. // -iMatchCount,so save time,but may got less result and the word may repeat.
  1603. if (oLib[iLib]->
  1604. LookupWithRule(pspec, aiIndex, MAX_MATCH_ITEM_PER_LIB + 1))
  1605. {
  1606. if (progress_func)
  1607. progress_func();
  1608. for (int i = 0; aiIndex[i] != -1; i++)
  1609. {
  1610. const gchar * sMatchWord = poGetWord(aiIndex[i], iLib);
  1611. bool bAlreadyInList = false;
  1612. for (int j = 0; j < iMatchCount; j++)
  1613. {
  1614. if (strcmp(ppMatchWord[j], sMatchWord) == 0)
  1615. { //already in list
  1616. bAlreadyInList = true;
  1617. break;
  1618. }
  1619. }
  1620. if (!bAlreadyInList)
  1621. ppMatchWord[iMatchCount++] = g_strdup(sMatchWord);
  1622. }
  1623. }
  1624. }
  1625. g_pattern_spec_free(pspec);
  1626. if (iMatchCount) // sort it.
  1627. std::sort(ppMatchWord, ppMatchWord + iMatchCount, less_for_compare);
  1628. return iMatchCount;
  1629. }
  1630. bool Libs::LookupData(const gchar *sWord, std::vector<gchar *> *reslist)
  1631. {
  1632. std::vector<std::string> SearchWords;
  1633. std::string SearchWord;
  1634. const char *p = sWord;
  1635. while (*p)
  1636. {
  1637. if (*p == '\\')
  1638. {
  1639. p++;
  1640. switch (*p)
  1641. {
  1642. case ' ':
  1643. SearchWord += ' ';
  1644. break;
  1645. case '\\':
  1646. SearchWord += '\\';
  1647. break;
  1648. case 't':
  1649. SearchWord += '\t';
  1650. break;
  1651. case 'n':
  1652. SearchWord += '\n';
  1653. break;
  1654. default:
  1655. SearchWord += *p;
  1656. }
  1657. }
  1658. else if (*p == ' ')
  1659. {
  1660. if (!SearchWord.empty())
  1661. {
  1662. SearchWords.push_back(SearchWord);
  1663. SearchWord.clear();
  1664. }
  1665. }
  1666. else
  1667. {
  1668. SearchWord += *p;
  1669. }
  1670. p++;
  1671. }
  1672. if (!SearchWord.empty())
  1673. {
  1674. SearchWords.push_back(SearchWord);
  1675. SearchWord.clear();
  1676. }
  1677. if (SearchWords.empty())
  1678. return false;
  1679. guint32 max_size = 0;
  1680. gchar *origin_data = NULL;
  1681. for (std::vector<Dict *>::size_type i = 0; i<oLib.size(); ++i)
  1682. {
  1683. if (!oLib[i]->
  1684. containSearchData())
  1685. continue;
  1686. if (progress_func)
  1687. progress_func();
  1688. const gulong iwords = narticles(i);
  1689. const gchar *key;
  1690. guint32 offset, size;
  1691. for (gulong j = 0;
  1692. j < iwords;
  1693. ++j)
  1694. {
  1695. oLib[i]
  1696. ->get_key_and_data(j, &key, &offset, &size);
  1697. if (size > max_size)
  1698. {
  1699. origin_data = (gchar *)g_realloc(origin_data, size);
  1700. max_size = size;
  1701. }
  1702. if (oLib[i]->SearchData(SearchWords, offset, size, origin_data))
  1703. reslist[i].push_back(g_strdup(key));
  1704. }
  1705. }
  1706. g_free(origin_data);
  1707. std::vector<Dict *>::size_type i;
  1708. for (i = 0; i<oLib.size(); ++i)
  1709. if (!reslist[i].empty())
  1710. break;
  1711. return i != oLib.size();
  1712. }
  1713. /**************************************************/
  1714. query_t analyze_query(const char *s, std::string& res)
  1715. {
  1716. if (!s || !*s)
  1717. {
  1718. res = "";
  1719. return qtSIMPLE;
  1720. }
  1721. if (*s == '/')
  1722. {
  1723. res = s + 1;
  1724. return qtFUZZY;
  1725. }
  1726. if (*s == '|')
  1727. {
  1728. res = s + 1;
  1729. return qtDATA;
  1730. }
  1731. bool regexp = false;
  1732. const char *p = s;
  1733. res = "";
  1734. for (; *p; res += *p, ++p)
  1735. {
  1736. if (*p == '\\')
  1737. {
  1738. ++p;
  1739. if (!*p)
  1740. break;
  1741. continue;
  1742. }
  1743. if (*p == '*' || *p == '?')
  1744. regexp = true;
  1745. }
  1746. if (regexp)
  1747. return qtREGEXP;
  1748. return qtSIMPLE;
  1749. }