123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926 |
- #ifdef HAVE_CONFIG_H
- # include "config.h"
- #endif
- #include <algorithm>
- #include <cstring>
- #include <cctype>
- #include <sys/stat.h>
- #include <zlib.h>
- #include <glib/gstdio.h>
- #include "distance.h"
- #include "file.hpp"
- #include "mapfile.hpp"
- #include "lib.h"
- // Notice: read src/tools/DICTFILE_FORMAT for the dictionary
- // file's format information!
- static inline bool bIsVowel(gchar inputchar)
- {
- gchar ch = g_ascii_toupper(inputchar);
- return ( ch == 'A' || ch == 'E' || ch == 'I' || ch == 'O' || ch == 'U' );
- }
- static bool bIsPureEnglish(const gchar *str)
- {
- // i think this should work even when it is UTF8 string :).
- for (int i = 0; str[i] != 0; i++)
- //if(str[i]<0)
- //if(str[i]<32 || str[i]>126) // tab equal 9,so this is not OK.
- // Better use isascii() but not str[i]<0 while char is default unsigned in arm
- if (!isascii(str[i]))
- return false;
- return true;
- }
- static inline gint stardict_strcmp(const gchar *s1, const gchar *s2)
- {
- gint a = g_ascii_strcasecmp(s1, s2);
- if (a == 0)
- return strcmp(s1, s2);
- else
- return a;
- }
- bool DictInfo::load_from_ifo_file(const std::string& ifofilename,
- bool istreedict)
- {
- ifo_file_name = ifofilename;
- gchar *buffer;
- if (!g_file_get_contents(ifofilename.c_str(), &buffer, NULL, NULL))
- return false;
- #define TREEDICT_MAGIC_DATA "StarDict's treedict ifo file\nversion=2.4.2\n"
- #define DICT_MAGIC_DATA "StarDict's dict ifo file\nversion=2.4.2\n"
- const gchar *magic_data = istreedict ? TREEDICT_MAGIC_DATA : DICT_MAGIC_DATA;
- if (!g_str_has_prefix(buffer, magic_data))
- {
- g_free(buffer);
- return false;
- }
- gchar *p1, *p2, *p3;
- p1 = buffer + strlen(magic_data) - 1;
- p2 = strstr(p1, "\nwordcount=");
- if (!p2)
- {
- g_free(buffer);
- return false;
- }
- p3 = strchr(p2 + sizeof("\nwordcount=") - 1, '\n');
- gchar *tmpstr = (gchar *)g_memdup(p2 + sizeof("\nwordcount=") - 1, p3 - (p2 + sizeof("\nwordcount=") - 1) + 1);
- tmpstr[p3 - (p2 + sizeof("\nwordcount=") - 1)] = '\0';
- wordcount = atol(tmpstr);
- g_free(tmpstr);
- if (istreedict)
- {
- p2 = strstr(p1, "\ntdxfilesize=");
- if (!p2)
- {
- g_free(buffer);
- return false;
- }
- p3 = strchr(p2 + sizeof("\ntdxfilesize=") - 1, '\n');
- tmpstr = (gchar *)g_memdup(p2 + sizeof("\ntdxfilesize=") - 1, p3 - (p2 + sizeof("\ntdxfilesize=") - 1) + 1);
- tmpstr[p3 - (p2 + sizeof("\ntdxfilesize=") - 1)] = '\0';
- index_file_size = atol(tmpstr);
- g_free(tmpstr);
- }
- else
- {
- p2 = strstr(p1, "\nidxfilesize=");
- if (!p2)
- {
- g_free(buffer);
- return false;
- }
- p3 = strchr(p2 + sizeof("\nidxfilesize=") - 1, '\n');
- tmpstr = (gchar *)g_memdup(p2 + sizeof("\nidxfilesize=") - 1, p3 - (p2 + sizeof("\nidxfilesize=") - 1) + 1);
- tmpstr[p3 - (p2 + sizeof("\nidxfilesize=") - 1)] = '\0';
- index_file_size = atol(tmpstr);
- g_free(tmpstr);
- }
- p2 = strstr(p1, "\nbookname=");
- if (!p2)
- {
- g_free(buffer);
- return false;
- }
- p2 = p2 + sizeof("\nbookname=") - 1;
- p3 = strchr(p2, '\n');
- bookname.assign(p2, p3 - p2);
- p2 = strstr(p1, "\nauthor=");
- if (p2)
- {
- p2 = p2 + sizeof("\nauthor=") - 1;
- p3 = strchr(p2, '\n');
- author.assign(p2, p3 - p2);
- }
- p2 = strstr(p1, "\nemail=");
- if (p2)
- {
- p2 = p2 + sizeof("\nemail=") - 1;
- p3 = strchr(p2, '\n');
- email.assign(p2, p3 - p2);
- }
- p2 = strstr(p1, "\nwebsite=");
- if (p2)
- {
- p2 = p2 + sizeof("\nwebsite=") - 1;
- p3 = strchr(p2, '\n');
- website.assign(p2, p3 - p2);
- }
- p2 = strstr(p1, "\ndate=");
- if (p2)
- {
- p2 = p2 + sizeof("\ndate=") - 1;
- p3 = strchr(p2, '\n');
- date.assign(p2, p3 - p2);
- }
- p2 = strstr(p1, "\ndescription=");
- if (p2)
- {
- p2 = p2 + sizeof("\ndescription=") - 1;
- p3 = strchr(p2, '\n');
- description.assign(p2, p3 - p2);
- }
- p2 = strstr(p1, "\nsametypesequence=");
- if (p2)
- {
- p2 += sizeof("\nsametypesequence=") - 1;
- p3 = strchr(p2, '\n');
- sametypesequence.assign(p2, p3 - p2);
- }
- g_free(buffer);
- return true;
- }
- //===================================================================
- DictBase::DictBase()
- {
- dictfile = NULL;
- cache_cur = 0;
- }
- DictBase::~DictBase()
- {
- if (dictfile)
- fclose(dictfile);
- }
- gchar* DictBase::GetWordData(guint32 idxitem_offset, guint32 idxitem_size)
- {
- for (int i = 0; i < WORDDATA_CACHE_NUM; i++)
- if (cache[i].data && cache[i].offset == idxitem_offset)
- return cache[i].data;
- if (dictfile)
- fseek(dictfile, idxitem_offset, SEEK_SET);
- gchar *data;
- if (!sametypesequence.empty())
- {
- gchar *origin_data = (gchar *)g_malloc(idxitem_size);
- if (dictfile)
- fread(origin_data, idxitem_size, 1, dictfile);
- else
- dictdzfile->read(origin_data, idxitem_offset, idxitem_size);
- guint32 data_size;
- gint sametypesequence_len = sametypesequence.length();
- //there have sametypesequence_len char being omitted.
- data_size = idxitem_size + sizeof(guint32) + sametypesequence_len;
- //if the last item's size is determined by the end up '\0',then +=sizeof(gchar);
- //if the last item's size is determined by the head guint32 type data,then +=sizeof(guint32);
- switch (sametypesequence[sametypesequence_len - 1])
- {
- case 'm':
- case 't':
- case 'y':
- case 'l':
- case 'g':
- case 'x':
- data_size += sizeof(gchar);
- break;
- case 'W':
- case 'P':
- data_size += sizeof(guint32);
- break;
- default:
- if (g_ascii_isupper(sametypesequence[sametypesequence_len - 1]))
- data_size += sizeof(guint32);
- else
- data_size += sizeof(gchar);
- break;
- }
- data = (gchar *)g_malloc(data_size);
- gchar *p1, *p2;
- p1 = data + sizeof(guint32);
- p2 = origin_data;
- guint32 sec_size;
- //copy the head items.
- for (int i = 0; i < sametypesequence_len - 1; i++)
- {
- *p1 = sametypesequence[i];
- p1 += sizeof(gchar);
- switch (sametypesequence[i])
- {
- case 'm':
- case 't':
- case 'y':
- case 'l':
- case 'g':
- case 'x':
- sec_size = strlen(p2) + 1;
- memcpy(p1, p2, sec_size);
- p1 += sec_size;
- p2 += sec_size;
- break;
- case 'W':
- case 'P':
- sec_size = *reinterpret_cast<guint32 *>(p2);
- sec_size += sizeof(guint32);
- memcpy(p1, p2, sec_size);
- p1 += sec_size;
- p2 += sec_size;
- break;
- default:
- if (g_ascii_isupper(sametypesequence[i]))
- {
- sec_size = *reinterpret_cast<guint32 *>(p2);
- sec_size += sizeof(guint32);
- }
- else
- {
- sec_size = strlen(p2) + 1;
- }
- memcpy(p1, p2, sec_size);
- p1 += sec_size;
- p2 += sec_size;
- break;
- }
- }
- //calculate the last item 's size.
- sec_size = idxitem_size - (p2 - origin_data);
- *p1 = sametypesequence[sametypesequence_len - 1];
- p1 += sizeof(gchar);
- switch (sametypesequence[sametypesequence_len - 1])
- {
- case 'm':
- case 't':
- case 'y':
- case 'l':
- case 'g':
- case 'x':
- memcpy(p1, p2, sec_size);
- p1 += sec_size;
- *p1 = '\0'; //add the end up '\0';
- break;
- case 'W':
- case 'P':
- *reinterpret_cast<guint32 *>(p1) = sec_size;
- p1 += sizeof(guint32);
- memcpy(p1, p2, sec_size);
- break;
- default:
- if (g_ascii_isupper(sametypesequence[sametypesequence_len - 1]))
- {
- *reinterpret_cast<guint32 *>(p1) = sec_size;
- p1 += sizeof(guint32);
- memcpy(p1, p2, sec_size);
- }
- else
- {
- memcpy(p1, p2, sec_size);
- p1 += sec_size;
- *p1 = '\0';
- }
- break;
- }
- g_free(origin_data);
- *reinterpret_cast<guint32 *>(data) = data_size;
- }
- else
- {
- data = (gchar *)g_malloc(idxitem_size + sizeof(guint32));
- if (dictfile)
- fread(data + sizeof(guint32), idxitem_size, 1, dictfile);
- else
- dictdzfile->read(data + sizeof(guint32), idxitem_offset, idxitem_size);
- *reinterpret_cast<guint32 *>(data) = idxitem_size + sizeof(guint32);
- }
- g_free(cache[cache_cur].data);
- cache[cache_cur].data = data;
- cache[cache_cur].offset = idxitem_offset;
- cache_cur++;
- if (cache_cur == WORDDATA_CACHE_NUM)
- cache_cur = 0;
- return data;
- }
- inline bool DictBase::containSearchData()
- {
- if (sametypesequence.empty())
- return true;
- return sametypesequence.find_first_of("mlgxty") != std::string::npos;
- }
- bool DictBase::SearchData(std::vector<std::string> &SearchWords, guint32 idxitem_offset, guint32 idxitem_size, gchar *origin_data)
- {
- int nWord = SearchWords.size();
- std::vector<bool> WordFind(nWord, false);
- int nfound = 0;
- if (dictfile)
- fseek(dictfile, idxitem_offset, SEEK_SET);
- if (dictfile)
- fread(origin_data, idxitem_size, 1, dictfile);
- else
- dictdzfile->read(origin_data, idxitem_offset, idxitem_size);
- gchar *p = origin_data;
- guint32 sec_size;
- int j;
- if (!sametypesequence.empty())
- {
- gint sametypesequence_len = sametypesequence.length();
- for (int i = 0; i < sametypesequence_len - 1; i++)
- {
- switch (sametypesequence[i])
- {
- case 'm':
- case 't':
- case 'y':
- case 'l':
- case 'g':
- case 'x':
- for (j = 0; j < nWord; j++)
- if (!WordFind[j] && strstr(p, SearchWords[j].c_str()))
- {
- WordFind[j] = true;
- ++nfound;
- }
- if (nfound == nWord)
- return true;
- sec_size = strlen(p) + 1;
- p += sec_size;
- break;
- default:
- if (g_ascii_isupper(sametypesequence[i]))
- {
- sec_size = *reinterpret_cast<guint32 *>(p);
- sec_size += sizeof(guint32);
- }
- else
- {
- sec_size = strlen(p) + 1;
- }
- p += sec_size;
- }
- }
- switch (sametypesequence[sametypesequence_len - 1])
- {
- case 'm':
- case 't':
- case 'y':
- case 'l':
- case 'g':
- case 'x':
- sec_size = idxitem_size - (p - origin_data);
- for (j = 0; j < nWord; j++)
- if (!WordFind[j] &&
- g_strstr_len(p, sec_size, SearchWords[j].c_str()))
- {
- WordFind[j] = true;
- ++nfound;
- }
- if (nfound == nWord)
- return true;
- break;
- }
- }
- else
- {
- while (guint32(p - origin_data) < idxitem_size)
- {
- switch (*p)
- {
- case 'm':
- case 't':
- case 'y':
- case 'l':
- case 'g':
- case 'x':
- for (j = 0; j < nWord; j++)
- if (!WordFind[j] && strstr(p, SearchWords[j].c_str()))
- {
- WordFind[j] = true;
- ++nfound;
- }
- if (nfound == nWord)
- return true;
- sec_size = strlen(p) + 1;
- p += sec_size;
- break;
- default:
- if (g_ascii_isupper(*p))
- {
- sec_size = *reinterpret_cast<guint32 *>(p);
- sec_size += sizeof(guint32);
- }
- else
- {
- sec_size = strlen(p) + 1;
- }
- p += sec_size;
- }
- }
- }
- return false;
- }
- class offset_index : public index_file
- {
- public:
- offset_index() : idxfile(NULL)
- {}
- ~offset_index();
- bool load(const std::string& url, gulong wc, gulong fsize);
- const gchar *get_key(glong idx);
- void get_data(glong idx);
- const gchar *get_key_and_data(glong idx);
- bool lookup(const char *str, glong &idx);
- private:
- static const gint ENTR_PER_PAGE = 32;
- static const char *CACHE_MAGIC;
- std::vector<guint32> wordoffset;
- FILE *idxfile;
- gulong wordcount;
- gchar wordentry_buf[256 + sizeof(guint32)*2]; // The length of "word_str" should be less than 256. See src/tools/DICTFILE_FORMAT.
- struct index_entry
- {
- glong idx;
- std::string keystr;
- void assign(glong i, const std::string& str)
- {
- idx = i;
- keystr.assign(str);
- }
- };
- index_entry first, last, middle, real_last;
- struct page_entry
- {
- gchar *keystr;
- guint32 off, size;
- };
- std::vector<gchar> page_data;
- struct page_t
- {
- glong idx;
- page_entry entries[ENTR_PER_PAGE];
- page_t(): idx( -1)
- {}
- void fill(gchar *data, gint nent, glong idx_);
- }
- page;
- gulong load_page(glong page_idx);
- const gchar *read_first_on_page_key(glong page_idx);
- const gchar *get_first_on_page_key(glong page_idx);
- bool load_cache(const std::string& url);
- bool save_cache(const std::string& url);
- static strlist_t get_cache_variant(const std::string& url);
- };
- const char *offset_index::CACHE_MAGIC = "StarDict's Cache, Version: 0.1";
- class wordlist_index : public index_file
- {
- public:
- wordlist_index() : idxdatabuf(NULL)
- {}
- ~wordlist_index();
- bool load(const std::string& url, gulong wc, gulong fsize);
- const gchar *get_key(glong idx);
- void get_data(glong idx);
- const gchar *get_key_and_data(glong idx);
- bool lookup(const char *str, glong &idx);
- private:
- gchar *idxdatabuf;
- std::vector<gchar *> wordlist;
- };
- void offset_index::page_t::fill(gchar *data, gint nent, glong idx_)
- {
- idx = idx_;
- gchar *p = data;
- glong len;
- for (gint i = 0; i < nent; ++i)
- {
- entries[i].keystr = p;
- len = strlen(p);
- p += len + 1;
- entries[i].off = g_ntohl(*reinterpret_cast<guint32 *>(p));
- p += sizeof(guint32);
- entries[i].size = g_ntohl(*reinterpret_cast<guint32 *>(p));
- p += sizeof(guint32);
- }
- }
- offset_index::~offset_index()
- {
- if (idxfile)
- fclose(idxfile);
- }
- inline const gchar *offset_index::read_first_on_page_key(glong page_idx)
- {
- fseek(idxfile, wordoffset[page_idx], SEEK_SET);
- guint page_size = wordoffset[page_idx + 1] - wordoffset[page_idx];
- fread(wordentry_buf, std::min<guint>(sizeof(wordentry_buf), page_size), 1, idxfile); //TODO: check returned values, deal with word entry that strlen>255.
- return wordentry_buf;
- }
- inline const gchar *offset_index::get_first_on_page_key(glong page_idx)
- {
- if (page_idx < middle.idx)
- {
- if (page_idx == first.idx)
- return first.keystr.c_str();
- return read_first_on_page_key(page_idx);
- }
- else if (page_idx > middle.idx)
- {
- if (page_idx == last.idx)
- return last.keystr.c_str();
- return read_first_on_page_key(page_idx);
- }
- else
- return middle.keystr.c_str();
- }
- bool offset_index::load_cache(const std::string& url)
- {
- strlist_t vars = get_cache_variant(url);
- for (strlist_t::const_iterator it = vars.begin(); it != vars.end(); ++it)
- {
- struct stat idxstat, cachestat;
- if (g_stat(url.c_str(), &idxstat) != 0 ||
- g_stat(it->c_str(), &cachestat) != 0)
- continue;
- if (cachestat.st_mtime < idxstat.st_mtime)
- continue;
- MapFile mf;
- if (!mf.open(it->c_str(), cachestat.st_size))
- continue;
- if (strncmp(mf.begin(), CACHE_MAGIC, strlen(CACHE_MAGIC)) != 0)
- continue;
- memcpy(&wordoffset[0], mf.begin() + strlen(CACHE_MAGIC), wordoffset.size()*sizeof(wordoffset[0]));
- return true;
- }
- return false;
- }
- strlist_t offset_index::get_cache_variant(const std::string& url)
- {
- strlist_t res;
- res.push_back(url + ".oft");
- if (!g_file_test(g_get_user_cache_dir(), G_FILE_TEST_EXISTS) &&
- g_mkdir(g_get_user_cache_dir(), 0700) == -1)
- return res;
- std::string cache_dir = std::string(g_get_user_cache_dir()) + G_DIR_SEPARATOR_S + "sdcv";
- if (!g_file_test(cache_dir.c_str(), G_FILE_TEST_EXISTS))
- {
- if (g_mkdir(cache_dir.c_str(), 0700) == -1)
- return res;
- }
- else if (!g_file_test(cache_dir.c_str(), G_FILE_TEST_IS_DIR))
- return res;
- gchar *base = g_path_get_basename(url.c_str());
- res.push_back(cache_dir + G_DIR_SEPARATOR_S + base + ".oft");
- g_free(base);
- return res;
- }
- bool offset_index::save_cache(const std::string& url)
- {
- strlist_t vars = get_cache_variant(url);
- for (strlist_t::const_iterator it = vars.begin(); it != vars.end(); ++it)
- {
- FILE *out = fopen(it->c_str(), "wb");
- if (!out)
- continue;
- if (fwrite(CACHE_MAGIC, 1, strlen(CACHE_MAGIC), out) != strlen(CACHE_MAGIC))
- continue;
- if (fwrite(&wordoffset[0], sizeof(wordoffset[0]), wordoffset.size(), out) != wordoffset.size())
- continue;
- fclose(out);
- printf("save to cache %s\n", url.c_str());
- return true;
- }
- return false;
- }
- bool offset_index::load(const std::string& url, gulong wc, gulong fsize)
- {
- wordcount = wc;
- gulong npages = (wc - 1) / ENTR_PER_PAGE + 2;
- wordoffset.resize(npages);
- if (!load_cache(url))
- { //map file will close after finish of block
- MapFile map_file;
- if (!map_file.open(url.c_str(), fsize))
- return false;
- const gchar *idxdatabuffer = map_file.begin();
- const gchar *p1 = idxdatabuffer;
- gulong index_size;
- guint32 j = 0;
- for (guint32 i = 0; i < wc; i++)
- {
- index_size = strlen(p1) + 1 + 2 * sizeof(guint32);
- if (i % ENTR_PER_PAGE == 0)
- {
- wordoffset[j] = p1 - idxdatabuffer;
- ++j;
- }
- p1 += index_size;
- }
- wordoffset[j] = p1 - idxdatabuffer;
- if (!save_cache(url))
- fprintf(stderr, "cache update failed\n");
- }
- if (!(idxfile = fopen(url.c_str(), "rb")))
- {
- wordoffset.resize(0);
- return false;
- }
- first.assign(0, read_first_on_page_key(0));
- last.assign(wordoffset.size() - 2, read_first_on_page_key(wordoffset.size() - 2));
- middle.assign((wordoffset.size() - 2) / 2, read_first_on_page_key((wordoffset.size() - 2) / 2));
- real_last.assign(wc - 1, get_key(wc - 1));
- return true;
- }
- inline gulong offset_index::load_page(glong page_idx)
- {
- gulong nentr = ENTR_PER_PAGE;
- if (page_idx == glong(wordoffset.size() - 2))
- if ((nentr = wordcount % ENTR_PER_PAGE) == 0)
- nentr = ENTR_PER_PAGE;
- if (page_idx != page.idx)
- {
- page_data.resize(wordoffset[page_idx + 1] - wordoffset[page_idx]);
- fseek(idxfile, wordoffset[page_idx], SEEK_SET);
- fread(&page_data[0], 1, page_data.size(), idxfile);
- page.fill(&page_data[0], nentr, page_idx);
- }
- return nentr;
- }
- const gchar *offset_index::get_key(glong idx)
- {
- load_page(idx / ENTR_PER_PAGE);
- glong idx_in_page = idx % ENTR_PER_PAGE;
- wordentry_offset = page.entries[idx_in_page].off;
- wordentry_size = page.entries[idx_in_page].size;
- return page.entries[idx_in_page].keystr;
- }
- void offset_index::get_data(glong idx)
- {
- get_key(idx);
- }
- const gchar *offset_index::get_key_and_data(glong idx)
- {
- return get_key(idx);
- }
- bool offset_index::lookup(const char *str, glong &idx)
- {
- bool bFound = false;
- glong iFrom;
- glong iTo = wordoffset.size() - 2;
- gint cmpint;
- glong iThisIndex;
- if (stardict_strcmp(str, first.keystr.c_str()) < 0)
- {
- idx = 0;
- return false;
- }
- else if (stardict_strcmp(str, real_last.keystr.c_str()) > 0)
- {
- idx = INVALID_INDEX;
- return false;
- }
- else
- {
- iFrom = 0;
- iThisIndex = 0;
- while (iFrom <= iTo)
- {
- iThisIndex = (iFrom + iTo) / 2;
- cmpint = stardict_strcmp(str, get_first_on_page_key(iThisIndex));
- if (cmpint > 0)
- iFrom = iThisIndex + 1;
- else if (cmpint < 0)
- iTo = iThisIndex - 1;
- else
- {
- bFound = true;
- break;
- }
- }
- if (!bFound)
- idx = iTo; //prev
- else
- idx = iThisIndex;
- }
- if (!bFound)
- {
- gulong netr = load_page(idx);
- iFrom = 1; // Needn't search the first word anymore.
- iTo = netr - 1;
- iThisIndex = 0;
- while (iFrom <= iTo)
- {
- iThisIndex = (iFrom + iTo) / 2;
- cmpint = stardict_strcmp(str, page.entries[iThisIndex].keystr);
- if (cmpint > 0)
- iFrom = iThisIndex + 1;
- else if (cmpint < 0)
- iTo = iThisIndex - 1;
- else
- {
- bFound = true;
- break;
- }
- }
- idx *= ENTR_PER_PAGE;
- if (!bFound)
- idx += iFrom; //next
- else
- idx += iThisIndex;
- }
- else
- {
- idx *= ENTR_PER_PAGE;
- }
- return bFound;
- }
- wordlist_index::~wordlist_index()
- {
- g_free(idxdatabuf);
- }
- bool wordlist_index::load(const std::string& url, gulong wc, gulong fsize)
- {
- gzFile in = gzopen(url.c_str(), "rb");
- if (in == NULL)
- return false;
- idxdatabuf = (gchar *)g_malloc(fsize);
- gulong len = gzread(in, idxdatabuf, fsize);
- gzclose(in);
- if (len != fsize)
- return false;
- wordlist.resize(wc + 1);
- gchar *p1 = idxdatabuf;
- guint32 i;
- for (i = 0; i < wc; i++)
- {
- wordlist[i] = p1;
- p1 += strlen(p1) + 1 + 2 * sizeof(guint32);
- }
- wordlist[wc] = p1;
- return true;
- }
- const gchar *wordlist_index::get_key(glong idx)
- {
- return wordlist[idx];
- }
- void wordlist_index::get_data(glong idx)
- {
- gchar *p1 = wordlist[idx] + strlen(wordlist[idx]) + sizeof(gchar);
- wordentry_offset = g_ntohl(*reinterpret_cast<guint32 *>(p1));
- p1 += sizeof(guint32);
- wordentry_size = g_ntohl(*reinterpret_cast<guint32 *>(p1));
- }
- const gchar *wordlist_index::get_key_and_data(glong idx)
- {
- get_data(idx);
- return get_key(idx);
- }
- bool wordlist_index::lookup(const char *str, glong &idx)
- {
- bool bFound = false;
- glong iTo = wordlist.size() - 2;
- if (stardict_strcmp(str, get_key(0)) < 0)
- {
- idx = 0;
- }
- else if (stardict_strcmp(str, get_key(iTo)) > 0)
- {
- idx = INVALID_INDEX;
- }
- else
- {
- glong iThisIndex = 0;
- glong iFrom = 0;
- gint cmpint;
- while (iFrom <= iTo)
- {
- iThisIndex = (iFrom + iTo) / 2;
- cmpint = stardict_strcmp(str, get_key(iThisIndex));
- if (cmpint > 0)
- iFrom = iThisIndex + 1;
- else if (cmpint < 0)
- iTo = iThisIndex - 1;
- else
- {
- bFound = true;
- break;
- }
- }
- if (!bFound)
- idx = iFrom; //next
- else
- idx = iThisIndex;
- }
- return bFound;
- }
- //===================================================================
- bool Dict::load(const std::string& ifofilename)
- {
- gulong idxfilesize;
- if (!load_ifofile(ifofilename, idxfilesize))
- return false;
- std::string fullfilename(ifofilename);
- fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "dict.dz");
- if (g_file_test(fullfilename.c_str(), G_FILE_TEST_EXISTS))
- {
- dictdzfile.reset(new dictData);
- if (!dictdzfile->open(fullfilename, 0))
- {
- //g_print("open file %s failed!\n",fullfilename);
- return false;
- }
- }
- else
- {
- fullfilename.erase(fullfilename.length() - sizeof(".dz") + 1, sizeof(".dz") - 1);
- dictfile = fopen(fullfilename.c_str(), "rb");
- if (!dictfile)
- {
- //g_print("open file %s failed!\n",fullfilename);
- return false;
- }
- }
- fullfilename = ifofilename;
- fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "idx.gz");
- if (g_file_test(fullfilename.c_str(), G_FILE_TEST_EXISTS))
- {
- idx_file.reset(new wordlist_index);
- }
- else
- {
- fullfilename.erase(fullfilename.length() - sizeof(".gz") + 1, sizeof(".gz") - 1);
- idx_file.reset(new offset_index);
- }
- if (!idx_file->load(fullfilename, wordcount, idxfilesize))
- return false;
- //g_print("bookname: %s , wordcount %lu\n", bookname.c_str(), narticles());
- return true;
- }
- bool Dict::load_ifofile(const std::string& ifofilename, gulong &idxfilesize)
- {
- DictInfo dict_info;
- if (!dict_info.load_from_ifo_file(ifofilename, false))
- return false;
- if (dict_info.wordcount == 0)
- return false;
- ifo_file_name = dict_info.ifo_file_name;
- wordcount = dict_info.wordcount;
- bookname = dict_info.bookname;
- idxfilesize = dict_info.index_file_size;
- sametypesequence = dict_info.sametypesequence;
- return true;
- }
- bool Dict::LookupWithRule(GPatternSpec *pspec, glong *aIndex, int iBuffLen)
- {
- int iIndexCount = 0;
- for (guint32 i = 0; i < narticles() && iIndexCount < iBuffLen - 1; i++)
- if (g_pattern_match_string(pspec, get_key(i)))
- aIndex[iIndexCount++] = i;
- aIndex[iIndexCount] = -1; // -1 is the end.
- return (iIndexCount > 0);
- }
- //===================================================================
- Libs::Libs(progress_func_t f)
- {
- progress_func = f;
- iMaxFuzzyDistance = MAX_FUZZY_DISTANCE; //need to read from cfg.
- }
- Libs::~Libs()
- {
- for (std::vector<Dict *>::iterator p = oLib.begin(); p != oLib.end(); ++p)
- delete *p;
- }
- void Libs::load_dict(const std::string& url)
- {
- Dict *lib = new Dict;
- if (lib->load(url))
- oLib.push_back(lib);
- else
- delete lib;
- }
- class DictLoader
- {
- public:
- DictLoader(Libs& lib_): lib(lib_)
- {}
- void operator()(const std::string& url, bool disable)
- {
- if (!disable)
- lib.load_dict(url);
- }
- private:
- Libs& lib;
- };
- void Libs::load(const strlist_t& dicts_dirs,
- const strlist_t& order_list,
- const strlist_t& disable_list)
- {
- for_each_file(dicts_dirs, ".ifo", order_list, disable_list,
- DictLoader(*this));
- }
- class DictReLoader
- {
- public:
- DictReLoader(std::vector<Dict *> &p, std::vector<Dict *> &f,
- Libs& lib_) : prev(p), future(f), lib(lib_)
- {}
- void operator()(const std::string& url, bool disable)
- {
- if (!disable)
- {
- Dict *dict = find(url);
- if (dict)
- future.push_back(dict);
- else
- lib.load_dict(url);
- }
- }
- private:
- std::vector<Dict *> &prev;
- std::vector<Dict *> &future;
- Libs& lib;
- Dict *find(const std::string& url)
- {
- std::vector<Dict *>::iterator it;
- for (it = prev.begin(); it != prev.end(); ++it)
- if ((*it)->ifofilename() == url)
- break;
- if (it != prev.end())
- {
- Dict *res = *it;
- prev.erase(it);
- return res;
- }
- return NULL;
- }
- };
- void Libs::reload(const strlist_t& dicts_dirs,
- const strlist_t& order_list,
- const strlist_t& disable_list)
- {
- std::vector<Dict *> prev(oLib);
- oLib.clear();
- for_each_file(dicts_dirs, ".ifo", order_list, disable_list,
- DictReLoader(prev, oLib, *this));
- for (std::vector<Dict *>::iterator it = prev.begin(); it != prev.end(); ++it)
- delete *it;
- }
- const gchar *Libs::poGetCurrentWord(glong * iCurrent)
- {
- const gchar *poCurrentWord = NULL;
- const gchar *word;
- for (std::vector<Dict *>::size_type iLib = 0; iLib<oLib.size(); iLib++)
- {
- if (iCurrent[iLib] == INVALID_INDEX)
- continue;
- if ( iCurrent[iLib] >= narticles(iLib) || iCurrent[iLib] < 0)
- continue;
- if ( poCurrentWord == NULL )
- {
- poCurrentWord = poGetWord(iCurrent[iLib], iLib);
- }
- else
- {
- word = poGetWord(iCurrent[iLib], iLib);
- if (stardict_strcmp(poCurrentWord, word) > 0 )
- poCurrentWord = word;
- }
- }
- return poCurrentWord;
- }
- const gchar *
- Libs::poGetNextWord(const gchar *sWord, glong *iCurrent)
- {
- // the input can be:
- // (word,iCurrent),read word,write iNext to iCurrent,and return next word. used by TopWin::NextCallback();
- // (NULL,iCurrent),read iCurrent,write iNext to iCurrent,and return next word. used by AppCore::ListWords();
- const gchar *poCurrentWord = NULL;
- std::vector<Dict *>::size_type iCurrentLib = 0;
- const gchar *word;
- for (std::vector<Dict *>::size_type iLib = 0;iLib<oLib.size();iLib++)
- {
- if (sWord)
- oLib[iLib]->Lookup(sWord, iCurrent[iLib]);
- if (iCurrent[iLib] == INVALID_INDEX)
- continue;
- if (iCurrent[iLib] >= narticles(iLib) || iCurrent[iLib] < 0)
- continue;
- if (poCurrentWord == NULL )
- {
- poCurrentWord = poGetWord(iCurrent[iLib], iLib);
- iCurrentLib = iLib;
- }
- else
- {
- word = poGetWord(iCurrent[iLib], iLib);
- if (stardict_strcmp(poCurrentWord, word) > 0 )
- {
- poCurrentWord = word;
- iCurrentLib = iLib;
- }
- }
- }
- if (poCurrentWord)
- {
- iCurrent[iCurrentLib]
- ++;
- for (std::vector<Dict *>::size_type iLib = 0;iLib<oLib.size();iLib++)
- {
- if (iLib == iCurrentLib)
- continue;
- if (iCurrent[iLib] == INVALID_INDEX)
- continue;
- if ( iCurrent[iLib] >= narticles(iLib) || iCurrent[iLib] < 0)
- continue;
- if (strcmp(poCurrentWord, poGetWord(iCurrent[iLib], iLib)) == 0 )
- iCurrent[iLib]++;
- }
- poCurrentWord = poGetCurrentWord(iCurrent);
- }
- return poCurrentWord;
- }
- const gchar *
- Libs::poGetPreWord(glong * iCurrent)
- {
- // used by TopWin::PreviousCallback(); the iCurrent is cached by AppCore::TopWinWordChange();
- const gchar *poCurrentWord = NULL;
- std::vector<Dict *>::size_type iCurrentLib = 0;
- const gchar *word;
- for (std::vector<Dict *>::size_type iLib = 0;iLib<oLib.size();iLib++)
- {
- if (iCurrent[iLib] == INVALID_INDEX)
- iCurrent[iLib] = narticles(iLib);
- else
- {
- if ( iCurrent[iLib] > narticles(iLib) || iCurrent[iLib] <= 0)
- continue;
- }
- if ( poCurrentWord == NULL )
- {
- poCurrentWord = poGetWord(iCurrent[iLib] - 1, iLib);
- iCurrentLib = iLib;
- }
- else
- {
- word = poGetWord(iCurrent[iLib] - 1, iLib);
- if (stardict_strcmp(poCurrentWord, word) < 0 )
- {
- poCurrentWord = word;
- iCurrentLib = iLib;
- }
- }
- }
- if (poCurrentWord)
- {
- iCurrent[iCurrentLib]
- --;
- for (std::vector<Dict *>::size_type iLib = 0;iLib<oLib.size();iLib++)
- {
- if (iLib == iCurrentLib)
- continue;
- if (iCurrent[iLib] > narticles(iLib) || iCurrent[iLib] <= 0)
- continue;
- if (strcmp(poCurrentWord, poGetWord(iCurrent[iLib] - 1, iLib)) == 0)
- {
- iCurrent[iLib]--;
- }
- else
- {
- if (iCurrent[iLib] == narticles(iLib))
- iCurrent[iLib] = INVALID_INDEX;
- }
- }
- }
- return poCurrentWord;
- }
- bool Libs::LookupSimilarWord(const gchar* sWord, glong & iWordIndex, int iLib)
- {
- glong iIndex;
- bool bFound = false;
- gchar *casestr;
- if (!bFound)
- {
- // to lower case.
- casestr = g_utf8_strdown(sWord, -1);
- if (strcmp(casestr, sWord))
- {
- if (oLib[iLib]->Lookup(casestr, iIndex))
- bFound = true;
- }
- g_free(casestr);
- // to upper case.
- if (!bFound)
- {
- casestr = g_utf8_strup(sWord, -1);
- if (strcmp(casestr, sWord))
- {
- if (oLib[iLib]->Lookup(casestr, iIndex))
- bFound = true;
- }
- g_free(casestr);
- }
- // Upper the first character and lower others.
- if (!bFound)
- {
- gchar *nextchar = g_utf8_next_char(sWord);
- gchar *firstchar = g_utf8_strup(sWord, nextchar - sWord);
- nextchar = g_utf8_strdown(nextchar, -1);
- casestr = g_strdup_printf("%s%s", firstchar, nextchar);
- g_free(firstchar);
- g_free(nextchar);
- if (strcmp(casestr, sWord))
- {
- if (oLib[iLib]->Lookup(casestr, iIndex))
- bFound = true;
- }
- g_free(casestr);
- }
- }
- if (bIsPureEnglish(sWord))
- {
- // If not Found , try other status of sWord.
- int iWordLen = strlen(sWord);
- bool isupcase;
- gchar *sNewWord = (gchar *)g_malloc(iWordLen + 1);
- //cut one char "s" or "d"
- if (!bFound && iWordLen > 1)
- {
- isupcase = sWord[iWordLen - 1] == 'S' || !strncmp(&sWord[iWordLen - 2], "ED", 2);
- if (isupcase || sWord[iWordLen - 1] == 's' || !strncmp(&sWord[iWordLen - 2], "ed", 2))
- {
- strcpy(sNewWord, sWord);
- sNewWord[iWordLen - 1] = '\0'; // cut "s" or "d"
- if (oLib[iLib]->Lookup(sNewWord, iIndex))
- bFound = true;
- else if (isupcase || g_ascii_isupper(sWord[0]))
- {
- casestr = g_ascii_strdown(sNewWord, -1);
- if (strcmp(casestr, sNewWord))
- {
- if (oLib[iLib]->Lookup(casestr, iIndex))
- bFound = true;
- }
- g_free(casestr);
- }
- }
- }
- //cut "ly"
- if (!bFound && iWordLen > 2)
- {
- isupcase = !strncmp(&sWord[iWordLen - 2], "LY", 2);
- if (isupcase || (!strncmp(&sWord[iWordLen - 2], "ly", 2)))
- {
- strcpy(sNewWord, sWord);
- sNewWord[iWordLen - 2] = '\0'; // cut "ly"
- if (iWordLen > 5 && sNewWord[iWordLen - 3] == sNewWord[iWordLen - 4]
- && !bIsVowel(sNewWord[iWordLen - 4]) &&
- bIsVowel(sNewWord[iWordLen - 5]))
- { //doubled
- sNewWord[iWordLen - 3] = '\0';
- if ( oLib[iLib]->Lookup(sNewWord, iIndex) )
- bFound = true;
- else
- {
- if (isupcase || g_ascii_isupper(sWord[0]))
- {
- casestr = g_ascii_strdown(sNewWord, -1);
- if (strcmp(casestr, sNewWord))
- {
- if (oLib[iLib]->Lookup(casestr, iIndex))
- bFound = true;
- }
- g_free(casestr);
- }
- if (!bFound)
- sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4]; //restore
- }
- }
- if (!bFound)
- {
- if (oLib[iLib]->Lookup(sNewWord, iIndex))
- bFound = true;
- else if (isupcase || g_ascii_isupper(sWord[0]))
- {
- casestr = g_ascii_strdown(sNewWord, -1);
- if (strcmp(casestr, sNewWord))
- {
- if (oLib[iLib]->Lookup(casestr, iIndex))
- bFound = true;
- }
- g_free(casestr);
- }
- }
- }
- }
- //cut "ing"
- if (!bFound && iWordLen > 3)
- {
- isupcase = !strncmp(&sWord[iWordLen - 3], "ING", 3);
- if (isupcase || !strncmp(&sWord[iWordLen - 3], "ing", 3) )
- {
- strcpy(sNewWord, sWord);
- sNewWord[iWordLen - 3] = '\0';
- if ( iWordLen > 6 && (sNewWord[iWordLen - 4] == sNewWord[iWordLen - 5])
- && !bIsVowel(sNewWord[iWordLen - 5]) &&
- bIsVowel(sNewWord[iWordLen - 6]))
- { //doubled
- sNewWord[iWordLen - 4] = '\0';
- if (oLib[iLib]->Lookup(sNewWord, iIndex))
- bFound = true;
- else
- {
- if (isupcase || g_ascii_isupper(sWord[0]))
- {
- casestr = g_ascii_strdown(sNewWord, -1);
- if (strcmp(casestr, sNewWord))
- {
- if (oLib[iLib]->Lookup(casestr, iIndex))
- bFound = true;
- }
- g_free(casestr);
- }
- if (!bFound)
- sNewWord[iWordLen - 4] = sNewWord[iWordLen - 5]; //restore
- }
- }
- if ( !bFound )
- {
- if (oLib[iLib]->Lookup(sNewWord, iIndex))
- bFound = true;
- else if (isupcase || g_ascii_isupper(sWord[0]))
- {
- casestr = g_ascii_strdown(sNewWord, -1);
- if (strcmp(casestr, sNewWord))
- {
- if (oLib[iLib]->Lookup(casestr, iIndex))
- bFound = true;
- }
- g_free(casestr);
- }
- }
- if (!bFound)
- {
- if (isupcase)
- strcat(sNewWord, "E"); // add a char "E"
- else
- strcat(sNewWord, "e"); // add a char "e"
- if (oLib[iLib]->Lookup(sNewWord, iIndex))
- bFound = true;
- else if (isupcase || g_ascii_isupper(sWord[0]))
- {
- casestr = g_ascii_strdown(sNewWord, -1);
- if (strcmp(casestr, sNewWord))
- {
- if (oLib[iLib]->Lookup(casestr, iIndex))
- bFound = true;
- }
- g_free(casestr);
- }
- }
- }
- }
- //cut two char "es"
- if (!bFound && iWordLen > 3)
- {
- isupcase = (!strncmp(&sWord[iWordLen - 2], "ES", 2) &&
- (sWord[iWordLen - 3] == 'S' ||
- sWord[iWordLen - 3] == 'X' ||
- sWord[iWordLen - 3] == 'O' ||
- (iWordLen > 4 && sWord[iWordLen - 3] == 'H' &&
- (sWord[iWordLen - 4] == 'C' ||
- sWord[iWordLen - 4] == 'S'))));
- if (isupcase ||
- (!strncmp(&sWord[iWordLen - 2], "es", 2) &&
- (sWord[iWordLen - 3] == 's' || sWord[iWordLen - 3] == 'x' ||
- sWord[iWordLen - 3] == 'o' ||
- (iWordLen > 4 && sWord[iWordLen - 3] == 'h' &&
- (sWord[iWordLen - 4] == 'c' || sWord[iWordLen - 4] == 's')))))
- {
- strcpy(sNewWord, sWord);
- sNewWord[iWordLen - 2] = '\0';
- if (oLib[iLib]->Lookup(sNewWord, iIndex))
- bFound = true;
- else if (isupcase || g_ascii_isupper(sWord[0]))
- {
- casestr = g_ascii_strdown(sNewWord, -1);
- if (strcmp(casestr, sNewWord))
- {
- if (oLib[iLib]->Lookup(casestr, iIndex))
- bFound = true;
- }
- g_free(casestr);
- }
- }
- }
- //cut "ed"
- if (!bFound && iWordLen > 3)
- {
- isupcase = !strncmp(&sWord[iWordLen - 2], "ED", 2);
- if (isupcase || !strncmp(&sWord[iWordLen - 2], "ed", 2))
- {
- strcpy(sNewWord, sWord);
- sNewWord[iWordLen - 2] = '\0';
- if (iWordLen > 5 && (sNewWord[iWordLen - 3] == sNewWord[iWordLen - 4])
- && !bIsVowel(sNewWord[iWordLen - 4]) &&
- bIsVowel(sNewWord[iWordLen - 5]))
- { //doubled
- sNewWord[iWordLen - 3] = '\0';
- if (oLib[iLib]->Lookup(sNewWord, iIndex))
- bFound = true;
- else
- {
- if (isupcase || g_ascii_isupper(sWord[0]))
- {
- casestr = g_ascii_strdown(sNewWord, -1);
- if (strcmp(casestr, sNewWord))
- {
- if (oLib[iLib]->Lookup(casestr, iIndex))
- bFound = true;
- }
- g_free(casestr);
- }
- if (!bFound)
- sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4]; //restore
- }
- }
- if (!bFound)
- {
- if (oLib[iLib]->Lookup(sNewWord, iIndex))
- bFound = true;
- else if (isupcase || g_ascii_isupper(sWord[0]))
- {
- casestr = g_ascii_strdown(sNewWord, -1);
- if (strcmp(casestr, sNewWord))
- {
- if (oLib[iLib]->Lookup(casestr, iIndex))
- bFound = true;
- }
- g_free(casestr);
- }
- }
- }
- }
- // cut "ied" , add "y".
- if (!bFound && iWordLen > 3)
- {
- isupcase = !strncmp(&sWord[iWordLen - 3], "IED", 3);
- if (isupcase || (!strncmp(&sWord[iWordLen - 3], "ied", 3)))
- {
- strcpy(sNewWord, sWord);
- sNewWord[iWordLen - 3] = '\0';
- if (isupcase)
- strcat(sNewWord, "Y"); // add a char "Y"
- else
- strcat(sNewWord, "y"); // add a char "y"
- if (oLib[iLib]->Lookup(sNewWord, iIndex))
- bFound = true;
- else if (isupcase || g_ascii_isupper(sWord[0]))
- {
- casestr = g_ascii_strdown(sNewWord, -1);
- if (strcmp(casestr, sNewWord))
- {
- if (oLib[iLib]->Lookup(casestr, iIndex))
- bFound = true;
- }
- g_free(casestr);
- }
- }
- }
- // cut "ies" , add "y".
- if (!bFound && iWordLen > 3)
- {
- isupcase = !strncmp(&sWord[iWordLen - 3], "IES", 3);
- if (isupcase || (!strncmp(&sWord[iWordLen - 3], "ies", 3)))
- {
- strcpy(sNewWord, sWord);
- sNewWord[iWordLen - 3] = '\0';
- if (isupcase)
- strcat(sNewWord, "Y"); // add a char "Y"
- else
- strcat(sNewWord, "y"); // add a char "y"
- if (oLib[iLib]->Lookup(sNewWord, iIndex))
- bFound = true;
- else if (isupcase || g_ascii_isupper(sWord[0]))
- {
- casestr = g_ascii_strdown(sNewWord, -1);
- if (strcmp(casestr, sNewWord))
- {
- if (oLib[iLib]->Lookup(casestr, iIndex))
- bFound = true;
- }
- g_free(casestr);
- }
- }
- }
- // cut "er".
- if (!bFound && iWordLen > 2)
- {
- isupcase = !strncmp(&sWord[iWordLen - 2], "ER", 2);
- if (isupcase || (!strncmp(&sWord[iWordLen - 2], "er", 2)))
- {
- strcpy(sNewWord, sWord);
- sNewWord[iWordLen - 2] = '\0';
- if (oLib[iLib]->Lookup(sNewWord, iIndex))
- bFound = true;
- else if (isupcase || g_ascii_isupper(sWord[0]))
- {
- casestr = g_ascii_strdown(sNewWord, -1);
- if (strcmp(casestr, sNewWord))
- {
- if (oLib[iLib]->Lookup(casestr, iIndex))
- bFound = true;
- }
- g_free(casestr);
- }
- }
- }
- // cut "est".
- if (!bFound && iWordLen > 3)
- {
- isupcase = !strncmp(&sWord[iWordLen - 3], "EST", 3);
- if (isupcase || (!strncmp(&sWord[iWordLen - 3], "est", 3)))
- {
- strcpy(sNewWord, sWord);
- sNewWord[iWordLen - 3] = '\0';
- if (oLib[iLib]->Lookup(sNewWord, iIndex))
- bFound = true;
- else if (isupcase || g_ascii_isupper(sWord[0]))
- {
- casestr = g_ascii_strdown(sNewWord, -1);
- if (strcmp(casestr, sNewWord))
- {
- if (oLib[iLib]->Lookup(casestr, iIndex))
- bFound = true;
- }
- g_free(casestr);
- }
- }
- }
- g_free(sNewWord);
- }
- if (bFound)
- iWordIndex = iIndex;
- #if 0
- else
- {
- //don't change iWordIndex here.
- //when LookupSimilarWord all failed too, we want to use the old LookupWord index to list words.
- //iWordIndex = INVALID_INDEX;
- }
- #endif
- return bFound;
- }
- bool Libs::SimpleLookupWord(const gchar* sWord, glong & iWordIndex, int iLib)
- {
- bool bFound = oLib[iLib]->Lookup(sWord, iWordIndex);
- if (!bFound)
- bFound = LookupSimilarWord(sWord, iWordIndex, iLib);
- return bFound;
- }
- struct Fuzzystruct
- {
- char * pMatchWord;
- int iMatchWordDistance;
- };
- inline bool operator<(const Fuzzystruct & lh, const Fuzzystruct & rh)
- {
- if (lh.iMatchWordDistance != rh.iMatchWordDistance)
- return lh.iMatchWordDistance < rh.iMatchWordDistance;
- if (lh.pMatchWord && rh.pMatchWord)
- return stardict_strcmp(lh.pMatchWord, rh.pMatchWord) < 0;
- return false;
- }
- static inline void unicode_strdown(gunichar *str)
- {
- while (*str)
- {
- *str = g_unichar_tolower(*str);
- ++str;
- }
- }
- bool Libs::LookupWithFuzzy(const gchar *sWord, gchar *reslist[], gint reslist_size, gint iLib)
- {
- if (sWord[0] == '\0')
- return false;
- Fuzzystruct *oFuzzystruct = new Fuzzystruct[reslist_size];
- for (int i = 0; i < reslist_size; i++)
- {
- oFuzzystruct[i].pMatchWord = NULL;
- oFuzzystruct[i].iMatchWordDistance = iMaxFuzzyDistance;
- }
- int iMaxDistance = iMaxFuzzyDistance;
- int iDistance;
- bool Found = false;
- EditDistance oEditDistance;
- glong iCheckWordLen;
- const char *sCheck;
- gunichar *ucs4_str1, *ucs4_str2;
- glong ucs4_str2_len;
- ucs4_str2 = g_utf8_to_ucs4_fast(sWord, -1, &ucs4_str2_len);
- unicode_strdown(ucs4_str2);
- // for (std::vector<Dict *>::size_type iLib = 0; iLib<oLib.size(); iLib++)
- // {
- if (progress_func)
- progress_func();
- //if (stardict_strcmp(sWord, poGetWord(0,iLib))>=0 && stardict_strcmp(sWord, poGetWord(narticles(iLib)-1,iLib))<=0) {
- //there are Chinese dicts and English dicts...
- if (TRUE)
- {
- const int iwords = narticles(iLib);
- for (int index = 0; index < iwords; index++)
- {
- sCheck = poGetWord(index, iLib);
- // tolower and skip too long or too short words
- iCheckWordLen = g_utf8_strlen(sCheck, -1);
- if (iCheckWordLen - ucs4_str2_len >= iMaxDistance ||
- ucs4_str2_len - iCheckWordLen >= iMaxDistance)
- continue;
- ucs4_str1 = g_utf8_to_ucs4_fast(sCheck, -1, NULL);
- if (iCheckWordLen > ucs4_str2_len)
- ucs4_str1[ucs4_str2_len] = 0;
- unicode_strdown(ucs4_str1);
- iDistance = oEditDistance.CalEditDistance(ucs4_str1, ucs4_str2, iMaxDistance);
- g_free(ucs4_str1);
- if (iDistance < iMaxDistance && iDistance < ucs4_str2_len)
- {
- // when ucs4_str2_len=1,2 we need less fuzzy.
- Found = true;
- bool bAlreadyInList = false;
- int iMaxDistanceAt = 0;
- for (int j = 0; j < reslist_size; j++)
- {
- if (oFuzzystruct[j].pMatchWord &&
- strcmp(oFuzzystruct[j].pMatchWord, sCheck) == 0 )
- { //already in list
- bAlreadyInList = true;
- break;
- }
- //find the position,it will certainly be found (include the first time) as iMaxDistance is set by last time.
- if (oFuzzystruct[j].iMatchWordDistance == iMaxDistance )
- {
- iMaxDistanceAt = j;
- }
- }
- if (!bAlreadyInList)
- {
- if (oFuzzystruct[iMaxDistanceAt].pMatchWord)
- g_free(oFuzzystruct[iMaxDistanceAt].pMatchWord);
- oFuzzystruct[iMaxDistanceAt].pMatchWord = g_strdup(sCheck);
- oFuzzystruct[iMaxDistanceAt].iMatchWordDistance = iDistance;
- // calc new iMaxDistance
- iMaxDistance = iDistance;
- for (int j = 0; j < reslist_size; j++)
- {
- if (oFuzzystruct[j].iMatchWordDistance > iMaxDistance)
- iMaxDistance = oFuzzystruct[j].iMatchWordDistance;
- } // calc new iMaxDistance
- } // add to list
- } // find one
- } // each word
- } // ok for search
- // } // each lib
- g_free(ucs4_str2);
- if (Found) // sort with distance
- std::sort(oFuzzystruct, oFuzzystruct + reslist_size);
- for (gint i = 0; i < reslist_size; ++i)
- reslist[i] = oFuzzystruct[i].pMatchWord;
- delete[] oFuzzystruct;
- return Found;
- }
- inline bool less_for_compare(const char *lh, const char *rh)
- {
- return stardict_strcmp(lh, rh) < 0;
- }
- gint Libs::LookupWithRule(const gchar *word, gchar **ppMatchWord)
- {
- glong aiIndex[MAX_MATCH_ITEM_PER_LIB + 1];
- gint iMatchCount = 0;
- GPatternSpec *pspec = g_pattern_spec_new(word);
- for (std::vector<Dict *>::size_type iLib = 0; iLib<oLib.size(); iLib++)
- {
- //if(oLibs.LookdupWordsWithRule(pspec,aiIndex,MAX_MATCH_ITEM_PER_LIB+1-iMatchCount,iLib))
- // -iMatchCount,so save time,but may got less result and the word may repeat.
- if (oLib[iLib]->
- LookupWithRule(pspec, aiIndex, MAX_MATCH_ITEM_PER_LIB + 1))
- {
- if (progress_func)
- progress_func();
- for (int i = 0; aiIndex[i] != -1; i++)
- {
- const gchar * sMatchWord = poGetWord(aiIndex[i], iLib);
- bool bAlreadyInList = false;
- for (int j = 0; j < iMatchCount; j++)
- {
- if (strcmp(ppMatchWord[j], sMatchWord) == 0)
- { //already in list
- bAlreadyInList = true;
- break;
- }
- }
- if (!bAlreadyInList)
- ppMatchWord[iMatchCount++] = g_strdup(sMatchWord);
- }
- }
- }
- g_pattern_spec_free(pspec);
- if (iMatchCount) // sort it.
- std::sort(ppMatchWord, ppMatchWord + iMatchCount, less_for_compare);
- return iMatchCount;
- }
- bool Libs::LookupData(const gchar *sWord, std::vector<gchar *> *reslist)
- {
- std::vector<std::string> SearchWords;
- std::string SearchWord;
- const char *p = sWord;
- while (*p)
- {
- if (*p == '\\')
- {
- p++;
- switch (*p)
- {
- case ' ':
- SearchWord += ' ';
- break;
- case '\\':
- SearchWord += '\\';
- break;
- case 't':
- SearchWord += '\t';
- break;
- case 'n':
- SearchWord += '\n';
- break;
- default:
- SearchWord += *p;
- }
- }
- else if (*p == ' ')
- {
- if (!SearchWord.empty())
- {
- SearchWords.push_back(SearchWord);
- SearchWord.clear();
- }
- }
- else
- {
- SearchWord += *p;
- }
- p++;
- }
- if (!SearchWord.empty())
- {
- SearchWords.push_back(SearchWord);
- SearchWord.clear();
- }
- if (SearchWords.empty())
- return false;
- guint32 max_size = 0;
- gchar *origin_data = NULL;
- for (std::vector<Dict *>::size_type i = 0; i<oLib.size(); ++i)
- {
- if (!oLib[i]->
- containSearchData())
- continue;
- if (progress_func)
- progress_func();
- const gulong iwords = narticles(i);
- const gchar *key;
- guint32 offset, size;
- for (gulong j = 0;
- j < iwords;
- ++j)
- {
- oLib[i]
- ->get_key_and_data(j, &key, &offset, &size);
- if (size > max_size)
- {
- origin_data = (gchar *)g_realloc(origin_data, size);
- max_size = size;
- }
- if (oLib[i]->SearchData(SearchWords, offset, size, origin_data))
- reslist[i].push_back(g_strdup(key));
- }
- }
- g_free(origin_data);
- std::vector<Dict *>::size_type i;
- for (i = 0; i<oLib.size(); ++i)
- if (!reslist[i].empty())
- break;
- return i != oLib.size();
- }
- /**************************************************/
- query_t analyze_query(const char *s, std::string& res)
- {
- if (!s || !*s)
- {
- res = "";
- return qtSIMPLE;
- }
- if (*s == '/')
- {
- res = s + 1;
- return qtFUZZY;
- }
- if (*s == '|')
- {
- res = s + 1;
- return qtDATA;
- }
- bool regexp = false;
- const char *p = s;
- res = "";
- for (; *p; res += *p, ++p)
- {
- if (*p == '\\')
- {
- ++p;
- if (!*p)
- break;
- continue;
- }
- if (*p == '*' || *p == '?')
- regexp = true;
- }
- if (regexp)
- return qtREGEXP;
- return qtSIMPLE;
- }
|