hash.c 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  1. /*
  2. * Copyright (c) 2009 Openmoko Inc.
  3. *
  4. * This program is free software: you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License as published by
  6. * the Free Software Foundation, either version 3 of the License, or
  7. * (at your option) any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. */
  17. #include <stdlib.h>
  18. #include <stdio.h>
  19. #include <stdbool.h>
  20. #include <string.h>
  21. #include <ctype.h>
  22. #include <errno.h>
  23. //#include "bmf.h"
  24. //#include "wiki_render.h"
  25. //#include "lcd_buf_draw.h"
  26. //#include "Alloc.h"
  27. //#include "Bra.h"
  28. //#include "LzmaEnc.h"
  29. #include "bigram.h"
  30. #include "search_hash.h"
  31. // pedia.idx format:
  32. // The first 4 bytes contain the article count.
  33. // Each article got a ARTICLE_PTR structure entry in pedia.idx.
  34. // The first ARTICLE_PTR entry is for article idx 1.
  35. //
  36. // pedia.pfx format:
  37. // first three character indexing table - 54 * 54 * 54 entries * 4 bytes (int32_t int - file offset of pedia.fnd)
  38. // 54 characters - null + 0~9 + a~z + ...
  39. //
  40. // pedia.fnd format:
  41. // bigram table - 128 entries * 2 bytes
  42. // All titles for search are sequentially concatnated into pedia.fnd in search order.
  43. // Each entry consists of (see TITLE_SEARCH_REMAINDER):
  44. // idx of article (pointing to pedia.idx)
  45. // variable length and null terminated remainder (starting from the 3rd character)
  46. #define SIZE_FIRST_THREE_CHAR_INDEXING (SEARCH_CHR_COUNT * SEARCH_CHR_COUNT * SEARCH_CHR_COUNT * sizeof(uint32_t))
  47. #define MAX_TITLE_SEARCH 64
  48. #define MAX_TITLE_LEN 512
  49. typedef struct _TITLE_SEARCH { // used to mask the porinter to the remainder of the title for search
  50. int32_t idxArticle;
  51. char cZero; // null character for backward search
  52. char sTitleSearch[MAX_TITLE_SEARCH]; // null terminated title for search (with bigram encoded)
  53. } TITLE_SEARCH;
  54. static int process_hash_sequential_search(char *sLocalTitleSearch, int32_t offsetBufFnd,
  55. int lenHashSequentialSearch, char *sHashSequentialSearch, int32_t *countHashSequentialSearchForNextChar)
  56. {
  57. int lenTitleSearch = strlen(sLocalTitleSearch);
  58. int lenSame;
  59. int i;
  60. int bHashAdded = 0;
  61. for (lenSame = 0; lenSame < lenTitleSearch && lenSame < lenHashSequentialSearch; lenSame++)
  62. {
  63. if (tolower(sLocalTitleSearch[lenSame]) != sHashSequentialSearch[lenSame])
  64. break;
  65. countHashSequentialSearchForNextChar[lenSame]++;
  66. }
  67. for (i = lenSame; i < lenHashSequentialSearch; i++)
  68. {
  69. if (!bHashAdded && i >= MAX_SEARCH_STRING_ALL_HASHED_LEN && i < lenTitleSearch &&
  70. countHashSequentialSearchForNextChar[i] >= SEARCH_HASH_SEQUENTIAL_SEARCH_THRESHOLD)
  71. {
  72. add_search_hash(sLocalTitleSearch, i + 1, offsetBufFnd);
  73. bHashAdded = 1;
  74. }
  75. if (lenSame == MAX_SEARCH_STRING_ALL_HASHED_LEN)
  76. countHashSequentialSearchForNextChar[i]++;
  77. else
  78. countHashSequentialSearchForNextChar[i] = 0;
  79. }
  80. if (lenTitleSearch > MAX_SEARCH_STRING_HASHED_LEN)
  81. lenHashSequentialSearch = MAX_SEARCH_STRING_HASHED_LEN;
  82. else
  83. lenHashSequentialSearch = lenTitleSearch;
  84. for (i = lenSame; i < lenHashSequentialSearch; i++)
  85. {
  86. sHashSequentialSearch[i] = tolower(sLocalTitleSearch[i]);
  87. }
  88. return lenHashSequentialSearch;
  89. }
  90. static int32_t build_hash_tree(char *sTitleSearch, int32_t offsetBufFnd, char *bufFnd, int32_t lenBufFnd)
  91. {
  92. int i;
  93. int lenTitleSearch;
  94. TITLE_SEARCH *pTitleSearch = (TITLE_SEARCH *)&bufFnd[offsetBufFnd];
  95. int rc = 0;
  96. char *pSupportedChars = SUPPORTED_SEARCH_CHARS;
  97. char c;
  98. char sLocalTitleSearch[MAX_TITLE_LEN];
  99. int lenHashSequentialSearch = 0;
  100. char sHashSequentialSearch[MAX_SEARCH_STRING_HASHED_LEN];
  101. int32_t countHashSequentialSearchForNextChar[MAX_SEARCH_STRING_HASHED_LEN];
  102. //showMsg(3, "build_hash_tree [%s] %x\n", sTitleSearch, offsetBufFnd);
  103. memset(countHashSequentialSearchForNextChar, 0, sizeof(countHashSequentialSearchForNextChar));
  104. lenTitleSearch = strlen(sTitleSearch);
  105. if (lenTitleSearch < MAX_SEARCH_STRING_ALL_HASHED_LEN)
  106. {
  107. for (i = 0; i < strlen(pSupportedChars); i++)
  108. {
  109. c = pSupportedChars[i];
  110. if (c != ' ' || sTitleSearch[lenTitleSearch -1] != ' ') // no two continuous blanks
  111. {
  112. sTitleSearch[lenTitleSearch] = c;
  113. sTitleSearch[lenTitleSearch + 1] = '\0';
  114. bigram_decode(sLocalTitleSearch, pTitleSearch->sTitleSearch, MAX_TITLE_LEN);
  115. while (offsetBufFnd < lenBufFnd &&
  116. (rc = search_string_cmp(sLocalTitleSearch, sTitleSearch, strlen(sTitleSearch))) < 0)
  117. {
  118. lenHashSequentialSearch = process_hash_sequential_search(sLocalTitleSearch, offsetBufFnd,
  119. lenHashSequentialSearch, sHashSequentialSearch, countHashSequentialSearchForNextChar);
  120. offsetBufFnd += sizeof(pTitleSearch->idxArticle) + strlen(pTitleSearch->sTitleSearch) + 2;
  121. pTitleSearch = (TITLE_SEARCH *)&bufFnd[offsetBufFnd];
  122. bigram_decode(sLocalTitleSearch, pTitleSearch->sTitleSearch, MAX_TITLE_LEN);
  123. }
  124. if (offsetBufFnd < lenBufFnd && !rc)
  125. {
  126. add_search_hash(sTitleSearch, strlen(sTitleSearch), offsetBufFnd);
  127. lenHashSequentialSearch = 0;
  128. memset(countHashSequentialSearchForNextChar, 0, sizeof(countHashSequentialSearchForNextChar));
  129. //offsetBufFnd += sizeof(pTitleSearch->idxArticle) + strlen(pTitleSearch->sTitleSearch) + 2;
  130. if (offsetBufFnd < lenBufFnd)
  131. {
  132. offsetBufFnd = build_hash_tree(sTitleSearch, offsetBufFnd, bufFnd, lenBufFnd);
  133. pTitleSearch = (TITLE_SEARCH *)&bufFnd[offsetBufFnd];
  134. }
  135. }
  136. }
  137. }
  138. }
  139. return offsetBufFnd;
  140. }
  141. void generate_pedia_hsh(const char *fnd_name, const char *pfx_name, const char *hsh_name)
  142. {
  143. FILE *fdPfx, *fdFnd;
  144. char sTitleSearch[MAX_TITLE_LEN];
  145. int32_t *firstThreeCharIndexing;
  146. int idxFirstThreeCharIndexing;
  147. char *bufFnd;
  148. int32_t lenBufFnd;
  149. char *pSupportedChars = SUPPORTED_SEARCH_CHARS;
  150. char c1, c2, c3;
  151. int i, j, k;
  152. int32_t offsetBufFnd = 0;
  153. fdPfx = fopen(pfx_name, "rb");
  154. if (!fdPfx)
  155. {
  156. fprintf(stderr, "cannot open file '%s', error: %s\n", pfx_name, strerror(errno));
  157. exit(1);
  158. }
  159. fdFnd = fopen(fnd_name, "rb");
  160. if (!fdFnd)
  161. {
  162. fprintf(stderr, "cannot open file '%s', error: %s\n", fnd_name, strerror(errno));
  163. exit(1);
  164. }
  165. init_bigram(fdFnd);
  166. create_search_hash(hsh_name);
  167. firstThreeCharIndexing = (int32_t *)malloc(SIZE_FIRST_THREE_CHAR_INDEXING);
  168. if (!firstThreeCharIndexing)
  169. {
  170. fprintf(stderr, "malloc firstThreeCharIndexing error\n");
  171. exit(1);
  172. }
  173. if (SIZE_FIRST_THREE_CHAR_INDEXING != fread((void*)firstThreeCharIndexing, 1, SIZE_FIRST_THREE_CHAR_INDEXING, fdPfx))
  174. {
  175. fprintf(stderr, "fread failed\n");
  176. exit(1);
  177. }
  178. fseek(fdFnd, 0, SEEK_END);
  179. lenBufFnd = ftell(fdFnd);
  180. fseek(fdFnd, 0, SEEK_SET);
  181. bufFnd = malloc(lenBufFnd);
  182. if (!bufFnd)
  183. {
  184. fprintf(stderr, "malloc bufFnd error\n");
  185. exit(1);
  186. }
  187. lenBufFnd = fread(bufFnd, 1, lenBufFnd, fdFnd);
  188. for (i = 0; i < strlen(pSupportedChars); i++)
  189. {
  190. c1 = pSupportedChars[i];
  191. if (c1 != ' ') // no initial blank
  192. {
  193. for (j = 0; j < strlen(pSupportedChars); j++)
  194. {
  195. c2 = pSupportedChars[j];
  196. for (k = 0; k < strlen(pSupportedChars); k++)
  197. {
  198. c3 = pSupportedChars[k];
  199. idxFirstThreeCharIndexing =
  200. bigram_char_idx(c1) * SEARCH_CHR_COUNT * SEARCH_CHR_COUNT +
  201. bigram_char_idx(c2) * SEARCH_CHR_COUNT + bigram_char_idx(c3);
  202. if (firstThreeCharIndexing[idxFirstThreeCharIndexing])
  203. {
  204. sTitleSearch[0] = c1;
  205. sTitleSearch[1] = c2;
  206. sTitleSearch[2] = c3;
  207. sTitleSearch[3] = '\0';
  208. if (!offsetBufFnd)
  209. offsetBufFnd = firstThreeCharIndexing[idxFirstThreeCharIndexing];
  210. offsetBufFnd = build_hash_tree(sTitleSearch, offsetBufFnd, bufFnd, lenBufFnd);
  211. }
  212. }
  213. }
  214. }
  215. }
  216. save_search_hash();
  217. free(firstThreeCharIndexing);
  218. free(bufFnd);
  219. fclose(fdPfx);
  220. fclose(fdFnd);
  221. }