fts3_icu.c 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263
  1. /*
  2. ** 2007 June 22
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. *************************************************************************
  12. ** This file implements a tokenizer for fts3 based on the ICU library.
  13. */
  14. #include "fts3Int.h"
  15. #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
  16. #ifdef SQLITE_ENABLE_ICU
  17. #include <assert.h>
  18. #include <string.h>
  19. #include "fts3_tokenizer.h"
  20. #include <unicode/ubrk.h>
  21. #include <unicode/ucol.h>
  22. #include <unicode/ustring.h>
  23. #include <unicode/utf16.h>
  24. typedef struct IcuTokenizer IcuTokenizer;
  25. typedef struct IcuCursor IcuCursor;
  26. struct IcuTokenizer {
  27. sqlite3_tokenizer base;
  28. char *zLocale;
  29. };
  30. struct IcuCursor {
  31. sqlite3_tokenizer_cursor base;
  32. UBreakIterator *pIter; /* ICU break-iterator object */
  33. int nChar; /* Number of UChar elements in pInput */
  34. UChar *aChar; /* Copy of input using utf-16 encoding */
  35. int *aOffset; /* Offsets of each character in utf-8 input */
  36. int nBuffer;
  37. char *zBuffer;
  38. int iToken;
  39. };
  40. /*
  41. ** Create a new tokenizer instance.
  42. */
  43. static int icuCreate(
  44. int argc, /* Number of entries in argv[] */
  45. const char * const *argv, /* Tokenizer creation arguments */
  46. sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
  47. ){
  48. IcuTokenizer *p;
  49. int n = 0;
  50. if( argc>0 ){
  51. n = strlen(argv[0])+1;
  52. }
  53. p = (IcuTokenizer *)sqlite3_malloc64(sizeof(IcuTokenizer)+n);
  54. if( !p ){
  55. return SQLITE_NOMEM;
  56. }
  57. memset(p, 0, sizeof(IcuTokenizer));
  58. if( n ){
  59. p->zLocale = (char *)&p[1];
  60. memcpy(p->zLocale, argv[0], n);
  61. }
  62. *ppTokenizer = (sqlite3_tokenizer *)p;
  63. return SQLITE_OK;
  64. }
  65. /*
  66. ** Destroy a tokenizer
  67. */
  68. static int icuDestroy(sqlite3_tokenizer *pTokenizer){
  69. IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
  70. sqlite3_free(p);
  71. return SQLITE_OK;
  72. }
  73. /*
  74. ** Prepare to begin tokenizing a particular string. The input
  75. ** string to be tokenized is pInput[0..nBytes-1]. A cursor
  76. ** used to incrementally tokenize this string is returned in
  77. ** *ppCursor.
  78. */
  79. static int icuOpen(
  80. sqlite3_tokenizer *pTokenizer, /* The tokenizer */
  81. const char *zInput, /* Input string */
  82. int nInput, /* Length of zInput in bytes */
  83. sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
  84. ){
  85. IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
  86. IcuCursor *pCsr;
  87. const int32_t opt = U_FOLD_CASE_DEFAULT;
  88. UErrorCode status = U_ZERO_ERROR;
  89. int nChar;
  90. UChar32 c;
  91. int iInput = 0;
  92. int iOut = 0;
  93. *ppCursor = 0;
  94. if( zInput==0 ){
  95. nInput = 0;
  96. zInput = "";
  97. }else if( nInput<0 ){
  98. nInput = strlen(zInput);
  99. }
  100. nChar = nInput+1;
  101. pCsr = (IcuCursor *)sqlite3_malloc64(
  102. sizeof(IcuCursor) + /* IcuCursor */
  103. ((nChar+3)&~3) * sizeof(UChar) + /* IcuCursor.aChar[] */
  104. (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */
  105. );
  106. if( !pCsr ){
  107. return SQLITE_NOMEM;
  108. }
  109. memset(pCsr, 0, sizeof(IcuCursor));
  110. pCsr->aChar = (UChar *)&pCsr[1];
  111. pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3];
  112. pCsr->aOffset[iOut] = iInput;
  113. U8_NEXT(zInput, iInput, nInput, c);
  114. while( c>0 ){
  115. int isError = 0;
  116. c = u_foldCase(c, opt);
  117. U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
  118. if( isError ){
  119. sqlite3_free(pCsr);
  120. return SQLITE_ERROR;
  121. }
  122. pCsr->aOffset[iOut] = iInput;
  123. if( iInput<nInput ){
  124. U8_NEXT(zInput, iInput, nInput, c);
  125. }else{
  126. c = 0;
  127. }
  128. }
  129. pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
  130. if( !U_SUCCESS(status) ){
  131. sqlite3_free(pCsr);
  132. return SQLITE_ERROR;
  133. }
  134. pCsr->nChar = iOut;
  135. ubrk_first(pCsr->pIter);
  136. *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
  137. return SQLITE_OK;
  138. }
  139. /*
  140. ** Close a tokenization cursor previously opened by a call to icuOpen().
  141. */
  142. static int icuClose(sqlite3_tokenizer_cursor *pCursor){
  143. IcuCursor *pCsr = (IcuCursor *)pCursor;
  144. ubrk_close(pCsr->pIter);
  145. sqlite3_free(pCsr->zBuffer);
  146. sqlite3_free(pCsr);
  147. return SQLITE_OK;
  148. }
  149. /*
  150. ** Extract the next token from a tokenization cursor.
  151. */
  152. static int icuNext(
  153. sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
  154. const char **ppToken, /* OUT: *ppToken is the token text */
  155. int *pnBytes, /* OUT: Number of bytes in token */
  156. int *piStartOffset, /* OUT: Starting offset of token */
  157. int *piEndOffset, /* OUT: Ending offset of token */
  158. int *piPosition /* OUT: Position integer of token */
  159. ){
  160. IcuCursor *pCsr = (IcuCursor *)pCursor;
  161. int iStart = 0;
  162. int iEnd = 0;
  163. int nByte = 0;
  164. while( iStart==iEnd ){
  165. UChar32 c;
  166. iStart = ubrk_current(pCsr->pIter);
  167. iEnd = ubrk_next(pCsr->pIter);
  168. if( iEnd==UBRK_DONE ){
  169. return SQLITE_DONE;
  170. }
  171. while( iStart<iEnd ){
  172. int iWhite = iStart;
  173. U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
  174. if( u_isspace(c) ){
  175. iStart = iWhite;
  176. }else{
  177. break;
  178. }
  179. }
  180. assert(iStart<=iEnd);
  181. }
  182. do {
  183. UErrorCode status = U_ZERO_ERROR;
  184. if( nByte ){
  185. char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
  186. if( !zNew ){
  187. return SQLITE_NOMEM;
  188. }
  189. pCsr->zBuffer = zNew;
  190. pCsr->nBuffer = nByte;
  191. }
  192. u_strToUTF8(
  193. pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */
  194. &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */
  195. &status /* Output success/failure */
  196. );
  197. } while( nByte>pCsr->nBuffer );
  198. *ppToken = pCsr->zBuffer;
  199. *pnBytes = nByte;
  200. *piStartOffset = pCsr->aOffset[iStart];
  201. *piEndOffset = pCsr->aOffset[iEnd];
  202. *piPosition = pCsr->iToken++;
  203. return SQLITE_OK;
  204. }
  205. /*
  206. ** The set of routines that implement the simple tokenizer
  207. */
  208. static const sqlite3_tokenizer_module icuTokenizerModule = {
  209. 0, /* iVersion */
  210. icuCreate, /* xCreate */
  211. icuDestroy, /* xCreate */
  212. icuOpen, /* xOpen */
  213. icuClose, /* xClose */
  214. icuNext, /* xNext */
  215. 0, /* xLanguageid */
  216. };
  217. /*
  218. ** Set *ppModule to point at the implementation of the ICU tokenizer.
  219. */
  220. void sqlite3Fts3IcuTokenizerModule(
  221. sqlite3_tokenizer_module const**ppModule
  222. ){
  223. *ppModule = &icuTokenizerModule;
  224. }
  225. #endif /* defined(SQLITE_ENABLE_ICU) */
  226. #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */