fts3_unicode.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398
  1. /*
  2. ** 2012 May 24
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. ******************************************************************************
  12. **
  13. ** Implementation of the "unicode" full-text-search tokenizer.
  14. */
  15. #ifndef SQLITE_DISABLE_FTS3_UNICODE
  16. #include "fts3Int.h"
  17. #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
  18. #include <assert.h>
  19. #include <stdlib.h>
  20. #include <stdio.h>
  21. #include <string.h>
  22. #include "fts3_tokenizer.h"
  23. /*
  24. ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
  25. ** from the sqlite3 source file utf.c. If this file is compiled as part
  26. ** of the amalgamation, they are not required.
  27. */
  28. #ifndef SQLITE_AMALGAMATION
  /*
  ** Lookup table used by READ_UTF8 to translate the lead byte of a
  ** multi-byte UTF-8 sequence. It is indexed by (lead byte - 0xc0) and
  ** holds the payload bits carried by that lead byte:
  **
  **   0xc0..0xdf  ->  low 5 bits
  **   0xe0..0xef  ->  low 4 bits
  **   0xf0..0xf7  ->  low 3 bits
  **   0xf8..0xfb  ->  low 2 bits
  **   0xfc..0xfd  ->  low 1 bit
  **   0xfe..0xff  ->  0
  */
  static const unsigned char sqlite3Utf8Trans1[] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
  };

  /*
  ** READ_UTF8(zIn, zTerm, c)
  **
  ** Decode one UTF-8 character starting at zIn, store its codepoint in c
  ** and leave zIn pointing just past the bytes consumed.
  **
  **   zIn    Modifiable pointer to unsigned bytes. At least one byte must
  **          be readable; the macro always consumes *zIn.
  **   zTerm  One past the last readable byte. Continuation bytes are never
  **          read at or beyond this address.
  **   c      Modifiable unsigned integer lvalue that receives the result.
  **
  ** A lead byte below 0xc0 is returned unchanged. Otherwise continuation
  ** bytes (10xxxxxx) are folded in until a non-continuation byte or zTerm
  ** is reached. A decoded value below 0x80, one in the surrogate range
  ** 0xD800..0xDFFF, or one whose value with the low bit cleared is 0xFFFE,
  ** is replaced by U+FFFD.
  **
  ** The expansion is wrapped in do{...}while(0) so that it behaves as a
  ** single statement (safe under an unbraced if/else), and every argument
  ** is parenthesized. The arguments are evaluated more than once, so they
  ** must be free of side effects.
  */
  #define READ_UTF8(zIn, zTerm, c)                              \
    do{                                                         \
      (c) = *((zIn)++);                                         \
      if( (c)>=0xc0 ){                                          \
        (c) = sqlite3Utf8Trans1[(c)-0xc0];                      \
        while( (zIn)!=(zTerm) && (*(zIn) & 0xc0)==0x80 ){       \
          (c) = ((c)<<6) + (0x3f & *((zIn)++));                 \
        }                                                       \
        if( (c)<0x80                                            \
         || ((c)&0xFFFFF800)==0xD800                            \
         || ((c)&0xFFFFFFFE)==0xFFFE ){ (c) = 0xFFFD; }         \
      }                                                         \
    }while( 0 )
  /*
  ** WRITE_UTF8(zOut, c)
  **
  ** Append the UTF-8 encoding of codepoint c at zOut and advance zOut past
  ** the bytes written.
  **
  **   zOut   Modifiable byte pointer with room for up to 4 more bytes.
  **   c      Codepoint to encode.
  **
  ** Bytes written, by value of c:
  **
  **   c <  0x80     1 byte
  **   c <  0x800    2 bytes
  **   c <  0x10000  3 bytes
  **   otherwise     4 bytes
  **
  ** No validation of c is performed. The u8 type is not defined in this
  ** block; it is expected to come from the project headers.
  **
  ** The expansion is wrapped in do{...}while(0) so that
  ** "if( x ) WRITE_UTF8(z, c); else ..." parses correctly, and every
  ** argument is parenthesized. Both arguments are evaluated more than
  ** once, so they must be free of side effects.
  */
  #define WRITE_UTF8(zOut, c)                                   \
    do{                                                         \
      if( (c)<0x00080 ){                                        \
        *(zOut)++ = (u8)((c)&0xFF);                             \
      }else if( (c)<0x00800 ){                                  \
        *(zOut)++ = 0xC0 + (u8)(((c)>>6)&0x1F);                 \
        *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                    \
      }else if( (c)<0x10000 ){                                  \
        *(zOut)++ = 0xE0 + (u8)(((c)>>12)&0x0F);                \
        *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);               \
        *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                    \
      }else{                                                    \
        *(zOut)++ = 0xF0 + (u8)(((c)>>18) & 0x07);              \
        *(zOut)++ = 0x80 + (u8)(((c)>>12) & 0x3F);              \
        *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);               \
        *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                    \
      }                                                         \
    }while( 0 )
  69. #endif /* ifndef SQLITE_AMALGAMATION */
  70. typedef struct unicode_tokenizer unicode_tokenizer;
  71. typedef struct unicode_cursor unicode_cursor;
  72. struct unicode_tokenizer {
  73. sqlite3_tokenizer base;
  74. int eRemoveDiacritic;
  75. int nException;
  76. int *aiException;
  77. };
  78. struct unicode_cursor {
  79. sqlite3_tokenizer_cursor base;
  80. const unsigned char *aInput; /* Input text being tokenized */
  81. int nInput; /* Size of aInput[] in bytes */
  82. int iOff; /* Current offset within aInput[] */
  83. int iToken; /* Index of next token to be returned */
  84. char *zToken; /* storage for current token */
  85. int nAlloc; /* space allocated at zToken */
  86. };
  87. /*
  88. ** Destroy a tokenizer allocated by unicodeCreate().
  89. */
  90. static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){
  91. if( pTokenizer ){
  92. unicode_tokenizer *p = (unicode_tokenizer *)pTokenizer;
  93. sqlite3_free(p->aiException);
  94. sqlite3_free(p);
  95. }
  96. return SQLITE_OK;
  97. }
  98. /*
  99. ** As part of a tokenchars= or separators= option, the CREATE VIRTUAL TABLE
  100. ** statement has specified that the tokenizer for this table shall consider
  101. ** all characters in string zIn/nIn to be separators (if bAlnum==0) or
  102. ** token characters (if bAlnum==1).
  103. **
  104. ** For each codepoint in the zIn/nIn string, this function checks if the
  105. ** sqlite3FtsUnicodeIsalnum() function already returns the desired result.
  106. ** If so, no action is taken. Otherwise, the codepoint is added to the
  107. ** unicode_tokenizer.aiException[] array. For the purposes of tokenization,
  108. ** the return value of sqlite3FtsUnicodeIsalnum() is inverted for all
  109. ** codepoints in the aiException[] array.
  110. **
  111. ** If a standalone diacritic mark (one that sqlite3FtsUnicodeIsdiacritic()
  112. ** identifies as a diacritic) occurs in the zIn/nIn string it is ignored.
  113. ** It is not possible to change the behavior of the tokenizer with respect
  114. ** to these codepoints.
  115. */
  116. static int unicodeAddExceptions(
  117. unicode_tokenizer *p, /* Tokenizer to add exceptions to */
  118. int bAlnum, /* Replace Isalnum() return value with this */
  119. const char *zIn, /* Array of characters to make exceptions */
  120. int nIn /* Length of z in bytes */
  121. ){
  122. const unsigned char *z = (const unsigned char *)zIn;
  123. const unsigned char *zTerm = &z[nIn];
  124. unsigned int iCode;
  125. int nEntry = 0;
  126. assert( bAlnum==0 || bAlnum==1 );
  127. while( z<zTerm ){
  128. READ_UTF8(z, zTerm, iCode);
  129. assert( (sqlite3FtsUnicodeIsalnum((int)iCode) & 0xFFFFFFFE)==0 );
  130. if( sqlite3FtsUnicodeIsalnum((int)iCode)!=bAlnum
  131. && sqlite3FtsUnicodeIsdiacritic((int)iCode)==0
  132. ){
  133. nEntry++;
  134. }
  135. }
  136. if( nEntry ){
  137. int *aNew; /* New aiException[] array */
  138. int nNew; /* Number of valid entries in array aNew[] */
  139. aNew = sqlite3_realloc64(p->aiException,(p->nException+nEntry)*sizeof(int));
  140. if( aNew==0 ) return SQLITE_NOMEM;
  141. nNew = p->nException;
  142. z = (const unsigned char *)zIn;
  143. while( z<zTerm ){
  144. READ_UTF8(z, zTerm, iCode);
  145. if( sqlite3FtsUnicodeIsalnum((int)iCode)!=bAlnum
  146. && sqlite3FtsUnicodeIsdiacritic((int)iCode)==0
  147. ){
  148. int i, j;
  149. for(i=0; i<nNew && aNew[i]<(int)iCode; i++);
  150. for(j=nNew; j>i; j--) aNew[j] = aNew[j-1];
  151. aNew[i] = (int)iCode;
  152. nNew++;
  153. }
  154. }
  155. p->aiException = aNew;
  156. p->nException = nNew;
  157. }
  158. return SQLITE_OK;
  159. }
  160. /*
  161. ** Return true if the p->aiException[] array contains the value iCode.
  162. */
  163. static int unicodeIsException(unicode_tokenizer *p, int iCode){
  164. if( p->nException>0 ){
  165. int *a = p->aiException;
  166. int iLo = 0;
  167. int iHi = p->nException-1;
  168. while( iHi>=iLo ){
  169. int iTest = (iHi + iLo) / 2;
  170. if( iCode==a[iTest] ){
  171. return 1;
  172. }else if( iCode>a[iTest] ){
  173. iLo = iTest+1;
  174. }else{
  175. iHi = iTest-1;
  176. }
  177. }
  178. }
  179. return 0;
  180. }
  181. /*
  182. ** Return true if, for the purposes of tokenization, codepoint iCode is
  183. ** considered a token character (not a separator).
  184. */
  185. static int unicodeIsAlnum(unicode_tokenizer *p, int iCode){
  186. assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
  187. return sqlite3FtsUnicodeIsalnum(iCode) ^ unicodeIsException(p, iCode);
  188. }
  189. /*
  190. ** Create a new tokenizer instance.
  191. */
  192. static int unicodeCreate(
  193. int nArg, /* Size of array argv[] */
  194. const char * const *azArg, /* Tokenizer creation arguments */
  195. sqlite3_tokenizer **pp /* OUT: New tokenizer handle */
  196. ){
  197. unicode_tokenizer *pNew; /* New tokenizer object */
  198. int i;
  199. int rc = SQLITE_OK;
  200. pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
  201. if( pNew==NULL ) return SQLITE_NOMEM;
  202. memset(pNew, 0, sizeof(unicode_tokenizer));
  203. pNew->eRemoveDiacritic = 1;
  204. for(i=0; rc==SQLITE_OK && i<nArg; i++){
  205. const char *z = azArg[i];
  206. int n = (int)strlen(z);
  207. if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){
  208. pNew->eRemoveDiacritic = 1;
  209. }
  210. else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
  211. pNew->eRemoveDiacritic = 0;
  212. }
  213. else if( n==19 && memcmp("remove_diacritics=2", z, 19)==0 ){
  214. pNew->eRemoveDiacritic = 2;
  215. }
  216. else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){
  217. rc = unicodeAddExceptions(pNew, 1, &z[11], n-11);
  218. }
  219. else if( n>=11 && memcmp("separators=", z, 11)==0 ){
  220. rc = unicodeAddExceptions(pNew, 0, &z[11], n-11);
  221. }
  222. else{
  223. /* Unrecognized argument */
  224. rc = SQLITE_ERROR;
  225. }
  226. }
  227. if( rc!=SQLITE_OK ){
  228. unicodeDestroy((sqlite3_tokenizer *)pNew);
  229. pNew = 0;
  230. }
  231. *pp = (sqlite3_tokenizer *)pNew;
  232. return rc;
  233. }
  234. /*
  235. ** Prepare to begin tokenizing a particular string. The input
  236. ** string to be tokenized is pInput[0..nBytes-1]. A cursor
  237. ** used to incrementally tokenize this string is returned in
  238. ** *ppCursor.
  239. */
  240. static int unicodeOpen(
  241. sqlite3_tokenizer *p, /* The tokenizer */
  242. const char *aInput, /* Input string */
  243. int nInput, /* Size of string aInput in bytes */
  244. sqlite3_tokenizer_cursor **pp /* OUT: New cursor object */
  245. ){
  246. unicode_cursor *pCsr;
  247. pCsr = (unicode_cursor *)sqlite3_malloc(sizeof(unicode_cursor));
  248. if( pCsr==0 ){
  249. return SQLITE_NOMEM;
  250. }
  251. memset(pCsr, 0, sizeof(unicode_cursor));
  252. pCsr->aInput = (const unsigned char *)aInput;
  253. if( aInput==0 ){
  254. pCsr->nInput = 0;
  255. pCsr->aInput = (const unsigned char*)"";
  256. }else if( nInput<0 ){
  257. pCsr->nInput = (int)strlen(aInput);
  258. }else{
  259. pCsr->nInput = nInput;
  260. }
  261. *pp = &pCsr->base;
  262. UNUSED_PARAMETER(p);
  263. return SQLITE_OK;
  264. }
  265. /*
  266. ** Close a tokenization cursor previously opened by a call to
  267. ** simpleOpen() above.
  268. */
  269. static int unicodeClose(sqlite3_tokenizer_cursor *pCursor){
  270. unicode_cursor *pCsr = (unicode_cursor *) pCursor;
  271. sqlite3_free(pCsr->zToken);
  272. sqlite3_free(pCsr);
  273. return SQLITE_OK;
  274. }
  275. /*
  276. ** Extract the next token from a tokenization cursor. The cursor must
  277. ** have been opened by a prior call to simpleOpen().
  278. */
  279. static int unicodeNext(
  280. sqlite3_tokenizer_cursor *pC, /* Cursor returned by simpleOpen */
  281. const char **paToken, /* OUT: Token text */
  282. int *pnToken, /* OUT: Number of bytes at *paToken */
  283. int *piStart, /* OUT: Starting offset of token */
  284. int *piEnd, /* OUT: Ending offset of token */
  285. int *piPos /* OUT: Position integer of token */
  286. ){
  287. unicode_cursor *pCsr = (unicode_cursor *)pC;
  288. unicode_tokenizer *p = ((unicode_tokenizer *)pCsr->base.pTokenizer);
  289. unsigned int iCode = 0;
  290. char *zOut;
  291. const unsigned char *z = &pCsr->aInput[pCsr->iOff];
  292. const unsigned char *zStart = z;
  293. const unsigned char *zEnd;
  294. const unsigned char *zTerm = &pCsr->aInput[pCsr->nInput];
  295. /* Scan past any delimiter characters before the start of the next token.
  296. ** Return SQLITE_DONE early if this takes us all the way to the end of
  297. ** the input. */
  298. while( z<zTerm ){
  299. READ_UTF8(z, zTerm, iCode);
  300. if( unicodeIsAlnum(p, (int)iCode) ) break;
  301. zStart = z;
  302. }
  303. if( zStart>=zTerm ) return SQLITE_DONE;
  304. zOut = pCsr->zToken;
  305. do {
  306. int iOut;
  307. /* Grow the output buffer if required. */
  308. if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
  309. char *zNew = sqlite3_realloc64(pCsr->zToken, pCsr->nAlloc+64);
  310. if( !zNew ) return SQLITE_NOMEM;
  311. zOut = &zNew[zOut - pCsr->zToken];
  312. pCsr->zToken = zNew;
  313. pCsr->nAlloc += 64;
  314. }
  315. /* Write the folded case of the last character read to the output */
  316. zEnd = z;
  317. iOut = sqlite3FtsUnicodeFold((int)iCode, p->eRemoveDiacritic);
  318. if( iOut ){
  319. WRITE_UTF8(zOut, iOut);
  320. }
  321. /* If the cursor is not at EOF, read the next character */
  322. if( z>=zTerm ) break;
  323. READ_UTF8(z, zTerm, iCode);
  324. }while( unicodeIsAlnum(p, (int)iCode)
  325. || sqlite3FtsUnicodeIsdiacritic((int)iCode)
  326. );
  327. /* Set the output variables and return. */
  328. pCsr->iOff = (int)(z - pCsr->aInput);
  329. *paToken = pCsr->zToken;
  330. *pnToken = (int)(zOut - pCsr->zToken);
  331. *piStart = (int)(zStart - pCsr->aInput);
  332. *piEnd = (int)(zEnd - pCsr->aInput);
  333. *piPos = pCsr->iToken++;
  334. return SQLITE_OK;
  335. }
  336. /*
  337. ** Set *ppModule to a pointer to the sqlite3_tokenizer_module
  338. ** structure for the unicode tokenizer.
  339. */
  340. void sqlite3Fts3UnicodeTokenizer(sqlite3_tokenizer_module const **ppModule){
  341. static const sqlite3_tokenizer_module module = {
  342. 0,
  343. unicodeCreate,
  344. unicodeDestroy,
  345. unicodeOpen,
  346. unicodeClose,
  347. unicodeNext,
  348. 0,
  349. };
  350. *ppModule = &module;
  351. }
  352. #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
  353. #endif /* ifndef SQLITE_DISABLE_FTS3_UNICODE */