fts5_aux.c 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822
  1. /*
  2. ** 2014 May 31
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. ******************************************************************************
  12. */
  13. #include "fts5Int.h"
  14. #include <math.h> /* amalgamator: keep */
  15. /*
  16. ** Object used to iterate through all "coalesced phrase instances" in
  17. ** a single column of the current row. If the phrase instances in the
  18. ** column being considered do not overlap, this object simply iterates
  19. ** through them. Or, if they do overlap (share one or more tokens in
  20. ** common), each set of overlapping instances is treated as a single
  21. ** match. See documentation for the highlight() auxiliary function for
  22. ** details.
  23. **
  24. ** Usage is:
  25. **
  26. ** for(rc = fts5CInstIterNext(pApi, pFts, iCol, &iter);
  27. ** (rc==SQLITE_OK && 0==fts5CInstIterEof(&iter);
  28. ** rc = fts5CInstIterNext(&iter)
  29. ** ){
  30. ** printf("instance starts at %d, ends at %d\n", iter.iStart, iter.iEnd);
  31. ** }
  32. **
  33. */
  34. typedef struct CInstIter CInstIter;
  35. struct CInstIter {
  36. const Fts5ExtensionApi *pApi; /* API offered by current FTS version */
  37. Fts5Context *pFts; /* First arg to pass to pApi functions */
  38. int iCol; /* Column to search */
  39. int iInst; /* Next phrase instance index */
  40. int nInst; /* Total number of phrase instances */
  41. /* Output variables */
  42. int iStart; /* First token in coalesced phrase instance */
  43. int iEnd; /* Last token in coalesced phrase instance */
  44. };
  45. /*
  46. ** Advance the iterator to the next coalesced phrase instance. Return
  47. ** an SQLite error code if an error occurs, or SQLITE_OK otherwise.
  48. */
  49. static int fts5CInstIterNext(CInstIter *pIter){
  50. int rc = SQLITE_OK;
  51. pIter->iStart = -1;
  52. pIter->iEnd = -1;
  53. while( rc==SQLITE_OK && pIter->iInst<pIter->nInst ){
  54. int ip; int ic; int io;
  55. rc = pIter->pApi->xInst(pIter->pFts, pIter->iInst, &ip, &ic, &io);
  56. if( rc==SQLITE_OK ){
  57. if( ic==pIter->iCol ){
  58. int iEnd = io - 1 + pIter->pApi->xPhraseSize(pIter->pFts, ip);
  59. if( pIter->iStart<0 ){
  60. pIter->iStart = io;
  61. pIter->iEnd = iEnd;
  62. }else if( io<=pIter->iEnd ){
  63. if( iEnd>pIter->iEnd ) pIter->iEnd = iEnd;
  64. }else{
  65. break;
  66. }
  67. }
  68. pIter->iInst++;
  69. }
  70. }
  71. return rc;
  72. }
  73. /*
  74. ** Initialize the iterator object indicated by the final parameter to
  75. ** iterate through coalesced phrase instances in column iCol.
  76. */
  77. static int fts5CInstIterInit(
  78. const Fts5ExtensionApi *pApi,
  79. Fts5Context *pFts,
  80. int iCol,
  81. CInstIter *pIter
  82. ){
  83. int rc;
  84. memset(pIter, 0, sizeof(CInstIter));
  85. pIter->pApi = pApi;
  86. pIter->pFts = pFts;
  87. pIter->iCol = iCol;
  88. rc = pApi->xInstCount(pFts, &pIter->nInst);
  89. if( rc==SQLITE_OK ){
  90. rc = fts5CInstIterNext(pIter);
  91. }
  92. return rc;
  93. }
  94. /*************************************************************************
  95. ** Start of highlight() implementation.
  96. */
  97. typedef struct HighlightContext HighlightContext;
  98. struct HighlightContext {
  99. /* Constant parameters to fts5HighlightCb() */
  100. int iRangeStart; /* First token to include */
  101. int iRangeEnd; /* If non-zero, last token to include */
  102. const char *zOpen; /* Opening highlight */
  103. const char *zClose; /* Closing highlight */
  104. const char *zIn; /* Input text */
  105. int nIn; /* Size of input text in bytes */
  106. /* Variables modified by fts5HighlightCb() */
  107. CInstIter iter; /* Coalesced Instance Iterator */
  108. int iPos; /* Current token offset in zIn[] */
  109. int iOff; /* Have copied up to this offset in zIn[] */
  110. int bOpen; /* True if highlight is open */
  111. char *zOut; /* Output value */
  112. };
  113. /*
  114. ** Append text to the HighlightContext output string - p->zOut. Argument
  115. ** z points to a buffer containing n bytes of text to append. If n is
  116. ** negative, everything up until the first '\0' is appended to the output.
  117. **
  118. ** If *pRc is set to any value other than SQLITE_OK when this function is
  119. ** called, it is a no-op. If an error (i.e. an OOM condition) is encountered,
  120. ** *pRc is set to an error code before returning.
  121. */
  122. static void fts5HighlightAppend(
  123. int *pRc,
  124. HighlightContext *p,
  125. const char *z, int n
  126. ){
  127. if( *pRc==SQLITE_OK && z ){
  128. if( n<0 ) n = (int)strlen(z);
  129. p->zOut = sqlite3_mprintf("%z%.*s", p->zOut, n, z);
  130. if( p->zOut==0 ) *pRc = SQLITE_NOMEM;
  131. }
  132. }
  133. /*
  134. ** Tokenizer callback used by implementation of highlight() function.
  135. */
  136. static int fts5HighlightCb(
  137. void *pContext, /* Pointer to HighlightContext object */
  138. int tflags, /* Mask of FTS5_TOKEN_* flags */
  139. const char *pToken, /* Buffer containing token */
  140. int nToken, /* Size of token in bytes */
  141. int iStartOff, /* Start byte offset of token */
  142. int iEndOff /* End byte offset of token */
  143. ){
  144. HighlightContext *p = (HighlightContext*)pContext;
  145. int rc = SQLITE_OK;
  146. int iPos;
  147. UNUSED_PARAM2(pToken, nToken);
  148. if( tflags & FTS5_TOKEN_COLOCATED ) return SQLITE_OK;
  149. iPos = p->iPos++;
  150. if( p->iRangeEnd>=0 ){
  151. if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK;
  152. if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff;
  153. }
  154. /* If the parenthesis is open, and this token is not part of the current
  155. ** phrase, and the starting byte offset of this token is past the point
  156. ** that has currently been copied into the output buffer, close the
  157. ** parenthesis. */
  158. if( p->bOpen
  159. && (iPos<=p->iter.iStart || p->iter.iStart<0)
  160. && iStartOff>p->iOff
  161. ){
  162. fts5HighlightAppend(&rc, p, p->zClose, -1);
  163. p->bOpen = 0;
  164. }
  165. /* If this is the start of a new phrase, and the highlight is not open:
  166. **
  167. ** * copy text from the input up to the start of the phrase, and
  168. ** * open the highlight.
  169. */
  170. if( iPos==p->iter.iStart && p->bOpen==0 ){
  171. fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iStartOff - p->iOff);
  172. fts5HighlightAppend(&rc, p, p->zOpen, -1);
  173. p->iOff = iStartOff;
  174. p->bOpen = 1;
  175. }
  176. if( iPos==p->iter.iEnd ){
  177. if( p->bOpen==0 ){
  178. assert( p->iRangeEnd>=0 );
  179. fts5HighlightAppend(&rc, p, p->zOpen, -1);
  180. p->bOpen = 1;
  181. }
  182. fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
  183. p->iOff = iEndOff;
  184. if( rc==SQLITE_OK ){
  185. rc = fts5CInstIterNext(&p->iter);
  186. }
  187. }
  188. if( iPos==p->iRangeEnd ){
  189. if( p->bOpen ){
  190. if( p->iter.iStart>=0 && iPos>=p->iter.iStart ){
  191. fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
  192. p->iOff = iEndOff;
  193. }
  194. fts5HighlightAppend(&rc, p, p->zClose, -1);
  195. p->bOpen = 0;
  196. }
  197. fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
  198. p->iOff = iEndOff;
  199. }
  200. return rc;
  201. }
  202. /*
  203. ** Implementation of highlight() function.
  204. */
  205. static void fts5HighlightFunction(
  206. const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
  207. Fts5Context *pFts, /* First arg to pass to pApi functions */
  208. sqlite3_context *pCtx, /* Context for returning result/error */
  209. int nVal, /* Number of values in apVal[] array */
  210. sqlite3_value **apVal /* Array of trailing arguments */
  211. ){
  212. HighlightContext ctx;
  213. int rc;
  214. int iCol;
  215. if( nVal!=3 ){
  216. const char *zErr = "wrong number of arguments to function highlight()";
  217. sqlite3_result_error(pCtx, zErr, -1);
  218. return;
  219. }
  220. iCol = sqlite3_value_int(apVal[0]);
  221. memset(&ctx, 0, sizeof(HighlightContext));
  222. ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]);
  223. ctx.zClose = (const char*)sqlite3_value_text(apVal[2]);
  224. ctx.iRangeEnd = -1;
  225. rc = pApi->xColumnText(pFts, iCol, &ctx.zIn, &ctx.nIn);
  226. if( rc==SQLITE_RANGE ){
  227. sqlite3_result_text(pCtx, "", -1, SQLITE_STATIC);
  228. rc = SQLITE_OK;
  229. }else if( ctx.zIn ){
  230. const char *pLoc = 0; /* Locale of column iCol */
  231. int nLoc = 0; /* Size of pLoc in bytes */
  232. if( rc==SQLITE_OK ){
  233. rc = fts5CInstIterInit(pApi, pFts, iCol, &ctx.iter);
  234. }
  235. if( rc==SQLITE_OK ){
  236. rc = pApi->xColumnLocale(pFts, iCol, &pLoc, &nLoc);
  237. }
  238. if( rc==SQLITE_OK ){
  239. rc = pApi->xTokenize_v2(
  240. pFts, ctx.zIn, ctx.nIn, pLoc, nLoc, (void*)&ctx, fts5HighlightCb
  241. );
  242. }
  243. if( ctx.bOpen ){
  244. fts5HighlightAppend(&rc, &ctx, ctx.zClose, -1);
  245. }
  246. fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff);
  247. if( rc==SQLITE_OK ){
  248. sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT);
  249. }
  250. sqlite3_free(ctx.zOut);
  251. }
  252. if( rc!=SQLITE_OK ){
  253. sqlite3_result_error_code(pCtx, rc);
  254. }
  255. }
  256. /*
  257. ** End of highlight() implementation.
  258. **************************************************************************/
  259. /*
  260. ** Context object passed to the fts5SentenceFinderCb() function.
  261. */
  262. typedef struct Fts5SFinder Fts5SFinder;
  263. struct Fts5SFinder {
  264. int iPos; /* Current token position */
  265. int nFirstAlloc; /* Allocated size of aFirst[] */
  266. int nFirst; /* Number of entries in aFirst[] */
  267. int *aFirst; /* Array of first token in each sentence */
  268. const char *zDoc; /* Document being tokenized */
  269. };
  270. /*
  271. ** Add an entry to the Fts5SFinder.aFirst[] array. Grow the array if
  272. ** necessary. Return SQLITE_OK if successful, or SQLITE_NOMEM if an
  273. ** error occurs.
  274. */
  275. static int fts5SentenceFinderAdd(Fts5SFinder *p, int iAdd){
  276. if( p->nFirstAlloc==p->nFirst ){
  277. int nNew = p->nFirstAlloc ? p->nFirstAlloc*2 : 64;
  278. int *aNew;
  279. aNew = (int*)sqlite3_realloc64(p->aFirst, nNew*sizeof(int));
  280. if( aNew==0 ) return SQLITE_NOMEM;
  281. p->aFirst = aNew;
  282. p->nFirstAlloc = nNew;
  283. }
  284. p->aFirst[p->nFirst++] = iAdd;
  285. return SQLITE_OK;
  286. }
  287. /*
  288. ** This function is an xTokenize() callback used by the auxiliary snippet()
  289. ** function. Its job is to identify tokens that are the first in a sentence.
  290. ** For each such token, an entry is added to the SFinder.aFirst[] array.
  291. */
  292. static int fts5SentenceFinderCb(
  293. void *pContext, /* Pointer to HighlightContext object */
  294. int tflags, /* Mask of FTS5_TOKEN_* flags */
  295. const char *pToken, /* Buffer containing token */
  296. int nToken, /* Size of token in bytes */
  297. int iStartOff, /* Start offset of token */
  298. int iEndOff /* End offset of token */
  299. ){
  300. int rc = SQLITE_OK;
  301. UNUSED_PARAM2(pToken, nToken);
  302. UNUSED_PARAM(iEndOff);
  303. if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){
  304. Fts5SFinder *p = (Fts5SFinder*)pContext;
  305. if( p->iPos>0 ){
  306. int i;
  307. char c = 0;
  308. for(i=iStartOff-1; i>=0; i--){
  309. c = p->zDoc[i];
  310. if( c!=' ' && c!='\t' && c!='\n' && c!='\r' ) break;
  311. }
  312. if( i!=iStartOff-1 && (c=='.' || c==':') ){
  313. rc = fts5SentenceFinderAdd(p, p->iPos);
  314. }
  315. }else{
  316. rc = fts5SentenceFinderAdd(p, 0);
  317. }
  318. p->iPos++;
  319. }
  320. return rc;
  321. }
  322. static int fts5SnippetScore(
  323. const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
  324. Fts5Context *pFts, /* First arg to pass to pApi functions */
  325. int nDocsize, /* Size of column in tokens */
  326. unsigned char *aSeen, /* Array with one element per query phrase */
  327. int iCol, /* Column to score */
  328. int iPos, /* Starting offset to score */
  329. int nToken, /* Max tokens per snippet */
  330. int *pnScore, /* OUT: Score */
  331. int *piPos /* OUT: Adjusted offset */
  332. ){
  333. int rc;
  334. int i;
  335. int ip = 0;
  336. int ic = 0;
  337. int iOff = 0;
  338. int iFirst = -1;
  339. int nInst;
  340. int nScore = 0;
  341. int iLast = 0;
  342. sqlite3_int64 iEnd = (sqlite3_int64)iPos + nToken;
  343. rc = pApi->xInstCount(pFts, &nInst);
  344. for(i=0; i<nInst && rc==SQLITE_OK; i++){
  345. rc = pApi->xInst(pFts, i, &ip, &ic, &iOff);
  346. if( rc==SQLITE_OK && ic==iCol && iOff>=iPos && iOff<iEnd ){
  347. nScore += (aSeen[ip] ? 1 : 1000);
  348. aSeen[ip] = 1;
  349. if( iFirst<0 ) iFirst = iOff;
  350. iLast = iOff + pApi->xPhraseSize(pFts, ip);
  351. }
  352. }
  353. *pnScore = nScore;
  354. if( piPos ){
  355. sqlite3_int64 iAdj = iFirst - (nToken - (iLast-iFirst)) / 2;
  356. if( (iAdj+nToken)>nDocsize ) iAdj = nDocsize - nToken;
  357. if( iAdj<0 ) iAdj = 0;
  358. *piPos = (int)iAdj;
  359. }
  360. return rc;
  361. }
  362. /*
  363. ** Return the value in pVal interpreted as utf-8 text. Except, if pVal
  364. ** contains a NULL value, return a pointer to a static string zero
  365. ** bytes in length instead of a NULL pointer.
  366. */
  367. static const char *fts5ValueToText(sqlite3_value *pVal){
  368. const char *zRet = (const char*)sqlite3_value_text(pVal);
  369. return zRet ? zRet : "";
  370. }
  371. /*
  372. ** Implementation of snippet() function.
  373. */
  374. static void fts5SnippetFunction(
  375. const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
  376. Fts5Context *pFts, /* First arg to pass to pApi functions */
  377. sqlite3_context *pCtx, /* Context for returning result/error */
  378. int nVal, /* Number of values in apVal[] array */
  379. sqlite3_value **apVal /* Array of trailing arguments */
  380. ){
  381. HighlightContext ctx;
  382. int rc = SQLITE_OK; /* Return code */
  383. int iCol; /* 1st argument to snippet() */
  384. const char *zEllips; /* 4th argument to snippet() */
  385. int nToken; /* 5th argument to snippet() */
  386. int nInst = 0; /* Number of instance matches this row */
  387. int i; /* Used to iterate through instances */
  388. int nPhrase; /* Number of phrases in query */
  389. unsigned char *aSeen; /* Array of "seen instance" flags */
  390. int iBestCol; /* Column containing best snippet */
  391. int iBestStart = 0; /* First token of best snippet */
  392. int nBestScore = 0; /* Score of best snippet */
  393. int nColSize = 0; /* Total size of iBestCol in tokens */
  394. Fts5SFinder sFinder; /* Used to find the beginnings of sentences */
  395. int nCol;
  396. if( nVal!=5 ){
  397. const char *zErr = "wrong number of arguments to function snippet()";
  398. sqlite3_result_error(pCtx, zErr, -1);
  399. return;
  400. }
  401. nCol = pApi->xColumnCount(pFts);
  402. memset(&ctx, 0, sizeof(HighlightContext));
  403. iCol = sqlite3_value_int(apVal[0]);
  404. ctx.zOpen = fts5ValueToText(apVal[1]);
  405. ctx.zClose = fts5ValueToText(apVal[2]);
  406. ctx.iRangeEnd = -1;
  407. zEllips = fts5ValueToText(apVal[3]);
  408. nToken = sqlite3_value_int(apVal[4]);
  409. iBestCol = (iCol>=0 ? iCol : 0);
  410. nPhrase = pApi->xPhraseCount(pFts);
  411. aSeen = sqlite3_malloc(nPhrase);
  412. if( aSeen==0 ){
  413. rc = SQLITE_NOMEM;
  414. }
  415. if( rc==SQLITE_OK ){
  416. rc = pApi->xInstCount(pFts, &nInst);
  417. }
  418. memset(&sFinder, 0, sizeof(Fts5SFinder));
  419. for(i=0; i<nCol; i++){
  420. if( iCol<0 || iCol==i ){
  421. const char *pLoc = 0; /* Locale of column iCol */
  422. int nLoc = 0; /* Size of pLoc in bytes */
  423. int nDoc;
  424. int nDocsize;
  425. int ii;
  426. sFinder.iPos = 0;
  427. sFinder.nFirst = 0;
  428. rc = pApi->xColumnText(pFts, i, &sFinder.zDoc, &nDoc);
  429. if( rc!=SQLITE_OK ) break;
  430. rc = pApi->xColumnLocale(pFts, i, &pLoc, &nLoc);
  431. if( rc!=SQLITE_OK ) break;
  432. rc = pApi->xTokenize_v2(pFts,
  433. sFinder.zDoc, nDoc, pLoc, nLoc, (void*)&sFinder, fts5SentenceFinderCb
  434. );
  435. if( rc!=SQLITE_OK ) break;
  436. rc = pApi->xColumnSize(pFts, i, &nDocsize);
  437. if( rc!=SQLITE_OK ) break;
  438. for(ii=0; rc==SQLITE_OK && ii<nInst; ii++){
  439. int ip, ic, io;
  440. int iAdj;
  441. int nScore;
  442. int jj;
  443. rc = pApi->xInst(pFts, ii, &ip, &ic, &io);
  444. if( ic!=i ) continue;
  445. if( io>nDocsize ) rc = FTS5_CORRUPT;
  446. if( rc!=SQLITE_OK ) continue;
  447. memset(aSeen, 0, nPhrase);
  448. rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i,
  449. io, nToken, &nScore, &iAdj
  450. );
  451. if( rc==SQLITE_OK && nScore>nBestScore ){
  452. nBestScore = nScore;
  453. iBestCol = i;
  454. iBestStart = iAdj;
  455. nColSize = nDocsize;
  456. }
  457. if( rc==SQLITE_OK && sFinder.nFirst && nDocsize>nToken ){
  458. for(jj=0; jj<(sFinder.nFirst-1); jj++){
  459. if( sFinder.aFirst[jj+1]>io ) break;
  460. }
  461. if( sFinder.aFirst[jj]<io ){
  462. memset(aSeen, 0, nPhrase);
  463. rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i,
  464. sFinder.aFirst[jj], nToken, &nScore, 0
  465. );
  466. nScore += (sFinder.aFirst[jj]==0 ? 120 : 100);
  467. if( rc==SQLITE_OK && nScore>nBestScore ){
  468. nBestScore = nScore;
  469. iBestCol = i;
  470. iBestStart = sFinder.aFirst[jj];
  471. nColSize = nDocsize;
  472. }
  473. }
  474. }
  475. }
  476. }
  477. }
  478. if( rc==SQLITE_OK ){
  479. rc = pApi->xColumnText(pFts, iBestCol, &ctx.zIn, &ctx.nIn);
  480. }
  481. if( rc==SQLITE_OK && nColSize==0 ){
  482. rc = pApi->xColumnSize(pFts, iBestCol, &nColSize);
  483. }
  484. if( ctx.zIn ){
  485. const char *pLoc = 0; /* Locale of column iBestCol */
  486. int nLoc = 0; /* Bytes in pLoc */
  487. if( rc==SQLITE_OK ){
  488. rc = fts5CInstIterInit(pApi, pFts, iBestCol, &ctx.iter);
  489. }
  490. ctx.iRangeStart = iBestStart;
  491. ctx.iRangeEnd = iBestStart + nToken - 1;
  492. if( iBestStart>0 ){
  493. fts5HighlightAppend(&rc, &ctx, zEllips, -1);
  494. }
  495. /* Advance iterator ctx.iter so that it points to the first coalesced
  496. ** phrase instance at or following position iBestStart. */
  497. while( ctx.iter.iStart>=0 && ctx.iter.iStart<iBestStart && rc==SQLITE_OK ){
  498. rc = fts5CInstIterNext(&ctx.iter);
  499. }
  500. if( rc==SQLITE_OK ){
  501. rc = pApi->xColumnLocale(pFts, iBestCol, &pLoc, &nLoc);
  502. }
  503. if( rc==SQLITE_OK ){
  504. rc = pApi->xTokenize_v2(
  505. pFts, ctx.zIn, ctx.nIn, pLoc, nLoc, (void*)&ctx,fts5HighlightCb
  506. );
  507. }
  508. if( ctx.bOpen ){
  509. fts5HighlightAppend(&rc, &ctx, ctx.zClose, -1);
  510. }
  511. if( ctx.iRangeEnd>=(nColSize-1) ){
  512. fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff);
  513. }else{
  514. fts5HighlightAppend(&rc, &ctx, zEllips, -1);
  515. }
  516. }
  517. if( rc==SQLITE_OK ){
  518. sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT);
  519. }else{
  520. sqlite3_result_error_code(pCtx, rc);
  521. }
  522. sqlite3_free(ctx.zOut);
  523. sqlite3_free(aSeen);
  524. sqlite3_free(sFinder.aFirst);
  525. }
  526. /************************************************************************/
  527. /*
  528. ** The first time the bm25() function is called for a query, an instance
  529. ** of the following structure is allocated and populated.
  530. */
  531. typedef struct Fts5Bm25Data Fts5Bm25Data;
  532. struct Fts5Bm25Data {
  533. int nPhrase; /* Number of phrases in query */
  534. double avgdl; /* Average number of tokens in each row */
  535. double *aIDF; /* IDF for each phrase */
  536. double *aFreq; /* Array used to calculate phrase freq. */
  537. };
  538. /*
  539. ** Callback used by fts5Bm25GetData() to count the number of rows in the
  540. ** table matched by each individual phrase within the query.
  541. */
  542. static int fts5CountCb(
  543. const Fts5ExtensionApi *pApi,
  544. Fts5Context *pFts,
  545. void *pUserData /* Pointer to sqlite3_int64 variable */
  546. ){
  547. sqlite3_int64 *pn = (sqlite3_int64*)pUserData;
  548. UNUSED_PARAM2(pApi, pFts);
  549. (*pn)++;
  550. return SQLITE_OK;
  551. }
  552. /*
  553. ** Set *ppData to point to the Fts5Bm25Data object for the current query.
  554. ** If the object has not already been allocated, allocate and populate it
  555. ** now.
  556. */
  557. static int fts5Bm25GetData(
  558. const Fts5ExtensionApi *pApi,
  559. Fts5Context *pFts,
  560. Fts5Bm25Data **ppData /* OUT: bm25-data object for this query */
  561. ){
  562. int rc = SQLITE_OK; /* Return code */
  563. Fts5Bm25Data *p; /* Object to return */
  564. p = (Fts5Bm25Data*)pApi->xGetAuxdata(pFts, 0);
  565. if( p==0 ){
  566. int nPhrase; /* Number of phrases in query */
  567. sqlite3_int64 nRow = 0; /* Number of rows in table */
  568. sqlite3_int64 nToken = 0; /* Number of tokens in table */
  569. sqlite3_int64 nByte; /* Bytes of space to allocate */
  570. int i;
  571. /* Allocate the Fts5Bm25Data object */
  572. nPhrase = pApi->xPhraseCount(pFts);
  573. nByte = sizeof(Fts5Bm25Data) + nPhrase*2*sizeof(double);
  574. p = (Fts5Bm25Data*)sqlite3_malloc64(nByte);
  575. if( p==0 ){
  576. rc = SQLITE_NOMEM;
  577. }else{
  578. memset(p, 0, (size_t)nByte);
  579. p->nPhrase = nPhrase;
  580. p->aIDF = (double*)&p[1];
  581. p->aFreq = &p->aIDF[nPhrase];
  582. }
  583. /* Calculate the average document length for this FTS5 table */
  584. if( rc==SQLITE_OK ) rc = pApi->xRowCount(pFts, &nRow);
  585. assert( rc!=SQLITE_OK || nRow>0 );
  586. if( rc==SQLITE_OK ) rc = pApi->xColumnTotalSize(pFts, -1, &nToken);
  587. if( rc==SQLITE_OK ) p->avgdl = (double)nToken / (double)nRow;
  588. /* Calculate an IDF for each phrase in the query */
  589. for(i=0; rc==SQLITE_OK && i<nPhrase; i++){
  590. sqlite3_int64 nHit = 0;
  591. rc = pApi->xQueryPhrase(pFts, i, (void*)&nHit, fts5CountCb);
  592. if( rc==SQLITE_OK ){
  593. /* Calculate the IDF (Inverse Document Frequency) for phrase i.
  594. ** This is done using the standard BM25 formula as found on wikipedia:
  595. **
  596. ** IDF = log( (N - nHit + 0.5) / (nHit + 0.5) )
  597. **
  598. ** where "N" is the total number of documents in the set and nHit
  599. ** is the number that contain at least one instance of the phrase
  600. ** under consideration.
  601. **
  602. ** The problem with this is that if (N < 2*nHit), the IDF is
  603. ** negative. Which is undesirable. So the mimimum allowable IDF is
  604. ** (1e-6) - roughly the same as a term that appears in just over
  605. ** half of set of 5,000,000 documents. */
  606. double idf = log( (nRow - nHit + 0.5) / (nHit + 0.5) );
  607. if( idf<=0.0 ) idf = 1e-6;
  608. p->aIDF[i] = idf;
  609. }
  610. }
  611. if( rc!=SQLITE_OK ){
  612. sqlite3_free(p);
  613. }else{
  614. rc = pApi->xSetAuxdata(pFts, p, sqlite3_free);
  615. }
  616. if( rc!=SQLITE_OK ) p = 0;
  617. }
  618. *ppData = p;
  619. return rc;
  620. }
  621. /*
  622. ** Implementation of bm25() function.
  623. */
  624. static void fts5Bm25Function(
  625. const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
  626. Fts5Context *pFts, /* First arg to pass to pApi functions */
  627. sqlite3_context *pCtx, /* Context for returning result/error */
  628. int nVal, /* Number of values in apVal[] array */
  629. sqlite3_value **apVal /* Array of trailing arguments */
  630. ){
  631. const double k1 = 1.2; /* Constant "k1" from BM25 formula */
  632. const double b = 0.75; /* Constant "b" from BM25 formula */
  633. int rc; /* Error code */
  634. double score = 0.0; /* SQL function return value */
  635. Fts5Bm25Data *pData; /* Values allocated/calculated once only */
  636. int i; /* Iterator variable */
  637. int nInst = 0; /* Value returned by xInstCount() */
  638. double D = 0.0; /* Total number of tokens in row */
  639. double *aFreq = 0; /* Array of phrase freq. for current row */
  640. /* Calculate the phrase frequency (symbol "f(qi,D)" in the documentation)
  641. ** for each phrase in the query for the current row. */
  642. rc = fts5Bm25GetData(pApi, pFts, &pData);
  643. if( rc==SQLITE_OK ){
  644. aFreq = pData->aFreq;
  645. memset(aFreq, 0, sizeof(double) * pData->nPhrase);
  646. rc = pApi->xInstCount(pFts, &nInst);
  647. }
  648. for(i=0; rc==SQLITE_OK && i<nInst; i++){
  649. int ip; int ic; int io;
  650. rc = pApi->xInst(pFts, i, &ip, &ic, &io);
  651. if( rc==SQLITE_OK ){
  652. double w = (nVal > ic) ? sqlite3_value_double(apVal[ic]) : 1.0;
  653. aFreq[ip] += w;
  654. }
  655. }
  656. /* Figure out the total size of the current row in tokens. */
  657. if( rc==SQLITE_OK ){
  658. int nTok;
  659. rc = pApi->xColumnSize(pFts, -1, &nTok);
  660. D = (double)nTok;
  661. }
  662. /* Determine and return the BM25 score for the current row. Or, if an
  663. ** error has occurred, throw an exception. */
  664. if( rc==SQLITE_OK ){
  665. for(i=0; i<pData->nPhrase; i++){
  666. score += pData->aIDF[i] * (
  667. ( aFreq[i] * (k1 + 1.0) ) /
  668. ( aFreq[i] + k1 * (1 - b + b * D / pData->avgdl) )
  669. );
  670. }
  671. sqlite3_result_double(pCtx, -1.0 * score);
  672. }else{
  673. sqlite3_result_error_code(pCtx, rc);
  674. }
  675. }
  676. /*
  677. ** Implementation of fts5_get_locale() function.
  678. */
  679. static void fts5GetLocaleFunction(
  680. const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
  681. Fts5Context *pFts, /* First arg to pass to pApi functions */
  682. sqlite3_context *pCtx, /* Context for returning result/error */
  683. int nVal, /* Number of values in apVal[] array */
  684. sqlite3_value **apVal /* Array of trailing arguments */
  685. ){
  686. int iCol = 0;
  687. int eType = 0;
  688. int rc = SQLITE_OK;
  689. const char *zLocale = 0;
  690. int nLocale = 0;
  691. /* xColumnLocale() must be available */
  692. assert( pApi->iVersion>=4 );
  693. if( nVal!=1 ){
  694. const char *z = "wrong number of arguments to function fts5_get_locale()";
  695. sqlite3_result_error(pCtx, z, -1);
  696. return;
  697. }
  698. eType = sqlite3_value_numeric_type(apVal[0]);
  699. if( eType!=SQLITE_INTEGER ){
  700. const char *z = "non-integer argument passed to function fts5_get_locale()";
  701. sqlite3_result_error(pCtx, z, -1);
  702. return;
  703. }
  704. iCol = sqlite3_value_int(apVal[0]);
  705. if( iCol<0 || iCol>=pApi->xColumnCount(pFts) ){
  706. sqlite3_result_error_code(pCtx, SQLITE_RANGE);
  707. return;
  708. }
  709. rc = pApi->xColumnLocale(pFts, iCol, &zLocale, &nLocale);
  710. if( rc!=SQLITE_OK ){
  711. sqlite3_result_error_code(pCtx, rc);
  712. return;
  713. }
  714. sqlite3_result_text(pCtx, zLocale, nLocale, SQLITE_TRANSIENT);
  715. }
  716. int sqlite3Fts5AuxInit(fts5_api *pApi){
  717. struct Builtin {
  718. const char *zFunc; /* Function name (nul-terminated) */
  719. void *pUserData; /* User-data pointer */
  720. fts5_extension_function xFunc;/* Callback function */
  721. void (*xDestroy)(void*); /* Destructor function */
  722. } aBuiltin [] = {
  723. { "snippet", 0, fts5SnippetFunction, 0 },
  724. { "highlight", 0, fts5HighlightFunction, 0 },
  725. { "bm25", 0, fts5Bm25Function, 0 },
  726. { "fts5_get_locale", 0, fts5GetLocaleFunction, 0 },
  727. };
  728. int rc = SQLITE_OK; /* Return code */
  729. int i; /* To iterate through builtin functions */
  730. for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
  731. rc = pApi->xCreateFunction(pApi,
  732. aBuiltin[i].zFunc,
  733. aBuiltin[i].pUserData,
  734. aBuiltin[i].xFunc,
  735. aBuiltin[i].xDestroy
  736. );
  737. }
  738. return rc;
  739. }