123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822 |
- /*
- ** 2014 May 31
- **
- ** The author disclaims copyright to this source code. In place of
- ** a legal notice, here is a blessing:
- **
- ** May you do good and not evil.
- ** May you find forgiveness for yourself and forgive others.
- ** May you share freely, never taking more than you give.
- **
- ******************************************************************************
- */
- #include "fts5Int.h"
- #include <math.h> /* amalgamator: keep */
- /*
- ** Object used to iterate through all "coalesced phrase instances" in
- ** a single column of the current row. If the phrase instances in the
- ** column being considered do not overlap, this object simply iterates
- ** through them. Or, if they do overlap (share one or more tokens in
- ** common), each set of overlapping instances is treated as a single
- ** match. See documentation for the highlight() auxiliary function for
- ** details.
- **
- ** Usage is:
- **
- ** for(rc = fts5CInstIterNext(pApi, pFts, iCol, &iter);
- ** (rc==SQLITE_OK && 0==fts5CInstIterEof(&iter);
- ** rc = fts5CInstIterNext(&iter)
- ** ){
- ** printf("instance starts at %d, ends at %d\n", iter.iStart, iter.iEnd);
- ** }
- **
- */
- typedef struct CInstIter CInstIter;
- struct CInstIter {
- const Fts5ExtensionApi *pApi; /* API offered by current FTS version */
- Fts5Context *pFts; /* First arg to pass to pApi functions */
- int iCol; /* Column to search */
- int iInst; /* Next phrase instance index */
- int nInst; /* Total number of phrase instances */
- /* Output variables */
- int iStart; /* First token in coalesced phrase instance */
- int iEnd; /* Last token in coalesced phrase instance */
- };
- /*
- ** Advance the iterator to the next coalesced phrase instance. Return
- ** an SQLite error code if an error occurs, or SQLITE_OK otherwise.
- */
- static int fts5CInstIterNext(CInstIter *pIter){
- int rc = SQLITE_OK;
- pIter->iStart = -1;
- pIter->iEnd = -1;
- while( rc==SQLITE_OK && pIter->iInst<pIter->nInst ){
- int ip; int ic; int io;
- rc = pIter->pApi->xInst(pIter->pFts, pIter->iInst, &ip, &ic, &io);
- if( rc==SQLITE_OK ){
- if( ic==pIter->iCol ){
- int iEnd = io - 1 + pIter->pApi->xPhraseSize(pIter->pFts, ip);
- if( pIter->iStart<0 ){
- pIter->iStart = io;
- pIter->iEnd = iEnd;
- }else if( io<=pIter->iEnd ){
- if( iEnd>pIter->iEnd ) pIter->iEnd = iEnd;
- }else{
- break;
- }
- }
- pIter->iInst++;
- }
- }
- return rc;
- }
- /*
- ** Initialize the iterator object indicated by the final parameter to
- ** iterate through coalesced phrase instances in column iCol.
- */
- static int fts5CInstIterInit(
- const Fts5ExtensionApi *pApi,
- Fts5Context *pFts,
- int iCol,
- CInstIter *pIter
- ){
- int rc;
- memset(pIter, 0, sizeof(CInstIter));
- pIter->pApi = pApi;
- pIter->pFts = pFts;
- pIter->iCol = iCol;
- rc = pApi->xInstCount(pFts, &pIter->nInst);
- if( rc==SQLITE_OK ){
- rc = fts5CInstIterNext(pIter);
- }
- return rc;
- }
- /*************************************************************************
- ** Start of highlight() implementation.
- */
- typedef struct HighlightContext HighlightContext;
- struct HighlightContext {
- /* Constant parameters to fts5HighlightCb() */
- int iRangeStart; /* First token to include */
- int iRangeEnd; /* If non-zero, last token to include */
- const char *zOpen; /* Opening highlight */
- const char *zClose; /* Closing highlight */
- const char *zIn; /* Input text */
- int nIn; /* Size of input text in bytes */
- /* Variables modified by fts5HighlightCb() */
- CInstIter iter; /* Coalesced Instance Iterator */
- int iPos; /* Current token offset in zIn[] */
- int iOff; /* Have copied up to this offset in zIn[] */
- int bOpen; /* True if highlight is open */
- char *zOut; /* Output value */
- };
- /*
- ** Append text to the HighlightContext output string - p->zOut. Argument
- ** z points to a buffer containing n bytes of text to append. If n is
- ** negative, everything up until the first '\0' is appended to the output.
- **
- ** If *pRc is set to any value other than SQLITE_OK when this function is
- ** called, it is a no-op. If an error (i.e. an OOM condition) is encountered,
- ** *pRc is set to an error code before returning.
- */
- static void fts5HighlightAppend(
- int *pRc,
- HighlightContext *p,
- const char *z, int n
- ){
- if( *pRc==SQLITE_OK && z ){
- if( n<0 ) n = (int)strlen(z);
- p->zOut = sqlite3_mprintf("%z%.*s", p->zOut, n, z);
- if( p->zOut==0 ) *pRc = SQLITE_NOMEM;
- }
- }
- /*
- ** Tokenizer callback used by implementation of highlight() function.
- */
- static int fts5HighlightCb(
- void *pContext, /* Pointer to HighlightContext object */
- int tflags, /* Mask of FTS5_TOKEN_* flags */
- const char *pToken, /* Buffer containing token */
- int nToken, /* Size of token in bytes */
- int iStartOff, /* Start byte offset of token */
- int iEndOff /* End byte offset of token */
- ){
- HighlightContext *p = (HighlightContext*)pContext;
- int rc = SQLITE_OK;
- int iPos;
- UNUSED_PARAM2(pToken, nToken);
- if( tflags & FTS5_TOKEN_COLOCATED ) return SQLITE_OK;
- iPos = p->iPos++;
- if( p->iRangeEnd>=0 ){
- if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK;
- if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff;
- }
- /* If the parenthesis is open, and this token is not part of the current
- ** phrase, and the starting byte offset of this token is past the point
- ** that has currently been copied into the output buffer, close the
- ** parenthesis. */
- if( p->bOpen
- && (iPos<=p->iter.iStart || p->iter.iStart<0)
- && iStartOff>p->iOff
- ){
- fts5HighlightAppend(&rc, p, p->zClose, -1);
- p->bOpen = 0;
- }
- /* If this is the start of a new phrase, and the highlight is not open:
- **
- ** * copy text from the input up to the start of the phrase, and
- ** * open the highlight.
- */
- if( iPos==p->iter.iStart && p->bOpen==0 ){
- fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iStartOff - p->iOff);
- fts5HighlightAppend(&rc, p, p->zOpen, -1);
- p->iOff = iStartOff;
- p->bOpen = 1;
- }
- if( iPos==p->iter.iEnd ){
- if( p->bOpen==0 ){
- assert( p->iRangeEnd>=0 );
- fts5HighlightAppend(&rc, p, p->zOpen, -1);
- p->bOpen = 1;
- }
- fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
- p->iOff = iEndOff;
- if( rc==SQLITE_OK ){
- rc = fts5CInstIterNext(&p->iter);
- }
- }
- if( iPos==p->iRangeEnd ){
- if( p->bOpen ){
- if( p->iter.iStart>=0 && iPos>=p->iter.iStart ){
- fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
- p->iOff = iEndOff;
- }
- fts5HighlightAppend(&rc, p, p->zClose, -1);
- p->bOpen = 0;
- }
- fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
- p->iOff = iEndOff;
- }
- return rc;
- }
- /*
- ** Implementation of highlight() function.
- */
- static void fts5HighlightFunction(
- const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
- Fts5Context *pFts, /* First arg to pass to pApi functions */
- sqlite3_context *pCtx, /* Context for returning result/error */
- int nVal, /* Number of values in apVal[] array */
- sqlite3_value **apVal /* Array of trailing arguments */
- ){
- HighlightContext ctx;
- int rc;
- int iCol;
- if( nVal!=3 ){
- const char *zErr = "wrong number of arguments to function highlight()";
- sqlite3_result_error(pCtx, zErr, -1);
- return;
- }
- iCol = sqlite3_value_int(apVal[0]);
- memset(&ctx, 0, sizeof(HighlightContext));
- ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]);
- ctx.zClose = (const char*)sqlite3_value_text(apVal[2]);
- ctx.iRangeEnd = -1;
- rc = pApi->xColumnText(pFts, iCol, &ctx.zIn, &ctx.nIn);
- if( rc==SQLITE_RANGE ){
- sqlite3_result_text(pCtx, "", -1, SQLITE_STATIC);
- rc = SQLITE_OK;
- }else if( ctx.zIn ){
- const char *pLoc = 0; /* Locale of column iCol */
- int nLoc = 0; /* Size of pLoc in bytes */
- if( rc==SQLITE_OK ){
- rc = fts5CInstIterInit(pApi, pFts, iCol, &ctx.iter);
- }
- if( rc==SQLITE_OK ){
- rc = pApi->xColumnLocale(pFts, iCol, &pLoc, &nLoc);
- }
- if( rc==SQLITE_OK ){
- rc = pApi->xTokenize_v2(
- pFts, ctx.zIn, ctx.nIn, pLoc, nLoc, (void*)&ctx, fts5HighlightCb
- );
- }
- if( ctx.bOpen ){
- fts5HighlightAppend(&rc, &ctx, ctx.zClose, -1);
- }
- fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff);
- if( rc==SQLITE_OK ){
- sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT);
- }
- sqlite3_free(ctx.zOut);
- }
- if( rc!=SQLITE_OK ){
- sqlite3_result_error_code(pCtx, rc);
- }
- }
- /*
- ** End of highlight() implementation.
- **************************************************************************/
- /*
- ** Context object passed to the fts5SentenceFinderCb() function.
- */
- typedef struct Fts5SFinder Fts5SFinder;
- struct Fts5SFinder {
- int iPos; /* Current token position */
- int nFirstAlloc; /* Allocated size of aFirst[] */
- int nFirst; /* Number of entries in aFirst[] */
- int *aFirst; /* Array of first token in each sentence */
- const char *zDoc; /* Document being tokenized */
- };
- /*
- ** Add an entry to the Fts5SFinder.aFirst[] array. Grow the array if
- ** necessary. Return SQLITE_OK if successful, or SQLITE_NOMEM if an
- ** error occurs.
- */
- static int fts5SentenceFinderAdd(Fts5SFinder *p, int iAdd){
- if( p->nFirstAlloc==p->nFirst ){
- int nNew = p->nFirstAlloc ? p->nFirstAlloc*2 : 64;
- int *aNew;
- aNew = (int*)sqlite3_realloc64(p->aFirst, nNew*sizeof(int));
- if( aNew==0 ) return SQLITE_NOMEM;
- p->aFirst = aNew;
- p->nFirstAlloc = nNew;
- }
- p->aFirst[p->nFirst++] = iAdd;
- return SQLITE_OK;
- }
- /*
- ** This function is an xTokenize() callback used by the auxiliary snippet()
- ** function. Its job is to identify tokens that are the first in a sentence.
- ** For each such token, an entry is added to the SFinder.aFirst[] array.
- */
- static int fts5SentenceFinderCb(
- void *pContext, /* Pointer to HighlightContext object */
- int tflags, /* Mask of FTS5_TOKEN_* flags */
- const char *pToken, /* Buffer containing token */
- int nToken, /* Size of token in bytes */
- int iStartOff, /* Start offset of token */
- int iEndOff /* End offset of token */
- ){
- int rc = SQLITE_OK;
- UNUSED_PARAM2(pToken, nToken);
- UNUSED_PARAM(iEndOff);
- if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){
- Fts5SFinder *p = (Fts5SFinder*)pContext;
- if( p->iPos>0 ){
- int i;
- char c = 0;
- for(i=iStartOff-1; i>=0; i--){
- c = p->zDoc[i];
- if( c!=' ' && c!='\t' && c!='\n' && c!='\r' ) break;
- }
- if( i!=iStartOff-1 && (c=='.' || c==':') ){
- rc = fts5SentenceFinderAdd(p, p->iPos);
- }
- }else{
- rc = fts5SentenceFinderAdd(p, 0);
- }
- p->iPos++;
- }
- return rc;
- }
- static int fts5SnippetScore(
- const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
- Fts5Context *pFts, /* First arg to pass to pApi functions */
- int nDocsize, /* Size of column in tokens */
- unsigned char *aSeen, /* Array with one element per query phrase */
- int iCol, /* Column to score */
- int iPos, /* Starting offset to score */
- int nToken, /* Max tokens per snippet */
- int *pnScore, /* OUT: Score */
- int *piPos /* OUT: Adjusted offset */
- ){
- int rc;
- int i;
- int ip = 0;
- int ic = 0;
- int iOff = 0;
- int iFirst = -1;
- int nInst;
- int nScore = 0;
- int iLast = 0;
- sqlite3_int64 iEnd = (sqlite3_int64)iPos + nToken;
- rc = pApi->xInstCount(pFts, &nInst);
- for(i=0; i<nInst && rc==SQLITE_OK; i++){
- rc = pApi->xInst(pFts, i, &ip, &ic, &iOff);
- if( rc==SQLITE_OK && ic==iCol && iOff>=iPos && iOff<iEnd ){
- nScore += (aSeen[ip] ? 1 : 1000);
- aSeen[ip] = 1;
- if( iFirst<0 ) iFirst = iOff;
- iLast = iOff + pApi->xPhraseSize(pFts, ip);
- }
- }
- *pnScore = nScore;
- if( piPos ){
- sqlite3_int64 iAdj = iFirst - (nToken - (iLast-iFirst)) / 2;
- if( (iAdj+nToken)>nDocsize ) iAdj = nDocsize - nToken;
- if( iAdj<0 ) iAdj = 0;
- *piPos = (int)iAdj;
- }
- return rc;
- }
- /*
- ** Return the value in pVal interpreted as utf-8 text. Except, if pVal
- ** contains a NULL value, return a pointer to a static string zero
- ** bytes in length instead of a NULL pointer.
- */
- static const char *fts5ValueToText(sqlite3_value *pVal){
- const char *zRet = (const char*)sqlite3_value_text(pVal);
- return zRet ? zRet : "";
- }
- /*
- ** Implementation of snippet() function.
- */
- static void fts5SnippetFunction(
- const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
- Fts5Context *pFts, /* First arg to pass to pApi functions */
- sqlite3_context *pCtx, /* Context for returning result/error */
- int nVal, /* Number of values in apVal[] array */
- sqlite3_value **apVal /* Array of trailing arguments */
- ){
- HighlightContext ctx;
- int rc = SQLITE_OK; /* Return code */
- int iCol; /* 1st argument to snippet() */
- const char *zEllips; /* 4th argument to snippet() */
- int nToken; /* 5th argument to snippet() */
- int nInst = 0; /* Number of instance matches this row */
- int i; /* Used to iterate through instances */
- int nPhrase; /* Number of phrases in query */
- unsigned char *aSeen; /* Array of "seen instance" flags */
- int iBestCol; /* Column containing best snippet */
- int iBestStart = 0; /* First token of best snippet */
- int nBestScore = 0; /* Score of best snippet */
- int nColSize = 0; /* Total size of iBestCol in tokens */
- Fts5SFinder sFinder; /* Used to find the beginnings of sentences */
- int nCol;
- if( nVal!=5 ){
- const char *zErr = "wrong number of arguments to function snippet()";
- sqlite3_result_error(pCtx, zErr, -1);
- return;
- }
- nCol = pApi->xColumnCount(pFts);
- memset(&ctx, 0, sizeof(HighlightContext));
- iCol = sqlite3_value_int(apVal[0]);
- ctx.zOpen = fts5ValueToText(apVal[1]);
- ctx.zClose = fts5ValueToText(apVal[2]);
- ctx.iRangeEnd = -1;
- zEllips = fts5ValueToText(apVal[3]);
- nToken = sqlite3_value_int(apVal[4]);
- iBestCol = (iCol>=0 ? iCol : 0);
- nPhrase = pApi->xPhraseCount(pFts);
- aSeen = sqlite3_malloc(nPhrase);
- if( aSeen==0 ){
- rc = SQLITE_NOMEM;
- }
- if( rc==SQLITE_OK ){
- rc = pApi->xInstCount(pFts, &nInst);
- }
- memset(&sFinder, 0, sizeof(Fts5SFinder));
- for(i=0; i<nCol; i++){
- if( iCol<0 || iCol==i ){
- const char *pLoc = 0; /* Locale of column iCol */
- int nLoc = 0; /* Size of pLoc in bytes */
- int nDoc;
- int nDocsize;
- int ii;
- sFinder.iPos = 0;
- sFinder.nFirst = 0;
- rc = pApi->xColumnText(pFts, i, &sFinder.zDoc, &nDoc);
- if( rc!=SQLITE_OK ) break;
- rc = pApi->xColumnLocale(pFts, i, &pLoc, &nLoc);
- if( rc!=SQLITE_OK ) break;
- rc = pApi->xTokenize_v2(pFts,
- sFinder.zDoc, nDoc, pLoc, nLoc, (void*)&sFinder, fts5SentenceFinderCb
- );
- if( rc!=SQLITE_OK ) break;
- rc = pApi->xColumnSize(pFts, i, &nDocsize);
- if( rc!=SQLITE_OK ) break;
- for(ii=0; rc==SQLITE_OK && ii<nInst; ii++){
- int ip, ic, io;
- int iAdj;
- int nScore;
- int jj;
- rc = pApi->xInst(pFts, ii, &ip, &ic, &io);
- if( ic!=i ) continue;
- if( io>nDocsize ) rc = FTS5_CORRUPT;
- if( rc!=SQLITE_OK ) continue;
- memset(aSeen, 0, nPhrase);
- rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i,
- io, nToken, &nScore, &iAdj
- );
- if( rc==SQLITE_OK && nScore>nBestScore ){
- nBestScore = nScore;
- iBestCol = i;
- iBestStart = iAdj;
- nColSize = nDocsize;
- }
- if( rc==SQLITE_OK && sFinder.nFirst && nDocsize>nToken ){
- for(jj=0; jj<(sFinder.nFirst-1); jj++){
- if( sFinder.aFirst[jj+1]>io ) break;
- }
- if( sFinder.aFirst[jj]<io ){
- memset(aSeen, 0, nPhrase);
- rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i,
- sFinder.aFirst[jj], nToken, &nScore, 0
- );
- nScore += (sFinder.aFirst[jj]==0 ? 120 : 100);
- if( rc==SQLITE_OK && nScore>nBestScore ){
- nBestScore = nScore;
- iBestCol = i;
- iBestStart = sFinder.aFirst[jj];
- nColSize = nDocsize;
- }
- }
- }
- }
- }
- }
- if( rc==SQLITE_OK ){
- rc = pApi->xColumnText(pFts, iBestCol, &ctx.zIn, &ctx.nIn);
- }
- if( rc==SQLITE_OK && nColSize==0 ){
- rc = pApi->xColumnSize(pFts, iBestCol, &nColSize);
- }
- if( ctx.zIn ){
- const char *pLoc = 0; /* Locale of column iBestCol */
- int nLoc = 0; /* Bytes in pLoc */
- if( rc==SQLITE_OK ){
- rc = fts5CInstIterInit(pApi, pFts, iBestCol, &ctx.iter);
- }
- ctx.iRangeStart = iBestStart;
- ctx.iRangeEnd = iBestStart + nToken - 1;
- if( iBestStart>0 ){
- fts5HighlightAppend(&rc, &ctx, zEllips, -1);
- }
- /* Advance iterator ctx.iter so that it points to the first coalesced
- ** phrase instance at or following position iBestStart. */
- while( ctx.iter.iStart>=0 && ctx.iter.iStart<iBestStart && rc==SQLITE_OK ){
- rc = fts5CInstIterNext(&ctx.iter);
- }
- if( rc==SQLITE_OK ){
- rc = pApi->xColumnLocale(pFts, iBestCol, &pLoc, &nLoc);
- }
- if( rc==SQLITE_OK ){
- rc = pApi->xTokenize_v2(
- pFts, ctx.zIn, ctx.nIn, pLoc, nLoc, (void*)&ctx,fts5HighlightCb
- );
- }
- if( ctx.bOpen ){
- fts5HighlightAppend(&rc, &ctx, ctx.zClose, -1);
- }
- if( ctx.iRangeEnd>=(nColSize-1) ){
- fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff);
- }else{
- fts5HighlightAppend(&rc, &ctx, zEllips, -1);
- }
- }
- if( rc==SQLITE_OK ){
- sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT);
- }else{
- sqlite3_result_error_code(pCtx, rc);
- }
- sqlite3_free(ctx.zOut);
- sqlite3_free(aSeen);
- sqlite3_free(sFinder.aFirst);
- }
- /************************************************************************/
- /*
- ** The first time the bm25() function is called for a query, an instance
- ** of the following structure is allocated and populated.
- */
- typedef struct Fts5Bm25Data Fts5Bm25Data;
- struct Fts5Bm25Data {
- int nPhrase; /* Number of phrases in query */
- double avgdl; /* Average number of tokens in each row */
- double *aIDF; /* IDF for each phrase */
- double *aFreq; /* Array used to calculate phrase freq. */
- };
- /*
- ** Callback used by fts5Bm25GetData() to count the number of rows in the
- ** table matched by each individual phrase within the query.
- */
- static int fts5CountCb(
- const Fts5ExtensionApi *pApi,
- Fts5Context *pFts,
- void *pUserData /* Pointer to sqlite3_int64 variable */
- ){
- sqlite3_int64 *pn = (sqlite3_int64*)pUserData;
- UNUSED_PARAM2(pApi, pFts);
- (*pn)++;
- return SQLITE_OK;
- }
- /*
- ** Set *ppData to point to the Fts5Bm25Data object for the current query.
- ** If the object has not already been allocated, allocate and populate it
- ** now.
- */
- static int fts5Bm25GetData(
- const Fts5ExtensionApi *pApi,
- Fts5Context *pFts,
- Fts5Bm25Data **ppData /* OUT: bm25-data object for this query */
- ){
- int rc = SQLITE_OK; /* Return code */
- Fts5Bm25Data *p; /* Object to return */
- p = (Fts5Bm25Data*)pApi->xGetAuxdata(pFts, 0);
- if( p==0 ){
- int nPhrase; /* Number of phrases in query */
- sqlite3_int64 nRow = 0; /* Number of rows in table */
- sqlite3_int64 nToken = 0; /* Number of tokens in table */
- sqlite3_int64 nByte; /* Bytes of space to allocate */
- int i;
- /* Allocate the Fts5Bm25Data object */
- nPhrase = pApi->xPhraseCount(pFts);
- nByte = sizeof(Fts5Bm25Data) + nPhrase*2*sizeof(double);
- p = (Fts5Bm25Data*)sqlite3_malloc64(nByte);
- if( p==0 ){
- rc = SQLITE_NOMEM;
- }else{
- memset(p, 0, (size_t)nByte);
- p->nPhrase = nPhrase;
- p->aIDF = (double*)&p[1];
- p->aFreq = &p->aIDF[nPhrase];
- }
- /* Calculate the average document length for this FTS5 table */
- if( rc==SQLITE_OK ) rc = pApi->xRowCount(pFts, &nRow);
- assert( rc!=SQLITE_OK || nRow>0 );
- if( rc==SQLITE_OK ) rc = pApi->xColumnTotalSize(pFts, -1, &nToken);
- if( rc==SQLITE_OK ) p->avgdl = (double)nToken / (double)nRow;
- /* Calculate an IDF for each phrase in the query */
- for(i=0; rc==SQLITE_OK && i<nPhrase; i++){
- sqlite3_int64 nHit = 0;
- rc = pApi->xQueryPhrase(pFts, i, (void*)&nHit, fts5CountCb);
- if( rc==SQLITE_OK ){
- /* Calculate the IDF (Inverse Document Frequency) for phrase i.
- ** This is done using the standard BM25 formula as found on wikipedia:
- **
- ** IDF = log( (N - nHit + 0.5) / (nHit + 0.5) )
- **
- ** where "N" is the total number of documents in the set and nHit
- ** is the number that contain at least one instance of the phrase
- ** under consideration.
- **
- ** The problem with this is that if (N < 2*nHit), the IDF is
- ** negative. Which is undesirable. So the mimimum allowable IDF is
- ** (1e-6) - roughly the same as a term that appears in just over
- ** half of set of 5,000,000 documents. */
- double idf = log( (nRow - nHit + 0.5) / (nHit + 0.5) );
- if( idf<=0.0 ) idf = 1e-6;
- p->aIDF[i] = idf;
- }
- }
- if( rc!=SQLITE_OK ){
- sqlite3_free(p);
- }else{
- rc = pApi->xSetAuxdata(pFts, p, sqlite3_free);
- }
- if( rc!=SQLITE_OK ) p = 0;
- }
- *ppData = p;
- return rc;
- }
- /*
- ** Implementation of bm25() function.
- */
- static void fts5Bm25Function(
- const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
- Fts5Context *pFts, /* First arg to pass to pApi functions */
- sqlite3_context *pCtx, /* Context for returning result/error */
- int nVal, /* Number of values in apVal[] array */
- sqlite3_value **apVal /* Array of trailing arguments */
- ){
- const double k1 = 1.2; /* Constant "k1" from BM25 formula */
- const double b = 0.75; /* Constant "b" from BM25 formula */
- int rc; /* Error code */
- double score = 0.0; /* SQL function return value */
- Fts5Bm25Data *pData; /* Values allocated/calculated once only */
- int i; /* Iterator variable */
- int nInst = 0; /* Value returned by xInstCount() */
- double D = 0.0; /* Total number of tokens in row */
- double *aFreq = 0; /* Array of phrase freq. for current row */
- /* Calculate the phrase frequency (symbol "f(qi,D)" in the documentation)
- ** for each phrase in the query for the current row. */
- rc = fts5Bm25GetData(pApi, pFts, &pData);
- if( rc==SQLITE_OK ){
- aFreq = pData->aFreq;
- memset(aFreq, 0, sizeof(double) * pData->nPhrase);
- rc = pApi->xInstCount(pFts, &nInst);
- }
- for(i=0; rc==SQLITE_OK && i<nInst; i++){
- int ip; int ic; int io;
- rc = pApi->xInst(pFts, i, &ip, &ic, &io);
- if( rc==SQLITE_OK ){
- double w = (nVal > ic) ? sqlite3_value_double(apVal[ic]) : 1.0;
- aFreq[ip] += w;
- }
- }
- /* Figure out the total size of the current row in tokens. */
- if( rc==SQLITE_OK ){
- int nTok;
- rc = pApi->xColumnSize(pFts, -1, &nTok);
- D = (double)nTok;
- }
- /* Determine and return the BM25 score for the current row. Or, if an
- ** error has occurred, throw an exception. */
- if( rc==SQLITE_OK ){
- for(i=0; i<pData->nPhrase; i++){
- score += pData->aIDF[i] * (
- ( aFreq[i] * (k1 + 1.0) ) /
- ( aFreq[i] + k1 * (1 - b + b * D / pData->avgdl) )
- );
- }
- sqlite3_result_double(pCtx, -1.0 * score);
- }else{
- sqlite3_result_error_code(pCtx, rc);
- }
- }
- /*
- ** Implementation of fts5_get_locale() function.
- */
- static void fts5GetLocaleFunction(
- const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
- Fts5Context *pFts, /* First arg to pass to pApi functions */
- sqlite3_context *pCtx, /* Context for returning result/error */
- int nVal, /* Number of values in apVal[] array */
- sqlite3_value **apVal /* Array of trailing arguments */
- ){
- int iCol = 0;
- int eType = 0;
- int rc = SQLITE_OK;
- const char *zLocale = 0;
- int nLocale = 0;
- /* xColumnLocale() must be available */
- assert( pApi->iVersion>=4 );
- if( nVal!=1 ){
- const char *z = "wrong number of arguments to function fts5_get_locale()";
- sqlite3_result_error(pCtx, z, -1);
- return;
- }
- eType = sqlite3_value_numeric_type(apVal[0]);
- if( eType!=SQLITE_INTEGER ){
- const char *z = "non-integer argument passed to function fts5_get_locale()";
- sqlite3_result_error(pCtx, z, -1);
- return;
- }
- iCol = sqlite3_value_int(apVal[0]);
- if( iCol<0 || iCol>=pApi->xColumnCount(pFts) ){
- sqlite3_result_error_code(pCtx, SQLITE_RANGE);
- return;
- }
- rc = pApi->xColumnLocale(pFts, iCol, &zLocale, &nLocale);
- if( rc!=SQLITE_OK ){
- sqlite3_result_error_code(pCtx, rc);
- return;
- }
- sqlite3_result_text(pCtx, zLocale, nLocale, SQLITE_TRANSIENT);
- }
- int sqlite3Fts5AuxInit(fts5_api *pApi){
- struct Builtin {
- const char *zFunc; /* Function name (nul-terminated) */
- void *pUserData; /* User-data pointer */
- fts5_extension_function xFunc;/* Callback function */
- void (*xDestroy)(void*); /* Destructor function */
- } aBuiltin [] = {
- { "snippet", 0, fts5SnippetFunction, 0 },
- { "highlight", 0, fts5HighlightFunction, 0 },
- { "bm25", 0, fts5Bm25Function, 0 },
- { "fts5_get_locale", 0, fts5GetLocaleFunction, 0 },
- };
- int rc = SQLITE_OK; /* Return code */
- int i; /* To iterate through builtin functions */
- for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
- rc = pApi->xCreateFunction(pApi,
- aBuiltin[i].zFunc,
- aBuiltin[i].pUserData,
- aBuiltin[i].xFunc,
- aBuiltin[i].xDestroy
- );
- }
- return rc;
- }
|