123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621 |
- /*
- ** 2011 Jun 13
- **
- ** The author disclaims copyright to this source code. In place of
- ** a legal notice, here is a blessing:
- **
- ** May you do good and not evil.
- ** May you find forgiveness for yourself and forgive others.
- ** May you share freely, never taking more than you give.
- **
- ******************************************************************************
- **
- ** This file is not part of the production FTS code. It is only used for
- ** testing. It contains a Tcl command that can be used to test if a document
- ** matches an FTS NEAR expression.
- **
- ** As of March 2012, it also contains a version 1 tokenizer used for testing
- ** that the sqlite3_tokenizer_module.xLanguage() method is invoked correctly.
- */
- #include "tclsqlite.h"
- #include <string.h>
- #include <assert.h>
- #if defined(SQLITE_TEST)
- #if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)
- /* Required so that the "ifdef SQLITE_ENABLE_FTS3" below works */
- #include "fts3Int.h"
- #define NM_MAX_TOKEN 12
- typedef struct NearPhrase NearPhrase;
- typedef struct NearDocument NearDocument;
- typedef struct NearToken NearToken;
- struct NearDocument {
- int nToken; /* Length of token in bytes */
- NearToken *aToken; /* Token array */
- };
- struct NearToken {
- int n; /* Length of token in bytes */
- const char *z; /* Pointer to token string */
- };
- struct NearPhrase {
- int nNear; /* Preceding NEAR value */
- int nToken; /* Number of tokens in this phrase */
- NearToken aToken[NM_MAX_TOKEN]; /* Array of tokens in this phrase */
- };
- static int nm_phrase_match(
- NearPhrase *p,
- NearToken *aToken
- ){
- int ii;
- for(ii=0; ii<p->nToken; ii++){
- NearToken *pToken = &p->aToken[ii];
- if( pToken->n>0 && pToken->z[pToken->n-1]=='*' ){
- if( aToken[ii].n<(pToken->n-1) ) return 0;
- if( memcmp(aToken[ii].z, pToken->z, pToken->n-1) ) return 0;
- }else{
- if( aToken[ii].n!=pToken->n ) return 0;
- if( memcmp(aToken[ii].z, pToken->z, pToken->n) ) return 0;
- }
- }
- return 1;
- }
- static int nm_near_chain(
- int iDir, /* Direction to iterate through aPhrase[] */
- NearDocument *pDoc, /* Document to match against */
- int iPos, /* Position at which iPhrase was found */
- int nPhrase, /* Size of phrase array */
- NearPhrase *aPhrase, /* Phrase array */
- int iPhrase /* Index of phrase found */
- ){
- int iStart;
- int iStop;
- int ii;
- int nNear;
- int iPhrase2;
- NearPhrase *p;
- NearPhrase *pPrev;
- assert( iDir==1 || iDir==-1 );
- if( iDir==1 ){
- if( (iPhrase+1)==nPhrase ) return 1;
- nNear = aPhrase[iPhrase+1].nNear;
- }else{
- if( iPhrase==0 ) return 1;
- nNear = aPhrase[iPhrase].nNear;
- }
- pPrev = &aPhrase[iPhrase];
- iPhrase2 = iPhrase+iDir;
- p = &aPhrase[iPhrase2];
- iStart = iPos - nNear - p->nToken;
- iStop = iPos + nNear + pPrev->nToken;
- if( iStart<0 ) iStart = 0;
- if( iStop > pDoc->nToken - p->nToken ) iStop = pDoc->nToken - p->nToken;
- for(ii=iStart; ii<=iStop; ii++){
- if( nm_phrase_match(p, &pDoc->aToken[ii]) ){
- if( nm_near_chain(iDir, pDoc, ii, nPhrase, aPhrase, iPhrase2) ) return 1;
- }
- }
- return 0;
- }
- static int nm_match_count(
- NearDocument *pDoc, /* Document to match against */
- int nPhrase, /* Size of phrase array */
- NearPhrase *aPhrase, /* Phrase array */
- int iPhrase /* Index of phrase to count matches for */
- ){
- int nOcc = 0;
- int ii;
- NearPhrase *p = &aPhrase[iPhrase];
- for(ii=0; ii<(pDoc->nToken + 1 - p->nToken); ii++){
- if( nm_phrase_match(p, &pDoc->aToken[ii]) ){
- /* Test forward NEAR chain (i>iPhrase) */
- if( 0==nm_near_chain(1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue;
- /* Test reverse NEAR chain (i<iPhrase) */
- if( 0==nm_near_chain(-1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue;
- /* This is a real match. Increment the counter. */
- nOcc++;
- }
- }
- return nOcc;
- }
- /*
- ** Tclcmd: fts3_near_match DOCUMENT EXPR ?OPTIONS?
- */
- static int SQLITE_TCLAPI fts3_near_match_cmd(
- ClientData clientData,
- Tcl_Interp *interp,
- int objc,
- Tcl_Obj *CONST objv[]
- ){
- int nTotal = 0;
- int rc;
- int ii;
- int nPhrase;
- NearPhrase *aPhrase = 0;
- NearDocument doc = {0, 0};
- Tcl_Obj **apDocToken;
- Tcl_Obj *pRet;
- Tcl_Obj *pPhrasecount = 0;
-
- Tcl_Obj **apExprToken;
- Tcl_Size nExprToken;
- Tcl_Size nn;
- UNUSED_PARAMETER(clientData);
- /* Must have 3 or more arguments. */
- if( objc<3 || (objc%2)==0 ){
- Tcl_WrongNumArgs(interp, 1, objv, "DOCUMENT EXPR ?OPTION VALUE?...");
- rc = TCL_ERROR;
- goto near_match_out;
- }
- for(ii=3; ii<objc; ii+=2){
- enum NM_enum { NM_PHRASECOUNTS };
- struct TestnmSubcmd {
- char *zName;
- enum NM_enum eOpt;
- } aOpt[] = {
- { "-phrasecountvar", NM_PHRASECOUNTS },
- { 0, 0 }
- };
- int iOpt;
- if( Tcl_GetIndexFromObjStruct(
- interp, objv[ii], aOpt, sizeof(aOpt[0]), "option", 0, &iOpt)
- ){
- return TCL_ERROR;
- }
- switch( aOpt[iOpt].eOpt ){
- case NM_PHRASECOUNTS:
- pPhrasecount = objv[ii+1];
- break;
- }
- }
- rc = Tcl_ListObjGetElements(interp, objv[1], &nn, &apDocToken);
- doc.nToken = (int)nn;
- if( rc!=TCL_OK ) goto near_match_out;
- doc.aToken = (NearToken *)ckalloc(doc.nToken*sizeof(NearToken));
- for(ii=0; ii<doc.nToken; ii++){
- doc.aToken[ii].z = Tcl_GetStringFromObj(apDocToken[ii], &nn);
- doc.aToken[ii].n = (int)nn;
- }
- rc = Tcl_ListObjGetElements(interp, objv[2], &nExprToken, &apExprToken);
- if( rc!=TCL_OK ) goto near_match_out;
- nPhrase = (int)(nExprToken + 1) / 2;
- aPhrase = (NearPhrase *)ckalloc(nPhrase * sizeof(NearPhrase));
- memset(aPhrase, 0, nPhrase * sizeof(NearPhrase));
- for(ii=0; ii<nPhrase; ii++){
- Tcl_Obj *pPhrase = apExprToken[ii*2];
- Tcl_Obj **apToken;
- Tcl_Size nToken;
- int jj;
- rc = Tcl_ListObjGetElements(interp, pPhrase, &nToken, &apToken);
- if( rc!=TCL_OK ) goto near_match_out;
- if( nToken>NM_MAX_TOKEN ){
- Tcl_AppendResult(interp, "Too many tokens in phrase", 0);
- rc = TCL_ERROR;
- goto near_match_out;
- }
- for(jj=0; jj<(int)nToken; jj++){
- NearToken *pT = &aPhrase[ii].aToken[jj];
- pT->z = Tcl_GetStringFromObj(apToken[jj], &nn);
- pT->n = (int)nn;
- }
- aPhrase[ii].nToken = (int)nToken;
- }
- for(ii=1; ii<nPhrase; ii++){
- Tcl_Obj *pNear = apExprToken[2*ii-1];
- int nNear;
- rc = Tcl_GetIntFromObj(interp, pNear, &nNear);
- if( rc!=TCL_OK ) goto near_match_out;
- aPhrase[ii].nNear = nNear;
- }
- pRet = Tcl_NewObj();
- Tcl_IncrRefCount(pRet);
- for(ii=0; ii<nPhrase; ii++){
- int nOcc = nm_match_count(&doc, nPhrase, aPhrase, ii);
- Tcl_ListObjAppendElement(interp, pRet, Tcl_NewIntObj(nOcc));
- nTotal += nOcc;
- }
- if( pPhrasecount ){
- Tcl_ObjSetVar2(interp, pPhrasecount, 0, pRet, 0);
- }
- Tcl_DecrRefCount(pRet);
- Tcl_SetObjResult(interp, Tcl_NewBooleanObj(nTotal>0));
- near_match_out:
- ckfree((char *)aPhrase);
- ckfree((char *)doc.aToken);
- return rc;
- }
- /*
- ** Tclcmd: fts3_configure_incr_load ?CHUNKSIZE THRESHOLD?
- **
- ** Normally, FTS uses hard-coded values to determine the minimum doclist
- ** size eligible for incremental loading, and the size of the chunks loaded
- ** when a doclist is incrementally loaded. This command allows the built-in
- ** values to be overridden for testing purposes.
- **
- ** If present, the first argument is the chunksize in bytes to load doclists
- ** in. The second argument is the minimum doclist size in bytes to use
- ** incremental loading with.
- **
- ** Whether or not the arguments are present, this command returns a list of
- ** two integers - the initial chunksize and threshold when the command is
- ** invoked. This can be used to restore the default behavior after running
- ** tests. For example:
- **
- ** # Override incr-load settings for testing:
- ** set cfg [fts3_configure_incr_load $new_chunksize $new_threshold]
- **
- ** .... run tests ....
- **
- ** # Restore initial incr-load settings:
- ** eval fts3_configure_incr_load $cfg
- */
- static int SQLITE_TCLAPI fts3_configure_incr_load_cmd(
- ClientData clientData,
- Tcl_Interp *interp,
- int objc,
- Tcl_Obj *CONST objv[]
- ){
- #ifdef SQLITE_ENABLE_FTS3
- extern int test_fts3_node_chunksize;
- extern int test_fts3_node_chunk_threshold;
- Tcl_Obj *pRet;
- if( objc!=1 && objc!=3 ){
- Tcl_WrongNumArgs(interp, 1, objv, "?CHUNKSIZE THRESHOLD?");
- return TCL_ERROR;
- }
- pRet = Tcl_NewObj();
- Tcl_IncrRefCount(pRet);
- Tcl_ListObjAppendElement(
- interp, pRet, Tcl_NewIntObj(test_fts3_node_chunksize));
- Tcl_ListObjAppendElement(
- interp, pRet, Tcl_NewIntObj(test_fts3_node_chunk_threshold));
- if( objc==3 ){
- int iArg1;
- int iArg2;
- if( Tcl_GetIntFromObj(interp, objv[1], &iArg1)
- || Tcl_GetIntFromObj(interp, objv[2], &iArg2)
- ){
- Tcl_DecrRefCount(pRet);
- return TCL_ERROR;
- }
- test_fts3_node_chunksize = iArg1;
- test_fts3_node_chunk_threshold = iArg2;
- }
- Tcl_SetObjResult(interp, pRet);
- Tcl_DecrRefCount(pRet);
- #endif
- UNUSED_PARAMETER(clientData);
- return TCL_OK;
- }
- #ifdef SQLITE_ENABLE_FTS3
- /**************************************************************************
- ** Beginning of test tokenizer code.
- **
- ** For language 0, this tokenizer is similar to the default 'simple'
- ** tokenizer. For other languages L, the following:
- **
- ** * Odd numbered languages are case-sensitive. Even numbered
- ** languages are not.
- **
- ** * Language ids 100 or greater are considered an error.
- **
- ** The implementation assumes that the input contains only ASCII characters
- ** (i.e. those that may be encoded in UTF-8 using a single byte).
- */
- typedef struct test_tokenizer {
- sqlite3_tokenizer base;
- } test_tokenizer;
- typedef struct test_tokenizer_cursor {
- sqlite3_tokenizer_cursor base;
- const char *aInput; /* Input being tokenized */
- int nInput; /* Size of the input in bytes */
- int iInput; /* Current offset in aInput */
- int iToken; /* Index of next token to be returned */
- char *aBuffer; /* Buffer containing current token */
- int nBuffer; /* Number of bytes allocated at pToken */
- int iLangid; /* Configured language id */
- } test_tokenizer_cursor;
- static int testTokenizerCreate(
- int argc, const char * const *argv,
- sqlite3_tokenizer **ppTokenizer
- ){
- test_tokenizer *pNew;
- UNUSED_PARAMETER(argc);
- UNUSED_PARAMETER(argv);
- pNew = sqlite3_malloc(sizeof(test_tokenizer));
- if( !pNew ) return SQLITE_NOMEM;
- memset(pNew, 0, sizeof(test_tokenizer));
- *ppTokenizer = (sqlite3_tokenizer *)pNew;
- return SQLITE_OK;
- }
- static int testTokenizerDestroy(sqlite3_tokenizer *pTokenizer){
- test_tokenizer *p = (test_tokenizer *)pTokenizer;
- sqlite3_free(p);
- return SQLITE_OK;
- }
- static int testTokenizerOpen(
- sqlite3_tokenizer *pTokenizer, /* The tokenizer */
- const char *pInput, int nBytes, /* String to be tokenized */
- sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
- ){
- int rc = SQLITE_OK; /* Return code */
- test_tokenizer_cursor *pCsr; /* New cursor object */
- UNUSED_PARAMETER(pTokenizer);
- pCsr = (test_tokenizer_cursor *)sqlite3_malloc(sizeof(test_tokenizer_cursor));
- if( pCsr==0 ){
- rc = SQLITE_NOMEM;
- }else{
- memset(pCsr, 0, sizeof(test_tokenizer_cursor));
- pCsr->aInput = pInput;
- if( nBytes<0 ){
- pCsr->nInput = (int)strlen(pInput);
- }else{
- pCsr->nInput = nBytes;
- }
- }
- *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
- return rc;
- }
- static int testTokenizerClose(sqlite3_tokenizer_cursor *pCursor){
- test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
- sqlite3_free(pCsr->aBuffer);
- sqlite3_free(pCsr);
- return SQLITE_OK;
- }
- static int testIsTokenChar(char c){
- return (c>='a' && c<='z') || (c>='A' && c<='Z');
- }
- static int testTolower(char c){
- char ret = c;
- if( ret>='A' && ret<='Z') ret = ret - ('A'-'a');
- return ret;
- }
- static int testTokenizerNext(
- sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by testTokenizerOpen */
- const char **ppToken, /* OUT: *ppToken is the token text */
- int *pnBytes, /* OUT: Number of bytes in token */
- int *piStartOffset, /* OUT: Starting offset of token */
- int *piEndOffset, /* OUT: Ending offset of token */
- int *piPosition /* OUT: Position integer of token */
- ){
- test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
- int rc = SQLITE_OK;
- const char *p;
- const char *pEnd;
- p = &pCsr->aInput[pCsr->iInput];
- pEnd = &pCsr->aInput[pCsr->nInput];
- /* Skip past any white-space */
- assert( p<=pEnd );
- while( p<pEnd && testIsTokenChar(*p)==0 ) p++;
- if( p==pEnd ){
- rc = SQLITE_DONE;
- }else{
- /* Advance to the end of the token */
- const char *pToken = p;
- sqlite3_int64 nToken;
- while( p<pEnd && testIsTokenChar(*p) ) p++;
- nToken = (sqlite3_int64)(p-pToken);
- /* Copy the token into the buffer */
- if( nToken>pCsr->nBuffer ){
- sqlite3_free(pCsr->aBuffer);
- pCsr->aBuffer = sqlite3_malloc64(nToken);
- }
- if( pCsr->aBuffer==0 ){
- rc = SQLITE_NOMEM;
- }else{
- int i;
- if( pCsr->iLangid & 0x00000001 ){
- for(i=0; i<nToken; i++) pCsr->aBuffer[i] = pToken[i];
- }else{
- for(i=0; i<nToken; i++) pCsr->aBuffer[i] = (char)testTolower(pToken[i]);
- }
- pCsr->iToken++;
- pCsr->iInput = (int)(p - pCsr->aInput);
- *ppToken = pCsr->aBuffer;
- *pnBytes = (int)nToken;
- *piStartOffset = (int)(pToken - pCsr->aInput);
- *piEndOffset = (int)(p - pCsr->aInput);
- *piPosition = pCsr->iToken;
- }
- }
- return rc;
- }
- static int testTokenizerLanguage(
- sqlite3_tokenizer_cursor *pCursor,
- int iLangid
- ){
- int rc = SQLITE_OK;
- test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
- pCsr->iLangid = iLangid;
- if( pCsr->iLangid>=100 ){
- rc = SQLITE_ERROR;
- }
- return rc;
- }
- #endif
- static int SQLITE_TCLAPI fts3_test_tokenizer_cmd(
- ClientData clientData,
- Tcl_Interp *interp,
- int objc,
- Tcl_Obj *CONST objv[]
- ){
- #ifdef SQLITE_ENABLE_FTS3
- static const sqlite3_tokenizer_module testTokenizerModule = {
- 1,
- testTokenizerCreate,
- testTokenizerDestroy,
- testTokenizerOpen,
- testTokenizerClose,
- testTokenizerNext,
- testTokenizerLanguage
- };
- const sqlite3_tokenizer_module *pPtr = &testTokenizerModule;
- if( objc!=1 ){
- Tcl_WrongNumArgs(interp, 1, objv, "");
- return TCL_ERROR;
- }
- Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(
- (const unsigned char *)&pPtr, sizeof(sqlite3_tokenizer_module *)
- ));
- #endif
- UNUSED_PARAMETER(clientData);
- return TCL_OK;
- }
- static int SQLITE_TCLAPI fts3_test_varint_cmd(
- ClientData clientData,
- Tcl_Interp *interp,
- int objc,
- Tcl_Obj *CONST objv[]
- ){
- #ifdef SQLITE_ENABLE_FTS3
- char aBuf[24];
- int rc;
- Tcl_WideInt w;
- sqlite3_int64 w2;
- int nByte, nByte2;
- if( objc!=2 ){
- Tcl_WrongNumArgs(interp, 1, objv, "INTEGER");
- return TCL_ERROR;
- }
- rc = Tcl_GetWideIntFromObj(interp, objv[1], &w);
- if( rc!=TCL_OK ) return rc;
- nByte = sqlite3Fts3PutVarint(aBuf, w);
- nByte2 = sqlite3Fts3GetVarint(aBuf, &w2);
- if( w!=w2 || nByte!=nByte2 ){
- char *zErr = sqlite3_mprintf("error testing %lld", w);
- Tcl_ResetResult(interp);
- Tcl_AppendResult(interp, zErr, 0);
- return TCL_ERROR;
- }
- if( w<=2147483647 && w>=0 ){
- int i;
- nByte2 = fts3GetVarint32(aBuf, &i);
- if( (int)w!=i || nByte!=nByte2 ){
- char *zErr = sqlite3_mprintf("error testing %lld (32-bit)", w);
- Tcl_ResetResult(interp);
- Tcl_AppendResult(interp, zErr, 0);
- return TCL_ERROR;
- }
- }
- #endif
- UNUSED_PARAMETER(clientData);
- return TCL_OK;
- }
- /*
- ** End of tokenizer code.
- **************************************************************************/
- /*
- ** sqlite3_fts3_may_be_corrupt BOOLEAN
- **
- ** Set or clear the global "may-be-corrupt" flag. Return the old value.
- */
- static int SQLITE_TCLAPI fts3_may_be_corrupt(
- void * clientData,
- Tcl_Interp *interp,
- int objc,
- Tcl_Obj *CONST objv[]
- ){
- #ifdef SQLITE_DEBUG
- int bOld = sqlite3_fts3_may_be_corrupt;
- if( objc!=2 && objc!=1 ){
- Tcl_WrongNumArgs(interp, 1, objv, "?BOOLEAN?");
- return TCL_ERROR;
- }
- if( objc==2 ){
- int bNew;
- if( Tcl_GetBooleanFromObj(interp, objv[1], &bNew) ) return TCL_ERROR;
- sqlite3_fts3_may_be_corrupt = bNew;
- }
- Tcl_SetObjResult(interp, Tcl_NewIntObj(bOld));
- #endif
- return TCL_OK;
- }
- int Sqlitetestfts3_Init(Tcl_Interp *interp){
- Tcl_CreateObjCommand(interp, "fts3_near_match", fts3_near_match_cmd, 0, 0);
- Tcl_CreateObjCommand(interp,
- "fts3_configure_incr_load", fts3_configure_incr_load_cmd, 0, 0
- );
- Tcl_CreateObjCommand(
- interp, "fts3_test_tokenizer", fts3_test_tokenizer_cmd, 0, 0
- );
- Tcl_CreateObjCommand(
- interp, "fts3_test_varint", fts3_test_varint_cmd, 0, 0
- );
- Tcl_CreateObjCommand(
- interp, "sqlite3_fts3_may_be_corrupt", fts3_may_be_corrupt, 0, 0
- );
- return TCL_OK;
- }
- #endif /* SQLITE_ENABLE_FTS3 || SQLITE_ENABLE_FTS4 */
- #endif /* ifdef SQLITE_TEST */
|