123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375 |
- # 2014 Dec 20
- #
- # The author disclaims copyright to this source code. In place of
- # a legal notice, here is a blessing:
- #
- # May you do good and not evil.
- # May you find forgiveness for yourself and forgive others.
- # May you share freely, never taking more than you give.
- #
- #***********************************************************************
- #
- # Tests focusing on the built-in fts5 tokenizers.
- #
- # Load fts5_common.tcl from the same directory as this script.
- source [file join [file dirname [info script]] fts5_common.tcl]
- # NOTE(review): presumably the harness prepends this prefix to every test
- # name in the file - confirm against tester.tcl.
- set testprefix fts5tokenizer
- # If SQLITE_ENABLE_FTS5 is not defined, omit this file.
- ifcapable !fts5 {
- finish_test
- return
- }
- # 1.0 - 1.4: The tokenize= option is accepted with the tokenizer name
- # unquoted or single-quoted, with or without whitespace around the '=',
- # and with a second name following "porter". Each test drops ft1 again so
- # that the next one can reuse the table name.
- do_execsql_test 1.0 {
- CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
- DROP TABLE ft1;
- }
- do_execsql_test 1.1 {
- CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize='porter');
- DROP TABLE ft1;
- }
- do_execsql_test 1.2 {
- CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = porter);
- DROP TABLE ft1;
- }
- do_execsql_test 1.3 {
- CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter');
- DROP TABLE ft1;
- }
- do_execsql_test 1.4 {
- CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter ascii');
- DROP TABLE ft1;
- }
- # 1.5: An unknown tokenizer name is reported as "no such tokenizer".
- do_catchsql_test 1.5 {
- CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'nosuch');
- } {1 {no such tokenizer: nosuch}}
- # 1.6: An unknown name following "porter" is reported as a constructor
- # error rather than as "no such tokenizer".
- do_catchsql_test 1.6 {
- CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter nosuch');
- } {1 {error in tokenizer constructor}}
- # 2.*: Index the text 'embedded databases' with the porter tokenizer, then
- # check that the differently inflected query terms 'embedding' and
- # 'database' (alone and combined) all match row 1.
- do_execsql_test 2.0 {
- CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
- INSERT INTO ft1 VALUES('embedded databases');
- }
- do_execsql_test 2.1 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'embedding' } 1
- do_execsql_test 2.2 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'database' } 1
- do_execsql_test 2.3 {
- SELECT rowid FROM ft1 WHERE ft1 MATCH 'database embedding'
- } 1
- # Tokenizer constructor that always fails. Before raising the error it
- # stores the arguments it was called with in the global variable targs,
- # so that a test can inspect them afterwards.
- proc tcl_create {args} {
-   global targs
-   set targs $args
-   error "failed"
- }
- # NOTE(review): presumably registers the Tcl proc tcl_create as the
- # constructor of a tokenizer named "tcl" on connection db - confirm against
- # the test harness.
- sqlite3_fts5_create_tokenizer db tcl tcl_create
- # 3.*: For each quoting style of the tokenize directive, creating the table
- # must fail with a constructor error (tcl_create always raises an error),
- # and ::targs must then hold the arguments that followed the tokenizer
- # name (an empty list when there were none).
- foreach {tn directive expected} {
- 1 {tokenize='tcl a b c'} {a b c}
- 2 {tokenize='tcl ''d'' ''e'' ''f'''} {d e f}
- 3 {tokenize="tcl 'g' 'h' 'i'"} {g h i}
- 4 {tokenize = tcl} {}
- } {
- do_catchsql_test 3.$tn.1 "
- CREATE VIRTUAL TABLE ft2 USING fts5(x, $directive)
- " {1 {error in tokenizer constructor}}
- do_test 3.$tn.2 { set ::targs } $expected
- }
- # 4.1: Two unquoted words after "tokenize =" are a parse error.
- do_catchsql_test 4.1 {
- CREATE VIRTUAL TABLE ft2 USING fts5(x, tokenize = tcl abc);
- } {1 {parse error in "tokenize = tcl abc"}}
- # 4.2: A bare word following a column name is rejected as an unrecognized
- # column option.
- do_catchsql_test 4.2 {
- CREATE VIRTUAL TABLE ft2 USING fts5(x y)
- } {1 {unrecognized column option: y}}
- #-------------------------------------------------------------------------
- # Test the "separators" and "tokenchars" options a bit.
- #
- # Run the same checks once with the ascii tokenizer (tn=1) and once with
- # unicode61 (tn=2). In both cases ',', '.' and ':' are declared token
- # characters and 'x', 'y' and 'z' are declared separators.
- foreach {tn tokenizer} {1 ascii 2 unicode61} {
- reset_db
- set T "$tokenizer tokenchars ',.:' separators 'xyz'"
- execsql "CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = \"$T\")"
- do_execsql_test 5.$tn.1 {
- INSERT INTO t1 VALUES('abcxdefyghizjkl.mno,pqr:stu/vwx+yz');
- }
- # Each triple is: sub-test number, phrase to query, expected rowids.
- # 'abc', 'def', 'ghi' and 'vw' are expected to match. 'jkl', 'mno', 'pqr'
- # and 'stu' are expected to match only as the single token
- # 'jkl.mno,pqr:stu', not individually.
- foreach {tn2 token res} {
- 1 abc 1 2 def 1 3 ghi 1 4 jkl {}
- 5 mno {} 6 pqr {} 7 stu {} 8 jkl.mno,pqr:stu 1
- 9 vw 1
- } {
- do_execsql_test 5.$tn.2.$tn2 "
- SELECT rowid FROM t1 WHERE t1 MATCH '\"$token\"'
- " $res
- }
- }
- #-------------------------------------------------------------------------
- # Miscellaneous tests for the ascii tokenizer.
- #
- # 5.1.*: Test that the ascii tokenizer ignores non-ASCII characters in the
- # 'separators' option, whereas unicode61 honors them.
- #
- # 5.2, 5.3: An option without an argument, or an unrecognized option, is
- # an error.
- #
- # 5.1.1: With the ascii tokenizer, U+1234 is listed as a separator but
- # 'abc\u1234def' is still not matched by 'def' (empty result).
- # NOTE(review): the id 5.1.1 is also generated by the "5.$tn.1" loop above
- # when tn=1, so this test name is not unique within the file.
- do_test 5.1.1 {
- execsql "
- CREATE VIRTUAL TABLE a1 USING fts5(x, tokenize=`ascii separators '\u1234'`);
- INSERT INTO a1 VALUES('abc\u1234def');
- "
- execsql { SELECT rowid FROM a1 WHERE a1 MATCH 'def' }
- } {}
- # 5.1.2: The same set-up with unicode61 does match 'def' (row 1).
- do_test 5.1.2 {
- execsql "
- CREATE VIRTUAL TABLE a2 USING fts5(
- x, tokenize=`unicode61 separators '\u1234'`);
- INSERT INTO a2 VALUES('abc\u1234def');
- "
- execsql { SELECT rowid FROM a2 WHERE a2 MATCH 'def' }
- } {1}
- # 5.2: The 'tokenchars' option given without an argument is a constructor
- # error.
- do_catchsql_test 5.2 {
- CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii tokenchars');
- } {1 {error in tokenizer constructor}}
- # 5.3: The option name 'opt' (with an argument) is a constructor error.
- do_catchsql_test 5.3 {
- CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii opt arg');
- } {1 {error in tokenizer constructor}}
- #-------------------------------------------------------------------------
- # Test that the ASCII and unicode61 tokenizers both handle SQLITE_DONE
- # correctly.
- #
- # Token callback used by the SQLITE_DONE tests.
- #
- #   varname        name of a list variable in the caller's scope
- #   token          token text to append to that list
- #   iStart, iEnd   accepted but not used here
- #
- # Returns "SQLITE_DONE" when the list holds exactly three tokens after the
- # append, and "SQLITE_OK" otherwise.
- proc test_token_cb {varname token iStart iEnd} {
-   upvar $varname collected
-   lappend collected $token
-   return [expr {[llength $collected] == 3 ? "SQLITE_DONE" : "SQLITE_OK"}]
- }
- # Implementation of the tokenize() auxiliary function. Fetches the text of
- # column 0 through the $cmd API object, tokenizes it with test_token_cb as
- # the per-token callback, and returns the list of tokens the callback
- # collected.
- proc tokenize {cmd} {
-   set tokens {}
-   set text [$cmd xColumnText 0]
-   $cmd xTokenize $text [list test_token_cb tokens]
-   return $tokens
- }
- # NOTE(review): presumably registers the Tcl proc tokenize as an fts5
- # auxiliary function named tokenize() on connection db - confirm against
- # the test harness.
- sqlite3_fts5_create_function db tokenize tokenize
- # 6.0: ascii tokenizer. Each row holds six tokens, but only the first
- # three are expected back because test_token_cb returns SQLITE_DONE on the
- # third token.
- do_execsql_test 6.0 {
- CREATE VIRTUAL TABLE x1 USING fts5(a, tokenize=ascii);
- INSERT INTO x1 VALUES('q w e r t y');
- INSERT INTO x1 VALUES('y t r e w q');
- SELECT tokenize(x1) FROM x1 WHERE x1 MATCH 'e AND r';
- } {
- {q w e} {y t r}
- }
- # 6.1: The same check with the unicode61 tokenizer.
- do_execsql_test 6.1 {
- CREATE VIRTUAL TABLE x2 USING fts5(a, tokenize=unicode61);
- INSERT INTO x2 VALUES('q w e r t y');
- INSERT INTO x2 VALUES('y t r e w q');
- SELECT tokenize(x2) FROM x2 WHERE x2 MATCH 'e AND r';
- } {
- {q w e} {y t r}
- }
- #-------------------------------------------------------------------------
- # Miscellaneous tests for the unicode tokenizer.
- #
- # 6.2 - 6.5: Malformed unicode61 tokenizer arguments must all be rejected
- # with a constructor error. These tests are numbered from 6.2 because the
- # id 6.1 is already used by the unicode61 SQLITE_DONE test earlier in this
- # file; reusing it would make a failure report ambiguous.
- #
- # 6.2: 'tokenchars' given without an argument.
- do_catchsql_test 6.2 {
- CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 tokenchars');
- } {1 {error in tokenizer constructor}}
- # 6.3: The option name 'a' (with argument 'b').
- do_catchsql_test 6.3 {
- CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 a b');
- } {1 {error in tokenizer constructor}}
- # 6.4: The remove_diacritics value 3.
- do_catchsql_test 6.4 {
- CREATE VIRTUAL TABLE a3 USING fts5(
- x, y, tokenize = 'unicode61 remove_diacritics 3'
- );
- } {1 {error in tokenizer constructor}}
- # 6.5: The multi-digit remove_diacritics value 10.
- do_catchsql_test 6.5 {
- CREATE VIRTUAL TABLE a3 USING fts5(
- x, y, tokenize = 'unicode61 remove_diacritics 10'
- );
- } {1 {error in tokenizer constructor}}
- #-------------------------------------------------------------------------
- # Porter tokenizer with very large tokens.
- #
- # Three single-character-run tokens of 100, 500 and 1000 characters.
- set a [string repeat a 100]
- set b [string repeat b 500]
- set c [string repeat c 1000]
- # 7.0: Insert the three pairings a+b, b+c and c+a as rows 1, 2 and 3.
- # NOTE(review): $a, $b and $c appear inside braces, so Tcl does not
- # substitute them here; presumably the sqlite3 Tcl interface binds them as
- # SQL variables - confirm.
- do_execsql_test 7.0 {
- CREATE VIRTUAL TABLE e5 USING fts5(x, tokenize=porter);
- INSERT INTO e5 VALUES($a || ' ' || $b);
- INSERT INTO e5 VALUES($b || ' ' || $c);
- INSERT INTO e5 VALUES($c || ' ' || $a);
- }
- # 7.1 - 7.3: Each long token must match exactly the two rows containing it.
- do_execsql_test 7.1 {SELECT rowid FROM e5 WHERE e5 MATCH $a} { 1 3 }
- do_execsql_test 7.2 {SELECT rowid FROM e5 WHERE e5 MATCH $b} { 1 2 }
- do_execsql_test 7.3 {SELECT rowid FROM e5 WHERE e5 MATCH $c} { 2 3 }
- #-------------------------------------------------------------------------
- # Test the 'separators' option with the unicode61 tokenizer.
- #
- # 8.1: Declare the upper-case letters A-Z as separators and check, via an
- # fts5vocab 'row' table, which terms were indexed. The work is wrapped in
- # BEGIN/ROLLBACK so the names e6 and e7 can be reused by the next test.
- do_execsql_test 8.1 {
- BEGIN;
- CREATE VIRTUAL TABLE e6 USING fts5(x,
- tokenize="unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ"
- );
- INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog');
- CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
- SELECT term FROM e7;
- ROLLBACK;
- } {
- brown dog fox jumped lazy over quick the
- }
- # 8.2: The same check with the non-ASCII separators U+0E01 - U+0E07. The
- # SQL and the expected result are passed through [subst] so that the \u
- # escapes are expanded. The second row, U+0E08 U+0E07 U+0E09, is expected
- # to yield the two terms U+0E08 and U+0E09.
- do_execsql_test 8.2 [subst {
- BEGIN;
- CREATE VIRTUAL TABLE e6 USING fts5(x,
- tokenize="unicode61 separators '\u0E01\u0E02\u0E03\u0E04\u0E05\u0E06\u0E07'"
- );
- INSERT INTO e6 VALUES('the\u0E01quick\u0E01brown\u0E01fox\u0E01'
- || 'jumped\u0E01over\u0E01the\u0E01lazy\u0E01dog'
- );
- INSERT INTO e6 VALUES('\u0E08\u0E07\u0E09');
- CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
- SELECT term FROM e7;
- ROLLBACK;
- }] [subst {
- brown dog fox jumped lazy over quick the \u0E08 \u0E09
- }]
- # Test that the porter tokenizer correctly passes arguments through to
- # its parent tokenizer.
- # 8.3: Same input as 8.1, but the expected terms are stemmed ('jump' and
- # 'lazi' instead of 'jumped' and 'lazy').
- do_execsql_test 8.3 {
- BEGIN;
- CREATE VIRTUAL TABLE e6 USING fts5(x,
- tokenize="porter unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ"
- );
- INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog');
- CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
- SELECT term FROM e7;
- ROLLBACK;
- } {
- brown dog fox jump lazi over quick the
- }
- #-------------------------------------------------------------------------
- # Check that the FTS5_TOKENIZE_PREFIX flag is passed to the tokenizer
- # implementation.
- #
- # Start from a fresh database and replace the earlier always-failing
- # tcl_create with a constructor that returns the string "tcl_tokenize".
- # NOTE(review): presumably the harness treats that return value as the name
- # of the proc to call for tokenizing - confirm.
- reset_db
- proc tcl_create {args} { return "tcl_tokenize" }
- sqlite3_fts5_create_tokenizer db tcl tcl_create
- # ::flags collects the flag value passed on each tcl_tokenize call.
- set ::flags [list]
- # Tokenizer callback for the "tcl" tokenizer. Appends the tflags value it
- # was called with to the global list flags, then passes every
- # (token, start, end) triple returned by fts5_tokenize_split for the text
- # to sqlite3_fts5_token.
- proc tcl_tokenize {tflags text} {
-   global flags
-   lappend flags $tflags
-   set pieces [fts5_tokenize_split $text]
-   foreach {term first last} $pieces {
-     sqlite3_fts5_token $term $first $last
-   }
- }
- # 9.1: Two inserts must produce two tokenizer calls flagged "document".
- do_execsql_test 9.1.1 {
- CREATE VIRTUAL TABLE t1 USING fts5(a, tokenize=tcl);
- INSERT INTO t1 VALUES('abc');
- INSERT INTO t1 VALUES('xyz');
- } {}
- do_test 9.1.2 { set ::flags } {document document}
- # 9.2: A plain term query is flagged "query".
- set ::flags [list]
- do_execsql_test 9.2.1 { SELECT * FROM t1('abc'); } {abc}
- do_test 9.2.2 { set ::flags } {query}
- # 9.3: A term followed by '*' is flagged "prefixquery".
- set ::flags [list]
- do_execsql_test 9.3.1 { SELECT * FROM t1('ab*'); } {abc}
- do_test 9.3.2 { set ::flags } {prefixquery}
- # 9.4: A quoted phrase followed by '*' is also flagged "prefixquery".
- set ::flags [list]
- do_execsql_test 9.4.1 { SELECT * FROM t1('"abc xyz" *'); } {}
- do_test 9.4.2 { set ::flags } {prefixquery}
- # 9.5: A '*' inside the quotes leaves the flag as "query".
- set ::flags [list]
- do_execsql_test 9.5.1 { SELECT * FROM t1('"abc xyz*"'); } {}
- do_test 9.5.2 { set ::flags } {query}
- #-------------------------------------------------------------------------
- #
- # 10.*: Behaviour when the tokenizer named in the stored CREATE VIRTUAL
- # TABLE statement is changed behind fts5's back via writable_schema.
- reset_db
- # 10.1: Create x1 normally, then overwrite its schema entry so that the
- # tokenizer specification becomes "unicode61 error".
- do_execsql_test 10.1 {
- CREATE VIRTUAL TABLE x1 USING fts5(x, tokenize=unicode61);
- PRAGMA writable_schema = 1;
- UPDATE sqlite_schema
- SET sql = 'CREATE VIRTUAL TABLE x1 USING fts5(x, tokenize="unicode61 error");'
- WHERE name = 'x1';
- }
- # Reopen the database.
- # NOTE(review): presumably needed so the edited schema text is re-read.
- db close
- sqlite3 db test.db
- # 10.2, 10.3: Both querying and inserting report the constructor error.
- do_catchsql_test 10.2 {
- SELECT * FROM x1('abc');
- } {1 {error in tokenizer constructor}}
- do_catchsql_test 10.3 {
- INSERT INTO x1 VALUES('abc');
- } {1 {error in tokenizer constructor}}
- # 10.4: Now make the schema entry name the tokenizer "nosuch".
- do_execsql_test 10.4 {
- PRAGMA writable_schema = 1;
- UPDATE sqlite_schema
- SET sql = 'CREATE VIRTUAL TABLE x1 USING fts5(x, tokenize="nosuch error");'
- WHERE name = 'x1';
- }
- db close
- sqlite3 db test.db
- # 10.5, 10.6: Both querying and inserting report "no such tokenizer".
- do_catchsql_test 10.5 {
- SELECT * FROM x1('abc');
- } {1 {no such tokenizer: nosuch}}
- do_catchsql_test 10.6 {
- INSERT INTO x1 VALUES('abc');
- } {1 {no such tokenizer: nosuch}}
- # 10.7: The table can still be dropped.
- # NOTE(review): no expected result is given; presumably the harness then
- # expects empty output, i.e. an empty sqlite_schema - confirm.
- do_execsql_test 10.7 {
- DROP TABLE x1;
- SELECT * FROM sqlite_schema;
- }
- reset_db
- # 10.8: Populate x1, create the fts5vocab table x1v over it, then rewrite
- # the schema entry of x1 to use the tokenizer name "simplify".
- do_execsql_test 10.8 {
- CREATE VIRTUAL TABLE x1 USING fts5(x, tokenize=unicode61);
- INSERT INTO x1 VALUES('a b c'), ('d e f'), ('a b c');
- CREATE VIRTUAL TABLE x1v USING fts5vocab(x1, row);
- PRAGMA writable_schema = 1;
- UPDATE sqlite_schema
- SET sql = 'CREATE VIRTUAL TABLE x1 USING fts5(x, tokenize=simplify);'
- WHERE name = 'x1';
- }
- # 10.9: The vocab table is still readable on the same connection.
- do_execsql_test 10.9 {
- SELECT * FROM x1v
- } {
- a 2 2 b 2 2 c 2 2 d 1 1 e 1 1 f 1 1
- }
- db close
- sqlite3 db test.db
- # 10.10: It returns the same rows after the database is reopened.
- do_execsql_test 10.10 {
- SELECT * FROM x1v
- } {
- a 2 2 b 2 2 c 2 2 d 1 1 e 1 1 f 1 1
- }
- finish_test
|