fts5tokenizer.test 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
# 2014 Dec 20
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Tests focusing on the built-in fts5 tokenizers.
#
  14. source [file join [file dirname [info script]] fts5_common.tcl]
  15. set testprefix fts5tokenizer
  16. # If SQLITE_ENABLE_FTS5 is not defined, omit this file.
  17. ifcapable !fts5 {
  18. finish_test
  19. return
  20. }
  21. do_execsql_test 1.0 {
  22. CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
  23. DROP TABLE ft1;
  24. }
  25. do_execsql_test 1.1 {
  26. CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize='porter');
  27. DROP TABLE ft1;
  28. }
  29. do_execsql_test 1.2 {
  30. CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = porter);
  31. DROP TABLE ft1;
  32. }
  33. do_execsql_test 1.3 {
  34. CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter');
  35. DROP TABLE ft1;
  36. }
  37. do_execsql_test 1.4 {
  38. CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter ascii');
  39. DROP TABLE ft1;
  40. }
  41. do_catchsql_test 1.5 {
  42. CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'nosuch');
  43. } {1 {no such tokenizer: nosuch}}
  44. do_catchsql_test 1.6 {
  45. CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter nosuch');
  46. } {1 {error in tokenizer constructor}}
  47. do_execsql_test 2.0 {
  48. CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
  49. INSERT INTO ft1 VALUES('embedded databases');
  50. }
  51. do_execsql_test 2.1 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'embedding' } 1
  52. do_execsql_test 2.2 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'database' } 1
  53. do_execsql_test 2.3 {
  54. SELECT rowid FROM ft1 WHERE ft1 MATCH 'database embedding'
  55. } 1
  56. proc tcl_create {args} {
  57. set ::targs $args
  58. error "failed"
  59. }
  60. sqlite3_fts5_create_tokenizer db tcl tcl_create
  61. foreach {tn directive expected} {
  62. 1 {tokenize='tcl a b c'} {a b c}
  63. 2 {tokenize='tcl ''d'' ''e'' ''f'''} {d e f}
  64. 3 {tokenize="tcl 'g' 'h' 'i'"} {g h i}
  65. 4 {tokenize = tcl} {}
  66. } {
  67. do_catchsql_test 3.$tn.1 "
  68. CREATE VIRTUAL TABLE ft2 USING fts5(x, $directive)
  69. " {1 {error in tokenizer constructor}}
  70. do_test 3.$tn.2 { set ::targs } $expected
  71. }
  72. do_catchsql_test 4.1 {
  73. CREATE VIRTUAL TABLE ft2 USING fts5(x, tokenize = tcl abc);
  74. } {1 {parse error in "tokenize = tcl abc"}}
  75. do_catchsql_test 4.2 {
  76. CREATE VIRTUAL TABLE ft2 USING fts5(x y)
  77. } {1 {unrecognized column option: y}}
  78. #-------------------------------------------------------------------------
  79. # Test the "separators" and "tokenchars" options a bit.
  80. #
  81. foreach {tn tokenizer} {1 ascii 2 unicode61} {
  82. reset_db
  83. set T "$tokenizer tokenchars ',.:' separators 'xyz'"
  84. execsql "CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = \"$T\")"
  85. do_execsql_test 5.$tn.1 {
  86. INSERT INTO t1 VALUES('abcxdefyghizjkl.mno,pqr:stu/vwx+yz');
  87. }
  88. foreach {tn2 token res} {
  89. 1 abc 1 2 def 1 3 ghi 1 4 jkl {}
  90. 5 mno {} 6 pqr {} 7 stu {} 8 jkl.mno,pqr:stu 1
  91. 9 vw 1
  92. } {
  93. do_execsql_test 5.$tn.2.$tn2 "
  94. SELECT rowid FROM t1 WHERE t1 MATCH '\"$token\"'
  95. " $res
  96. }
  97. }
  98. #-------------------------------------------------------------------------
  99. # Miscellaneous tests for the ascii tokenizer.
  100. #
  101. # 5.1.*: Test that the ascii tokenizer ignores non-ASCII characters in the
  102. # 'separators' option. But unicode61 does not.
  103. #
  104. # 5.2.*: An option without an argument is an error.
  105. #
  106. do_test 5.1.1 {
  107. execsql "
  108. CREATE VIRTUAL TABLE a1 USING fts5(x, tokenize=`ascii separators '\u1234'`);
  109. INSERT INTO a1 VALUES('abc\u1234def');
  110. "
  111. execsql { SELECT rowid FROM a1 WHERE a1 MATCH 'def' }
  112. } {}
  113. do_test 5.1.2 {
  114. execsql "
  115. CREATE VIRTUAL TABLE a2 USING fts5(
  116. x, tokenize=`unicode61 separators '\u1234'`);
  117. INSERT INTO a2 VALUES('abc\u1234def');
  118. "
  119. execsql { SELECT rowid FROM a2 WHERE a2 MATCH 'def' }
  120. } {1}
  121. do_catchsql_test 5.2 {
  122. CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii tokenchars');
  123. } {1 {error in tokenizer constructor}}
  124. do_catchsql_test 5.3 {
  125. CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii opt arg');
  126. } {1 {error in tokenizer constructor}}
  127. #-------------------------------------------------------------------------
  128. # Test that the ASCII and unicode61 tokenizers both handle SQLITE_DONE
  129. # correctly.
  130. #
  131. proc test_token_cb {varname token iStart iEnd} {
  132. upvar $varname var
  133. lappend var $token
  134. if {[llength $var]==3} { return "SQLITE_DONE" }
  135. return "SQLITE_OK"
  136. }
  137. proc tokenize {cmd} {
  138. set res [list]
  139. $cmd xTokenize [$cmd xColumnText 0] [list test_token_cb res]
  140. set res
  141. }
  142. sqlite3_fts5_create_function db tokenize tokenize
  143. do_execsql_test 6.0 {
  144. CREATE VIRTUAL TABLE x1 USING fts5(a, tokenize=ascii);
  145. INSERT INTO x1 VALUES('q w e r t y');
  146. INSERT INTO x1 VALUES('y t r e w q');
  147. SELECT tokenize(x1) FROM x1 WHERE x1 MATCH 'e AND r';
  148. } {
  149. {q w e} {y t r}
  150. }
  151. do_execsql_test 6.1 {
  152. CREATE VIRTUAL TABLE x2 USING fts5(a, tokenize=unicode61);
  153. INSERT INTO x2 VALUES('q w e r t y');
  154. INSERT INTO x2 VALUES('y t r e w q');
  155. SELECT tokenize(x2) FROM x2 WHERE x2 MATCH 'e AND r';
  156. } {
  157. {q w e} {y t r}
  158. }
  159. #-------------------------------------------------------------------------
  160. # Miscellaneous tests for the unicode tokenizer.
  161. #
  162. do_catchsql_test 6.1 {
  163. CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 tokenchars');
  164. } {1 {error in tokenizer constructor}}
  165. do_catchsql_test 6.2 {
  166. CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 a b');
  167. } {1 {error in tokenizer constructor}}
  168. do_catchsql_test 6.3 {
  169. CREATE VIRTUAL TABLE a3 USING fts5(
  170. x, y, tokenize = 'unicode61 remove_diacritics 3'
  171. );
  172. } {1 {error in tokenizer constructor}}
  173. do_catchsql_test 6.4 {
  174. CREATE VIRTUAL TABLE a3 USING fts5(
  175. x, y, tokenize = 'unicode61 remove_diacritics 10'
  176. );
  177. } {1 {error in tokenizer constructor}}
  178. #-------------------------------------------------------------------------
  179. # Porter tokenizer with very large tokens.
  180. #
  181. set a [string repeat a 100]
  182. set b [string repeat b 500]
  183. set c [string repeat c 1000]
  184. do_execsql_test 7.0 {
  185. CREATE VIRTUAL TABLE e5 USING fts5(x, tokenize=porter);
  186. INSERT INTO e5 VALUES($a || ' ' || $b);
  187. INSERT INTO e5 VALUES($b || ' ' || $c);
  188. INSERT INTO e5 VALUES($c || ' ' || $a);
  189. }
  190. do_execsql_test 7.1 {SELECT rowid FROM e5 WHERE e5 MATCH $a} { 1 3 }
  191. do_execsql_test 7.2 {SELECT rowid FROM e5 WHERE e5 MATCH $b} { 1 2 }
  192. do_execsql_test 7.3 {SELECT rowid FROM e5 WHERE e5 MATCH $c} { 2 3 }
  193. #-------------------------------------------------------------------------
  194. # Test the 'separators' option with the unicode61 tokenizer.
  195. #
  196. do_execsql_test 8.1 {
  197. BEGIN;
  198. CREATE VIRTUAL TABLE e6 USING fts5(x,
  199. tokenize="unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  200. );
  201. INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog');
  202. CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
  203. SELECT term FROM e7;
  204. ROLLBACK;
  205. } {
  206. brown dog fox jumped lazy over quick the
  207. }
  208. do_execsql_test 8.2 [subst {
  209. BEGIN;
  210. CREATE VIRTUAL TABLE e6 USING fts5(x,
  211. tokenize="unicode61 separators '\u0E01\u0E02\u0E03\u0E04\u0E05\u0E06\u0E07'"
  212. );
  213. INSERT INTO e6 VALUES('the\u0E01quick\u0E01brown\u0E01fox\u0E01'
  214. || 'jumped\u0E01over\u0E01the\u0E01lazy\u0E01dog'
  215. );
  216. INSERT INTO e6 VALUES('\u0E08\u0E07\u0E09');
  217. CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
  218. SELECT term FROM e7;
  219. ROLLBACK;
  220. }] [subst {
  221. brown dog fox jumped lazy over quick the \u0E08 \u0E09
  222. }]
  223. # Test that the porter tokenizer correctly passes arguments through to
  224. # its parent tokenizer.
  225. do_execsql_test 8.3 {
  226. BEGIN;
  227. CREATE VIRTUAL TABLE e6 USING fts5(x,
  228. tokenize="porter unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  229. );
  230. INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog');
  231. CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
  232. SELECT term FROM e7;
  233. ROLLBACK;
  234. } {
  235. brown dog fox jump lazi over quick the
  236. }
  237. #-------------------------------------------------------------------------
  238. # Check that the FTS5_TOKENIZE_PREFIX flag is passed to the tokenizer
  239. # implementation.
  240. #
  241. reset_db
  242. proc tcl_create {args} { return "tcl_tokenize" }
  243. sqlite3_fts5_create_tokenizer db tcl tcl_create
  244. set ::flags [list]
  245. proc tcl_tokenize {tflags text} {
  246. lappend ::flags $tflags
  247. foreach {w iStart iEnd} [fts5_tokenize_split $text] {
  248. sqlite3_fts5_token $w $iStart $iEnd
  249. }
  250. }
  251. do_execsql_test 9.1.1 {
  252. CREATE VIRTUAL TABLE t1 USING fts5(a, tokenize=tcl);
  253. INSERT INTO t1 VALUES('abc');
  254. INSERT INTO t1 VALUES('xyz');
  255. } {}
  256. do_test 9.1.2 { set ::flags } {document document}
  257. set ::flags [list]
  258. do_execsql_test 9.2.1 { SELECT * FROM t1('abc'); } {abc}
  259. do_test 9.2.2 { set ::flags } {query}
  260. set ::flags [list]
  261. do_execsql_test 9.3.1 { SELECT * FROM t1('ab*'); } {abc}
  262. do_test 9.3.2 { set ::flags } {prefixquery}
  263. set ::flags [list]
  264. do_execsql_test 9.4.1 { SELECT * FROM t1('"abc xyz" *'); } {}
  265. do_test 9.4.2 { set ::flags } {prefixquery}
  266. set ::flags [list]
  267. do_execsql_test 9.5.1 { SELECT * FROM t1('"abc xyz*"'); } {}
  268. do_test 9.5.2 { set ::flags } {query}
  269. #-------------------------------------------------------------------------
  270. #
  271. reset_db
  272. do_execsql_test 10.1 {
  273. CREATE VIRTUAL TABLE x1 USING fts5(x, tokenize=unicode61);
  274. PRAGMA writable_schema = 1;
  275. UPDATE sqlite_schema
  276. SET sql = 'CREATE VIRTUAL TABLE x1 USING fts5(x, tokenize="unicode61 error");'
  277. WHERE name = 'x1';
  278. }
  279. db close
  280. sqlite3 db test.db
  281. do_catchsql_test 10.2 {
  282. SELECT * FROM x1('abc');
  283. } {1 {error in tokenizer constructor}}
  284. do_catchsql_test 10.3 {
  285. INSERT INTO x1 VALUES('abc');
  286. } {1 {error in tokenizer constructor}}
  287. do_execsql_test 10.4 {
  288. PRAGMA writable_schema = 1;
  289. UPDATE sqlite_schema
  290. SET sql = 'CREATE VIRTUAL TABLE x1 USING fts5(x, tokenize="nosuch error");'
  291. WHERE name = 'x1';
  292. }
  293. db close
  294. sqlite3 db test.db
  295. do_catchsql_test 10.5 {
  296. SELECT * FROM x1('abc');
  297. } {1 {no such tokenizer: nosuch}}
  298. do_catchsql_test 10.6 {
  299. INSERT INTO x1 VALUES('abc');
  300. } {1 {no such tokenizer: nosuch}}
  301. do_execsql_test 10.7 {
  302. DROP TABLE x1;
  303. SELECT * FROM sqlite_schema;
  304. }
  305. reset_db
  306. do_execsql_test 10.8 {
  307. CREATE VIRTUAL TABLE x1 USING fts5(x, tokenize=unicode61);
  308. INSERT INTO x1 VALUES('a b c'), ('d e f'), ('a b c');
  309. CREATE VIRTUAL TABLE x1v USING fts5vocab(x1, row);
  310. PRAGMA writable_schema = 1;
  311. UPDATE sqlite_schema
  312. SET sql = 'CREATE VIRTUAL TABLE x1 USING fts5(x, tokenize=simplify);'
  313. WHERE name = 'x1';
  314. }
  315. do_execsql_test 10.9 {
  316. SELECT * FROM x1v
  317. } {
  318. a 2 2 b 2 2 c 2 2 d 1 1 e 1 1 f 1 1
  319. }
  320. db close
  321. sqlite3 db test.db
  322. do_execsql_test 10.10 {
  323. SELECT * FROM x1v
  324. } {
  325. a 2 2 b 2 2 c 2 2 d 1 1 e 1 1 f 1 1
  326. }
  327. finish_test