# fts5trigram.test
# 2020 September 30
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
#
# Tests for the fts5 "trigram" tokenizer.
#
  14. source [file join [file dirname [info script]] fts5_common.tcl]
  15. ifcapable !fts5 { finish_test ; return }
  16. set ::testprefix fts5trigram
# 1.0: Create a table that uses the trigram tokenizer with its default
# options and populate it with one ASCII row and one Thai-script row.
do_execsql_test 1.0 {
  CREATE VIRTUAL TABLE t1 USING fts5(y, tokenize=trigram);
  INSERT INTO t1 VALUES('abcdefghijklm');
  INSERT INTO t1 VALUES('กรุงเทพมหานคร');
}
  22. foreach {tn s res} {
  23. 1 abc "(abc)defghijklm"
  24. 2 defgh "abc(defgh)ijklm"
  25. 3 abcdefghijklm "(abcdefghijklm)"
  26. 4 กรุ "(กรุ)งเทพมหานคร"
  27. 5 งเทพมห "กรุ(งเทพมห)านคร"
  28. 6 กรุงเทพมหานคร "(กรุงเทพมหานคร)"
  29. 7 Abc "(abc)defghijklm"
  30. 8 deFgh "abc(defgh)ijklm"
  31. 9 aBcdefGhijKlm "(abcdefghijklm)"
  32. } {
  33. do_execsql_test 1.1.$tn {
  34. SELECT highlight(t1, 0, '(', ')') FROM t1($s)
  35. } $res
  36. }
# 1.2.0: fts5_expr() shows how a query string is parsed with the trigram
# tokenizer.  The expected output for 'ABCD' is the lower-cased phrase
# made of the two overlapping trigrams "abc" and "bcd".
do_execsql_test 1.2.0 {
  SELECT fts5_expr('ABCD', 'tokenize=trigram')
} {{"abc" + "bcd"}}

# 1.2.1: LIKE with an unbound parameter as the pattern plus an ESCAPE
# clause.  No expected-result argument is passed to do_execsql_test here
# (presumably the harness then expects an empty result - confirm against
# the do_execsql_test definition).
do_execsql_test 1.2.1 {
  SELECT * FROM t1 WHERE y LIKE ? ESCAPE 'a'
}
  43. foreach {tn like res} {
  44. 1 {%cDef%} 1
  45. 2 {cDef%} {}
  46. 3 {%f%} 1
  47. 4 {%f_h%} 1
  48. 5 {%f_g%} {}
  49. 6 {abc%klm} 1
  50. 7 {ABCDEFG%} 1
  51. 8 {%รุงเ%} 2
  52. 9 {%งเ%} 2
  53. 10 {%"งเ"%} {}
  54. } {
  55. do_execsql_test 1.3.$tn {
  56. SELECT rowid FROM t1 WHERE y LIKE $like
  57. } $res
  58. }
#-------------------------------------------------------------------------
# Section 2: the same data as section 1, but the tokenizer is created
# with "case_sensitive 1".
#
reset_db
do_execsql_test 2.0 {
  CREATE VIRTUAL TABLE t1 USING fts5(y, tokenize="trigram case_sensitive 1");
  INSERT INTO t1 VALUES('abcdefghijklm');
  INSERT INTO t1 VALUES('กรุงเทพมหานคร');
}

# 2.0.1: "case_sensitive" given without a value must be rejected when
# the table is created.
do_catchsql_test 2.0.1 {
  CREATE VIRTUAL TABLE t2 USING fts5(z, tokenize='trigram case_sensitive');
} {1 {error in tokenizer constructor}}
  69. foreach {tn s res} {
  70. 1 abc "(abc)defghijklm"
  71. 2 defgh "abc(defgh)ijklm"
  72. 3 abcdefghijklm "(abcdefghijklm)"
  73. 4 กรุ "(กรุ)งเทพมหานคร"
  74. 5 งเทพมห "กรุ(งเทพมห)านคร"
  75. 6 กรุงเทพมหานคร "(กรุงเทพมหานคร)"
  76. 7 Abc ""
  77. 8 deFgh ""
  78. 9 aBcdefGhijKlm ""
  79. } {
  80. do_execsql_test 2.1.$tn {
  81. SELECT highlight(t1, 0, '(', ')') FROM t1($s)
  82. } $res
  83. }
  84. foreach {tn like res} {
  85. 1 {%cDef%} 1
  86. 2 {cDef%} {}
  87. 3 {%f%} 1
  88. 4 {%f_h%} 1
  89. 5 {%f_g%} {}
  90. 6 {abc%klm} 1
  91. 7 {ABCDEFG%} 1
  92. 8 {%รุงเ%} 2
  93. } {
  94. do_execsql_test 2.2.$tn {
  95. SELECT rowid FROM t1 WHERE y LIKE $like
  96. } $res
  97. }
  98. foreach {tn like res} {
  99. 1 {*cdef*} 1
  100. 2 {cdef*} {}
  101. 3 {*f*} 1
  102. 4 {*f?h*} 1
  103. 5 {*f?g*} {}
  104. 6 {abc*klm} 1
  105. 7 {abcdefg*} 1
  106. 8 {*รุงเ*} 2
  107. 9 {abc[d]efg*} 1
  108. 10 {abc[]d]efg*} 1
  109. 11 {abc[^]d]efg*} {}
  110. 12 {abc[^]XYZ]efg*} 1
  111. } {
  112. do_execsql_test 2.3.$tn {
  113. SELECT rowid FROM t1 WHERE y GLOB $like
  114. } $res
  115. }
  116. do_execsql_test 2.3.null.1 {
  117. SELECT rowid FROM t1 WHERE y LIKE NULL
  118. }
#-------------------------------------------------------------------------
# Section 3: argument checking for the "case_sensitive" option.
#
reset_db

# 3.1, 3.2: the values 2 and 11 are both expected to be rejected with
# "error in tokenizer constructor".
do_catchsql_test 3.1 {
  CREATE VIRTUAL TABLE ttt USING fts5(c, tokenize="trigram case_sensitive 2");
} {1 {error in tokenizer constructor}}
do_catchsql_test 3.2 {
  CREATE VIRTUAL TABLE ttt USING fts5(c, tokenize="trigram case_sensitive 11");
} {1 {error in tokenizer constructor}}

# 3.3: here the double quotes enclose the whole "tokenize=..." argument
# rather than just the option value; table creation is expected to
# succeed.
do_catchsql_test 3.3 {
  CREATE VIRTUAL TABLE ttt USING fts5(c, "tokenize=trigram case_sensitive 1");
} {0 {}}
#-------------------------------------------------------------------------
# Section 4: insert a three-byte blob whose first byte is 0x00 into a
# trigram table, then run the fts5 'integrity-check' command on it.
#
reset_db
do_execsql_test 4.0 {
  CREATE VIRTUAL TABLE t0 USING fts5(b, tokenize = "trigram");
}
do_execsql_test 4.1 {
  INSERT INTO t0 VALUES (x'000b01');
}
do_execsql_test 4.2 {
  INSERT INTO t0(t0) VALUES('integrity-check');
}
  141. #-------------------------------------------------------------------------
  142. reset_db
  143. foreach_detail_mode $::testprefix {
  144. foreach {ci} {0 1} {
  145. reset_db
  146. do_execsql_test 5.cs=$ci.0.1 "
  147. CREATE VIRTUAL TABLE t1 USING fts5(
  148. y, tokenize=\"trigram case_sensitive $ci\", detail=%DETAIL%
  149. );
  150. "
  151. do_execsql_test 5.cs=$ci.0.2 {
  152. INSERT INTO t1 VALUES('abcdefghijklm');
  153. INSERT INTO t1 VALUES('กรุงเทพมหานคร');
  154. }
  155. foreach {tn like res} {
  156. 1 {%cDef%} 1
  157. 2 {cDef%} {}
  158. 3 {%f%} 1
  159. 4 {%f_h%} 1
  160. 5 {%f_g%} {}
  161. 6 {abc%klm} 1
  162. 7 {ABCDEFG%} 1
  163. 8 {%รุงเ%} 2
  164. } {
  165. do_execsql_test 5.cs=$ci.1.$tn {
  166. SELECT rowid FROM t1 WHERE y LIKE $like
  167. } $res
  168. }
  169. }
  170. }
# Section 6: query-plan checks.  ci0 uses the default (case-insensitive)
# trigram tokenizer, ci1 uses "case_sensitive 1".
do_execsql_test 6.0 {
  CREATE VIRTUAL TABLE ci0 USING fts5(x, tokenize="trigram");
  CREATE VIRTUAL TABLE ci1 USING fts5(x, tokenize="trigram case_sensitive 1");
}

# LIKE and GLOB both work with case-insensitive tokenizers. Only GLOB works
# with case-sensitive.  In the expected plans below, a handled LIKE shows
# up as index string "0:L0" and a handled GLOB as "0:G0"; an unhandled
# constraint leaves the index string as just "0:".
do_eqp_test 6.1 {
  SELECT * FROM ci0 WHERE x LIKE ?
} {VIRTUAL TABLE INDEX 0:L0}
do_eqp_test 6.2 {
  SELECT * FROM ci0 WHERE x GLOB ?
} {VIRTUAL TABLE INDEX 0:G0}
do_eqp_test 6.3 {
  SELECT * FROM ci1 WHERE x LIKE ?
} {{SCAN ci1 VIRTUAL TABLE INDEX 0:}}
do_eqp_test 6.4 {
  SELECT * FROM ci1 WHERE x GLOB ?
} {VIRTUAL TABLE INDEX 0:G0}

# 6.5, 6.6: a "<" comparison is expected to be unhandled for both tables.
do_eqp_test 6.5 {
  SELECT * FROM ci1 WHERE x < ?
} {{SCAN ci1 VIRTUAL TABLE INDEX 0:}}
do_eqp_test 6.6 {
  SELECT * FROM ci0 WHERE x < ?
} {{SCAN ci0 VIRTUAL TABLE INDEX 0:}}
  195. reset_db
  196. do_execsql_test 7.0 {
  197. CREATE VIRTUAL TABLE f USING FTS5(filename, tokenize="trigram");
  198. INSERT INTO f (rowid, filename) VALUES
  199. (10, "giraffe.png"),
  200. (20, "жираф.png"),
  201. (30, "cat.png"),
  202. (40, "кот.png"),
  203. (50, "misic-🎵-.mp3");
  204. }
# 7.1, 7.2: The same two-character Cyrillic GLOB pattern, evaluated once
# with a unary "+" in front of the column and once without; both forms
# must return rowid 20.
# NOTE(review): the "+" presumably keeps the constraint from being handed
# to the virtual table so that 7.1 acts as the reference result - confirm.
do_execsql_test 7.1 {
  SELECT rowid FROM f WHERE +filename GLOB '*ир*';
} {20}
do_execsql_test 7.2 {
  SELECT rowid FROM f WHERE filename GLOB '*ир*';
} {20}
  211. #-------------------------------------------------------------------------
  212. reset_db
  213. do_execsql_test 8.0 {
  214. CREATE VIRTUAL TABLE t1 USING fts5(y, tokenize=trigram);
  215. INSERT INTO t1 VALUES('abcdefghijklm');
  216. }
  217. foreach {tn match res} {
  218. 1 "abc ghi" "(abc)def(ghi)jklm"
  219. 2 "def ghi" "abc(defghi)jklm"
  220. 3 "efg ghi" "abcd(efghi)jklm"
  221. 4 "efghi" "abcd(efghi)jklm"
  222. 5 "abcd jklm" "(abcd)efghi(jklm)"
  223. 6 "ijkl jklm" "abcdefgh(ijklm)"
  224. 7 "ijk ijkl hijk" "abcdefg(hijkl)m"
  225. } {
  226. do_execsql_test 8.1.$tn {
  227. SELECT highlight(t1, 0, '(', ')') FROM t1($match)
  228. } $res
  229. }
# 8.2: Three rows in which "abc" and "cde" are separated by another
# word, separated by a single space, and overlapping ("abcde").
do_execsql_test 8.2 {
  CREATE VIRTUAL TABLE ft2 USING fts5(a, tokenize="trigram");
  INSERT INTO ft2 VALUES('abc x cde');
  INSERT INTO ft2 VALUES('abc cde');
  INSERT INTO ft2 VALUES('abcde');
}

# 8.3: 'abc AND cde' matches all three rows.  In the overlapping row the
# two matches are expected to be highlighted as one span, [abcde].
do_execsql_test 8.3 {
  SELECT highlight(ft2, 0, '[', ']') FROM ft2 WHERE ft2 MATCH 'abc AND cde';
} {
  {[abc] x [cde]}
  {[abc] [cde]}
  {[abcde]}
}
#-------------------------------------------------------------------------
# Section 9: LIKE constraints on the last column (a12) of a table that
# has thirteen columns.
#
reset_db
do_execsql_test 9.0 {
  CREATE VIRTUAL TABLE t1 USING fts5(
      a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
      tokenize=trigram
  );
  INSERT INTO t1(rowid, a12) VALUES(111, 'thats a tricky case though');
  INSERT INTO t1(rowid, a12) VALUES(222, 'the query planner cannot do');
}

# 9.1: a single LIKE term.
do_execsql_test 9.1 {
  SELECT rowid FROM t1 WHERE a12 LIKE '%tricky%'
} {111}

# 9.2: two LIKE terms on the same column joined by AND.
do_execsql_test 9.2 {
  SELECT rowid FROM t1 WHERE a12 LIKE '%tricky%' AND a12 LIKE '%case%'
} {111}

# 9.3: a NULL pattern must return no rows.
do_execsql_test 9.3 {
  SELECT rowid FROM t1 WHERE a12 LIKE NULL
} {}
  262. #-------------------------------------------------------------------------
  263. reset_db
  264. do_execsql_test 10.0 {
  265. CREATE VIRTUAL TABLE t1 USING fts5(a, tokenize=trigram);
  266. }
  267. do_test 10.1 {
  268. foreach {val} {
  269. "abc \UFFjkl\UFF"
  270. "abc \UFFFjkl\UFFF"
  271. "abc \UFFFFjkl\UFFFF"
  272. "abc \UFFFFFjkl\UFFFFF"
  273. "\UFFjkl\UFF abc"
  274. "\UFFFjkl\UFFF abc"
  275. "\UFFFFjkl\UFFFF abc"
  276. "\UFFFFFjkl\UFFFFF abc"
  277. "\U10001jkl\U10001 abc"
  278. } {
  279. execsql { INSERT INTO t1 VALUES( $val ) }
  280. }
  281. } {}
  282. do_test 10.2 {
  283. foreach {val} {
  284. X'E18000626320646566'
  285. X'61EDA0806320646566'
  286. X'61EDA0806320646566'
  287. X'61EFBFBE6320646566'
  288. X'76686920E18000626320646566'
  289. X'7668692061EDA0806320646566'
  290. X'7668692061EDA0806320646566'
  291. X'7668692061EFBFBE6320646566'
  292. } {
  293. execsql " INSERT INTO t1 VALUES( $val ) "
  294. }
  295. } {}
  296. do_test 10.3 {
  297. set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
  298. set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
  299. set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  300. set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  301. execsql {
  302. INSERT INTO t1 VALUES($a);
  303. INSERT INTO t1 VALUES($b);
  304. INSERT INTO t1 VALUES($c);
  305. INSERT INTO t1 VALUES($d);
  306. INSERT INTO t1 VALUES('abcd' || $a);
  307. INSERT INTO t1 VALUES('abcd' || $b);
  308. INSERT INTO t1 VALUES('abcd' || $c);
  309. INSERT INTO t1 VALUES('abcd' || $d);
  310. }
  311. } {}
# Section 11: insert the result of str('') into a trigram table.
do_execsql_test 11.0 {
  CREATE VIRTUAL TABLE t4 USING fts5(y, tokenize=trigram);
}

# NOTE(review): presumably this registers the SQL function str() used by
# test 11.1 on the connection "db" - confirm against the test fixture.
sqlite3_fts5_register_str db
do_execsql_test 11.1 {
  INSERT INTO t4 VALUES( str('') );
}
# Section 12: call the trigram tokenizer directly through
# sqlite3_fts5_tokenize.
#
# 12.0: "abcd" yields the two tokens "abc" and "bcd"; each token in the
# expected list is followed by two integers (presumably its start and
# end offsets in the input - confirm against the test fixture).
do_test 12.0 {
  sqlite3_fts5_tokenize db trigram "abcd"
} {abc 0 3 bcd 1 4}

# 12.1, 12.2: a one-character input and an empty input produce no tokens.
do_test 12.1 {
  sqlite3_fts5_tokenize db trigram "a"
} {}
do_test 12.2 {
  sqlite3_fts5_tokenize db trigram ""
} {}

finish_test