fts5unicode.test 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687
  1. # 2014 Dec 20
  2. #
  3. # The author disclaims copyright to this source code. In place of
  4. # a legal notice, here is a blessing:
  5. #
  6. # May you do good and not evil.
  7. # May you find forgiveness for yourself and forgive others.
  8. # May you share freely, never taking more than you give.
  9. #
  10. #***********************************************************************
  11. #
  12. # Tests focusing on the fts5 tokenizers
  13. #
  # Load fts5_common.tcl from the directory that contains this script, then
  # set the prefix used for the test names in this file.
  source [file join [file dirname [info script]] fts5_common.tcl]
  set testprefix fts5unicode

  # Without fts5 (SQLITE_ENABLE_FTS5 not defined) there is nothing to test:
  # close out the test run and stop processing this file.
  ifcapable !fts5 { finish_test ; return }
  # tokenize_test TN TOKENIZER INPUT OUTPUT
  #
  # Invoke [do_test] in the caller's context with test name TN. The test
  # script calls [sqlite3_fts5_tokenize db TOKENIZER INPUT], walks the result
  # three elements at a time and collects the first element of each group.
  # OUTPUT, rebuilt as a canonical Tcl list, is handed to do_test as the
  # expected result.
  #
  # TOKENIZER and INPUT are pasted textually between braces in the generated
  # script, so they must not contain unbalanced braces.
  proc tokenize_test {tn tokenizer input output} {
    # Fill in the script template. The variable names used inside the
    # script (ret, z, s, e) belong to whatever scope do_test evaluates it
    # in, not to this proc.
    set script [string map [list %TOKENIZER% $tokenizer %INPUT% $input] {
      set ret {}
      foreach {z s e} [sqlite3_fts5_tokenize db {%TOKENIZER%} {%INPUT%}] {
        lappend ret $z
      }
      set ret
    }]

    # Expanding OUTPUT into [list] normalises its whitespace and quoting.
    set expected [list {*}$output]

    uplevel 1 [list do_test $tn $script $expected]
  }
  # Run the same four inputs through both tokenizers. In every case the
  # expected tokens are lower-case and contain no punctuation; an empty
  # input must yield no tokens at all.
  foreach {tn t} [list 1 ascii 2 unicode61] {
    tokenize_test "1.$tn.0" $t "A B C D"                  "a b c d"
    tokenize_test "1.$tn.1" $t "May you share freely,"    "may you share freely"
    tokenize_test "1.$tn.2" $t "..May...you.shAre.freely" "may you share freely"
    tokenize_test "1.$tn.3" $t ""                         ""
  }
  36. #-------------------------------------------------------------------------
  37. # Check that "unicode61" really is the default tokenizer.
  38. #
  # Three tables: t1 names no tokenizer, t2 asks for unicode61 explicitly
  # and t3 asks for ascii. Each receives the same row made of the three
  # characters U+00C0, U+00C8 and U+00CC.
  do_execsql_test 2.0 "
    CREATE VIRTUAL TABLE t1 USING fts5(x);
    CREATE VIRTUAL TABLE t2 USING fts5(x, tokenize = unicode61);
    CREATE VIRTUAL TABLE t3 USING fts5(x, tokenize = ascii);
    INSERT INTO t1 VALUES('\xC0\xC8\xCC');
    INSERT INTO t2 VALUES('\xC0\xC8\xCC');
    INSERT INTO t3 VALUES('\xC0\xC8\xCC');
  "

  # Query each table with U+00E0 U+00E8 U+00EC. Only t1 and t2 are expected
  # to match, i.e. t1 must behave like the unicode61 table, not the ascii one.
  do_execsql_test 2.1 "
    SELECT 't1' FROM t1 WHERE t1 MATCH '\xE0\xE8\xEC';
    SELECT 't2' FROM t2 WHERE t2 MATCH '\xE0\xE8\xEC';
    SELECT 't3' FROM t3 WHERE t3 MATCH '\xE0\xE8\xEC';
  " [list t1 t2]
  52. #-------------------------------------------------------------------------
  53. # Check that codepoints that require 4 bytes to store in utf-8 (those that
  54. # require 17 or more bits to store) are tokenized correctly by unicode61.
  55. #
  # Fetch four codepoints above U+FFFF from SQLite itself. The trailing
  # notes give the type recorded for each one.
  unset -nocomplain A B C D
  set A [db one {SELECT char(0x1F75E)}]    ;# Type So
  set B [db one {SELECT char(0x1F5FD)}]    ;# Type So
  set C [db one {SELECT char(0x2F802)}]    ;# Type Lo
  set D [db one {SELECT char(0x2F808)}]    ;# Type Lo

  # Configure unicode61 with C as an extra separator and A as an extra token
  # character, then index the single string A B C D. The fts5vocab table
  # xyz_v exposes the terms that end up in the index.
  do_execsql_test 3.0 "
    CREATE VIRTUAL TABLE xyz USING fts5(x,
      tokenize = \"unicode61 separators '$C' tokenchars '$A'\"
    );
    CREATE VIRTUAL TABLE xyz_v USING fts5vocab(xyz, row);
    INSERT INTO xyz VALUES('$A$B$C$D');
  "

  # Exactly two terms are expected, A and D, each followed by the values 1 1.
  do_execsql_test 3.1 {
    SELECT * FROM xyz_v;
  } [list $A 1 1 $D 1 1]

  finish_test