123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709 |
- # 2014 Dec 19
- #
- # The author disclaims copyright to this source code. In place of
- # a legal notice, here is a blessing:
- #
- # May you do good and not evil.
- # May you find forgiveness for yourself and forgive others.
- # May you share freely, never taking more than you give.
- #
- #***********************************************************************
- #
- if {![info exists testdir]} {
- set testdir [file join [file dirname [info script]] .. .. .. test]
- }
- source $testdir/tester.tcl
- ifcapable !fts5 {
- proc return_if_no_fts5 {} {
- finish_test
- return -code return
- }
- return
- } else {
- proc return_if_no_fts5 {} {}
- }
- catch {
- sqlite3_fts5_may_be_corrupt 0
- reset_db
- }
- proc fts5_test_poslist {cmd} {
- set res [list]
- for {set i 0} {$i < [$cmd xInstCount]} {incr i} {
- lappend res [string map {{ } .} [$cmd xInst $i]]
- }
- set res
- }
- proc fts5_test_poslist2 {cmd} {
- set res [list]
- for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} {
- $cmd xPhraseForeach $i c o {
- lappend res $i.$c.$o
- }
- }
- #set res
- sort_poslist $res
- }
- proc fts5_test_insttoken {cmd iInst iToken} {
- $cmd xInstToken $iInst $iToken
- }
- proc fts5_test_collist {cmd} {
- set res [list]
- for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} {
- $cmd xPhraseColumnForeach $i c { lappend res $i.$c }
- }
- set res
- }
- proc fts5_collist {cmd iPhrase} {
- set res [list]
- $cmd xPhraseColumnForeach $iPhrase c { lappend res $c }
- set res
- }
- proc fts5_test_columnsize {cmd} {
- set res [list]
- for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
- lappend res [$cmd xColumnSize $i]
- }
- set res
- }
- proc fts5_columntext {cmd iCol} {
- $cmd xColumnText $iCol
- }
- proc fts5_columnlocale {cmd iCol} {
- $cmd xColumnLocale $iCol
- }
- proc fts5_test_columntext {cmd} {
- set res [list]
- for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
- lappend res [$cmd xColumnText $i]
- }
- set res
- }
- proc fts5_test_columnlocale {cmd} {
- set res [list]
- for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
- lappend res [$cmd xColumnLocale $i]
- }
- set res
- }
- proc fts5_test_columntotalsize {cmd} {
- set res [list]
- for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
- lappend res [$cmd xColumnTotalSize $i]
- }
- set res
- }
- proc test_append_token {varname token iStart iEnd} {
- upvar $varname var
- lappend var $token
- return "SQLITE_OK"
- }
- proc fts5_test_tokenize {cmd} {
- set res [list]
- for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
- set tokens [list]
- $cmd xTokenize [$cmd xColumnText $i] [list test_append_token tokens]
- lappend res $tokens
- }
- set res
- }
- proc fts5_test_rowcount {cmd} {
- $cmd xRowCount
- }
- proc fts5_test_rowid {cmd} {
- $cmd xRowid
- }
- proc test_queryphrase_cb {cnt cmd} {
- upvar $cnt L
- for {set i 0} {$i < [$cmd xInstCount]} {incr i} {
- foreach {ip ic io} [$cmd xInst $i] break
- set A($ic) 1
- }
- foreach ic [array names A] {
- lset L $ic [expr {[lindex $L $ic] + 1}]
- }
- }
- proc fts5_test_queryphrase {cmd} {
- set res [list]
- for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} {
- set cnt [list]
- for {set j 0} {$j < [$cmd xColumnCount]} {incr j} { lappend cnt 0 }
- $cmd xQueryPhrase $i [list test_queryphrase_cb cnt]
- lappend res $cnt
- }
- set res
- }
- proc fts5_queryphrase {cmd iPhrase} {
- set cnt [list]
- for {set j 0} {$j < [$cmd xColumnCount]} {incr j} { lappend cnt 0 }
- $cmd xQueryPhrase $iPhrase [list test_queryphrase_cb cnt]
- set cnt
- }
- proc fts5_test_phrasecount {cmd} {
- $cmd xPhraseCount
- }
- proc fts5_test_all {cmd} {
- set res [list]
- lappend res columnsize [fts5_test_columnsize $cmd]
- lappend res columntext [fts5_test_columntext $cmd]
- lappend res columntotalsize [fts5_test_columntotalsize $cmd]
- lappend res poslist [fts5_test_poslist $cmd]
- lappend res tokenize [fts5_test_tokenize $cmd]
- lappend res rowcount [fts5_test_rowcount $cmd]
- set res
- }
- proc fts5_aux_test_functions {db} {
- foreach f {
- fts5_test_columnsize
- fts5_test_columntext
- fts5_test_columnlocale
- fts5_test_columntotalsize
- fts5_test_poslist
- fts5_test_poslist2
- fts5_test_collist
- fts5_test_insttoken
- fts5_test_tokenize
- fts5_test_rowcount
- fts5_test_rowid
- fts5_test_all
- fts5_test_queryphrase
- fts5_test_phrasecount
- fts5_columntext
- fts5_columnlocale
- fts5_queryphrase
- fts5_collist
- } {
- sqlite3_fts5_create_function $db $f $f
- }
- }
- proc fts5_segcount {tbl} {
- set N 0
- foreach n [fts5_level_segs $tbl] { incr N $n }
- set N
- }
- proc fts5_level_segs {tbl} {
- set sql "SELECT fts5_decode(rowid,block) aS r FROM ${tbl}_data WHERE rowid=10"
- set ret [list]
- foreach L [lrange [db one $sql] 1 end] {
- lappend ret [expr [llength $L] - 3]
- }
- set ret
- }
- proc fts5_level_segids {tbl} {
- set sql "SELECT fts5_decode(rowid,block) aS r FROM ${tbl}_data WHERE rowid=10"
- set ret [list]
- foreach L [lrange [db one $sql] 1 end] {
- set lvl [list]
- foreach S [lrange $L 3 end] {
- regexp {id=([1234567890]*)} $S -> segid
- lappend lvl $segid
- }
- lappend ret $lvl
- }
- set ret
- }
- proc fts5_rnddoc {n} {
- set map [list 0 a 1 b 2 c 3 d 4 e 5 f 6 g 7 h 8 i 9 j]
- set doc [list]
- for {set i 0} {$i < $n} {incr i} {
- lappend doc "x[string map $map [format %.3d [expr int(rand()*1000)]]]"
- }
- set doc
- }
- #-------------------------------------------------------------------------
- # Usage:
- #
- # nearset aCol ?-pc VARNAME? ?-near N? ?-col C? -- phrase1 phrase2...
- #
- # This command is used to test if a document (set of column values) matches
- # the logical equivalent of a single FTS5 NEAR() clump and, if so, return
- # the equivalent of an FTS5 position list.
- #
- # Parameter $aCol is passed a list of the column values for the document
- # to test. Parameters $phrase1 and so on are the phrases.
- #
- # The result is a list of phrase hits. Each phrase hit is formatted as
- # three integers separated by "." characters, in the following format:
- #
- # <phrase number> . <column number> . <token offset>
- #
- # Options:
- #
- # -near N (NEAR distance. Default 10)
- # -col C (List of column indexes to match against)
- # -pc VARNAME (variable in caller frame to use for phrase numbering)
- # -dict VARNAME (array in caller frame to use for synonyms)
- #
- proc nearset {aCol args} {
- # Process the command line options.
- #
- set O(-near) 10
- set O(-col) {}
- set O(-pc) ""
- set O(-dict) ""
- set nOpt [lsearch -exact $args --]
- if {$nOpt<0} { error "no -- option" }
- # Set $lPhrase to be a list of phrases. $nPhrase its length.
- set lPhrase [lrange $args [expr $nOpt+1] end]
- set nPhrase [llength $lPhrase]
- foreach {k v} [lrange $args 0 [expr $nOpt-1]] {
- if {[info exists O($k)]==0} { error "unrecognized option $k" }
- set O($k) $v
- }
- if {$O(-pc) == ""} {
- set counter 0
- } else {
- upvar $O(-pc) counter
- }
- if {$O(-dict)!=""} { upvar $O(-dict) aDict }
- for {set j 0} {$j < [llength $aCol]} {incr j} {
- for {set i 0} {$i < $nPhrase} {incr i} {
- set A($j,$i) [list]
- }
- }
- # Loop through each column of the current row.
- for {set iCol 0} {$iCol < [llength $aCol]} {incr iCol} {
- # If there is a column filter, test whether this column is excluded. If
- # so, skip to the next iteration of this loop. Otherwise, set zCol to the
- # column value and nToken to the number of tokens that comprise it.
- if {$O(-col)!="" && [lsearch $O(-col) $iCol]<0} continue
- set zCol [lindex $aCol $iCol]
- set nToken [llength $zCol]
- # Each iteration of the following loop searches a substring of the
- # column value for phrase matches. The last token of the substring
- # is token $iLast of the column value. The first token is:
- #
- # iFirst = ($iLast - $O(-near) - 1)
- #
- # where $sz is the length of the phrase being searched for. A phrase
- # counts as matching the substring if its first token lies on or before
- # $iLast and its last token on or after $iFirst.
- #
- # For example, if the query is "NEAR(a+b c, 2)" and the column value:
- #
- # "x x x x A B x x C x"
- # 0 1 2 3 4 5 6 7 8 9"
- #
- # when (iLast==8 && iFirst=5) the range will contain both phrases and
- # so both instances can be added to the output poslists.
- #
- set iLast [expr $O(-near) >= $nToken ? $nToken - 1 : $O(-near)]
- for { } {$iLast < $nToken} {incr iLast} {
- catch { array unset B }
-
- for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} {
- set p [lindex $lPhrase $iPhrase]
- set nPm1 [expr {[llength $p] - 1}]
- set iFirst [expr $iLast - $O(-near) - [llength $p]]
- for {set i $iFirst} {$i <= $iLast} {incr i} {
- set lCand [lrange $zCol $i [expr $i+$nPm1]]
- set bMatch 1
- foreach tok $p term $lCand {
- if {[nearset_match aDict $tok $term]==0} { set bMatch 0 ; break }
- }
- if {$bMatch} { lappend B($iPhrase) $i }
- }
- if {![info exists B($iPhrase)]} break
- }
- if {$iPhrase==$nPhrase} {
- for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} {
- set A($iCol,$iPhrase) [concat $A($iCol,$iPhrase) $B($iPhrase)]
- set A($iCol,$iPhrase) [lsort -integer -uniq $A($iCol,$iPhrase)]
- }
- }
- }
- }
- set res [list]
- #puts [array names A]
- for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} {
- for {set iCol 0} {$iCol < [llength $aCol]} {incr iCol} {
- foreach a $A($iCol,$iPhrase) {
- lappend res "$counter.$iCol.$a"
- }
- }
- incr counter
- }
- #puts "$aCol -> $res"
- sort_poslist $res
- }
- proc nearset_match {aDictVar tok term} {
- if {[string match $tok $term]} { return 1 }
- upvar $aDictVar aDict
- if {[info exists aDict($tok)]} {
- foreach s $aDict($tok) {
- if {[string match $s $term]} { return 1 }
- }
- }
- return 0;
- }
- #-------------------------------------------------------------------------
- # Usage:
- #
- # sort_poslist LIST
- #
- # Sort a position list of the type returned by command [nearset]
- #
- proc sort_poslist {L} {
- lsort -command instcompare $L
- }
- proc instcompare {lhs rhs} {
- foreach {p1 c1 o1} [split $lhs .] {}
- foreach {p2 c2 o2} [split $rhs .] {}
- set res [expr $c1 - $c2]
- if {$res==0} { set res [expr $o1 - $o2] }
- if {$res==0} { set res [expr $p1 - $p2] }
- return $res
- }
- #-------------------------------------------------------------------------
- # Logical operators used by the commands returned by fts5_tcl_expr().
- #
- proc AND {args} {
- foreach a $args {
- if {[llength $a]==0} { return [list] }
- }
- sort_poslist [concat {*}$args]
- }
- proc OR {args} {
- sort_poslist [concat {*}$args]
- }
- proc NOT {a b} {
- if {[llength $b]>0} { return [list] }
- return $a
- }
- #-------------------------------------------------------------------------
- # This command is similar to [split], except that it also provides the
- # start and end offsets of each token. For example:
- #
- # [fts5_tokenize_split "abc d ef"] -> {abc 0 3 d 4 5 ef 6 8}
- #
- proc gobble_whitespace {textvar} {
- upvar $textvar t
- regexp {([ ]*)(.*)} $t -> space t
- return [string length $space]
- }
- proc gobble_text {textvar wordvar} {
- upvar $textvar t
- upvar $wordvar w
- regexp {([^ ]*)(.*)} $t -> w t
- return [string length $w]
- }
- proc fts5_tokenize_split {text} {
- set token ""
- set ret [list]
- set iOff [gobble_whitespace text]
- while {[set nToken [gobble_text text word]]} {
- lappend ret $word $iOff [expr $iOff+$nToken]
- incr iOff $nToken
- incr iOff [gobble_whitespace text]
- }
- set ret
- }
- #-------------------------------------------------------------------------
- #
- proc foreach_detail_mode {prefix script} {
- set saved $::testprefix
- foreach d [list full col none] {
- set s [string map [list %DETAIL% $d] $script]
- set ::detail $d
- set ::testprefix "$prefix-$d"
- reset_db
- uplevel $s
- unset ::detail
- }
- set ::testprefix $saved
- }
- proc detail_check {} {
- if {$::detail != "none" && $::detail!="full" && $::detail!="col"} {
- error "not in foreach_detail_mode {...} block"
- }
- }
- proc detail_is_none {} { detail_check ; expr {$::detail == "none"} }
- proc detail_is_col {} { detail_check ; expr {$::detail == "col" } }
- proc detail_is_full {} { detail_check ; expr {$::detail == "full"} }
- proc foreach_tokenizer_mode {prefix script} {
- set saved $::testprefix
- foreach {d mapping} {
- "" {}
- "-origintext" {, tokenize="origintext unicode61", tokendata=1}
- } {
- set s [string map [list %TOKENIZER% $mapping] $script]
- set ::testprefix "$prefix$d"
- reset_db
- sqlite3_fts5_register_origintext db
- uplevel $s
- }
- set ::testprefix $saved
- }
- #-------------------------------------------------------------------------
- # Convert a poslist of the type returned by fts5_test_poslist() to a
- # collist as returned by fts5_test_collist().
- #
- proc fts5_poslist2collist {poslist} {
- set res [list]
- foreach h $poslist {
- regexp {(.*)\.[1234567890]+} $h -> cand
- lappend res $cand
- }
- set res [lsort -command fts5_collist_elem_compare -unique $res]
- return $res
- }
- # Comparison function used by fts5_poslist2collist to sort collist entries.
- proc fts5_collist_elem_compare {a b} {
- foreach {a1 a2} [split $a .] {}
- foreach {b1 b2} [split $b .] {}
- if {$a1==$b1} { return [expr $a2 - $b2] }
- return [expr $a1 - $b1]
- }
- #--------------------------------------------------------------------------
- # Construct and return a tcl list equivalent to that returned by the SQL
- # query executed against database handle [db]:
- #
- # SELECT
- # rowid,
- # fts5_test_poslist($tbl),
- # fts5_test_collist($tbl)
- # FROM $tbl('$expr')
- # ORDER BY rowid $order;
- #
- proc fts5_query_data {expr tbl {order ASC} {aDictVar ""}} {
- # Figure out the set of columns in the FTS5 table. This routine does
- # not handle tables with UNINDEXED columns, but if it did, it would
- # have to be here.
- db eval "PRAGMA table_info = $tbl" x { lappend lCols $x(name) }
- set d ""
- if {$aDictVar != ""} {
- upvar $aDictVar aDict
- set d aDict
- }
- set cols ""
- foreach e $lCols { append cols ", '$e'" }
- set tclexpr [db one [subst -novar {
- SELECT fts5_expr_tcl( $expr, 'nearset $cols -dict $d -pc ::pc' [set cols] )
- }]]
- set res [list]
- db eval "SELECT rowid, * FROM $tbl ORDER BY rowid $order" x {
- set cols [list]
- foreach col $lCols { lappend cols $x($col) }
-
- set ::pc 0
- set rowdata [eval $tclexpr]
- if {$rowdata != ""} {
- lappend res $x(rowid) $rowdata [fts5_poslist2collist $rowdata]
- }
- }
- set res
- }
- #-------------------------------------------------------------------------
- # Similar to [fts5_query_data], but omit the collist field.
- #
- proc fts5_poslist_data {expr tbl {order ASC} {aDictVar ""}} {
- set res [list]
- if {$aDictVar!=""} {
- upvar $aDictVar aDict
- set dict aDict
- } else {
- set dict ""
- }
- foreach {rowid poslist collist} [fts5_query_data $expr $tbl $order $dict] {
- lappend res $rowid $poslist
- }
- set res
- }
- proc fts5_collist_data {expr tbl {order ASC} {aDictVar ""}} {
- set res [list]
- if {$aDictVar!=""} {
- upvar $aDictVar aDict
- set dict aDict
- } else {
- set dict ""
- }
- foreach {rowid poslist collist} [fts5_query_data $expr $tbl $order $dict] {
- lappend res $rowid $collist
- }
- set res
- }
- #-------------------------------------------------------------------------
- #
- # This command will only work inside a [foreach_detail_mode] block. It tests
- # whether or not expression $expr run on FTS5 table $tbl is supported by
- # the current mode. If so, 1 is returned. If not, 0.
- #
- # detail=full (all queries supported)
- # detail=col (all but phrase queries and NEAR queries)
- # detail=none (all but phrase queries, NEAR queries, and column filters)
- #
- proc fts5_expr_ok {expr tbl} {
- if {![detail_is_full]} {
- set nearset "nearset_rc"
- if {[detail_is_col]} { set nearset "nearset_rf" }
- set ::expr_not_ok 0
- db eval "PRAGMA table_info = $tbl" x { lappend lCols $x(name) }
- set cols ""
- foreach e $lCols { append cols ", '$e'" }
- set ::pc 0
- set tclexpr [db one [subst -novar {
- SELECT fts5_expr_tcl( $expr, '[set nearset] $cols -pc ::pc' [set cols] )
- }]]
- eval $tclexpr
- if {$::expr_not_ok} { return 0 }
- }
- return 1
- }
- # Helper for [fts5_expr_ok]
- proc nearset_rf {aCol args} {
- set idx [lsearch -exact $args --]
- if {$idx != [llength $args]-2 || [llength [lindex $args end]]!=1} {
- set ::expr_not_ok 1
- }
- list
- }
- # Helper for [fts5_expr_ok]
- proc nearset_rc {aCol args} {
- nearset_rf $aCol {*}$args
- if {[lsearch $args -col]>=0} {
- set ::expr_not_ok 1
- }
- list
- }
- proc dump {tname} {
- execsql_pp "SELECT * FROM ${tname}_idx"
- execsql_pp "SELECT id, quote(block), fts5_decode(id,block) FROM ${tname}_data"
- }
- #-------------------------------------------------------------------------
- # Code for a simple Tcl tokenizer that supports synonyms at query time.
- #
- proc tclnum_tokenize {mode tflags text} {
- foreach {w iStart iEnd} [fts5_tokenize_split $text] {
- sqlite3_fts5_token $w $iStart $iEnd
- if {$tflags == $mode && [info exists ::tclnum_syn($w)]} {
- foreach s $::tclnum_syn($w) { sqlite3_fts5_token -colo $s $iStart $iEnd }
- }
- }
- }
- proc tclnum_create {args} {
- set mode query
- if {[llength $args]} {
- set mode [lindex $args 0]
- }
- if {$mode != "query" && $mode != "document"} { error "bad mode: $mode" }
- return [list tclnum_tokenize $mode]
- }
- proc fts5_tclnum_register {db} {
- foreach SYNDICT {
- {zero 0}
- {one 1 i}
- {two 2 ii}
- {three 3 iii}
- {four 4 iv}
- {five 5 v}
- {six 6 vi}
- {seven 7 vii}
- {eight 8 viii}
- {nine 9 ix}
- {a1 a2 a3 a4 a5 a6 a7 a8 a9}
- {b1 b2 b3 b4 b5 b6 b7 b8 b9}
- {c1 c2 c3 c4 c5 c6 c7 c8 c9}
- } {
- foreach s $SYNDICT {
- set o [list]
- foreach x $SYNDICT {if {$x!=$s} {lappend o $x}}
- set ::tclnum_syn($s) $o
- }
- }
- sqlite3_fts5_create_tokenizer db tclnum tclnum_create
- }
- #
- # End of tokenizer code.
- #-------------------------------------------------------------------------
|