fts5txt2db.tcl 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. ##########################################################################
  2. # 2016 Jan 27
  3. #
  4. # The author disclaims copyright to this source code. In place of
  5. # a legal notice, here is a blessing:
  6. #
  7. # May you do good and not evil.
  8. # May you find forgiveness for yourself and forgive others.
  9. # May you share freely, never taking more than you give.
  10. #
  11. proc process_cmdline {} {
  12. cmdline::process ::A $::argv {
  13. {fts5 "use fts5 (this is the default)"}
  14. {fts4 "use fts4"}
  15. {trigram "Use tokenize=trigram"}
  16. {colsize "10 10 10" "list of column sizes"}
  17. {tblname "t1" "table name to create"}
  18. {detail "full" "Fts5 detail mode to use"}
  19. {repeat 1 "Load each file this many times"}
  20. {prefix "" "Fts prefix= option"}
  21. {trans 1 "True to use a transaction"}
  22. database
  23. file...
  24. } {
  25. This script is designed to create fts4/5 tables with more than one column.
  26. The -colsize option should be set to a Tcl list of integer values, one for
  27. each column in the table. Each value is the number of tokens that will be
  28. inserted into the column value for each row. For example, setting the -colsize
  29. option to "5 10" creates an FTS table with 2 columns, with roughly 5 and 10
  30. tokens per row in each, respectively.
  31. Each "FILE" argument should be a text file. The contents of these text files
  32. is split on whitespace characters to form a list of tokens. The first N1
  33. tokens are used for the first column of the first row, where N1 is the first
  34. element of the -colsize list. The next N2 are used for the second column of
  35. the first row, and so on. Rows are added to the table until the entire list
  36. of tokens is exhausted.
  37. }
  38. }
  39. ###########################################################################
  40. ###########################################################################
  41. # Command line options processor. This is generic code that can be copied
  42. # between scripts.
  43. #
  44. namespace eval cmdline {
  45. proc cmdline_error {O E {msg ""}} {
  46. if {$msg != ""} {
  47. puts stderr "Error: $msg"
  48. puts stderr ""
  49. }
  50. set L [list]
  51. foreach o $O {
  52. if {[llength $o]==1} {
  53. lappend L [string toupper $o]
  54. }
  55. }
  56. puts stderr "Usage: $::argv0 ?SWITCHES? $L"
  57. puts stderr ""
  58. puts stderr "Switches are:"
  59. foreach o $O {
  60. if {[llength $o]==3} {
  61. foreach {a b c} $o {}
  62. puts stderr [format " -%-15s %s (default \"%s\")" "$a VAL" $c $b]
  63. } elseif {[llength $o]==2} {
  64. foreach {a b} $o {}
  65. puts stderr [format " -%-15s %s" $a $b]
  66. }
  67. }
  68. puts stderr ""
  69. puts stderr $E
  70. exit -1
  71. }
  72. proc process {avar lArgs O E} {
  73. upvar $avar A
  74. set zTrailing "" ;# True if ... is present in $O
  75. set lPosargs [list]
  76. # Populate A() with default values. Also, for each switch in the command
  77. # line spec, set an entry in the idx() array as follows:
  78. #
  79. # {tblname t1 "table name to use"}
  80. # -> [set idx(-tblname) {tblname t1 "table name to use"}
  81. #
  82. # For each position parameter, append its name to $lPosargs. If the ...
  83. # specifier is present, set $zTrailing to the name of the prefix.
  84. #
  85. foreach o $O {
  86. set nm [lindex $o 0]
  87. set nArg [llength $o]
  88. switch -- $nArg {
  89. 1 {
  90. if {[string range $nm end-2 end]=="..."} {
  91. set zTrailing [string range $nm 0 end-3]
  92. } else {
  93. lappend lPosargs $nm
  94. }
  95. }
  96. 2 {
  97. set A($nm) 0
  98. set idx(-$nm) $o
  99. }
  100. 3 {
  101. set A($nm) [lindex $o 1]
  102. set idx(-$nm) $o
  103. }
  104. default {
  105. error "Error in command line specification"
  106. }
  107. }
  108. }
  109. # Set explicitly specified option values
  110. #
  111. set nArg [llength $lArgs]
  112. for {set i 0} {$i < $nArg} {incr i} {
  113. set opt [lindex $lArgs $i]
  114. if {[string range $opt 0 0]!="-" || $opt=="--"} break
  115. set c [array names idx "${opt}*"]
  116. if {[llength $c]==0} { cmdline_error $O $E "Unrecognized option: $opt"}
  117. if {[llength $c]>1} { cmdline_error $O $E "Ambiguous option: $opt"}
  118. if {[llength $idx($c)]==3} {
  119. if {$i==[llength $lArgs]-1} {
  120. cmdline_error $O $E "Option requires argument: $c"
  121. }
  122. incr i
  123. set A([lindex $idx($c) 0]) [lindex $lArgs $i]
  124. } else {
  125. set A([lindex $idx($c) 0]) 1
  126. }
  127. }
  128. # Deal with position arguments.
  129. #
  130. set nPosarg [llength $lPosargs]
  131. set nRem [expr $nArg - $i]
  132. if {$nRem < $nPosarg || ($zTrailing=="" && $nRem > $nPosarg)} {
  133. cmdline_error $O $E
  134. }
  135. for {set j 0} {$j < $nPosarg} {incr j} {
  136. set A([lindex $lPosargs $j]) [lindex $lArgs [expr $j+$i]]
  137. }
  138. if {$zTrailing!=""} {
  139. set A($zTrailing) [lrange $lArgs [expr $j+$i] end]
  140. }
  141. }
  142. } ;# namespace eval cmdline
  143. # End of command line options processor.
  144. ###########################################################################
  145. ###########################################################################
  146. process_cmdline
  147. # If -fts4 was specified, use fts4. Otherwise, fts5.
  148. if {$A(fts4)} {
  149. set A(fts) fts4
  150. } else {
  151. set A(fts) fts5
  152. }
  153. sqlite3 db $A(database)
  154. # Create the FTS table in the db. Return a list of the table columns.
  155. #
  156. proc create_table {} {
  157. global A
  158. set cols [list a b c d e f g h i j k l m n o p q r s t u v w x y z]
  159. set nCol [llength $A(colsize)]
  160. set cols [lrange $cols 0 [expr $nCol-1]]
  161. set sql "CREATE VIRTUAL TABLE IF NOT EXISTS $A(tblname) USING $A(fts) ("
  162. append sql [join $cols ,]
  163. if {$A(fts)=="fts5"} { append sql ",detail=$A(detail)" }
  164. if {$A(trigram)} { append sql ",tokenize=trigram" }
  165. append sql ", prefix='$A(prefix)');"
  166. db eval $sql
  167. return $cols
  168. }
  169. # Return a list of tokens from the named file.
  170. #
  171. proc readfile {file} {
  172. set fd [open $file]
  173. set data [read $fd]
  174. close $fd
  175. split $data
  176. }
  177. proc repeat {L n} {
  178. set res [list]
  179. for {set i 0} {$i < $n} {incr i} {
  180. set res [concat $res $L]
  181. }
  182. set res
  183. }
  184. # Load all the data into a big list of tokens.
  185. #
  186. set tokens [list]
  187. foreach f $A(file) {
  188. set tokens [concat $tokens [repeat [readfile $f] $A(repeat)]]
  189. }
  190. set N [llength $tokens]
  191. set i 0
  192. set cols [create_table]
  193. set sql "INSERT INTO $A(tblname) VALUES(\$R([lindex $cols 0])"
  194. foreach c [lrange $cols 1 end] {
  195. append sql ", \$R($c)"
  196. }
  197. append sql ")"
  198. if {$A(trans)} { db eval BEGIN }
  199. while {$i < $N} {
  200. foreach c $cols s $A(colsize) {
  201. set R($c) [lrange $tokens $i [expr $i+$s-1]]
  202. incr i $s
  203. }
  204. db eval $sql
  205. }
  206. if {$A(trans)} { db eval COMMIT }