uhferret.rb 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442
  1. # This file is part of uhferret.
  2. #
  3. # Author:: Peter Lane
  4. # Copyright:: Copyright 2011-2020, Peter Lane.
  5. # License:: GPLv3
  6. #
  7. # uhferret is free software: you can redistribute it and/or modify
  8. # it under the terms of the GNU General Public License as published by
  9. # the Free Software Foundation, either version 3 of the License, or
  10. # (at your option) any later version.
  11. #
  12. # uhferret is distributed in the hope that it will be useful,
  13. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. # GNU General Public License for more details.
  16. #
  17. # You should have received a copy of the GNU General Public License
  18. # along with uhferret. If not, see <http://www.gnu.org/licenses/>.
  require 'uri'
  require 'uhferret_lib'
  require 'utils'
  21. module UHFerret
  # Document type for natural-language documents; pass it as the +type+
  # argument of Ferret#add or Ferret#add_list_from_file.
  TextDocument = Uhferret_lib::Document::TypeText
  # Document type for computer programs (c-style source code); pass it as
  # the +type+ argument of Ferret#add or Ferret#add_list_from_file.
  CodeDocument = Uhferret_lib::Document::TypeCode
  26. # UHFerret::Ferret holds a reference to a list of documents, and
  27. # provides methods to manage this list of documents, compute and
  28. # retrieve similarities between documents.
  29. class Ferret
  # Constructs an instance of Ferret with a new, empty document list.
  # block:: optional block, evaluated in the context of the new instance
  #         (via instance_eval), used to add files etc during construction.
  #
  # The bookkeeping state is initialised *before* the block is evaluated, so
  # a block which adds documents and then calls +run+ leaves the instance
  # correctly marked as having been run.
  def initialize(&block)
    @ferret = Uhferret_lib::DocumentList.new
    @ferret_run = false
    @sorted_pairs = []
    instance_eval(&block) if block
  end
  # Add given filename to list of documents.
  # filename:: path of the document to add
  # type::     UHFerret::TextDocument for natural language documents, or
  #            UHFerret::CodeDocument for c-style computer programs
  # id::       optional group_id for this document; when 0 (the default)
  #            a new group id is requested from the document list
  #
  # The group_id can be used to suppress comparisons in some kinds
  # of output.
  # - If a pdf or word-processed document is added, it must first
  #   be converted to text. Ferret tries to do this through Utils, and the
  #   path returned by the conversion is the one that gets added.
  #
  # Adding a document marks the instance as needing to be 'run' again.
  def add(filename, type = TextDocument, id = 0)
    filename =
      if Utils.is_pdf_document?(filename)
        Utils.convert_pdf_document(filename)
      elsif Utils.is_wp_document?(filename)
        Utils.convert_wp_document(filename)
      else
        filename
      end
    group_id = id.zero? ? @ferret.GetNewGroupId : id
    @ferret.AddDocument(filename, type, group_id)
    @ferret_run = false
  end
  # Add list of files specified in given filename, one path per line.
  # filename:: path of the list file
  # type::     UHFerret::TextDocument for natural language documents, or
  #            UHFerret::CodeDocument for c-style computer programs
  #
  # Lines are stripped of surrounding whitespace. Paths between a
  # "START GROUP" line and an "END GROUP" line (case-insensitive) share one
  # group id; paths outside a group each get their own. Lines which do not
  # name a readable file are skipped.
  #
  # The list is read with File.foreach rather than IO.foreach, so a
  # filename beginning with "|" is treated as a path and never run as a
  # command.
  def add_list_from_file(filename, type = TextDocument)
    group_id = 0 # 0 makes #add allocate a fresh group id for the document
    File.foreach(filename) do |line|
      entry = line.strip
      case entry.upcase
      when "START GROUP"
        group_id = @ferret.GetNewGroupId
      when "END GROUP"
        group_id = 0
      else
        add(entry, type, group_id) if File.readable?(entry)
      end
    end
    @ferret_run = false
  end
  # Run ferret on the current document list.
  # This must be done before retrieving measures of containment or
  # resemblance. A successful run marks the instance as run and discards
  # any previously sorted list of document pairs.
  #
  # Raises an ArgumentError if there are not at least two documents in the
  # document list.
  def run
    raise ArgumentError, "UHFerret needs at least two documents to run" if @ferret.Size < 2

    @ferret.RunFerret
    @ferret_run = true
    @sorted_pairs = []
  end
  # Look up the document stored at the given position of the document list.
  # index:: position of the document in the list
  #
  # Raises an IndexError if index is not valid.
  def [](index)
    check_index(index)
    @ferret.getDocument(index)
  end
  # Apply provided block to each document in the document list, in index
  # order.
  #
  # When called without a block, returns an Enumerator over the documents
  # instead of raising a LocalJumpError.
  def each
    return to_enum(:each) { @ferret.Size } unless block_given?

    @ferret.Size.times { |i| yield @ferret.getDocument(i) }
  end
  # Return how many documents are currently held in the document list.
  def size
    @ferret.Size()
  end
  # Return the number of pairs of documents compared, as reported by the
  # underlying document list.
  def num_pairs
    @ferret.NumberOfPairs()
  end
  # Apply provided block to each pair of compared document indices (i, j)
  # with i < j, in descending order of resemblance. Pairs with equal
  # resemblance are yielded in ascending index order.
  #
  # The sorted list is built on first use after a 'run' and then reused;
  # the resemblance of each pair is computed only once while sorting.
  # When called without a block, returns an Enumerator over the pairs.
  #
  # Raises an ArgumentError if ferret has not been 'run' before.
  def each_pair
    check_ferret_has_run :each_pair
    return to_enum(:each_pair) unless block_given?

    if @sorted_pairs.nil? || @sorted_pairs.empty?
      # every pair of distinct document indices, lower index first
      all_pairs = (0...@ferret.Size).to_a.combination(2).to_a
      # negated resemblance gives descending order; indices break ties
      @sorted_pairs = all_pairs.sort_by do |i, j|
        [-@ferret.ComputeResemblance(i, j), i, j]
      end
    end
    @sorted_pairs.each { |i, j| yield(i, j) }
  end
  # Return the containment of doc_1 in doc_2.
  # doc_1:: index of the document whose containment is measured
  # doc_2:: index of the document it is measured against
  #
  # Raises an ArgumentError if ferret has not been 'run' before, and
  # an IndexError if the document indices are not valid.
  def containment(doc_1, doc_2)
    check_ferret_has_run(:containment)
    [doc_1, doc_2].each { |index| check_index(index) }
    @ferret.ComputeContainment(doc_1, doc_2)
  end
  # Return the resemblance of doc_1 and doc_2.
  # A document compared with itself has resemblance 1.0; otherwise the
  # underlying library is queried with the smaller index first.
  #
  # Raises an ArgumentError if ferret has not been 'run' before, and
  # an IndexError if the document indices are not valid.
  def resemblance(doc_1, doc_2)
    check_ferret_has_run(:resemblance)
    check_index(doc_1)
    check_index(doc_2)
    return 1.0 if doc_1 == doc_2

    lower, upper = [doc_1, doc_2].minmax
    @ferret.ComputeResemblance(lower, upper)
  end
  # Return the number of trigrams in the document at the given index.
  # index:: position of the document in the document list
  #
  # Raises an ArgumentError if ferret has not been 'run' before, and
  # an IndexError if the document index is not valid.
  def trigram_count(index)
    check_ferret_has_run(:trigram_count)
    check_index(index)
    @ferret.CountTrigrams(index)
  end
  # Return the total number of distinct trigrams found across the whole
  # set of documents.
  #
  # Raises an ArgumentError if ferret has not been 'run' before calling.
  def distinct_trigrams_count
    check_ferret_has_run(:distinct_trigrams_count)
    @ferret.GetTotalTrigramCount()
  end
  # Return the number of trigrams which the two given documents have in
  # common.
  # doc_1:: index of the first document
  # doc_2:: index of the second document
  #
  # Raises an ArgumentError if ferret has not been 'run' before, and
  # an IndexError if the document indices are not valid.
  def trigram_matches(doc_1, doc_2)
    check_ferret_has_run(:trigram_matches)
    [doc_1, doc_2].each { |index| check_index(index) }
    @ferret.CountMatches(doc_1, doc_2)
  end
  # Write an XML report comparing the given two document indices into the
  # given filename (the file is created or overwritten).
  # output_file:: path of the report to write
  # doc_1::       index of the first document
  # doc_2::       index of the second document
  #
  # The report holds the number of common trigrams, the resemblance, and
  # one <document> section per document (doc_1 first, then doc_2).
  #
  # Raises an ArgumentError if ferret has not been 'run' before, and
  # an IndexError if the document indices are not valid.
  def xml_output(output_file, doc_1, doc_2)
    check_ferret_has_run(:xml_output)
    [doc_1, doc_2].each { |index| check_index(index) }
    File.open(output_file, "w") do |report|
      report.puts '<?xml version="1.0" encoding="ISO-8859-1"?>',
                  '<?xml-stylesheet type="text/xsl" href="uhferret.xsl" ?>',
                  '<uhferret>'
      report.puts "<common-trigrams>#{trigram_matches(doc_1, doc_2)}</common-trigrams>"
      report.puts "<similarity>#{resemblance(doc_1, doc_2)}</similarity>"
      write_xml_document(report, doc_1, doc_2)
      write_xml_document(report, doc_2, doc_1)
      report.puts '</uhferret>'
    end
  end
  # Prints, on standard output, the number of documents, the number of
  # distinct trigrams, and then one line per pair of documents, sorted in
  # order of similarity. Pairs whose documents share a group id are left out.
  # full_path:: when true, documents are shown by full pathname instead of
  #             by filename
  #
  # Each line has the form:
  #   name_1 ; name_2 ; matching trigrams ; trigrams in 1 ; trigrams in 2 ; resemblance
  def output_similarity_table(full_path = false)
    puts "Number of documents: #{size}"
    puts "Number of distinct trigrams: #{distinct_trigrams_count}"
    each_pair do |i, j|
      next if self[i].group_id == self[j].group_id

      names = full_path ? [self[i].pathname, self[j].pathname] : [self[i].filename, self[j].filename]
      fields = names + [trigram_matches(i, j), trigram_count(i), trigram_count(j), resemblance(i, j)]
      puts fields.join(" ; ")
    end
  end
  # Outputs the similarity table as a html page on standard output, sorted
  # in order of similarity. Pairs whose documents share a group id are left
  # out, and at most MAX_TABLE_SIZE rows are written.
  #
  # Each row links to /ferret/report, passing the current working directory
  # and the two pathnames as query parameters. The parameter values are
  # URL-encoded so that paths containing spaces, '&', '"' etc produce a
  # working link and well-formed markup.
  #
  # NOTE(review): MAX_TABLE_SIZE is not defined in this file; it is
  # presumably supplied by the code which loads it -- confirm.
  def output_html_similarity_table
    puts '<html><body>',
         '<h1>Ferret: Table of Comparisons</h1>',
         '<p>Return to <a href="/ferret/home">Ferret home page</a>.</p>',
         '<table border=1><tbody><tr><th>Index</th><th>Document 1</th><th>Document 2</th><th>Similarity</th><th>View</th></tr>'
    idx = 0
    each_pair do |i, j|
      next if self[i].group_id == self[j].group_id

      idx += 1
      break if idx > MAX_TABLE_SIZE

      query = [
        "upload=#{URI.encode_www_form_component(Dir.pwd)}",
        "file1=#{URI.encode_www_form_component(self[i].pathname)}",
        "file2=#{URI.encode_www_form_component(self[j].pathname)}"
      ].join("&")
      puts '<tr>',
           "<td> #{idx} </td>",
           "<td> #{format_file(self[i].pathname)} </td>",
           "<td> #{format_file(self[j].pathname)} </td>",
           "<td> #{format('%0.3f', resemblance(i, j))} </td>",
           "<td><a href=\"/ferret/report?#{query}\" target=\"_blank\">View</a></td>",
           '</tr>'
    end
    # the table is not inside a paragraph, so no closing </p> is written
    puts '</tbody></table>'
    puts '<hr>',
         '<p>Return to <a href="/ferret/home">Ferret home page.</a>',
         '<hr><font size=-1>Generated by Ferret, Copyright 2012 University of Hertfordshire</font>',
         '</body></html>'
  end
  # Outputs, on standard output, a list of trigrams with the document
  # indices in which they appear; indices are space separated. Each line
  # has the form:
  #   <trigram> FILES:[ <index> <index> ...  ]
  #
  # Ordinary errors (StandardError) raised while walking the tuple set are
  # reported on standard output instead of being raised. Interrupts, exit
  # requests and other non-standard exceptions are left to propagate.
  def output_trigram_list
    tuples = @ferret.GetTupleSet
    tuples.Begin
    while tuples.HasMore
      trigram = @ferret.MakeTrigramString(tuples.GetToken(0),
                                          tuples.GetToken(1),
                                          tuples.GetToken(2))
      doc_indices = tuples.GetDocumentsForCurrentTuple
      # index access is kept because the returned collection is only known
      # to respond to size and []
      listing = (0...doc_indices.size).map { |i| "#{doc_indices[i]} " }.join
      puts "#{trigram} FILES:[ #{listing} ]"
      tuples.GetNext
    end
  rescue StandardError => ex
    puts "Error in writing trigram list: #{ex}"
  end
  # Outputs, on standard output, a comma-separated table of all
  # comparisons, suitable for loading into a spreadsheet. The heading line
  # lists every filename; each following line starts with a filename and
  # gives its resemblance to every document in the list (itself included).
  def output_all_comparisons
    doc_count = size
    # -- output headings
    doc_count.times { |column| print ", #{self[column].filename}" }
    puts
    # -- output comparisons
    doc_count.times do |row|
      print self[row].filename
      doc_count.times { |column| print ", #{resemblance(row, column)}" }
      puts
    end
  end
  293. private
  # Return dir relative to the current working directory.
  # dir:: a directory path
  #
  # * a directory below the current working directory has the working
  #   directory and the following separator removed
  # * the current working directory itself gives an empty string
  # * any other path (relative, or outside the working directory) is
  #   returned unchanged, rather than being cut at an arbitrary position
  def rm_cwd(dir)
    cwd = Dir.pwd
    return "" if dir == cwd

    prefix = cwd.end_with?("/") ? cwd : "#{cwd}/"
    dir.start_with?(prefix) ? dir[prefix.length..-1] : dir
  end
  297. private
  # Return html for displaying the given file: its directory, made relative
  # to the current working directory by rm_cwd, followed by the filename in
  # bold.
  # file:: pathname of the file
  #
  # When there is no directory part to show (e.g. the file is directly in
  # the current working directory) only the bold filename is returned.
  def format_file(file)
    directory = rm_cwd(File.dirname(file)).to_s
    name = "<b>#{File.basename(file)}</b>"
    directory.empty? ? name : "#{directory}/#{name}"
  end
  301. private
  # Write the <document> section of an XML report for document doc_1, as
  # compared against document doc_2.
  # out::   IO-like object the report is written to
  # doc_1:: index of the document being written
  # doc_2:: index of the document it is compared against
  #
  # The section holds the (XML-escaped) pathname, the trigram count, the
  # containment of doc_1 in doc_2, and the document text divided into
  # <block text="copied"> and <block text="normal"> elements. The text of
  # each block is in a CDATA section; any "]]>" in the text is split across
  # two CDATA sections so that it cannot end the section early.
  def write_xml_document(out, doc_1, doc_2)
    source_document = self[doc_1]
    escaped_path = source_document.pathname.
      gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;")
    # -- output header
    out.puts "<document>"
    out.puts "<source>#{escaped_path}</source>"
    out.puts "<num-trigrams>#{trigram_count(doc_1)}</num-trigrams>"
    out.puts "<containment>#{containment(doc_1, doc_2)}</containment>"
    out.puts "<text>"
    # -- output document itself
    xml_text_blocks(source_document, doc_1, doc_2).each do |kind, text|
      cdata = text.gsub("]]>", "]]]]><![CDATA[>")
      out.print "<block text=\"#{kind}\"><![CDATA[#{cdata}]]></block>"
    end
    # -- output footer
    out.puts "</text>"
    out.puts "</document>"
  end

  # Divide the text of source_document into consecutive blocks, returned as
  # a list of [kind, text] pairs in document order, where kind is "copied"
  # (text covered by trigrams also found in doc_2) or "normal".
  # The document's input is always closed again, even if reading fails.
  def xml_text_blocks(source_document, doc_1, doc_2)
    # File.read never treats a pathname starting with "|" as a command
    source_text = File.read(source_document.pathname)
    blocks = [] # pairs of [kind, list of text chunks]
    append = lambda do |kind, chunk|
      blocks << [kind, []] if blocks.empty? || blocks.last[0] != kind
      blocks.last[1] << chunk
    end
    last_written = 0 # offset in source_text up to which text has been used
    source_document.StartInput(@ferret.GetTokenSet)
    begin
      while source_document.ReadTrigram(@ferret.GetTokenSet)
        if @ferret.IsMatchingTrigram(
          source_document.GetToken(0),
          source_document.GetToken(1),
          source_document.GetToken(2),
          doc_1,
          doc_2
        )
          # a matching trigram is copied text, up to the end of the trigram
          upto = source_document.GetTrigramEnd
          append.call("copied", source_text[last_written, upto - last_written])
          last_written = upto
        else
          # otherwise only the text before GetTrigramStart(1) is known to be
          # normal: the rest may still be part of a later matching trigram
          upto = source_document.GetTrigramStart(1)
          if last_written < upto
            append.call("normal", source_text[last_written, upto - last_written])
            last_written = upto
          end
        end
      end
    ensure
      source_document.CloseInput
    end
    # whatever follows the last trigram is normal text; this also covers a
    # document from which no trigrams at all were read
    append.call("normal", source_text[last_written..-1]) if last_written < source_text.length
    blocks.map { |kind, chunks| [kind, chunks.join] }
  end
  363. private
  # Verify that index is a valid position in the document list, i.e. that
  # it lies in the range [0, number of documents).
  #
  # Raises an IndexError if it does not.
  def check_index(index)
    return if index >= 0 && index < @ferret.Size

    raise IndexError, "Index #{index} not in range [0, #{@ferret.Size})"
  end
  # Verify that ferret has been 'run' since the document list last changed.
  # method:: name of the calling method, used in the error message
  #
  # Raises an ArgumentError if it has not.
  def check_ferret_has_run(method)
    return if @ferret_run

    raise ArgumentError, "UHFerret must be 'run' before #{method} can be calculated."
  end
  374. end
  # Reopen the native document class to add ruby-style accessors, which
  # wrap the native Get* methods.
  class Uhferret_lib::Document
    # Return the filename for this document, i.e. the last component of
    # its pathname.
    def filename
      File.basename(GetPathname())
    end

    # Return the full pathname for this document.
    def pathname
      GetPathname()
    end

    # Return the group id for this document.
    def group_id
      GetGroupId()
    end
  end
  390. end