123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442 |
- # This file is part of uhferret.
- #
- # Author:: Peter Lane
- # Copyright:: Copyright 2011-2020, Peter Lane.
- # License:: GPLv3
- #
- # uhferret is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # uhferret is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with uhferret. If not, see <http://www.gnu.org/licenses/>.
require 'uri'

require 'uhferret_lib'
require 'utils'
- module UHFerret
- # Constant to indicate document is a natural-language document.
- TextDocument = Uhferret_lib::Document::TypeText
- # Constant to indicate document is a computer program.
- CodeDocument = Uhferret_lib::Document::TypeCode
- # UHFerret::Ferret holds a reference to a list of documents, and
- # provides methods to manage this list of documents, compute and
- # retrieve similarities between documents.
- class Ferret
- # Constructs an instance of Ferret.
- # block:: optional block is used to add files etc during construction.
- def initialize &block
- @ferret = Uhferret_lib::DocumentList.new
- self.instance_eval(&block) if block_given?
- @ferret_run = false
- end
- # Add given filename to list of documents.
- # The type of document can be given as:
- # * UHFerret::TextDocument, for natural language documents
- # * UHFerret::CodeDocument, for c-style computer programs
- # Option third argument specifies the group_id for this document.
- # The group_id can be used to suppress comparisons in some kinds
- # of output.
- # - If a pdf or word-processed document is added, it must first
- # be converted to text. Ferret tries to do this, attaching .txt
- # to the end of the filename.
- def add(filename, type = TextDocument, id = 0)
- if Utils.is_pdf_document?(filename)
- filename = Utils.convert_pdf_document filename
- elsif Utils.is_wp_document?(filename)
- filename = Utils.convert_wp_document filename
- end
- @ferret.AddDocument(filename, type, (id.zero? ? @ferret.GetNewGroupId : id))
- @ferret_run = false
- end
- # Add list of files specified in given filename
- # The type of documents can be given as:
- # * UHFerret::TextDocument, for natural language documents
- # * UHFerret::CodeDocument, for c-style computer programs
- def add_list_from_file(filename, type = TextDocument)
- within_group = false
- current_id = 0
- IO.foreach(filename) do |line|
- line.strip!
- if line.upcase == "START GROUP"
- within_group = true
- current_id = @ferret.GetNewGroupId
- elsif line.upcase == "END GROUP"
- within_group = false
- elsif File.readable? line
- add(line, type, (within_group ? current_id : 0))
- end
- end
- @ferret_run = false
- end
- # Run ferret on the current document list.
- # You must run ferret before retrieving measures of containment or resemblance.
- #
- # Raises an ArgumentError if there are not at least two documents in the document
- # list.
- def run
- if @ferret.Size >= 2
- @ferret.RunFerret
- @ferret_run = true
- @sorted_pairs = []
- else
- raise ArgumentError.new("UHFerret needs at least two documents to run")
- end
- end
- # Return document in document list at given index position.
- #
- # Raises an IndexError if index is not valid.
- def [](index)
- check_index index
- @ferret.getDocument index
- end
- # Apply provided block to each document in the document list.
- def each
- @ferret.Size.times do |i|
- yield @ferret.getDocument(i)
- end
- end
- # Return the number of documents in the document list.
- def size
- @ferret.Size
- end
- # Return the number of pairs of documents compared.
- def num_pairs
- @ferret.NumberOfPairs
- end
- # Apply provided block to each pair of compared document indices,
- # in descending order of resemblance.
- #
- # Raises an ArgumentError if ferret has not been 'run' before.
- def each_pair
- check_ferret_has_run :each_pair
- if @sorted_pairs == []
- # extract all valid document pairs
- @ferret.Size.times do |i|
- (i+1).upto(@ferret.Size-1) do |j|
- @sorted_pairs << [i, j]
- end
- end
- # sort into descending order of resemblance
- @sorted_pairs.sort! do |pair_a, pair_b|
- @ferret.ComputeResemblance(pair_b[0], pair_b[1]) <=>
- @ferret.ComputeResemblance(pair_a[0], pair_a[1])
- end
- end
- # apply block to each pair in sorted order
- @sorted_pairs.each do |pair|
- yield(pair[0], pair[1])
- end
- end
- # Return the containment of doc_1 in doc_2.
- #
- # Raises an ArgumentError if ferret has not been 'run' before, and
- # an IndexError if the document indices are not valid.
- def containment(doc_1, doc_2)
- check_ferret_has_run :containment
- check_index doc_1
- check_index doc_2
- @ferret.ComputeContainment(doc_1, doc_2)
- end
- # Return the resemblance of doc_1 and doc_2.
- #
- # Raises an ArgumentError if ferret has not been 'run' before, and
- # an IndexError if the document indices are not valid.
- def resemblance(doc_1, doc_2)
- check_ferret_has_run :resemblance
- check_index doc_1
- check_index doc_2
- if doc_1 == doc_2
- return 1.0
- else
- @ferret.ComputeResemblance([doc_1, doc_2].min, [doc_1, doc_2].max)
- end
- end
- # Return the number of trigrams in given document index.
- #
- # Raises an ArgumentError if ferret has not been 'run' before, and
- # an IndexError if the document index is not valid.
- def trigram_count index
- check_ferret_has_run :trigram_count
- check_index index
- @ferret.CountTrigrams index
- end
- # Return the total number of distinct trigrams in set of documents.
- #
- # Raises an ArgumentError if ferret has not been 'run' before calling.
- def distinct_trigrams_count
- check_ferret_has_run :distinct_trigrams_count
- @ferret.GetTotalTrigramCount
- end
- # Return the number of matching trigrams in given two document indices.
- #
- # Raises an ArgumentError if ferret has not been 'run' before, and
- # an IndexError if the document indices are not valid.
- def trigram_matches(doc_1, doc_2)
- check_ferret_has_run :trigram_matches
- check_index doc_1
- check_index doc_2
- @ferret.CountMatches(doc_1, doc_2)
- end
- # Write an XML report of the given two document indices into given filename.
- #
- # Raises an ArgumentError if ferret has not been 'run' before, and
- # an IndexError if the document indices are not valid.
- def xml_output(output_file, doc_1, doc_2)
- check_ferret_has_run :xml_output
- check_index doc_1
- check_index doc_2
- File.open(output_file, "w") do |file|
- file.puts "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>"
- file.puts "<?xml-stylesheet type=\"text/xsl\" href=\"uhferret.xsl\" ?>"
- file.puts "<uhferret>"
- file.puts "<common-trigrams>#{trigram_matches(doc_1, doc_2)}</common-trigrams>"
- file.puts "<similarity>#{resemblance(doc_1, doc_2)}</similarity>"
- write_xml_document(file, doc_1, doc_2)
- write_xml_document(file, doc_2, doc_1)
- file.puts "</uhferret>"
- end
- end
- # displays each pair of documents, sorted in order of similarity
- def output_similarity_table(full_path = false)
- puts "Number of documents: #{size}"
- puts "Number of distinct trigrams: #{distinct_trigrams_count}"
- each_pair do |i, j|
- unless self[i].group_id == self[j].group_id
- if full_path
- puts "#{self[i].pathname} ; #{self[j].pathname} ; \
- #{trigram_matches(i, j)} ; #{trigram_count(i)} ; #{trigram_count(j)} ; \
- #{resemblance(i, j)}"
- else
- puts "#{self[i].filename} ; #{self[j].filename} ; \
- #{trigram_matches(i, j)} ; #{trigram_count(i)} ; #{trigram_count(j)} ; \
- #{resemblance(i, j)}"
- end
- end
- end
- end
- # outputs similarity table as a html page, sorted in order of similarity
- def output_html_similarity_table
- puts <<BODY
- <html><body>
- <h1>Ferret: Table of Comparisons</h1>
- <p>Return to <a href="/ferret/home">Ferret home page</a>.</p>
- <table border=1><tbody><tr><th>Index</th><th>Document 1</th><th>Document 2</th><th>Similarity</th><th>View</th></tr>
- BODY
- idx = 0
- each_pair do |i, j|
- unless self[i].group_id == self[j].group_id
- idx += 1
- break if idx > MAX_TABLE_SIZE
- puts <<ROW
- <tr>
- <td> #{idx} </td>
- <td> #{format_file(self[i].pathname)} </td>
- <td> #{format_file(self[j].pathname)} </td>
- <td> #{format("%0.3f", resemblance(i, j))} </td>
- <td><a href="/ferret/report?upload=#{Dir.pwd}&file1=#{self[i].pathname}&file2=#{self[j].pathname}" target="_blank"\>View</a></td>
- </tr>
- ROW
- end
- end
- puts "</tbody></table></p>"
- puts <<TAIL
- <hr>
- <p>Return to <a href="/ferret/home">Ferret home page.</a>
- <hr><font size=-1>Generated by Ferret, Copyright 2012 University of Hertfordshire</font>
- </body></html>
- TAIL
- end
- # outputs a list of trigrams with the document indices in which they
- # appear, indices are space separated
- def output_trigram_list
- begin
- tuples = @ferret.GetTupleSet
- tuples.Begin
- while tuples.HasMore
- print @ferret.MakeTrigramString(tuples.GetToken(0),
- tuples.GetToken(1),
- tuples.GetToken(2))
- print " FILES:[ "
- doc_indices = tuples.GetDocumentsForCurrentTuple
- doc_indices.size.times do |i|
- print "#{doc_indices[i]} "
- end
- print " ]"
- puts
- tuples.GetNext
- end
- rescue Exception => ex
- puts "Error in writing trigram list: #{ex}"
- end
- end
- # outputs a table of all comparisons, suitable for loading into a spreadsheet
- def output_all_comparisons
- # -- output headings
- size.times do |i|
- print ", #{self[i].filename}"
- end
- puts
- # -- output comparisons
- size.times do |i|
- print self[i].filename
- size.times do |j|
- print ", #{resemblance(i, j)}"
- end
- puts
- end
- end
- private
- def rm_cwd dir
- dir[(Dir.pwd.length+1)..-1]
- end
- private
- def format_file file
- rm_cwd(File.dirname(file)) + "/<b>" + File.basename(file) + "</b>"
- end
- private
- def write_xml_document(out, doc_1, doc_2)
- # -- output header
- out.puts "<document>"
- out.puts "<source>#{self[doc_1].pathname}</source>"
- out.puts "<num-trigrams>#{self.trigram_count(doc_1)}</num-trigrams>"
- out.puts "<containment>#{self.containment(doc_1, doc_2)}</containment>"
- out.puts "<text>"
- # -- output document itself
- source_text = IO.readlines(self[doc_1].pathname).join
- source_document = self[doc_1]
- source_document.StartInput(@ferret.GetTokenSet)
- last_written = 0
- inside_block = false
- while source_document.ReadTrigram(@ferret.GetTokenSet)
- if @ferret.IsMatchingTrigram(
- source_document.GetToken(0),
- source_document.GetToken(1),
- source_document.GetToken(2),
- doc_1,
- doc_2
- )
- unless inside_block
- if last_written > 0
- out.print "]]></block>" # end the last block
- end
- out.print "<block text=\"copied\"><![CDATA[" # start copied block
- inside_block = true
- end
- out.print source_text[last_written, source_document.GetTrigramEnd - last_written]
- last_written = source_document.GetTrigramEnd
- else
- if last_written < source_document.GetTrigramStart(1)
- if inside_block or last_written.zero? # moving from inside block to not
- if last_written > 0
- out.print "]]></block>" # end the last block
- end
- out.print "<block text=\"normal\"><![CDATA[" # start normal block
- inside_block = false
- end
- out.print source_text[last_written, source_document.GetTrigramStart(1) - last_written]
- last_written = source_document.GetTrigramStart(1)
- end
- end
- end
- if last_written < source_text.length
- if inside_block
- out.print "]]></block>" # end the last block
- inside_block = false
- out.print "<block text=\"normal\"><![CDATA[" # start normal block
- end
- out.print source_text[last_written..-1] # finish printing whole of source
- end
- unless last_written.zero? # i.e. nothing has been written
- out.print "]]></block>" # end the last block
- end
- # -- output footer
- out.puts "</text>"
- out.puts "</document>"
- # -- close up document
- source_document.CloseInput
- end
- private
- def check_index index
- unless index >= 0 and index < @ferret.Size
- raise IndexError.new("Index #{index} not in range [0, #{@ferret.Size})")
- end
- end
- def check_ferret_has_run method
- unless @ferret_run
- raise ArgumentError.new("UHFerret must be 'run' before #{method} can be calculated.")
- end
- end
- end
- # Extend the native class with some convenience methods.
- class Uhferret_lib::Document
- # Return the filename for this document.
- def filename
- File.basename(self.GetPathname)
- end
- # Return the full pathname for this document.
- def pathname
- self.GetPathname
- end
- # Return the id for this document.
- def group_id
- self.GetGroupId
- end
- end
- end
|