# peterlane/uhferret-gem
#
# This file is part of uhferret.
#
# Author::    Peter Lane
# Copyright:: Copyright 2011-2020, Peter Lane.
# License::   GPLv3
#
# uhferret is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# uhferret is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with uhferret.  If not, see <http://www.gnu.org/licenses/>.

require 'uhferret_lib'
require 'utils'

module UHFerret

  # Constant to indicate document is a natural-language document.
  # Aliases Uhferret_lib::Document::TypeText; pass it as the +type+
  # argument of Ferret#add or Ferret#add_list_from_file.
  TextDocument = Uhferret_lib::Document::TypeText

  # Constant to indicate document is a computer program.
  # Aliases Uhferret_lib::Document::TypeCode; pass it as the +type+
  # argument of Ferret#add or Ferret#add_list_from_file.
  CodeDocument = Uhferret_lib::Document::TypeCode

  # UHFerret::Ferret holds a reference to a list of documents, and 
  # provides methods to manage this list of documents, compute and  
  # retrieve similarities between documents.
  class Ferret

    # Constructs an instance of Ferret.
    # block:: optional block, evaluated in the context of the new instance
    #         (via instance_eval), used to add files etc during construction.
    #
    # The run state is initialised *before* the block is evaluated, so a
    # block which itself calls #run leaves the instance ready for queries.
    # (Resetting the flag after the block would discard that state.)
    def initialize(&block)
      @ferret = Uhferret_lib::DocumentList.new
      @ferret_run = false
      @sorted_pairs = []
      instance_eval(&block) if block
    end

    # Appends a single document to the document list.
    #
    # filename:: path of the document to add.
    # type::     UHFerret::TextDocument (natural language, the default) or
    #            UHFerret::CodeDocument (c-style computer programs).
    # id::       optional group_id for this document; 0 (the default) asks
    #            the document list for a new group id.  The group_id can be
    #            used to suppress comparisons in some kinds of output.
    #
    # If Utils reports the file as a pdf or word-processed document it is
    # converted first, and the filename returned by the conversion is the
    # one which gets added.  Adding a document marks ferret as not yet run.
    def add(filename, type = TextDocument, id = 0)
      source =
        if Utils.is_pdf_document?(filename)
          Utils.convert_pdf_document(filename)
        elsif Utils.is_wp_document?(filename)
          Utils.convert_wp_document(filename)
        else
          filename
        end
      group = id.zero? ? @ferret.GetNewGroupId : id

      @ferret.AddDocument(source, type, group)
      @ferret_run = false
    end

    # Add the documents listed, one per line, in the given filename.
    #
    # filename:: path of a text file naming one document per line.
    # type::     UHFerret::TextDocument (natural language, the default) or
    #            UHFerret::CodeDocument (c-style computer programs); applied
    #            to every document in the list.
    #
    # Lines are stripped of surrounding whitespace.  A line reading
    # "START GROUP" (any letter case) opens a group: documents up to the
    # next "END GROUP" line share one freshly allocated group id.  Documents
    # outside a group are added with id 0, so #add gives each its own id.
    # Any other line which does not name a readable file is skipped.
    def add_list_from_file(filename, type = TextDocument)
      group_id = nil # id shared by documents of the open group, nil outside

      # File.foreach, unlike IO.foreach, never treats a filename starting
      # with '|' as a command to execute.
      File.foreach(filename) do |line|
        entry = line.strip
        case entry.upcase
        when "START GROUP"
          group_id = @ferret.GetNewGroupId
        when "END GROUP"
          group_id = nil
        else
          add(entry, type, group_id || 0) if File.readable?(entry)
        end
      end

      @ferret_run = false
    end

    # Runs ferret over the current document list.
    # This must be done before retrieving measures of containment or
    # resemblance; it also clears the cached ordering used by #each_pair.
    #
    # Raises an ArgumentError if the document list holds fewer than two
    # documents.
    def run
      raise ArgumentError, "UHFerret needs at least two documents to run" if @ferret.Size < 2

      @ferret.RunFerret
      @ferret_run = true
      @sorted_pairs = []
    end

    # Looks up the document held at the given position of the document list.
    #
    # Raises an IndexError if index is not valid.
    def [](index)
      check_index(index)
      @ferret.getDocument(index)
    end

    # Apply provided block to each document in the document list, in index
    # order.
    #
    # Returns the number of documents when called with a block.  When called
    # without a block, returns an Enumerator over the documents instead of
    # failing on the yield.
    def each
      return enum_for(:each) { @ferret.Size } unless block_given?

      @ferret.Size.times { |index| yield @ferret.getDocument(index) }
    end

    # Return the number of documents in the document list.
    # Delegates directly to the underlying document list's +Size+.
    def size
      @ferret.Size
    end

    # Return the number of pairs of documents compared.
    # Delegates directly to the underlying document list's +NumberOfPairs+;
    # unlike the similarity queries, no check is made that ferret has run.
    def num_pairs
      @ferret.NumberOfPairs
    end

    # Apply provided block to each pair of compared document indices,
    # in descending order of resemblance.  Pairs with equal resemblance are
    # yielded in ascending index order, so the ordering is deterministic.
    #
    # The ordering is computed on first use after #run and cached; the
    # resemblance of each pair is computed exactly once while sorting.
    # Without a block, an Enumerator over the pairs is returned.
    #
    # Raises an ArgumentError if ferret has not been 'run' before.
    def each_pair
      check_ferret_has_run :each_pair
      return enum_for(:each_pair) unless block_given?

      if @sorted_pairs.nil? || @sorted_pairs.empty?
        count = @ferret.Size
        # extract all valid document pairs (i < j)
        pairs = []
        count.times do |i|
          (i + 1).upto(count - 1) { |j| pairs << [i, j] }
        end
        # Negated score gives descending resemblance; the indices break ties.
        # The cache is only assigned once sorting has succeeded, so a failure
        # cannot leave a populated but unsorted list behind.
        @sorted_pairs = pairs.sort_by do |i, j|
          score = @ferret.ComputeResemblance(i, j)
          [-score, i, j]
        end
      end

      # apply block to each pair in sorted order
      @sorted_pairs.each { |i, j| yield(i, j) }
    end

    # Gives the containment of doc_1 in doc_2, as computed by the
    # underlying document list.
    #
    # Raises an ArgumentError if ferret has not been 'run' before, and
    # an IndexError if the document indices are not valid.
    def containment(doc_1, doc_2)
      check_ferret_has_run :containment
      [doc_1, doc_2].each { |index| check_index index }

      @ferret.ComputeContainment(doc_1, doc_2)
    end

    # Gives the resemblance of doc_1 and doc_2.
    # A document compared with itself is reported as 1.0 without consulting
    # the document list; otherwise the pair is passed on with the smaller
    # index first.
    #
    # Raises an ArgumentError if ferret has not been 'run' before, and
    # an IndexError if the document indices are not valid.
    def resemblance(doc_1, doc_2)
      check_ferret_has_run :resemblance
      [doc_1, doc_2].each { |index| check_index index }
      return 1.0 if doc_1 == doc_2

      lower, upper = [doc_1, doc_2].minmax
      @ferret.ComputeResemblance(lower, upper)
    end

    # Gives the number of trigrams counted for the document at the given
    # index.
    #
    # Raises an ArgumentError if ferret has not been 'run' before, and
    # an IndexError if the document index is not valid.
    def trigram_count(index)
      check_ferret_has_run(:trigram_count)
      check_index(index)

      @ferret.CountTrigrams(index)
    end

    # Gives the total number of distinct trigrams across the whole set of
    # documents, as reported by the underlying document list.
    #
    # Raises an ArgumentError if ferret has not been 'run' before calling.
    def distinct_trigrams_count
      check_ferret_has_run(:distinct_trigrams_count)

      @ferret.GetTotalTrigramCount
    end

    # Gives the number of trigrams which the two given documents have in
    # common.  The indices are passed on in the order given.
    #
    # Raises an ArgumentError if ferret has not been 'run' before, and
    # an IndexError if the document indices are not valid.
    def trigram_matches(doc_1, doc_2)
      check_ferret_has_run :trigram_matches
      [doc_1, doc_2].each { |index| check_index index }

      @ferret.CountMatches(doc_1, doc_2)
    end

    # Writes an XML report comparing the two given document indices into
    # output_file, replacing any existing file of that name.
    #
    # The report holds the number of common trigrams, the resemblance, and
    # one <document> section per document (doc_1 against doc_2, then doc_2
    # against doc_1), wrapped in a <uhferret> element.
    #
    # Raises an ArgumentError if ferret has not been 'run' before, and
    # an IndexError if the document indices are not valid.
    def xml_output(output_file, doc_1, doc_2)
      check_ferret_has_run :xml_output
      [doc_1, doc_2].each { |index| check_index index }

      File.open(output_file, "w") do |report|
        report.puts '<?xml version="1.0" encoding="ISO-8859-1"?>'
        report.puts '<?xml-stylesheet type="text/xsl" href="uhferret.xsl" ?>'
        report.puts "<uhferret>"

        report.puts "<common-trigrams>#{trigram_matches(doc_1, doc_2)}</common-trigrams>"
        report.puts "<similarity>#{resemblance(doc_1, doc_2)}</similarity>"
        [[doc_1, doc_2], [doc_2, doc_1]].each do |source, other|
          write_xml_document(report, source, other)
        end

        report.puts "</uhferret>"
      end
    end

    # Prints the document count, the distinct trigram count, and then one
    # line per pair of documents, in the order given by #each_pair (most
    # similar first).  Pairs whose documents share a group_id are left out.
    #
    # full_path:: when true each document is shown by its pathname,
    #             otherwise by its filename.
    #
    # Each line holds: name ; name ; matches ; count ; count ; resemblance.
    # The third and sixth fields are preceded by ten spaces of padding,
    # which is part of the established output format.
    def output_similarity_table(full_path = false)
      puts "Number of documents: #{size}"
      puts "Number of distinct trigrams: #{distinct_trigrams_count}"

      label = ->(doc) { full_path ? doc.pathname : doc.filename }
      pad = " " * 10

      each_pair do |i, j|
        next if self[i].group_id == self[j].group_id

        puts "#{label.call(self[i])} ; #{label.call(self[j])} ; " \
             "#{pad}#{trigram_matches(i, j)} ; #{trigram_count(i)} ; #{trigram_count(j)} ; " \
             "#{pad}#{resemblance(i, j)}"
      end
    end

    # Outputs the similarity table as a html page on standard output, with
    # rows in the order given by #each_pair (most similar first).
    #
    # Pairs whose documents share a group_id are skipped.  Each row shows a
    # running index, both documents (formatted by format_file), the
    # resemblance to three decimal places, and a link to the report page
    # for that pair.  Output stops once the running index exceeds
    # MAX_TABLE_SIZE.
    #
    # NOTE(review): MAX_TABLE_SIZE is not defined in this file -- presumably
    # it is provided elsewhere; confirm before relying on this method.
    # NOTE(review): pathnames and Dir.pwd are interpolated into the HTML and
    # into the link's query string without any escaping.
    def output_html_similarity_table
      # page header and start of table; heredoc text is emitted verbatim
      puts <<BODY
    <html><body>
    <h1>Ferret: Table of Comparisons</h1>
    <p>Return to <a href="/ferret/home">Ferret home page</a>.</p>
    <table border=1><tbody><tr><th>Index</th><th>Document 1</th><th>Document 2</th><th>Similarity</th><th>View</th></tr>
BODY
      idx = 0 # number of rows considered so far (1-based once incremented)
      each_pair do |i, j|
        unless self[i].group_id == self[j].group_id
          idx += 1
          # leaves each_pair early once the table is full
          break if idx > MAX_TABLE_SIZE

          puts <<ROW
        <tr>
        <td> #{idx} </td>
        <td> #{format_file(self[i].pathname)} </td>
        <td> #{format_file(self[j].pathname)} </td>
        <td> #{format("%0.3f", resemblance(i, j))} </td>
        <td><a href="/ferret/report?upload=#{Dir.pwd}&file1=#{self[i].pathname}&file2=#{self[j].pathname}" target="_blank"\>View</a></td>
        </tr>
ROW
        end
      end
      # NOTE(review): the trailing </p> has no matching <p> opened before
      # the table.
      puts "</tbody></table></p>"

      # page footer
      puts <<TAIL
    <hr>
    <p>Return to <a href="/ferret/home">Ferret home page.</a>
    <hr><font size=-1>Generated by Ferret, Copyright 2012 University of Hertfordshire</font>
    </body></html>
TAIL
    end

    # Outputs a list of trigrams with the document indices in which they
    # appear; indices are space separated.  One line is printed per tuple
    # in the document list's tuple set, in the form:
    #   <trigram>  FILES:[ <index> <index> ... ]
    #
    # Ordinary errors raised while walking the tuple set are reported on
    # standard output rather than propagated.  Only StandardError is
    # rescued, so interrupts and exit requests are not swallowed.
    def output_trigram_list
      tuples = @ferret.GetTupleSet
      tuples.Begin
      while tuples.HasMore
        print @ferret.MakeTrigramString(tuples.GetToken(0),
                                        tuples.GetToken(1),
                                        tuples.GetToken(2))
        print "  FILES:[ "
        doc_indices = tuples.GetDocumentsForCurrentTuple
        # indexed access only: the collection comes from the native library
        doc_indices.size.times do |i|
          print "#{doc_indices[i]} "
        end
        print " ]"
        puts
        tuples.GetNext
      end
    rescue StandardError => ex
      puts "Error in writing trigram list: #{ex}"
    end

    # Prints a square table of the resemblance between every pair of
    # documents, suitable for loading into a spreadsheet.
    #
    # The first line is a heading of document filenames, each preceded by
    # ", "; every following line starts with a document's filename and then
    # lists ", <resemblance>" against each document in index order.
    def output_all_comparisons
      # -- heading row
      (0...size).each { |col| print ", #{self[col].filename}" }
      puts
      # -- one row of comparisons per document
      size.times do |row|
        print self[row].filename
        (0...size).each { |col| print ", #{resemblance(row, col)}" }
        puts
      end
    end

    private
    # Returns dir expressed relative to the current working directory.
    #
    # * a directory below the working directory loses the working directory
    #   and the following "/" from its front;
    # * the working directory itself gives "" (not nil);
    # * a directory which is not below the working directory is returned
    #   unchanged, rather than having unrelated leading characters cut off.
    def rm_cwd(dir)
      cwd = Dir.pwd
      return "" if dir == cwd

      # avoid doubling the separator when the working directory is "/"
      prefix = cwd.end_with?("/") ? cwd : "#{cwd}/"
      dir.start_with?(prefix) ? dir[prefix.length..-1] : dir
    end

    private
    # Returns file formatted for the html table: its directory relative to
    # the working directory (as given by rm_cwd), a "/", and then its
    # basename in bold.
    #
    # When there is no relative directory to show (rm_cwd gives nil or an
    # empty string, e.g. for a file directly in the working directory) only
    # the bold basename is returned, instead of failing on nil.
    def format_file(file)
      relative_dir = rm_cwd(File.dirname(file)).to_s
      name = "<b>#{File.basename(file)}</b>"
      relative_dir.empty? ? name : "#{relative_dir}/#{name}"
    end


    private
    # Writes one <document> section of the XML report to out.
    #
    # out::   an open, writable stream (receives puts/print).
    # doc_1:: index of the document whose text is written out.
    # doc_2:: index of the document it is compared against.
    #
    # The section holds the source pathname, the trigram count, the
    # containment of doc_1 in doc_2, and the document text split into
    # <block text="copied"> and <block text="normal"> CDATA blocks, according
    # to whether each trigram read from doc_1 is reported by the document
    # list as matching between the two documents.
    #
    # NOTE(review): the text is emitted inside CDATA without any escaping, so
    # a source containing "]]>" would presumably break the XML -- confirm.
    # NOTE(review): GetTrigramStart / GetTrigramEnd are used as offsets into
    # the Ruby string; verify whether the native library reports characters
    # or bytes.
    # NOTE(review): CloseInput is not reached if anything above it raises.
    def write_xml_document(out, doc_1, doc_2)
      # -- output header
      out.puts "<document>"
      out.puts "<source>#{self[doc_1].pathname}</source>"
      out.puts "<num-trigrams>#{self.trigram_count(doc_1)}</num-trigrams>"
      out.puts "<containment>#{self.containment(doc_1, doc_2)}</containment>"
      out.puts "<text>"
      # -- output document itself
      source_text = IO.readlines(self[doc_1].pathname).join
      source_document = self[doc_1]
      source_document.StartInput(@ferret.GetTokenSet)
      last_written = 0      # offset into source_text up to which text has been emitted
      inside_block = false  # true while the open block is a "copied" block
      while source_document.ReadTrigram(@ferret.GetTokenSet)
        if @ferret.IsMatchingTrigram(
            source_document.GetToken(0),
            source_document.GetToken(1),
            source_document.GetToken(2),
            doc_1,
            doc_2
        )
          # matching trigram: make sure a "copied" block is open, then emit
          # everything up to the end of this trigram
          unless inside_block
            if last_written > 0
              out.print "]]></block>" # end the last block
            end
            out.print "<block text=\"copied\"><![CDATA[" # start copied block
            inside_block = true
          end
          out.print source_text[last_written, source_document.GetTrigramEnd - last_written]
          last_written = source_document.GetTrigramEnd
        else
          # non-matching trigram: only emit text if there is unwritten text
          # before the position given by GetTrigramStart(1)
          if last_written < source_document.GetTrigramStart(1)
            if inside_block or last_written.zero? # moving from inside block to not
              if last_written > 0
                out.print "]]></block>" # end the last block
              end
              out.print "<block text=\"normal\"><![CDATA[" # start normal block
              inside_block = false
            end
            out.print source_text[last_written, source_document.GetTrigramStart(1) - last_written]
            last_written = source_document.GetTrigramStart(1)
          end
        end
      end
      # emit whatever text remains after the last trigram; if nothing was
      # written above (last_written is 0) it goes out with no enclosing block
      if last_written < source_text.length
        if inside_block
          out.print "]]></block>" # end the last block
          inside_block = false
          out.print "<block text=\"normal\"><![CDATA[" # start normal block
        end
        out.print source_text[last_written..-1] # finish printing whole of source
      end
      unless last_written.zero? # i.e. a block was opened above and is still open
        out.print "]]></block>" # end the last block
      end
      # -- output footer
      out.puts "</text>"
      out.puts "</document>"
      # -- close up document
      source_document.CloseInput
    end

    private 
    # Guard used by the public methods: does nothing when index lies within
    # the document list, i.e. 0 <= index < number of documents.
    #
    # Raises an IndexError naming the index and the valid range otherwise.
    def check_index(index)
      return if index >= 0 && index < @ferret.Size

      raise IndexError, "Index #{index} not in range [0, #{@ferret.Size})"
    end

    # Guard used by methods which need the results of #run: does nothing
    # when ferret has been run since the document list last changed.
    #
    # method:: name of the calling method, used in the error message.
    #
    # Raises an ArgumentError otherwise.
    def check_ferret_has_run(method)
      return if @ferret_run

      raise ArgumentError, "UHFerret must be 'run' before #{method} can be calculated."
    end
  end

  # Convenience accessors added to the native document class, giving
  # ruby-style names to its GetPathname and GetGroupId methods.
  class Uhferret_lib::Document

    # Return the full pathname for this document.
    def pathname
      GetPathname()
    end

    # Return the filename for this document, i.e. the last component of
    # its pathname.
    def filename
      File.basename(pathname)
    end

    # Return the id of the group this document belongs to.
    def group_id
      GetGroupId()
    end
  end

end