123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247 |
- #--
- # This file is part of uhferret.
- #
- # Author:: Peter Lane
- # Copyright:: Copyright 2012, Peter Lane.
- # License:: GPLv3
- #
- # uhferret is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # uhferret is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with uhferret. If not, see <http://www.gnu.org/licenses/>.
require 'find'
require 'shellwords'
require 'uhferret'
require 'utils'
require 'webrick'
- include WEBrick
# Counters used to give every upload folder and every report a unique name.
# They live on Object so that each servlet class reaches them through
# inheritance. They are assigned inside an explicit class body because
# assigning a class variable directly at the top level raises RuntimeError
# on Ruby 3.0 and later.
class Object
  @@next_upload = 0 # keeps track of number of uploads, for naming folders
  @@next_report = 0 # keeps track of number of reports created
end
- module UHFerret
- # Displays a welcome page, providing a field to upload the zipped file.
- # On pressing 'submit', runs Ferret and passes results to FerretResultsServlet.
- class FerretHomeServlet < HTTPServlet::AbstractServlet
# Answers a GET request with the 'welcome page' html: an upload form
# followed by the instructions for use.
#
# The list of accepted archive extensions comes from
# Utils::CompressedFileExtensions. The entries for word-processed and pdf
# documents are only shown when Utils reports the matching converter
# command ("abiword", "pdftotext") as present.
#
# req:: the incoming request (not used)
# res:: the response; its Content-Type and body are set here
def do_GET(req, res)
  res['Content-Type'] = "text/html"

  extension_list = Utils::CompressedFileExtensions.map { |ext| "<tt>.#{ext}</tt>" }.join(", ")
  word_item =
    if Utils.command_present?("abiword")
      "<li>word-processed files (such as <tt>doc</tt> or <tt>rtf</tt> files)</li>"
    else
      ""
    end
  pdf_item = Utils.command_present?("pdftotext") ? "<li><tt>pdf</tt> documents</li>" : ""

  res.body = <<BODY
<html><body><h1>Ferret Server</h1>
<p>
Ferret is a tool for detecting copying in groups of documents,
and was created by the (now defunct) Plagiarism Detection Group,
University of Hertfordshire.
</p>
<form method="POST" enctype="multipart/form-data">
<p>Compressed file: <input type="file" name="data" size="40">
<p><input type="submit"/>
</p>
</form>
<hr><h2>Instructions for use</h2>
<p>
<ol>
<li>Construct a compressed folder of your files in a way suitable for
your own computer. The Ferret Server will handle a compressed
folder in one of the following forms:
#{extension_list}.
The files within it may be as:
<ul>
<li>plain text files</li>
#{word_item}
#{pdf_item}
</ul>
Files may contain natural language text or computer programs (C-type
languages).
</li>
<li>Use the 'Browse' button to select your compressed file.
</li>
<li>Once Ferret has finished analysing the documents, the display will show
a table of the top 100 results.
</li>
<li>Click on the 'view' link beside each pair to see a report of
the comparisons found in that pair of documents. Use the print option of
your browser to preserve a copy (e.g. using 'print to pdf').
</li>
</ol>
</p>
<hr><font size=-1>Ferret home page generated on: #{Time.now}<br />.
</font>
</body>
</html>
BODY
end
# Convenience method to check if a string ends with given ending.
#
# str::     the string to inspect
# str_end:: the candidate ending
#
# Returns true when _str_ finishes with _str_end_, false otherwise. An empty
# _str_end_ counts as an ending of every string.
def endsWith?(str, str_end)
  str.end_with?(str_end)
end
# Checks if given _filename_ is an example of a compressed file.
#
# A name qualifies when it ends in "." followed by one of the entries of
# Utils::CompressedFileExtensions. Requiring the dot stops names that merely
# happen to finish with the same letters (e.g. "notazip") from being accepted.
#
# filename:: file name or path to test
#
# Returns true or false.
def isCompressedFile?(filename)
  Utils::CompressedFileExtensions.any? { |ext| filename.end_with?(".#{ext}") }
end
# If _filename_ names a known compressed file format (rar, tbz2/tar.bz2,
# tgz/tar.gz or zip) and the matching extraction command is reported present
# by Utils, the archive is unpacked. The file is then deleted in every case.
#
# The extraction command is started without a shell, with the file name
# passed as a single argument, so characters such as ";" or "$(...)" in an
# uploaded file name cannot run extra commands.
#
# filename:: name of the uploaded file
#
# Returns the result of File.delete.
def decompress_file(filename)
  extractor =
    if filename.end_with?("rar")
      ["unrar", "x"]
    elsif filename.end_with?("tbz2", "tar.bz2")
      ["tar", "jxf"]
    elsif filename.end_with?("tgz", "tar.gz")
      ["tar", "zxf"]
    elsif filename.end_with?("zip")
      ["unzip"]
    end

  if extractor && Utils.command_present?(extractor.first)
    # a leading "-" would otherwise be read as an option by the extractor
    target = filename.start_with?("-") ? "./#{filename}" : filename
    # the extractor's standard output is discarded
    system(*extractor, target, :out => File::NULL)
  end

  File.delete filename # remove the compressed folder
end
# Collects every regular file below _folder_ and writes the names of those
# accepted by Utils.valid_document? into "ferret-file-definitions.def", one
# name per line.
#
# The working directory is changed to _folder_ first, so the definitions
# file is written there.
#
# folder:: directory to scan
#
# Returns true if no file is classed as code by Utils.is_code?, false
# otherwise.
def create_file_definitions(folder)
  Dir.chdir(folder)

  # directories are skipped; only plain files are of interest
  documents = Find.find(folder).select { |path| File.file?(path) }
  code_count = documents.count { |path| Utils.is_code?(path) }

  File.open("ferret-file-definitions.def", "w") do |listing|
    documents.each do |path|
      listing.puts(path) if Utils.valid_document?(path)
    end
  end

  code_count.zero?
end
# This method is triggered when the user clicks on 'submit query'.
#
# The uploaded file is stored in a fresh "Upload<n>" folder below $base.
# If its name marks it as a compressed file it is unpacked, a definitions
# file is built, FERRET is run to produce results.html, and the browser is
# redirected to that page. In every other case, including a request that
# carries no file, an error page is returned.
#
# req:: the request; the upload is read from the "data" form field
# res:: the response; its Content-Type and body are set here
#
# NOTE(review): Dir.chdir changes the working directory of the whole
# process; this looks safe only if $base is an absolute path -- confirm.
def do_POST(req, res)
  res['Content-Type'] = "text/html"

  upload_data = req.query["data"]
  # Keep only the last path component, so a crafted name such as
  # "../../x.zip" cannot place the file outside its upload folder.
  filename = upload_data ? File.basename(upload_data.filename.to_s).gsub(' ', "-") : "" # replace spaces

  unless filename.empty?
    Dir.mkdir $base unless File.exist? $base

    # create a unique folder for user's files; folders left over from an
    # earlier run of the server are skipped rather than raising EEXIST
    upload_dir = nil
    loop do
      upload_dir = "Upload#{@@next_upload}"
      @@next_upload += 1
      break unless File.exist?("#{$base}/#{upload_dir}")
    end
    Dir.mkdir "#{$base}/#{upload_dir}"

    uploaded_file = "#{$base}/#{upload_dir}/#{filename}"
    File.open(uploaded_file, "wb") do |file| # do the actual upload of the data
      upload_data.each_data do |data|
        file << data.to_s
      end
    end

    # if uploaded file is a compressed file, then decompress and compute similarities
    if isCompressedFile?(uploaded_file)
      Dir.chdir "#{$base}/#{upload_dir}"
      decompress_file File.basename(uploaded_file)
      is_text = create_file_definitions Dir.pwd
      # do the computation of similarities
      # -- output to html table with given folder name, using file definition list
      `#{FERRET} #{is_text ? "-t" : "-c"} -w -f ferret-file-definitions.def > results.html`
      res.body = "<meta HTTP-EQUIV=\"REFRESH\" content=\"0; url=#{$base}/#{upload_dir}/results.html\">"
      return
    end
  end

  res.body = %{<html><body><h1>Error</h1>
<p>You did not submit a valid zip file.</p>
<p><a href="/ferret/home">Return to Ferret home page</a>.</p>
</body></html>}
end
- end
- # This servlet is triggered by a click on 'Download' link in report table
- # It creates the xml report comparing two documents
- class FerretReportServlet < HTTPServlet::AbstractServlet
# Handles the request to create a report in xml format.
#
# Reads the query parameters 'upload' (the upload folder), 'file1' and
# 'file2' (the two documents to compare), runs FERRET to write
# "report<n>.xml" into the upload folder, writes the style sheet next to it
# and redirects the browser to the report.
#
# The report name and both file names are shell-escaped before being put on
# the FERRET command line, because they come straight from the request.
#
# req:: the request carrying the three query parameters
# res:: the response; its Content-Type and body are set here
#
# Raises HTTPStatus::BadRequest when one of the parameters is missing.
#
# NOTE(review): 'upload' is used as a directory as given by the client;
# consider checking that it lies below the upload base folder.
def do_GET(req, res)
  upload_dir = req.query['upload']
  file1 = req.query['file1']
  file2 = req.query['file2']
  unless upload_dir && file1 && file2
    raise HTTPStatus::BadRequest, "parameters 'upload', 'file1' and 'file2' are required"
  end

  file1 = file1.gsub("%20", " ")
  file2 = file2.gsub("%20", " ")
  report_name = "#{upload_dir}/report#{@@next_report}.xml"
  @@next_report += 1

  Dir.chdir upload_dir
  mode = Utils.is_code?(file1) ? "-c" : "-t"
  arguments = [report_name, file1, file2].map { |arg| Shellwords.escape(arg) }
  `#{FERRET} #{mode} -x #{arguments.join(' ')}`
  write_style_sheet File.dirname(report_name)

  res['Content-Type'] = "text/html"
  res.body = "<meta HTTP-EQUIV=\"REFRESH\" content=\"0; url=#{report_name}\">"
end
- private
# Writes the XSL style sheet "uhferret.xsl" into _dir_, replacing any
# existing copy. The sheet renders the report as an html page: summary
# figures first, then each document with its text blocks, where blocks
# marked 'copied' use the "highlight" class and blocks marked 'normal' use
# the "normal" class.
#
# dir:: folder in which to create the style sheet
def write_style_sheet dir
  File.open("#{dir}/uhferret.xsl", "w") do |f|
    f.puts <<STYLESHEET
<?xml version="1.0" encoding="ISO-8859-1"?>
<html xsl:version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns="http://www.w3.org/1999/xhtml">
<head>
<style> <!-- style sheet for document -->
h1 {background-color: #d0d0d0} <!-- add a background to make headings stand out -->
h2 {background-color: #d0d0d0}
.highlight {font-weight:bold; color:blue} <!-- highlighted text style -->
.normal {font-weight:normal} <!-- normal text style -->
</style>
</head>
<body>
<h1>UH-Ferret: Document comparison</h1>
<!-- display top-level information -->
<p>Common trigrams: <xsl:value-of select="uhferret/common-trigrams"/></p>
<p>Similarity: <xsl:value-of select="uhferret/similarity"/></p>
<!-- work through each document -->
<xsl:for-each select="uhferret/document">
<!-- display document-level information -->
<h2>Document: <xsl:value-of select="source"/></h2>
<p>Number of trigrams: <xsl:value-of select="num-trigrams"/></p>
<p>Containment in other document: <xsl:value-of select="containment"/></p>
<!-- work through each block in text, displaying as highlighted or normal -->
<pre>
<xsl:for-each select="text/block">
<xsl:if test="@text='copied'">
<span class="highlight"><xsl:value-of select="."/></span>
</xsl:if>
<xsl:if test="@text='normal'">
<span class="normal"><xsl:value-of select="."/></span>
</xsl:if>
</xsl:for-each>
</pre>
</xsl:for-each>
</body>
</html>
STYLESHEET
  end
end
- end
- end
|