webferret.rb 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. #--
  2. # This file is part of uhferret.
  3. #
  4. # Author:: Peter Lane
  5. # Copyright:: Copyright 2012, Peter Lane.
  6. # License:: GPLv3
  7. #
  8. # uhferret is free software: you can redistribute it and/or modify
  9. # it under the terms of the GNU General Public License as published by
  10. # the Free Software Foundation, either version 3 of the License, or
  11. # (at your option) any later version.
  12. #
  13. # uhferret is distributed in the hope that it will be useful,
  14. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. # GNU General Public License for more details.
  17. #
  18. # You should have received a copy of the GNU General Public License
  19. # along with uhferret. If not, see <http://www.gnu.org/licenses/>.
  require 'find'
  require 'shellwords'
  require 'uhferret'
  require 'utils'
  require 'webrick'
  24. include WEBrick
  # Process-wide counters shared by the servlets below:
  #   @@next_upload -- number of uploads so far, used to name upload folders
  #   @@next_report -- number of reports created so far, used to name reports
  #
  # They are defined on Object so that every class can reach them as plain
  # class variables.  class_variable_set is used instead of a bare
  # `@@name = 0` because Ruby 3.0 and later raise a RuntimeError when a
  # class variable is accessed directly from the top level.
  Object.class_variable_set(:@@next_upload, 0)
  Object.class_variable_set(:@@next_report, 0)
  27. module UHFerret
  # Servlet behind the Ferret home page.
  #
  # A GET request returns the welcome page, which holds the upload form and
  # the usage instructions.  A POST request stores the uploaded archive in a
  # fresh folder beneath <tt>$base</tt>, unpacks it, runs Ferret over the
  # contents and redirects the browser to the generated
  # <tt>results.html</tt>.
  class FerretHomeServlet < HTTPServlet::AbstractServlet
    # Archive suffixes paired with the command (plus leading arguments) that
    # unpacks them.  Entries are tried in order and the first matching suffix
    # wins.
    DECOMPRESSORS = [
      [%w[rar],          %w[unrar x]],
      [%w[tbz2 tar.bz2], %w[tar jxf]],
      [%w[tgz tar.gz],   %w[tar zxf]],
      [%w[zip],          %w[unzip]]
    ].freeze

    # Page returned when the request carries no usable compressed file.
    INVALID_UPLOAD_PAGE = <<~PAGE.chomp.freeze
      <html><body><h1>Error</h1>
      <p>You did not submit a valid zip file.</p>
      <p><a href="/ferret/home">Return to Ferret home page</a>.</p>
      </body></html>
    PAGE

    # Returns the 'welcome page' html.
    #
    # The list of accepted document types is built per request: the
    # word-processor and pdf entries only appear when +abiword+ and
    # +pdftotext+ respectively are reported as present by Utils.
    def do_GET(req, res)
      archive_types = Utils::CompressedFileExtensions.map { |ext| "<tt>.#{ext}</tt>" }.join(", ")
      word_processor_item =
        if Utils.command_present?("abiword")
          "<li>word-processed files (such as <tt>doc</tt> or <tt>rtf</tt> files)</li>"
        else
          ""
        end
      pdf_item =
        if Utils.command_present?("pdftotext")
          "<li><tt>pdf</tt> documents</li>"
        else
          ""
        end

      res['Content-Type'] = "text/html"
      res.body = <<~BODY
        <html><body><h1>Ferret Server</h1>
        <p>
        Ferret is a tool for detecting copying in groups of documents,
        and was created by the (now defunct) Plagiarism Detection Group,
        University of Hertfordshire.
        </p>
        <form method="POST" enctype="multipart/form-data">
        <p>Compressed file: <input type="file" name="data" size="40">
        <p><input type="submit"/>
        </p>
        </form>
        <hr><h2>Instructions for use</h2>
        <p>
        <ol>
        <li>Construct a compressed folder of your files in a way suitable for
        your own computer. The Ferret Server will handle a compressed
        folder in one of the following forms:
        #{archive_types}.
        The files within it may be as:
        <ul>
        <li>plain text files</li>
        #{word_processor_item}
        #{pdf_item}
        </ul>
        Files may contain natural language text or computer programs (C-type
        languages).
        </li>
        <li>Use the 'Browse' button to select your compressed file.
        </li>
        <li>Once Ferret has finished analysing the documents, the display will show
        a table of the top 100 results.
        </li>
        <li>Click on the 'view' link beside each pair to see a report of
        the comparisons found in that pair of documents. Use the print option of
        your browser to preserve a copy (e.g. using 'print to pdf').
        </li>
        </ol>
        </p>
        <hr><font size=-1>Ferret home page generated on: #{Time.now}<br />.
        </font>
        </body>
        </html>
      BODY
    end

    # Convenience method to check if a string ends with given ending.
    # Kept for existing callers; it simply delegates to String#end_with?.
    def endsWith?(str, str_end)
      str.end_with?(str_end)
    end

    # Checks if given _filename_ is an example of a compressed file, i.e. it
    # ends with one of Utils::CompressedFileExtensions.
    def isCompressedFile?(filename)
      Utils::CompressedFileExtensions.any? { |ext| endsWith?(filename, ext) }
    end

    # If _filename_ names a known compressed file format, it is decompressed
    # (into the current working directory, as the tools do by default).  The
    # file is deleted afterwards whether or not anything was unpacked.
    #
    # The unpacking tool is started without a shell, with _filename_ passed
    # as a single argument, so characters in the name cannot be interpreted
    # as shell syntax.  Standard output of the tool is discarded.
    def decompress_file(filename)
      _suffixes, command = DECOMPRESSORS.find do |suffixes, _cmd|
        suffixes.any? { |suffix| endsWith?(filename, suffix) }
      end
      if command && Utils.command_present?(command.first)
        system(*command, filename, out: File::NULL)
      end
      File.delete filename # remove the compressed folder
    end

    # Finds all files in given folder and adds the names of the valid
    # documents (as judged by Utils.valid_document?) to the definitions file
    # <tt>ferret-file-definitions.def</tt>, which is written inside _folder_.
    #
    # Returns true if the files are text documents, or false if any file is
    # reported as code by Utils.is_code?.
    #
    # NOTE(review): this changes the working directory of the whole process
    # and does not restore it.
    def create_file_definitions(folder)
      text_files = true
      Dir.chdir folder
      files = []
      Find.find(folder) do |path|
        next unless File.file?(path) # ignore directories
        files << path
        text_files = false if Utils.is_code?(path)
      end
      # write the names of valid files into a definitions file
      File.open("ferret-file-definitions.def", "w") do |defn_file|
        files.each { |path| defn_file.puts path if Utils.valid_document?(path) }
      end
      text_files
    end

    # Triggered when the user submits the upload form.
    #
    # Responds with the error page when no file was sent or when its name is
    # not that of a supported compressed file; nothing is written to disk in
    # that case.  Otherwise the upload is saved in a new
    # <tt>$base/Upload<n></tt> folder, decompressed, analysed with Ferret and
    # the browser is redirected to the resulting <tt>results.html</tt>.
    def do_POST(req, res)
      res['Content-Type'] = "text/html"
      upload_data = req.query["data"]
      # a request without the "data" field yields nil here
      filename = upload_data.respond_to?(:filename) ? sanitize_upload_name(upload_data.filename) : ""
      unless isCompressedFile?(filename)
        res.body = INVALID_UPLOAD_PAGE
        return
      end

      upload_dir = "Upload#{@@next_upload}" # create a unique folder for user's files
      @@next_upload += 1
      Dir.mkdir $base unless File.exist? $base
      Dir.mkdir "#{$base}/#{upload_dir}"
      uploaded_file = "#{$base}/#{upload_dir}/#{filename}"
      File.open(uploaded_file, "wb") do |file| # do the actual upload of the data
        upload_data.each_data { |data| file << data.to_s }
      end

      # decompress and compute similarities
      # NOTE(review): the working directory is changed for the whole process
      # and left pointing at the upload folder, as before.
      Dir.chdir "#{$base}/#{upload_dir}"
      decompress_file filename
      is_text = create_file_definitions Dir.pwd
      # do the computation of similarities
      # -- output to html table, using file definition list
      `#{FERRET} #{is_text ? "-t" : "-c"} -w -f ferret-file-definitions.def > results.html`
      res.body = "<meta HTTP-EQUIV=\"REFRESH\" content=\"0; url=#{$base}/#{upload_dir}/results.html\">"
    end

    private

    # Reduces the client-supplied file name to a safe local name: only the
    # last path component is kept (both '/' and '\' are treated as
    # separators), every character other than ASCII letters, digits, '_',
    # '.' and '-' becomes '-' (this includes spaces), and leading dashes are
    # dropped so the name cannot be mistaken for a command-line option.
    def sanitize_upload_name(raw_name)
      base_name = File.basename(raw_name.to_s.b.tr("\\", "/"))
      base_name.gsub(/[^\w.-]/, "-").sub(/\A-+/, "")
    end
  end
  # This servlet creates the xml report comparing two documents.  The request
  # names the upload folder and the two files through the query parameters
  # +upload+, +file1+ and +file2+.
  class FerretReportServlet < HTTPServlet::AbstractServlet
    # Handles the request to create a report in xml format.
    #
    # Runs Ferret on the two named files from inside the upload folder,
    # writes the accompanying XSL stylesheet next to the report and
    # redirects the browser to the report.
    #
    # Raises HTTPStatus::BadRequest when a query parameter is missing or
    # empty, and HTTPStatus::NotFound when the upload folder does not exist.
    #
    # NOTE(review): +upload+ is taken from the query string and decides where
    # the report and stylesheet are written; it is not checked against
    # <tt>$base</tt> here.
    def do_GET(req, res)
      upload_dir, file1, file2 = %w[upload file1 file2].map { |key| req.query[key] }
      if [upload_dir, file1, file2].any? { |value| value.nil? || value.empty? }
        raise HTTPStatus::BadRequest, "upload, file1 and file2 are required"
      end
      raise HTTPStatus::NotFound, "unknown upload folder" unless File.directory?(upload_dir)

      file1 = file1.gsub("%20", " ")
      file2 = file2.gsub("%20", " ")
      report_name = "#{upload_dir}/report#{@@next_report}.xml"
      @@next_report += 1
      Dir.chdir upload_dir
      mode = Utils.is_code?(file1) ? "-c" : "-t"
      # The names come from the request, so each one is shell-escaped before
      # being placed on the command line.
      quoted = [report_name, file1, file2].map { |arg| Shellwords.escape(arg) }
      `#{FERRET} #{mode} -x #{quoted.join(" ")}`
      write_style_sheet File.dirname(report_name)
      res['Content-Type'] = "text/html"
      # escape the name so it cannot break out of the attribute value
      res.body = "<meta HTTP-EQUIV=\"REFRESH\" content=\"0; url=#{HTMLUtils.escape(report_name)}\">"
    end

    private

    # Writes <tt>uhferret.xsl</tt> into _dir_.  The stylesheet renders a
    # report as an html page, showing blocks marked 'copied' with the
    # +highlight+ class and blocks marked 'normal' with the +normal+ class.
    def write_style_sheet(dir)
      File.open("#{dir}/uhferret.xsl", "w") do |f|
        f.puts <<~STYLESHEET
          <?xml version="1.0" encoding="ISO-8859-1"?>
          <html xsl:version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns="http://www.w3.org/1999/xhtml">
          <head>
          <style> <!-- style sheet for document -->
          h1 {background-color: #d0d0d0} <!-- add a background to make headings stand out -->
          h2 {background-color: #d0d0d0}
          .highlight {font-weight:bold; color:blue} <!-- highlighted text style -->
          .normal {font-weight:normal} <!-- normal text style -->
          </style>
          </head>
          <body>
          <h1>UH-Ferret: Document comparison</h1>
          <!-- display top-level information -->
          <p>Common trigrams: <xsl:value-of select="uhferret/common-trigrams"/></p>
          <p>Similarity: <xsl:value-of select="uhferret/similarity"/></p>
          <!-- work through each document -->
          <xsl:for-each select="uhferret/document">
          <!-- display document-level information -->
          <h2>Document: <xsl:value-of select="source"/></h2>
          <p>Number of trigrams: <xsl:value-of select="num-trigrams"/></p>
          <p>Containment in other document: <xsl:value-of select="containment"/></p>
          <!-- work through each block in text, displaying as highlighted or normal -->
          <pre>
          <xsl:for-each select="text/block">
          <xsl:if test="@text='copied'">
          <span class="highlight"><xsl:value-of select="."/></span>
          </xsl:if>
          <xsl:if test="@text='normal'">
          <span class="normal"><xsl:value-of select="."/></span>
          </xsl:if>
          </xsl:for-each>
          </pre>
          </xsl:for-each>
          </body>
          </html>
        STYLESHEET
      end
    end
  end
  223. end