12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394 |
- #--
- # This file is part of uhferret.
- #
- # Author:: Peter Lane
- # Copyright:: Copyright 2012-20, Peter Lane.
- # License:: GPLv3
- #
- # uhferret is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # uhferret is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with uhferret. If not, see <http://www.gnu.org/licenses/>.
- # TODO: Make the conversions etc work on Windows as well as Linux.
- #
- # A collection of methods to support checking and converting different
- # document file types.
- #
module Utils
  # Check if the given command is present on the system.
  #
  # The directories listed in the PATH environment variable are searched
  # directly, so no external +which+ program (and no shell) is required.
  # Any suffixes listed in PATHEXT (set on Windows) are also tried.
  # A command containing a directory separator is checked as a path, without
  # searching PATH.
  #
  # command:: name of the program to look for
  #
  # Returns true if a regular, executable file for the command is found.
  def self.command_present?(command)
    name = command.to_s
    return false if name.empty?

    # Try the bare name first, then every suffix from PATHEXT (if any).
    suffixes = [""] + ENV.fetch("PATHEXT", "").split(";")
    executable = lambda do |stem|
      suffixes.any? do |suffix|
        candidate = "#{stem}#{suffix}"
        # File.file? rules out directories, which also count as "executable".
        File.file?(candidate) && File.executable?(candidate)
      end
    end

    return executable.call(name) if name.include?(File::SEPARATOR)

    ENV.fetch("PATH", "").split(File::PATH_SEPARATOR).any? do |dir|
      # An empty PATH entry conventionally means the current directory.
      executable.call(File.join(dir.empty? ? Dir.pwd : dir, name))
    end
  end

  # List of permitted compressed file extensions (without leading dot),
  # built when this module is loaded from the extraction commands that
  # are available at that moment.
  CompressedFileExtensions = []
  [
    ["unrar", ["rar"]],
    ["tar", ["tar.bz2", "tar.gz", "tbz2", "tgz"]],
    ["unzip", ["zip"]]
  ].each do |command, extensions|
    CompressedFileExtensions.concat(extensions) if Utils.command_present?(command)
  end

  # Return true if the filename has a file ending for code
  # (.c, .h, .cpp or .java). The comparison is case-sensitive.
  def self.is_code?(filename)
    [".c", ".h", ".cpp", ".java"].include?(File.extname(filename))
  end

  # Return true if the filename has a valid extension: a code file,
  # a .txt file, a pdf document or a word-processor document.
  def self.valid_document?(filename)
    Utils.is_code?(filename) ||
      File.extname(filename) == ".txt" ||
      Utils.is_pdf_document?(filename) ||
      Utils.is_wp_document?(filename)
  end

  # Return true if the filename ends with .pdf and so is a pdf document.
  def self.is_pdf_document?(filename)
    File.extname(filename) == ".pdf"
  end

  # Return true if the filename ends with a known word processor extension
  # (.doc, .rtf, .docx or .abw).
  def self.is_wp_document?(filename)
    [".doc", ".rtf", ".docx", ".abw"].include?(File.extname(filename))
  end

  # Use pdftotext to convert the pdf file to text.
  #
  # The output is the converted filename, obtained by adding .txt to
  # the given filename. If pdftotext is not available, the given filename
  # is returned unchanged.
  #
  # The returned name does not indicate whether the conversion succeeded.
  def self.convert_pdf_document(filename)
    return filename unless Utils.command_present?("pdftotext")

    output_filename = "#{filename}.txt"
    # Arguments are passed as a list, so no shell is involved and filenames
    # containing spaces or shell metacharacters reach the tool intact.
    # The tool's standard output is discarded.
    system("pdftotext", "-layout", "-enc", "Latin1", "-nopgbrk",
           filename, output_filename, out: File::NULL)
    output_filename
  end

  # Use abiword to convert the word-processed file to text.
  #
  # The output is the converted filename, obtained by adding .txt to
  # the given filename. If abiword is not available, the given filename
  # is returned unchanged.
  #
  # The returned name does not indicate whether the conversion succeeded.
  def self.convert_wp_document(filename)
    return filename unless Utils.command_present?("abiword")

    output_filename = "#{filename}.txt"
    # Arguments are passed as a list, so no shell is involved and filenames
    # containing spaces or shell metacharacters reach the tool intact.
    # The tool's standard output is discarded.
    system("abiword", "--to=txt", filename, "-o", output_filename,
           out: File::NULL)
    output_filename
  end
end
|