utils.rb 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. #--
  2. # This file is part of uhferret.
  3. #
  4. # Author:: Peter Lane
  5. # Copyright:: Copyright 2012-20, Peter Lane.
  6. # License:: GPLv3
  7. #
  8. # uhferret is free software: you can redistribute it and/or modify
  9. # it under the terms of the GNU General Public License as published by
  10. # the Free Software Foundation, either version 3 of the License, or
  11. # (at your option) any later version.
  12. #
  13. # uhferret is distributed in the hope that it will be useful,
  14. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. # GNU General Public License for more details.
  17. #
  18. # You should have received a copy of the GNU General Public License
  19. # along with uhferret. If not, see <http://www.gnu.org/licenses/>.
  20. # TODO: Make the conversions etc work on Windows as well as Linux.
  21. #
  22. # A collection of methods to support checking and converting different
  23. # document file types.
  24. #
  module Utils
    # Check if the given command is present on the system.
    #
    # The directories listed in the PATH environment variable are searched
    # directly instead of shelling out to `which`, so the check does not
    # depend on `which` being installed and the command name never reaches a
    # shell. When PATHEXT is set (as on Windows) each of its suffixes is
    # tried in addition to the bare name.
    #
    # command:: name of the executable, or a path to it
    #
    # Returns true if a matching executable regular file exists, else false.
    def Utils.command_present?(command)
      name = command.to_s
      return false if name.empty?

      suffixes = [""]
      suffixes.concat(ENV["PATHEXT"].split(";")) if ENV["PATHEXT"]

      # A name that already carries a directory part is tested as given;
      # a bare name is looked up in every non-empty PATH entry.
      candidates =
        if File.basename(name) == name
          ENV.fetch("PATH", "").split(File::PATH_SEPARATOR).reject(&:empty?).map do |dir|
            File.join(dir, name)
          end
        else
          [name]
        end

      candidates.any? do |candidate|
        suffixes.any? do |suffix|
          path = candidate + suffix
          # File.file? excludes directories, which also report as executable.
          File.file?(path) && File.executable?(path)
        end
      end
    end

    # List of permitted compressed file extensions, depending on which
    # unpacking commands are available when this file is loaded.
    # Left unfrozen: code outside this file may rely on mutating it.
    CompressedFileExtensions = [
      ["unrar", ["rar"]],
      ["tar", ["tar.bz2", "tar.gz", "tbz2", "tgz"]],
      ["unzip", ["zip"]]
    ].select { |tool, _| Utils.command_present?(tool) }.flat_map { |_, extensions| extensions }

    # Return true if the filename has a file ending for code.
    # The extension is compared case-insensitively.
    def Utils.is_code?(filename)
      [".c", ".h", ".cpp", ".java"].include?(File.extname(filename).downcase)
    end

    # Return true if the filename has a valid extension: code, plain text,
    # pdf or a known word processor format.
    def Utils.valid_document?(filename)
      Utils.is_code?(filename) ||
        File.extname(filename).downcase == ".txt" ||
        Utils.is_pdf_document?(filename) ||
        Utils.is_wp_document?(filename)
    end

    # Return true if the filename ends with .pdf (in any letter case) and so
    # is a pdf document.
    def Utils.is_pdf_document?(filename)
      File.extname(filename).downcase == ".pdf"
    end

    # Return true if the filename ends with a known word processor extension
    # (in any letter case).
    def Utils.is_wp_document?(filename)
      [".doc", ".rtf", ".docx", ".abw"].include?(File.extname(filename).downcase)
    end

    # Use pdftotext to convert the pdf file to text.
    #
    # The command is started with an argument vector rather than a shell
    # string, so filenames containing spaces or shell metacharacters are
    # passed through unchanged. Standard output of the tool is discarded.
    #
    # Returns the converted filename, obtained by adding .txt to the given
    # filename, or the given filename unchanged if pdftotext is not
    # available. The exit status of pdftotext is not checked.
    def Utils.convert_pdf_document(filename)
      return filename unless Utils.command_present?("pdftotext")

      output_filename = "#{filename}.txt"
      system("pdftotext", "-layout", "-enc", "Latin1", "-nopgbrk",
             filename, output_filename, out: File::NULL)
      output_filename
    end

    # Use abiword to convert the word-processed file to text.
    #
    # The command is started with an argument vector rather than a shell
    # string, so filenames containing spaces or shell metacharacters are
    # passed through unchanged. Standard output of the tool is discarded.
    #
    # Returns the converted filename, obtained by adding .txt to the given
    # filename, or the given filename unchanged if abiword is not
    # available. The exit status of abiword is not checked.
    def Utils.convert_wp_document(filename)
      return filename unless Utils.command_present?("abiword")

      output_filename = "#{filename}.txt"
      system("abiword", "--to=txt", filename, "-o", output_filename, out: File::NULL)
      output_filename
    end
  end