plaintext.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. #!/usr/bin/env python
  2. import magic
  3. import mimetypes
  4. import os.path
  5. import resource
  6. import shutil
  7. import subprocess
  8. import tempfile
  9. """
  10. Convert an attachment to plain text using external commands, as given below.
  11. Basically, the type of an attachment is determined first by its filename extension,
  12. then by the stored mimetype and, if neither is present, by using python-magic.
  13. """
  # Resource limits applied to each external converter process (see setlimits).
  MAX_TIME = 10  # CPU-time limit: 10 seconds # XXX get from cfg
  MAX_MEMORY = 256*10**6  # address-space limit: 256 MB # XXX get from cfg

  # Shell command templates.  The %(file)s, %(dir)s and %(xmltotext)s
  # placeholders are filled in by convert() before the command is run.
  CONVERT_ODF = 'unzip -p %(file)s content.xml | %(xmltotext)s -'
  # The backslash is doubled so Python does not see the invalid escape "\*";
  # the shell still receives a literal \*.xml pattern for find.
  # NOTE(review): $(find ...) is word-split by the shell, so archive members
  # whose names contain whitespace are not handed to xmltotext intact.
  CONVERT_OOXML = 'cd %(dir)s; unzip -o -qq %(file)s; for i in $(find . -name \\*.xml); do %(xmltotext)s $i; done'

  # Each entry pairs a ';'-separated list of keys (filename extensions without
  # the dot, and mimetypes) with the command template used for those types.
  DB = [  # XXX read from file, test encodings
      ('txt;text/plain', 'iconv -c -t utf-8 %(file)s'),
      ('html;htm;text/html', 'w3m -dump -s -O utf-8 %(file)s'),
      ('xml;application/xml;text/xml', '%(xmltotext)s %(file)s'),
      ('pdf;application/pdf', 'pdftotext -q -nopgbrk %(file)s /dev/stdout'),
      ('odt;application/vnd.oasis.opendocument.text', CONVERT_ODF),
      ('ods;application/vnd.oasis.opendocument.spreadsheet', CONVERT_ODF),
      ('odp;application/vnd.oasis.opendocument.presentation', CONVERT_ODF),
      ('doc;application/msword', 'catdoc -s cp1252 -d utf-8 -f ascii -w %(file)s'),  # XXX specifying codepage here doesn't look right..
      ('ppt;application/mspowerpoint;application/powerpoint;application/x-mspowerpoint;application/vnd.ms-powerpoint', 'catppt -s cp1252 -d utf-8 %(file)s'),
      ('xls;application/excel;application/x-excel;application/x-msexcel;application/vnd.ms-excel', "xls2csv -s cp1252 -d utf-8 -c ' ' %(file)s"),
      ('docx;application/vnd.openxmlformats-officedocument.wordprocessingml.document', CONVERT_OOXML),
      ('xlsx;application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', CONVERT_OOXML),
      ('pptx;application/vnd.openxmlformats-officedocument.presentationml.presentation', CONVERT_OOXML),
  ]

  # Flatten DB into a lookup table: one entry per extension or mimetype key.
  CMD = {}
  for (X, C) in DB:
      for Y in X.split(';'):
          CMD[Y] = C
  def setlimits():
      """Cap address space and CPU time of the calling process.

      Sets both the soft and the hard limit of RLIMIT_AS to MAX_MEMORY and of
      RLIMIT_CPU to MAX_TIME.  convert() passes this function as preexec_fn
      to subprocess.Popen.
      """
      caps = (
          (resource.RLIMIT_AS, MAX_MEMORY),
          (resource.RLIMIT_CPU, MAX_TIME),
      )
      for which, ceiling in caps:
          resource.setrlimit(which, (ceiling, ceiling))
  def convert(cmd, data, log):
      """Run an external converter on *data* and return its output as text.

      The data is written to a file named ``attachment`` inside a fresh
      temporary directory, the ``%(dir)s``, ``%(file)s`` and ``%(xmltotext)s``
      placeholders in *cmd* are filled in, and the resulting command line is
      run through the shell with ``setlimits`` installed as ``preexec_fn``.
      The temporary directory is removed afterwards, whether or not the
      conversion succeeded.

      cmd  -- shell command template containing the placeholders above
      data -- raw attachment contents (byte string)
      log  -- logger-like object providing ``debug`` and ``warning``

      Returns the command's stdout decoded as UTF-8, with undecodable bytes
      dropped.  Output on stderr and a non-zero exit status are only logged.
      """
      tmpdir = tempfile.mkdtemp()
      try:
          tmpfile = os.path.join(tmpdir, 'attachment')
          # "with" closes the file even when the write fails
          with open(tmpfile, 'wb') as fh:
              fh.write(data)
          # the stylesheet is looked up next to this module
          xslt = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'xmltotext.xslt')
          cmd = cmd % {
              'dir': tmpdir,
              'file': tmpfile,
              'xmltotext': 'xsltproc ' + xslt,
          }
          log.debug("executing command: '%s'" % cmd)
          p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, preexec_fn=setlimits)
          out, err = p.communicate()
          if err:
              # stderr arrives as a byte string; decode it before joining it
              # with text so this also works where bytes and str do not mix
              log.warning('output on stderr:\n' + err[:1024].decode('utf-8', 'replace') + '..')
          if p.returncode != 0:
              log.warning('return code = %d' % p.returncode)
          plain = out.decode('utf-8', 'ignore')  # XXX warning instead of ignore
          log.debug('converted %d bytes to %d chars of plaintext' % (len(data), len(plain)))
          return plain
      finally:
          shutil.rmtree(tmpdir)
  def ext_mime_data(f, filename, mimetype, log):
      """Determine extension and mimetype of an attachment.

      Three methods are tried in order:

      1. if *filename* has an extension, the mimetype is guessed from the
         filename (the *mimetype* argument is ignored);
      2. otherwise, if *mimetype* is set and is not
         'application/octet-stream', the extension is guessed from it;
      3. otherwise the contents of *f* are read and passed to libmagic.

      f        -- file-like object; only read when method 3 is used
      filename -- attachment filename, may be empty
      mimetype -- stored mimetype, may be None
      log      -- logger-like object providing ``debug``

      Returns a tuple (ext, mimetype, data): ext is the lower-cased extension
      without the leading dot (or a false value when none was found), and
      data is the content read from *f*, or None if *f* was not read.
      """
      ext = os.path.splitext(filename)[1]
      data = None
      if ext:
          method = 'extension'
          mimetype = mimetypes.guess_type(filename)[0]
      elif mimetype and mimetype != 'application/octet-stream':
          method = 'mimetype'
          ext = mimetypes.guess_extension(mimetype)
      else:
          method = 'magic'
          data = f.read()
          mimetype = magic.from_buffer(data, mime=True)
          ext = mimetypes.guess_extension(mimetype)
      if ext:
          # drop the leading dot and normalise case: the command table is
          # keyed by lower-case extensions, so 'report.PDF' must yield 'pdf'
          ext = ext[1:].lower()
      log.debug('detected extension, mimetype: %s, %s (method=%s)' % (ext, mimetype, method))
      return ext, mimetype, data
  def get(f, mimetype=None, log=None):
      """Convert a file-like object to plaintext.

      The extension and mimetype are determined with ext_mime_data() and
      looked up in CMD, extension first.  The contents of *f* are only read
      when a converter is found or when type detection had to read them.

      f        -- file-like object; its ``name`` attribute, if any, is used
                  as the filename
      mimetype -- stored mimetype of the attachment, optional
      log      -- logger-like object providing ``debug`` and ``warning``;
                  defaults to this module's logger

      Returns the converted text, or u'' for unknown or unsupported types.
      """
      if log is None:
          # the default used to be dereferenced directly and crashed
          import logging
          log = logging.getLogger(__name__)
      # not every file-like object has a name (e.g. in-memory buffers)
      filename = getattr(f, 'name', None) or u''
      ext, mimetype, data = ext_mime_data(f, filename, mimetype, log)
      for key in (ext, mimetype):
          if key in CMD:
              if data is None:
                  # type detection did not need the contents; read them now
                  data = f.read()
              return convert(CMD[key], data, log)
      log.debug('unknown or unsupported filetype, skipping')
      return u''