attachments_parser.db 2.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041
  1. # Database file containing a map of mime-types and extensions together
  2. # with the corresponding command to parse the attachment into plain-text.
  3. #
  4. # The layout of each line is as follows:
  5. #
  6. # <mime-type>;<extension> `<command>`
  7. #
  8. # Each line can have as many mime-types and extensions as needed.
  9. #
  10. # The command must assume the attachment data is coming through /dev/stdin
  11. # and should return the parsed data through /dev/stdout. When utilities
  12. # require the attachment data to be stored on the hard disk, the command
  13. # `zmktemp` can be used. That script will write all attachment data in a
  14. # temporary file and print the location of the file to /dev/stdout again.
  15. #
  16. # To suppress log messages from the indexer regarding file types which should
  17. # not be indexed (like images) the command `echo -n` can be used.
  18. #
  19. # The command will be detected and executed by the attachments_parser script.
  20. # To test a command, run it as follows and check the result:
  21. #
  22. # <command> < <attachment>
  23. #
  24. text/plain;txt `cat`
  25. text/html;html;htm `FILE=$(zmktemp); w3m -dump -T text/html -s -O utf-8 "$FILE"`
  26. application/xml;text/xml;xml `xsltproc "${SCRIPTPATH}/xmltotext.xslt" -`
  27. application/pdf;pdf `pdftotext -q -nopgbrk "$(zmktemp)" /dev/stdout`
  28. application/msword;doc `catdoc -s cp1252 -d utf-8 -f ascii -w /dev/stdin`
  29. application/mspowerpoint;application/powerpoint;application/x-mspowerpoint;application/vnd.ms-powerpoint;ppt `catppt -s cp1252 -d utf-8 /dev/stdin`
  30. application/excel;application/x-excel;application/x-msexcel;application/vnd.ms-excel;xls `xls2csv -s cp1252 -d utf-8 -c ' ' /dev/stdin`
  31. application/vnd.openxmlformats-officedocument.wordprocessingml.document;docx `zmktemp | unzip -p "$(cat)" word/document.xml | xsltproc "${SCRIPTPATH}/xmltotext.xslt" -`
  32. application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;xlsx `zmktemp | unzip -o -qq "$(cat)" -d "$TMPDIR"; find "$TMPDIR" -name '*.xml' -exec xsltproc "${SCRIPTPATH}/xmltotext.xslt" {} \;`
  33. application/vnd.openxmlformats-officedocument.presentationml.presentation;pptx `zmktemp | unzip -o -qq "$(cat)" -d "$TMPDIR"; find "$TMPDIR" -name '*.xml' -exec xsltproc "${SCRIPTPATH}/xmltotext.xslt" {} \;`
  34. application/vnd.oasis.opendocument.text;odt `zmktemp | unzip -p "$(cat)" content.xml | xsltproc "${SCRIPTPATH}/xmltotext.xslt" -`
  35. application/vnd.oasis.opendocument.spreadsheet;ods `zmktemp | unzip -p "$(cat)" content.xml | xsltproc "${SCRIPTPATH}/xmltotext.xslt" -`
  36. application/vnd.oasis.opendocument.presentation;odp `zmktemp | unzip -p "$(cat)" content.xml | xsltproc "${SCRIPTPATH}/xmltotext.xslt" -`
  37. image/gif;image/jpeg;image/png;image/tiff;gif;jpg;jpeg;png;tif;tiff `echo -n`
  38. # Fallback if the MIME type was not specified: detect it with file(1) and re-run attachments_parser with the result; a still-generic result is passed on as stop_loop (presumably to prevent endless recursion - verify in attachments_parser).
  39. application/octet-stream `FILE=$(zmktemp); attachments_parser < "$FILE" mime $(file -b --mime-type "$FILE" | awk '{print $1}' | sed -e 's%application/octet-stream%stop_loop%')`