1234567891011121314151617181920212223242526272829303132333435363738394041 |
- # Database file containing a map of mime-types and extensions together
- # with the corresponding command to parse the attachment into plain-text.
- #
- # The layout of each line is as follows:
- #
- # <mime-type>;<extension> `<command>`
- #
- # Each line can have as many mime-types and extensions as needed.
- #
- # The command must assume the attachment data is coming through /dev/stdin
- # and should return the parsed data through /dev/stdout. When a utility
- # requires the attachment data to be stored on disk, the command `zmktemp`
- # can be used. That script writes all attachment data to a temporary file
- # and prints the location of that file to /dev/stdout.
- #
- # To suppress log messages from the indexer regarding file types which should
- # not be indexed (like images), the command `echo -n` can be used.
- #
- # The command will be detected and executed by the attachments_parser script.
- # To test a command, run it as follows and check the result:
- #
- # <command> < <attachment>
- #
- text/plain;txt `cat`
- text/html;html;htm `FILE=$(zmktemp); w3m -dump -T text/html -s -O utf-8 $FILE`
- application/xml;text/xml;xml `xsltproc ${SCRIPTPATH}/xmltotext.xslt -`
- application/pdf;pdf `pdftotext -q -nopgbrk $(zmktemp) /dev/stdout`
- application/msword;doc `catdoc -s cp1252 -d utf-8 -f ascii -w /dev/stdin`
- application/mspowerpoint;application/powerpoint;application/x-mspowerpoint;application/vnd.ms-powerpoint;ppt `catppt -s cp1252 -d utf-8 /dev/stdin`
- application/excel;application/x-excel;application/x-msexcel;application/vnd.ms-excel;xls `xls2csv -s cp1252 -d utf-8 -c ' ' /dev/stdin`
- application/vnd.openxmlformats-officedocument.wordprocessingml.document;docx `zmktemp | unzip -p $(cat) word/document.xml | xsltproc ${SCRIPTPATH}/xmltotext.xslt -`
- application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;xlsx `zmktemp | unzip -o -qq $(cat) -d $TMPDIR; for i in $(find $TMPDIR -name \*.xml); do xsltproc ${SCRIPTPATH}/xmltotext.xslt $i; done`
- application/vnd.openxmlformats-officedocument.presentationml.presentation;pptx `zmktemp | unzip -o -qq $(cat) -d $TMPDIR; for i in $(find $TMPDIR -name \*.xml); do xsltproc ${SCRIPTPATH}/xmltotext.xslt $i; done`
- application/vnd.oasis.opendocument.text;odt `zmktemp | unzip -p $(cat) content.xml | xsltproc ${SCRIPTPATH}/xmltotext.xslt -`
- application/vnd.oasis.opendocument.spreadsheet;ods `zmktemp | unzip -p $(cat) content.xml | xsltproc ${SCRIPTPATH}/xmltotext.xslt -`
- application/vnd.oasis.opendocument.presentation;odp `zmktemp | unzip -p $(cat) content.xml | xsltproc ${SCRIPTPATH}/xmltotext.xslt -`
- image/gif;image/jpeg;image/png;image/tiff;gif;jpg;jpeg;png;tif;tiff `echo -n`
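- # Fallback if the MIME type was not specified: the attachment is saved with
- # zmktemp, its type is detected with `file --mime-type`, and attachments_parser
- # is invoked again with the detected type. If detection still yields
- # application/octet-stream, it is replaced with `stop_loop` (presumably so the
- # parser does not keep re-entering this fallback; confirm in attachments_parser).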
- application/octet-stream `FILE=$(zmktemp); attachments_parser < $FILE mime $(file --mime-type $FILE | awk '{print $2}' | sed -e 's%application/octet-stream%stop_loop%')`
|