DjVuImage.php 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. <?php
/**
 * DjVu image handler.
 *
 * Copyright © 2006 Brion Vibber <brion@pobox.com>
 * https://www.mediawiki.org/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Media
 */
  26. use MediaWiki\Shell\Shell;
  /**
   * Support for detecting/validating DjVu image files and getting
   * some basic file metadata (resolution etc)
   *
   * File format docs are available in source package for DjVuLibre:
   * http://djvulibre.djvuzone.org/
   *
   * @ingroup Media
   */
  class DjVuImage {
      /**
       * Memory limit handed to wfShellExec() when running the djvutxt tool
       */
      const DJVUTXT_MEMORY_LIMIT = 300000;

      /** @var string Path of the DjVu file being inspected */
      private $mFilename;

      /**
       * @param string $filename The DjVu file name.
       */
      public function __construct( $filename ) {
          $this->mFilename = $filename;
      }

      /**
       * Check if the given file is indeed a valid DjVu image file
       * @return bool
       */
      public function isValid() {
          return $this->getInfo() !== false;
      }

      /**
       * Return data in the style of getimagesize()
       * @return array|false Array or false on failure
       */
      public function getImageSize() {
          $data = $this->getInfo();
          if ( $data === false ) {
              return false;
          }

          $width = $data['width'];
          $height = $data['height'];

          return [ $width, $height, 'DjVu',
              "width=\"$width\" height=\"$height\"" ];
      }

      // ---------

      /**
       * For debugging; dump the IFF chunk structure to standard output.
       *
       * Prints nothing (and only logs through wfDebug) when the file cannot
       * be opened or is too short to contain the 12-byte outer header.
       */
      public function dump() {
          Wikimedia\suppressWarnings();
          $file = fopen( $this->mFilename, 'rb' );
          Wikimedia\restoreWarnings();
          if ( $file === false ) {
              wfDebug( __METHOD__ . ": missing or failed file read\n" );
              return;
          }

          $header = fread( $file, 12 );
          if ( $header === false || strlen( $header ) < 12 ) {
              // unpack() below needs all 12 bytes
              wfDebug( __METHOD__ . ": too short file header\n" );
              fclose( $file );
              return;
          }

          $arr = unpack( 'a4magic/a4chunk/NchunkLength', $header );
          $chunk = $arr['chunk'];
          $chunkLength = $arr['chunkLength'];
          echo "$chunk $chunkLength\n";
          $this->dumpForm( $file, $chunkLength, 1 );
          fclose( $file );
      }

      /**
       * Print the chunks of one FORM, recursing into nested FORMs.
       *
       * @param resource $file Open file handle positioned right after the FORM header
       * @param int $length Declared length of the FORM payload
       * @param int $indent Nesting depth; each level is printed with 4 spaces
       */
      private function dumpForm( $file, $length, $indent ) {
          $start = ftell( $file );
          $secondary = fread( $file, 4 );
          $pad = str_repeat( ' ', $indent * 4 );
          echo $pad . "($secondary)\n";

          while ( ftell( $file ) - $start < $length ) {
              list( $chunk, $chunkLength ) = $this->readChunk( $file );
              if ( $chunk === false ) {
                  // End of file, or a truncated (1-7 byte) chunk header
                  break;
              }

              echo $pad . "$chunk $chunkLength\n";

              if ( $chunk === 'FORM' ) {
                  $this->dumpForm( $file, $chunkLength, $indent + 1 );
              } else {
                  $this->skipChunk( $file, $chunkLength );
              }
          }
      }

      /**
       * Read basic image information from the file header.
       *
       * @return array|false Array with the keys 'width', 'height', 'version',
       *  'resolution' and 'gamma', or false if the file cannot be read or is
       *  not a recognised DjVu file
       */
      public function getInfo() {
          Wikimedia\suppressWarnings();
          $file = fopen( $this->mFilename, 'rb' );
          Wikimedia\restoreWarnings();
          if ( $file === false ) {
              wfDebug( __METHOD__ . ": missing or failed file read\n" );

              return false;
          }

          $header = fread( $file, 16 );
          $info = false;

          if ( strlen( $header ) < 16 ) {
              wfDebug( __METHOD__ . ": too short file header\n" );
          } else {
              $arr = unpack( 'a4magic/a4form/NformLength/a4subtype', $header );
              $subtype = $arr['subtype'];

              if ( $arr['magic'] !== 'AT&T' ) {
                  wfDebug( __METHOD__ . ": not a DjVu file\n" );
              } elseif ( $subtype === 'DJVU' ) {
                  // Single-page document
                  $info = $this->getPageInfo( $file );
              } elseif ( $subtype === 'DJVM' ) {
                  // Multi-page document
                  $info = $this->getMultiPageInfo( $file, $arr['formLength'] );
              } else {
                  wfDebug( __METHOD__ . ": unrecognized DJVU file type '{$arr['subtype']}'\n" );
              }
          }
          fclose( $file );

          return $info;
      }

      /**
       * Read an 8-byte chunk header (4-byte id, 32-bit big-endian length).
       *
       * @param resource $file Open file handle
       * @return array [ string|false $chunkId, int $length ]; the id is false
       *  when fewer than 8 bytes could be read
       */
      private function readChunk( $file ) {
          $header = fread( $file, 8 );
          if ( strlen( $header ) < 8 ) {
              return [ false, 0 ];
          }

          $arr = unpack( 'a4chunk/Nlength', $header );

          return [ $arr['chunk'], $arr['length'] ];
      }

      /**
       * Seek past a chunk payload, including the padding byte that follows
       * odd-length chunks.
       *
       * @param resource $file Open file handle
       * @param int $chunkLength Number of payload bytes to skip
       */
      private function skipChunk( $file, $chunkLength ) {
          fseek( $file, $chunkLength, SEEK_CUR );

          if ( ( $chunkLength & 1 ) && !feof( $file ) ) {
              // padding byte
              fseek( $file, 1, SEEK_CUR );
          }
      }

      /**
       * Locate the first page of a multi-page (DJVM) document and return
       * its information.
       *
       * @param resource $file Open file handle positioned after the 16-byte header
       * @param int $formLength Declared length of the outer FORM
       * @return array|false Same shape as getPageInfo(), false if no page is found
       */
      private function getMultiPageInfo( $file, $formLength ) {
          // For now, we'll just look for the first page in the file
          // and report its information, hoping others are the same size.
          $start = ftell( $file );
          do {
              list( $chunk, $length ) = $this->readChunk( $file );
              if ( !$chunk ) {
                  break;
              }

              if ( $chunk === 'FORM' ) {
                  $subtype = fread( $file, 4 );
                  if ( $subtype === 'DJVU' ) {
                      wfDebug( __METHOD__ . ": found first subpage\n" );

                      return $this->getPageInfo( $file );
                  }
                  // The 4-byte subtype has already been consumed
                  $this->skipChunk( $file, $length - 4 );
              } else {
                  wfDebug( __METHOD__ . ": skipping '$chunk' chunk\n" );
                  $this->skipChunk( $file, $length );
              }
          } while ( $length != 0 && !feof( $file ) && ftell( $file ) - $start < $formLength );

          wfDebug( __METHOD__ . ": multi-page DJVU file contained no pages\n" );

          return false;
      }

      /**
       * Parse the INFO chunk expected at the current file position.
       *
       * @param resource $file Open file handle positioned at the INFO chunk header
       * @return array|false Array with the keys 'width', 'height', 'version',
       *  'resolution' and 'gamma', or false if the chunk is missing or truncated
       */
      private function getPageInfo( $file ) {
          list( $chunk, $length ) = $this->readChunk( $file );
          if ( $chunk !== 'INFO' ) {
              wfDebug( __METHOD__ . ": expected INFO chunk, got '$chunk'\n" );

              return false;
          }

          if ( $length < 9 ) {
              wfDebug( __METHOD__ . ": INFO should be 9 or 10 bytes, found $length\n" );

              return false;
          }
          $data = fread( $file, $length );
          if ( strlen( $data ) < $length ) {
              wfDebug( __METHOD__ . ": INFO chunk cut off\n" );

              return false;
          }

          // Width and height are unpacked big-endian ('n'), the resolution
          // little-endian ('v').
          $arr = unpack(
              'nwidth/' .
              'nheight/' .
              'Cminor/' .
              'Cmajor/' .
              'vresolution/' .
              'Cgamma', $data );

          # Newer files have rotation info in byte 10, but we don't use it yet.

          return [
              'width' => $arr['width'],
              'height' => $arr['height'],
              'version' => "{$arr['major']}.{$arr['minor']}",
              'resolution' => $arr['resolution'],
              'gamma' => $arr['gamma'] / 10.0 ];
      }

      /**
       * Return an XML string describing the DjVu image
       *
       * The structural XML comes from $wgDjvuDump (preferred) or $wgDjvuToXML.
       * If $wgDjvuTxt is set as well and djvutxt succeeds, the result is wrapped
       * in an <mw-djvu> element that additionally carries the text layer.
       *
       * @return string|bool|null XML string; false if the file is invalid or
       *  the dump could not be converted; null if no structure tool is configured
       */
      public function retrieveMetaData() {
          global $wgDjvuToXML, $wgDjvuDump, $wgDjvuTxt;

          if ( !$this->isValid() ) {
              return false;
          }

          if ( isset( $wgDjvuDump ) ) {
              # djvudump is faster as of version 3.5
              # https://sourceforge.net/p/djvu/bugs/71/
              $cmd = Shell::escape( $wgDjvuDump ) . ' ' . Shell::escape( $this->mFilename );
              $dump = wfShellExec( $cmd );
              $xml = $this->convertDumpToXML( $dump );
          } elseif ( isset( $wgDjvuToXML ) ) {
              $cmd = Shell::escape( $wgDjvuToXML ) . ' --without-anno --without-text ' .
                  Shell::escape( $this->mFilename );
              $xml = wfShellExec( $cmd );
          } else {
              $xml = null;
          }

          # Text layer
          # Only attach it when there is base XML to wrap: appending the text
          # and a closing </mw-djvu> to null/false/'' would yield an unbalanced
          # fragment that no XML parser accepts.
          $hasBaseXml = is_string( $xml ) && $xml !== '';
          if ( $hasBaseXml && isset( $wgDjvuTxt ) ) {
              $cmd = Shell::escape( $wgDjvuTxt ) . ' --detail=page ' . Shell::escape( $this->mFilename );
              wfDebug( __METHOD__ . ": $cmd\n" );
              $retval = '';
              $txt = wfShellExec( $cmd, $retval, [], [ 'memory' => self::DJVUTXT_MEMORY_LIMIT ] );
              if ( $retval == 0 ) {
                  # Strip some control characters
                  $txt = preg_replace( "/[\013\035\037]/", "", $txt );
                  $reg = <<<EOR
/\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*"
((?> # Text to match is composed of atoms of either:
\\\\. # - any escaped character
| # - any character different from " and \
[^"\\\\]+
)*?)
"\s*\)
| # Or page can be empty ; in this case, djvutxt dumps ()
\(\s*()\)/sx
EOR;
                  $txt = preg_replace_callback( $reg, [ $this, 'pageTextCallback' ], $txt );
                  $txt = "<DjVuTxt>\n<HEAD></HEAD>\n<BODY>\n" . $txt . "</BODY>\n</DjVuTxt>\n";
                  $xml = preg_replace( "/<DjVuXML>/", "<mw-djvu><DjVuXML>", $xml, 1 ) .
                      $txt .
                      '</mw-djvu>';
              }
          }

          return $xml;
      }

      /**
       * preg_replace_callback() handler turning one djvutxt page record into
       * a <PAGE> element.
       *
       * @param array $matches Regex match; index 1 holds the escaped page text
       * @return string
       */
      public function pageTextCallback( $matches ) {
          # Get rid of invalid UTF-8, strip control characters
          $val = htmlspecialchars( UtfNormal\Validator::cleanUp( stripcslashes( $matches[1] ) ) );
          # "\xEF\xBF\xBD" is U+FFFD REPLACEMENT CHARACTER, spelled as bytes so
          # the literal survives any re-encoding of this source file.
          $val = str_replace( [ "\n", "\xEF\xBF\xBD" ], [ '&#10;', '' ], $val );

          return '<PAGE value="' . $val . '" />';
      }

      /**
       * Hack to temporarily work around djvutoxml bug
       *
       * Builds a DjVuXML document from djvudump output. The dump is walked with
       * strtok(), whose internal position is shared with parseFormDjvu().
       *
       * @param string $dump Output of djvudump
       * @return string|false XML, or false if the dump is empty, describes an
       *  indirect document, or contains no parsable page
       */
      public function convertDumpToXML( $dump ) {
          if ( strval( $dump ) === '' ) {
              return false;
          }

          $xml = <<<EOT
<?xml version="1.0" ?>
<!DOCTYPE DjVuXML PUBLIC "-//W3C//DTD DjVuXML 1.1//EN" "pubtext/DjVuXML-s.dtd">
<DjVuXML>
<HEAD></HEAD>
<BODY>
EOT;

          $dump = str_replace( "\r", '', $dump );
          $line = strtok( $dump, "\n" );
          $m = false;
          $good = false;
          if ( preg_match( '/^( *)FORM:DJVU/', $line, $m ) ) {
              # Single-page
              if ( $this->parseFormDjvu( $line, $xml ) ) {
                  $good = true;
              } else {
                  return false;
              }
          } elseif ( preg_match( '/^( *)FORM:DJVM/', $line, $m ) ) {
              # Multi-page
              $parentLevel = strlen( $m[1] );
              # Find DIRM
              $line = strtok( "\n" );
              while ( $line !== false ) {
                  $childLevel = strspn( $line, ' ' );
                  if ( $childLevel <= $parentLevel ) {
                      # End of chunk
                      break;
                  }

                  if ( preg_match( '/^ *DIRM.*indirect/', $line ) ) {
                      wfDebug( "Indirect multi-page DjVu document, bad for server!\n" );

                      return false;
                  }
                  if ( preg_match( '/^ *FORM:DJVU/', $line ) ) {
                      # Found page
                      if ( $this->parseFormDjvu( $line, $xml ) ) {
                          $good = true;
                      } else {
                          return false;
                      }
                  }
                  $line = strtok( "\n" );
              }
          }
          if ( !$good ) {
              return false;
          }

          $xml .= "</BODY>\n</DjVuXML>\n";

          return $xml;
      }

      /**
       * Consume the lines of one FORM:DJVU section from the active strtok()
       * stream and append an <OBJECT> element for its INFO line.
       *
       * @param string $line The "FORM:DJVU" line that opens the section
       * @param string &$xml XML buffer to append to
       * @return bool True if an INFO line was found and appended
       */
      public function parseFormDjvu( $line, &$xml ) {
          $parentLevel = strspn( $line, ' ' );
          $line = strtok( "\n" );

          # Find INFO
          while ( $line !== false ) {
              $childLevel = strspn( $line, ' ' );
              if ( $childLevel <= $parentLevel ) {
                  # End of chunk
                  break;
              }

              if ( preg_match(
                  '/^ *INFO *\[\d*\] *DjVu *(\d+)x(\d+), *\w*, *(\d+) *dpi, *gamma=([0-9.-]+)/',
                  $line,
                  $m
              ) ) {
                  $xml .= Xml::tags(
                      'OBJECT',
                      [
                          # 'data' => '',
                          # 'type' => 'image/x.djvu',
                          'height' => $m[2],
                          'width' => $m[1],
                          # 'usemap' => '',
                      ],
                      "\n" .
                      Xml::element( 'PARAM', [ 'name' => 'DPI', 'value' => $m[3] ] ) . "\n" .
                      Xml::element( 'PARAM', [ 'name' => 'GAMMA', 'value' => $m[4] ] ) . "\n"
                  ) . "\n";

                  return true;
              }
              $line = strtok( "\n" );
          }

          # Not found
          return false;
      }
  }