JpegMetadataExtractor.php 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301
  1. <?php
  2. /**
  3. * Extraction of JPEG image metadata.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License along
  16. * with this program; if not, write to the Free Software Foundation, Inc.,
  17. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. * http://www.gnu.org/copyleft/gpl.html
  19. *
  20. * @file
  21. * @ingroup Media
  22. */
  23. use Wikimedia\XMPReader\Reader as XMPReader;
  /**
   * Reads JPEG files and extracts the segments that carry metadata.
   * See also BitmapMetadataHandler.
   *
   * Based somewhat on GIFMetadataExtractor.
   *
   * @ingroup Media
   */
  class JpegMetadataExtractor {
      /**
       * Sanity limit on the number of segments scanned in a single file.
       * A JPEG file should never even remotely have that many segments;
       * an average file has about 10.
       */
      const MAX_JPEG_SEGMENTS = 200;

      /**
       * Extract the metadata segments of interest from a JPEG file.
       *
       * getimagesize() can almost do this, but it does not support multiple
       * APP1 segments, so it cannot extract XMP from files that contain both
       * Exif and XMP data.
       *
       * The file is scanned marker by marker until the start-of-scan (SOS) or
       * end-of-image (EOI) marker is reached. The file handle is always closed
       * before this method returns or throws.
       *
       * @param string $filename Name of jpeg file
       * @return array Array of interesting segments, with these keys:
       *  - 'XMP_ext': list of extended XMP payloads (identifier stripped)
       *  - 'COM': list of comments that could be read as UTF-8
       *  - 'PSIR': list of raw APP13 "Photoshop 3.0" segments
       *  - 'XMP': main XMP packet; only set when one was found and XMP
       *    reading is supported
       *  - 'byteOrder': 'BE' or 'LE'; only set when an Exif segment with a
       *    recognised byte order mark was found
       * @throws MWException If given invalid file.
       */
      public static function segmentSplitter( $filename ) {
          $showXMP = XMPReader::isSupported();
          $segmentCount = 0;
          $segments = [
              'XMP_ext' => [],
              'COM' => [],
              'PSIR' => [],
          ];

          if ( !$filename ) {
              throw new MWException( "No filename specified for " . __METHOD__ );
          }
          if ( !file_exists( $filename ) || is_dir( $filename ) ) {
              throw new MWException( "Invalid file $filename passed to " . __METHOD__ );
          }

          $fh = fopen( $filename, "rb" );
          if ( !$fh ) {
              throw new MWException( "Could not open file $filename" );
          }

          // Everything that touches the open handle lives inside try/finally so
          // the handle is released on every exit path, including exceptions.
          try {
              $buffer = fread( $fh, 2 );
              if ( $buffer !== "\xFF\xD8" ) {
                  throw new MWException( "Not a jpeg, no SOI" );
              }

              while ( !feof( $fh ) ) {
                  $buffer = fread( $fh, 1 );
                  $segmentCount++;
                  if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
                      // this is just a sanity check
                      throw new MWException( 'Too many jpeg segments. Aborting' );
                  }

                  while ( $buffer !== "\xFF" && !feof( $fh ) ) {
                      // In theory JPEG files are not allowed to contain anything
                      // between the sections, but in practice they sometimes do.
                      // It's customary to ignore the garbage data.
                      $buffer = fread( $fh, 1 );
                  }

                  $buffer = fread( $fh, 1 );
                  while ( $buffer === "\xFF" && !feof( $fh ) ) {
                      // Skip through any 0xFF padding bytes.
                      $buffer = fread( $fh, 1 );
                  }

                  // $buffer now holds the marker type byte.
                  if ( $buffer === "\xFE" ) {
                      // COM section -- file comment.
                      // First see if it is valid UTF-8; if not, try to convert
                      // it from windows-1252.
                      $com = $oldCom = trim( self::jpegExtractMarker( $fh ) );
                      // quickIsNFCVerify() turns $com into valid UTF-8, thus if
                      // nothing changed it was UTF-8, otherwise something else.
                      UtfNormal\Validator::quickIsNFCVerify( $com );
                      if ( $com !== $oldCom ) {
                          Wikimedia\suppressWarnings();
                          $com = $oldCom = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
                          Wikimedia\restoreWarnings();
                      }
                      // Try it again. If it is still not a valid string, it is
                      // probably binary junk or some really weird encoding, so
                      // don't extract it.
                      UtfNormal\Validator::quickIsNFCVerify( $com );
                      if ( $com === $oldCom ) {
                          $segments["COM"][] = $oldCom;
                      } else {
                          wfDebug( __METHOD__ . " Ignoring JPEG comment as is garbage.\n" );
                      }
                  } elseif ( $buffer === "\xE1" ) {
                      // APP1 section (Exif, XMP, and XMP extended).
                      // The XMP variants are only extracted if XMP is enabled.
                      $temp = self::jpegExtractMarker( $fh );
                      // Check what type of app segment this is.
                      if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
                          // use trim to remove trailing \0 chars
                          $segments["XMP"] = trim( substr( $temp, 29 ) );
                      } elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
                          // use trim to remove trailing \0 chars
                          $segments["XMP_ext"][] = trim( substr( $temp, 35 ) );
                      } elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
                          // Some images (especially flickr images) seem to have
                          // this malformed identifier. Accept it anyway.
                          // use trim to remove trailing \0 chars
                          $segments["XMP"] = trim( substr( $temp, 29 ) );
                          wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
                              . "Using anyways.\n" );
                      } elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
                          // Only the byte order is needed from the Exif segment.
                          // This is II for little endian, MM for big endian.
                          // It is not a unicode BOM.
                          $byteOrderMarker = substr( $temp, 6, 2 );
                          if ( $byteOrderMarker === 'MM' ) {
                              $segments['byteOrder'] = 'BE';
                          } elseif ( $byteOrderMarker === 'II' ) {
                              $segments['byteOrder'] = 'LE';
                          } else {
                              wfDebug( __METHOD__ . " Invalid byte ordering?!\n" );
                          }
                      }
                  } elseif ( $buffer === "\xED" ) {
                      // APP13 - PSIR. IPTC and some photoshop stuff
                      $temp = self::jpegExtractMarker( $fh );
                      if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
                          $segments["PSIR"][] = $temp;
                      }
                  } elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
                      // EOI - end of image or SOS - start of scan.
                      // Either way we're past any interesting segments.
                      return $segments;
                  } else {
                      // Segment we don't care about, so skip its payload.
                      // Note it's possible to seek beyond end of file if the
                      // file is truncated; fseek doesn't report a failure in
                      // this case.
                      fseek( $fh, self::readPayloadLength( $fh ), SEEK_CUR );
                  }
              }

              // shouldn't get here.
              throw new MWException( "Reached end of jpeg file unexpectedly" );
          } finally {
              fclose( $fh );
          }
      }

      /**
       * Read the two-byte big-endian size field of a segment and return the
       * number of payload bytes that follow it.
       *
       * The size stored in the file counts its own two bytes, so the payload
       * length is the stored size minus 2.
       *
       * @param resource $fh File handle positioned at a segment size field
       * @throws MWException If the stored size is smaller than 2
       * @return int Number of payload bytes after the size field (0 or more)
       */
      private static function readPayloadLength( $fh ) {
          $size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
          if ( $size['int'] < 2 ) {
              throw new MWException( "invalid marker size in jpeg" );
          }

          return $size['int'] - 2;
      }

      /**
       * Helper function for segmentSplitter: read the payload of the segment
       * whose size field starts at the current file position.
       *
       * @param resource $fh File handle for JPEG file
       * @throws MWException If the size field is invalid or the file ends
       *  before the whole payload could be read
       * @return string Data content of segment.
       */
      private static function jpegExtractMarker( $fh ) {
          $length = self::readPayloadLength( $fh );
          if ( $length === 0 ) {
              // fread( ..., 0 ) generates a warning
              return '';
          }

          $segment = fread( $fh, $length );
          if ( $segment === false || strlen( $segment ) !== $length ) {
              throw new MWException( "Segment shorter than expected" );
          }

          return $segment;
      }

      /**
       * This reads the photoshop image resource.
       * Currently it only compares the iptc/iim hash
       * with the stored hash, which is used to determine the precedence
       * of the iptc data. In future it may extract some other info, like
       * url of copyright license.
       *
       * This should generally be called by BitmapMetadataHandler::doApp13()
       *
       * @param string $app13 Photoshop psir app13 block from jpg.
       * @throws MWException (It gets caught next level up though)
       * @return string If the iptc hash is good or not. One of 'iptc-no-hash',
       *  'iptc-good-hash', 'iptc-bad-hash'.
       */
      public static function doPSIR( $app13 ) {
          if ( !$app13 ) {
              throw new MWException( "No App13 segment given" );
          }

          // Compare the stored hash with the real thing.
          // Record 0x404 contains the IPTC data, 0x425 has the hash.
          // This is used to determine if the iptc is newer than
          // the xmp data, as xmp programs update the hash,
          // where non-xmp programs don't.

          // Skip past the "Photoshop 3.0" identifier; it should already have
          // been checked by the caller.
          $offset = 14;
          $appLen = strlen( $app13 );
          $realHash = "";
          $recordedHash = "";

          // The +12 is the length of an empty item.
          while ( $offset + 12 <= $appLen ) {
              // The signature is supposed to be 8BIM, but apparently sometimes
              // isn't, especially in really old jpg's. Such records are still
              // walked over, but their content is ignored.
              $valid = substr( $app13, $offset, 4 ) === '8BIM';
              $offset += 4;

              // A 2 byte id number identifies the piece of info this
              // record contains.
              $id = substr( $app13, $offset, 2 );
              $offset += 2;

              // Some record types can contain a name, which is a pascal string
              // 0-padded to be an even number of bytes. Most times (and any
              // time we care) this is empty, making it two null bytes.
              // The name is never used, so skip it. +1 is for the length byte.
              $lenName = ord( substr( $app13, $offset, 1 ) ) + 1;
              if ( $lenName % 2 === 1 ) {
                  // pad to even.
                  $lenName++;
              }
              $offset += $lenName;

              // Now the length of the data (unsigned long, big endian).
              $lenData = wfUnpack( 'Nlen', substr( $app13, $offset, 4 ), 4 );
              // PHP can take issue with very large unsigned ints and make them
              // negative. Which should never ever happen, as this has to be
              // inside a segment which is limited to a 16 bit number.
              if ( $lenData['len'] < 0 ) {
                  throw new MWException( "Too big PSIR (" . $lenData['len'] . ')' );
              }
              $offset += 4; // 4 bytes length field

              // This should not happen, but check.
              if ( $lenData['len'] + $offset > $appLen ) {
                  throw new MWException( "PSIR data too long. (item length=" . $lenData['len']
                      . "; offset=$offset; total length=$appLen)" );
              }

              if ( $valid ) {
                  switch ( $id ) {
                      case "\x04\x04":
                          // IPTC block: compute its actual (raw binary) md5.
                          $realHash = md5( substr( $app13, $offset, $lenData['len'] ), true );
                          break;
                      case "\x04\x25":
                          // Hash recorded in the file.
                          $recordedHash = substr( $app13, $offset, $lenData['len'] );
                          break;
                  }
              }

              // If odd, add 1 to the length to account for the null pad byte.
              if ( $lenData['len'] % 2 === 1 ) {
                  $lenData['len']++;
              }
              $offset += $lenData['len'];
          }

          if ( !$realHash || !$recordedHash ) {
              return 'iptc-no-hash';
          } elseif ( $realHash === $recordedHash ) {
              return 'iptc-good-hash';
          } else { /* $realHash !== $recordedHash */
              return 'iptc-bad-hash';
          }
      }
  }