IPTC.php 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587
  1. <?php
  2. /**
  3. * Class for some IPTC functions.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License along
  16. * with this program; if not, write to the Free Software Foundation, Inc.,
  17. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. * http://www.gnu.org/copyleft/gpl.html
  19. *
  20. * @file
  21. * @ingroup Media
  22. */
  23. /**
  24. * Class for some IPTC functions.
  25. *
  26. * @ingroup Media
  27. */
  class IPTC {
      /**
       * Convert the result of iptcparse() into a metadata array keyed by
       * exif-style property names. Generally called from
       * BitmapMetadataHandler::doApp13.
       *
       * Tags that map one-to-one onto a metadata key go through a lookup
       * table, date tags are paired with their companion time tag, and the
       * few tags needing bespoke treatment are handled individually. Tags not
       * listed anywhere are logged and skipped (this purposely does not do
       * 2:125, 2:130, 2:131, 2:47, 2:50, 2:45, 2:42, 2:8, 2:3, 2:200, 2:201,
       * 2:202 or the audio tags 2:150 to 2:154).
       *
       * @see http://www.iptc.org/std/IIM/4.1/specification/IIMV4.1.pdf
       *
       * @param string $rawData The app13 block from jpeg containing iptc/iim data
       * @return array IPTC metadata array. Empty when iptcparse() yields no
       *  array, or when tag 1:90 names a charset getCharset() does not know.
       * @suppress PhanTypeArraySuspicious
       */
      public static function parse( $rawData ) {
          // Tags copied into exactly one metadata key after charset conversion.
          $simpleTags = [
              '2#120' => 'ImageDescription', // caption. Mapped with exif ImageDescription
              '2#116' => 'Copyright', // mapped with exif copyright
              '2#025' => 'Keywords',
              '2#101' => 'CountryDest', // country (shown)
              '2#095' => 'ProvinceOrStateDest', // state/province (shown)
              '2#090' => 'CityDest', // city (shown)
              '2#092' => 'SublocationDest', // sublocation (shown)
              '2#005' => 'ObjectName', // object name/title
              '2#040' => 'SpecialInstructions',
              '2#105' => 'Headline',
              '2#110' => 'Credit', // provider of the objectdata, not necessarily the owner/creator
              '2#115' => 'Source', // original owner of the intellectual content
              '2#007' => 'EditStatus', // lead, correction, etc
              '2#015' => 'iimCategory', // deprecated. max 3 letters in theory, often more
              '2#020' => 'iimSupplementalCategory', // deprecated
              '2#010' => 'Urgency', // 1-8. 1 most, 5 normal, 8 low priority
              '2#022' => 'FixtureIdentifier', // objectdata that recurs often, e.g. Euroweather
              '2#026' => 'LocationDestCode', // iso 3166 + custom codes, e.g. TUR, XUN, XSP
              '2#027' => 'LocationDest', // full printable name of location of photo
              '2#075' => 'ObjectCycle', // a for morning (am), p for evening, b for both
              '2#100' => 'CountryCodeDest', // country/primary location code
              '2#103' => 'OriginalTransmissionRef',
              '2#118' => 'Contact',
              '2#122' => 'Writer', // writer/editor of the objectdata or caption
              '2#135' => 'LanguageCode',
          ];

          // Date tag => [ companion time tag, metadata key ].
          // Incomplete dates are not accepted even though the spec allows them.
          $dateTags = [
              '2#055' => [ '2#060', 'DateTimeOriginal' ], // date created (not digitized)
              '2#062' => [ '2#063', 'DateTimeDigitized' ], // converted to digital form
              '2#030' => [ '2#035', 'DateTimeReleased' ],
              '2#037' => [ '2#038', 'DateTimeExpires' ],
          ];

          $parsed = iptcparse( $rawData );
          if ( !is_array( $parsed ) ) {
              return [];
          }

          // Charset info is contained in tag 1:90.
          $c = '';
          if ( isset( $parsed['1#090'][0] ) ) {
              $c = self::getCharset( $parsed['1#090'][0] );
              if ( $c === false ) {
                  // Unknown charset: refuse to parse. Note that "unknown"
                  // is different from "no charset specified".
                  return [];
              }
              unset( $parsed['1#090'] );
          }

          $data = [];
          foreach ( $parsed as $tag => $val ) {
              // 2:00 is a binary number, not text: trim() would strip bytes
              // such as NUL and TAB from it, so it is exempt from this check.
              if ( $tag !== '2#000' && isset( $val[0] ) && trim( $val[0] ) === '' ) {
                  wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
                  continue;
              }

              if ( isset( $simpleTags[$tag] ) ) {
                  $data[$simpleTags[$tag]] = self::convIPTC( $val, $c );
                  continue;
              }

              if ( isset( $dateTags[$tag] ) ) {
                  list( $timeTag, $key ) = $dateTags[$tag];
                  $timestamp = self::timeHelper( $val, $parsed[$timeTag] ?? [], $c );
                  if ( $timestamp ) {
                      $data[$key] = $timestamp;
                  }
                  continue;
              }

              switch ( $tag ) {
                  case '2#080':
                      // Byline, mapped with exif Artist. Each byline title
                      // (2:85) is prefixed to the byline at the same index,
                      // like exif does with "Title, person". The two fields
                      // are no longer separate afterwards.
                      $bylines = self::convIPTC( $val, $c );
                      $titles = isset( $parsed['2#085'] )
                          ? self::convIPTC( $parsed['2#085'], $c )
                          : [];
                      foreach ( $titles as $i => $title ) {
                          // Theoretically always set, but be careful.
                          if ( isset( $bylines[$i] ) ) {
                              $bylines[$i] = $title . ', ' . $bylines[$i];
                          }
                      }
                      $data['Artist'] = $bylines;
                      break;

                  case '2#065':
                      // Originating program, combined with the program
                      // version (2:70) when that is present.
                      $software = self::convIPTC( $val, $c );
                      if ( count( $software ) !== 1 ) {
                          // The iim standard does not allow multiple values
                          // here, so something weird is going on. Skip it.
                          wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
                          break;
                      }
                      $softwareVersion = isset( $parsed['2#070'] )
                          ? self::convIPTC( $parsed['2#070'], $c )
                          : [];
                      if ( isset( $softwareVersion[0] ) ) {
                          $data['Software'] = [ [ $software[0], $softwareVersion[0] ] ];
                      } else {
                          $data['Software'] = $software;
                      }
                      break;

                  case '2#000':
                      // iim version. Unlike other tags this is a 2-byte
                      // big-endian binary number. Technically required, but
                      // in practise not always there.
                      if ( isset( $val[0] ) && strlen( $val[0] ) === 2 ) {
                          $data['iimVersion'] = ord( $val[0][0] ) * 256 + ord( $val[0][1] );
                      }
                      break;

                  case '2#004':
                      // IntellectualGenre. The first 4 characters are an id
                      // code we are not interested in. iim 4.1 allows
                      // multiple values but XMP does not, so only the first
                      // value is extracted.
                      $con = self::convIPTC( $val, $c );
                      if ( !isset( $con[0] ) || strlen( $con[0] ) < 5 ) {
                          wfDebugLog( 'iptc', 'IPTC: '
                              . '2:04 too short. '
                              . 'Ignoring.' );
                          break;
                      }
                      $data['IntellectualGenre'] = substr( $con[0], 4 );
                      break;

                  case '2#012':
                      // Subject News code. A compound field of which only the
                      // 8 digit (ascii) subject code is extracted. Processing
                      // stops at the first malformed entry; when several
                      // entries are valid the last one wins.
                      foreach ( self::convIPTC( $val, $c ) as $ic ) {
                          $fields = explode( ':', $ic, 3 );
                          if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
                              wfDebugLog( 'iptc', 'IPTC: '
                                  . 'Invalid 2:12 - ' . $ic );
                              break;
                          }
                          $data['SubjectNewsCode'] = $fields[1];
                      }
                      break;

                  case '2#070':
                  case '2#060':
                  case '2#063':
                  case '2#085':
                  case '2#038':
                  case '2#035':
                      // Companion tags, consumed while handling their
                      // primary tag above.
                      break;

                  default:
                      wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
                      break;
              }
          }

          return $data;
      }

      /**
       * Convert an iptc date tag and its time tag into the exif format.
       *
       * @todo Potentially this should also capture the timezone offset.
       * @param array $date The date tag; must hold exactly one CCYYMMDD value
       * @param array $time The time tag; when it does not hold exactly one
       *  HHMMSS±HHMM value, only the date part is returned
       * @param string $charset
       * @return string|null Date in EXIF format (first 10 characters only
       *  when no usable time tag was given), or null if the input is invalid.
       */
      private static function timeHelper( $date, $time, $charset ) {
          if ( count( $date ) !== 1 ) {
              // The standard says this should always be 1.
              return null;
          }
          list( $date ) = self::convIPTC( $date, $charset );

          $dateOnly = count( $time ) !== 1;
          if ( $dateOnly ) {
              $time = '000000+0000'; // placeholder
          } else {
              list( $time ) = self::convIPTC( $time, $charset );
          }

          // Both patterns are anchored at the start because the values are
          // sliced at fixed offsets below; an unanchored match would let
          // values with leading junk through and slice them wrongly.
          // Note, this rejects some dates that are valid according to the
          // iptc spec. For example 00000400 means the photo was taken in
          // April but year and day are unknown. Such incomplete dates are
          // not processed at the moment.
          $valid = preg_match( '/^\d{6}[-+]\d{4}/', $time )
              && preg_match( '/^\d{8}/', $date )
              && substr( $date, 0, 4 ) !== '0000'
              && substr( $date, 4, 2 ) !== '00'
              && substr( $date, 6, 2 ) !== '00';
          if ( !$valid ) {
              wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )" );
              return null;
          }

          $unixTS = wfTimestamp( TS_UNIX, $date . substr( $time, 0, 6 ) );
          if ( $unixTS === false ) {
              wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $date $time." );
              return null;
          }

          // Offset in seconds, taken from the ±HHMM suffix.
          $tz = ( intval( substr( $time, 7, 2 ) ) * 60 * 60 )
              + ( intval( substr( $time, 9, 2 ) ) * 60 );
          if ( substr( $time, 6, 1 ) === '-' ) {
              $tz = -$tz;
          }

          // NOTE(review): the offset is added to the parsed time rather than
          // subtracted from it. Confirm this is the intended direction.
          $finalTimestamp = wfTimestamp( TS_EXIF, $unixTS + $tz );
          if ( $finalTimestamp === false ) {
              wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( $unixTS + $tz ) );
              return null;
          }

          // Without a real time tag only the date part is meaningful.
          return $dateOnly ? substr( $finalTimestamp, 0, 10 ) : $finalTimestamp;
      }

      /**
       * Helper function to convert charset for iptc values.
       *
       * @param string|array $data One iptc string, or an array of them
       * @param string $charset
       *
       * @return string|array The converted string, or an array with the same
       *  keys holding the converted strings
       */
      private static function convIPTC( $data, $charset ) {
          if ( !is_array( $data ) ) {
              return self::convIPTCHelper( $data, $charset );
          }

          // Build a fresh array instead of iterating by reference, so no
          // reference to the last element is left dangling.
          $converted = [];
          foreach ( $data as $key => $item ) {
              $converted[$key] = self::convIPTCHelper( $item, $charset );
          }
          return $converted;
      }

      /**
       * Helper function of a helper function to convert charset for iptc values.
       *
       * With a charset the value is converted to UTF-8 via iconv and trimmed.
       * Without one, the value is returned unchanged (not trimmed) if it
       * survives UTF-8 validation unmodified, and is otherwise converted as
       * Windows-1252.
       *
       * @param string $data The IPTC string
       * @param string $charset Source charset; empty when none was specified
       *
       * @return string Empty string if iconv fails
       */
      private static function convIPTCHelper( $data, $charset ) {
          if ( !$charset ) {
              // Treat as utf-8 if it is valid utf-8, otherwise pretend it is
              // windows-1252. Most of the time if there is no 1:90 tag the
              // data is either ascii, latin1, or utf-8.
              $oldData = $data;
              UtfNormal\Validator::quickIsNFCVerify( $data ); // make $data valid utf-8
              if ( $data === $oldData ) {
                  // Validation didn't change $data.
                  return $data;
              }
              return self::convIPTCHelper( $oldData, 'Windows-1252' );
          }

          Wikimedia\suppressWarnings();
          $converted = iconv( $charset, "UTF-8//IGNORE", $data );
          Wikimedia\restoreWarnings();
          if ( $converted === false ) {
              wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );
              return '';
          }

          return trim( $converted );
      }

      /**
       * Take the value of the 1:90 tag and return a charset.
       *
       * According to the iim standard the charset is defined by tag 1:90,
       * which holds iso 2022 escape sequences. The standard encourages, but
       * does not require, that all necessary escape sequences are in that
       * tag. The table below just goes through the charsets mentioned in
       * appendix C of the iim standard and is definitely not complete: it
       * only matches a fixed list of exact tag values, so other combinations
       * of escape sequences, or anything put in the G2 or G3 charsets, are
       * not recognised. In practise the code for utf-8 is the only one that
       * seems to have wide use, and it is detected. If the 1:90 tag is not
       * present, content is usually ascii or iso-8859-1 (and sometimes
       * utf-8), but there is no guarantee.
       *
       * @param string $tag 1:90 tag.
       * @return string|false Charset name, or false if the value is not one
       *  of the recognised escape sequences
       */
      public static function getCharset( $tag ) {
          // \x1b = ESC.
          $charsets = [
              "\x1b%G" => 'UTF-8',
              // Also call things that are compatible with utf-8, utf-8.
              "\x1b(B" => 'UTF-8', // ascii
              "\x1b(@" => 'UTF-8', // iso-646-IRV (ascii in latest version, $ different in older)
              "\x1b(A" => 'ISO646-GB', // like ascii, but british
              "\x1b(C" => 'ISO-IR-8-1', // some obscure swedish/finnish encoding
              "\x1b(D" => 'ISO-IR-8-2',
              "\x1b(E" => 'ISO-IR-9-1', // some obscure danish/norwegian encoding
              "\x1b(F" => 'ISO-IR-9-2',
              "\x1b(G" => 'SEN_850200_B', // aka iso 646-SE; ascii-like
              "\x1b(I" => 'ISO646-IT',
              "\x1b(L" => 'ISO646-PT',
              "\x1b(Z" => 'ISO646-ES',
              "\x1b([" => 'GREEK7-OLD',
              "\x1b(K" => 'ISO646-DE',
              "\x1b(N" => 'ISO_5427', // cyrillic
              "\x1b(`" => 'NS_4551-1', // iso646-NO
              "\x1b(f" => 'NF_Z_62-010', // iso646-FR
              "\x1b(g" => 'PT2', // iso646-PT2
              "\x1b(h" => 'ES2',
              "\x1b(i" => 'MSZ_7795.3', // iso646-HU
              "\x1b(w" => 'CSA_Z243.4-1985-1',
              "\x1b(x" => 'CSA_Z243.4-1985-2',
              "\x1b\$(B" => 'JIS_C6226-1983',
              "\x1b\$B" => 'JIS_C6226-1983',
              "\x1b&@\x1b\$B" => 'JIS_C6226-1983',
              "\x1b&@\x1b\$(B" => 'JIS_C6226-1983',
              // The iso-8859 family, at least for the high code characters.
              "\x1b-A" => 'ISO-8859-1',
              "\x1b(@\x1b-A" => 'ISO-8859-1',
              "\x1b(B\x1b-A" => 'ISO-8859-1',
              "\x1b-B" => 'ISO-8859-2',
              "\x1b-C" => 'ISO-8859-3',
              "\x1b-D" => 'ISO-8859-4',
              "\x1b-E" => 'ISO-8859-5',
              "\x1b-F" => 'ISO-8859-6',
              "\x1b-G" => 'ISO-8859-7',
              "\x1b-H" => 'ISO-8859-8',
              "\x1b-I" => 'CSN_369103',
          ];

          if ( isset( $charsets[$tag] ) ) {
              return $charsets[$tag];
          }

          // At this point just give up; the caller refuses to parse the data.
          wfDebugLog( 'iptc', __METHOD__ . ' Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );
          return false;
      }
  }