FormatJson.php 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. <?php
  2. /**
  3. * Wrapper for json_encode and json_decode.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License along
  16. * with this program; if not, write to the Free Software Foundation, Inc.,
  17. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. * http://www.gnu.org/copyleft/gpl.html
  19. *
  20. * @file
  21. */
  22. /**
  23. * JSON formatter wrapper class
  24. */
  class FormatJson {
      /**
       * Skip escaping most characters above U+007F for readability and compactness.
       * This encoding option saves 3 to 8 bytes (uncompressed) for each such character;
       * however, it could break compatibility with systems that incorrectly handle UTF-8.
       *
       * @since 1.22
       */
      const UTF8_OK = 1;

      /**
       * Skip escaping the characters '<', '>', and '&', which have special meanings in
       * HTML and XML.
       *
       * @warning Do not use this option for JSON that could end up in inline scripts.
       * - HTML 5.2, §4.12.1.3 Restrictions for contents of script elements
       * - XML 1.0 (5th Ed.), §2.4 Character Data and Markup
       *
       * @since 1.22
       */
      const XMLMETA_OK = 2;

      /**
       * Skip escaping as many characters as reasonably possible.
       *
       * @warning When generating inline script blocks, use FormatJson::UTF8_OK instead.
       *
       * @since 1.22
       */
      const ALL_OK = self::UTF8_OK | self::XMLMETA_OK;

      /**
       * If set, treat JSON objects '{...}' as associative arrays. Without this option,
       * JSON objects will be converted to stdClass.
       *
       * @since 1.24
       */
      const FORCE_ASSOC = 0x100;

      /**
       * If set, attempt to fix invalid JSON.
       *
       * @since 1.24
       */
      const TRY_FIXING = 0x200;

      /**
       * If set, strip comments from input before parsing as JSON.
       *
       * @since 1.25
       */
      const STRIP_COMMENTS = 0x400;

      /**
       * Characters problematic in JavaScript.
       *
       * @note These are listed in ECMA-262 (5.1 Ed.), §7.3 Line Terminators along with U+000A (LF)
       * and U+000D (CR). However, PHP already escapes LF and CR according to RFC 4627.
       */
      private static $badChars = [
          "\u{2028}", // U+2028 LINE SEPARATOR
          "\u{2029}", // U+2029 PARAGRAPH SEPARATOR
      ];

      /**
       * Escape sequences for characters listed in FormatJson::$badChars.
       * Entries are positionally matched with FormatJson::$badChars.
       */
      private static $badCharsEscaped = [
          '\u2028', // U+2028 LINE SEPARATOR
          '\u2029', // U+2029 PARAGRAPH SEPARATOR
      ];

      /**
       * Returns the JSON representation of a value.
       *
       * @note Empty arrays are encoded as numeric arrays, not as objects, so cast any associative
       * array that might be empty to an object before encoding it.
       *
       * @note In pre-1.22 versions of MediaWiki, using this function for generating inline script
       * blocks may result in an XSS vulnerability, and quite likely will in XML documents
       * (cf. FormatJson::XMLMETA_OK). Use Xml::encodeJsVar() instead in such cases.
       *
       * @param mixed $value The value to encode. Can be any type except a resource.
       * @param string|bool $pretty If a string, add non-significant whitespace to improve
       * readability, using that string for indentation. If true, use the default indent
       * string (four spaces).
       * @param int $escaping Bitfield consisting of _OK class constants
       * @return string|false String if successful; false upon failure
       */
      public static function encode( $value, $pretty = false, $escaping = 0 ) {
          // json_encode() with JSON_PRETTY_PRINT indents each nesting level with four spaces;
          // the same string doubles as the default indent when $pretty is true.
          $defaultIndent = '    ';

          if ( !is_string( $pretty ) ) {
              $pretty = $pretty ? $defaultIndent : false;
          }

          // PHP escapes '/' to prevent breaking out of inline script blocks using '</script>',
          // which is hardly useful when '<' and '>' are escaped (and inadequate), and such
          // escaping negatively impacts the human readability of URLs and similar strings.
          $options = JSON_UNESCAPED_SLASHES;
          $options |= $pretty !== false ? JSON_PRETTY_PRINT : 0;
          $options |= ( $escaping & self::UTF8_OK ) ? JSON_UNESCAPED_UNICODE : 0;
          $options |= ( $escaping & self::XMLMETA_OK ) ? 0 : ( JSON_HEX_TAG | JSON_HEX_AMP );

          $json = json_encode( $value, $options );
          if ( $json === false ) {
              return false;
          }

          if ( $pretty !== false && $pretty !== $defaultIndent ) {
              // Change the four-space indent to a tab indent. Literal newlines and tabs
              // cannot occur inside JSON string values (they are always escaped), so these
              // replacements only ever touch the indentation.
              $json = str_replace( "\n" . $defaultIndent, "\n\t", $json );
              while ( strpos( $json, "\t" . $defaultIndent ) !== false ) {
                  $json = str_replace( "\t" . $defaultIndent, "\t\t", $json );
              }

              if ( $pretty !== "\t" ) {
                  // Change the tab indent to the provided indent
                  $json = str_replace( "\t", $pretty, $json );
              }
          }

          if ( $escaping & self::UTF8_OK ) {
              // With unescaped Unicode, make sure the JavaScript line terminators
              // listed in $badChars never appear literally in the output.
              $json = str_replace( self::$badChars, self::$badCharsEscaped, $json );
          }

          return $json;
      }

      /**
       * Decodes a JSON string. It is recommended to use FormatJson::parse(),
       * which returns more comprehensive result in case of an error, and has
       * more parsing options.
       *
       * In PHP versions before 7.1, decoding a JSON string containing an empty key
       * without passing $assoc as true results in a return object with a property
       * named "_empty_" (because true empty properties were not supported pre-PHP-7.1).
       * Instead, consider passing $assoc as true to return an associative array.
       *
       * But be aware that in all supported PHP versions, decoding an empty JSON object
       * with $assoc = true returns an array, not an object, breaking round-trip consistency.
       *
       * See https://phabricator.wikimedia.org/T206411 for more details on these quirks.
       *
       * @param string $value The JSON string being decoded
       * @param bool $assoc When true, returned objects will be converted into associative arrays.
       *
       * @return mixed The value encoded in JSON in appropriate PHP type.
       * `null` is returned if $value represented `null`, if $value could not be decoded,
       * or if the encoded data was deeper than the recursion limit.
       * Use FormatJson::parse() to distinguish between types of `null` and to get proper error code.
       */
      public static function decode( $value, $assoc = false ) {
          return json_decode( $value, $assoc );
      }

      /**
       * Decodes a JSON string.
       * Unlike FormatJson::decode(), if $value represents null value, it will be
       * properly decoded as valid.
       *
       * @param string $value The JSON string being decoded
       * @param int $options A bit field that allows FORCE_ASSOC, TRY_FIXING,
       * STRIP_COMMENTS
       * @return Status If valid JSON, the value is available in $result->getValue().
       * When TRY_FIXING repaired the input, the returned Status is good but carries a
       * 'json-warn-trailing-comma' warning. Otherwise a fatal Status is returned whose
       * message key depends on the json_last_error() code of the first decode attempt.
       */
      public static function parse( $value, $options = 0 ) {
          if ( $options & self::STRIP_COMMENTS ) {
              $value = self::stripComments( $value );
          }

          $assoc = ( $options & self::FORCE_ASSOC ) !== 0;
          $result = json_decode( $value, $assoc );
          $code = json_last_error();

          if ( $code === JSON_ERROR_SYNTAX && ( $options & self::TRY_FIXING ) !== 0 ) {
              // The most common error is the trailing comma in a list or an object.
              // We cannot simply replace /,\s*[}\]]/ because it could be inside a string value.
              // But we could use the fact that JSON does not allow multi-line string values,
              // and remove trailing commas if they are at the end of a line.
              // JSON only allows 4 control characters: [ \t\r\n]. So we must not use '\s' for matching.
              // Regex match ,]<any non-quote chars>\n or ,\n] with optional spaces/tabs.
              $count = 0;
              $fixed = preg_replace(
                  '/,([ \t]*[}\]][^"\r\n]*([\r\n]|$)|[ \t]*[\r\n][ \t\r\n]*[}\]])/',
                  '$1',
                  $value,
                  -1,
                  $count
              );

              // preg_replace() yields null on a PCRE failure; never feed that to json_decode().
              if ( $fixed !== null && $count > 0 ) {
                  $result = json_decode( $fixed, $assoc );
                  if ( json_last_error() === JSON_ERROR_NONE ) {
                      // Report warning
                      $st = Status::newGood( $result );
                      $st->warning( wfMessage( 'json-warn-trailing-comma' )->numParams( $count ) );
                      return $st;
                  }
              }
              // Repair failed: fall through and report the original syntax error in $code.
          }

          switch ( $code ) {
              case JSON_ERROR_NONE:
                  return Status::newGood( $result );
              case JSON_ERROR_DEPTH:
                  $msg = 'json-error-depth';
                  break;
              case JSON_ERROR_STATE_MISMATCH:
                  $msg = 'json-error-state-mismatch';
                  break;
              case JSON_ERROR_CTRL_CHAR:
                  $msg = 'json-error-ctrl-char';
                  break;
              case JSON_ERROR_SYNTAX:
                  $msg = 'json-error-syntax';
                  break;
              case JSON_ERROR_UTF8:
                  $msg = 'json-error-utf8';
                  break;
              case JSON_ERROR_RECURSION:
                  $msg = 'json-error-recursion';
                  break;
              case JSON_ERROR_INF_OR_NAN:
                  $msg = 'json-error-inf-or-nan';
                  break;
              case JSON_ERROR_UNSUPPORTED_TYPE:
                  $msg = 'json-error-unsupported-type';
                  break;
              default:
                  // Any error code without a dedicated message reports the raw code.
                  return Status::newFatal( wfMessage( 'json-error-unknown' )->numParams( $code ) );
          }

          return Status::newFatal( $msg );
      }

      /**
       * Remove multiline and single line comments from an otherwise valid JSON
       * input string. This can be used as a preprocessor, to allow JSON
       * formatted configuration files to contain comments.
       *
       * Comment markers inside JSON string values are left alone. A double quote
       * only counts as escaped when it is preceded by an odd number of backslashes,
       * so a string ending in an escaped backslash ("...\\") is closed correctly.
       * An unterminated comment runs to the end of the input.
       *
       * @param string $json
       * @return string JSON with comments removed
       */
      public static function stripComments( $json ) {
          // Ensure we have a string
          $str = (string)$json;

          $buffer = '';
          $maxLen = strlen( $str );
          // Start of the pending chunk of non-comment text not yet copied to $buffer
          $mark = 0;
          // Offset of the '/' that opened the comment currently being skipped
          $commentStart = 0;

          $inString = false;
          $inComment = false;
          $multiline = false;

          for ( $idx = 0; $idx < $maxLen; $idx++ ) {
              switch ( $str[$idx] ) {
                  case '"':
                      if ( $inComment ) {
                          break;
                      }
                      // Count the run of backslashes directly before the quote: an even
                      // run means the backslashes escape each other, not the quote.
                      $backslashes = 0;
                      for ( $back = $idx - 1; $back >= 0 && $str[$back] === '\\'; $back-- ) {
                          $backslashes++;
                      }
                      if ( $backslashes % 2 === 0 ) {
                          // Either started or ended a string
                          $inString = !$inString;
                      }
                      break;

                  case '/':
                      if ( $inString ) {
                          break;
                      }
                      $lookAhead = ( $idx + 1 < $maxLen ) ? $str[$idx + 1] : '';
                      $lookBehind = ( $idx - 1 >= 0 ) ? $str[$idx - 1] : '';

                      if ( !$inComment && ( $lookAhead === '/' || $lookAhead === '*' ) ) {
                          // Transition into a comment
                          // Add characters seen to buffer
                          $buffer .= substr( $str, $mark, $idx - $mark );
                          $commentStart = $idx;
                          // Consume the look ahead character
                          $idx++;
                          // Track state
                          $inComment = true;
                          $multiline = $lookAhead === '*';
                      } elseif ( $multiline && $lookBehind === '*' && $idx - $commentStart >= 3 ) {
                          // Found the end of the current comment. The distance check keeps
                          // the '*' of the opening marker from also serving as the '*' of
                          // the terminator (i.e. a bare "/*/" does not close the comment).
                          $mark = $idx + 1;
                          $inComment = false;
                          $multiline = false;
                      }
                      break;

                  case "\n":
                      if ( $inComment && !$multiline ) {
                          // Found the end of the current single line comment
                          $mark = $idx + 1;
                          $inComment = false;
                      }
                      break;
              }
          }

          if ( $inComment ) {
              // Comment ends with input
              // Technically we should check to ensure that we aren't in
              // a multiline comment that hasn't been properly ended, but this
              // is a strip filter, not a validating parser.
              $mark = $maxLen;
          }

          // Add final chunk to buffer before returning
          return $buffer . substr( $str, $mark, $maxLen - $mark );
      }
  }