FontFamily.php 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. <?php
  /**
   * Validates a font family list according to CSS spec.
   *
   * The input is treated as a comma-separated list of family names. Each name
   * is either dropped or re-emitted in a normalized form: generic families and
   * purely alphanumeric names are emitted bare, every other accepted name is
   * wrapped in single quotes.
   */
  class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
  {
      /**
       * Set of bytes a (non-trivial) font name may consist of, in the form
       * expected by strspn(): underscore, dash, space, ASCII letters, ASCII
       * digits and every byte in the range 0x80-0xFF.
       *
       * @var string
       */
      protected $mask = null;

      /**
       * Builds the strspn() mask of permitted bytes.
       */
      public function __construct()
      {
          // Built with range() rather than a `for ($c = 'a'; $c <= 'z'; $c++)`
          // loop: incrementing the string 'z' yields 'aa', which still
          // compares <= 'z', so such a loop keeps running through 'yz' and
          // bloats the mask with thousands of redundant bytes. With range()
          // every permitted byte appears exactly once.
          $lower = implode('', range('a', 'z'));
          $upper = implode('', range('A', 'Z'));
          $digits = implode('', range('0', '9'));

          // special bytes used by UTF-8
          // We don't bother excluding invalid bytes in this range,
          // because our restriction to well-formed UTF-8 will
          // prevent these from ever occurring.
          $high = implode('', array_map('chr', range(0x80, 0xFF)));

          $this->mask = '_- ' . $lower . $upper . $digits . $high;

          /*
              Note from the original author: PHP's internal strcspn
              implementation is O(length of string * length of mask),
              making it inefficient for large masks. However, it's still
              faster than preg_match.
                for (p = s1;;) {
                  spanp = s2;
                  do {
                    if (*spanp == c || p == s1_end) {
                      return p - s1;
                    }
                  } while (spanp++ < (s2_end - 1));
                  c = *++p;
                }
           */
          // possible optimization: invert the mask.
      }

      /**
       * Filters and normalizes a comma-separated font-family list.
       *
       * @param string $string Raw font-family value
       * @param HTMLPurifier_Config $config Read for the 'CSS.AllowedFonts'
       *        directive: null means no restriction, otherwise a lookup
       *        array keyed by permitted font name
       * @param HTMLPurifier_Context $context Not used by this method
       * @return bool|string The accepted fonts joined by ', ', or false when
       *         no font survives validation
       */
      public function validate($string, $config, $context)
      {
          static $generic_names = array(
              'serif' => true,
              'sans-serif' => true,
              'monospace' => true,
              'fantasy' => true,
              'cursive' => true
          );
          $allowed_fonts = $config->get('CSS.AllowedFonts');

          // assume that no font names contain commas in them
          $fonts = explode(',', $string);
          $final = '';
          foreach ($fonts as $font) {
              $font = trim($font);
              if ($font === '') {
                  continue;
              }
              // match a generic name
              if (isset($generic_names[$font])) {
                  if ($allowed_fonts === null || isset($allowed_fonts[$font])) {
                      $final .= $font . ', ';
                  }
                  continue;
              }
              // match a quoted name
              if ($font[0] === '"' || $font[0] === "'") {
                  $length = strlen($font);
                  if ($length <= 2) {
                      // nothing (or only a lone quote) between the quotes
                      continue;
                  }
                  $quote = $font[0];
                  if ($font[$length - 1] !== $quote) {
                      // opening quote is not closed by the same quote
                      continue;
                  }
                  $font = substr($font, 1, $length - 2);
              }

              $font = $this->expandCSSEscape($font);

              // $font is a pure representation of the font name

              if ($allowed_fonts !== null && !isset($allowed_fonts[$font])) {
                  continue;
              }

              if (ctype_alnum($font) && $font !== '') {
                  // very simple font, allow it in unharmed
                  $final .= $font . ', ';
                  continue;
              }

              // bugger out on whitespace. form feed (0C) really
              // shouldn't show up regardless
              $font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font);

              // Here, there are various classes of characters which need
              // to be treated differently:
              //  - Alphanumeric characters are essentially safe. We
              //    handled these above.
              //  - Spaces require quoting, though most parsers will do
              //    the right thing if there aren't any characters that
              //    can be misinterpreted
              //  - Dashes rarely occur, but they are fairly unproblematic
              //    for parsing/rendering purposes.
              //  The above characters cover the majority of Western font
              //  names.
              //  - Arbitrary Unicode characters not in ASCII. Because
              //    most parsers give little thought to Unicode, treatment
              //    of these codepoints is basically uniform, even for
              //    punctuation-like codepoints. These characters can
              //    show up in non-Western pages and are supported by most
              //    major browsers, for example: "MS 明朝" is a
              //    legitimate font-name
              //    <http://ja.wikipedia.org/wiki/MS_明朝>. See
              //    the CSS3 spec for more examples:
              //    <http://www.w3.org/TR/2011/WD-css3-fonts-20110324/localizedfamilynames.png>
              //    You can see live samples of these on the Internet:
              //    <http://www.google.co.jp/search?q=font-family+MS+明朝|ゴシック>
              //    However, most of these fonts have ASCII equivalents:
              //    for example, 'MS Mincho', and it's considered
              //    professional to use ASCII font names instead of
              //    Unicode font names. Thanks Takeshi Terada for
              //    providing this information.
              //  The following characters, to my knowledge, have not been
              //  used to name font names.
              //  - Single quote. While theoretically you might find a
              //    font name that has a single quote in its name (serving
              //    as an apostrophe, e.g. Dave's Scribble), I haven't
              //    been able to find any actual examples of this.
              //    Internet Explorer's cssText translation (which I
              //    believe is invoked by innerHTML) normalizes any
              //    quoting to single quotes, and fails to escape single
              //    quotes. (Note that this is not IE's behavior for all
              //    CSS properties, just some sort of special casing for
              //    font-family). So a single quote *cannot* be used
              //    safely in the font-family context if there will be an
              //    innerHTML/cssText translation. Note that Firefox 3.x
              //    does this too.
              //  - Double quote. In IE, these get normalized to
              //    single-quotes, no matter what the encoding. (Fun
              //    fact, in IE8, the 'content' CSS property gained
              //    support, where they special cased to preserve encoded
              //    double quotes, but still translate unadorned double
              //    quotes into single quotes.) So, because their
              //    fixpoint behavior is identical to single quotes, they
              //    cannot be allowed either. Firefox 3.x displays
              //    single-quote style behavior.
              //  - Backslashes are reduced by one (so \\ -> \) every
              //    iteration, so they cannot be used safely. This shows
              //    up in IE7, IE8 and FF3
              //  - Semicolons, commas and backticks are handled properly.
              //  - The rest of the ASCII punctuation is handled properly.
              //  We haven't checked what browsers do to unadorned
              //  versions, but this is not important as long as the
              //  browser doesn't /remove/ surrounding quotes (as IE does
              //  for HTML).
              //
              // With these results in hand, we conclude that there are
              // various levels of safety:
              //  - Paranoid: alphanumeric, spaces and dashes(?)
              //  - International: Paranoid + non-ASCII Unicode
              //  - Edgy: Everything except quotes, backslashes
              //  - NoJS: Standards compliance, e.g. sod IE. Note that
              //    with some judicious character escaping (since certain
              //    types of escaping don't work) this is theoretically
              //    OK as long as innerHTML/cssText is not called.
              // We believe that international is a reasonable default
              // (that we will implement now), and once we do more
              // extensive research, we may feel comfortable with dropping
              // it down to edgy.

              // International (plus underscores): alphanumeric, spaces,
              // dashes, underscores and Unicode. Use of str(c)spn assumes
              // that the string was already well formed Unicode (which of
              // course it is).
              if (strspn($font, $this->mask) !== strlen($font)) {
                  // at least one byte lies outside the permitted set
                  continue;
              }

              // Historical:

              // In the absence of innerHTML/cssText, these ugly
              // transforms don't pose a security risk (as \\ and \"
              // might--these escapes are not supported by most browsers).
              // We could try to be clever and use single-quote wrapping
              // when there is a double quote present, but I have chosen
              // not to implement that. (NOTE: you can reduce the amount
              // of escapes by one depending on what quoting style you use)
              // $font = str_replace('\\', '\\5C ', $font);
              // $font = str_replace('"', '\\22 ', $font);
              // $font = str_replace("'", '\\27 ', $font);

              // font possibly with spaces, requires quoting
              $final .= "'$font', ";
          }
          // drop the separator left behind by the last accepted font
          $final = rtrim($final, ', ');
          if ($final === '') {
              return false;
          }
          return $final;
      }
  }
  202. // vim: et sw=4 sts=4