URIParser.php 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. <?php
  /**
   * Parses a URI into the components and fragment identifier as specified
   * by RFC 3986.
   */
  class HTMLPurifier_URIParser
  {

      /**
       * Instance of HTMLPurifier_PercentEncoder to do normalization with.
       */
      protected $percentEncoder;

      /**
       * Creates the parser together with the percent-encoder it uses to
       * normalize every URI before splitting it.
       */
      public function __construct()
      {
          $this->percentEncoder = new HTMLPurifier_PercentEncoder();
      }

      /**
       * Parses a URI.
       * @param $uri string URI to parse
       * @return HTMLPurifier_URI|bool representation of URI, or false when
       *         the URI could not be matched at all. The representation has
       *         not been validated yet and may not conform to RFC.
       */
      public function parse($uri)
      {
          $uri = $this->percentEncoder->normalize($uri);

          // Regexp is as per RFC 3986, Appendix B.
          // Note that ["<>] are an addition to the RFC's recommended
          // characters, because they represent external delimiters.
          $r_URI = '!'.
              '(([a-zA-Z0-9\.\+\-]+):)?'. // 2. Scheme
              '(//([^/?#"<>]*))?'. // 4. Authority
              '([^?#"<>]*)'. // 5. Path
              '(\?([^#"<>]*))?'. // 7. Query
              '(#([^"<>]*))?'. // 8. Fragment
              '!';

          $matches = array();
          $result = preg_match($r_URI, $uri, $matches);

          if (!$result) {
              return false; // *really* invalid URI
          }

          // Separate out the parts. Each optional component is detected through
          // the outer group that includes its delimiter ("scheme:", "//auth",
          // "?query", "#frag"). preg_match() drops trailing groups that did not
          // participate and reports skipped middle groups as '', so both cases
          // are checked explicitly. empty() is deliberately avoided because it
          // also treats the string "0" as absent.
          $scheme = (isset($matches[1]) && $matches[1] !== '') ? $matches[2] : null;
          $authority = (isset($matches[3]) && $matches[3] !== '') ? $matches[4] : null;
          $path = $matches[5]; // always present, can be empty
          $query = (isset($matches[6]) && $matches[6] !== '') ? $matches[7] : null;
          $fragment = (isset($matches[8]) && $matches[8] !== '') ? $matches[9] : null;

          // Further parse authority into userinfo, host and port.
          if ($authority !== null) {
              $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
              $matches = array();
              preg_match($r_authority, $authority, $matches);

              $userinfo = (isset($matches[1]) && $matches[1] !== '') ? $matches[2] : null;
              // Take the host group verbatim so that a host of "0" is kept
              // instead of being collapsed to the empty string.
              $host = isset($matches[3]) ? $matches[3] : '';
              // A ':' with no digits after it still casts to port 0 here;
              // range checking is left to later validation of the URI object.
              $port = (isset($matches[4]) && $matches[4] !== '') ? (int) $matches[5] : null;
          } else {
              $port = $host = $userinfo = null;
          }

          return new HTMLPurifier_URI(
              $scheme, $userinfo, $host, $port, $path, $query, $fragment);
      }

  }
  59. // vim: et sw=4 sts=4