Parser.php 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368
  1. <?php
  2. namespace Mf2;
  3. use DOMDocument;
  4. use DOMElement;
  5. use DOMXPath;
  6. use DOMNode;
  7. use DOMNodeList;
  8. use Exception;
  9. use SplObjectStorage;
  10. use stdClass;
  /**
   * Parse Microformats2
   *
   * Convenience function for the most common case: it builds a Parser for the
   * given input and URL and returns the result of calling parse() on it.
   *
   * Example usage:
   *
   *     use Mf2;
   *     $output = Mf2\parse('<span class="h-card">Barnaby Walters</span>');
   *     echo json_encode($output, JSON_PRETTY_PRINT);
   *
   * Produces:
   *
   *     {
   *       "items": [
   *         {
   *           "type": ["h-card"],
   *           "properties": {
   *             "name": ["Barnaby Walters"]
   *           }
   *         }
   *       ],
   *       "rels": {}
   *     }
   *
   * @param string|DOMDocument $input The HTML string or DOMDocument object to parse
   * @param string $url The URL the input document was found at, for relative URL resolution
   * @param bool $convertClassic Whether or not to convert classic microformats
   * @return array Canonical MF2 array structure
   */
  function parse($input, $url = null, $convertClassic = true) {
      $mf2Parser = new Parser($input, $url);

      return $mf2Parser->parse($convertClassic);
  }
  /**
   * Fetch microformats2
   *
   * Given a URL, fetches it with cURL (following up to 5 redirects) and, if the
   * reported content-type contains "html", returns the parsed microformats2
   * array structure.
   *
   * Note that the HTTP status code is not inspected: even if the response was a
   * 4XX or 5XX error, an HTML-like body is parsed all the same, as there are
   * legitimate cases where error pages contain useful microformats (for example
   * a deleted h-entry resulting in a 410 Gone page with a stub h-entry explaining
   * the reason for deletion). Look in $curlInfo['http_code'] for the actual value.
   *
   * @param string $url The URL to fetch
   * @param bool $convertClassic (optional, default true) whether or not to convert classic microformats
   * @param &array $curlInfo (optional) the results of curl_getinfo will be placed in this variable for debugging
   * @return array|null canonical microformats2 array structure on success, null if the
   *                    transfer failed or the response was not delivered as HTML
   */
  function fetch($url, $convertClassic = true, &$curlInfo=null) {
      $ch = curl_init();
      curl_setopt($ch, CURLOPT_URL, $url);
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
      curl_setopt($ch, CURLOPT_HEADER, 0);
      curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
      curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
      $html = curl_exec($ch);
      $info = $curlInfo = curl_getinfo($ch);
      curl_close($ch);

      // With CURLOPT_RETURNTRANSFER set, curl_exec() returns false when the
      // transfer itself failed; there is nothing to parse in that case.
      if ($html === false) {
          return null;
      }

      // The content type may be missing or null (e.g. no Content-Type header).
      $contentType = isset($info['content_type']) ? (string) $info['content_type'] : '';
      if (stripos($contentType, 'html') === false) {
          // The content was not delivered as HTML, do not attempt to parse it.
          return null;
      }

      return parse($html, $url, $convertClassic);
  }
  /**
   * Unicode to HTML Entities
   *
   * Converts the input to mbstring's 'HTML-ENTITIES' encoding, using the
   * encoding reported by mb_detect_encoding() as the source encoding.
   *
   * @param string $input String containing characters to convert into HTML entities
   * @return string
   */
  function unicodeToHtmlEntities($input) {
      $sourceEncoding = mb_detect_encoding($input);

      // mb_detect_encoding() returns false when it cannot identify the encoding;
      // fall back to UTF-8 rather than passing false on as an encoding name.
      if ($sourceEncoding === false) {
          $sourceEncoding = 'UTF-8';
      }

      return mb_convert_encoding($input, 'HTML-ENTITIES', $sourceEncoding);
  }
  /**
   * Collapse Whitespace
   *
   * Collapses any sequence of whitespace characters within a string into a
   * single space character. Non-whitespace characters, including "|", are left
   * untouched.
   *
   * @deprecated since v0.2.3
   * @param string $str
   * @return string
   */
  function collapseWhitespace($str) {
      // \s already covers newlines; a "|" inside a character class is a literal
      // pipe, not an alternation, so it must not appear here.
      return preg_replace('/\s+/', ' ', $str);
  }
  /**
   * Unicode Trim
   *
   * Replaces every UTF-8 encoded no-break space (U+00A0) in the string with an
   * ordinary space, then strips leading and trailing whitespace.
   *
   * Note that no-break spaces in the middle of the string are converted too,
   * not only those at the ends.
   *
   * @param string $str
   * @return string
   */
  function unicodeTrim($str) {
      // "\xC2\xA0" is the UTF-8 byte sequence for U+00A0 (&nbsp;).
      // this is cheating. TODO: find a better way if this causes any problems
      $str = str_replace("\xC2\xA0", ' ', $str);

      return preg_replace('/^\s+|\s+$/', '', $str);
  }
  /**
   * Microformat Names From Class string
   *
   * Given the value of @class, get the relevant mf classnames (e.g. h-card,
   * p-name). The class string is split on any run of whitespace (space, tab,
   * newline, carriage return, form feed). A classname matches when it starts
   * with $prefix (compared case-insensitively) and is longer than the prefix.
   *
   * For the 'h-' prefix the full classname is returned (e.g. "h-card"); for any
   * other prefix the prefix is stripped (e.g. "p-name" gives "name").
   *
   * @param string $class A whitespace delimited list of classnames
   * @param string $prefix The prefix to look for
   * @return array All matching names, in document order; empty if none matched
   */
  function mfNamesFromClass($class, $prefix='h-') {
      $prefixLength = strlen($prefix);
      $matches = array();

      foreach (preg_split('/\s+/', $class, -1, PREG_SPLIT_NO_EMPTY) as $classname) {
          // A bare prefix such as "p-" names nothing, so require extra characters.
          if (strlen($classname) <= $prefixLength) {
              continue;
          }

          if (strncasecmp($classname, $prefix, $prefixLength) === 0) {
              $matches[] = ($prefix === 'h-') ? $classname : substr($classname, $prefixLength);
          }
      }

      return $matches;
  }
  /**
   * Get Nested µf Property Name From Class
   *
   * Returns the names of all the p-, u-, dt- or e- prefixed classnames it finds
   * in a whitespace-separated string, with the prefix stripped. The class string
   * is split on any run of whitespace and each classname is checked against each
   * prefix via mfNamesFromClass().
   *
   * @param string $class
   * @return array
   */
  function nestedMfPropertyNamesFromClass($class) {
      $prefixes = array('p-', 'u-', 'dt-', 'e-');
      $propertyNames = array();

      foreach (preg_split('/\s+/', $class, -1, PREG_SPLIT_NO_EMPTY) as $classname) {
          foreach ($prefixes as $prefix) {
              // mfNamesFromClass() returns an empty array for non-matching
              // classnames, so no separate pre-check is needed here.
              $propertyNames = array_merge($propertyNames, mfNamesFromClass($classname, $prefix));
          }
      }

      return $propertyNames;
  }
  /**
   * Wraps mfNamesFromClass to handle an element as input (common)
   *
   * Reads the element's class attribute and passes it, together with $prefix,
   * to mfNamesFromClass().
   *
   * @param DOMElement $e The element to get the classnames for
   * @param string $prefix The prefix to look for
   * @return mixed See return value of mfNamesFromClass()
   */
  function mfNamesFromElement(\DOMElement $e, $prefix = 'h-') {
      return mfNamesFromClass($e->getAttribute('class'), $prefix);
  }
  /**
   * Wraps nestedMfPropertyNamesFromClass to handle an element as input
   *
   * @param DOMElement $e The element whose class attribute is inspected
   * @return mixed See return value of nestedMfPropertyNamesFromClass()
   */
  function nestedMfPropertyNamesFromElement(\DOMElement $e) {
      return nestedMfPropertyNamesFromClass($e->getAttribute('class'));
  }
  /**
   * Converts 12-hour am/pm time formats to 24-hour HH:MM or HH:MM:SS
   *
   * If the input contains no am/pm marker (or no digits at all) it is returned
   * unchanged. Otherwise the hour is converted to the 24-hour clock (12am
   * becomes 00, 1pm-11pm become 13-23), zero-padded to two digits, minutes
   * default to "00", and seconds are appended only if they were supplied.
   *
   * @param string $time The time to convert
   * @return string
   */
  function convertTimeFormat($time) {
      $found = preg_match('/(\d{1,2}):?(\d{2})?:?(\d{2})?(a\.?m\.?|p\.?m\.?)?/i', $time, $matches);

      // No am/pm specified: nothing to convert.
      if (!$found || empty($matches[4])) {
          return $time;
      }

      // Normalise "a.m.", "PM", "p.m" etc. to plain "am"/"pm".
      $meridiem = strtolower(str_replace('.', '', $matches[4]));
      $hours = (int) $matches[1];

      if ($meridiem == 'pm' && $hours < 12) {
          // 1pm-11pm map to 13-23; 12pm stays 12.
          $hours += 12;
      } elseif ($meridiem == 'am' && $hours == 12) {
          // 12am is midnight, i.e. hour 00.
          $hours = 0;
      }

      $hh = str_pad((string) $hours, 2, '0', STR_PAD_LEFT);
      $mm = empty($matches[2]) ? '00' : $matches[2];

      // Seconds, only if supplied.
      if (empty($matches[3])) {
          return sprintf('%s:%s', $hh, $mm);
      }

      return sprintf('%s:%s:%s', $hh, $mm, $matches[3]);
  }
  204. /**
  205. * Microformats2 Parser
  206. *
  207. * A class which holds state for parsing microformats2 from HTML.
  208. *
  209. * Example usage:
  210. *
  211. * use Mf2;
  212. * $parser = new Mf2\Parser('<p class="h-card">Barnaby Walters</p>');
  213. * $output = $parser->parse();
  214. */
  215. class Parser {
  216. /** @var string The baseurl (if any) to use for this parse */
  217. public $baseurl;
  218. /** @var DOMXPath object which can be used to query over any fragment*/
  219. public $xpath;
  220. /** @var DOMDocument */
  221. public $doc;
  222. /** @var SplObjectStorage */
  223. protected $parsed;
  224. public $jsonMode;
  /**
   * Constructor
   *
   * Builds (or adopts) the DOMDocument to parse, works out the base URL from
   * the first <base href> element if there is one, and removes all <template>
   * elements from the document. Empty strings and unsupported input types
   * result in an empty document rather than an error.
   *
   * @param DOMDocument|string $input The data to parse. A string of HTML or a DOMDocument
   * @param string $url The URL of the parsed document, for relative URL resolution
   * @param boolean $jsonMode Whether or not to use a stdClass instance for an empty `rels` dictionary. This breaks PHP looping over rels, but allows the output to be correctly serialized as JSON.
   */
  public function __construct($input, $url = null, $jsonMode = false) {
      libxml_use_internal_errors(true);

      if ($input instanceof DOMDocument) {
          $doc = $input;
      } else {
          $doc = new DOMDocument();

          if (is_string($input)) {
              $html = unicodeToHtmlEntities($input);

              // DOMDocument::loadHTML() rejects an empty source (a ValueError on
              // PHP 8, which "@" cannot silence), so only load real content.
              if (is_string($html) && $html !== '') {
                  @$doc->loadHTML($html);
              }
          }
      }

      $this->xpath = new DOMXPath($doc);

      $baseurl = $url;

      // Only the first <base href> element is taken into account.
      $baseElements = $this->xpath->query('//base[@href]');
      if ($baseElements->length > 0) {
          $baseElementUrl = $baseElements->item(0)->getAttribute('href');

          if (parse_url($baseElementUrl, PHP_URL_SCHEME) === null) {
              // The base element URL has no scheme, so it is itself relative
              // to the document URL.
              $baseurl = resolveUrl($url, $baseElementUrl);
          } else {
              $baseurl = $baseElementUrl;
          }
      }

      // Ignore <template> elements as per the HTML5 spec
      foreach ($this->xpath->query('//template') as $templateEl) {
          $templateEl->parentNode->removeChild($templateEl);
      }

      $this->baseurl = $baseurl;
      $this->doc = $doc;
      $this->parsed = new SplObjectStorage();
      $this->jsonMode = $jsonMode;
  }
  /**
   * Record that $e has been handled for the given prefix.
   *
   * Appends $prefix to the list of prefixes stored for $e in $this->parsed,
   * creating the entry if the element has not been seen before.
   *
   * @param DOMElement $e
   * @param string $prefix e.g. 'h', 'p', 'u', 'dt' or 'e'
   */
  private function elementPrefixParsed(\DOMElement $e, $prefix) {
      $seenPrefixes = $this->parsed->contains($e) ? $this->parsed[$e] : array();
      $seenPrefixes[] = $prefix;

      $this->parsed[$e] = $seenPrefixes;
  }
  /**
   * Whether $e has already been recorded as handled for the given prefix.
   *
   * @param DOMElement $e
   * @param string $prefix
   * @return bool true only if $e is in $this->parsed and $prefix is among its recorded prefixes
   */
  private function isElementParsed(\DOMElement $e, $prefix) {
      return $this->parsed->contains($e) && in_array($prefix, $this->parsed[$e]);
  }
  /**
   * Resolve the URL-carrying attributes of every descendant of $el in place.
   *
   * Each descendant with an href, src or data attribute has that attribute
   * rewritten to the result of $this->resolveUrl(). This modifies the document.
   *
   * @param DOMElement $el
   */
  private function resolveChildUrls(DOMElement $el) {
      $urlAttributes = array('href', 'src', 'data');

      foreach ($this->xpath->query('.//*[@src or @href or @data]', $el) as $node) {
          foreach ($urlAttributes as $attribute) {
              if ($node->hasAttribute($attribute)) {
                  $node->setAttribute($attribute, $this->resolveUrl($node->getAttribute($attribute)));
              }
          }
      }
  }
  /**
   * Get the text content of $el, with descendant images replaced by text.
   *
   * URLs on descendants of $el are resolved first (this modifies the original
   * element). The element is then cloned and, in the clone, every descendant
   * <img> is replaced by a text node holding its alt attribute if it has one,
   * otherwise its src attribute.
   *
   * @param DOMElement $el
   * @return string
   */
  public function textContent(DOMElement $el) {
      $this->resolveChildUrls($el);

      // Work on a deep copy so the image replacement does not touch the document.
      $copy = $el->cloneNode(true);

      foreach ($this->xpath->query('.//img', $copy) as $img) {
          $attribute = $img->hasAttribute('alt') ? 'alt' : 'src';
          $replacement = $this->doc->createTextNode($img->getAttribute($attribute));
          $img->parentNode->replaceChild($replacement, $img);
      }

      return $copy->textContent;
  }
  /**
   * Resolve a URL against this parser's base URL.
   *
   * URLs which parse_url() rejects, URLs which already have a scheme, and any
   * URL when no base URL is set are returned unchanged. Otherwise the URL is
   * resolved against $this->baseurl with the resolveUrl() function.
   *
   * TODO: figure out if this has problems with sms: and geo: URLs
   *
   * @param string $url
   * @return string
   */
  public function resolveUrl($url) {
      // If the URL is seriously malformed it’s probably beyond the scope of this
      // parser to try to do anything with it.
      if (parse_url($url) === false) {
          return $url;
      }

      $hasScheme = !empty(parse_url($url, PHP_URL_SCHEME));
      if ($hasScheme || empty($this->baseurl)) {
          return $url;
      }

      return resolveUrl($this->baseurl, $url);
  }
  316. // Parsing Functions
  /**
   * Parse value-class/value-title on an element.
   *
   * Direct children with the class "value" take precedence: their text content
   * is concatenated and trimmed. Failing that, the title attributes of direct
   * children with the class "value-title" are concatenated and trimmed.
   *
   * NOTE: $separator is accepted but is not currently applied; multiple values
   * are concatenated with nothing in between.
   *
   * @param \DOMElement $e
   * @param string $separator = '' currently unused
   * @return string|null the parsed value or null if value-class or -title aren’t in use
   */
  public function parseValueClassTitle(\DOMElement $e, $separator = '') {
      // value-class: use the text of each "value" child.
      $valueNodes = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ")]', $e);
      if ($valueNodes->length > 0) {
          $pieces = array();
          foreach ($valueNodes as $node) {
              $pieces[] = $this->textContent($node);
          }

          return unicodeTrim(implode('', $pieces));
      }

      // value-title: use the title attribute of each "value-title" child.
      $titleNodes = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value-title ")]', $e);
      if ($titleNodes->length > 0) {
          $pieces = array();
          foreach ($titleNodes as $node) {
              $pieces[] = $node->getAttribute('title');
          }

          return unicodeTrim(implode('', $pieces));
      }

      // No value-title or -class in this element
      return null;
  }
  /**
   * Given an element with class="p-*", get its value
   *
   * A value-class/value-title result wins if present. Otherwise a non-empty
   * alt (img, area), title (abbr) or value (data, input) attribute is returned
   * as-is, and everything else falls back to the trimmed text content.
   *
   * @param DOMElement $p The element to parse
   * @return string The plaintext value of $p, dependent on type
   */
  public function parseP(\DOMElement $p) {
      $valueFromClass = $this->parseValueClassTitle($p, ' ');
      if ($valueFromClass !== null) {
          return $valueFromClass;
      }

      // Which attribute carries the value for each special-cased element.
      $attributeByTag = array(
          'img'   => 'alt',
          'area'  => 'alt',
          'abbr'  => 'title',
          'data'  => 'value',
          'input' => 'value',
      );

      $tag = $p->tagName;
      if (isset($attributeByTag[$tag])) {
          $attributeValue = $p->getAttribute($attributeByTag[$tag]);
          if ($attributeValue !== '') {
              return $attributeValue;
          }
      }

      return unicodeTrim($this->textContent($p));
  }
  /**
   * Given an element with class="u-*", get the value of the URL
   *
   * Rules are tried in this order:
   *  1. a/area with an href, img with a src, object with a data attribute:
   *     that attribute, resolved against the base URL.
   *  2. value-class/value-title, if in use.
   *  3. abbr with a title attribute: the title.
   *  4. data/input with a value attribute: the value.
   *  5. The trimmed text content.
   *
   * An attribute only counts if it is actually present on the element, so an
   * <a> without an href falls through to the later rules.
   *
   * @param DOMElement $u The element to parse
   * @return string The plaintext value of $u, dependent on type
   */
  public function parseU(\DOMElement $u) {
      // Which attribute carries the URL for each special-cased element.
      $urlAttributeByTag = array(
          'a'      => 'href',
          'area'   => 'href',
          'img'    => 'src',
          'object' => 'data',
      );

      $tag = $u->tagName;

      // getAttribute() returns '' (never null) for a missing attribute, so
      // presence has to be tested with hasAttribute().
      if (isset($urlAttributeByTag[$tag]) && $u->hasAttribute($urlAttributeByTag[$tag])) {
          return $this->resolveUrl($u->getAttribute($urlAttributeByTag[$tag]));
      }

      $classTitle = $this->parseValueClassTitle($u);
      if ($classTitle !== null) {
          return $classTitle;
      }

      if ($tag == 'abbr' and $u->hasAttribute('title')) {
          return $u->getAttribute('title');
      }

      if (in_array($tag, array('data', 'input')) and $u->hasAttribute('value')) {
          return $u->getAttribute('value');
      }

      return unicodeTrim($this->textContent($u));
  }
  /**
   * Given an element with class="dt-*", get the value of the datetime as a string
   *
   * If the element has direct children using the value-class pattern ("value"
   * or "value-title"), the date and time are assembled from those parts.
   * Otherwise the value comes from the element's alt, value, title or datetime
   * attribute (depending on the tag) or from its node value.
   *
   * Every date (YYYY-MM-DD) encountered is appended to $dates. If the resulting
   * value is only a time and $dates is not empty, the most recently recorded
   * date is prepended to form a full datetime.
   *
   * @param DOMElement $dt The element to parse
   * @param array $dates Array of dates processed so far (modified in place)
   * @return string|false The datetime string found; false or '' when nothing usable was found
   */
  public function parseDT(\DOMElement $dt, &$dates = array()) {
      // Check for value-class pattern
      $valueClassChildren = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ") or contains(concat(" ", @class, " "), " value-title ")]', $dt);
      $dtValue = false;

      if ($valueClassChildren->length > 0) {
          // They’re using value-class. Collect the raw text of each part first.
          $dateParts = array();

          foreach ($valueClassChildren as $e) {
              if (strstr(' ' . $e->getAttribute('class') . ' ', ' value-title ')) {
                  $title = $e->getAttribute('title');
                  if (!empty($title)) {
                      $dateParts[] = $title;
                  }
              } elseif ($e->tagName == 'img' or $e->tagName == 'area') {
                  // Use @alt
                  $alt = $e->getAttribute('alt');
                  if (!empty($alt)) {
                      $dateParts[] = $alt;
                  }
              } elseif ($e->tagName == 'data') {
                  // Use @value, otherwise innertext
                  $value = $e->hasAttribute('value') ? $e->getAttribute('value') : unicodeTrim($e->nodeValue);
                  if (!empty($value)) {
                      $dateParts[] = $value;
                  }
              } elseif ($e->tagName == 'abbr') {
                  // Use @title, otherwise innertext
                  $title = $e->hasAttribute('title') ? $e->getAttribute('title') : unicodeTrim($e->nodeValue);
                  if (!empty($title)) {
                      $dateParts[] = $title;
                  }
              } elseif ($e->tagName == 'del' or $e->tagName == 'ins' or $e->tagName == 'time') {
                  // Use @datetime if available, otherwise innertext
                  $dtAttr = ($e->hasAttribute('datetime')) ? $e->getAttribute('datetime') : unicodeTrim($e->nodeValue);
                  if (!empty($dtAttr)) {
                      $dateParts[] = $dtAttr;
                  }
              } else {
                  if (!empty($e->nodeValue)) {
                      $dateParts[] = unicodeTrim($e->nodeValue);
                  }
              }
          }

          // Look through dateParts; the first date and the first time found win.
          $datePart = '';
          $timePart = '';

          foreach ($dateParts as $part) {
              // Is this part a full ISO8601 datetime?
              if (preg_match('/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}(?::\d{2})?(?:Z?[+-]\d{2}:?\d{2})?$/', $part)) {
                  // Break completely, we’ve got our value.
                  $dtValue = $part;
                  break;
              }

              // Is the current part a valid time(+TZ?) AND no other time representation has been found?
              if ((preg_match('/\d{1,2}:\d{1,2}(Z?[+-]\d{2}:?\d{2})?/', $part) or preg_match('/\d{1,2}[ap]m/', $part)) and empty($timePart)) {
                  $timePart = $part;
              } elseif (preg_match('/\d{4}-\d{2}-\d{2}/', $part) and empty($datePart)) {
                  // Is the current part a valid date AND no other date representation has been found?
                  $datePart = $part;
              }

              if (!empty($datePart) && !in_array($datePart, $dates)) {
                  $dates[] = $datePart;
              }

              $hasDate = !empty($datePart);
              $hasTime = !empty($timePart);

              if ($hasTime) {
                  // Normalise am/pm times to 24-hour HH:MM[:SS].
                  $timePart = convertTimeFormat($timePart);
              }

              // Strip stray "T" separators so the parts can be joined cleanly.
              $cleanDate = rtrim($datePart, 'T');
              $cleanTime = trim(unicodeTrim($timePart), 'T');

              if ($hasDate && $hasTime) {
                  $dtValue = $cleanDate . 'T' . $cleanTime;
              } elseif ($hasDate) {
                  $dtValue = $cleanDate;
              } elseif ($hasTime) {
                  $dtValue = $cleanTime;
              } else {
                  // Neither a date nor a time recognised so far: no value
                  // (rather than a lone "T").
                  $dtValue = '';
              }
          }
      } else {
          // Not using value-class (phew).
          // Attribute which, when non-empty, takes precedence over the node value.
          $attributeByTag = array(
              'data' => 'value',
              'abbr' => 'title',
              'del'  => 'datetime',
              'ins'  => 'datetime',
              'time' => 'datetime',
          );

          $tag = $dt->tagName;

          if ($tag == 'img' or $tag == 'area') {
              // Use @alt; there is no text fallback for these elements.
              $alt = $dt->getAttribute('alt');
              if (!empty($alt)) {
                  $dtValue = $alt;
              }
          } elseif (isset($attributeByTag[$tag])) {
              $attributeValue = $dt->getAttribute($attributeByTag[$tag]);
              $dtValue = !empty($attributeValue) ? $attributeValue : $dt->nodeValue;
          } else {
              $dtValue = $dt->nodeValue;
          }

          if (preg_match('/(\d{4}-\d{2}-\d{2})/', (string) $dtValue, $matches)) {
              $dates[] = $matches[0];
          }
      }

      /**
       * if $dtValue is only a time and there are recently parsed dates,
       * form the full date-time using the most recently parsed dt- value
       */
      $timeOnlyCandidate = (string) $dtValue;
      $looksLikeTime = preg_match('/^\d{1,2}:\d{1,2}(Z?[+-]\d{2}:?\d{2})?/', $timeOnlyCandidate)
          || preg_match('/^\d{1,2}[ap]m/', $timeOnlyCandidate);

      if ($looksLikeTime && !empty($dates)) {
          $dtValue = convertTimeFormat($dtValue);
          $dtValue = end($dates) . 'T' . trim(unicodeTrim($dtValue), 'T');
      }

      return $dtValue;
  }
  /**
   * Given the root element of some embedded markup, return its HTML and text value
   *
   * If value-class/value-title is in use on $e, that string is returned
   * instead. Otherwise relative URLs on descendants of $e are resolved (in
   * place) and the canonicalised (C14N) markup of each child node is
   * concatenated to form the HTML.
   *
   * @param DOMElement $e The element to parse
   * @return array|string array('html' => $e’s inner HTML, 'value' => its trimmed text content),
   *                      or the value-class/value-title string when that pattern is used
   *
   * @todo need to mark this element as e- parsed so it doesn’t get parsed as its parent’s e-* too
   */
  public function parseE(\DOMElement $e) {
      $valueFromClass = $this->parseValueClassTitle($e);
      if ($valueFromClass !== null) {
          return $valueFromClass;
      }

      // Expand relative URLs within children of this element
      // TODO: as it is this is not relative to only children, make this .// and rerun tests
      $this->resolveChildUrls($e);

      $markup = '';
      foreach ($e->childNodes as $child) {
          $markup .= $child->C14N();
      }

      return array(
          'html' => $markup,
          'value' => unicodeTrim($this->textContent($e))
      );
  }
  /**
   * Recursively parse microformats
   *
   * Parses nested microformats first (recording them either as property values
   * or as children), then the p-, u-, dt- and e- properties of $e, and finally
   * fills in implied name, photo and url properties where none were found.
   * Elements are marked as parsed along the way so they are not handled twice.
   *
   * @param DOMElement $e The element to parse
   * @return array|null A representation of the values contained within microformat $e
   *                    ('type', 'properties' and, if any, 'children'), or null if $e
   *                    has already been parsed
   */
  public function parseH(\DOMElement $e) {
      // If it’s already been parsed (e.g. is a child mf), skip
      if ($this->parsed->contains($e)) {
          return null;
      }

      // Get current µf name
      $mfTypes = mfNamesFromElement($e, 'h-');

      // Initialise vars to store the representation in
      $return = array();
      $children = array();
      $dates = array();

      // Handle nested microformats (h-*)
      foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," h-")]', $e) as $subMF) {
          // Parse
          $result = $this->parseH($subMF);

          // If result was already parsed, skip it
          if (null === $result) {
              continue;
          }

          $result['value'] = $this->parseP($subMF);

          // Does this µf have any property names other than h-*?
          $properties = nestedMfPropertyNamesFromElement($subMF);

          if (!empty($properties)) {
              // Yes! It’s a nested property µf
              foreach ($properties as $property) {
                  $return[$property][] = $result;
              }
          } else {
              // No, it’s a child µf
              $children[] = $result;
          }

          // Make sure this sub-mf won’t get parsed as a µf or property
          // TODO: Determine if clearing this is required?
          foreach (array('h', 'p', 'u', 'dt', 'e') as $parsedPrefix) {
              $this->elementPrefixParsed($subMF, $parsedPrefix);
          }
      }

      // Handle p-*
      foreach ($this->xpath->query('.//*[contains(concat(" ", @class) ," p-")]', $e) as $p) {
          if ($this->isElementParsed($p, 'p')) {
              continue;
          }

          $pValue = $this->parseP($p);

          // Add the value to the array for its p- properties
          foreach (mfNamesFromElement($p, 'p-') as $propName) {
              if (!empty($propName)) {
                  $return[$propName][] = $pValue;
              }
          }

          // Make sure this element won’t get parsed as a p- property again
          $this->elementPrefixParsed($p, 'p');
      }

      // Handle u-*
      foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," u-")]', $e) as $u) {
          if ($this->isElementParsed($u, 'u')) {
              continue;
          }

          $uValue = $this->parseU($u);

          // Add the value to the array for its property types
          foreach (mfNamesFromElement($u, 'u-') as $propName) {
              $return[$propName][] = $uValue;
          }

          // Make sure this element won’t get parsed as a u- property again
          $this->elementPrefixParsed($u, 'u');
      }

      // Handle dt-*
      foreach ($this->xpath->query('.//*[contains(concat(" ", @class), " dt-")]', $e) as $dt) {
          if ($this->isElementParsed($dt, 'dt')) {
              continue;
          }

          $dtValue = $this->parseDT($dt, $dates);

          if ($dtValue) {
              // Add the value to the array for dt- properties
              foreach (mfNamesFromElement($dt, 'dt-') as $propName) {
                  $return[$propName][] = $dtValue;
              }
          }

          // Make sure this element won’t get parsed as a dt- property again
          $this->elementPrefixParsed($dt, 'dt');
      }

      // Handle e-*
      foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," e-")]', $e) as $em) {
          if ($this->isElementParsed($em, 'e')) {
              continue;
          }

          $eValue = $this->parseE($em);

          if ($eValue) {
              // Add the value to the array for e- properties
              foreach (mfNamesFromElement($em, 'e-') as $propName) {
                  $return[$propName][] = $eValue;
              }
          }

          // Make sure this element won’t get parsed as an e- property again
          $this->elementPrefixParsed($em, 'e');
      }

      // Implied Properties

      // XPath for an <img> that is the only element child of $e, and for an
      // <img> that is the only element child of $e's only element child.
      $onlyImgQueries = array(
          './img[count(preceding-sibling::*)+count(following-sibling::*)=0]',
          './*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]',
      );

      // Check for p-name
      if (!array_key_exists('name', $return)) {
          $impliedName = null;

          if ($e->tagName == 'img' and $e->getAttribute('alt') != '') {
              // img @alt
              $impliedName = $e->getAttribute('alt');
          } elseif ($e->tagName == 'abbr' and $e->hasAttribute('title')) {
              // abbr @title
              $impliedName = $e->getAttribute('title');
          } else {
              // Nested, then double nested, img @alt
              foreach ($onlyImgQueries as $query) {
                  foreach ($this->xpath->query($query, $e) as $img) {
                      if ($img->getAttribute('alt') != '') {
                          $impliedName = $img->getAttribute('alt');
                          break 2;
                      }
                  }
              }

              // Last resort: the text of the element itself
              if ($impliedName === null) {
                  $impliedName = $e->nodeValue;
              }
          }

          $return['name'][] = unicodeTrim($impliedName);
      }

      // Check for u-photo
      if (!array_key_exists('photo', $return)) {
          $impliedPhoto = null;

          if ($e->tagName == 'img') {
              // img @src, only when the attribute is actually present
              if ($e->hasAttribute('src')) {
                  $impliedPhoto = $e->getAttribute('src');
              }
          } else {
              // Nested, then double nested, img @src
              foreach ($onlyImgQueries as $query) {
                  foreach ($this->xpath->query($query, $e) as $img) {
                      if ($img->getAttribute('src') != '') {
                          $impliedPhoto = $img->getAttribute('src');
                          break 2;
                      }
                  }
              }
          }

          if ($impliedPhoto !== null) {
              $return['photo'][] = $this->resolveUrl($impliedPhoto);
          }
      }

      // Check for u-url
      if (!array_key_exists('url', $return)) {
          $url = null;

          // a @href on the element itself
          if ($e->tagName == 'a') {
              $url = $e->getAttribute('href');
          }

          // A child <a> with no sibling <a> elements takes precedence
          foreach ($this->xpath->query('./a[count(preceding-sibling::a)+count(following-sibling::a)=0]', $e) as $em) {
              $url = $em->getAttribute('href');
              break;
          }

          if (!empty($url)) {
              $return['url'][] = $this->resolveUrl($url);
          }
      }

      // Make sure things are in alphabetical order
      sort($mfTypes);

      // Phew. Return the final result.
      $parsed = array(
          'type' => $mfTypes,
          'properties' => $return
      );

      if (!empty($children)) {
          $parsed['children'] = array_values(array_filter($children));
      }

      return $parsed;
  }
  /**
   * Parse Rels and Alternates
   *
   * Visits every element that carries both a rel and an href attribute,
   * resolves its href through $this->resolveUrl() and files it into one of
   * two collections:
   *
   * - links whose rel tokens include `alternate` become an entry in
   *   $alternates (`url`, `rel` without the `alternate` token, plus `media`
   *   and `hreflang` when those attributes are present);
   * - every other link is appended to $rels under each of its rel tokens.
   *
   * If $rels ends up empty and $this->jsonMode is set, $rels is returned as a
   * stdClass instance so that it serialises to a JSON object; otherwise it is
   * an (possibly empty) array.
   *
   * @return array Two-element array: array($rels, $alternates)
   */
  public function parseRelsAndAlternates() {
      $rels = array();
      $alternates = array();

      foreach ($this->xpath->query('//*[@rel and @href]') as $hyperlink) {
          // rel is a whitespace-separated token list; split on any ASCII
          // whitespace (not only U+0020) so tab/newline separated values
          // are not glued together into one bogus rel name.
          $linkRels = preg_split('/[ \t\n\f\r]+/', $hyperlink->getAttribute('rel'), -1, PREG_SPLIT_NO_EMPTY);

          // Empty or whitespace-only rel attribute: nothing to record
          if (empty($linkRels))
              continue;

          // Resolve the href
          $href = $this->resolveUrl($hyperlink->getAttribute('href'));

          if (in_array('alternate', $linkRels, true)) {
              // Alternate link: keep the remaining rel tokens as one string
              $alt = array(
                  'url' => $href,
                  'rel' => implode(' ', array_diff($linkRels, array('alternate')))
              );
              if ($hyperlink->hasAttribute('media'))
                  $alt['media'] = $hyperlink->getAttribute('media');
              if ($hyperlink->hasAttribute('hreflang'))
                  $alt['hreflang'] = $hyperlink->getAttribute('hreflang');
              $alternates[] = $alt;
          } else {
              foreach ($linkRels as $rel) {
                  $rels[$rel][] = $href;
              }
          }
      }

      if (empty($rels) and $this->jsonMode) {
          $rels = new stdClass();
      }

      return array($rels, $alternates);
  }
  /**
   * Kicks off the parsing routine
   *
   * Optionally upgrades classic microformat classnames first (see
   * convertLegacy()), then hands every element whose class attribute
   * contains an `h-` prefixed token to parseH(). Empty results are dropped
   * from the item list. Rel values are always collected for the whole
   * document via parseRelsAndAlternates(), regardless of $context.
   *
   * @param bool $convertClassic whether to run convertLegacy() before parsing. Defaults to true
   * @param DOMElement $context optionally an element whose descendants are searched instead of the whole document
   * @return array Array with `items` and `rels` keys, plus `alternates` when any were found
   */
  public function parse($convertClassic = true, DOMElement $context = null) {
      if ($convertClassic) {
          $this->convertLegacy();
      }

      // Candidate microformat roots: document-wide, or below $context only
      if ($context === null) {
          $candidates = $this->xpath->query('//*[contains(concat(" ", @class), " h-")]');
      } else {
          $candidates = $this->xpath->query('.//*[contains(concat(" ", @class), " h-")]', $context);
      }

      $items = array();
      foreach ($candidates as $candidate) {
          $items[] = $this->parseH($candidate);
      }

      list($rels, $alternates) = $this->parseRelsAndAlternates();

      $output = array(
          'items' => array_values(array_filter($items)),
          'rels' => $rels
      );

      // alternates is only present when at least one was found
      if (count($alternates)) {
          $output['alternates'] = $alternates;
      }

      return $output;
  }
  /**
   * Parse From ID
   *
   * Given an ID, parse all microformats which are descendants of the element
   * with that ID.
   *
   * Note that rel values are still document-wide.
   *
   * If an element with the ID is not found (or the lookup fails), an empty
   * skeleton mf2 array structure is returned instead of parsing anything.
   *
   * @param string $id
   * @param bool $convertClassic whether to run convertLegacy() before parsing. Defaults to true
   * @return array
   */
  public function parseFromId($id, $convertClassic=true) {
      $id = (string) $id;

      // Build a safe XPath string literal: XPath 1.0 has no escape syntax,
      // so pick the quote style the value does not contain, or fall back to
      // concat() when it contains both kinds of quote.
      if (strpos($id, "'") === false) {
          $literal = "'" . $id . "'";
      } elseif (strpos($id, '"') === false) {
          $literal = '"' . $id . '"';
      } else {
          $literal = "concat('" . str_replace("'", "', \"'\", '", $id) . "')";
      }

      $matches = $this->xpath->query('//*[@id=' . $literal . ']');

      // A DOMNodeList object is never empty() even with zero nodes, so test
      // its length explicitly; otherwise a missing ID would pass a null
      // context to parse() and the whole document would be parsed.
      if ($matches === false || $matches->length === 0)
          return array('items' => array(), 'rels' => array(), 'alternates' => array());

      return $this->parse($convertClassic, $matches->item(0));
  }
  /**
   * Convert Legacy Classnames
   *
   * Appends microformats2 classnames to elements of the current document
   * that carry classic (legacy) classnames, using $classicRootMap for root
   * elements and $classicPropertyMap for properties nested inside a classic
   * root. Elements that already carry the new classname are left alone, so
   * the existing class attribute is only ever extended, never rewritten.
   *
   * @return Parser $this
   */
  public function convertLegacy() {
      $xp = new DOMXPath($this->doc);

      // Roots: any element with the old root class but not yet the new one
      foreach ($this->classicRootMap as $old => $new) {
          foreach ($xp->query('//*[contains(concat(" ", @class, " "), " ' . $old . ' ") and not(contains(concat(" ", @class, " "), " ' . $new . ' "))]') as $el) {
              $el->setAttribute('class', $el->getAttribute('class') . ' ' . $new);
          }
      }

      // Properties: only descendants of an element carrying the classic root
      foreach ($this->classicPropertyMap as $oldRoot => $properties) {
          foreach ($properties as $old => $new) {
              foreach ($xp->query('//*[contains(concat(" ", @class, " "), " ' . $oldRoot . ' ")]//*[contains(concat(" ", @class, " "), " ' . $old . ' ") and not(contains(concat(" ", @class, " "), " ' . $new . ' "))]') as $el) {
                  $el->setAttribute('class', $el->getAttribute('class') . ' ' . $new);
              }
          }
      }

      return $this;
  }
  /**
   * XPath Query
   *
   * Thin pass-through to the parser's own DOMXPath instance: both arguments
   * are forwarded unchanged and the result of DOMXPath::query is returned
   * as-is.
   *
   * @param string $expression XPath expression to evaluate
   * @param DOMNode $context optional node the expression is evaluated relative to
   * @return DOMNodeList
   */
  public function query($expression, $context = null) {
      $nodes = $this->xpath->query($expression, $context);

      return $nodes;
  }
  /**
   * Classic Root Classname map
   *
   * Maps each classic (legacy) root classname to the microformats2 root
   * classname that convertLegacy() appends alongside it.
   */
  public $classicRootMap = array(
      'vcard'    => 'h-card',
      'hfeed'    => 'h-feed',
      'hentry'   => 'h-entry',
      'hrecipe'  => 'h-recipe',
      'hresume'  => 'h-resume',
      'hevent'   => 'h-event',
      'hreview'  => 'h-review',
      'hproduct' => 'h-product'
  );
  /**
   * Classic Property Classname map
   *
   * Outer keys are classic root classnames; each inner array maps a classic
   * property classname to the microformats2 classname(s) that
   * convertLegacy() appends to matching descendants of that root. A value
   * may hold several space-separated classnames.
   */
  public $classicPropertyMap = array(
      'vcard' => array(
          'fn' => 'p-name',
          'url' => 'u-url',
          'honorific-prefix' => 'p-honorific-prefix',
          'given-name' => 'p-given-name',
          'additional-name' => 'p-additional-name',
          'family-name' => 'p-family-name',
          'honorific-suffix' => 'p-honorific-suffix',
          'nickname' => 'p-nickname',
          'email' => 'u-email',
          'logo' => 'u-logo',
          'photo' => 'u-photo',
          'uid' => 'u-uid',
          'category' => 'p-category',
          'adr' => 'p-adr h-adr',
          'extended-address' => 'p-extended-address',
          'street-address' => 'p-street-address',
          'locality' => 'p-locality',
          'region' => 'p-region',
          'postal-code' => 'p-postal-code',
          'country-name' => 'p-country-name',
          'label' => 'p-label',
          'geo' => 'p-geo h-geo',
          'latitude' => 'p-latitude',
          'longitude' => 'p-longitude',
          'tel' => 'p-tel',
          'note' => 'p-note',
          'bday' => 'dt-bday',
          'key' => 'u-key',
          'org' => 'p-org',
          'organization-name' => 'p-organization-name',
          'organization-unit' => 'p-organization-unit',
      ),
      'hentry' => array(
          'entry-title' => 'p-name',
          'entry-summary' => 'p-summary',
          'entry-content' => 'e-content',
          'published' => 'dt-published',
          'updated' => 'dt-updated',
          'author' => 'p-author h-card',
          'category' => 'p-category',
          'geo' => 'p-geo h-geo',
          'latitude' => 'p-latitude',
          'longitude' => 'p-longitude',
      ),
      'hrecipe' => array(
          'fn' => 'p-name',
          'ingredient' => 'p-ingredient',
          'yield' => 'p-yield',
          'instructions' => 'e-instructions',
          'duration' => 'dt-duration',
          'nutrition' => 'p-nutrition',
          'photo' => 'u-photo',
          'summary' => 'p-summary',
          'author' => 'p-author h-card'
      ),
      'hresume' => array(
          'summary' => 'p-summary',
          'contact' => 'h-card p-contact',
          'education' => 'h-event p-education',
          'experience' => 'h-event p-experience',
          'skill' => 'p-skill',
          'affiliation' => 'p-affiliation h-card',
      ),
      'hevent' => array(
          'dtstart' => 'dt-start',
          'dtend' => 'dt-end',
          'duration' => 'dt-duration',
          'description' => 'p-description',
          'summary' => 'p-summary',
          'url' => 'u-url',
          'category' => 'p-category',
          'location' => 'h-card',
          'geo' => 'p-location h-geo'
      ),
      'hreview' => array(
          'summary' => 'p-name',
          'fn' => 'p-item h-item p-name', // doesn’t work properly, see spec
          'photo' => 'u-photo', // of the item being reviewed (p-item h-item u-photo)
          'url' => 'u-url', // of the item being reviewed (p-item h-item u-url)
          'reviewer' => 'p-reviewer p-author h-card',
          'dtreviewed' => 'dt-reviewed',
          'rating' => 'p-rating',
          'best' => 'p-best',
          'worst' => 'p-worst',
          'description' => 'p-description'
      ),
      'hproduct' => array(
          'fn' => 'p-name',
          'photo' => 'u-photo',
          'brand' => 'p-brand',
          'category' => 'p-category',
          'description' => 'p-description',
          'identifier' => 'u-identifier',
          'url' => 'u-url',
          'review' => 'p-review h-review',
          'price' => 'p-price'
      )
  );
  975. }
  /**
   * Split a URI (or relative reference) into the five RFC 3986 components.
   *
   * The authority component is reassembled from the user, pass, host and
   * port parts reported by parse_url(); it is only set when a host is
   * present. Components that are not present are left as null.
   *
   * @param string $uri
   * @return array Array with the keys scheme, authority, path, query and
   *               fragment (in that order); all values are null when
   *               parse_url() cannot parse $uri at all
   */
  function parseUriToComponents($uri) {
      $result = array(
          'scheme' => null,
          'authority' => null,
          'path' => null,
          'query' => null,
          'fragment' => null
      );

      // Warnings are suppressed as before; the failure case is handled below
      $u = @parse_url($uri);

      // parse_url() returns false for seriously malformed URLs. Passing that
      // to array_key_exists() is a TypeError on PHP 8, so bail out early.
      if (!is_array($u))
          return $result;

      if (array_key_exists('scheme', $u))
          $result['scheme'] = $u['scheme'];

      if (array_key_exists('host', $u)) {
          // authority = [ user [ ":" pass ] "@" ] host [ ":" port ]
          $authority = '';
          if (array_key_exists('user', $u))
              $authority .= $u['user'];
          if (array_key_exists('pass', $u))
              $authority .= ':' . $u['pass'];
          if (array_key_exists('user', $u) || array_key_exists('pass', $u))
              $authority .= '@';
          $authority .= $u['host'];
          if (array_key_exists('port', $u))
              $authority .= ':' . $u['port'];
          $result['authority'] = $authority;
      }

      if (array_key_exists('path', $u))
          $result['path'] = $u['path'];
      if (array_key_exists('query', $u))
          $result['query'] = $u['query'];
      if (array_key_exists('fragment', $u))
          $result['fragment'] = $u['fragment'];

      return $result;
  }
  /**
   * Resolve a URI reference against a base URI (RFC 3986 section 5.2).
   *
   * Both arguments are split with parseUriToComponents(); the target
   * components are then chosen following the "Transform References"
   * algorithm and recomposed into a string.
   *
   * A component counts as present when it is a non-empty string, so values
   * such as a query or fragment of "0" are kept rather than being discarded
   * by a truthiness test.
   *
   * @param string $baseURI the URI to resolve against
   * @param string $referenceURI the (possibly relative) reference
   * @return string the recomposed target URI
   */
  function resolveUrl($baseURI, $referenceURI) {
      $target = array(
          'scheme' => null,
          'authority' => null,
          'path' => null,
          'query' => null,
          'fragment' => null
      );

      # 5.2.1 Pre-parse the Base URI
      $base = parseUriToComponents($baseURI);

      # If base path is blank (http://example.com) then set it to /
      if ($base['path'] == null)
          $base['path'] = '/';

      # 5.2.2. Transform References
      # (R.scheme, R.authority, R.path, R.query, R.fragment) = parse(R);
      $reference = parseUriToComponents($referenceURI);

      # A non-strict parser may ignore a scheme in the reference
      # if it is identical to the base URI's scheme.
      # TODO
      if ((string) $reference['scheme'] !== '') {
          # Reference is absolute: only its dot segments need removing
          $target['scheme'] = $reference['scheme'];
          $target['authority'] = $reference['authority'];
          $target['path'] = removeDotSegments($reference['path']);
          $target['query'] = $reference['query'];
      } else {
          if ((string) $reference['authority'] !== '') {
              # Network-path reference (//host/...): inherit the scheme only
              $target['authority'] = $reference['authority'];
              $target['path'] = removeDotSegments($reference['path']);
              $target['query'] = $reference['query'];
          } else {
              if ($reference['path'] == '') {
                  # Empty path: keep the base path; the base query is used
                  # only when the reference has no query of its own
                  $target['path'] = $base['path'];
                  if ((string) $reference['query'] !== '') {
                      $target['query'] = $reference['query'];
                  } else {
                      $target['query'] = $base['query'];
                  }
              } else {
                  if (substr($reference['path'], 0, 1) == '/') {
                      # Absolute-path reference
                      $target['path'] = removeDotSegments($reference['path']);
                  } else {
                      # Relative-path reference: merge with the base path first
                      $target['path'] = mergePaths($base, $reference);
                      $target['path'] = removeDotSegments($target['path']);
                  }
                  $target['query'] = $reference['query'];
              }
              $target['authority'] = $base['authority'];
          }
          $target['scheme'] = $base['scheme'];
      }
      $target['fragment'] = $reference['fragment'];

      # 5.3 Component Recomposition
      $result = '';
      if ((string) $target['scheme'] !== '') {
          $result .= $target['scheme'] . ':';
      }
      if ((string) $target['authority'] !== '') {
          $result .= '//' . $target['authority'];
      }
      $result .= $target['path'];
      if ((string) $target['query'] !== '') {
          $result .= '?' . $target['query'];
      }
      if ((string) $target['fragment'] !== '') {
          $result .= '#' . $target['fragment'];
      } elseif ($referenceURI == '#') {
          # A bare "#" reference keeps its (empty) fragment marker
          $result .= '#';
      }

      return $result;
  }
  # 5.2.3 Merge Paths
  #
  # Combines the path of a relative-path reference with the path of the base
  # URI. Both arguments are component arrays as produced by
  # parseUriToComponents(); only $base['authority'], $base['path'] and
  # $reference['path'] are read. Returns the merged path string (dot segments
  # are NOT removed here).
  function mergePaths($base, $reference) {
      $basePath = (string) $base['path'];

      # If the base URI has a defined authority component and an empty
      # path, then return a string consisting of "/" concatenated with the
      # reference's path
      if ((string) $base['authority'] !== '' && $base['path'] == null) {
          return '/' . $reference['path'];
      }

      $pos = strrpos($basePath, '/');
      if ($pos !== false) {
          # otherwise return the reference's path appended to all but the
          # last segment of the base URI's path (i.e. everything up to and
          # including the right-most "/")
          return substr($basePath, 0, $pos + 1) . $reference['path'];
      }

      # The base path contains no "/" at all: the entire base path is
      # excluded, so the merged path is just the reference's path.
      return (string) $reference['path'];
  }
  # 5.2.4.A Remove leading ../ or ./
  #
  # Strips one leading "../" or "./" prefix from $input (modified in place).
  # Input that starts with neither prefix is left untouched.
  function removeLeadingDotSlash(&$input) {
      foreach (array('../', './') as $prefix) {
          $prefixLength = strlen($prefix);
          if (substr($input, 0, $prefixLength) == $prefix) {
              $input = substr($input, $prefixLength);
              return;
          }
      }
  }
  # 5.2.4.B Replace leading /. with /
  #
  # Rewrites $input in place: a leading "/./" is replaced by "/"; any other
  # input has its first two characters replaced by "/" (the caller uses this
  # for an input that is exactly "/.").
  function removeLeadingSlashDot(&$input) {
      $dropLength = (substr($input, 0, 3) == '/./') ? 3 : 2;
      $input = '/' . substr($input, $dropLength);
  }
  # 5.2.4.C Given leading /../ remove component from output buffer
  #
  # Both buffers are modified in place: a leading "/../" (or, for any other
  # input, its first three characters) is replaced by "/" in $input, and
  # $output is cut back to everything before its right-most "/". When
  # $output contains no "/" it becomes empty.
  function removeOneDirLevel(&$input, &$output) {
      $dropLength = (substr($input, 0, 4) == '/../') ? 4 : 3;
      $input = '/' . substr($input, $dropLength);

      // strrpos() yields false when there is no "/"; as a length that is 0
      $keepLength = (int) strrpos($output, '/');
      $output = substr($output, 0, $keepLength);
  }
  # 5.2.4.D Remove . and .. if it's the only thing in the input
  #
  # Drops the first character of $input when it is exactly ".", otherwise
  # drops the first two characters (the caller uses this for ".."). $input is
  # modified in place.
  function removeLoneDotDot(&$input) {
      $dropLength = ($input == '.') ? 1 : 2;
      $input = substr($input, $dropLength);
  }
  # 5.2.4.E Move one segment from input to output
  #
  # Appends the first path segment of $input (including its leading "/", if
  # any) to $output and removes it from $input. The segment ends just before
  # the next "/"; when there is none, the whole of $input is moved. Both
  # buffers are modified in place.
  function moveOneSegmentFromInput(&$input, &$output) {
      // Skip a leading "/" so that it stays attached to the segment
      $searchFrom = (substr($input, 0, 1) != '/') ? 0 : 1;
      $boundary = strpos($input, '/', $searchFrom);

      if ($boundary === false) {
          $output .= $input;
          $input = '';
          return;
      }

      $output .= substr($input, 0, $boundary);
      $input = substr($input, $boundary);
  }
  # 5.2.4 Remove Dot Segments
  #
  # Interprets and removes the "." and ".." segments of $path by repeatedly
  # applying steps A-E of the RFC 3986 algorithm, each of which is delegated
  # to a helper function. Returns the resulting path string; a null or empty
  # $path yields an empty string.
  function removeDotSegments($path) {
      # 1. The input buffer is initialized with the now-appended path
      #    components and the output buffer is initialized to the empty
      #    string.
      $input = (string) $path;
      $output = '';

      # 2. While the input buffer is not empty, loop as follows.
      #    The emptiness test is an explicit string comparison: a plain
      #    truthiness test would treat the path "0" as empty and drop it.
      while ((string) $input !== '') {
          if (substr($input, 0, 3) == '../' || substr($input, 0, 2) == './') {
              # A. If the input buffer begins with a prefix of "../" or "./",
              #    then remove that prefix from the input buffer; otherwise,
              removeLeadingDotSlash($input);
          } elseif (substr($input, 0, 3) == '/./' || $input == '/.') {
              # B. if the input buffer begins with a prefix of "/./" or "/.",
              #    where "." is a complete path segment, then replace that
              #    prefix with "/" in the input buffer; otherwise,
              removeLeadingSlashDot($input);
          } elseif (substr($input, 0, 4) == '/../' || $input == '/..') {
              # C. if the input buffer begins with a prefix of "/../" or "/..",
              #    where ".." is a complete path segment, then replace that
              #    prefix with "/" in the input buffer and remove the last
              #    segment and its preceding "/" (if any) from the output
              #    buffer; otherwise,
              removeOneDirLevel($input, $output);
          } elseif ($input == '.' || $input == '..') {
              # D. if the input buffer consists only of "." or "..", then remove
              #    that from the input buffer; otherwise,
              removeLoneDotDot($input);
          } else {
              # E. move the first path segment in the input buffer to the end of
              #    the output buffer and any subsequent characters up to, but not
              #    including, the next "/" character or the end of the input buffer
              moveOneSegmentFromInput($input, $output);
          }
      }

      # 3. Finally, the output buffer is returned as the result
      return $output;
  }