Parser.php 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442
  1. <?php
  2. namespace Mf2;
  3. use DOMDocument;
  4. use DOMElement;
  5. use DOMXPath;
  6. use DOMNode;
  7. use DOMNodeList;
  8. use Exception;
  9. use SplObjectStorage;
  10. use stdClass;
  /**
   * Parse Microformats2
   *
   * Convenience wrapper: builds a Parser for the given input and immediately
   * runs it, for the common case where no parser state needs to be kept.
   *
   * Example usage:
   *
   *     use Mf2;
   *     $output = Mf2\parse('<span class="h-card">Barnaby Walters</span>');
   *     echo json_encode($output, JSON_PRETTY_PRINT);
   *
   * Produces:
   *
   *     {
   *       "items": [
   *         {
   *           "type": ["h-card"],
   *           "properties": {
   *             "name": ["Barnaby Walters"]
   *           }
   *         }
   *       ],
   *       "rels": {}
   *     }
   *
   * @param string|DOMDocument $input The HTML string or DOMDocument object to parse
   * @param string $url The URL the input document was found at, for relative URL resolution
   * @param bool $convertClassic whether or not to convert classic microformats
   * @return array Canonical MF2 array structure
   */
  function parse($input, $url = null, $convertClassic = true) {
      return (new Parser($input, $url))->parse($convertClassic);
  }
  /**
   * Fetch microformats2
   *
   * Given a URL, fetches it (following up to 5 redirects) and, if the content-type
   * appears to be HTML, returns the parsed microformats2 array structure.
   *
   * Note that even if the response code was a 4XX or 5XX error, if the content-type
   * is HTML-like then it will be parsed all the same, as there are legitimate cases
   * where error pages might contain useful microformats (for example a deleted
   * h-entry resulting in a 410 Gone page with a stub h-entry explaining the reason
   * for deletion). Look in $curlInfo['http_code'] for the actual value.
   *
   * @param string $url The URL to fetch
   * @param bool $convertClassic (optional, default true) whether or not to convert classic microformats
   * @param array|null $curlInfo (optional, by reference) receives the result of curl_getinfo() for debugging
   * @return array|null canonical microformats2 array structure on success, null on failure
   */
  function fetch($url, $convertClassic = true, &$curlInfo=null) {
      $ch = curl_init();
      curl_setopt($ch, CURLOPT_URL, $url);
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
      curl_setopt($ch, CURLOPT_HEADER, 0);
      curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
      curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
      $html = curl_exec($ch);
      $info = $curlInfo = curl_getinfo($ch);
      curl_close($ch);

      // With CURLOPT_RETURNTRANSFER set, curl_exec() returns false when the
      // transfer failed; there is no body to parse in that case.
      if ($html === false) {
          return null;
      }

      // content_type is absent/null when the server sent no Content-Type header.
      $contentType = isset($info['content_type']) ? (string) $info['content_type'] : '';

      if (strpos(strtolower($contentType), 'html') === false) {
          // The content was not delivered as HTML, do not attempt to parse it.
          return null;
      }

      return parse($html, $url, $convertClassic);
  }
  /**
   * Unicode to HTML Entities
   *
   * Re-encodes $input to the mbstring 'HTML-ENTITIES' target, using whatever
   * source encoding mb_detect_encoding() reports for the string.
   *
   * @param string $input String containing characters to convert into HTML entities
   * @return string
   */
  function unicodeToHtmlEntities($input) {
      $sourceEncoding = mb_detect_encoding($input);

      return mb_convert_encoding($input, 'HTML-ENTITIES', $sourceEncoding);
  }
  /**
   * Collapse Whitespace
   *
   * Collapses any sequence of whitespace within a string into a single space
   * character. Non-whitespace characters (including "|") are left untouched.
   *
   * @deprecated since v0.2.3
   * @param string $str
   * @return string
   */
  function collapseWhitespace($str) {
      // \s already covers "\n"; a "|" inside a character class is a literal
      // pipe, not an alternation, so it must not appear there.
      return preg_replace('/\s+/', ' ', $str);
  }
  /**
   * Trim leading and trailing whitespace, treating the non-breaking space
   * (U+00A0) as ordinary whitespace.
   *
   * Every UTF-8 encoded non-breaking space in the string, not only those at
   * the edges, is first replaced with a regular space; then leading and
   * trailing whitespace is stripped.
   *
   * @param string $str
   * @return string
   */
  function unicodeTrim($str) {
      // "\xC2\xA0" is U+00A0 in UTF-8. Written as a byte literal so no
      // HTML-ENTITIES conversion (deprecated in mbstring) is needed per call.
      $str = str_replace("\xC2\xA0", ' ', $str);
      $str = preg_replace('/^\s+/', '', $str);

      return preg_replace('/\s+$/', '', $str);
  }
  /**
   * Microformat Names From Class string
   *
   * Given the value of @class, get the relevant mf classnames (e.g. h-card,
   * p-name). Class names may be separated by any run of whitespace (spaces,
   * tabs, CR, LF). A class name that consists of the bare prefix only is ignored.
   *
   * @param string $class A whitespace delimited list of classnames
   * @param string $prefix The prefix to look for
   * @return array For the 'h-' prefix, the full matching classnames; for any
   *               other prefix, the matching classnames with the prefix removed.
   *               Empty array when nothing matches.
   */
  function mfNamesFromClass($class, $prefix='h-') {
      $matches = array();
      $prefixLength = strlen($prefix);

      foreach (preg_split('/\s+/', $class, -1, PREG_SPLIT_NO_EMPTY) as $classname) {
          // Must start with the prefix and have something after it.
          if ($classname !== $prefix && strncmp($classname, $prefix, $prefixLength) === 0) {
              $matches[] = ($prefix === 'h-') ? $classname : substr($classname, $prefixLength);
          }
      }

      return $matches;
  }
  /**
   * Get Nested µf Property Names From Class
   *
   * Finds all the p-, u-, dt- or e- prefixed classnames in a whitespace
   * separated string and groups the prefixes by property name.
   *
   * @param string $class
   * @return array Map of property name => list of distinct prefixes
   *               (e.g. array('author' => array('p-', 'u-')))
   */
  function nestedMfPropertyNamesFromClass($class) {
      $prefixes = array('p-', 'u-', 'dt-', 'e-');
      $propertyNames = array();

      // Split on any whitespace run so tab/CR separated class lists work too.
      foreach (preg_split('/\s+/', $class, -1, PREG_SPLIT_NO_EMPTY) as $classname) {
          foreach ($prefixes as $prefix) {
              $prefixLength = strlen($prefix);
              // A valid property classname starts with the prefix and is longer than it.
              if ($classname !== $prefix && strncmp($classname, $prefix, $prefixLength) === 0) {
                  $propertyNames[substr($classname, $prefixLength)][] = $prefix;
              }
          }
      }

      // De-duplicate and re-index so each value is a plain list.
      foreach ($propertyNames as $property => $foundPrefixes) {
          $propertyNames[$property] = array_values(array_unique($foundPrefixes));
      }

      return $propertyNames;
  }
  /**
   * Element-based convenience wrapper around mfNamesFromClass().
   *
   * Reads the element's class attribute and passes it on unchanged.
   *
   * @param DOMElement $e The element whose class attribute is inspected
   * @param string $prefix The prefix to look for
   * @return mixed See return value of mfNamesFromClass()
   */
  function mfNamesFromElement(\DOMElement $e, $prefix = 'h-') {
      return mfNamesFromClass($e->getAttribute('class'), $prefix);
  }
  /**
   * Element-based convenience wrapper around nestedMfPropertyNamesFromClass().
   *
   * @param DOMElement $e The element whose class attribute is inspected
   * @return array See return value of nestedMfPropertyNamesFromClass()
   */
  function nestedMfPropertyNamesFromElement(\DOMElement $e) {
      return nestedMfPropertyNamesFromClass($e->getAttribute('class'));
  }
  /**
   * Converts a 12-hour time carrying an am/pm marker to 24-hour HH:MM[:SS].
   *
   * Input without a recognisable am/pm marker is returned unchanged. Missing
   * minutes default to "00"; seconds are only emitted when supplied.
   * 12 am maps to hour 00 and 12 pm stays 12.
   *
   * @param string $time The time to convert
   * @return string
   */
  function convertTimeFormat($time) {
      $found = preg_match('/(\d{1,2}):?(\d{2})?:?(\d{2})?(a\.?m\.?|p\.?m\.?)?/i', $time, $matches);

      // No am/pm marker (or nothing time-like at all): leave the input alone.
      if (!$found || empty($matches[4])) {
          return $time;
      }

      $meridiem = strtolower(str_replace('.', '', $matches[4]));
      $hh = (int) $matches[1];

      if ($meridiem === 'pm' && $hh < 12) {
          $hh += 12;
      } elseif ($meridiem === 'am' && $hh === 12) {
          // 12:xx am is the first hour of the day.
          $hh = 0;
      }

      $hh = str_pad((string) $hh, 2, '0', STR_PAD_LEFT);
      $mm = empty($matches[2]) ? '00' : $matches[2];

      if (empty($matches[3])) {
          return sprintf('%s:%s', $hh, $mm);
      }

      return sprintf('%s:%s:%s', $hh, $mm, $matches[3]);
  }
  207. /**
  208. * Microformats2 Parser
  209. *
  210. * A class which holds state for parsing microformats2 from HTML.
  211. *
  212. * Example usage:
  213. *
  214. * use Mf2;
  215. * $parser = new Mf2\Parser('<p class="h-card">Barnaby Walters</p>');
  216. * $output = $parser->parse();
  217. */
  218. class Parser {
  219. /** @var string The baseurl (if any) to use for this parse */
  220. public $baseurl;
  221. /** @var DOMXPath object which can be used to query over any fragment*/
  222. public $xpath;
  223. /** @var DOMDocument */
  224. public $doc;
  225. /** @var SplObjectStorage */
  226. protected $parsed;
  227. public $jsonMode;
  /**
   * Constructor
   *
   * Loads the input into a DOMDocument (unless one was passed in), works out
   * the base URL (honouring the first <base href> element found), and removes
   * <template> elements from the document before parsing.
   *
   * NOTE(review): when a DOMDocument is passed in it is used directly, so the
   * <template> removal below modifies the caller's document.
   *
   * @param DOMDocument|string $input The data to parse. A string of HTML or a DOMDocument
   * @param string $url The URL of the parsed document, for relative URL resolution
   * @param boolean $jsonMode Whether or not to use a stdClass instance for an empty `rels` dictionary. This breaks PHP looping over rels, but allows the output to be correctly serialized as JSON.
   */
  public function __construct($input, $url = null, $jsonMode = false) {
      libxml_use_internal_errors(true);

      if ($input instanceof DOMDocument) {
          $doc = $input;
      } else {
          $doc = new DOMDocument();
          $html = is_string($input) ? unicodeToHtmlEntities($input) : '';

          // DOMDocument::loadHTML() rejects an empty source (ValueError on
          // PHP 8), so empty or non-string input just keeps the empty document.
          if (is_string($html) && $html !== '') {
              @$doc->loadHTML($html);
          }
      }

      $this->xpath = new DOMXPath($doc);

      $baseurl = $url;
      // Only the first <base href> element is considered.
      foreach ($this->xpath->query('//base[@href]') as $base) {
          $baseElementUrl = $base->getAttribute('href');

          if (parse_url($baseElementUrl, PHP_URL_SCHEME) === null) {
              // The base element URL has no scheme, i.e. it is itself relative
              // to the document URL, so resolve it against that first.
              $baseurl = resolveUrl($url, $baseElementUrl);
          } else {
              $baseurl = $baseElementUrl;
          }
          break;
      }

      // Ignore <template> elements as per the HTML5 spec
      foreach ($this->xpath->query('//template') as $templateEl) {
          $templateEl->parentNode->removeChild($templateEl);
      }

      $this->baseurl = $baseurl;
      $this->doc = $doc;
      $this->parsed = new SplObjectStorage();
      $this->jsonMode = $jsonMode;
  }
  /**
   * Record that $e has been handled for the given property prefix.
   *
   * @param DOMElement $e The element that was parsed
   * @param string $prefix The prefix it was parsed as (e.g. 'h', 'p', 'u', 'dt', 'e')
   */
  private function elementPrefixParsed(\DOMElement $e, $prefix) {
      // Start from the prefixes already recorded for this element, if any.
      $seen = $this->parsed->contains($e) ? $this->parsed[$e] : array();
      $seen[] = $prefix;
      $this->parsed[$e] = $seen;
  }
  /**
   * Whether $e has already been recorded as parsed for the given prefix.
   *
   * @param DOMElement $e
   * @param string $prefix
   * @return bool
   */
  private function isElementParsed(\DOMElement $e, $prefix) {
      return $this->parsed->contains($e) && in_array($prefix, $this->parsed[$e]);
  }
  /**
   * Rewrite the href, src and data attributes of every descendant of $el
   * through $this->resolveUrl(). The elements are modified in place.
   *
   * @param DOMElement $el
   */
  private function resolveChildUrls(DOMElement $el) {
      $urlAttributes = array('href', 'src', 'data');

      foreach ($this->xpath->query('.//*[@src or @href or @data]', $el) as $child) {
          foreach ($urlAttributes as $attribute) {
              if ($child->hasAttribute($attribute)) {
                  $child->setAttribute($attribute, $this->resolveUrl($child->getAttribute($attribute)));
              }
          }
      }
  }
  /**
   * Text content of $el, with each descendant <img> replaced by its alt text
   * (or its src when it has no alt attribute).
   *
   * URLs in $el's descendants are resolved first, on the real element; the
   * <img> substitution is done on a deep clone so $el keeps its images.
   *
   * @param DOMElement $el
   * @return string
   */
  public function textContent(DOMElement $el) {
      $this->resolveChildUrls($el);

      $copy = $el->cloneNode(true);

      foreach ($this->xpath->query('.//img', $copy) as $img) {
          $attribute = $img->hasAttribute('alt') ? 'alt' : 'src';
          $replacement = $this->doc->createTextNode($img->getAttribute($attribute));
          $img->parentNode->replaceChild($replacement, $img);
      }

      return $copy->textContent;
  }
  /**
   * Resolve $url against this parser's base URL.
   *
   * The URL is returned untouched when parse_url() cannot parse it, when it
   * already has a scheme, or when no base URL is known.
   *
   * TODO: figure out if this has problems with sms: and geo: URLs
   *
   * @param string $url
   * @return string
   */
  public function resolveUrl($url) {
      // If the URL is seriously malformed it's probably beyond the scope of
      // this parser to try to do anything with it.
      if (parse_url($url) === false) {
          return $url;
      }

      $scheme = parse_url($url, PHP_URL_SCHEME);

      if (!empty($scheme) || empty($this->baseurl)) {
          return $url;
      }

      return resolveUrl($this->baseurl, $url);
  }
  319. // Parsing Functions
  /**
   * Parse value-class/value-title on an element.
   *
   * Direct children with class "value" take precedence: their text content is
   * concatenated. Otherwise the title attributes of direct children with class
   * "value-title" are concatenated. The result is trimmed.
   *
   * Note: $separator is accepted but not currently used; the pieces are
   * joined with nothing in between.
   *
   * @param \DOMElement $e
   * @param string $separator = '' (currently unused)
   * @return string|null the parsed value or null if value-class or -title aren't in use
   */
  public function parseValueClassTitle(\DOMElement $e, $separator = '') {
      // value-class: text content of each matching child
      $valueEls = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ")]', $e);
      if ($valueEls->length > 0) {
          $pieces = array();
          foreach ($valueEls as $valueEl) {
              $pieces[] = $this->textContent($valueEl);
          }
          return unicodeTrim(implode('', $pieces));
      }

      // value-title: title attribute of each matching child
      $titleEls = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value-title ")]', $e);
      if ($titleEls->length > 0) {
          $pieces = array();
          foreach ($titleEls as $titleEl) {
              $pieces[] = $titleEl->getAttribute('title');
          }
          return unicodeTrim(implode('', $pieces));
      }

      // Neither pattern is in use on this element
      return null;
  }
  /**
   * Given an element with class="p-*", get its value.
   *
   * Order of precedence: value-class/value-title children, then a non-empty
   * alt (img, area), title (abbr) or value (data, input) attribute, and
   * finally the element's trimmed text content.
   *
   * @param DOMElement $p The element to parse
   * @return string The plaintext value of $p, dependent on type
   */
  public function parseP(\DOMElement $p) {
      $fromValueClass = $this->parseValueClassTitle($p, ' ');
      if ($fromValueClass !== null) {
          return $fromValueClass;
      }

      $tag = $p->tagName;

      if (($tag == 'img' or $tag == 'area') and $p->getAttribute('alt') !== '') {
          return $p->getAttribute('alt');
      }

      if ($tag == 'abbr' and $p->getAttribute('title') !== '') {
          return $p->getAttribute('title');
      }

      if (in_array($tag, array('data', 'input')) and $p->getAttribute('value') !== '') {
          return $p->getAttribute('value');
      }

      return unicodeTrim($this->textContent($p));
  }
  /**
   * Given an element with class="u-*", get the value of the URL.
   *
   * Order of precedence:
   *  1. href on a/area, src on img/audio/video/source, data on object; only
   *     when that attribute is actually present. The result is resolved
   *     against the base URL.
   *  2. value-class/value-title children.
   *  3. title on abbr, value on data/input, when present.
   *  4. The element's trimmed text content.
   *
   * @param DOMElement $u The element to parse
   * @return string The plaintext value of $u, dependent on type
   */
  public function parseU(\DOMElement $u) {
      $tag = $u->tagName;

      // getAttribute() returns '' (never null) for a missing attribute, so
      // presence has to be tested with hasAttribute(); otherwise an element
      // without the attribute could never fall through to the later rules.
      if (($tag == 'a' or $tag == 'area') and $u->hasAttribute('href')) {
          $uValue = $u->getAttribute('href');
      } elseif (in_array($tag, array('img', 'audio', 'video', 'source')) and $u->hasAttribute('src')) {
          $uValue = $u->getAttribute('src');
      } elseif ($tag == 'object' and $u->hasAttribute('data')) {
          $uValue = $u->getAttribute('data');
      }

      if (isset($uValue)) {
          return $this->resolveUrl($uValue);
      }

      $classTitle = $this->parseValueClassTitle($u);

      if ($classTitle !== null) {
          return $classTitle;
      } elseif ($tag == 'abbr' and $u->hasAttribute('title')) {
          return $u->getAttribute('title');
      } elseif (in_array($tag, array('data', 'input')) and $u->hasAttribute('value')) {
          return $u->getAttribute('value');
      } else {
          return unicodeTrim($this->textContent($u));
      }
  }
  /**
   * Given an element with class="dt-*", get the datetime string it represents.
   *
   * With value-class/value-title children, the date and time parts are
   * collected from those children and combined (a full ISO8601 datetime part
   * wins outright). Without them, the value comes from alt, value, title or
   * datetime depending on the tag, falling back to the node's text.
   *
   * If the result is only a time and $dates is non-empty, the most recent
   * entry of $dates is prepended to form a full datetime.
   *
   * @param DOMElement $dt The element to parse
   * @param array $dates By reference: date strings (YYYY-MM-DD) found so far; dates found here are appended
   * @return string|false The datetime string found; false or '' when nothing usable was found
   */
  public function parseDT(\DOMElement $dt, &$dates = array()) {
      // Check for value-class pattern
      $valueClassChildren = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ") or contains(concat(" ", @class, " "), " value-title ")]', $dt);
      $dtValue = false;

      if ($valueClassChildren->length > 0) {
          // They're using value-class: gather the raw parts first
          $dateParts = array();

          foreach ($valueClassChildren as $e) {
              if (strstr(' ' . $e->getAttribute('class') . ' ', ' value-title ')) {
                  $title = $e->getAttribute('title');
                  if (!empty($title))
                      $dateParts[] = $title;
              } elseif ($e->tagName == 'img' or $e->tagName == 'area') {
                  // Use @alt
                  $alt = $e->getAttribute('alt');
                  if (!empty($alt))
                      $dateParts[] = $alt;
              } elseif ($e->tagName == 'data') {
                  // Use @value, otherwise innertext
                  $value = $e->hasAttribute('value') ? $e->getAttribute('value') : unicodeTrim($e->nodeValue);
                  if (!empty($value))
                      $dateParts[] = $value;
              } elseif ($e->tagName == 'abbr') {
                  // Use @title, otherwise innertext
                  $title = $e->hasAttribute('title') ? $e->getAttribute('title') : unicodeTrim($e->nodeValue);
                  if (!empty($title))
                      $dateParts[] = $title;
              } elseif ($e->tagName == 'del' or $e->tagName == 'ins' or $e->tagName == 'time') {
                  // Use @datetime if available, otherwise innertext
                  $dtAttr = ($e->hasAttribute('datetime')) ? $e->getAttribute('datetime') : unicodeTrim($e->nodeValue);
                  if (!empty($dtAttr))
                      $dateParts[] = $dtAttr;
              } else {
                  if (!empty($e->nodeValue))
                      $dateParts[] = unicodeTrim($e->nodeValue);
              }
          }

          // Look through dateParts
          $datePart = '';
          $timePart = '';

          foreach ($dateParts as $part) {
              // Is this part a full ISO8601 datetime?
              if (preg_match('/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}(?::\d{2})?(?:Z?[+-]\d{2}:?\d{2})?$/', $part)) {
                  // Break completely, we've got our value.
                  $dtValue = $part;
                  break;
              }

              // Is the current part a valid time(+TZ?) AND no other time representation has been found?
              if ((preg_match('/\d{1,2}:\d{1,2}(Z?[+-]\d{2}:?\d{2})?/', $part) or preg_match('/\d{1,2}[ap]m/', $part)) and empty($timePart)) {
                  $timePart = $part;
              } elseif (preg_match('/\d{4}-\d{2}-\d{2}/', $part) and empty($datePart)) {
                  // Is the current part a valid date AND no other date representation has been found?
                  $datePart = $part;
              }

              if (!empty($datePart) && !in_array($datePart, $dates)) {
                  $dates[] = $datePart;
              }

              if (!empty($timePart)) {
                  $timePart = convertTimeFormat($timePart);
              }

              if (empty($datePart)) {
                  // Time only, or '' when this part was neither a date nor a
                  // time (avoids producing a lone 'T').
                  $dtValue = unicodeTrim($timePart);
              } elseif (empty($timePart)) {
                  $dtValue = rtrim($datePart, 'T');
              } else {
                  $dtValue = rtrim($datePart, 'T') . 'T' . unicodeTrim($timePart);
              }
          }
      } else {
          // Not using value-class (phew).
          if ($dt->tagName == 'img' or $dt->tagName == 'area') {
              // Use @alt
              $alt = $dt->getAttribute('alt');
              if (!empty($alt))
                  $dtValue = $alt;
          } elseif (in_array($dt->tagName, array('data'))) {
              // Use @value, otherwise innertext
              $value = $dt->getAttribute('value');
              if (!empty($value))
                  $dtValue = $value;
              else
                  $dtValue = $dt->nodeValue;
          } elseif ($dt->tagName == 'abbr') {
              // Use @title, otherwise innertext
              $title = $dt->getAttribute('title');
              if (!empty($title))
                  $dtValue = $title;
              else
                  $dtValue = $dt->nodeValue;
          } elseif ($dt->tagName == 'del' or $dt->tagName == 'ins' or $dt->tagName == 'time') {
              // Use @datetime if available, otherwise innertext
              $dtAttr = $dt->getAttribute('datetime');
              if (!empty($dtAttr))
                  $dtValue = $dtAttr;
              else
                  $dtValue = $dt->nodeValue;
          } else {
              $dtValue = $dt->nodeValue;
          }

          // Remember any date seen so later time-only values can reuse it
          if (preg_match('/(\d{4}-\d{2}-\d{2})/', $dtValue, $matches)) {
              $dates[] = $matches[0];
          }
      }

      /**
       * if $dtValue is only a time and there are recently parsed dates,
       * form the full date-time using the most recently parsed dt- value
       */
      if ((preg_match('/^\d{1,2}:\d{1,2}(Z?[+-]\d{2}:?\d{2})?/', $dtValue) or preg_match('/^\d{1,2}[ap]m/', $dtValue)) && !empty($dates)) {
          $dtValue = convertTimeFormat($dtValue);
          $dtValue = end($dates) . 'T' . unicodeTrim($dtValue);
      }

      return $dtValue;
  }
  /**
   * Given the root element of some embedded markup, return that markup and
   * its plaintext equivalent.
   *
   * If value-class/value-title children are present, their parsed string is
   * returned instead of the html/value array. Otherwise URLs in descendants
   * are resolved and each child node is serialised with C14N().
   *
   * @param DOMElement $e The element to parse
   * @return array|string array('html' => innerHTML of $e, 'value' => trimmed text), or the value-class string
   *
   * @todo need to mark this element as e- parsed so it doesn't get parsed as its parent's e-* too
   */
  public function parseE(\DOMElement $e) {
      $fromValueClass = $this->parseValueClassTitle($e);
      if ($fromValueClass !== null) {
          return $fromValueClass;
      }

      // Expand relative URLs within children of this element
      // TODO: as it is this is not relative to only children, make this .// and rerun tests
      $this->resolveChildUrls($e);

      $serialised = array();
      foreach ($e->childNodes as $child) {
          $serialised[] = $child->C14N();
      }

      return array(
          'html' => implode('', $serialised),
          'value' => unicodeTrim($this->textContent($e))
      );
  }
  559. /**
  560. * Recursively parse microformats
  561. *
  562. * @param DOMElement $e The element to parse
  563. * @return array A representation of the values contained within microformat $e
  564. */
  565. public function parseH(\DOMElement $e) {
  566. // If it’s already been parsed (e.g. is a child mf), skip
  567. if ($this->parsed->contains($e))
  568. return null;
  569. // Get current µf name
  570. $mfTypes = mfNamesFromElement($e, 'h-');
  571. // Initalise var to store the representation in
  572. $return = array();
  573. $children = array();
  574. $dates = array();
  575. // Handle nested microformats (h-*)
  576. foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," h-")]', $e) as $subMF) {
  577. // Parse
  578. $result = $this->parseH($subMF);
  579. // If result was already parsed, skip it
  580. if (null === $result)
  581. continue;
  582. // In most cases, the value attribute of the nested microformat should be the p- parsed value of the elemnt.
  583. // The only times this is different is when the microformat is nested under certain prefixes, which are handled below.
  584. $result['value'] = $this->parseP($subMF);
  585. // Does this µf have any property names other than h-*?
  586. $properties = nestedMfPropertyNamesFromElement($subMF);
  587. if (!empty($properties)) {
  588. // Yes! It’s a nested property µf
  589. foreach ($properties as $property => $prefixes) {
  590. // Note: handling microformat nesting under multiple conflicting prefixes is not currently specified by the mf2 parsing spec.
  591. $prefixSpecificResult = $result;
  592. if (in_array('p-', $prefixes)) {
  593. $prefixSpecificResult['value'] = $prefixSpecificResult['properties']['name'][0];
  594. } elseif (in_array('e-', $prefixes)) {
  595. $eParsedResult = $this->parseE($subMF);
  596. $prefixSpecificResult['html'] = $eParsedResult['html'];
  597. $prefixSpecificResult['value'] = $eParsedResult['value'];
  598. } elseif (in_array('u-', $prefixes)) {
  599. $prefixSpecificResult['value'] = $this->parseU($subMF);
  600. }
  601. $return[$property][] = $prefixSpecificResult;
  602. }
  603. } else {
  604. // No, it’s a child µf
  605. $children[] = $result;
  606. }
  607. // Make sure this sub-mf won’t get parsed as a µf or property
  608. // TODO: Determine if clearing this is required?
  609. $this->elementPrefixParsed($subMF, 'h');
  610. $this->elementPrefixParsed($subMF, 'p');
  611. $this->elementPrefixParsed($subMF, 'u');
  612. $this->elementPrefixParsed($subMF, 'dt');
  613. $this->elementPrefixParsed($subMF, 'e');
  614. }
  615. if($e->tagName == 'area') {
  616. $coords = $e->getAttribute('coords');
  617. $shape = $e->getAttribute('shape');
  618. }
  619. // Handle p-*
  620. foreach ($this->xpath->query('.//*[contains(concat(" ", @class) ," p-")]', $e) as $p) {
  621. if ($this->isElementParsed($p, 'p'))
  622. continue;
  623. $pValue = $this->parseP($p);
  624. // Add the value to the array for it’s p- properties
  625. foreach (mfNamesFromElement($p, 'p-') as $propName) {
  626. if (!empty($propName))
  627. $return[$propName][] = $pValue;
  628. }
  629. // Make sure this sub-mf won’t get parsed as a top level mf
  630. $this->elementPrefixParsed($p, 'p');
  631. }
  632. // Handle u-*
  633. foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," u-")]', $e) as $u) {
  634. if ($this->isElementParsed($u, 'u'))
  635. continue;
  636. $uValue = $this->parseU($u);
  637. // Add the value to the array for it’s property types
  638. foreach (mfNamesFromElement($u, 'u-') as $propName) {
  639. $return[$propName][] = $uValue;
  640. }
  641. // Make sure this sub-mf won’t get parsed as a top level mf
  642. $this->elementPrefixParsed($u, 'u');
  643. }
  644. // Handle dt-*
  645. foreach ($this->xpath->query('.//*[contains(concat(" ", @class), " dt-")]', $e) as $dt) {
  646. if ($this->isElementParsed($dt, 'dt'))
  647. continue;
  648. $dtValue = $this->parseDT($dt, $dates);
  649. if ($dtValue) {
  650. // Add the value to the array for dt- properties
  651. foreach (mfNamesFromElement($dt, 'dt-') as $propName) {
  652. $return[$propName][] = $dtValue;
  653. }
  654. }
  655. // Make sure this sub-mf won’t get parsed as a top level mf
  656. $this->elementPrefixParsed($dt, 'dt');
  657. }
  658. // Handle e-*
  659. foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," e-")]', $e) as $em) {
  660. if ($this->isElementParsed($em, 'e'))
  661. continue;
  662. $eValue = $this->parseE($em);
  663. if ($eValue) {
  664. // Add the value to the array for e- properties
  665. foreach (mfNamesFromElement($em, 'e-') as $propName) {
  666. $return[$propName][] = $eValue;
  667. }
  668. }
  669. // Make sure this sub-mf won’t get parsed as a top level mf
  670. $this->elementPrefixParsed($em, 'e');
  671. }
  672. // Implied Properties
  673. // Check for p-name
  674. if (!array_key_exists('name', $return)) {
  675. try {
  676. // Look for img @alt
  677. if (($e->tagName == 'img' or $e->tagName == 'area') and $e->getAttribute('alt') != '')
  678. throw new Exception($e->getAttribute('alt'));
  679. if ($e->tagName == 'abbr' and $e->hasAttribute('title'))
  680. throw new Exception($e->getAttribute('title'));
  681. // Look for nested img @alt
  682. foreach ($this->xpath->query('./img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
  683. $emNames = mfNamesFromElement($em, 'h-');
  684. if (empty($emNames) && $em->getAttribute('alt') != '') {
  685. throw new Exception($em->getAttribute('alt'));
  686. }
  687. }
  688. // Look for nested area @alt
  689. foreach ($this->xpath->query('./area[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
  690. $emNames = mfNamesFromElement($em, 'h-');
  691. if (empty($emNames) && $em->getAttribute('alt') != '') {
  692. throw new Exception($em->getAttribute('alt'));
  693. }
  694. }
  695. // Look for double nested img @alt
  696. foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
  697. $emNames = mfNamesFromElement($em, 'h-');
  698. if (empty($emNames) && $em->getAttribute('alt') != '') {
  699. throw new Exception($em->getAttribute('alt'));
  700. }
  701. }
  702. // Look for double nested img @alt
  703. foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/area[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
  704. $emNames = mfNamesFromElement($em, 'h-');
  705. if (empty($emNames) && $em->getAttribute('alt') != '') {
  706. throw new Exception($em->getAttribute('alt'));
  707. }
  708. }
  709. throw new Exception($e->nodeValue);
  710. } catch (Exception $exc) {
  711. $return['name'][] = unicodeTrim($exc->getMessage());
  712. }
  713. }
  714. // Check for u-photo
  715. if (!array_key_exists('photo', $return)) {
  716. // Look for img @src
  717. try {
  718. if ($e->tagName == 'img')
  719. throw new Exception($e->getAttribute('src'));
  720. // Look for nested img @src
  721. foreach ($this->xpath->query('./img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
  722. if ($em->getAttribute('src') != '')
  723. throw new Exception($em->getAttribute('src'));
  724. }
  725. // Look for double nested img @src
  726. foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
  727. if ($em->getAttribute('src') != '')
  728. throw new Exception($em->getAttribute('src'));
  729. }
  730. } catch (Exception $exc) {
  731. $return['photo'][] = $this->resolveUrl($exc->getMessage());
  732. }
  733. }
  734. // Check for u-url
  735. if (!array_key_exists('url', $return)) {
  736. // Look for img @src
  737. if ($e->tagName == 'a' or $e->tagName == 'area')
  738. $url = $e->getAttribute('href');
  739. // Look for nested a @href
  740. foreach ($this->xpath->query('./a[count(preceding-sibling::a)+count(following-sibling::a)=0]', $e) as $em) {
  741. $emNames = mfNamesFromElement($em, 'h-');
  742. if (empty($emNames)) {
  743. $url = $em->getAttribute('href');
  744. break;
  745. }
  746. }
  747. // Look for nested area @src
  748. foreach ($this->xpath->query('./area[count(preceding-sibling::area)+count(following-sibling::area)=0]', $e) as $em) {
  749. $emNames = mfNamesFromElement($em, 'h-');
  750. if (empty($emNames)) {
  751. $url = $em->getAttribute('href');
  752. break;
  753. }
  754. }
  755. if (!empty($url))
  756. $return['url'][] = $this->resolveUrl($url);
  757. }
  758. // Make sure things are in alphabetical order
  759. sort($mfTypes);
  760. // Phew. Return the final result.
  761. $parsed = array(
  762. 'type' => $mfTypes,
  763. 'properties' => $return
  764. );
  765. if (!empty($shape)) {
  766. $parsed['shape'] = $shape;
  767. }
  768. if (!empty($coords)) {
  769. $parsed['coords'] = $coords;
  770. }
  771. if (!empty($children)) {
  772. $parsed['children'] = array_values(array_filter($children));
  773. }
  774. return $parsed;
  775. }
  776. /**
  777. * Parse Rels and Alternatives
  778. *
  779. * Returns [$rels, $alternatives]. If the $rels value is to be empty, i.e. there are no links on the page
  780. * with a rel value *not* containing `alternate`, then the type of $rels depends on $this->jsonMode. If set
  781. * to true, it will be a stdClass instance, optimising for JSON serialisation. Otherwise (the default case),
  782. * it will be an empty array.
  783. */
  784. public function parseRelsAndAlternates() {
  785. $rels = array();
  786. $alternates = array();
  787. // Iterate through all a, area and link elements with rel attributes
  788. foreach ($this->xpath->query('//*[@rel and @href]') as $hyperlink) {
  789. if ($hyperlink->getAttribute('rel') == '')
  790. continue;
  791. // Resolve the href
  792. $href = $this->resolveUrl($hyperlink->getAttribute('href'));
  793. // Split up the rel into space-separated values
  794. $linkRels = array_filter(explode(' ', $hyperlink->getAttribute('rel')));
  795. // If alternate in rels, create alternate structure, append
  796. if (in_array('alternate', $linkRels)) {
  797. $alt = array(
  798. 'url' => $href,
  799. 'rel' => implode(' ', array_diff($linkRels, array('alternate')))
  800. );
  801. if ($hyperlink->hasAttribute('media'))
  802. $alt['media'] = $hyperlink->getAttribute('media');
  803. if ($hyperlink->hasAttribute('hreflang'))
  804. $alt['hreflang'] = $hyperlink->getAttribute('hreflang');
  805. if ($hyperlink->hasAttribute('title'))
  806. $alt['title'] = $hyperlink->getAttribute('title');
  807. if ($hyperlink->hasAttribute('type'))
  808. $alt['type'] = $hyperlink->getAttribute('type');
  809. if ($hyperlink->nodeValue)
  810. $alt['text'] = $hyperlink->nodeValue;
  811. $alternates[] = $alt;
  812. } else {
  813. foreach ($linkRels as $rel) {
  814. $rels[$rel][] = $href;
  815. }
  816. }
  817. }
  818. if (empty($rels) and $this->jsonMode) {
  819. $rels = new stdClass();
  820. }
  821. return array($rels, $alternates);
  822. }
  823. /**
  824. * Kicks off the parsing routine
  825. *
  826. * If `$htmlSafe` is set, any angle brackets in the results from non e-* properties
  827. * will be HTML-encoded, bringing all output to the same level of encoding.
  828. *
  829. * If a DOMElement is set as the $context, only descendants of that element will
  830. * be parsed for microformats.
  831. *
  832. * @param bool $htmlSafe whether or not to html-encode non e-* properties. Defaults to false
  833. * @param DOMElement $context optionally an element from which to parse microformats
  834. * @return array An array containing all the µfs found in the current document
  835. */
  836. public function parse($convertClassic = true, DOMElement $context = null) {
  837. $mfs = array();
  838. if ($convertClassic) {
  839. $this->convertLegacy();
  840. }
  841. $mfElements = null === $context
  842. ? $this->xpath->query('//*[contains(concat(" ", @class), " h-")]')
  843. : $this->xpath->query('.//*[contains(concat(" ", @class), " h-")]', $context);
  844. // Parser microformats
  845. foreach ($mfElements as $node) {
  846. // For each microformat
  847. $result = $this->parseH($node);
  848. // Add the value to the array for this property type
  849. $mfs[] = $result;
  850. }
  851. // Parse rels
  852. list($rels, $alternates) = $this->parseRelsAndAlternates();
  853. $top = array(
  854. 'items' => array_values(array_filter($mfs)),
  855. 'rels' => $rels
  856. );
  857. if (count($alternates))
  858. $top['alternates'] = $alternates;
  859. return $top;
  860. }
  861. /**
  862. * Parse From ID
  863. *
  864. * Given an ID, parse all microformats which are children of the element with
  865. * that ID.
  866. *
  867. * Note that rel values are still document-wide.
  868. *
  869. * If an element with the ID is not found, an empty skeleton mf2 array structure
  870. * will be returned.
  871. *
  872. * @param string $id
  873. * @param bool $htmlSafe = false whether or not to HTML-encode angle brackets in non e-* properties
  874. * @return array
  875. */
  876. public function parseFromId($id, $convertClassic=true) {
  877. $matches = $this->xpath->query("//*[@id='{$id}']");
  878. if (empty($matches))
  879. return array('items' => array(), 'rels' => array(), 'alternates' => array());
  880. return $this->parse($convertClassic, $matches->item(0));
  881. }
  882. /**
  883. * Convert Legacy Classnames
  884. *
  885. * Adds microformats2 classnames into a document containing only legacy
  886. * semantic classnames.
  887. *
  888. * @return Parser $this
  889. */
  890. public function convertLegacy() {
  891. $doc = $this->doc;
  892. $xp = new DOMXPath($doc);
  893. // replace all roots
  894. foreach ($this->classicRootMap as $old => $new) {
  895. foreach ($xp->query('//*[contains(concat(" ", @class, " "), " ' . $old . ' ") and not(contains(concat(" ", @class, " "), " ' . $new . ' "))]') as $el) {
  896. $el->setAttribute('class', $el->getAttribute('class') . ' ' . $new);
  897. }
  898. }
  899. foreach ($this->classicPropertyMap as $oldRoot => $properties) {
  900. $newRoot = $this->classicRootMap[$oldRoot];
  901. foreach ($properties as $old => $new) {
  902. foreach ($xp->query('//*[contains(concat(" ", @class, " "), " ' . $oldRoot . ' ")]//*[contains(concat(" ", @class, " "), " ' . $old . ' ") and not(contains(concat(" ", @class, " "), " ' . $new . ' "))]') as $el) {
  903. $el->setAttribute('class', $el->getAttribute('class') . ' ' . $new);
  904. }
  905. }
  906. }
  907. return $this;
  908. }
  909. /**
  910. * XPath Query
  911. *
  912. * Runs an XPath query over the current document. Works in exactly the same
  913. * way as DOMXPath::query.
  914. *
  915. * @param string $expression
  916. * @param DOMNode $context
  917. * @return DOMNodeList
  918. */
  919. public function query($expression, $context = null) {
  920. return $this->xpath->query($expression, $context);
  921. }
  922. /**
  923. * Classic Root Classname map
  924. */
  925. public $classicRootMap = array(
  926. 'vcard' => 'h-card',
  927. 'hfeed' => 'h-feed',
  928. 'hentry' => 'h-entry',
  929. 'hrecipe' => 'h-recipe',
  930. 'hresume' => 'h-resume',
  931. 'vevent' => 'h-event',
  932. 'hreview' => 'h-review',
  933. 'hproduct' => 'h-product'
  934. );
  935. public $classicPropertyMap = array(
  936. 'vcard' => array(
  937. 'fn' => 'p-name',
  938. 'url' => 'u-url',
  939. 'honorific-prefix' => 'p-honorific-prefix',
  940. 'given-name' => 'p-given-name',
  941. 'additional-name' => 'p-additional-name',
  942. 'family-name' => 'p-family-name',
  943. 'honorific-suffix' => 'p-honorific-suffix',
  944. 'nickname' => 'p-nickname',
  945. 'email' => 'u-email',
  946. 'logo' => 'u-logo',
  947. 'photo' => 'u-photo',
  948. 'url' => 'u-url',
  949. 'uid' => 'u-uid',
  950. 'category' => 'p-category',
  951. 'adr' => 'p-adr h-adr',
  952. 'extended-address' => 'p-extended-address',
  953. 'street-address' => 'p-street-address',
  954. 'locality' => 'p-locality',
  955. 'region' => 'p-region',
  956. 'postal-code' => 'p-postal-code',
  957. 'country-name' => 'p-country-name',
  958. 'label' => 'p-label',
  959. 'geo' => 'p-geo h-geo',
  960. 'latitude' => 'p-latitude',
  961. 'longitude' => 'p-longitude',
  962. 'tel' => 'p-tel',
  963. 'note' => 'p-note',
  964. 'bday' => 'dt-bday',
  965. 'key' => 'u-key',
  966. 'org' => 'p-org',
  967. 'organization-name' => 'p-organization-name',
  968. 'organization-unit' => 'p-organization-unit',
  969. ),
  970. 'hentry' => array(
  971. 'entry-title' => 'p-name',
  972. 'entry-summary' => 'p-summary',
  973. 'entry-content' => 'e-content',
  974. 'published' => 'dt-published',
  975. 'updated' => 'dt-updated',
  976. 'author' => 'p-author h-card',
  977. 'category' => 'p-category',
  978. 'geo' => 'p-geo h-geo',
  979. 'latitude' => 'p-latitude',
  980. 'longitude' => 'p-longitude',
  981. ),
  982. 'hrecipe' => array(
  983. 'fn' => 'p-name',
  984. 'ingredient' => 'p-ingredient',
  985. 'yield' => 'p-yield',
  986. 'instructions' => 'e-instructions',
  987. 'duration' => 'dt-duration',
  988. 'nutrition' => 'p-nutrition',
  989. 'photo' => 'u-photo',
  990. 'summary' => 'p-summary',
  991. 'author' => 'p-author h-card'
  992. ),
  993. 'hresume' => array(
  994. 'summary' => 'p-summary',
  995. 'contact' => 'h-card p-contact',
  996. 'education' => 'h-event p-education',
  997. 'experience' => 'h-event p-experience',
  998. 'skill' => 'p-skill',
  999. 'affiliation' => 'p-affiliation h-card',
  1000. ),
  1001. 'vevent' => array(
  1002. 'dtstart' => 'dt-start',
  1003. 'dtend' => 'dt-end',
  1004. 'duration' => 'dt-duration',
  1005. 'description' => 'p-description',
  1006. 'summary' => 'p-summary',
  1007. 'description' => 'p-description',
  1008. 'url' => 'u-url',
  1009. 'category' => 'p-category',
  1010. 'location' => 'h-card',
  1011. 'geo' => 'p-location h-geo'
  1012. ),
  1013. 'hreview' => array(
  1014. 'summary' => 'p-name',
  1015. 'fn' => 'p-item h-item p-name', // doesn’t work properly, see spec
  1016. 'photo' => 'u-photo', // of the item being reviewed (p-item h-item u-photo)
  1017. 'url' => 'u-url', // of the item being reviewed (p-item h-item u-url)
  1018. 'reviewer' => 'p-reviewer p-author h-card',
  1019. 'dtreviewed' => 'dt-reviewed',
  1020. 'rating' => 'p-rating',
  1021. 'best' => 'p-best',
  1022. 'worst' => 'p-worst',
  1023. 'description' => 'p-description'
  1024. ),
  1025. 'hproduct' => array(
  1026. 'fn' => 'p-name',
  1027. 'photo' => 'u-photo',
  1028. 'brand' => 'p-brand',
  1029. 'category' => 'p-category',
  1030. 'description' => 'p-description',
  1031. 'identifier' => 'u-identifier',
  1032. 'url' => 'u-url',
  1033. 'review' => 'p-review h-review',
  1034. 'price' => 'p-price'
  1035. )
  1036. );
  1037. }
  1038. function parseUriToComponents($uri) {
  1039. $result = array(
  1040. 'scheme' => null,
  1041. 'authority' => null,
  1042. 'path' => null,
  1043. 'query' => null,
  1044. 'fragment' => null
  1045. );
  1046. $u = @parse_url($uri);
  1047. if(array_key_exists('scheme', $u))
  1048. $result['scheme'] = $u['scheme'];
  1049. if(array_key_exists('host', $u)) {
  1050. if(array_key_exists('user', $u))
  1051. $result['authority'] = $u['user'];
  1052. if(array_key_exists('pass', $u))
  1053. $result['authority'] .= ':' . $u['pass'];
  1054. if(array_key_exists('user', $u) || array_key_exists('pass', $u))
  1055. $result['authority'] .= '@';
  1056. $result['authority'] .= $u['host'];
  1057. if(array_key_exists('port', $u))
  1058. $result['authority'] .= ':' . $u['port'];
  1059. }
  1060. if(array_key_exists('path', $u))
  1061. $result['path'] = $u['path'];
  1062. if(array_key_exists('query', $u))
  1063. $result['query'] = $u['query'];
  1064. if(array_key_exists('fragment', $u))
  1065. $result['fragment'] = $u['fragment'];
  1066. return $result;
  1067. }
  1068. function resolveUrl($baseURI, $referenceURI) {
  1069. $target = array(
  1070. 'scheme' => null,
  1071. 'authority' => null,
  1072. 'path' => null,
  1073. 'query' => null,
  1074. 'fragment' => null
  1075. );
  1076. # 5.2.1 Pre-parse the Base URI
  1077. # The base URI (Base) is established according to the procedure of
  1078. # Section 5.1 and parsed into the five main components described in
  1079. # Section 3
  1080. $base = parseUriToComponents($baseURI);
  1081. # If base path is blank (http://example.com) then set it to /
  1082. # (I can't tell if this is actually in the RFC or not, but seems like it makes sense)
  1083. if($base['path'] == null)
  1084. $base['path'] = '/';
  1085. # 5.2.2. Transform References
  1086. # The URI reference is parsed into the five URI components
  1087. # (R.scheme, R.authority, R.path, R.query, R.fragment) = parse(R);
  1088. $reference = parseUriToComponents($referenceURI);
  1089. # A non-strict parser may ignore a scheme in the reference
  1090. # if it is identical to the base URI's scheme.
  1091. # TODO
  1092. if($reference['scheme']) {
  1093. $target['scheme'] = $reference['scheme'];
  1094. $target['authority'] = $reference['authority'];
  1095. $target['path'] = removeDotSegments($reference['path']);
  1096. $target['query'] = $reference['query'];
  1097. } else {
  1098. if($reference['authority']) {
  1099. $target['authority'] = $reference['authority'];
  1100. $target['path'] = removeDotSegments($reference['path']);
  1101. $target['query'] = $reference['query'];
  1102. } else {
  1103. if($reference['path'] == '') {
  1104. $target['path'] = $base['path'];
  1105. if($reference['query']) {
  1106. $target['query'] = $reference['query'];
  1107. } else {
  1108. $target['query'] = $base['query'];
  1109. }
  1110. } else {
  1111. if(substr($reference['path'], 0, 1) == '/') {
  1112. $target['path'] = removeDotSegments($reference['path']);
  1113. } else {
  1114. $target['path'] = mergePaths($base, $reference);
  1115. $target['path'] = removeDotSegments($target['path']);
  1116. }
  1117. $target['query'] = $reference['query'];
  1118. }
  1119. $target['authority'] = $base['authority'];
  1120. }
  1121. $target['scheme'] = $base['scheme'];
  1122. }
  1123. $target['fragment'] = $reference['fragment'];
  1124. # 5.3 Component Recomposition
  1125. $result = '';
  1126. if($target['scheme']) {
  1127. $result .= $target['scheme'] . ':';
  1128. }
  1129. if($target['authority']) {
  1130. $result .= '//' . $target['authority'];
  1131. }
  1132. $result .= $target['path'];
  1133. if($target['query']) {
  1134. $result .= '?' . $target['query'];
  1135. }
  1136. if($target['fragment']) {
  1137. $result .= '#' . $target['fragment'];
  1138. } elseif($referenceURI == '#') {
  1139. $result .= '#';
  1140. }
  1141. return $result;
  1142. }
  1143. # 5.2.3 Merge Paths
  1144. function mergePaths($base, $reference) {
  1145. # If the base URI has a defined authority component and an empty
  1146. # path,
  1147. if($base['authority'] && $base['path'] == null) {
  1148. # then return a string consisting of "/" concatenated with the
  1149. # reference's path; otherwise,
  1150. $merged = '/' . $reference['path'];
  1151. } else {
  1152. if(($pos=strrpos($base['path'], '/')) !== false) {
  1153. # return a string consisting of the reference's path component
  1154. # appended to all but the last segment of the base URI's path (i.e.,
  1155. # excluding any characters after the right-most "/" in the base URI
  1156. # path,
  1157. $merged = substr($base['path'], 0, $pos + 1) . $reference['path'];
  1158. } else {
  1159. # or excluding the entire base URI path if it does not contain
  1160. # any "/" characters).
  1161. $merged = $base['path'];
  1162. }
  1163. }
  1164. return $merged;
  1165. }
  1166. # 5.2.4.A Remove leading ../ or ./
  1167. function removeLeadingDotSlash(&$input) {
  1168. if(substr($input, 0, 3) == '../') {
  1169. $input = substr($input, 3);
  1170. } elseif(substr($input, 0, 2) == './') {
  1171. $input = substr($input, 2);
  1172. }
  1173. }
  1174. # 5.2.4.B Replace leading /. with /
  1175. function removeLeadingSlashDot(&$input) {
  1176. if(substr($input, 0, 3) == '/./') {
  1177. $input = '/' . substr($input, 3);
  1178. } else {
  1179. $input = '/' . substr($input, 2);
  1180. }
  1181. }
  1182. # 5.2.4.C Given leading /../ remove component from output buffer
  1183. function removeOneDirLevel(&$input, &$output) {
  1184. if(substr($input, 0, 4) == '/../') {
  1185. $input = '/' . substr($input, 4);
  1186. } else {
  1187. $input = '/' . substr($input, 3);
  1188. }
  1189. $output = substr($output, 0, strrpos($output, '/'));
  1190. }
  1191. # 5.2.4.D Remove . and .. if it's the only thing in the input
  1192. function removeLoneDotDot(&$input) {
  1193. if($input == '.') {
  1194. $input = substr($input, 1);
  1195. } else {
  1196. $input = substr($input, 2);
  1197. }
  1198. }
  1199. # 5.2.4.E Move one segment from input to output
  1200. function moveOneSegmentFromInput(&$input, &$output) {
  1201. if(substr($input, 0, 1) != '/') {
  1202. $pos = strpos($input, '/');
  1203. } else {
  1204. $pos = strpos($input, '/', 1);
  1205. }
  1206. if($pos === false) {
  1207. $output .= $input;
  1208. $input = '';
  1209. } else {
  1210. $output .= substr($input, 0, $pos);
  1211. $input = substr($input, $pos);
  1212. }
  1213. }
  1214. # 5.2.4 Remove Dot Segments
  1215. function removeDotSegments($path) {
  1216. # 1. The input buffer is initialized with the now-appended path
  1217. # components and the output buffer is initialized to the empty
  1218. # string.
  1219. $input = $path;
  1220. $output = '';
  1221. $step = 0;
  1222. # 2. While the input buffer is not empty, loop as follows:
  1223. while($input) {
  1224. $step++;
  1225. if(substr($input, 0, 3) == '../' || substr($input, 0, 2) == './') {
  1226. # A. If the input buffer begins with a prefix of "../" or "./",
  1227. # then remove that prefix from the input buffer; otherwise,
  1228. removeLeadingDotSlash($input);
  1229. } elseif(substr($input, 0, 3) == '/./' || $input == '/.') {
  1230. # B. if the input buffer begins with a prefix of "/./" or "/.",
  1231. # where "." is a complete path segment, then replace that
  1232. # prefix with "/" in the input buffer; otherwise,
  1233. removeLeadingSlashDot($input);
  1234. } elseif(substr($input, 0, 4) == '/../' || $input == '/..') {
  1235. # C. if the input buffer begins with a prefix of "/../" or "/..",
  1236. # where ".." is a complete path segment, then replace that
  1237. # prefix with "/" in the input buffer and remove the last
  1238. # segment and its preceding "/" (if any) from the output
  1239. # buffer; otherwise,
  1240. removeOneDirLevel($input, $output);
  1241. } elseif($input == '.' || $input == '..') {
  1242. # D. if the input buffer consists only of "." or "..", then remove
  1243. # that from the input buffer; otherwise,
  1244. removeLoneDotDot($input);
  1245. } else {
  1246. # E. move the first path segment in the input buffer to the end of
  1247. # the output buffer and any subsequent characters up to, but not including,
  1248. # the next "/" character or the end of the input buffer
  1249. moveOneSegmentFromInput($input, $output);
  1250. }
  1251. }
  1252. return $output;
  1253. }