12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442 |
- <?php
- namespace Mf2;
- use DOMDocument;
- use DOMElement;
- use DOMXPath;
- use DOMNode;
- use DOMNodeList;
- use Exception;
- use SplObjectStorage;
- use stdClass;
/**
 * Parse Microformats2
 *
 * Convenience function covering the most common case: build a Parser for the
 * given input and immediately run it.
 *
 * Example usage:
 *
 *     use Mf2;
 *     $output = Mf2\parse('<span class="h-card">Barnaby Walters</span>');
 *     echo json_encode($output, JSON_PRETTY_PRINT);
 *
 * Produces:
 *
 *     {
 *       "items": [
 *         {
 *           "type": ["h-card"],
 *           "properties": {
 *             "name": ["Barnaby Walters"]
 *           }
 *         }
 *       ],
 *       "rels": {}
 *     }
 *
 * @param string|DOMDocument $input The HTML string or DOMDocument object to parse
 * @param string $url The URL the input document was found at, for relative URL resolution
 * @param bool $convertClassic whether or not to convert classic microformats
 * @return array Canonical MF2 array structure
 */
function parse($input, $url = null, $convertClassic = true) {
    $mf2Parser = new Parser($input, $url);
    $canonical = $mf2Parser->parse($convertClassic);

    return $canonical;
}
/**
 * Fetch microformats2
 *
 * Given a URL, fetches it (following up to 5 redirects) and, if the content-type appears to be HTML, returns the parsed
 * microformats2 array structure.
 *
 * Note that even if the response code was a 4XX or 5XX error, if the content-type is HTML-like then it will be parsed
 * all the same, as there are legitimate cases where error pages might contain useful microformats (for example a deleted
 * h-entry resulting in a 410 Gone page with a stub h-entry explaining the reason for deletion). Look in $curlInfo['http_code']
 * for the actual value.
 *
 * A transfer that fails at the transport level (curl_exec() returning false) is reported as null, even when
 * response headers announcing HTML had already been received.
 *
 * @param string $url The URL to fetch
 * @param bool $convertClassic (optional, default true) whether or not to convert classic microformats
 * @param &array $curlInfo (optional) the results of curl_getinfo will be placed in this variable for debugging
 * @return array|null canonical microformats2 array structure on success, null on failure
 */
function fetch($url, $convertClassic = true, &$curlInfo=null) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
    $html = curl_exec($ch);
    $info = $curlInfo = curl_getinfo($ch);
    curl_close($ch);

    // curl_exec() returns false when the transfer failed; there is no body to parse.
    if ($html === false) {
        return null;
    }

    // content_type is null when the server sent no Content-Type header.
    $contentType = (is_array($info) && isset($info['content_type'])) ? (string) $info['content_type'] : '';
    if (strpos(strtolower($contentType), 'html') === false) {
        // The content was not delivered as HTML, do not attempt to parse it.
        return null;
    }

    return parse($html, $url, $convertClassic);
}
/**
 * Unicode to HTML Entities
 *
 * Re-encodes $input so that non-ASCII characters are expressed as HTML
 * entities. The source encoding is auto-detected; when detection fails,
 * UTF-8 is assumed rather than passing `false` on as an encoding name.
 *
 * @param string $input String containing characters to convert into HTML entities
 * @return string
 */
function unicodeToHtmlEntities($input) {
    $sourceEncoding = mb_detect_encoding($input);
    if ($sourceEncoding === false) {
        // Detection failed: fall back to the encoding the rest of the parser works in.
        $sourceEncoding = 'UTF-8';
    }

    return mb_convert_encoding($input, 'HTML-ENTITIES', $sourceEncoding);
}
/**
 * Collapse Whitespace
 *
 * Collapses any sequences of whitespace within a string into a single space
 * character. Non-whitespace characters (including `|`) are left untouched.
 *
 * @deprecated since v0.2.3
 * @param string $str
 * @return string
 */
function collapseWhitespace($str) {
    // `\s` already covers newlines; the former `[\s|\n]` class also matched a literal pipe.
    return preg_replace('/\s+/', ' ', $str);
}
/**
 * Unicode Trim
 *
 * Replaces every non-breaking space (U+00A0) in $str with an ordinary space,
 * then strips leading and trailing whitespace.
 *
 * Note that non-breaking spaces in the middle of the string are converted
 * too, not only those at the edges.
 *
 * @param string $str
 * @return string
 */
function unicodeTrim($str) {
    // "\xC2\xA0" is the UTF-8 encoding of U+00A0 NO-BREAK SPACE, which `\s` does not match.
    $str = str_replace("\xC2\xA0", ' ', $str);
    $str = preg_replace('/^\s+/', '', $str);

    return preg_replace('/\s+$/', '', $str);
}
/**
 * Microformat Name From Class string
 *
 * Given the value of @class, get the relevant mf classnames (e.g. h-card,
 * p-name).
 *
 * Class names may be separated by any run of HTML whitespace (space, tab,
 * line feed, form feed, carriage return). A class name consisting of the
 * bare prefix alone (e.g. "h-") is ignored.
 *
 * For the 'h-' prefix the full class name is returned (e.g. "h-card"); for
 * any other prefix the prefix is stripped (e.g. "p-name" yields "name").
 *
 * @param string $class A whitespace delimited list of classnames
 * @param string $prefix The prefix to look for
 * @return array All matching names in document order; empty if none matched
 */
function mfNamesFromClass($class, $prefix='h-') {
    $classnames = preg_split('/[ \t\n\f\r]+/', (string) $class, -1, PREG_SPLIT_NO_EMPTY);
    $prefixLength = strlen($prefix);
    $matches = array();

    foreach ($classnames as $classname) {
        // Must start with the prefix and have something after it.
        if ($classname !== $prefix && strncmp($classname, $prefix, $prefixLength) === 0) {
            $matches[] = ($prefix === 'h-') ? $classname : substr($classname, $prefixLength);
        }
    }

    return $matches;
}
/**
 * Get Nested µf Property Name From Class
 *
 * Finds all the p-, u-, dt- or e- prefixed classnames in a whitespace
 * separated string and groups the prefixes by property name.
 *
 * Class names may be separated by any run of HTML whitespace (space, tab,
 * line feed, form feed, carriage return). Each prefix appears at most once
 * per property, in order of first appearance, as a gap-free list.
 *
 * @param string $class
 * @return array Map of property name => list of prefixes it was found under,
 *               e.g. array('author' => array('p-', 'u-'))
 */
function nestedMfPropertyNamesFromClass($class) {
    $prefixes = array('p-', 'u-', 'dt-', 'e-');
    $propertyNames = array();
    $classnames = preg_split('/[ \t\n\f\r]+/', (string) $class, -1, PREG_SPLIT_NO_EMPTY);

    foreach ($classnames as $classname) {
        foreach ($prefixes as $prefix) {
            $prefixLength = strlen($prefix);

            // Check if $classname is a valid property classname for $prefix.
            if ($classname === $prefix || strncmp($classname, $prefix, $prefixLength) !== 0) {
                continue;
            }

            $propertyName = substr($classname, $prefixLength);
            if (!isset($propertyNames[$propertyName])) {
                $propertyNames[$propertyName] = array();
            }
            // De-duplicate on insert so the prefix list never has gaps in its keys.
            if (!in_array($prefix, $propertyNames[$propertyName], true)) {
                $propertyNames[$propertyName][] = $prefix;
            }
        }
    }

    return $propertyNames;
}
/**
 * Element-based convenience wrapper around mfNamesFromClass().
 *
 * Reads the element's class attribute and hands it to mfNamesFromClass()
 * together with the requested prefix.
 *
 * @param DOMElement $e The element to get the classname for
 * @param string $prefix The prefix to look for
 * @return mixed Whatever mfNamesFromClass() returns for the element's class attribute
 */
function mfNamesFromElement(\DOMElement $e, $prefix = 'h-') {
    return mfNamesFromClass($e->getAttribute('class'), $prefix);
}
/**
 * Element-based convenience wrapper around nestedMfPropertyNamesFromClass().
 *
 * @param DOMElement $e The element whose class attribute should be inspected
 * @return mixed Whatever nestedMfPropertyNamesFromClass() returns for that class attribute
 */
function nestedMfPropertyNamesFromElement(\DOMElement $e) {
    return nestedMfPropertyNamesFromClass($e->getAttribute('class'));
}
/**
 * Converts 12-hour times carrying an am/pm marker to 24-hour HH:MM or HH:MM:SS.
 *
 * Times without an am/pm marker are returned unchanged. Minutes default to
 * "00" when absent; seconds are only included when they were supplied.
 * 12am maps to hour 00 and 12pm stays at hour 12.
 *
 * @param string $time The time to convert
 * @return string
 */
function convertTimeFormat($time) {
    preg_match('/(\d{1,2}):?(\d{2})?:?(\d{2})?(a\.?m\.?|p\.?m\.?)?/i', $time, $matches);

    // If no am/pm is specified there is nothing to convert.
    if (empty($matches[4])) {
        return $time;
    }

    // Normalise "a.m." / "P.M." etc. to "am" / "pm".
    $meridiem = strtolower(str_replace('.', '', $matches[4]));

    $hours = (int) $matches[1];
    if ($meridiem === 'pm' && $hours < 12) {
        // 1pm-11pm => 13-23; 12pm is already correct.
        $hours += 12;
    } elseif ($meridiem === 'am' && $hours === 12) {
        // 12am is midnight.
        $hours = 0;
    }
    $hh = str_pad((string) $hours, 2, '0', STR_PAD_LEFT);

    $mm = empty($matches[2]) ? '00' : $matches[2];

    // Seconds, only if supplied.
    if (empty($matches[3])) {
        return sprintf('%s:%s', $hh, $mm);
    }

    return sprintf('%s:%s:%s', $hh, $mm, $matches[3]);
}
- /**
- * Microformats2 Parser
- *
- * A class which holds state for parsing microformats2 from HTML.
- *
- * Example usage:
- *
- * use Mf2;
- * $parser = new Mf2\Parser('<p class="h-card">Barnaby Walters</p>');
- * $output = $parser->parse();
- */
- class Parser {
- /** @var string The baseurl (if any) to use for this parse */
- public $baseurl;
- /** @var DOMXPath object which can be used to query over any fragment*/
- public $xpath;
- /** @var DOMDocument */
- public $doc;
- /** @var SplObjectStorage */
- protected $parsed;
- public $jsonMode;
/**
 * Constructor
 *
 * Loads the input into a DOMDocument (unless one was supplied), prepares an
 * XPath object over it, works out the base URL from the first <base href>
 * element (if any) and strips <template> elements.
 *
 * Empty or non-string, non-DOMDocument input results in an empty document
 * rather than an error.
 *
 * Side effect: switches libxml to internal error handling via
 * libxml_use_internal_errors(true).
 *
 * @param DOMDocument|string $input The data to parse. A string of HTML or a DOMDocument
 * @param string $url The URL of the parsed document, for relative URL resolution
 * @param boolean $jsonMode Whether or not to use a stdClass instance for an empty `rels` dictionary. This breaks PHP looping over rels, but allows the output to be correctly serialized as JSON.
 */
public function __construct($input, $url = null, $jsonMode = false) {
    libxml_use_internal_errors(true);

    if ($input instanceof DOMDocument) {
        $doc = $input;
    } else {
        $doc = new DOMDocument();
        $html = is_string($input) ? unicodeToHtmlEntities($input) : '';

        // loadHTML('') throws a ValueError on PHP 8 (which `@` cannot suppress),
        // so only load when there is something to load. An unloaded DOMDocument
        // is simply empty.
        if (is_string($html) && $html !== '') {
            @$doc->loadHTML($html);
        }
    }

    $this->xpath = new DOMXPath($doc);

    $baseurl = $url;
    $baseElements = $this->xpath->query('//base[@href]');
    if ($baseElements->length > 0) {
        // Only the first <base href> is honoured.
        $baseElementUrl = $baseElements->item(0)->getAttribute('href');

        if (parse_url($baseElementUrl, PHP_URL_SCHEME) === null) {
            // The base element URL has no scheme, so it is itself relative
            // to the document URL.
            $baseurl = resolveUrl($url, $baseElementUrl);
        } else {
            $baseurl = $baseElementUrl;
        }
    }

    // Ignore <template> elements as per the HTML5 spec
    foreach ($this->xpath->query('//template') as $templateEl) {
        $templateEl->parentNode->removeChild($templateEl);
    }

    $this->baseurl = $baseurl;
    $this->doc = $doc;
    $this->parsed = new SplObjectStorage();
    $this->jsonMode = $jsonMode;
}
/**
 * Records that $e has been handled for the given property prefix.
 *
 * The prefix is appended to the list stored for the element in
 * $this->parsed, creating that list first if the element is new.
 *
 * @param DOMElement $e The element that was processed
 * @param string $prefix The prefix it was processed under (e.g. 'h', 'p', 'u', 'dt', 'e')
 */
private function elementPrefixParsed(\DOMElement $e, $prefix) {
    $seenPrefixes = $this->parsed->contains($e) ? $this->parsed[$e] : array();
    $seenPrefixes[] = $prefix;
    $this->parsed[$e] = $seenPrefixes;
}
/**
 * Tells whether $e has already been recorded as handled for $prefix.
 *
 * @param DOMElement $e The element to look up
 * @param string $prefix The prefix to check for
 * @return bool true only if the element is known and $prefix is in its recorded list
 */
private function isElementParsed(\DOMElement $e, $prefix) {
    return $this->parsed->contains($e) && in_array($prefix, $this->parsed[$e]);
}
/**
 * Rewrites the href, src and data attributes of every descendant of $el to
 * the result of $this->resolveUrl().
 *
 * The document is modified in place.
 *
 * @param DOMElement $el The element whose descendants should be processed
 */
private function resolveChildUrls(DOMElement $el) {
    $urlAttributes = array('href', 'src', 'data');

    foreach ($this->xpath->query('.//*[@src or @href or @data]', $el) as $linked) {
        foreach ($urlAttributes as $attribute) {
            if (!$linked->hasAttribute($attribute)) {
                continue;
            }
            $linked->setAttribute($attribute, $this->resolveUrl($linked->getAttribute($attribute)));
        }
    }
}
/**
 * Returns the text content of $el with every descendant <img> replaced by
 * its alt attribute, or by its src attribute when it has no alt.
 *
 * URLs of the element's descendants are resolved first (which modifies the
 * document); the image substitution itself is done on a clone.
 *
 * @param DOMElement $el
 * @return string
 */
public function textContent(DOMElement $el) {
    $this->resolveChildUrls($el);

    $copy = $el->cloneNode(true);
    $images = $this->xpath->query('.//img', $copy);

    foreach ($images as $image) {
        $sourceAttribute = $image->hasAttribute('alt') ? 'alt' : 'src';
        $replacement = $this->doc->createTextNode($image->getAttribute($sourceAttribute));
        $image->parentNode->replaceChild($replacement, $image);
    }

    return $copy->textContent;
}
/**
 * Resolves $url against the parser's base URL.
 *
 * URLs that parse_url() rejects, URLs that already carry a scheme, and any
 * URL when no base URL is set are returned unchanged.
 *
 * TODO: figure out if this has problems with sms: and geo: URLs
 *
 * @param string $url
 * @return string
 */
public function resolveUrl($url) {
    // If the URL is seriously malformed it’s probably beyond the scope of this
    // parser to try to do anything with it.
    if (parse_url($url) === false) {
        return $url;
    }

    $scheme = parse_url($url, PHP_URL_SCHEME);
    $canResolve = empty($scheme) and !empty($this->baseurl);
    if (empty($scheme) and !empty($this->baseurl)) {
        return resolveUrl($this->baseurl, $url);
    }

    return $url;
}
- // Parsing Functions
/**
 * Parse value-class/value-title on an element.
 *
 * Looks at the direct children of $e. If any carry the class "value", their
 * text contents are concatenated and returned. Otherwise, if any carry the
 * class "value-title", their title attributes are concatenated and returned.
 * In both cases the result is trimmed.
 *
 * Class lists are whitespace-normalised before matching, so classes
 * separated by tabs or newlines are recognised too.
 *
 * NOTE: $separator is accepted for interface compatibility but is currently
 * not used; values are always joined without any separator.
 *
 * @param \DOMElement $e
 * @param string $separator = '' currently unused (see note above)
 * @return string|null the parsed value or null if value-class or -title aren’t in use
 */
public function parseValueClassTitle(\DOMElement $e, $separator = '') {
    $valueClassElements = $this->xpath->query('./*[contains(concat(" ", normalize-space(@class), " "), " value ")]', $e);
    if ($valueClassElements->length !== 0) {
        // Process value-class stuff
        $val = '';
        foreach ($valueClassElements as $el) {
            $val .= $this->textContent($el);
        }

        return unicodeTrim($val);
    }

    $valueTitleElements = $this->xpath->query('./*[contains(concat(" ", normalize-space(@class), " "), " value-title ")]', $e);
    if ($valueTitleElements->length !== 0) {
        // Process value-title stuff
        $val = '';
        foreach ($valueTitleElements as $el) {
            $val .= $el->getAttribute('title');
        }

        return unicodeTrim($val);
    }

    // No value-title or -class in this element
    return null;
}
/**
 * Given an element with class="p-*", get its value.
 *
 * Resolution order:
 *  1. a value-class / value-title result, if the pattern is in use;
 *  2. a non-empty tag-specific attribute: alt for img/area, title for abbr,
 *     value for data/input;
 *  3. the trimmed text content of the element.
 *
 * @param DOMElement $p The element to parse
 * @return string The plaintext value of $p, dependent on type
 */
public function parseP(\DOMElement $p) {
    $fromValueClass = $this->parseValueClassTitle($p, ' ');
    if ($fromValueClass !== null) {
        return $fromValueClass;
    }

    // Which attribute carries the value for which tag.
    $attributeByTag = array(
        'img'   => 'alt',
        'area'  => 'alt',
        'abbr'  => 'title',
        'data'  => 'value',
        'input' => 'value',
    );

    $tag = $p->tagName;
    if (isset($attributeByTag[$tag])) {
        $attributeValue = $p->getAttribute($attributeByTag[$tag]);
        if ($attributeValue !== '') {
            return $attributeValue;
        }
    }

    return unicodeTrim($this->textContent($p));
}
/**
 * Given an element with class="u-*", get the value of the URL.
 *
 * Resolution order:
 *  1. href of a/area, src of img/audio/video/source, data of object; only
 *     when the attribute is actually present. The value is resolved against
 *     the base URL;
 *  2. a value-class / value-title result, if the pattern is in use;
 *  3. title of abbr, value of data/input; only when the attribute is present;
 *  4. the trimmed text content of the element.
 *
 * Results from steps 2-4 are returned as found, without URL resolution.
 *
 * @param DOMElement $u The element to parse
 * @return string The plaintext value of $u, dependent on type
 */
public function parseU(\DOMElement $u) {
    $tag = $u->tagName;
    $uValue = null;

    // getAttribute() returns '' (never null) for a missing attribute, so
    // presence has to be tested with hasAttribute().
    if (($tag == 'a' or $tag == 'area') and $u->hasAttribute('href')) {
        $uValue = $u->getAttribute('href');
    } elseif (in_array($tag, array('img', 'audio', 'video', 'source')) and $u->hasAttribute('src')) {
        $uValue = $u->getAttribute('src');
    } elseif ($tag == 'object' and $u->hasAttribute('data')) {
        $uValue = $u->getAttribute('data');
    }

    if ($uValue !== null) {
        return $this->resolveUrl($uValue);
    }

    $classTitle = $this->parseValueClassTitle($u);
    if ($classTitle !== null) {
        return $classTitle;
    }

    if ($tag == 'abbr' and $u->hasAttribute('title')) {
        return $u->getAttribute('title');
    }

    if (in_array($tag, array('data', 'input')) and $u->hasAttribute('value')) {
        return $u->getAttribute('value');
    }

    return unicodeTrim($this->textContent($u));
}
/**
 * Given an element with class="dt-*", get the value of the datetime as a string.
 *
 * Two modes:
 *  - value-class pattern: the direct children with class "value" or
 *    "value-title" each contribute a part. A part that is a complete ISO8601
 *    datetime wins outright; otherwise the first date-like part and the first
 *    time-like part are combined as DATE, TIME or DATE"T"TIME. Parts that are
 *    neither leave the value empty.
 *  - plain: the value is taken from alt (img/area), value (data), title
 *    (abbr), datetime (del/ins/time), falling back to the node's text.
 *
 * Every date (YYYY-MM-DD) seen is appended to $dates. If the resulting value
 * is only a time and $dates is non-empty, the most recently recorded date is
 * prepended so that the result is a full datetime.
 *
 * @param DOMElement $dt The element to parse
 * @param array $dates Array of dates processed so far (updated by reference)
 * @return string|false The datetime string found; false or '' when nothing usable was found
 */
public function parseDT(\DOMElement $dt, &$dates = array()) {
    // Check for value-class pattern
    $valueClassChildren = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ") or contains(concat(" ", @class, " "), " value-title ")]', $dt);
    $dtValue = false;

    if ($valueClassChildren->length > 0) {
        // They’re using value-class
        $dateParts = array();

        foreach ($valueClassChildren as $e) {
            if (strstr(' ' . $e->getAttribute('class') . ' ', ' value-title ')) {
                $title = $e->getAttribute('title');
                if (!empty($title)) {
                    $dateParts[] = $title;
                }
            } elseif ($e->tagName == 'img' or $e->tagName == 'area') {
                // Use @alt
                $alt = $e->getAttribute('alt');
                if (!empty($alt)) {
                    $dateParts[] = $alt;
                }
            } elseif ($e->tagName == 'data') {
                // Use @value, otherwise innertext
                $value = $e->hasAttribute('value') ? $e->getAttribute('value') : unicodeTrim($e->nodeValue);
                if (!empty($value)) {
                    $dateParts[] = $value;
                }
            } elseif ($e->tagName == 'abbr') {
                // Use @title, otherwise innertext
                $title = $e->hasAttribute('title') ? $e->getAttribute('title') : unicodeTrim($e->nodeValue);
                if (!empty($title)) {
                    $dateParts[] = $title;
                }
            } elseif ($e->tagName == 'del' or $e->tagName == 'ins' or $e->tagName == 'time') {
                // Use @datetime if available, otherwise innertext
                $dtAttr = ($e->hasAttribute('datetime')) ? $e->getAttribute('datetime') : unicodeTrim($e->nodeValue);
                if (!empty($dtAttr)) {
                    $dateParts[] = $dtAttr;
                }
            } else {
                if (!empty($e->nodeValue)) {
                    $dateParts[] = unicodeTrim($e->nodeValue);
                }
            }
        }

        // Look through dateParts
        $datePart = '';
        $timePart = '';

        foreach ($dateParts as $part) {
            // Is this part a full ISO8601 datetime?
            if (preg_match('/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}(?::\d{2})?(?:Z?[+-]\d{2}:?\d{2})?$/', $part)) {
                // Break completely, we’ve got our value.
                $dtValue = $part;
                break;
            }

            // Is the current part a valid time(+TZ?) AND no other time representation has been found?
            if ((preg_match('/\d{1,2}:\d{1,2}(Z?[+-]\d{2}:?\d{2})?/', $part) or preg_match('/\d{1,2}[ap]m/', $part)) and empty($timePart)) {
                $timePart = $part;
            } elseif (preg_match('/\d{4}-\d{2}-\d{2}/', $part) and empty($datePart)) {
                // Is the current part a valid date AND no other date representation has been found?
                $datePart = $part;
            }

            if (!empty($datePart) && !in_array($datePart, $dates)) {
                $dates[] = $datePart;
            }

            if (empty($datePart) && empty($timePart)) {
                // Nothing recognisable yet: keep the value empty instead of
                // emitting a lone "T" separator.
                $dtValue = '';
            } elseif (empty($datePart)) {
                $timePart = convertTimeFormat($timePart);
                $dtValue = unicodeTrim($timePart);
            } elseif (empty($timePart)) {
                $dtValue = rtrim($datePart, 'T');
            } else {
                $timePart = convertTimeFormat($timePart);
                $dtValue = rtrim($datePart, 'T') . 'T' . unicodeTrim($timePart);
            }
        }
    } else {
        // Not using value-class (phew).
        if ($dt->tagName == 'img' or $dt->tagName == 'area') {
            // Use @alt
            $alt = $dt->getAttribute('alt');
            if (!empty($alt)) {
                $dtValue = $alt;
            }
        } elseif (in_array($dt->tagName, array('data'))) {
            // Use @value, otherwise innertext
            $value = $dt->getAttribute('value');
            $dtValue = !empty($value) ? $value : $dt->nodeValue;
        } elseif ($dt->tagName == 'abbr') {
            // Use @title, otherwise innertext
            $title = $dt->getAttribute('title');
            $dtValue = !empty($title) ? $title : $dt->nodeValue;
        } elseif ($dt->tagName == 'del' or $dt->tagName == 'ins' or $dt->tagName == 'time') {
            // Use @datetime if available, otherwise innertext
            $dtAttr = $dt->getAttribute('datetime');
            $dtValue = !empty($dtAttr) ? $dtAttr : $dt->nodeValue;
        } else {
            $dtValue = $dt->nodeValue;
        }

        if (preg_match('/(\d{4}-\d{2}-\d{2})/', (string) $dtValue, $matches)) {
            $dates[] = $matches[0];
        }
    }

    /**
     * if $dtValue is only a time and there are recently parsed dates,
     * form the full date-time using the most recently parsed dt- value
     */
    $candidate = (string) $dtValue;
    if ((preg_match('/^\d{1,2}:\d{1,2}(Z?[+-]\d{2}:?\d{2})?/', $candidate) or preg_match('/^\d{1,2}[ap]m/', $candidate)) && !empty($dates)) {
        $dtValue = convertTimeFormat($dtValue);
        $dtValue = end($dates) . 'T' . unicodeTrim($dtValue);
    }

    return $dtValue;
}
/**
 * Given the root element of some embedded markup, return a representation of that markup.
 *
 * If the value-class pattern is in use on $e, its string result is returned
 * directly. Otherwise relative URLs in descendants are resolved and the
 * result is an array with the serialised child nodes under 'html' and the
 * trimmed text content under 'value'.
 *
 * @param DOMElement $e The element to parse
 * @return string|array value-class string, or array('html' => ..., 'value' => ...)
 *
 * @todo need to mark this element as e- parsed so it doesn’t get parsed as its parent’s e-* too
 */
public function parseE(\DOMElement $e) {
    $fromValueClass = $this->parseValueClassTitle($e);
    if ($fromValueClass !== null) {
        return $fromValueClass;
    }

    // Expand relative URLs within children of this element
    // TODO: as it is this is not relative to only children, make this .// and rerun tests
    $this->resolveChildUrls($e);

    $fragments = array();
    foreach ($e->childNodes as $child) {
        $fragments[] = $child->C14N();
    }

    return array(
        'html' => implode('', $fragments),
        'value' => unicodeTrim($this->textContent($e))
    );
}
/**
 * Recursively parse microformats
 *
 * Steps, in order:
 *  1. nested h-* descendants are parsed recursively; those that also carry a
 *     property class become property values, the rest become children;
 *  2. p-*, u-*, dt-* and e-* descendants not yet handled are parsed and
 *     stored under their property names;
 *  3. name, photo and url are implied from the element itself or from a
 *     sole nested img/area/a when they were not set explicitly;
 *  4. the result array is assembled.
 *
 * An implied photo is only produced from a non-empty src attribute.
 *
 * @param DOMElement $e The element to parse
 * @return array|null A representation of the values contained within microformat $e,
 *                    or null if $e has already been parsed
 */
public function parseH(\DOMElement $e) {
    // If it’s already been parsed (e.g. is a child mf), skip
    if ($this->parsed->contains($e)) {
        return null;
    }

    // Get current µf name
    $mfTypes = mfNamesFromElement($e, 'h-');

    // Initialise vars to store the representation in
    $return = array();
    $children = array();
    $dates = array();
    $coords = '';
    $shape = '';
    $url = null;

    // Handle nested microformats (h-*)
    foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," h-")]', $e) as $subMF) {
        // Parse
        $result = $this->parseH($subMF);

        // If result was already parsed, skip it
        if (null === $result) {
            continue;
        }

        // In most cases, the value attribute of the nested microformat should be the p- parsed value of the element.
        // The only times this is different is when the microformat is nested under certain prefixes, which are handled below.
        $result['value'] = $this->parseP($subMF);

        // Does this µf have any property names other than h-*?
        $properties = nestedMfPropertyNamesFromElement($subMF);

        if (!empty($properties)) {
            // Yes! It’s a nested property µf
            foreach ($properties as $property => $prefixes) {
                // Note: handling microformat nesting under multiple conflicting prefixes is not currently specified by the mf2 parsing spec.
                $prefixSpecificResult = $result;
                if (in_array('p-', $prefixes)) {
                    $prefixSpecificResult['value'] = $prefixSpecificResult['properties']['name'][0];
                } elseif (in_array('e-', $prefixes)) {
                    $eParsedResult = $this->parseE($subMF);
                    $prefixSpecificResult['html'] = $eParsedResult['html'];
                    $prefixSpecificResult['value'] = $eParsedResult['value'];
                } elseif (in_array('u-', $prefixes)) {
                    $prefixSpecificResult['value'] = $this->parseU($subMF);
                }

                $return[$property][] = $prefixSpecificResult;
            }
        } else {
            // No, it’s a child µf
            $children[] = $result;
        }

        // Make sure this sub-mf won’t get parsed as a µf or property
        // TODO: Determine if clearing this is required?
        foreach (array('h', 'p', 'u', 'dt', 'e') as $handledPrefix) {
            $this->elementPrefixParsed($subMF, $handledPrefix);
        }
    }

    if ($e->tagName == 'area') {
        $coords = $e->getAttribute('coords');
        $shape = $e->getAttribute('shape');
    }

    // Handle p-*
    foreach ($this->xpath->query('.//*[contains(concat(" ", @class) ," p-")]', $e) as $p) {
        if ($this->isElementParsed($p, 'p')) {
            continue;
        }

        $pValue = $this->parseP($p);

        // Add the value to the array for its p- properties
        foreach (mfNamesFromElement($p, 'p-') as $propName) {
            if (!empty($propName)) {
                $return[$propName][] = $pValue;
            }
        }

        // Make sure this sub-mf won’t get parsed as a top level mf
        $this->elementPrefixParsed($p, 'p');
    }

    // Handle u-*
    foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," u-")]', $e) as $u) {
        if ($this->isElementParsed($u, 'u')) {
            continue;
        }

        $uValue = $this->parseU($u);

        // Add the value to the array for its property types
        foreach (mfNamesFromElement($u, 'u-') as $propName) {
            $return[$propName][] = $uValue;
        }

        // Make sure this sub-mf won’t get parsed as a top level mf
        $this->elementPrefixParsed($u, 'u');
    }

    // Handle dt-*
    foreach ($this->xpath->query('.//*[contains(concat(" ", @class), " dt-")]', $e) as $dt) {
        if ($this->isElementParsed($dt, 'dt')) {
            continue;
        }

        $dtValue = $this->parseDT($dt, $dates);

        if ($dtValue) {
            // Add the value to the array for dt- properties
            foreach (mfNamesFromElement($dt, 'dt-') as $propName) {
                $return[$propName][] = $dtValue;
            }
        }

        // Make sure this sub-mf won’t get parsed as a top level mf
        $this->elementPrefixParsed($dt, 'dt');
    }

    // Handle e-*
    foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," e-")]', $e) as $em) {
        if ($this->isElementParsed($em, 'e')) {
            continue;
        }

        $eValue = $this->parseE($em);

        if ($eValue) {
            // Add the value to the array for e- properties
            foreach (mfNamesFromElement($em, 'e-') as $propName) {
                $return[$propName][] = $eValue;
            }
        }

        // Make sure this sub-mf won’t get parsed as a top level mf
        $this->elementPrefixParsed($em, 'e');
    }

    // Implied Properties

    // Check for p-name
    if (!array_key_exists('name', $return)) {
        $impliedName = null;

        if (($e->tagName == 'img' or $e->tagName == 'area') and $e->getAttribute('alt') != '') {
            // Element's own @alt
            $impliedName = $e->getAttribute('alt');
        } elseif ($e->tagName == 'abbr' and $e->hasAttribute('title')) {
            $impliedName = $e->getAttribute('title');
        } else {
            // @alt of a sole nested img, sole nested area, then the same one
            // level deeper; first hit wins. Candidates that are themselves
            // microformats (carry an h-* class) are skipped.
            $altQueries = array(
                './img[count(preceding-sibling::*)+count(following-sibling::*)=0]',
                './area[count(preceding-sibling::*)+count(following-sibling::*)=0]',
                './*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]',
                './*[count(preceding-sibling::*)+count(following-sibling::*)=0]/area[count(preceding-sibling::*)+count(following-sibling::*)=0]',
            );

            foreach ($altQueries as $altQuery) {
                foreach ($this->xpath->query($altQuery, $e) as $em) {
                    $emNames = mfNamesFromElement($em, 'h-');
                    if (empty($emNames) && $em->getAttribute('alt') != '') {
                        $impliedName = $em->getAttribute('alt');
                        break 2;
                    }
                }
            }
        }

        if ($impliedName === null) {
            // Last resort: the element's text.
            $impliedName = $e->nodeValue;
        }

        $return['name'][] = unicodeTrim($impliedName);
    }

    // Check for u-photo
    if (!array_key_exists('photo', $return)) {
        $impliedPhoto = null;

        if ($e->tagName == 'img') {
            // Element's own @src, but only if it actually has one.
            if ($e->getAttribute('src') != '') {
                $impliedPhoto = $e->getAttribute('src');
            }
        } else {
            // @src of a sole nested img, then of a sole doubly nested img.
            $srcQueries = array(
                './img[count(preceding-sibling::*)+count(following-sibling::*)=0]',
                './*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]',
            );

            foreach ($srcQueries as $srcQuery) {
                foreach ($this->xpath->query($srcQuery, $e) as $em) {
                    if ($em->getAttribute('src') != '') {
                        $impliedPhoto = $em->getAttribute('src');
                        break 2;
                    }
                }
            }
        }

        if ($impliedPhoto !== null) {
            $return['photo'][] = $this->resolveUrl($impliedPhoto);
        }
    }

    // Check for u-url
    if (!array_key_exists('url', $return)) {
        // Element's own @href
        if ($e->tagName == 'a' or $e->tagName == 'area') {
            $url = $e->getAttribute('href');
        }

        // Look for nested a @href
        foreach ($this->xpath->query('./a[count(preceding-sibling::a)+count(following-sibling::a)=0]', $e) as $em) {
            $emNames = mfNamesFromElement($em, 'h-');
            if (empty($emNames)) {
                $url = $em->getAttribute('href');
                break;
            }
        }

        // Look for nested area @href
        foreach ($this->xpath->query('./area[count(preceding-sibling::area)+count(following-sibling::area)=0]', $e) as $em) {
            $emNames = mfNamesFromElement($em, 'h-');
            if (empty($emNames)) {
                $url = $em->getAttribute('href');
                break;
            }
        }

        if (!empty($url)) {
            $return['url'][] = $this->resolveUrl($url);
        }
    }

    // Make sure things are in alphabetical order
    sort($mfTypes);

    // Phew. Return the final result.
    $parsed = array(
        'type' => $mfTypes,
        'properties' => $return
    );

    if (!empty($shape)) {
        $parsed['shape'] = $shape;
    }

    if (!empty($coords)) {
        $parsed['coords'] = $coords;
    }

    if (!empty($children)) {
        $parsed['children'] = array_values(array_filter($children));
    }

    return $parsed;
}
/**
 * Parse Rels and Alternates
 *
 * Walks every element that has both a rel and an href attribute. The rel
 * value is split on any run of HTML whitespace (space, tab, line feed, form
 * feed, carriage return). Links whose rel contains `alternate` produce an
 * entry in $alternates (url, remaining rel values, plus media, hreflang,
 * title, type and text when present); all other links add their resolved
 * href under each of their rel values in $rels.
 *
 * Returns array($rels, $alternates). If $rels is empty, its type depends on
 * $this->jsonMode: a stdClass instance when true (so it serialises as a JSON
 * object), otherwise an empty array.
 *
 * @return array Two-element array: rels map (or stdClass) and list of alternates
 */
public function parseRelsAndAlternates() {
    $rels = array();
    $alternates = array();

    // Iterate through all elements with both rel and href attributes
    foreach ($this->xpath->query('//*[@rel and @href]') as $hyperlink) {
        $relAttribute = $hyperlink->getAttribute('rel');
        if ($relAttribute == '') {
            continue;
        }

        // Resolve the href
        $href = $this->resolveUrl($hyperlink->getAttribute('href'));

        // Split up the rel into whitespace-separated values
        $linkRels = array_filter(preg_split('/[ \t\n\f\r]+/', $relAttribute, -1, PREG_SPLIT_NO_EMPTY));

        // If alternate in rels, create alternate structure, append
        if (in_array('alternate', $linkRels)) {
            $alt = array(
                'url' => $href,
                'rel' => implode(' ', array_diff($linkRels, array('alternate')))
            );

            // Copy over the optional descriptive attributes that are present.
            foreach (array('media', 'hreflang', 'title', 'type') as $attribute) {
                if ($hyperlink->hasAttribute($attribute)) {
                    $alt[$attribute] = $hyperlink->getAttribute($attribute);
                }
            }

            if ($hyperlink->nodeValue) {
                $alt['text'] = $hyperlink->nodeValue;
            }

            $alternates[] = $alt;
        } else {
            foreach ($linkRels as $rel) {
                $rels[$rel][] = $href;
            }
        }
    }

    if (empty($rels) and $this->jsonMode) {
        $rels = new stdClass();
    }

    return array($rels, $alternates);
}
/**
 * Kicks off the parsing routine
 *
 * When `$convertClassic` is truthy, convertLegacy() is run first so that
 * legacy classnames gain their microformats2 equivalents before the
 * document is queried.
 *
 * If a DOMElement is given as `$context`, only descendants of that element
 * are searched for microformats. Rels and alternates always come from
 * parseRelsAndAlternates().
 *
 * @param bool $convertClassic whether to run convertLegacy() before parsing. Defaults to true
 * @param DOMElement $context optionally an element from which to parse microformats
 * @return array An array with 'items', 'rels' and, when there are any, 'alternates'
 */
public function parse($convertClassic = true, DOMElement $context = null) {
    if ($convertClassic) {
        $this->convertLegacy();
    }

    // Candidate roots: any element whose class attribute has a token starting with "h-"
    if ($context === null) {
        $candidates = $this->xpath->query('//*[contains(concat(" ", @class), " h-")]');
    } else {
        $candidates = $this->xpath->query('.//*[contains(concat(" ", @class), " h-")]', $context);
    }

    // Parse each candidate; empty results are filtered out below
    $items = array();
    foreach ($candidates as $element) {
        $items[] = $this->parseH($element);
    }

    list($rels, $alternates) = $this->parseRelsAndAlternates();

    $output = array(
        'items' => array_values(array_filter($items)),
        'rels' => $rels
    );

    // 'alternates' is only present when at least one was found
    if (count($alternates) > 0) {
        $output['alternates'] = $alternates;
    }

    return $output;
}
/**
 * Parse From ID
 *
 * Given an ID, parse all microformats which are descendants of the element
 * with that ID.
 *
 * Note that rel values are still document-wide.
 *
 * If an element with the ID is not found, an empty skeleton mf2 array
 * structure is returned.
 *
 * @param string $id
 * @param bool $convertClassic = true whether to run convertLegacy() before parsing
 * @return array
 */
public function parseFromId($id, $convertClassic=true) {
    $id = (string) $id;

    // XPath 1.0 string literals have no escape syntax, so choose a quoting
    // style the id cannot break out of (concat() when it contains both kinds)
    if (strpos($id, "'") === false) {
        $literal = "'" . $id . "'";
    } elseif (strpos($id, '"') === false) {
        $literal = '"' . $id . '"';
    } else {
        $literal = "concat('" . str_replace("'", "', \"'\", '", $id) . "')";
    }

    $matches = $this->xpath->query('//*[@id=' . $literal . ']');

    // A DOMNodeList object is never empty() — inspect its length instead,
    // otherwise a missing id would fall through and parse the whole document
    if ($matches === false || $matches->length === 0) {
        return array('items' => array(), 'rels' => array(), 'alternates' => array());
    }

    return $this->parse($convertClassic, $matches->item(0));
}
/**
 * Convert Legacy Classnames
 *
 * Adds microformats2 classnames into a document containing only legacy
 * semantic classnames. Root classnames come from $this->classicRootMap;
 * property classnames come from $this->classicPropertyMap and are only
 * applied to descendants of an element carrying the matching legacy root.
 * An element is left untouched when its class attribute already contains
 * the replacement string as written in the map.
 *
 * @return Parser $this
 */
public function convertLegacy() {
    $xp = new DOMXPath($this->doc);

    // Replace all roots
    foreach ($this->classicRootMap as $old => $new) {
        $rootQuery = '//*[contains(concat(" ", @class, " "), " ' . $old . ' ") and not(contains(concat(" ", @class, " "), " ' . $new . ' "))]';

        foreach ($xp->query($rootQuery) as $el) {
            $el->setAttribute('class', $el->getAttribute('class') . ' ' . $new);
        }
    }

    // Replace properties, scoped to descendants of the matching legacy root.
    // Only the legacy root name is needed here; the new root name is not looked up.
    foreach ($this->classicPropertyMap as $oldRoot => $properties) {
        foreach ($properties as $old => $new) {
            $propertyQuery = '//*[contains(concat(" ", @class, " "), " ' . $oldRoot . ' ")]//*[contains(concat(" ", @class, " "), " ' . $old . ' ") and not(contains(concat(" ", @class, " "), " ' . $new . ' "))]';

            foreach ($xp->query($propertyQuery) as $el) {
                $el->setAttribute('class', $el->getAttribute('class') . ' ' . $new);
            }
        }
    }

    return $this;
}
/**
 * XPath Query
 *
 * Convenience pass-through: forwards both arguments unchanged to
 * `$this->xpath->query()` and hands back whatever that call produces, so it
 * behaves the same way as DOMXPath::query.
 *
 * @param string $expression the XPath expression to evaluate
 * @param DOMNode $context optional node the expression is evaluated relative to
 * @return DOMNodeList
 */
public function query($expression, $context = null) {
    $nodes = $this->xpath->query($expression, $context);

    return $nodes;
}
/**
 * Classic Root Classname map
 *
 * Keys are legacy root classnames; each value is the microformats2 root
 * classname that convertLegacy() appends to elements carrying the key.
 */
public $classicRootMap = array(
    'vcard'    => 'h-card',
    'hfeed'    => 'h-feed',
    'hentry'   => 'h-entry',
    'hrecipe'  => 'h-recipe',
    'hresume'  => 'h-resume',
    'vevent'   => 'h-event',
    'hreview'  => 'h-review',
    'hproduct' => 'h-product'
);
/**
 * Classic Property Classname map
 *
 * Outer keys are legacy root classnames; each inner array maps a legacy
 * property classname to the microformats2 classname(s) that convertLegacy()
 * appends to matching descendants of that root. A value may hold several
 * space-separated classnames.
 *
 * Every key appears once per root: PHP silently keeps only the last value of
 * a repeated array key, so duplicates would hide later edits.
 */
public $classicPropertyMap = array(
    'vcard' => array(
        'fn' => 'p-name',
        'url' => 'u-url',
        'honorific-prefix' => 'p-honorific-prefix',
        'given-name' => 'p-given-name',
        'additional-name' => 'p-additional-name',
        'family-name' => 'p-family-name',
        'honorific-suffix' => 'p-honorific-suffix',
        'nickname' => 'p-nickname',
        'email' => 'u-email',
        'logo' => 'u-logo',
        'photo' => 'u-photo',
        'uid' => 'u-uid',
        'category' => 'p-category',
        'adr' => 'p-adr h-adr',
        'extended-address' => 'p-extended-address',
        'street-address' => 'p-street-address',
        'locality' => 'p-locality',
        'region' => 'p-region',
        'postal-code' => 'p-postal-code',
        'country-name' => 'p-country-name',
        'label' => 'p-label',
        'geo' => 'p-geo h-geo',
        'latitude' => 'p-latitude',
        'longitude' => 'p-longitude',
        'tel' => 'p-tel',
        'note' => 'p-note',
        'bday' => 'dt-bday',
        'key' => 'u-key',
        'org' => 'p-org',
        'organization-name' => 'p-organization-name',
        'organization-unit' => 'p-organization-unit',
    ),
    'hentry' => array(
        'entry-title' => 'p-name',
        'entry-summary' => 'p-summary',
        'entry-content' => 'e-content',
        'published' => 'dt-published',
        'updated' => 'dt-updated',
        'author' => 'p-author h-card',
        'category' => 'p-category',
        'geo' => 'p-geo h-geo',
        'latitude' => 'p-latitude',
        'longitude' => 'p-longitude',
    ),
    'hrecipe' => array(
        'fn' => 'p-name',
        'ingredient' => 'p-ingredient',
        'yield' => 'p-yield',
        'instructions' => 'e-instructions',
        'duration' => 'dt-duration',
        'nutrition' => 'p-nutrition',
        'photo' => 'u-photo',
        'summary' => 'p-summary',
        'author' => 'p-author h-card',
    ),
    'hresume' => array(
        'summary' => 'p-summary',
        'contact' => 'h-card p-contact',
        'education' => 'h-event p-education',
        'experience' => 'h-event p-experience',
        'skill' => 'p-skill',
        'affiliation' => 'p-affiliation h-card',
    ),
    'vevent' => array(
        'dtstart' => 'dt-start',
        'dtend' => 'dt-end',
        'duration' => 'dt-duration',
        'description' => 'p-description',
        'summary' => 'p-summary',
        'url' => 'u-url',
        'category' => 'p-category',
        'location' => 'h-card',
        'geo' => 'p-location h-geo',
    ),
    'hreview' => array(
        'summary' => 'p-name',
        'fn' => 'p-item h-item p-name', // doesn't work properly, see spec
        'photo' => 'u-photo', // of the item being reviewed (p-item h-item u-photo)
        'url' => 'u-url', // of the item being reviewed (p-item h-item u-url)
        'reviewer' => 'p-reviewer p-author h-card',
        'dtreviewed' => 'dt-reviewed',
        'rating' => 'p-rating',
        'best' => 'p-best',
        'worst' => 'p-worst',
        'description' => 'p-description',
    ),
    'hproduct' => array(
        'fn' => 'p-name',
        'photo' => 'u-photo',
        'brand' => 'p-brand',
        'category' => 'p-category',
        'description' => 'p-description',
        'identifier' => 'u-identifier',
        'url' => 'u-url',
        'review' => 'p-review h-review',
        'price' => 'p-price',
    ),
);
- }
/**
 * Split a URI into the five components used by the resolution code:
 * scheme, authority, path, query and fragment.
 *
 * The authority is rebuilt from the user, pass, host and port pieces that
 * parse_url() reports, in the form `user:pass@host:port`, and is only set
 * when a host is present. Components that are absent stay null.
 *
 * @param string $uri the URI (or URI reference) to split
 * @return array with keys 'scheme', 'authority', 'path', 'query', 'fragment'
 */
function parseUriToComponents($uri) {
    $result = array(
        'scheme' => null,
        'authority' => null,
        'path' => null,
        'query' => null,
        'fragment' => null
    );

    $u = @parse_url($uri);

    // parse_url() returns false for seriously malformed URLs; report every
    // component as absent instead of probing a non-array for keys
    if (!is_array($u)) {
        return $result;
    }

    if (isset($u['scheme'])) {
        $result['scheme'] = $u['scheme'];
    }

    if (isset($u['host'])) {
        $hasUser = isset($u['user']);
        $hasPass = isset($u['pass']);

        $authority = '';
        if ($hasUser) {
            $authority .= $u['user'];
        }
        if ($hasPass) {
            $authority .= ':' . $u['pass'];
        }
        if ($hasUser || $hasPass) {
            $authority .= '@';
        }
        $authority .= $u['host'];
        if (isset($u['port'])) {
            $authority .= ':' . $u['port'];
        }

        $result['authority'] = $authority;
    }

    foreach (array('path', 'query', 'fragment') as $component) {
        if (isset($u[$component])) {
            $result[$component] = $u[$component];
        }
    }

    return $result;
}
/**
 * Resolve a URI reference against a base URI.
 *
 * Follows the reference-resolution steps of RFC 3986 that the section
 * markers below refer to: pre-parse the base (5.2.1), transform the
 * reference (5.2.2) and recompose the components (5.3).
 *
 * A component counts as present when it is a non-empty string, so values
 * such as a host, query or fragment of "0" are kept rather than being
 * dropped by a truthiness test.
 *
 * @param string $baseURI the URI the reference is resolved against
 * @param string $referenceURI the (possibly relative) reference
 * @return string the resolved URI
 */
function resolveUrl($baseURI, $referenceURI) {
    $target = array(
        'scheme' => null,
        'authority' => null,
        'path' => null,
        'query' => null,
        'fragment' => null
    );

    # 5.2.1 Pre-parse the Base URI
    # The base URI (Base) is parsed into the five main components
    $base = parseUriToComponents($baseURI);

    # If base path is blank (http://example.com) then set it to /
    if ((string) $base['path'] === '') {
        $base['path'] = '/';
    }

    # 5.2.2. Transform References
    # (R.scheme, R.authority, R.path, R.query, R.fragment) = parse(R);
    $reference = parseUriToComponents($referenceURI);

    # A non-strict parser may ignore a scheme in the reference
    # if it is identical to the base URI's scheme.
    # TODO
    if ((string) $reference['scheme'] !== '') {
        # Reference is absolute: take everything from it
        $target['scheme'] = $reference['scheme'];
        $target['authority'] = $reference['authority'];
        $target['path'] = removeDotSegments($reference['path']);
        $target['query'] = $reference['query'];
    } else {
        $target['scheme'] = $base['scheme'];

        if ((string) $reference['authority'] !== '') {
            # Network-path reference: only the scheme comes from the base
            $target['authority'] = $reference['authority'];
            $target['path'] = removeDotSegments($reference['path']);
            $target['query'] = $reference['query'];
        } else {
            $target['authority'] = $base['authority'];

            if ((string) $reference['path'] === '') {
                # Same document: keep the base path, prefer the reference query
                $target['path'] = $base['path'];
                $target['query'] = ((string) $reference['query'] !== '')
                    ? $reference['query']
                    : $base['query'];
            } elseif (substr($reference['path'], 0, 1) == '/') {
                # Absolute-path reference
                $target['path'] = removeDotSegments($reference['path']);
                $target['query'] = $reference['query'];
            } else {
                # Relative-path reference: merge with the base path first
                $target['path'] = removeDotSegments(mergePaths($base, $reference));
                $target['query'] = $reference['query'];
            }
        }
    }

    $target['fragment'] = $reference['fragment'];

    # 5.3 Component Recomposition
    $result = '';

    if ((string) $target['scheme'] !== '') {
        $result .= $target['scheme'] . ':';
    }

    if ((string) $target['authority'] !== '') {
        $result .= '//' . $target['authority'];
    }

    $result .= $target['path'];

    if ((string) $target['query'] !== '') {
        $result .= '?' . $target['query'];
    }

    if ((string) $target['fragment'] !== '') {
        $result .= '#' . $target['fragment'];
    } elseif ($referenceURI == '#') {
        # A bare "#" reference keeps its (empty) fragment marker
        $result .= '#';
    }

    return $result;
}
# 5.2.3 Merge Paths
/**
 * Merge a relative reference path with the base path (RFC 3986, 5.2.3).
 *
 * @param array $base component array of the base URI; reads 'authority' and 'path'
 * @param array $reference component array of the reference; reads 'path'
 * @return string the merged, not yet dot-segment-normalised, path
 */
function mergePaths($base, $reference) {
    $basePath = (string) $base['path'];
    $referencePath = (string) $reference['path'];

    # If the base URI has a defined authority component and an empty path,
    # then return a string consisting of "/" concatenated with the
    # reference's path; otherwise,
    if ((string) $base['authority'] !== '' && $basePath === '') {
        return '/' . $referencePath;
    }

    $lastSlash = strrpos($basePath, '/');

    # the base path is excluded entirely if it does not contain any "/"
    # characters, leaving only the reference's path.
    if ($lastSlash === false) {
        return $referencePath;
    }

    # Otherwise return the reference's path appended to all but the last
    # segment of the base path (i.e. everything up to and including the
    # right-most "/").
    return substr($basePath, 0, $lastSlash + 1) . $referencePath;
}
# 5.2.4.A Remove leading ../ or ./
/**
 * Strip one leading "../" or "./" prefix from the input buffer, in place.
 * The buffer is left untouched when it starts with neither prefix.
 *
 * @param string $input input buffer, modified by reference
 */
function removeLeadingDotSlash(&$input) {
    // "../" is tested first, matching the order of the rule
    foreach (array('../', './') as $prefix) {
        $length = strlen($prefix);

        if (substr($input, 0, $length) == $prefix) {
            $input = substr($input, $length);
            return;
        }
    }
}
# 5.2.4.B Replace leading /. with /
/**
 * Replace a leading "/./" (or, failing that, the first two characters,
 * which the caller guarantees to be a lone "/.") with "/" in the input
 * buffer, in place.
 *
 * @param string $input input buffer, modified by reference
 */
function removeLeadingSlashDot(&$input) {
    // "/./" consumes three characters, the bare "/." form only two
    $consumed = (substr($input, 0, 3) == '/./') ? 3 : 2;

    $input = '/' . substr($input, $consumed);
}
# 5.2.4.C Given leading /../ remove component from output buffer
/**
 * Replace a leading "/../" (or a lone "/..") in the input buffer with "/"
 * and drop the last segment, together with its preceding "/", from the
 * output buffer. Both buffers are modified in place.
 *
 * @param string $input input buffer, modified by reference
 * @param string $output output buffer, modified by reference
 */
function removeOneDirLevel(&$input, &$output) {
    // "/../" consumes four characters, the bare "/.." form only three
    $consumed = (substr($input, 0, 4) == '/../') ? 4 : 3;
    $input = '/' . substr($input, $consumed);

    // With no "/" in the output the whole buffer is the last segment, so it
    // is emptied explicitly instead of passing strrpos()'s false as a length
    $lastSlash = strrpos($output, '/');
    $output = ($lastSlash === false) ? '' : substr($output, 0, $lastSlash);
}
# 5.2.4.D Remove . and .. if it's the only thing in the input
/**
 * Drop a lone "." (one character) or, otherwise, the first two characters
 * (a lone "..", as guaranteed by the caller) from the input buffer, in place.
 *
 * @param string $input input buffer, modified by reference
 */
function removeLoneDotDot(&$input) {
    $consumed = ($input == '.') ? 1 : 2;

    $input = substr($input, $consumed);
}
# 5.2.4.E Move one segment from input to output
/**
 * Move the first path segment of the input buffer (including its leading
 * "/", if there is one) to the end of the output buffer, stopping just
 * before the next "/". When no further "/" exists the whole input is moved
 * and the input buffer becomes empty. Both buffers are modified in place.
 *
 * @param string $input input buffer, modified by reference
 * @param string $output output buffer, modified by reference
 */
function moveOneSegmentFromInput(&$input, &$output) {
    // Skip over a leading "/" so the search finds the slash that ends the segment
    $searchFrom = (substr($input, 0, 1) != '/') ? 0 : 1;
    $nextSlash = strpos($input, '/', $searchFrom);

    if ($nextSlash !== false) {
        $output .= substr($input, 0, $nextSlash);
        $input = substr($input, $nextSlash);
        return;
    }

    // Last segment: everything that is left moves across
    $output .= $input;
    $input = '';
}
# 5.2.4 Remove Dot Segments
/**
 * Remove "." and ".." segments from a path (RFC 3986, 5.2.4).
 *
 * The path is consumed from an input buffer into an output buffer, one rule
 * (A to E below) per iteration. The loop runs until the input buffer is the
 * empty string, so a remaining segment of "0" is processed rather than being
 * mistaken for an empty buffer.
 *
 * @param string|null $path the path to normalise; null is treated as ''
 * @return string the path with dot segments removed
 */
function removeDotSegments($path) {
    # 1. The input buffer is initialized with the now-appended path
    #    components and the output buffer is initialized to the empty
    #    string.
    $input = (string) $path;
    $output = '';

    # 2. While the input buffer is not empty, loop as follows:
    while ((string) $input !== '') {
        if (substr($input, 0, 3) == '../' || substr($input, 0, 2) == './') {
            # A. If the input buffer begins with a prefix of "../" or "./",
            #    then remove that prefix from the input buffer; otherwise,
            removeLeadingDotSlash($input);
        } elseif (substr($input, 0, 3) == '/./' || $input == '/.') {
            # B. if the input buffer begins with a prefix of "/./" or "/.",
            #    where "." is a complete path segment, then replace that
            #    prefix with "/" in the input buffer; otherwise,
            removeLeadingSlashDot($input);
        } elseif (substr($input, 0, 4) == '/../' || $input == '/..') {
            # C. if the input buffer begins with a prefix of "/../" or "/..",
            #    where ".." is a complete path segment, then replace that
            #    prefix with "/" in the input buffer and remove the last
            #    segment and its preceding "/" (if any) from the output
            #    buffer; otherwise,
            removeOneDirLevel($input, $output);
        } elseif ($input == '.' || $input == '..') {
            # D. if the input buffer consists only of "." or "..", then remove
            #    that from the input buffer; otherwise,
            removeLoneDotDot($input);
        } else {
            # E. move the first path segment in the input buffer to the end of
            #    the output buffer and any subsequent characters up to, but not
            #    including, the next "/" character or the end of the input buffer
            moveOneSegmentFromInput($input, $output);
        }
    }

    # 3. Finally, the output buffer is returned as the result.
    return $output;
}
|