12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314 |
- <?php
- namespace Mf2;
- use DOMDocument;
- use DOMElement;
- use DOMXPath;
- use DOMNode;
- use DOMNodeList;
- use Exception;
- use SplObjectStorage;
- use stdClass;
/**
 * Parse microformats2 from HTML.
 *
 * Convenience wrapper for the most common case: constructs a Parser for the
 * given input and URL and returns the result of its parse() call.
 *
 * Example usage:
 *
 *     use Mf2;
 *     $output = Mf2\parse('<span class="h-card">Barnaby Walters</span>');
 *     echo json_encode($output, JSON_PRETTY_PRINT);
 *
 * Produces:
 *
 *     {
 *       "items": [
 *         {
 *           "type": ["h-card"],
 *           "properties": {
 *             "name": ["Barnaby Walters"]
 *           }
 *         }
 *       ],
 *       "rels": {}
 *     }
 *
 * @param string|DOMDocument $input The HTML string or DOMDocument object to parse
 * @param string $url The URL the input document was found at, used to resolve relative URLs
 * @param bool $convertClassic Whether classic microformats should be converted
 * @return array Canonical MF2 array structure
 */
function parse($input, $url = null, $convertClassic = true) {
    return (new Parser($input, $url))->parse($convertClassic);
}
/**
 * Fetch microformats2
 *
 * Given a URL, fetches it (following up to 5 redirects) and, if the content-type appears to be HTML, returns the parsed
 * microformats2 array structure.
 *
 * Note that even if the response code was a 4XX or 5XX error, if the content-type is HTML-like then it will be parsed
 * all the same, as there are legitimate cases where error pages might contain useful microformats (for example a deleted
 * h-entry resulting in a 410 Gone page with a stub h-entry explaining the reason for deletion). Look in $curlInfo['http_code']
 * for the actual value.
 *
 * @param string $url The URL to fetch
 * @param bool $convertClassic (optional, default true) whether or not to convert classic microformats
 * @param &array $curlInfo (optional) the results of curl_getinfo will be placed in this variable for debugging
 * @return array|null canonical microformats2 array structure on success, null when the transfer failed or the
 *                    response was not delivered as HTML
 */
function fetch($url, $convertClassic = true, &$curlInfo=null) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
    curl_setopt($ch, CURLOPT_HTTPHEADER, array(
        'Accept: text/html'
    ));
    $html = curl_exec($ch);
    $info = $curlInfo = curl_getinfo($ch);
    curl_close($ch);

    // curl_exec() returns false when the transfer itself failed; there is nothing to parse.
    if ($html === false) {
        return null;
    }

    // The content type is null when the server sent no Content-Type header;
    // normalise it to a string before inspecting it.
    $contentType = isset($info['content_type']) ? (string) $info['content_type'] : '';
    if (stripos($contentType, 'html') === false) {
        // The content was not delivered as HTML, do not attempt to parse it.
        return null;
    }

    // Use the final (post-redirect) URL to resolve relative URLs.
    return parse($html, $info['url'], $convertClassic);
}
/**
 * Unicode to HTML Entities
 *
 * Converts the input to the mbstring 'HTML-ENTITIES' encoding, using the
 * encoding reported by mb_detect_encoding() as the source encoding.
 *
 * @param string $input String containing characters to convert into HTML entities
 * @return string
 */
function unicodeToHtmlEntities($input) {
    $sourceEncoding = mb_detect_encoding($input);

    return mb_convert_encoding($input, 'HTML-ENTITIES', $sourceEncoding);
}
/**
 * Collapse Whitespace
 *
 * Collapses any sequences of whitespace within a string into a single space
 * character. Only whitespace is affected; other characters (including "|")
 * are left untouched.
 *
 * @deprecated since v0.2.3
 * @param string $str
 * @return string
 */
function collapseWhitespace($str) {
    // \s already covers newlines; a "|" inside a character class is a literal
    // pipe, not an alternation, so it must not appear here.
    return preg_replace('/\s+/', ' ', $str);
}
/**
 * Trim whitespace, treating the no-break space as ordinary whitespace.
 *
 * Every UTF-8 encoded no-break space (U+00A0) in the string is first replaced
 * with a regular space, then leading and trailing whitespace is removed.
 *
 * @param string $str
 * @return string
 */
function unicodeTrim($str) {
    // "\xC2\xA0" is U+00A0 in UTF-8. Spelling out the bytes avoids decoding an
    // entity through the deprecated 'HTML-ENTITIES' mbstring encoding.
    $str = str_replace("\xC2\xA0", ' ', $str);

    return preg_replace('/^\s+|\s+$/', '', $str);
}
/**
 * Microformat Names From Class string
 *
 * Given the value of @class, get the relevant mf classnames (e.g. h-card,
 * p-name). The class string is split on any run of HTML whitespace (space,
 * tab, line feed, form feed, carriage return).
 *
 * For the default 'h-' prefix the full classname is returned (e.g. 'h-card');
 * for any other prefix the prefix is stripped (e.g. 'p-name' => 'name').
 *
 * @param string $class A whitespace delimited list of classnames
 * @param string $prefix The prefix to look for
 * @return array All matching microformats names, in document order (empty if none)
 */
function mfNamesFromClass($class, $prefix='h-') {
    $classes = preg_split('/[ \t\n\f\r]+/', $class, -1, PREG_SPLIT_NO_EMPTY);

    // Keep only syntactically valid microformats2 classnames.
    $classes = preg_grep('#^(h|p|u|dt|e)-([a-z0-9]+-)?[a-z]+(-[a-z]+)*$#', $classes);

    $prefixLength = strlen($prefix);
    $matches = array();

    foreach ($classes as $classname) {
        // The classname must start with the prefix and have something after it.
        if ($classname !== $prefix && strncmp($classname, $prefix, $prefixLength) === 0) {
            $matches[] = ($prefix === 'h-') ? $classname : substr($classname, $prefixLength);
        }
    }

    return $matches;
}
/**
 * Get Nested µf Property Name From Class
 *
 * Finds every p-, u-, dt- or e- prefixed classname in a whitespace-separated
 * class string (space, tab, line feed, form feed, carriage return) and groups
 * the prefixes by property name.
 *
 * Example: 'p-author u-author e-content' yields
 * array('author' => array('p-', 'u-'), 'content' => array('e-')).
 *
 * @param string $class
 * @return array Map of property name => list of distinct prefixes it was found with
 */
function nestedMfPropertyNamesFromClass($class) {
    $knownPrefixes = array('p-', 'u-', 'dt-', 'e-');
    $propertyNames = array();

    foreach (preg_split('/[ \t\n\f\r]+/', $class, -1, PREG_SPLIT_NO_EMPTY) as $classname) {
        foreach ($knownPrefixes as $prefix) {
            $prefixLength = strlen($prefix);

            // $classname is a property classname for $prefix when it starts
            // with the prefix and has a non-empty name after it.
            if ($classname === $prefix || strncmp($classname, $prefix, $prefixLength) !== 0) {
                continue;
            }

            $propertyName = substr($classname, $prefixLength);

            // Record each prefix once per property, keeping a gap-free list.
            if (!isset($propertyNames[$propertyName]) || !in_array($prefix, $propertyNames[$propertyName], true)) {
                $propertyNames[$propertyName][] = $prefix;
            }
        }
    }

    return $propertyNames;
}
/**
 * Element-based convenience wrapper around mfNamesFromClass().
 *
 * Reads the element's class attribute and passes it, with the prefix, to
 * mfNamesFromClass().
 *
 * @param DOMElement $e The element whose class attribute is inspected
 * @param string $prefix The prefix to look for
 * @return mixed Whatever mfNamesFromClass() returns for the element's class attribute
 */
function mfNamesFromElement(\DOMElement $e, $prefix = 'h-') {
    return mfNamesFromClass($e->getAttribute('class'), $prefix);
}
/**
 * Element-based convenience wrapper around nestedMfPropertyNamesFromClass().
 *
 * @param DOMElement $e The element whose class attribute is inspected
 * @return mixed Whatever nestedMfPropertyNamesFromClass() returns for the element's class attribute
 */
function nestedMfPropertyNamesFromElement(\DOMElement $e) {
    return nestedMfPropertyNamesFromClass($e->getAttribute('class'));
}
/**
 * Converts 12-hour (am/pm) times to 24-hour HH:MM or HH:MM:SS.
 *
 * Times without an am/pm marker are returned unchanged. With a marker, the
 * hour is converted to the 24-hour clock (12am => 00, 12pm => 12, 1pm => 13),
 * missing minutes default to "00", and seconds are only included when they
 * were supplied.
 *
 * @param string $time The time to convert
 * @return string
 */
function convertTimeFormat($time) {
    preg_match('/(\d{1,2}):?(\d{2})?:?(\d{2})?(a\.?m\.?|p\.?m\.?)?/i', $time, $matches);

    // No am/pm marker: nothing to convert.
    if (empty($matches[4])) {
        return $time;
    }

    $meridiem = strtolower(str_replace('.', '', $matches[4]));
    $hours = (int) $matches[1];

    if ($meridiem === 'pm' && $hours < 12) {
        $hours += 12;
    } elseif ($meridiem === 'am' && $hours === 12) {
        // 12am is midnight, i.e. hour 00 on the 24-hour clock.
        $hours = 0;
    }

    $hh = str_pad((string) $hours, 2, '0', STR_PAD_LEFT);
    $mm = empty($matches[2]) ? '00' : $matches[2];

    if (empty($matches[3])) {
        return sprintf('%s:%s', $hh, $mm);
    }

    return sprintf('%s:%s:%s', $hh, $mm, $matches[3]);
}
/**
 * Normalize an ordinal date (YYYY-DDD) to YYYY-MM-DD.
 *
 * The caller is expected to have validated that $dtValue looks like an
 * ordinal date (four-digit year, hyphen, day of year).
 *
 * @param string $dtValue
 * @return string The calendar date, or '' when the day of year is not valid for that year
 */
function normalizeOrdinalDate($dtValue) {
    $parts = explode('-', $dtValue, 2);
    if (count($parts) !== 2) {
        return '';
    }

    list($year, $day) = $parts;
    $day = intval($day);

    if ($day < 1 || $day > 366) {
        return '';
    }

    // "January $day" overflows into the later months, so this lands on the
    // $day-th day of the year. Fixed UTC midnight keeps the result independent
    // of the current time and the default timezone.
    $date = new \DateTime('midnight', new \DateTimeZone('UTC'));
    $date->setDate((int) $year, 1, $day);

    // Day 366 of a non-leap year overflows into the next year: reject it.
    if ($date->format('Y') !== $year) {
        return '';
    }

    return $date->format('Y-m-d');
}
/**
 * If a date value ends with a numeric timezone offset, normalize it to ±HHMM.
 *
 * The offset at the end of $dtValue is rewritten in place (e.g. "-8" becomes
 * "-0800", "+05:30" becomes "+0530"). A trailing "Z" (in either case) is left
 * as-is and is not reported as an offset.
 *
 * @param string $dtValue Date/time value; modified by reference when an offset is normalized
 * @return string|null isolated, normalized TZ offset for implied TZ for other dt- properties,
 *                     or null when the value has no numeric offset
 */
function normalizeTimezoneOffset(&$dtValue) {
    // Both alternatives are anchored to the end of the value, so a stray
    // "z" elsewhere in the string is never mistaken for a timezone.
    if (!preg_match('/(Z|[+-]\d{1,2}:?(\d{2})?)$/i', $dtValue, $matches)) {
        return null;
    }

    if (strtoupper($matches[1]) === 'Z') {
        return null;
    }

    $timezoneString = str_replace(':', '', $matches[1]);
    $sign = substr($timezoneString, 0, 1);
    $digits = substr($timezoneString, 1);

    // Hours only: add the missing minutes.
    if (strlen($digits) <= 2) {
        $digits .= '00';
    }

    // Single-digit hours: left-pad to four digits.
    $digits = str_pad($digits, 4, '0', STR_PAD_LEFT);
    $timezoneOffset = $sign . $digits;

    $dtValue = preg_replace('/Z?[+-]\d{1,2}:?(\d{2})?$/i', $timezoneOffset, $dtValue);

    return $timezoneOffset;
}
/**
 * Apply a URL transformation to every image candidate of a srcset value.
 *
 * The srcset is split on commas; each candidate is split into its URL and
 * its optional descriptor (e.g. "2x", "480w") at the first run of whitespace.
 * Only the URL is passed to $transformation; the descriptor is preserved.
 * Empty candidates are dropped and the result is re-joined with ", ".
 *
 * @param string $srcset The srcset attribute value
 * @param callable $transformation Called with each candidate URL, returns the replacement URL
 * @return string The rebuilt srcset value
 */
function applySrcsetUrlTransformation($srcset, $transformation) {
    $candidates = array();

    foreach (explode(',', trim($srcset)) as $srcsetPart) {
        // Split on whitespace, not on a literal multi-character delimiter,
        // so that the descriptor is actually separated from the URL.
        $parts = preg_split('/\s+/', trim($srcsetPart), 2);
        $url = $parts[0];

        if ($url === '') {
            continue;
        }

        $url = call_user_func($transformation, $url);
        $descriptor = isset($parts[1]) ? trim($parts[1]) : '';

        $candidates[] = ($descriptor === '') ? $url : $url . ' ' . $descriptor;
    }

    return implode(', ', $candidates);
}
- /**
- * Microformats2 Parser
- *
- * A class which holds state for parsing microformats2 from HTML.
- *
- * Example usage:
- *
- * use Mf2;
- * $parser = new Mf2\Parser('<p class="h-card">Barnaby Walters</p>');
- * $output = $parser->parse();
- */
- class Parser {
- /** @var string The baseurl (if any) to use for this parse */
- public $baseurl;
- /** @var DOMXPath object which can be used to query over any fragment*/
- public $xpath;
- /** @var DOMDocument */
- public $doc;
- /** @var SplObjectStorage */
- protected $parsed;
- /**
- * @var bool
- */
- public $jsonMode;
- /** @var boolean Whether to include experimental language parsing in the result */
- public $lang = false;
- /** @var bool Whether to include alternates object (dropped from spec in favor of rel-urls) */
- public $enableAlternates = false;
- /**
- * Elements upgraded to mf2 during backcompat
- * @var SplObjectStorage
- */
- protected $upgraded;
- /**
- * Whether to convert classic microformats
- * @var bool
- */
- public $convertClassic;
/**
 * Constructor
 *
 * Builds the DOM document and XPath helper, determines the base URL (taking
 * the first <base href> element into account) and removes <template> elements.
 *
 * @param DOMDocument|string $input The data to parse. A string of HTML or a DOMDocument.
 *                                  A DOMDocument is cloned; any other type is treated as an empty document.
 * @param string $url The URL of the parsed document, for relative URL resolution
 * @param boolean $jsonMode Whether or not to use a stdClass instance for an empty `rels` dictionary. This breaks PHP looping over rels, but allows the output to be correctly serialized as JSON.
 */
public function __construct($input, $url = null, $jsonMode = false) {
    libxml_use_internal_errors(true);

    if (is_string($input)) {
        if (class_exists('Masterminds\\HTML5')) {
            $html5 = new \Masterminds\HTML5(array('disable_html_ns' => true));
            $doc = $html5->loadHTML($input);
        } else {
            $doc = new DOMDocument();
            $markup = ($input === '') ? '' : unicodeToHtmlEntities($input);
            // On PHP 8, DOMDocument::loadHTML() throws a ValueError for an
            // empty source, which "@" cannot suppress. An empty input simply
            // leaves the document empty.
            if (is_string($markup) && $markup !== '') {
                @$doc->loadHTML($markup);
            }
        }
    } elseif ($input instanceof DOMDocument) {
        // Work on a copy so the caller's document is not modified.
        $doc = clone $input;
    } else {
        // Unsupported input type: parse an empty document.
        $doc = new DOMDocument();
    }

    $this->xpath = new DOMXPath($doc);

    $baseurl = $url;
    // Only the first <base href> element is considered.
    foreach ($this->xpath->query('//base[@href]') as $base) {
        $baseElementUrl = $base->getAttribute('href');

        if (parse_url($baseElementUrl, PHP_URL_SCHEME) === null) {
            // The base element URL has no scheme, so it is itself relative
            // to the document URL and must be resolved against it.
            $baseurl = resolveUrl($url, $baseElementUrl);
        } else {
            $baseurl = $baseElementUrl;
        }
        break;
    }

    // Ignore <template> elements as per the HTML5 spec
    foreach ($this->xpath->query('//template') as $templateEl) {
        $templateEl->parentNode->removeChild($templateEl);
    }

    $this->baseurl = $baseurl;
    $this->doc = $doc;
    $this->parsed = new SplObjectStorage();
    $this->upgraded = new SplObjectStorage();
    $this->jsonMode = $jsonMode;
}
/**
 * Record that the element has been parsed for the given prefix.
 *
 * @param DOMElement $e
 * @param string $prefix
 */
private function elementPrefixParsed(\DOMElement $e, $prefix) {
    // Start from the prefixes already recorded for this element, if any.
    $seenPrefixes = $this->parsed->contains($e) ? $this->parsed[$e] : array();
    $seenPrefixes[] = $prefix;

    $this->parsed[$e] = $seenPrefixes;
}
/**
 * Tell whether the element has already been parsed for the given prefix.
 *
 * @param DOMElement $e
 * @param string $prefix
 * @return bool True only when the element is tracked and $prefix is among its recorded prefixes
 */
private function isElementParsed(\DOMElement $e, $prefix) {
    return $this->parsed->contains($e) && in_array($prefix, $this->parsed[$e]);
}
/**
 * Tell whether the given property of the element was already upgraded during backcompat.
 *
 * @param DOMElement $el
 * @param string $property
 * @return bool True only when the element is tracked and $property is among its upgraded properties
 */
private function isElementUpgraded(\DOMElement $el, $property) {
    return $this->upgraded->contains($el) && in_array($property, $this->upgraded[$el]);
}
/**
 * Resolve the URL-bearing attributes of all descendants of an element in place.
 *
 * Rewrites href, src, data and srcset attributes of every descendant so that
 * they hold URLs resolved through resolveUrl().
 *
 * @param DOMElement $el The element whose descendants are updated
 */
private function resolveChildUrls(DOMElement $el) {
    // Include @srcset so elements that only carry a srcset are processed too.
    $hyperlinkChildren = $this->xpath->query('.//*[@src or @href or @srcset or @data]', $el);

    foreach ($hyperlinkChildren as $child) {
        foreach (array('href', 'src', 'data') as $attribute) {
            if ($child->hasAttribute($attribute)) {
                $child->setAttribute($attribute, $this->resolveUrl($child->getAttribute($attribute)));
            }
        }

        if ($child->hasAttribute('srcset')) {
            // Each candidate URL inside the srcset value is resolved individually.
            $child->setAttribute('srcset', applySrcsetUrlTransformation($child->getAttribute('srcset'), array($this, 'resolveUrl')));
        }
    }
}
/**
 * Plain text parsing: get the whitespace-normalized text of an element.
 *
 * The element is first flattened to a string by elementToString(); then
 * leading and trailing whitespace, spaces adjacent to a newline, and spaces
 * that are followed by another space are removed.
 *
 * @param DOMElement $element
 * @param bool $implied Passed through to elementToString()
 * @return string
 * @see https://wiki.zegnat.net/media/textparsing.html
 **/
public function textContent(DOMElement $element, $implied=false)
{
    $flattened = $this->elementToString($element, $implied);
    $excessWhitespace = '/(^[\t\n\f\r ]+| +(?=\n)|(?<=\n) +| +(?= )|[\t\n\f\r ]+$)/';

    return preg_replace($excessWhitespace, '', $flattened);
}
/**
 * Flatten an element's children into a raw text string.
 *
 * - text nodes: tabs, newlines and carriage returns become spaces
 * - <script> and <style>: skipped
 * - <img>: replaced by its trimmed alt text if present, otherwise (unless
 *   $implied is set) by its resolved src URL; padded with spaces either way
 * - <br>: a newline
 * - <p>: a newline followed by the element's own flattened content
 * - any other element: its flattened content
 *
 * @param DOMElement $input
 * @param bool $implied When true, <img> elements without alt contribute nothing.
 *                      The flag applies to nested elements as well.
 * @return string
 */
private function elementToString(DOMElement $input, $implied=false)
{
    $output = '';

    foreach ($input->childNodes as $child) {
        if ($child->nodeType === XML_TEXT_NODE) {
            $output .= str_replace(array("\t", "\n", "\r") , ' ', $child->textContent);
            continue;
        }

        if ($child->nodeType !== XML_ELEMENT_NODE) {
            continue;
        }

        switch (strtoupper($child->tagName)) {
            case 'SCRIPT':
            case 'STYLE':
                break;

            case 'IMG':
                if ($child->hasAttribute('alt')) {
                    $output .= ' ' . trim($child->getAttribute('alt'), "\t\n\f\r ") . ' ';
                } else if (!$implied && $child->hasAttribute('src')) {
                    $output .= ' ' . $this->resolveUrl(trim($child->getAttribute('src'), "\t\n\f\r ")) . ' ';
                }
                break;

            case 'BR':
                $output .= "\n";
                break;

            case 'P':
                // Propagate $implied so nested images are treated consistently.
                $output .= "\n" . $this->elementToString($child, $implied);
                break;

            default:
                $output .= $this->elementToString($child, $implied);
        }
    }

    return $output;
}
/**
 * Determine the language that applies to an element.
 *
 * Walks from the element up through its ancestors and returns the trimmed
 * value of the first lang attribute found. On reaching an <html> element
 * without a lang attribute, the content of a
 * <meta http-equiv="Content-Language"> element is used instead, if there is one.
 *
 * @param DOMElement $el
 * @access public
 * @return string The language, or '' when none could be determined
 */
public function language(DOMElement $el)
{
    for ($node = $el; $node instanceof DOMElement; $node = $node->parentNode) {
        // The nearest lang attribute wins.
        if ($node->hasAttribute('lang')) {
            return unicodeTrim($node->getAttribute('lang'));
        }

        if ($node->tagName == 'html') {
            // At the <html> element with no lang: fall back to <meta> http-equiv Content-Language.
            foreach ($this->xpath->query('.//meta[@http-equiv]') as $meta) {
                $isContentLanguage = $meta->hasAttribute('http-equiv')
                    && $meta->hasAttribute('content')
                    && strtolower($meta->getAttribute('http-equiv')) == 'content-language';

                if ($isContentLanguage) {
                    return unicodeTrim($meta->getAttribute('content'));
                }
            }

            // The search stops at <html>.
            return '';
        }
    }

    return '';
}
// TODO: figure out if this has problems with sms: and geo: URLs
/**
 * Resolve a URL against the parser's base URL.
 *
 * URLs that parse_url() cannot handle are returned untouched. Otherwise the
 * URL is trimmed; if it has no scheme and a base URL is known, it is resolved
 * against that base URL, else the trimmed URL is returned.
 *
 * @param string $url
 * @return string
 */
public function resolveUrl($url) {
    // Seriously malformed URLs are beyond the scope of this parser: leave them as they are.
    if (parse_url($url) === false) {
        return $url;
    }

    // per issue #40 valid URLs could have a space on either side
    $url = trim($url);

    $hasScheme = !empty(parse_url($url, PHP_URL_SCHEME));
    if ($hasScheme || empty($this->baseurl)) {
        return $url;
    }

    return resolveUrl($this->baseurl, $url);
}
- // Parsing Functions
/**
 * Parse value-class/value-title on an element.
 *
 * Looks at the direct child elements of $e. If any carry the class "value",
 * their text contents are concatenated and returned. Otherwise, if any carry
 * the class "value-title", their title attributes are concatenated and
 * returned. The result is trimmed.
 *
 * Class tokens are matched regardless of which whitespace (space, tab,
 * newline) separates them in the class attribute.
 *
 * @param \DOMElement $e
 * @param string $separator Accepted for backward compatibility; this implementation does not use it
 *                          (values are concatenated without inserting anything between them)
 * @return string|null the parsed value or null if value-class or -title aren’t in use
 */
public function parseValueClassTitle(\DOMElement $e, $separator = '') {
    $valueClassElements = $this->xpath->query('./*[contains(concat(" ", normalize-space(@class), " "), " value ")]', $e);

    if ($valueClassElements->length !== 0) {
        // Process value-class stuff
        $val = '';
        foreach ($valueClassElements as $el) {
            $val .= $this->textContent($el);
        }

        return unicodeTrim($val);
    }

    $valueTitleElements = $this->xpath->query('./*[contains(concat(" ", normalize-space(@class), " "), " value-title ")]', $e);

    if ($valueTitleElements->length !== 0) {
        // Process value-title stuff
        $val = '';
        foreach ($valueTitleElements as $el) {
            $val .= $el->getAttribute('title');
        }

        return unicodeTrim($val);
    }

    // No value-title or -class in this element
    return null;
}
/**
 * Given an element with class="p-*", get its value
 *
 * The value-class/value-title result is used when present. Otherwise the
 * URLs of the element's descendants are resolved and the value is taken from
 * the first matching source: alt of img/area, title of abbr/link, value of
 * data/input, or the element's text content.
 *
 * @param DOMElement $p The element to parse
 * @return string The plaintext value of $p, dependant on type
 * @todo Make this adhere to value-class
 */
public function parseP(\DOMElement $p) {
    $classTitle = $this->parseValueClassTitle($p, ' ');
    if ($classTitle !== null) {
        return $classTitle;
    }

    $this->resolveChildUrls($p);

    $tag = $p->tagName;

    if (($tag == 'img' or $tag == 'area') and $p->hasAttribute('alt')) {
        return $p->getAttribute('alt');
    }

    if (($tag == 'abbr' or $tag == 'link') and $p->hasAttribute('title')) {
        return $p->getAttribute('title');
    }

    if (($tag == 'data' or $tag == 'input') and $p->hasAttribute('value')) {
        return $p->getAttribute('value');
    }

    return $this->textContent($p);
}
/**
 * Given an element with class="u-*", get the value of the URL
 *
 * The raw value is taken from the first matching source, in this order:
 * href of a/area/link, src of img/audio/video/source, poster of a video
 * without src, data of object, the value-class/value-title result, title of
 * abbr/link, value of data/input, and finally the element's text content.
 * The raw value is then passed through resolveUrl().
 *
 * @param DOMElement $u The element to parse
 * @return string The plaintext value of $u, dependant on type
 * @todo make this adhere to value-class
 */
public function parseU(\DOMElement $u) {
    $tag = $u->tagName;

    if (in_array($tag, array('a', 'area', 'link')) and $u->hasAttribute('href')) {
        return $this->resolveUrl($u->getAttribute('href'));
    }

    if (in_array($tag, array('img', 'audio', 'video', 'source')) and $u->hasAttribute('src')) {
        return $this->resolveUrl($u->getAttribute('src'));
    }

    if ($tag == 'video' and !$u->hasAttribute('src') and $u->hasAttribute('poster')) {
        return $this->resolveUrl($u->getAttribute('poster'));
    }

    if ($tag == 'object' and $u->hasAttribute('data')) {
        return $this->resolveUrl($u->getAttribute('data'));
    }

    // The value-class pattern is only consulted once the attribute sources above did not apply.
    $classTitle = $this->parseValueClassTitle($u);
    if ($classTitle !== null) {
        return $this->resolveUrl($classTitle);
    }

    if (($tag == 'abbr' or $tag == 'link') and $u->hasAttribute('title')) {
        return $this->resolveUrl($u->getAttribute('title'));
    }

    if (in_array($tag, array('data', 'input')) and $u->hasAttribute('value')) {
        return $this->resolveUrl($u->getAttribute('value'));
    }

    return $this->resolveUrl($this->textContent($u));
}
/**
 * Given an element with class="dt-*", get the value of the datetime as a string
 *
 * Two modes:
 *  - value-class pattern: the direct children with class "value" or
 *    "value-title" each contribute a part (a full datetime, a date, an
 *    ordinal date, a time, or a timezone offset); the first of each kind is
 *    used and the parts are assembled as "DATE TIME[TZ]".
 *  - otherwise the value comes from the element itself (alt, value, title or
 *    datetime attribute depending on the tag, else its text content).
 *
 * Finally, a value that is only a time is prefixed with the most recently
 * recorded date from $dates, if any.
 *
 * @param DOMElement $dt The element to parse
 * @param array $dates Dates seen so far; dates found here are appended (by reference)
 * @param string $impliedTimezone Set (by reference) to the first timezone offset found when not already set
 * @return string|false The datetime string found; false when value-class children exist but none yields a usable part
 */
public function parseDT(\DOMElement $dt, &$dates = array(), &$impliedTimezone = null) {
    // Check for value-class pattern. normalize-space() makes the class token
    // test independent of the whitespace separating the class names.
    $valueClassChildren = $this->xpath->query('./*[contains(concat(" ", normalize-space(@class), " "), " value ") or contains(concat(" ", normalize-space(@class), " "), " value-title ")]', $dt);
    $dtValue = false;

    if ($valueClassChildren->length > 0) {
        // They’re using value-class: collect the raw part of every child.
        $dateParts = array();

        foreach ($valueClassChildren as $e) {
            $tagName = $e->tagName;

            if (preg_match('/(^|\s)value-title(\s|$)/', $e->getAttribute('class'))) {
                $part = $e->getAttribute('title');
            } elseif ($tagName == 'img' or $tagName == 'area') {
                // Use @alt
                $part = $e->getAttribute('alt');
            } elseif ($tagName == 'data') {
                // Use @value, otherwise innertext
                $part = $e->hasAttribute('value') ? $e->getAttribute('value') : unicodeTrim($e->nodeValue);
            } elseif ($tagName == 'abbr') {
                // Use @title, otherwise innertext
                $part = $e->hasAttribute('title') ? $e->getAttribute('title') : unicodeTrim($e->nodeValue);
            } elseif ($tagName == 'del' or $tagName == 'ins' or $tagName == 'time') {
                // Use @datetime if available, otherwise innertext
                $part = $e->hasAttribute('datetime') ? $e->getAttribute('datetime') : unicodeTrim($e->nodeValue);
            } else {
                $part = empty($e->nodeValue) ? '' : unicodeTrim($e->nodeValue);
            }

            if (!empty($part)) {
                $dateParts[] = $part;
            }
        }

        // Classify the parts; only the first part of each kind is used.
        $datePart = '';
        $timePart = '';
        $timezonePart = '';
        $fullDateTime = null;
        $recognised = false;

        foreach ($dateParts as $part) {
            // Is this part a full ISO8601 datetime?
            if (preg_match('/^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}(:\d{2})?(Z|[+-]\d{2}:?\d{2})?$/', $part)) {
                // Break completely, we’ve got our value.
                $fullDateTime = $part;
                break;
            }

            if ((preg_match('/^\d{1,2}:\d{2}(:\d{2})?(Z|[+-]\d{1,2}:?\d{2})?$/', $part) or preg_match('/^\d{1,2}(:\d{2})?(:\d{2})?[ap]\.?m\.?$/i', $part)) and empty($timePart)) {
                // A valid time (+TZ?) and no other time representation has been found.
                $timePart = $part;
                $timezoneOffset = normalizeTimezoneOffset($timePart);
                if (!$impliedTimezone && $timezoneOffset) {
                    $impliedTimezone = $timezoneOffset;
                }
            } elseif (preg_match('/^\d{4}-\d{2}-\d{2}$/', $part) and empty($datePart)) {
                // A valid date and no other date representation has been found.
                $datePart = $part;
            } elseif (preg_match('/^\d{4}-\d{3}$/', $part) and empty($datePart)) {
                // A valid ordinal date and no other date representation has been found.
                $datePart = normalizeOrdinalDate($part);
            } elseif (preg_match('/^(Z|[+-]\d{1,2}:?(\d{2})?)$/', $part) and empty($timezonePart)) {
                // A valid timezone offset and no other timezone part has been found.
                $timezonePart = $part;
                $timezoneOffset = normalizeTimezoneOffset($timezonePart);
                if (!$impliedTimezone && $timezoneOffset) {
                    $impliedTimezone = $timezoneOffset;
                }
            } else {
                // Current part already represented by other VCP parts; do nothing with it
                continue;
            }

            $recognised = true;

            if (!empty($datePart) && !in_array($datePart, $dates)) {
                $dates[] = $datePart;
            }
        }

        if ($fullDateTime !== null) {
            $dtValue = $fullDateTime;
        } elseif ($recognised) {
            // Assemble once, after all parts are known, so that the timezone
            // is appended exactly once and survives the am/pm conversion
            // regardless of the order in which the parts appeared.
            $time = '';
            if (!empty($timePart)) {
                $time = unicodeTrim(convertTimeFormat($timePart));
                $timeHasTimezone = preg_match('/(Z|[+-]\d{1,2}:?(\d{2})?)$/', $time);
                if (!empty($timezonePart) && !$timeHasTimezone) {
                    $time .= $timezonePart;
                }
            }

            $date = rtrim($datePart, 'T');

            if ($date !== '' && $time !== '') {
                $dtValue = $date . ' ' . $time;
            } else {
                // Only one of the two (or neither) is available.
                $dtValue = $date . $time;
            }
        }
    } else {
        // Not using value-class (phew).
        $tagName = $dt->tagName;

        if ($tagName == 'img' or $tagName == 'area') {
            // Use @alt
            $alt = $dt->getAttribute('alt');
            if (!empty($alt)) {
                $dtValue = $alt;
            }
        } else {
            // data: @value, abbr: @title, del/ins/time: @datetime; otherwise
            // (or when the attribute is empty) the text content.
            $attribute = null;
            if ($tagName == 'data') {
                $attribute = 'value';
            } elseif ($tagName == 'abbr') {
                $attribute = 'title';
            } elseif ($tagName == 'del' or $tagName == 'ins' or $tagName == 'time') {
                $attribute = 'datetime';
            }

            $attributeValue = ($attribute === null) ? '' : $dt->getAttribute($attribute);
            $dtValue = !empty($attributeValue) ? $attributeValue : $this->textContent($dt);
        }

        $dtValue = unicodeTrim($dtValue);

        // if the dtValue is not just YYYY-MM-DD
        if (!preg_match('/^(\d{4}-\d{2}-\d{2})$/', $dtValue)) {
            // no implied timezone set and dtValue ends with a TZ offset, use un-normalized TZ offset
            preg_match('/(Z|[+-]\d{1,2}:?(\d{2})?)$/i', $dtValue, $matches);
            if (!$impliedTimezone && !empty($matches[0])) {
                $impliedTimezone = $matches[0];
            }
        }

        // Store the date part so that we can use it when assembling the final timestamp if the next one is missing a date part
        if (preg_match('/(\d{4}-\d{2}-\d{2})/', $dtValue, $matches)) {
            $dates[] = $matches[0];
        }
    }

    /**
     * if $dtValue is only a time and there are recently parsed dates,
     * form the full date-time using the most recently parsed dt- value
     */
    if ((preg_match('/^\d{1,2}:\d{2}(:\d{2})?(Z|[+-]\d{2}:?\d{2}?)?$/', $dtValue) or preg_match('/^\d{1,2}(:\d{2})?(:\d{2})?[ap]\.?m\.?$/i', $dtValue)) && !empty($dates)) {
        $timezoneOffset = normalizeTimezoneOffset($dtValue);
        if (!$impliedTimezone && $timezoneOffset) {
            $impliedTimezone = $timezoneOffset;
        }

        $dtValue = convertTimeFormat($dtValue);
        $dtValue = end($dates) . ' ' . unicodeTrim($dtValue);
    }

    return $dtValue;
}
/**
 * Parse an e-* (embedded markup) property element.
 *
 * When parseValueClassTitle() yields a non-null result for the element, that
 * result is returned unchanged. Otherwise URLs below the element are passed
 * through resolveChildUrls() and the element's inner markup is serialised.
 *
 * @param DOMElement $e The element to parse
 * @return mixed The non-null result of parseValueClassTitle(), or an array with
 *               'html' (trimmed inner HTML of $e), 'value' (text content of $e)
 *               and, if $this->lang is set and a language is found, 'lang'
 *
 * @todo need to mark this element as e- parsed so it doesn't get parsed as its parent's e-* too
 */
public function parseE(\DOMElement $e) {
    $valueTitle = $this->parseValueClassTitle($e);
    if ($valueTitle !== null) {
        return $valueTitle;
    }

    // Expand relative URLs within children of this element
    // TODO: as it is this is not relative to only children, make this .// and rerun tests
    $this->resolveChildUrls($e);

    // Serialise all child nodes in one go by parking them in a DocumentFragment:
    // calling DOMDocument::saveHTML node by node may introduce whitespace that
    // is not in the source. See https://stackoverflow.com/q/38317903
    $document = $e->ownerDocument;
    $fragment = $document->createDocumentFragment();
    while ($e->hasChildNodes()) {
        $fragment->appendChild($e->firstChild);
    }
    $markup = $document->saveHTML($fragment);

    // Restore the children to their original parent
    if ($fragment->hasChildNodes()) {
        $e->appendChild($fragment);
    }

    $parsed = array(
        'html' => unicodeTrim($markup),
        'value' => $this->textContent($e),
    );

    // Language is only reported when language parsing is switched on
    if ($this->lang) {
        $language = $this->language($e);
        if ($language) {
            $parsed['lang'] = $language;
        }
    }

    return $parsed;
}
/**
 * Remove every descendant element of $e that has the given tag name.
 *
 * @param DOMElement $e Element whose descendants are pruned (modified in place)
 * @param string $tagName Tag name of the elements to remove
 */
private function removeTags(\DOMElement &$e, $tagName) {
    for (;;) {
        // Query again on every pass and always drop the first remaining match
        $remaining = $e->getElementsByTagName($tagName);
        if (!$remaining || !$remaining->length) {
            break;
        }

        $target = $remaining->item(0);
        $target->parentNode->removeChild($target);
    }
}
/**
 * Parse a single microformat root element.
 *
 * Collects the explicit p-*, u-*, dt-* and e-* properties found below $e and,
 * outside of backcompat parsing, implies name, photo and url when they were not
 * given explicitly.
 *
 * @param DOMElement $e The element to parse
 * @param bool $is_backcompat Whether using backcompat parsing or not
 * @param bool $has_nested_mf Whether this microformat has a nested microformat
 * @return array|null Array with 'type' and 'properties' plus, when available, 'lang',
 *                    'shape', 'coords' and 'children'; null if $e was already parsed
 *                    or has no h-* names
 */
public function parseH(\DOMElement $e, $is_backcompat = false, $has_nested_mf = false) {
    // Already handled earlier (e.g. as a child microformat)
    if ($this->parsed->contains($e)) {
        return null;
    }

    // h-* names of this root; without any there is nothing to parse
    $mfTypes = mfNamesFromElement($e, 'h-');
    if (!$mfTypes) {
        return null;
    }

    $return = array();
    $children = array();
    $dates = array();
    $prefixes = array();
    $impliedTimezone = null;

    if ($e->tagName == 'area') {
        $coords = $e->getAttribute('coords');
        $shape = $e->getAttribute('shape');
    }

    // p-* properties
    foreach ($this->xpath->query('.//*[contains(concat(" ", @class) ," p-")]', $e) as $p) {
        if ($this->isElementParsed($p, 'p')) {
            continue;
        }
        // In backcompat mode only elements upgraded from mf1 count
        if ($is_backcompat && empty($this->upgraded[$p])) {
            $this->elementPrefixParsed($p, 'p');
            continue;
        }

        $prefixes[] = 'p-';
        $pValue = $this->parseP($p);

        foreach (mfNamesFromElement($p, 'p-') as $propName) {
            if (!empty($propName)) {
                $return[$propName][] = $pValue;
            }
        }

        // Record the element as p- parsed so it is not handled again
        $this->elementPrefixParsed($p, 'p');
    }

    // u-* properties
    foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," u-")]', $e) as $u) {
        if ($this->isElementParsed($u, 'u')) {
            continue;
        }
        if ($is_backcompat && empty($this->upgraded[$u])) {
            $this->elementPrefixParsed($u, 'u');
            continue;
        }

        $prefixes[] = 'u-';
        $uValue = $this->parseU($u);

        foreach (mfNamesFromElement($u, 'u-') as $propName) {
            $return[$propName][] = $uValue;
        }

        $this->elementPrefixParsed($u, 'u');
    }

    // dt-* properties are buffered first: the implied timezone is only known
    // once every dt-* element of this root has been parsed
    $temp_dates = array();
    foreach ($this->xpath->query('.//*[contains(concat(" ", @class), " dt-")]', $e) as $dt) {
        if ($this->isElementParsed($dt, 'dt')) {
            continue;
        }
        if ($is_backcompat && empty($this->upgraded[$dt])) {
            $this->elementPrefixParsed($dt, 'dt');
            continue;
        }

        $prefixes[] = 'dt-';
        $dtValue = $this->parseDT($dt, $dates, $impliedTimezone);

        if ($dtValue) {
            foreach (mfNamesFromElement($dt, 'dt-') as $propName) {
                $temp_dates[$propName][] = $dtValue;
            }
        }

        $this->elementPrefixParsed($dt, 'dt');
    }

    // Append the implied timezone to buffered values that do not end in an offset
    foreach ($temp_dates as $propName => $values) {
        foreach ($values as $value) {
            $lacksOffset = $impliedTimezone && !preg_match('/(Z|[+-]\d{2}:?(\d{2})?)$/i', $value);
            if ($lacksOffset) {
                $value .= $impliedTimezone;
            }
            $return[$propName][] = $value;
        }
    }

    // e-* properties
    foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," e-")]', $e) as $em) {
        if ($this->isElementParsed($em, 'e')) {
            continue;
        }
        if ($is_backcompat && empty($this->upgraded[$em])) {
            $this->elementPrefixParsed($em, 'e');
            continue;
        }

        $prefixes[] = 'e-';
        $eValue = $this->parseE($em);

        if ($eValue) {
            foreach (mfNamesFromElement($em, 'e-') as $propName) {
                $return[$propName][] = $eValue;
            }
        }

        $this->elementPrefixParsed($em, 'e');
    }

    // Implied name: only without an explicit "name", without any p-* or e-*
    // property, without nested microformats, and never in backcompat mode
    $implyName = !array_key_exists('name', $return)
        && !in_array('p-', $prefixes)
        && !in_array('e-', $prefixes)
        && !$has_nested_mf
        && !$is_backcompat;

    if ($implyName) {
        $name = false;

        if (($e->tagName === 'img' || $e->tagName === 'area') && $e->hasAttribute('alt')) {
            // img.h-x[alt] or area.h-x[alt]
            $name = $e->getAttribute('alt');
        } elseif ($e->tagName === 'abbr' && $e->hasAttribute('title')) {
            // abbr.h-x[title]
            $name = $e->getAttribute('title');
        } else {
            $nameXpaths = array(
                // .h-x>img:only-child[alt]:not([alt=""]):not[.h-*]
                './img[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and @alt and string-length(@alt) != 0]',
                // .h-x>area:only-child[alt]:not([alt=""]):not[.h-*]
                './area[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and @alt and string-length(@alt) != 0]',
                // .h-x>abbr:only-child[title]:not([title=""]):not[.h-*]
                './abbr[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and @title and string-length(@title) != 0]',
                // .h-x>:only-child:not[.h-*]>img:only-child[alt]:not([alt=""]):not[.h-*]
                './*[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and count(*) = 1]/img[not(contains(concat(" ", @class), " h-")) and @alt and string-length(@alt) != 0]',
                // .h-x>:only-child:not[.h-*]>area:only-child[alt]:not([alt=""]):not[.h-*]
                './*[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and count(*) = 1]/area[not(contains(concat(" ", @class), " h-")) and @alt and string-length(@alt) != 0]',
                // .h-x>:only-child:not[.h-*]>abbr:only-child[title]:not([title=""]):not[.h-*]
                './*[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and count(*) = 1]/abbr[not(contains(concat(" ", @class), " h-")) and @title and string-length(@title) != 0]'
            );

            // First expression that selects exactly one element wins
            foreach ($nameXpaths as $expression) {
                $candidates = $this->xpath->query($expression, $e);
                if ($candidates === false || $candidates->length !== 1) {
                    continue;
                }

                $source = $candidates->item(0);
                $attribute = ($source->tagName === 'img' || $source->tagName === 'area') ? 'alt' : 'title';
                $name = $source->getAttribute($attribute);
                break;
            }
        }

        // Nothing matched: fall back to the text content of the root
        if ($name === false) {
            $name = $this->textContent($e, true);
        }

        $return['name'][] = unicodeTrim($name);
    }

    // Implied photo
    if (!array_key_exists('photo', $return) && !$is_backcompat) {
        $photo = $this->parseImpliedPhoto($e);
        if ($photo !== false) {
            $return['photo'][] = $photo;
        }
    }

    // Implied url: only without an explicit "url", without any u-* property,
    // without nested microformats, and never in backcompat mode
    $implyUrl = !array_key_exists('url', $return)
        && !in_array('u-', $prefixes)
        && !$has_nested_mf
        && !$is_backcompat;

    if ($implyUrl) {
        if (($e->tagName === 'a' || $e->tagName === 'area') && $e->hasAttribute('href')) {
            // a.h-x[href] or area.h-x[href]
            $return['url'][] = $this->resolveUrl($e->getAttribute('href'));
        } else {
            $urlXpaths = array(
                // .h-x>a[href]:only-of-type:not[.h-*]
                './a[not(contains(concat(" ", @class), " h-")) and count(../a) = 1 and @href]',
                // .h-x>area[href]:only-of-type:not[.h-*]
                './area[not(contains(concat(" ", @class), " h-")) and count(../area) = 1 and @href]',
                // .h-x>:only-child:not[.h-*]>a[href]:only-of-type:not[.h-*]
                './*[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and count(a) = 1]/a[not(contains(concat(" ", @class), " h-")) and @href]',
                // .h-x>:only-child:not[.h-*]>area[href]:only-of-type:not[.h-*]
                './*[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and count(area) = 1]/area[not(contains(concat(" ", @class), " h-")) and @href]'
            );

            foreach ($urlXpaths as $expression) {
                $links = $this->xpath->query($expression, $e);
                if ($links !== false && $links->length === 1) {
                    $return['url'][] = $this->resolveUrl($links->item(0)->getAttribute('href'));
                    break;
                }
            }
        }
    }

    // Types are reported unique and in alphabetical order
    $mfTypes = array_unique($mfTypes);
    sort($mfTypes);

    // Properties should be an object when JSON serialised
    if (empty($return) && $this->jsonMode) {
        $return = new stdClass();
    }

    $parsed = array(
        'type' => $mfTypes,
        'properties' => $return
    );

    if ($this->lang) {
        $language = $this->language($e);
        if ($language) {
            $parsed['lang'] = $language;
        }
    }

    // $shape and $coords only exist for <area> roots
    if (!empty($shape)) {
        $parsed['shape'] = $shape;
    }

    if (!empty($coords)) {
        $parsed['coords'] = $coords;
    }

    if (!empty($children)) {
        $parsed['children'] = array_values(array_filter($children));
    }

    return $parsed;
}
/**
 * Determine the implied photo URL of a microformat root element.
 *
 * Checked in order: the root itself (img[src], object[data]), a direct child
 * img/object that is the only one of its type, and finally an img/object that
 * is the only one of its type inside an only child. Elements that carry an
 * h-* class themselves are never used.
 *
 * @param DOMElement $e The microformat root element
 * @return string|false The URL as returned by resolveUrl(), or false when no photo is implied
 * @see http://microformats.org/wiki/microformats2-parsing#parsing_for_implied_properties
 */
public function parseImpliedPhoto(\DOMElement $e) {
    // img.h-x[src] -- the src attribute must be present, mirroring the
    // object[data] rule below; an <img> root without src implies no photo
    if ($e->tagName == 'img' && $e->hasAttribute('src')) {
        return $this->resolveUrl($e->getAttribute('src'));
    }

    // object.h-x[data]
    if ($e->tagName == 'object' && $e->hasAttribute('data')) {
        return $this->resolveUrl($e->getAttribute('data'));
    }

    $xpaths = array(
        // .h-x>img[src]:only-of-type:not[.h-*]
        './img[not(contains(concat(" ", @class), " h-")) and count(../img) = 1 and @src]',
        // .h-x>object[data]:only-of-type:not[.h-*]
        './object[not(contains(concat(" ", @class), " h-")) and count(../object) = 1 and @data]',
        // .h-x>:only-child:not[.h-*]>img[src]:only-of-type:not[.h-*]
        './*[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and count(img) = 1]/img[not(contains(concat(" ", @class), " h-")) and @src]',
        // .h-x>:only-child:not[.h-*]>object[data]:only-of-type:not[.h-*]
        './*[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and count(object) = 1]/object[not(contains(concat(" ", @class), " h-")) and @data]',
    );

    foreach ($xpaths as $path) {
        $els = $this->xpath->query($path, $e);

        // Only an unambiguous (single) match implies a photo
        if ($els === false || $els->length !== 1) {
            continue;
        }

        $el = $els->item(0);
        if ($el->tagName == 'img') {
            return $this->resolveUrl($el->getAttribute('src'));
        }
        if ($el->tagName == 'object') {
            return $this->resolveUrl($el->getAttribute('data'));
        }
    }

    // no implied photo
    return false;
}
/**
 * Parse rels and alternates
 *
 * Walks every a, link and area element of the document that has both rel and
 * href attributes and builds three structures from them.
 *
 * For $rels and $rel_urls, if they are empty and $this->jsonMode = true, they are returned as stdClass,
 * optimizing for JSON serialization. Otherwise they are returned as arrays.
 * Note that $alternates is deprecated in the microformats spec in favor of $rel_urls. $alternates is only
 * filled if $this->enableAlternates = true.
 *
 * @return array Three-element list: [$rels, $rel_urls, $alternates]
 */
public function parseRelsAndAlternates() {
    $rels = array();
    $rel_urls = array();
    $alternates = array();

    $hyperlinks = $this->xpath->query('//a[@rel and @href] | //link[@rel and @href] | //area[@rel and @href]');

    foreach ($hyperlinks as $hyperlink) {
        // Distinct, non-empty rel tokens of this link
        $tokens = preg_split('/[\t\n\f\r ]/', $hyperlink->getAttribute('rel'));
        $linkRels = array_unique(array_filter($tokens));
        if (!$linkRels) {
            continue;
        }

        $href = $this->resolveUrl($hyperlink->getAttribute('href'));

        // Optional attributes, collected in a fixed order, followed by the link text
        $rel_attributes = array();
        foreach (array('media', 'hreflang', 'title', 'type') as $attribute) {
            if ($hyperlink->hasAttribute($attribute)) {
                $rel_attributes[$attribute] = $hyperlink->getAttribute($attribute);
            }
        }
        if (strlen($hyperlink->textContent) > 0) {
            $rel_attributes['text'] = $hyperlink->textContent;
        }

        // Deprecated 'alternates' structure: one entry per link carrying rel=alternate
        if ($this->enableAlternates && in_array('alternate', $linkRels)) {
            $otherRels = array_diff($linkRels, array('alternate'));
            $alternates[] = array_merge(
                $rel_attributes,
                array(
                    'url' => $href,
                    'rel' => implode(' ', $otherRels)
                )
            );
        }

        // rel => list of distinct URLs
        foreach ($linkRels as $rel) {
            if (!array_key_exists($rel, $rels)) {
                $rels[$rel] = array();
            }
            if (!in_array($href, $rels[$rel])) {
                $rels[$rel][] = $href;
            }
        }

        // URL => attributes + rels; attributes already stored for the URL win
        $entry = array_key_exists($href, $rel_urls) ? $rel_urls[$href] : array('rels' => array());
        $entry = array_merge($rel_attributes, $entry);
        $entry['rels'] = array_merge($entry['rels'], $linkRels);
        $rel_urls[$href] = $entry;
    }

    // De-duplicate and alphabetically sort the rels of every URL
    foreach (array_keys($rel_urls) as $href) {
        $distinct = array_unique($rel_urls[$href]['rels']);
        sort($distinct);
        $rel_urls[$href]['rels'] = $distinct;
    }

    if ($this->jsonMode) {
        if (empty($rels)) {
            $rels = new stdClass();
        }
        if (empty($rel_urls)) {
            $rel_urls = new stdClass();
        }
    }

    return array($rels, $rel_urls, $alternates);
}
/**
 * Find rel=tag elements that don't have class=category and have an href.
 * For each element, get the last non-empty URL path segment and append a <data>
 * element with that value as the category. Uses the mf1 class 'category'
 * which will then be upgraded to p-category during backcompat.
 *
 * Links whose href has no usable path segment (for example "#top" or "/")
 * contribute no category.
 *
 * @param DOMElement $el Element searched for rel=tag links; new <data> elements are appended to it
 */
public function upgradeRelTagToCategory(DOMElement $el) {
    $rel_tag = $this->xpath->query('.//a[contains(concat(" ",normalize-space(@rel)," ")," tag ") and not(contains(concat(" ", normalize-space(@class), " "), " category ")) and @href]', $el);

    if ( !$rel_tag || !$rel_tag->length ) {
        return;
    }

    foreach ( $rel_tag as $tempEl ) {
        // parse_url() returns null when the URL has no path component and false
        // for seriously malformed URLs; cast so trim() always receives a string
        $path = trim((string) parse_url($tempEl->getAttribute('href'), PHP_URL_PATH), ' /');
        $segments = explode('/', $path);
        $value = array_pop($segments);

        // No path segment means there is no tag name to record
        if ( $value === '' ) {
            continue;
        }

        # build the <data> element
        $dataEl = $tempEl->ownerDocument->createElement('data');
        $dataEl->setAttribute('class', 'category');
        $dataEl->setAttribute('value', $value);

        # append as a child of the input element rather than next to the link
        $el->appendChild($dataEl);
    }
}
/**
 * Kicks off the parsing routine
 *
 * @param bool $convertClassic whether to do backcompat parsing on microformats1. Defaults to true.
 * @param DOMElement $context optionally specify an element from which to parse microformats
 * @return array Array with 'items', 'rels' and 'rel-urls', plus 'alternates' when
 *               alternates are enabled and at least one was found
 */
public function parse($convertClassic = true, DOMElement $context = null) {
    $this->convertClassic = $convertClassic;

    $items = $this->parse_recursive($context);

    // Rels are always collected for the whole document
    list($rels, $relUrls, $alternates) = $this->parseRelsAndAlternates();

    $output = array();
    $output['items'] = array_values(array_filter($items));
    $output['rels'] = $rels;
    $output['rel-urls'] = $relUrls;

    if ($this->enableAlternates && count($alternates)) {
        $output['alternates'] = $alternates;
    }

    return $output;
}
/**
 * Parse microformats recursively
 *
 * Keeps track of whether inside a backcompat root or not. At depth 0 the
 * result is a plain list of parsed roots; at deeper levels parsed roots are
 * grouped under 'properties' (when the node is also a property of its parent)
 * or 'children'.
 *
 * @param DOMElement $context: node to start with
 * @param int $depth: recursion depth
 * @return array
 */
public function parse_recursive(DOMElement $context = null, $depth = 0) {
    $collected = array();

    foreach ($this->getRootMF($context) as $node) {
        // A root without any h-* class is parsed in backcompat mode
        $is_backcompat = !$this->hasRootMf2($node);

        if ($is_backcompat && $this->convertClassic) {
            $this->backcompat($node);
        }

        // Descend first; the nested result doubles as the "has nested mf" flag
        $nested = $this->parse_recursive($node, $depth + 1);
        $parsedRoot = $this->parseH($node, $is_backcompat, $nested);

        // TODO: Determine if clearing this is required?
        foreach (array('h', 'p', 'u', 'dt', 'e') as $prefix) {
            $this->elementPrefixParsed($node, $prefix);
        }

        // parseH produced nothing for this node
        if (!$parsedRoot) {
            continue;
        }

        // merge recursive results into current results
        if ($nested) {
            $parsedRoot = array_merge_recursive($parsedRoot, $nested);
        }

        // top-level mf
        if (!($depth > 0)) {
            $collected[] = $parsedRoot;
            continue;
        }

        // nested mf: is the node also a property of its parent?
        $propertyPrefixes = nestedMfPropertyNamesFromElement($node);

        if (empty($propertyPrefixes)) {
            $collected['children'][] = $parsedRoot;
            continue;
        }

        foreach ($propertyPrefixes as $property => $prefixes) {
            // Note: handling microformat nesting under multiple conflicting prefixes is not currently specified by the mf2 parsing spec.
            $entry = $parsedRoot;
            $properties = $parsedRoot['properties'];

            if (in_array('p-', $prefixes)) {
                if (!is_array($properties) || empty($properties['name'][0])) {
                    $entry['value'] = $this->parseP($node);
                } else {
                    $entry['value'] = $properties['name'][0];
                }
            } elseif (in_array('e-', $prefixes)) {
                $embedded = $this->parseE($node);
                $entry['html'] = $embedded['html'];
                $entry['value'] = $embedded['value'];
            } elseif (in_array('u-', $prefixes)) {
                if (!is_array($properties) || empty($properties['url'])) {
                    $entry['value'] = $this->parseU($node);
                } else {
                    $entry['value'] = reset($properties['url']);
                }
            } elseif (in_array('dt-', $prefixes)) {
                $dateValue = $this->parseDT($node);
                $entry['value'] = $dateValue ? $dateValue : '';
            }

            $collected['properties'][$property][] = $entry;
        }
    }

    return $collected;
}
/**
 * Parse From ID
 *
 * Given an ID, parse all microformats which are children of the element with
 * that ID.
 *
 * Note that rel values are still document-wide.
 *
 * If an element with the ID is not found, an empty skeleton mf2 array structure
 * will be returned.
 *
 * @param string $id The id attribute value of the element to parse from
 * @param bool $convertClassic whether to do backcompat parsing on microformats1. Defaults to true.
 * @return array
 */
public function parseFromId($id, $convertClassic=true) {
    $id = (string) $id;

    // XPath 1.0 string literals have no escape sequence: choose a delimiter
    // the id does not contain, or stitch the pieces together with concat()
    if (strpos($id, "'") === false) {
        $literal = "'" . $id . "'";
    } elseif (strpos($id, '"') === false) {
        $literal = '"' . $id . '"';
    } else {
        $literal = "concat('" . str_replace("'", "', \"'\", '", $id) . "')";
    }

    $matches = $this->xpath->query('//*[@id=' . $literal . ']');

    // DOMXPath::query() returns an object even when nothing matches, so the
    // node count (not empty()) decides whether the element exists
    if ($matches === false || $matches->length === 0) {
        return array('items' => array(), 'rels' => array(), 'rel-urls' => array(), 'alternates' => array());
    }

    return $this->parse($convertClassic, $matches->item(0));
}
/**
 * Get the root microformat elements
 *
 * Selects elements whose class attribute contains an h-* name or one of the
 * classic (mf1) root class names from $this->classicRootMap.
 *
 * @param DOMElement $context Optional element; when given only its descendants are searched
 * @return DOMNodeList
 */
public function getRootMF(DOMElement $context = null) {
    // mf2 roots: any class token starting with "h-"
    $conditions = array(
        'contains(concat(" ",normalize-space(@class)), " h-")'
    );

    // mf1 roots: exact class token match
    foreach (array_keys($this->classicRootMap) as $mf1Root) {
        $conditions[] = '( contains(concat(" ",normalize-space(@class), " "), " ' . $mf1Root . ' ") )';
    }

    $expression = '//*[' . implode(' or ', $conditions) . ']';

    if ($context === null) {
        return $this->xpath->query($expression);
    }

    // Prefix with "." to make the search relative to the context node
    return $this->xpath->query('.' . $expression, $context);
}
/**
 * Apply the backcompat algorithm to upgrade mf1 classes to mf2.
 * This method is called recursively.
 *
 * @param DOMElement $el Element whose descendants (and, without $context, whose own class) are upgraded
 * @param string $context mf1 root class name to upgrade as; when empty, the mf1 root
 *                        class names are read from the class attribute of $el
 * @param bool $isParentMf2 When true, property elements are recorded as upgraded but
 *                          neither recursed into nor given mf2 classes
 * @see http://microformats.org/wiki/microformats2-parsing#algorithm
 */
public function backcompat(DOMElement $el, $context = '', $isParentMf2 = false) {
    if ( $context ) {
        $mf1Classes = array($context);
    } else {
        $class = str_replace(array("\t", "\n"), ' ', $el->getAttribute('class'));
        $classes = array_filter(explode(' ', $class));
        $mf1Classes = array_intersect($classes, array_keys($this->classicRootMap));
    }

    $elHasMf2 = $this->hasRootMf2($el);

    foreach ($mf1Classes as $classname) {
        // special handling for specific properties
        switch ( $classname )
        {
            case 'hentry':
                $this->upgradeRelTagToCategory($el);

                // rel=bookmark links become u-url
                $rel_bookmark = $this->xpath->query('.//a[contains(concat(" ",normalize-space(@rel)," ")," bookmark ") and @href]', $el);

                if ( $rel_bookmark->length ) {
                    foreach ( $rel_bookmark as $tempEl ) {
                        $this->addMfClasses($tempEl, 'u-url');
                        $this->addUpgraded($tempEl, array('bookmark'));
                    }
                }
                break;

            case 'hreview':
                // "item vcard" => p-item h-card
                $item_and_vcard = $this->xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " item ") and contains(concat(" ", normalize-space(@class), " "), " vcard ")]', $el);

                if ( $item_and_vcard->length ) {
                    foreach ( $item_and_vcard as $tempEl ) {
                        if ( !$this->hasRootMf2($tempEl) ) {
                            $this->backcompat($tempEl, 'vcard');
                            $this->addMfClasses($tempEl, 'p-item h-card');
                            $this->addUpgraded($tempEl, array('item', 'vcard'));
                        }
                    }
                }

                // "item vevent" => p-item h-event
                $item_and_vevent = $this->xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " item ") and contains(concat(" ", normalize-space(@class), " "), " vevent ")]', $el);

                if ( $item_and_vevent->length ) {
                    foreach ( $item_and_vevent as $tempEl ) {
                        if ( !$this->hasRootMf2($tempEl) ) {
                            $this->addMfClasses($tempEl, 'p-item h-event');
                            $this->backcompat($tempEl, 'vevent');
                            $this->addUpgraded($tempEl, array('item', 'vevent'));
                        }
                    }
                }

                // "item hproduct" => p-item h-product
                $item_and_hproduct = $this->xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " item ") and contains(concat(" ", normalize-space(@class), " "), " hproduct ")]', $el);

                if ( $item_and_hproduct->length ) {
                    foreach ( $item_and_hproduct as $tempEl ) {
                        if ( !$this->hasRootMf2($tempEl) ) {
                            $this->addMfClasses($tempEl, 'p-item h-product');
                            // Upgrade the item as an hproduct; the 'vevent' context
                            // used here before applied event rules to product markup
                            $this->backcompat($tempEl, 'hproduct');
                            $this->addUpgraded($tempEl, array('item', 'hproduct'));
                        }
                    }
                }

                $this->upgradeRelTagToCategory($el);
                break;

            case 'vevent':
                // location elements are upgraded as embedded h-card
                $location = $this->xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " location ")]', $el);

                if ( $location->length ) {
                    foreach ( $location as $tempEl ) {
                        if ( !$this->hasRootMf2($tempEl) ) {
                            $this->addMfClasses($tempEl, 'h-card');
                            $this->backcompat($tempEl, 'vcard');
                        }
                    }
                }
                break;
        }

        // root class has mf1 properties to be upgraded
        if ( isset($this->classicPropertyMap[$classname]) ) {
            // loop through each property of the mf1 root
            foreach ( $this->classicPropertyMap[$classname] as $property => $data ) {
                $propertyElements = $this->xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " ' . $property . ' ")]', $el);

                // loop through each element with the property
                foreach ( $propertyElements as $propertyEl ) {
                    $hasRootMf2 = $this->hasRootMf2($propertyEl);

                    // if the element has not been upgraded and we're not inside an mf2 root, recurse
                    if ( !$this->isElementUpgraded($propertyEl, $property) && !$isParentMf2 )
                    {
                        $temp_context = ( isset($data['context']) ) ? $data['context'] : null;
                        $this->backcompat($propertyEl, $temp_context, $hasRootMf2);
                        $this->addMfClasses($propertyEl, $data['replace']);
                    }

                    $this->addUpgraded($propertyEl, $property);
                }
            }
        }

        // Without an explicit context, give the root its mf2 class name too,
        // unless it already carried an h-* class when this call started
        if ( empty($context) && isset($this->classicRootMap[$classname]) && !$elHasMf2 ) {
            $this->addMfClasses($el, $this->classicRootMap[$classname]);
        }
    }

    return;
}
/**
 * Record that one or more mf1 properties of an element were upgraded during backcompat.
 *
 * @param DOMElement $el The upgraded element
 * @param string|array $property A single property name or a list of property names
 */
public function addUpgraded(DOMElement $el, $property) {
    $properties = is_array($property) ? $property : array($property);

    // Element already tracked: extend its property list
    if ( $this->upgraded->contains($el) ) {
        $this->upgraded[$el] = array_merge($this->upgraded[$el], $properties);
        return;
    }

    // First upgrade recorded for this element
    $this->upgraded->attach($el, $properties);
}
/**
 * Append class names to an element's class attribute.
 * Names the element already has are not added again.
 *
 * @param DOMElement $el Element to modify
 * @param string $classes Space-separated class names to add
 */
public function addMfClasses(DOMElement $el, $classes) {
    $currentValue = $el->getAttribute('class');

    // Tabs and newlines count as separators when reading the existing names
    $present = array_filter(explode(' ', str_replace(array("\t", "\n"), ' ', $currentValue)));
    $missing = array_diff(explode(' ', $classes), $present);

    if ( !$missing ) {
        return;
    }

    $el->setAttribute('class', $currentValue . ' ' . implode(' ', $missing));
}
/**
 * Check an element for an mf2 h-* class, typically to determine if backcompat should be used
 *
 * @param DOMElement $el
 * @return bool True if any class name of $el starts with "h-"
 */
public function hasRootMf2(\DOMElement $el) {
    // Tabs and newlines separate class names just like spaces
    $normalized = str_replace(array("\t", "\n"), ' ', $el->getAttribute('class'));

    foreach ( explode(' ', $normalized) as $token ) {
        if ( strncmp($token, 'h-', 2) === 0 ) {
            return true;
        }
    }

    return false;
}
/**
 * Convert Legacy Classnames
 *
 * Adds microformats2 classnames into a document containing only legacy
 * semantic classnames: every classic root gets its mf2 root class, and every
 * classic property found below a classic root gets its mf2 replacement.
 * Class names are matched as whitespace-separated tokens, so names separated
 * by tabs or newlines are recognised as well.
 *
 * @return Parser $this
 */
public function convertLegacy() {
    $doc = $this->doc;
    $xp = new DOMXPath($doc);

    // Space-padded, whitespace-normalised class list for exact token matching
    $classList = 'concat(" ", normalize-space(@class), " ")';

    // replace all roots
    foreach ($this->classicRootMap as $old => $new) {
        $query = '//*[contains(' . $classList . ', " ' . $old . ' ") and not(contains(' . $classList . ', " ' . $new . ' "))]';

        foreach ($xp->query($query) as $el) {
            $el->setAttribute('class', $el->getAttribute('class') . ' ' . $new);
        }
    }

    // replace all properties found below their classic root
    foreach ($this->classicPropertyMap as $oldRoot => $properties) {
        foreach ($properties as $old => $data) {
            $query = '//*[contains(' . $classList . ', " ' . $oldRoot . ' ")]'
                . '//*[contains(' . $classList . ', " ' . $old . ' ") and not(contains(' . $classList . ', " ' . $data['replace'] . ' "))]';

            foreach ($xp->query($query) as $el) {
                $el->setAttribute('class', $el->getAttribute('class') . ' ' . $data['replace']);
            }
        }
    }

    return $this;
}
/**
 * XPath Query
 *
 * Runs an XPath query over the current document by delegating to
 * DOMXPath::query on the parser's XPath object.
 *
 * @param string $expression The XPath expression to evaluate
 * @param DOMNode $context Optional context node for relative expressions
 * @return DOMNodeList Whatever DOMXPath::query returns for the expression
 */
public function query($expression, $context = null) {
    $nodes = $this->xpath->query($expression, $context);

    return $nodes;
}
/**
 * Classic Root Classname map
 *
 * Keys are classic (mf1) root class names, values the microformats2 root
 * class names they are upgraded to.
 *
 * @var array
 */
public $classicRootMap = array(
    'vcard'    => 'h-card',
    'hfeed'    => 'h-feed',
    'hentry'   => 'h-entry',
    'hrecipe'  => 'h-recipe',
    'hresume'  => 'h-resume',
    'vevent'   => 'h-event',
    'hreview'  => 'h-review',
    'hproduct' => 'h-product',
    'adr'      => 'h-adr',
);
- /**
- * Mapping of mf1 properties to mf2 and the context they're parsed with
- * @var array
- */
- public $classicPropertyMap = array(
- 'vcard' => array(
- 'fn' => array(
- 'replace' => 'p-name'
- ),
- 'honorific-prefix' => array(
- 'replace' => 'p-honorific-prefix'
- ),
- 'given-name' => array(
- 'replace' => 'p-given-name'
- ),
- 'additional-name' => array(
- 'replace' => 'p-additional-name'
- ),
- 'family-name' => array(
- 'replace' => 'p-family-name'
- ),
- 'honorific-suffix' => array(
- 'replace' => 'p-honorific-suffix'
- ),
- 'nickname' => array(
- 'replace' => 'p-nickname'
- ),
- 'email' => array(
- 'replace' => 'u-email'
- ),
- 'logo' => array(
- 'replace' => 'u-logo'
- ),
- 'photo' => array(
- 'replace' => 'u-photo'
- ),
- 'url' => array(
- 'replace' => 'u-url'
- ),
- 'uid' => array(
- 'replace' => 'u-uid'
- ),
- 'category' => array(
- 'replace' => 'p-category'
- ),
- 'adr' => array(
- 'replace' => 'p-adr',
- ),
- 'extended-address' => array(
- 'replace' => 'p-extended-address'
- ),
- 'street-address' => array(
- 'replace' => 'p-street-address'
- ),
- 'locality' => array(
- 'replace' => 'p-locality'
- ),
- 'region' => array(
- 'replace' => 'p-region'
- ),
- 'postal-code' => array(
- 'replace' => 'p-postal-code'
- ),
- 'country-name' => array(
- 'replace' => 'p-country-name'
- ),
- 'label' => array(
- 'replace' => 'p-label'
- ),
- 'geo' => array(
- 'replace' => 'p-geo h-geo'
- ),
- 'latitude' => array(
- 'replace' => 'p-latitude'
- ),
- 'longitude' => array(
- 'replace' => 'p-longitude'
- ),
- 'tel' => array(
- 'replace' => 'p-tel'
- ),
- 'note' => array(
- 'replace' => 'p-note'
- ),
- 'bday' => array(
- 'replace' => 'dt-bday'
- ),
- 'key' => array(
- 'replace' => 'u-key'
- ),
- 'org' => array(
- 'replace' => 'p-org'
- ),
- 'organization-name' => array(
- 'replace' => 'p-organization-name'
- ),
- 'organization-unit' => array(
- 'replace' => 'p-organization-unit'
- ),
- 'title' => array(
- 'replace' => 'p-job-title'
- ),
- 'role' => array(
- 'replace' => 'p-role'
- ),
- 'tz' => array(
- 'replace' => 'p-tz'
- ),
- 'rev' => array(
- 'replace' => 'dt-rev'
- ),
- ),
- 'hfeed' => array(
- # nothing currently
- ),
- 'hentry' => array(
- 'entry-title' => array(
- 'replace' => 'p-name'
- ),
- 'entry-summary' => array(
- 'replace' => 'p-summary'
- ),
- 'entry-content' => array(
- 'replace' => 'e-content'
- ),
- 'published' => array(
- 'replace' => 'dt-published'
- ),
- 'updated' => array(
- 'replace' => 'dt-updated'
- ),
- 'author' => array(
- 'replace' => 'p-author h-card',
- 'context' => 'vcard',
- ),
- 'category' => array(
- 'replace' => 'p-category'
- ),
- ),
- 'hrecipe' => array(
- 'fn' => array(
- 'replace' => 'p-name'
- ),
- 'ingredient' => array(
- 'replace' => 'p-ingredient'
- /**
- * TODO: hRecipe 'value' and 'type' child mf not parsing correctly currently.
- * Per http://microformats.org/wiki/hRecipe#Property_details, they're experimental.
- */
- ),
- 'yield' => array(
- 'replace' => 'p-yield'
- ),
- 'instructions' => array(
- 'replace' => 'e-instructions'
- ),
- 'duration' => array(
- 'replace' => 'dt-duration'
- ),
- 'photo' => array(
- 'replace' => 'u-photo'
- ),
- 'summary' => array(
- 'replace' => 'p-summary'
- ),
- 'author' => array(
- 'replace' => 'p-author h-card',
- 'context' => 'vcard',
- ),
- 'nutrition' => array(
- 'replace' => 'p-nutrition'
- ),
- 'category' => array(
- 'replace' => 'p-category'
- ),
- ),
- 'hresume' => array(
- 'summary' => array(
- 'replace' => 'p-summary'
- ),
- 'contact' => array(
- 'replace' => 'p-contact h-card',
- 'context' => 'vcard',
- ),
- 'education' => array(
- 'replace' => 'p-education h-event',
- 'context' => 'vevent',
- ),
- 'experience' => array(
- 'replace' => 'p-experience h-event',
- 'context' => 'vevent',
- ),
- 'skill' => array(
- 'replace' => 'p-skill'
- ),
- 'affiliation' => array(
- 'replace' => 'p-affiliation h-card',
- 'context' => 'vcard',
- ),
- ),
- 'vevent' => array(
- 'summary' => array(
- 'replace' => 'p-name'
- ),
- 'dtstart' => array(
- 'replace' => 'dt-start'
- ),
- 'dtend' => array(
- 'replace' => 'dt-end'
- ),
- 'duration' => array(
- 'replace' => 'dt-duration'
- ),
- 'description' => array(
- 'replace' => 'p-description'
- ),
- 'url' => array(
- 'replace' => 'u-url'
- ),
- 'category' => array(
- 'replace' => 'p-category'
- ),
- 'location' => array(
- 'replace' => 'h-card',
- 'context' => 'vcard'
- ),
- 'geo' => array(
- 'replace' => 'p-location h-geo'
- ),
- ),
- 'hreview' => array(
- 'summary' => array(
- 'replace' => 'p-name'
- ),
- # fn: see item.fn below
- # photo: see item.photo below
- # url: see item.url below
- 'item' => array(
- 'replace' => 'p-item h-item',
- 'context' => 'item'
- ),
- 'reviewer' => array(
- 'replace' => 'p-author h-card',
- 'context' => 'vcard',
- ),
- 'dtreviewed' => array(
- 'replace' => 'dt-published'
- ),
- 'rating' => array(
- 'replace' => 'p-rating'
- ),
- 'best' => array(
- 'replace' => 'p-best'
- ),
- 'worst' => array(
- 'replace' => 'p-worst'
- ),
- 'description' => array(
- 'replace' => 'e-content'
- ),
- 'category' => array(
- 'replace' => 'p-category'
- ),
- ),
- 'hproduct' => array(
- 'fn' => array(
- 'replace' => 'p-name',
- ),
- 'photo' => array(
- 'replace' => 'u-photo',
- ),
- 'brand' => array(
- 'replace' => 'p-brand',
- ),
- 'category' => array(
- 'replace' => 'p-category',
- ),
- 'description' => array(
- 'replace' => 'p-description',
- ),
- 'identifier' => array(
- 'replace' => 'u-identifier',
- ),
- 'url' => array(
- 'replace' => 'u-url',
- ),
- 'review' => array(
- 'replace' => 'p-review h-review',
- ),
- 'price' => array(
- 'replace' => 'p-price'
- ),
- ),
- 'item' => array(
- 'fn' => array(
- 'replace' => 'p-name'
- ),
- 'url' => array(
- 'replace' => 'u-url'
- ),
- 'photo' => array(
- 'replace' => 'u-photo'
- ),
- ),
- 'adr' => array(
- 'post-office-box' => array(
- 'replace' => 'p-post-office-box'
- ),
- 'extended-address' => array(
- 'replace' => 'p-extended-address'
- ),
- 'street-address' => array(
- 'replace' => 'p-street-address'
- ),
- 'locality' => array(
- 'replace' => 'p-locality'
- ),
- 'region' => array(
- 'replace' => 'p-region'
- ),
- 'postal-code' => array(
- 'replace' => 'p-postal-code'
- ),
- 'country-name' => array(
- 'replace' => 'p-country-name'
- ),
- ),
- 'geo' => array(
- 'latitude' => array(
- 'replace' => 'p-latitude'
- ),
- 'longitude' => array(
- 'replace' => 'p-longitude'
- ),
- ),
- );
- }
/**
 * Split a URI (or relative reference) into the five generic components
 * used by the reference-resolution code: scheme, authority, path, query
 * and fragment.
 *
 * The authority is rebuilt from the pieces parse_url() reports, in the
 * form [user[:pass]@]host[:port]. It is only populated when a host is
 * present.
 *
 * @param string $uri URI or relative reference to split
 * @return array Map with the keys 'scheme', 'authority', 'path', 'query'
 *               and 'fragment'; a component that parse_url() does not
 *               report is null. All five are null when the input cannot
 *               be parsed at all.
 */
function parseUriToComponents($uri) {
    $result = array(
        'scheme' => null,
        'authority' => null,
        'path' => null,
        'query' => null,
        'fragment' => null
    );

    $u = @parse_url($uri);

    # parse_url() returns false for seriously malformed input. Bail out with
    # every component undefined instead of handing false to array_key_exists().
    if(!is_array($u))
        return $result;

    if(array_key_exists('scheme', $u))
        $result['scheme'] = $u['scheme'];

    if(array_key_exists('host', $u)) {
        # Userinfo comes first: user[:pass]@
        if(array_key_exists('user', $u))
            $result['authority'] = $u['user'];
        if(array_key_exists('pass', $u))
            $result['authority'] .= ':' . $u['pass'];
        if(array_key_exists('user', $u) || array_key_exists('pass', $u))
            $result['authority'] .= '@';

        $result['authority'] .= $u['host'];

        if(array_key_exists('port', $u))
            $result['authority'] .= ':' . $u['port'];
    }

    if(array_key_exists('path', $u))
        $result['path'] = $u['path'];

    if(array_key_exists('query', $u))
        $result['query'] = $u['query'];

    if(array_key_exists('fragment', $u))
        $result['fragment'] = $u['fragment'];

    return $result;
}
/**
 * Resolve a URI reference against a base URI, following the algorithm of
 * RFC 3986 section 5.2 (pre-parse the base, transform the reference,
 * recompose the components).
 *
 * For example, resolving '../c' against 'http://example.com/a/b' yields
 * 'http://example.com/c'.
 *
 * @param string $baseURI      URI the reference is relative to
 * @param string $referenceURI URI reference to resolve (absolute or relative)
 * @return string The recomposed target URI
 */
function resolveUrl($baseURI, $referenceURI) {
    # A component counts as defined when it is neither null nor the empty
    # string. Plain truthiness is not used because it would also discard the
    # legitimate value "0" (e.g. the query in "?0" or the host in "//0/").
    $isDefined = function($component) {
        return $component !== null && $component !== '';
    };

    $target = array(
        'scheme' => null,
        'authority' => null,
        'path' => null,
        'query' => null,
        'fragment' => null
    );

    # 5.2.1 Pre-parse the Base URI
    # The base URI (Base) is established according to the procedure of
    # Section 5.1 and parsed into the five main components described in
    # Section 3
    $base = parseUriToComponents($baseURI);

    # If the base path is blank (http://example.com) then set it to /
    # (not obviously mandated by the RFC, but it keeps merged paths rooted)
    if(!$isDefined($base['path']))
        $base['path'] = '/';

    # 5.2.2. Transform References
    # The URI reference is parsed into the five URI components
    # (R.scheme, R.authority, R.path, R.query, R.fragment) = parse(R);
    $reference = parseUriToComponents($referenceURI);

    # A non-strict parser may ignore a scheme in the reference
    # if it is identical to the base URI's scheme.
    # TODO

    if($isDefined($reference['scheme'])) {
        # The reference is absolute: only dot segments need cleaning up.
        $target['scheme'] = $reference['scheme'];
        $target['authority'] = $reference['authority'];
        $target['path'] = removeDotSegments($reference['path']);
        $target['query'] = $reference['query'];
    } else {
        if($isDefined($reference['authority'])) {
            # Network-path reference (//host/path): inherit the scheme only.
            $target['authority'] = $reference['authority'];
            $target['path'] = removeDotSegments($reference['path']);
            $target['query'] = $reference['query'];
        } else {
            if(!$isDefined($reference['path'])) {
                # No path in the reference: keep the base path, and the base
                # query too unless the reference supplies its own.
                $target['path'] = $base['path'];
                if($isDefined($reference['query'])) {
                    $target['query'] = $reference['query'];
                } else {
                    $target['query'] = $base['query'];
                }
            } else {
                if(substr($reference['path'], 0, 1) == '/') {
                    # Absolute-path reference replaces the base path.
                    $target['path'] = removeDotSegments($reference['path']);
                } else {
                    # Relative-path reference is merged onto the base path.
                    $target['path'] = mergePaths($base, $reference);
                    $target['path'] = removeDotSegments($target['path']);
                }
                $target['query'] = $reference['query'];
            }
            $target['authority'] = $base['authority'];
        }
        $target['scheme'] = $base['scheme'];
    }
    $target['fragment'] = $reference['fragment'];

    # 5.3 Component Recomposition
    $result = '';
    if($isDefined($target['scheme'])) {
        $result .= $target['scheme'] . ':';
    }
    if($isDefined($target['authority'])) {
        $result .= '//' . $target['authority'];
    }
    $result .= $target['path'];
    if($isDefined($target['query'])) {
        $result .= '?' . $target['query'];
    }
    if($isDefined($target['fragment'])) {
        $result .= '#' . $target['fragment'];
    } elseif($referenceURI == '#') {
        # A bare "#" reference carries an empty fragment; keep the marker.
        $result .= '#';
    }
    return $result;
}
# 5.2.3 Merge Paths
#
# Merge a relative-path reference with the path of the base URI.
#
# $base and $reference are component arrays with (at least) the keys
# 'authority' and 'path' for $base and 'path' for $reference.
# Returns the merged path as a string; dot segments are NOT removed here.
function mergePaths($base, $reference) {
    $basePath = (string) $base['path'];
    # Compare explicitly so that an authority of "0" still counts as defined.
    $hasAuthority = $base['authority'] !== null && $base['authority'] !== '';

    # If the base URI has a defined authority component and an empty
    # path, then return a string consisting of "/" concatenated with the
    # reference's path; otherwise,
    if($hasAuthority && $basePath === '') {
        return '/' . $reference['path'];
    }

    $pos = strrpos($basePath, '/');
    if($pos !== false) {
        # return a string consisting of the reference's path component
        # appended to all but the last segment of the base URI's path (i.e.,
        # excluding any characters after the right-most "/" in the base URI
        # path,
        return substr($basePath, 0, $pos + 1) . $reference['path'];
    }

    # or excluding the entire base URI path if it does not contain
    # any "/" characters). Nothing of the base path survives, so the merged
    # path is just the reference's path.
    return (string) $reference['path'];
}
# 5.2.4.A Remove leading ../ or ./
#
# Strips one leading "../" or "./" prefix from the input buffer, which is
# modified in place. The buffer is left untouched when it starts with
# neither prefix.
function removeLeadingDotSlash(&$input) {
    foreach(array('../', './') as $prefix) {
        $prefixLength = strlen($prefix);
        if(substr($input, 0, $prefixLength) == $prefix) {
            $input = substr($input, $prefixLength);
            return;
        }
    }
}
# 5.2.4.B Replace leading /. with /
#
# Replaces a leading "/./" (or, failing that, the first two characters,
# expected to be "/.") of the input buffer with a single "/". The buffer
# is modified in place; the caller is responsible for only invoking this
# when the buffer really starts with one of those prefixes.
function removeLeadingSlashDot(&$input) {
    $prefixLength = (substr($input, 0, 3) == '/./') ? 3 : 2;
    $input = '/' . substr($input, $prefixLength);
}
# 5.2.4.C Given leading /../ remove component from output buffer
#
# Replaces a leading "/../" (or, failing that, the first three characters,
# expected to be "/..") of the input buffer with "/", then drops the last
# segment and its preceding "/" from the output buffer. Both buffers are
# modified in place.
function removeOneDirLevel(&$input, &$output) {
    $prefixLength = (substr($input, 0, 4) == '/../') ? 4 : 3;
    $input = '/' . substr($input, $prefixLength);

    # When the output holds no "/", strrpos() yields false, which becomes a
    # length of 0 and therefore empties the output buffer entirely.
    $keepLength = (int) strrpos($output, '/');
    $output = substr($output, 0, $keepLength);
}
# 5.2.4.D Remove . and .. if it's the only thing in the input
#
# Drops the first character of the input buffer when it is exactly ".",
# otherwise drops the first two characters (the ".." case). The buffer is
# modified in place.
function removeLoneDotDot(&$input) {
    $dropLength = ($input == '.') ? 1 : 2;
    $input = substr($input, $dropLength);
}
# 5.2.4.E Move one segment from input to output
#
# Moves the first path segment of the input buffer (including its leading
# "/", if it has one) to the end of the output buffer. Everything from the
# next "/" onwards stays in the input buffer; if there is no further "/",
# the whole input is moved and the input buffer becomes empty. Both buffers
# are modified in place.
function moveOneSegmentFromInput(&$input, &$output) {
    # Skip over a leading "/" so that it is treated as part of the segment.
    $searchFrom = (substr($input, 0, 1) != '/') ? 0 : 1;
    $nextSlash = strpos($input, '/', $searchFrom);

    if($nextSlash !== false) {
        $output .= substr($input, 0, $nextSlash);
        $input = substr($input, $nextSlash);
        return;
    }

    $output .= $input;
    $input = '';
}
# 5.2.4 Remove Dot Segments
#
# Interprets and removes the special "." and ".." segments of a path,
# following the loop described in RFC 3986 section 5.2.4.
#
# $path is the path to normalise; null or an empty string yields ''.
# Returns the path with all dot segments resolved.
function removeDotSegments($path) {
    # 1. The input buffer is initialized with the now-appended path
    #    components and the output buffer is initialized to the empty
    #    string.
    $input = $path;
    $output = '';

    # 2. While the input buffer is not empty, loop as follows:
    #    (compared as a string so that a remaining "0" is not mistaken
    #    for an empty buffer)
    while((string) $input !== '') {
        if(substr($input, 0, 3) == '../' || substr($input, 0, 2) == './') {
            # A. If the input buffer begins with a prefix of "../" or "./",
            #    then remove that prefix from the input buffer; otherwise,
            removeLeadingDotSlash($input);
        } elseif(substr($input, 0, 3) == '/./' || $input == '/.') {
            # B. if the input buffer begins with a prefix of "/./" or "/.",
            #    where "." is a complete path segment, then replace that
            #    prefix with "/" in the input buffer; otherwise,
            removeLeadingSlashDot($input);
        } elseif(substr($input, 0, 4) == '/../' || $input == '/..') {
            # C. if the input buffer begins with a prefix of "/../" or "/..",
            #    where ".." is a complete path segment, then replace that
            #    prefix with "/" in the input buffer and remove the last
            #    segment and its preceding "/" (if any) from the output
            #    buffer; otherwise,
            removeOneDirLevel($input, $output);
        } elseif($input == '.' || $input == '..') {
            # D. if the input buffer consists only of "." or "..", then remove
            #    that from the input buffer; otherwise,
            removeLoneDotDot($input);
        } else {
            # E. move the first path segment in the input buffer to the end of
            #    the output buffer and any subsequent characters up to, but not
            #    including, the next "/" character or the end of the input buffer
            moveOneSegmentFromInput($input, $output);
        }
    }

    # 3. Finally, the output buffer is returned as the result.
    return $output;
}
|