Parser.php 68 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314
  1. <?php
  2. namespace Mf2;
  3. use DOMDocument;
  4. use DOMElement;
  5. use DOMXPath;
  6. use DOMNode;
  7. use DOMNodeList;
  8. use Exception;
  9. use SplObjectStorage;
  10. use stdClass;
  /**
   * Parse Microformats2
   *
   * Convenience entry point: builds a Parser for the given input and runs it
   * straight away.
   *
   * Example usage:
   *
   *     use Mf2;
   *     $output = Mf2\parse('<span class="h-card">Barnaby Walters</span>');
   *     echo json_encode($output, JSON_PRETTY_PRINT);
   *
   * Produces:
   *
   *     {
   *       "items": [
   *         {
   *           "type": ["h-card"],
   *           "properties": {
   *             "name": ["Barnaby Walters"]
   *           }
   *         }
   *       ],
   *       "rels": {}
   *     }
   *
   * @param string|DOMDocument $input The HTML string or DOMDocument object to parse
   * @param string $url The URL the input document was found at, for relative URL resolution
   * @param bool $convertClassic Whether or not to convert classic microformats
   * @return array Canonical MF2 array structure
   */
  function parse($input, $url = null, $convertClassic = true) {
      return (new Parser($input, $url))->parse($convertClassic);
  }
  /**
   * Fetch microformats2
   *
   * Given a URL, fetches it (following up to 5 redirects) and, if the content-type appears to be HTML, returns the parsed
   * microformats2 array structure.
   *
   * Note that even if the response code was a 4XX or 5XX error, if the content-type is HTML-like then it will be parsed
   * all the same, as there are legitimate cases where error pages might contain useful microformats (for example a deleted
   * h-entry resulting in a 410 Gone page with a stub h-entry explaining the reason for deletion). Look in $curlInfo['http_code']
   * for the actual value.
   *
   * @param string $url The URL to fetch
   * @param bool $convertClassic (optional, default true) whether or not to convert classic microformats
   * @param &array $curlInfo (optional) the results of curl_getinfo will be placed in this variable for debugging
   * @return array|null canonical microformats2 array structure on success, null when the transfer
   *                    failed or the response was not delivered as HTML
   */
  function fetch($url, $convertClassic = true, &$curlInfo=null) {
      $ch = curl_init();
      curl_setopt($ch, CURLOPT_URL, $url);
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
      curl_setopt($ch, CURLOPT_HEADER, 0);
      curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
      curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
      curl_setopt($ch, CURLOPT_HTTPHEADER, array(
          'Accept: text/html'
      ));
      $html = curl_exec($ch);
      $info = $curlInfo = curl_getinfo($ch);
      curl_close($ch);

      // curl_exec() returns false when the transfer itself failed; there is nothing to parse.
      if ($html === false) {
          return null;
      }

      // content_type is null when the server sent no Content-Type header.
      $contentType = isset($info['content_type']) ? (string) $info['content_type'] : '';
      if (stripos($contentType, 'html') === false) {
          // The content was not delivered as HTML, do not attempt to parse it.
          return null;
      }

      // Ensure the final (post-redirect) URL is used to resolve relative URLs.
      if (!empty($info['url'])) {
          $url = $info['url'];
      }

      return parse($html, $url, $convertClassic);
  }
  /**
   * Unicode to HTML Entities
   *
   * Detects the encoding of $input with mb_detect_encoding() and converts the
   * string from that encoding to mbstring's 'HTML-ENTITIES' representation.
   *
   * @param string $input String containing characters to convert into HTML entities
   * @return string The converted string
   */
  function unicodeToHtmlEntities($input) {
      $sourceEncoding = mb_detect_encoding($input);

      return mb_convert_encoding($input, 'HTML-ENTITIES', $sourceEncoding);
  }
  /**
   * Collapse Whitespace
   *
   * Collapses any sequences of whitespace within a string into a single space
   * character. Non-whitespace characters (including "|") are left untouched.
   *
   * @deprecated since v0.2.3
   * @param string $str
   * @return string
   */
  function collapseWhitespace($str) {
      // "\s" already covers "\n"; the previous class "[\s|\n]" also matched a literal pipe.
      return preg_replace('/\s+/', ' ', $str);
  }
  /**
   * Trim whitespace, treating U+00A0 NO-BREAK SPACE as whitespace.
   *
   * Every no-break space in the string (not only leading/trailing ones) is
   * first replaced by a regular space, then leading and trailing whitespace
   * is removed.
   *
   * @param string $str UTF-8 encoded string
   * @return string
   */
  function unicodeTrim($str) {
      // this is cheating. TODO: find a better way if this causes any problems
      // "\xC2\xA0" is the UTF-8 encoding of &nbsp;, written literally so no
      // mbstring 'HTML-ENTITIES' conversion (deprecated in PHP 8.2) is needed.
      $str = str_replace("\xC2\xA0", ' ', $str);

      return preg_replace('/^\s+|\s+$/', '', $str);
  }
  /**
   * Microformat Names From Class string
   *
   * Given the value of @class, get the relevant mf classnames (e.g. h-card,
   * p-name). Class names may be separated by any run of HTML whitespace
   * (space, tab, LF, FF, CR).
   *
   * @param string $class A whitespace delimited list of classnames
   * @param string $prefix The prefix to look for
   * @return array For the 'h-' prefix the full matching classnames; for any other
   *               prefix the matching names with the prefix stripped. Empty if none match.
   */
  function mfNamesFromClass($class, $prefix='h-') {
      $classes = preg_split('/[\t\n\f\r ]+/', (string) $class, -1, PREG_SPLIT_NO_EMPTY);
      // Keep only tokens shaped like microformats2 class names.
      $classes = preg_grep('#^(h|p|u|dt|e)-([a-z0-9]+-)?[a-z]+(-[a-z]+)*$#', $classes);

      $prefixLength = strlen($prefix);
      $matches = array();

      foreach ($classes as $classname) {
          // The class must start with the prefix and have something after it.
          if (strncmp($classname, $prefix, $prefixLength) === 0 && $classname !== $prefix) {
              $matches[] = ($prefix === 'h-') ? $classname : substr($classname, $prefixLength);
          }
      }

      return $matches;
  }
  /**
   * Get Nested µf Property Name From Class
   *
   * Returns all the p-, u-, dt- or e- prefixed classnames it finds in a
   * whitespace-separated string (space, tab, LF, FF, CR all separate names).
   *
   * @param string $class
   * @return array Map of property name (prefix stripped) to the de-duplicated list of
   *               prefixes it was found with, e.g. array('name' => array('p-'))
   */
  function nestedMfPropertyNamesFromClass($class) {
      $prefixes = array('p-', 'u-', 'dt-', 'e-');
      $propertyNames = array();

      $classnames = preg_split('/[\t\n\f\r ]+/', (string) $class, -1, PREG_SPLIT_NO_EMPTY);

      foreach ($classnames as $classname) {
          foreach ($prefixes as $prefix) {
              // Check if $classname is a valid property classname for $prefix.
              if (mb_substr($classname, 0, mb_strlen($prefix)) == $prefix && $classname != $prefix) {
                  $propertyName = mb_substr($classname, mb_strlen($prefix));
                  $propertyNames[$propertyName][] = $prefix;
              }
          }
      }

      // The same class may be listed more than once; report each prefix once per property.
      foreach ($propertyNames as $property => $foundPrefixes) {
          $propertyNames[$property] = array_unique($foundPrefixes);
      }

      return $propertyNames;
  }
  /**
   * Element-based convenience wrapper around mfNamesFromClass().
   *
   * Reads the element's class attribute and passes it, together with $prefix,
   * to mfNamesFromClass().
   *
   * @param DOMElement $e The element whose class attribute is inspected
   * @param string $prefix The prefix to look for
   * @return mixed Whatever mfNamesFromClass() returns for the element's class attribute
   */
  function mfNamesFromElement(\DOMElement $e, $prefix = 'h-') {
      return mfNamesFromClass($e->getAttribute('class'), $prefix);
  }
  /**
   * Element-based convenience wrapper around nestedMfPropertyNamesFromClass().
   *
   * @param DOMElement $e The element whose class attribute is inspected
   * @return mixed Whatever nestedMfPropertyNamesFromClass() returns for the element's class attribute
   */
  function nestedMfPropertyNamesFromElement(\DOMElement $e) {
      return nestedMfPropertyNamesFromClass($e->getAttribute('class'));
  }
  /**
   * Converts 12-hour (am/pm) times to 24-hour HH:MM or HH:MM:SS.
   *
   * Input without an am/pm marker is returned unchanged. With a marker, the
   * hour is converted to the 24-hour clock (12am becomes 00, 12pm stays 12),
   * missing minutes default to "00", and seconds are only included when they
   * were supplied.
   *
   * @param string $time The time to convert
   * @return string
   */
  function convertTimeFormat($time) {
      preg_match('/(\d{1,2}):?(\d{2})?:?(\d{2})?(a\.?m\.?|p\.?m\.?)?/i', $time, $matches);

      // If no am/pm is specified there is nothing to convert.
      if (empty($matches[4])) {
          return $time;
      }

      // Normalise "a.m.", "PM", "p.m" etc. to "am" / "pm".
      $meridiem = strtolower(str_replace('.', '', $matches[4]));

      $hours = (int) $matches[1];
      if ($meridiem === 'pm' && $hours < 12) {
          $hours += 12;
      } elseif ($meridiem === 'am' && $hours === 12) {
          // 12am is midnight, i.e. hour 00 on the 24-hour clock.
          $hours = 0;
      }
      $hh = str_pad((string) $hours, 2, '0', STR_PAD_LEFT);

      $mm = empty($matches[2]) ? '00' : $matches[2];

      // Seconds, only if supplied.
      if (empty($matches[3])) {
          return sprintf('%s:%s', $hh, $mm);
      }

      return sprintf('%s:%s:%s', $hh, $mm, $matches[3]);
  }
  /**
   * Normalize an ordinal date (YYYY-DDD) to YYYY-MM-DD
   *
   * This function should only be called after validating the $dtValue
   * matches regex \d{4}-\d{3}. Input that cannot be split or parsed, and day
   * numbers that do not exist in the given year (0, 366 in a non-leap year,
   * anything above 366), yield an empty string.
   *
   * @param string $dtValue
   * @return string The calendar date, or '' if the ordinal date is not valid
   */
  function normalizeOrdinalDate($dtValue) {
      $parts = explode('-', $dtValue, 2);
      if (count($parts) !== 2) {
          return '';
      }

      list($year, $day) = $parts;
      $day = intval($day);
      if ($day < 1 || $day > 366) {
          return '';
      }

      $date = \DateTime::createFromFormat('Y-z', $dtValue);
      if ($date === false) {
          return '';
      }

      $date->modify('-1 day'); # 'z' format is zero-based so need to adjust

      // Day 366 of a non-leap year rolls over into the next year; reject it.
      if ($date->format('Y') !== $year) {
          return '';
      }

      return $date->format('Y-m-d');
  }
  /**
   * If a date value ends in a timezone offset, normalize it to +HHMM / -HHMM.
   *
   * The offset at the end of $dtValue is rewritten in place (e.g. "+1" or
   * "+01:00" both become "+0100"). A trailing "Z" (either case) is left alone.
   *
   * @param string $dtValue Date/time string, modified by reference when an offset is normalized
   * @return string|null isolated, normalized TZ offset for implied TZ for other dt- properties;
   *                     null when the value has no offset or ends in "Z"
   */
  function normalizeTimezoneOffset(&$dtValue) {
      // The whole alternation is anchored to the end of the string, so a "Z"
      // elsewhere in the value is not mistaken for a timezone designator.
      if (!preg_match('/(Z|[+-]\d{1,2}:?(\d{2})?)$/i', $dtValue, $matches)) {
          return null;
      }

      // UTC designator: nothing to normalize, and no numeric offset to report.
      if (strtoupper($matches[0]) === 'Z') {
          return null;
      }

      $timezoneString = str_replace(':', '', $matches[0]);
      $plusMinus = substr($timezoneString, 0, 1);
      $digits = substr($timezoneString, 1);

      // Hours only ("1" or "01"): append zero minutes.
      if (strlen($digits) <= 2) {
          $digits .= '00';
      }

      // Left-pad single-digit hours so the result is always four digits.
      $timezoneOffset = $plusMinus . str_pad($digits, 4, '0', STR_PAD_LEFT);

      $dtValue = preg_replace('/Z?[+-]\d{1,2}:?(\d{2})?$/i', $timezoneOffset, $dtValue);

      return $timezoneOffset;
  }
  /**
   * Apply a URL transformation to every image candidate in a srcset value.
   *
   * The srcset is split on commas; within each candidate the URL is separated
   * from its optional descriptor (e.g. "2x", "480w") at the first run of
   * whitespace. Only the URL is passed to $transformation; the descriptor is
   * re-attached unchanged. Empty candidates are dropped.
   *
   * NOTE(review): splitting on "," matches the previous behaviour but will
   * break candidates whose URL itself contains a comma.
   *
   * @param string $srcset Value of a srcset attribute
   * @param callable $transformation Receives a URL string, returns the transformed URL
   * @return string The rebuilt srcset, candidates joined with ", "
   */
  function applySrcsetUrlTransformation($srcset, $transformation) {
      $candidates = array();

      foreach (explode(',', trim($srcset)) as $srcsetPart) {
          // Split on a run of any whitespace character; explode() would treat
          // the character list as one literal multi-character delimiter.
          $parts = preg_split('/[ \t\n\r\x00\x0B]+/', trim($srcsetPart), 2);

          if ($parts[0] === '') {
              continue;
          }

          $candidate = call_user_func($transformation, $parts[0]);

          if (isset($parts[1]) && $parts[1] !== '') {
              $candidate .= ' ' . $parts[1];
          }

          $candidates[] = $candidate;
      }

      return implode(', ', $candidates);
  }
  265. /**
  266. * Microformats2 Parser
  267. *
  268. * A class which holds state for parsing microformats2 from HTML.
  269. *
  270. * Example usage:
  271. *
  272. * use Mf2;
  273. * $parser = new Mf2\Parser('<p class="h-card">Barnaby Walters</p>');
  274. * $output = $parser->parse();
  275. */
  276. class Parser {
  277. /** @var string The baseurl (if any) to use for this parse */
  278. public $baseurl;
  279. /** @var DOMXPath object which can be used to query over any fragment*/
  280. public $xpath;
  281. /** @var DOMDocument */
  282. public $doc;
  283. /** @var SplObjectStorage */
  284. protected $parsed;
  285. /**
  286. * @var bool
  287. */
  288. public $jsonMode;
  289. /** @var boolean Whether to include experimental language parsing in the result */
  290. public $lang = false;
  291. /** @var bool Whether to include alternates object (dropped from spec in favor of rel-urls) */
  292. public $enableAlternates = false;
  293. /**
  294. * Elements upgraded to mf2 during backcompat
  295. * @var SplObjectStorage
  296. */
  297. protected $upgraded;
  298. /**
  299. * Whether to convert classic microformats
  300. * @var bool
  301. */
  302. public $convertClassic;
  /**
   * Constructor
   *
   * Builds the DOM for the input, determines the base URL (the first
   * <base href> element wins over $url; a relative base href is resolved
   * against $url), and removes <template> elements from the document.
   *
   * Empty or unsupported input is parsed as an empty HTML skeleton rather
   * than an empty string, because DOMDocument::loadHTML('') throws a
   * ValueError on PHP 8 (which the @ operator does not suppress).
   *
   * @param DOMDocument|string $input The data to parse. A string of HTML or a DOMDocument
   * @param string $url The URL of the parsed document, for relative URL resolution
   * @param boolean $jsonMode Whether or not to use a stdClass instance for an empty `rels` dictionary. This breaks PHP looping over rels, but allows the output to be correctly serialized as JSON.
   */
  public function __construct($input, $url = null, $jsonMode = false) {
      libxml_use_internal_errors(true);

      $emptyDocDefault = '<html><body></body></html>';

      if (is_string($input)) {
          if ($input === '') {
              $input = $emptyDocDefault;
          }

          if (class_exists('Masterminds\\HTML5')) {
              $html5 = new \Masterminds\HTML5(array('disable_html_ns' => true));
              $doc = $html5->loadHTML($input);
          } else {
              $doc = new DOMDocument();
              @$doc->loadHTML(unicodeToHtmlEntities($input));
          }
      } elseif ($input instanceof DOMDocument) {
          // Work on a copy: the parser mutates the document (URL resolution, template removal).
          $doc = clone $input;
      } else {
          $doc = new DOMDocument();
          @$doc->loadHTML($emptyDocDefault);
      }

      $this->xpath = new DOMXPath($doc);

      $baseurl = $url;
      // Only the first <base href> element is honoured.
      foreach ($this->xpath->query('//base[@href]') as $base) {
          $baseElementUrl = $base->getAttribute('href');

          if (parse_url($baseElementUrl, PHP_URL_SCHEME) === null) {
              // The base element URL is relative to the document URL.
              $baseurl = resolveUrl($url, $baseElementUrl);
          } else {
              $baseurl = $baseElementUrl;
          }
          break;
      }

      // Ignore <template> elements as per the HTML5 spec
      foreach ($this->xpath->query('//template') as $templateEl) {
          if ($templateEl->parentNode !== null) {
              $templateEl->parentNode->removeChild($templateEl);
          }
      }

      $this->baseurl = $baseurl;
      $this->doc = $doc;
      $this->parsed = new SplObjectStorage();
      $this->upgraded = new SplObjectStorage();
      $this->jsonMode = $jsonMode;
  }
  /**
   * Record that $prefix has been parsed for element $e.
   *
   * The list of prefixes already stored for the element (if any) is extended
   * with $prefix and written back to the $parsed storage.
   *
   * @param DOMElement $e
   * @param string $prefix
   */
  private function elementPrefixParsed(\DOMElement $e, $prefix) {
      $seenPrefixes = $this->parsed->contains($e) ? $this->parsed[$e] : array();
      $seenPrefixes[] = $prefix;

      $this->parsed[$e] = $seenPrefixes;
  }
  /**
   * Tell whether $prefix has already been recorded as parsed for the element.
   *
   * @param DOMElement $e
   * @param string $prefix
   * @return bool true only if the element is in the $parsed storage and $prefix is in its list
   */
  private function isElementParsed(\DOMElement $e, $prefix) {
      return $this->parsed->contains($e) && in_array($prefix, $this->parsed[$e]);
  }
  /**
   * Tell whether the given property of the element was already upgraded during backcompat.
   *
   * @param DOMElement $el
   * @param string $property
   * @return bool true only if the element is in the $upgraded storage and $property is in its list
   */
  private function isElementUpgraded(\DOMElement $el, $property) {
      return $this->upgraded->contains($el) && in_array($property, $this->upgraded[$el]);
  }
  /**
   * Resolve the URL-bearing attributes of every descendant of $el in place.
   *
   * href, src and data are passed through resolveUrl(); srcset has each of
   * its candidate URLs resolved via applySrcsetUrlTransformation().
   *
   * @param DOMElement $el Element whose descendants are rewritten
   */
  private function resolveChildUrls(DOMElement $el) {
      // @srcset is part of the query so elements carrying only srcset are rewritten too.
      $hyperlinkChildren = $this->xpath->query('.//*[@src or @href or @data or @srcset]', $el);

      foreach ($hyperlinkChildren as $child) {
          foreach (array('href', 'src', 'data') as $attribute) {
              if ($child->hasAttribute($attribute)) {
                  $child->setAttribute($attribute, $this->resolveUrl($child->getAttribute($attribute)));
              }
          }

          if ($child->hasAttribute('srcset')) {
              // Transform the srcset value itself (previously the href value was used by mistake).
              $child->setAttribute(
                  'srcset',
                  applySrcsetUrlTransformation($child->getAttribute('srcset'), array($this, 'resolveUrl'))
              );
          }
      }
  }
  /**
   * Plain-text value of an element.
   *
   * Serialises the element with elementToString() and then cleans up the
   * whitespace: leading and trailing whitespace is removed, spaces directly
   * before or after a newline are removed, and every space that is followed
   * by another space is removed.
   *
   * @param DOMElement $element
   * @param bool $implied Passed through to elementToString()
   * @return string
   * @see https://wiki.zegnat.net/media/textparsing.html
   **/
  public function textContent(DOMElement $element, $implied=false)
  {
      $serialised = $this->elementToString($element, $implied);
      $whitespaceCleanup = '/(^[\t\n\f\r ]+| +(?=\n)|(?<=\n) +| +(?= )|[\t\n\f\r ]+$)/';

      return preg_replace($whitespaceCleanup, '', $serialised);
  }
  /**
   * Serialise the children of an element to text for textContent().
   *
   * - Text nodes are appended with tab, LF and CR replaced by spaces.
   * - <script> and <style> elements are skipped.
   * - <img> contributes its alt text (space-padded) if it has an alt
   *   attribute; otherwise, unless $implied is set, its resolved src.
   * - <br> contributes a newline; <p> contributes a newline followed by its
   *   own serialised content.
   * - Any other element contributes its serialised content.
   *
   * $implied is forwarded to nested elements so that an <img src> is
   * suppressed at every depth, not only among the direct children.
   *
   * @param DOMElement $input
   * @param bool $implied When true, <img> elements without alt contribute nothing
   * @return string Text before whitespace clean-up
   */
  private function elementToString(DOMElement $input, $implied=false)
  {
      $output = '';

      foreach ($input->childNodes as $child) {
          if ($child->nodeType === XML_TEXT_NODE) {
              $output .= str_replace(array("\t", "\n", "\r"), ' ', $child->textContent);
              continue;
          }

          if ($child->nodeType !== XML_ELEMENT_NODE) {
              continue;
          }

          $tagName = strtoupper($child->tagName);

          if ($tagName === 'SCRIPT' || $tagName === 'STYLE') {
              continue;
          }

          if ($tagName === 'IMG') {
              if ($child->hasAttribute('alt')) {
                  $output .= ' ' . trim($child->getAttribute('alt'), "\t\n\f\r ") . ' ';
              } elseif (!$implied && $child->hasAttribute('src')) {
                  $output .= ' ' . $this->resolveUrl(trim($child->getAttribute('src'), "\t\n\f\r ")) . ' ';
              }
          } elseif ($tagName === 'BR') {
              $output .= "\n";
          } elseif ($tagName === 'P') {
              $output .= "\n" . $this->elementToString($child, $implied);
          } else {
              $output .= $this->elementToString($child, $implied);
          }
      }

      return $output;
  }
  /**
   * Determine the language that applies to an element.
   *
   * Walks from the element up through its ancestors and returns the trimmed
   * value of the first lang attribute found. If the <html> element is
   * reached without one, the content of the first
   * <meta http-equiv="Content-Language"> element that has a content
   * attribute is used instead.
   *
   * @param DOMElement $el
   * @access public
   * @return string The language, or '' if none could be determined
   */
  public function language(DOMElement $el)
  {
      $current = $el;

      while (true) {
          // element has a lang attribute; use it
          if ($current->hasAttribute('lang')) {
              return unicodeTrim($current->getAttribute('lang'));
          }

          // reached <html> without finding lang; fall through to the <meta> check
          if ($current->tagName == 'html') {
              break;
          }

          // no element parent left to inspect
          if (!($current->parentNode instanceof DOMElement)) {
              return '';
          }

          $current = $current->parentNode;
      }

      foreach ($this->xpath->query('.//meta[@http-equiv]') as $node) {
          $isContentLanguage = $node->hasAttribute('http-equiv')
              && $node->hasAttribute('content')
              && strtolower($node->getAttribute('http-equiv')) == 'content-language';

          if ($isContentLanguage) {
              return unicodeTrim($node->getAttribute('content'));
          }
      }

      return '';
  }
  /**
   * Resolve a URL against the parser's base URL.
   *
   * URLs that parse_url() rejects are returned untouched. Otherwise the URL
   * is trimmed and, if it has no scheme and a base URL is set, resolved
   * against that base URL; in every other case the trimmed URL is returned.
   *
   * TODO: figure out if this has problems with sms: and geo: URLs
   *
   * @param string $url
   * @return string
   */
  public function resolveUrl($url) {
      // If the URL is seriously malformed it's probably beyond the scope of this
      // parser to try to do anything with it.
      if (parse_url($url) === false) {
          return $url;
      }

      // per issue #40 valid URLs could have a space on either side
      $url = trim($url);

      $hasScheme = !empty(parse_url($url, PHP_URL_SCHEME));
      $hasBase = !empty($this->baseurl);

      if (!$hasScheme && $hasBase) {
          return resolveUrl($this->baseurl, $url);
      }

      return $url;
  }
  485. // Parsing Functions
  /**
   * Parse value-class/value-title on an element.
   *
   * Direct children with class "value" take precedence: their text content
   * is concatenated and trimmed. Failing that, the title attributes of
   * direct children with class "value-title" are concatenated and trimmed.
   *
   * @param \DOMElement $e
   * @param string $separator Accepted for API compatibility; the current implementation
   *                          concatenates the parts directly and does not use it
   * @return string|null the parsed value or null if value-class or -title aren't in use
   */
  public function parseValueClassTitle(\DOMElement $e, $separator = '') {
      $valueNodes = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ")]', $e);

      if ($valueNodes->length > 0) {
          $pieces = array();
          foreach ($valueNodes as $node) {
              $pieces[] = $this->textContent($node);
          }

          return unicodeTrim(implode('', $pieces));
      }

      $titleNodes = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value-title ")]', $e);

      if ($titleNodes->length > 0) {
          $pieces = array();
          foreach ($titleNodes as $node) {
              $pieces[] = $node->getAttribute('title');
          }

          return unicodeTrim(implode('', $pieces));
      }

      // Neither value-class nor value-title is used on this element.
      return null;
  }
  /**
   * Given an element with class="p-*", get its value
   *
   * A value-class/value-title result wins if present. Otherwise child URLs
   * are resolved in place and the value is taken from, in order: @alt on
   * img/area, @title on abbr/link, @value on data/input, and finally the
   * element's plain text content.
   *
   * @param DOMElement $p The element to parse
   * @return string The plaintext value of $p, dependant on type
   */
  public function parseP(\DOMElement $p) {
      $fromValueClass = $this->parseValueClassTitle($p, ' ');
      if ($fromValueClass !== null) {
          return $fromValueClass;
      }

      $this->resolveChildUrls($p);

      $tag = $p->tagName;

      if (($tag == 'img' || $tag == 'area') && $p->hasAttribute('alt')) {
          return $p->getAttribute('alt');
      }

      if (($tag == 'abbr' || $tag == 'link') && $p->hasAttribute('title')) {
          return $p->getAttribute('title');
      }

      if (in_array($tag, array('data', 'input')) && $p->hasAttribute('value')) {
          return $p->getAttribute('value');
      }

      return $this->textContent($p);
  }
  /**
   * Given an element with class="u-*", get the value of the URL
   *
   * The raw value is taken from the first applicable source, in order:
   * @href on a/area/link, @src on img/audio/video/source, @poster on video,
   * @data on object, a value-class/value-title result, @title on abbr/link,
   * @value on data/input, and finally the element's plain text content.
   * The result is then passed through resolveUrl().
   *
   * @param DOMElement $u The element to parse
   * @return string The resolved URL value of $u, dependant on type
   */
  public function parseU(\DOMElement $u) {
      $tag = $u->tagName;

      if (in_array($tag, array('a', 'area', 'link')) && $u->hasAttribute('href')) {
          $raw = $u->getAttribute('href');
      } elseif (in_array($tag, array('img', 'audio', 'video', 'source')) && $u->hasAttribute('src')) {
          $raw = $u->getAttribute('src');
      } elseif ($tag == 'video' && !$u->hasAttribute('src') && $u->hasAttribute('poster')) {
          $raw = $u->getAttribute('poster');
      } elseif ($tag == 'object' && $u->hasAttribute('data')) {
          $raw = $u->getAttribute('data');
      } elseif (($fromValueClass = $this->parseValueClassTitle($u)) !== null) {
          $raw = $fromValueClass;
      } elseif (($tag == 'abbr' || $tag == 'link') && $u->hasAttribute('title')) {
          $raw = $u->getAttribute('title');
      } elseif (in_array($tag, array('data', 'input')) && $u->hasAttribute('value')) {
          $raw = $u->getAttribute('value');
      } else {
          $raw = $this->textContent($u);
      }

      return $this->resolveUrl($raw);
  }
  569. /**
  570. * Given an element with class="dt-*", get the value of the datetime as a php date object
  571. *
  572. * @param DOMElement $dt The element to parse
  573. * @param array $dates Array of dates processed so far
  574. * @param string $impliedTimezone
  575. * @return string The datetime string found
  576. */
  public function parseDT(\DOMElement $dt, &$dates = array(), &$impliedTimezone = null) {
      // Extracts the datetime string of a dt-* element.
      //
      // $dt              the element carrying the dt-* class
      // $dates           by-reference list of date parts (YYYY-MM-DD) seen so far; dates found here are
      //                  appended, and the most recent one completes a time-only value
      // $impliedTimezone by-reference; set to the first timezone offset encountered when not already set
      //
      // Returns the datetime string. Returns false when value-class children exist but none of them
      // yields a recognisable date, time or timezone part.

      // Check for value-class pattern
      $valueClassChildren = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ") or contains(concat(" ", @class, " "), " value-title ")]', $dt);
      $dtValue = false;

      if ($valueClassChildren->length > 0) {
          // They're using value-class: collect the raw text of every part first
          $dateParts = array();

          foreach ($valueClassChildren as $e) {
              if (strstr(' ' . $e->getAttribute('class') . ' ', ' value-title ')) {
                  $title = $e->getAttribute('title');
                  if (!empty($title)) {
                      $dateParts[] = $title;
                  }
              } elseif ($e->tagName == 'img' or $e->tagName == 'area') {
                  // Use @alt
                  $alt = $e->getAttribute('alt');
                  if (!empty($alt)) {
                      $dateParts[] = $alt;
                  }
              } elseif ($e->tagName == 'data') {
                  // Use @value, otherwise innertext
                  $value = $e->hasAttribute('value') ? $e->getAttribute('value') : unicodeTrim($e->nodeValue);
                  if (!empty($value)) {
                      $dateParts[] = $value;
                  }
              } elseif ($e->tagName == 'abbr') {
                  // Use @title, otherwise innertext
                  $title = $e->hasAttribute('title') ? $e->getAttribute('title') : unicodeTrim($e->nodeValue);
                  if (!empty($title)) {
                      $dateParts[] = $title;
                  }
              } elseif ($e->tagName == 'del' or $e->tagName == 'ins' or $e->tagName == 'time') {
                  // Use @datetime if available, otherwise innertext
                  $dtAttr = ($e->hasAttribute('datetime')) ? $e->getAttribute('datetime') : unicodeTrim($e->nodeValue);
                  if (!empty($dtAttr)) {
                      $dateParts[] = $dtAttr;
                  }
              } else {
                  if (!empty($e->nodeValue)) {
                      $dateParts[] = unicodeTrim($e->nodeValue);
                  }
              }
          }

          // Look through dateParts; only the first date, time and timezone found are used
          $datePart = '';
          $timePart = '';
          $timezonePart = '';
          // The timezone must be glued onto the time exactly once, however many parts follow
          $timezoneAppended = false;

          foreach ($dateParts as $part) {
              // Is this part a full ISO8601 datetime?
              if (preg_match('/^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}(:\d{2})?(Z|[+-]\d{2}:?\d{2})?$/', $part)) {
                  // Break completely, we've got our value.
                  $dtValue = $part;
                  break;
              }

              // Is the current part a valid time(+TZ?) AND no other time representation has been found?
              if ((preg_match('/^\d{1,2}:\d{2}(:\d{2})?(Z|[+-]\d{1,2}:?\d{2})?$/', $part) or preg_match('/^\d{1,2}(:\d{2})?(:\d{2})?[ap]\.?m\.?$/i', $part)) and empty($timePart)) {
                  $timePart = $part;

                  // NOTE(review): normalizeTimezoneOffset() is defined elsewhere; it is assumed to return
                  // the offset found in its argument (falsy when there is none) - confirm against the helper
                  $timezoneOffset = normalizeTimezoneOffset($timePart);
                  if (!$impliedTimezone && $timezoneOffset) {
                      $impliedTimezone = $timezoneOffset;
                  }
              // Is the current part a valid date AND no other date representation has been found?
              } elseif (preg_match('/^\d{4}-\d{2}-\d{2}$/', $part) and empty($datePart)) {
                  $datePart = $part;
              // Is the current part a valid ordinal date AND no other date representation has been found?
              } elseif (preg_match('/^\d{4}-\d{3}$/', $part) and empty($datePart)) {
                  $datePart = normalizeOrdinalDate($part);
              // Is the current part a valid timezone offset AND no other timezone part has been found?
              } elseif (preg_match('/^(Z|[+-]\d{1,2}:?(\d{2})?)$/', $part) and empty($timezonePart)) {
                  $timezonePart = $part;

                  $timezoneOffset = normalizeTimezoneOffset($timezonePart);
                  if (!$impliedTimezone && $timezoneOffset) {
                      $impliedTimezone = $timezoneOffset;
                  }
              // Current part already represented by other VCP parts; do nothing with it
              } else {
                  continue;
              }

              if (!empty($datePart) && !in_array($datePart, $dates, true)) {
                  $dates[] = $datePart;
              }

              // Append the timezone once only. Without the flag, any part processed after both the
              // time and the timezone are known would append the timezone a second time.
              if (!$timezoneAppended && !empty($timezonePart) && !empty($timePart)) {
                  $timePart .= $timezonePart;
                  $timezoneAppended = true;
              }

              $dtValue = '';

              if (empty($datePart) && empty($timePart)) {
                  // Only a timezone has been seen so far; there is nothing to assemble yet
                  // (previously this produced the string " ")
                  continue;
              }

              if (empty($datePart)) {
                  $timePart = convertTimeFormat($timePart);
                  $dtValue = unicodeTrim($timePart);
              } elseif (empty($timePart)) {
                  $dtValue = rtrim($datePart, 'T');
              } else {
                  $timePart = convertTimeFormat($timePart);
                  $dtValue = rtrim($datePart, 'T') . ' ' . unicodeTrim($timePart);
              }
          }
      } else {
          // Not using value-class (phew).
          if ($dt->tagName == 'img' or $dt->tagName == 'area') {
              // Use @alt
              $alt = $dt->getAttribute('alt');
              if (!empty($alt)) {
                  $dtValue = $alt;
              }
          } elseif ($dt->tagName == 'data') {
              // Use @value, otherwise innertext
              $value = $dt->getAttribute('value');
              $dtValue = !empty($value) ? $value : $this->textContent($dt);
          } elseif ($dt->tagName == 'abbr') {
              // Use @title, otherwise innertext
              $title = $dt->getAttribute('title');
              $dtValue = !empty($title) ? $title : $this->textContent($dt);
          } elseif ($dt->tagName == 'del' or $dt->tagName == 'ins' or $dt->tagName == 'time') {
              // Use @datetime if available, otherwise innertext
              $dtAttr = $dt->getAttribute('datetime');
              $dtValue = !empty($dtAttr) ? $dtAttr : $this->textContent($dt);
          } else {
              $dtValue = $this->textContent($dt);
          }

          // Trim first so that surrounding whitespace cannot defeat the anchored checks below
          $dtValue = unicodeTrim($dtValue);

          // if the dtValue is not just YYYY-MM-DD
          if (!preg_match('/^(\d{4}-\d{2}-\d{2})$/', $dtValue)) {
              // no implied timezone set and dtValue ends with a TZ offset: use the un-normalized TZ offset.
              // Both alternatives are anchored to the end of the value; an unanchored "Z" would match
              // any letter z inside free text.
              if (!$impliedTimezone && preg_match('/(Z|[+-]\d{1,2}:?(\d{2})?)$/i', $dtValue, $matches)) {
                  $impliedTimezone = $matches[0];
              }
          }

          // Store the date part so that we can use it when assembling the final timestamp if the next one is missing a date part
          if (preg_match('/(\d{4}-\d{2}-\d{2})/', $dtValue, $matches)) {
              $dates[] = $matches[0];
          }
      }

      /**
       * if $dtValue is only a time and there are recently parsed dates,
       * form the full date-time using the most recently parsed dt- value
       */
      if ((preg_match('/^\d{1,2}:\d{2}(:\d{2})?(Z|[+-]\d{2}:?\d{2}?)?$/', $dtValue) or preg_match('/^\d{1,2}(:\d{2})?(:\d{2})?[ap]\.?m\.?$/i', $dtValue)) && !empty($dates)) {
          $timezoneOffset = normalizeTimezoneOffset($dtValue);
          if (!$impliedTimezone && $timezoneOffset) {
              $impliedTimezone = $timezoneOffset;
          }

          $dtValue = convertTimeFormat($dtValue);
          $dtValue = end($dates) . ' ' . unicodeTrim($dtValue);
      }

      return $dtValue;
  }
  750. /**
  751. * Given the root element of some embedded markup, return a string representing that markup
  752. *
  753. * @param DOMElement $e The element to parse
  754. * @return string $e’s innerHTML
  755. *
   * @todo need to mark this element as e- parsed so it doesn’t get parsed as its parent’s e-* too
  757. */
  public function parseE(\DOMElement $e) {
      // Builds the array('html' => ..., 'value' => ...[, 'lang' => ...]) representation of an e-* element.
      // When parseValueClassTitle() produces a non-null result, that result is returned unchanged instead.
      $valueClassTitle = $this->parseValueClassTitle($e);
      if (null !== $valueClassTitle) {
          return $valueClassTitle;
      }

      // Expand relative URLs within children of this element
      // TODO: as it is this is not relative to only children, make this .// and rerun tests
      $this->resolveChildUrls($e);

      // Serialise all children in a single saveHTML call by parking them in a DocumentFragment;
      // serialising node by node may add whitespace that isn't in the source.
      // See https://stackoverflow.com/q/38317903
      $document = $e->ownerDocument;
      $fragment = $document->createDocumentFragment();
      for ($child = $e->firstChild; $child !== null; $child = $e->firstChild) {
          $fragment->appendChild($child);
      }
      $markup = $document->saveHtml($fragment);

      // Hand the children back to the element before reading its text content
      if ($fragment->hasChildNodes()) {
          $e->appendChild($fragment);
      }

      $parsed = array(
          'html' => unicodeTrim($markup),
          'value' => $this->textContent($e),
      );

      // Language is only looked up when language parsing is switched on
      if ($this->lang && ($html_lang = $this->language($e))) {
          $parsed['lang'] = $html_lang;
      }

      return $parsed;
  }
  private function removeTags(\DOMElement &$e, $tagName) {
      // Removes every descendant of $e named $tagName.
      // The lookup is repeated on each pass and the first match is detached until nothing is left.
      for (;;) {
          $found = $e->getElementsByTagName($tagName);
          if (!$found || !$found->length) {
              return;
          }

          $first = $found->item(0);
          $first->parentNode->removeChild($first);
      }
  }
  795. /**
  796. * Recursively parse microformats
  797. *
  798. * @param DOMElement $e The element to parse
  799. * @param bool $is_backcompat Whether using backcompat parsing or not
  800. * @param bool $has_nested_mf Whether this microformat has a nested microformat
  801. * @return array A representation of the values contained within microformat $e
  802. */
  public function parseH(\DOMElement $e, $is_backcompat = false, $has_nested_mf = false) {
      // Parses one microformat root element into array('type' => ..., 'properties' => ...) plus the
      // optional 'lang', 'shape' and 'coords' keys.
      //
      // $e              the h-* root element
      // $is_backcompat  true when parsing a classic (mf1) root: only elements recorded in
      //                 $this->upgraded are used and no properties are implied
      // $has_nested_mf  truthy when the root contains nested microformats: name and url are not implied
      //
      // Returns null when the element was already parsed or carries no h-* name.

      // If it's already been parsed (e.g. is a child mf), skip
      if ($this->parsed->contains($e)) {
          return null;
      }

      // Get current microformat name
      $mfTypes = mfNamesFromElement($e, 'h-');

      if (!$mfTypes) {
          return null;
      }

      // Initialise vars to store the representation in
      $return = array();
      $dates = array();
      $prefixes = array();
      $impliedTimezone = null;

      if ($e->tagName == 'area') {
          $coords = $e->getAttribute('coords');
          $shape = $e->getAttribute('shape');
      }

      // Handle p-*
      foreach ($this->xpath->query('.//*[contains(concat(" ", @class) ," p-")]', $e) as $p) {
          // element is already parsed
          if ($this->isElementParsed($p, 'p')) {
              continue;
          // backcompat parsing and element was not upgraded; skip it
          } else if ( $is_backcompat && empty($this->upgraded[$p]) ) {
              $this->elementPrefixParsed($p, 'p');
              continue;
          }

          $prefixes[] = 'p-';
          $pValue = $this->parseP($p);

          // Add the value to the array for its p- properties
          foreach (mfNamesFromElement($p, 'p-') as $propName) {
              if (!empty($propName)) {
                  $return[$propName][] = $pValue;
              }
          }

          // Make sure this sub-mf won't get parsed as a top level mf
          $this->elementPrefixParsed($p, 'p');
      }

      // Handle u-*
      foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," u-")]', $e) as $u) {
          // element is already parsed
          if ($this->isElementParsed($u, 'u')) {
              continue;
          // backcompat parsing and element was not upgraded; skip it
          } else if ( $is_backcompat && empty($this->upgraded[$u]) ) {
              $this->elementPrefixParsed($u, 'u');
              continue;
          }

          $prefixes[] = 'u-';
          $uValue = $this->parseU($u);

          // Add the value to the array for its property types
          foreach (mfNamesFromElement($u, 'u-') as $propName) {
              $return[$propName][] = $uValue;
          }

          // Make sure this sub-mf won't get parsed as a top level mf
          $this->elementPrefixParsed($u, 'u');
      }

      // dt-* values are buffered so the implied timezone, which may only be discovered on a later
      // element, can be applied to all of them afterwards
      $temp_dates = array();

      // Handle dt-*
      foreach ($this->xpath->query('.//*[contains(concat(" ", @class), " dt-")]', $e) as $dt) {
          // element is already parsed
          if ($this->isElementParsed($dt, 'dt')) {
              continue;
          // backcompat parsing and element was not upgraded; skip it
          } else if ( $is_backcompat && empty($this->upgraded[$dt]) ) {
              $this->elementPrefixParsed($dt, 'dt');
              continue;
          }

          $prefixes[] = 'dt-';
          $dtValue = $this->parseDT($dt, $dates, $impliedTimezone);

          if ($dtValue) {
              // Add the value to the array for dt- properties
              foreach (mfNamesFromElement($dt, 'dt-') as $propName) {
                  $temp_dates[$propName][] = $dtValue;
              }
          }

          // Make sure this sub-mf won't get parsed as a top level mf
          $this->elementPrefixParsed($dt, 'dt');
      }

      foreach ($temp_dates as $propName => $data) {
          foreach ($data as $dtValue) {
              // Append the implied timezone to values that do not already end in an offset
              if ($impliedTimezone && !preg_match('/(Z|[+-]\d{2}:?(\d{2})?)$/i', $dtValue)) {
                  $dtValue .= $impliedTimezone;
              }

              $return[$propName][] = $dtValue;
          }
      }

      // Handle e-*
      foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," e-")]', $e) as $em) {
          // element is already parsed
          if ($this->isElementParsed($em, 'e')) {
              continue;
          // backcompat parsing and element was not upgraded; skip it
          } else if ( $is_backcompat && empty($this->upgraded[$em]) ) {
              $this->elementPrefixParsed($em, 'e');
              continue;
          }

          $prefixes[] = 'e-';
          $eValue = $this->parseE($em);

          if ($eValue) {
              // Add the value to the array for e- properties
              foreach (mfNamesFromElement($em, 'e-') as $propName) {
                  $return[$propName][] = $eValue;
              }
          }

          // Make sure this sub-mf won't get parsed as a top level mf
          $this->elementPrefixParsed($em, 'e');
      }

      // Do we need to imply a name property?
      // if no explicit "name" property, and no other p-* or e-* properties, and no nested microformats,
      if (!array_key_exists('name', $return) && !in_array('p-', $prefixes, true) && !in_array('e-', $prefixes, true) && !$has_nested_mf && !$is_backcompat) {
          $name = false;

          // img.h-x[alt] or area.h-x[alt]
          if (($e->tagName === 'img' || $e->tagName === 'area') && $e->hasAttribute('alt')) {
              $name = $e->getAttribute('alt');
          // abbr.h-x[title]
          } elseif ($e->tagName === 'abbr' && $e->hasAttribute('title')) {
              $name = $e->getAttribute('title');
          } else {
              $xpaths = array(
                  // .h-x>img:only-child[alt]:not([alt=""]):not[.h-*]
                  './img[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and @alt and string-length(@alt) != 0]',
                  // .h-x>area:only-child[alt]:not([alt=""]):not[.h-*]
                  './area[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and @alt and string-length(@alt) != 0]',
                  // .h-x>abbr:only-child[title]:not([title=""]):not[.h-*]
                  './abbr[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and @title and string-length(@title) != 0]',
                  // .h-x>:only-child:not[.h-*]>img:only-child[alt]:not([alt=""]):not[.h-*]
                  './*[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and count(*) = 1]/img[not(contains(concat(" ", @class), " h-")) and @alt and string-length(@alt) != 0]',
                  // .h-x>:only-child:not[.h-*]>area:only-child[alt]:not([alt=""]):not[.h-*]
                  './*[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and count(*) = 1]/area[not(contains(concat(" ", @class), " h-")) and @alt and string-length(@alt) != 0]',
                  // .h-x>:only-child:not[.h-*]>abbr:only-child[title]:not([title=""]):not[.h-*]
                  './*[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and count(*) = 1]/abbr[not(contains(concat(" ", @class), " h-")) and @title and string-length(@title) != 0]'
              );

              // The first pattern with exactly one match supplies the name
              foreach ($xpaths as $xpath) {
                  $nameElement = $this->xpath->query($xpath, $e);

                  if ($nameElement !== false && $nameElement->length === 1) {
                      $nameElement = $nameElement->item(0);

                      if ($nameElement->tagName === 'img' || $nameElement->tagName === 'area') {
                          $name = $nameElement->getAttribute('alt');
                      } else {
                          $name = $nameElement->getAttribute('title');
                      }

                      break;
                  }
              }
          }

          // Nothing matched: fall back to the element's text content
          if ($name === false) {
              $name = $this->textContent($e, true);
          }

          $return['name'][] = unicodeTrim($name);
      }

      // Check for u-photo
      if (!array_key_exists('photo', $return) && !$is_backcompat) {
          $photo = $this->parseImpliedPhoto($e);

          if ($photo !== false) {
              $return['photo'][] = $photo;
          }
      }

      // Do we need to imply a url property?
      // if no explicit "url" property, and no other explicit u-* properties, and no nested microformats
      if (!array_key_exists('url', $return) && !in_array('u-', $prefixes, true) && !$has_nested_mf && !$is_backcompat) {
          // a.h-x[href] or area.h-x[href]
          if (($e->tagName === 'a' || $e->tagName === 'area') && $e->hasAttribute('href')) {
              $return['url'][] = $this->resolveUrl($e->getAttribute('href'));
          } else {
              $xpaths = array(
                  // .h-x>a[href]:only-of-type:not[.h-*]
                  './a[not(contains(concat(" ", @class), " h-")) and count(../a) = 1 and @href]',
                  // .h-x>area[href]:only-of-type:not[.h-*]
                  './area[not(contains(concat(" ", @class), " h-")) and count(../area) = 1 and @href]',
                  // .h-x>:only-child:not[.h-*]>a[href]:only-of-type:not[.h-*]
                  './*[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and count(a) = 1]/a[not(contains(concat(" ", @class), " h-")) and @href]',
                  // .h-x>:only-child:not[.h-*]>area[href]:only-of-type:not[.h-*]
                  './*[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and count(area) = 1]/area[not(contains(concat(" ", @class), " h-")) and @href]'
              );

              foreach ($xpaths as $xpath) {
                  $url = $this->xpath->query($xpath, $e);

                  if ($url !== false && $url->length === 1) {
                      $return['url'][] = $this->resolveUrl($url->item(0)->getAttribute('href'));
                      break;
                  }
              }
          }
      }

      // Make sure things are unique and in alphabetical order
      $mfTypes = array_unique($mfTypes);
      sort($mfTypes);

      // Properties should be an object when JSON serialised
      if (empty($return) and $this->jsonMode) {
          $return = new stdClass();
      }

      // Phew. Return the final result.
      $parsed = array(
          'type' => $mfTypes,
          'properties' => $return
      );

      if ($this->lang) {
          // Language
          if ( $html_lang = $this->language($e) ) {
              $parsed['lang'] = $html_lang;
          }
      }

      // $shape and $coords only exist when the root is an <area>
      if (!empty($shape)) {
          $parsed['shape'] = $shape;
      }

      if (!empty($coords)) {
          $parsed['coords'] = $coords;
      }

      return $parsed;
  }
  1020. /**
  1021. * @see http://microformats.org/wiki/microformats2-parsing#parsing_for_implied_properties
  1022. */
  public function parseImpliedPhoto(\DOMElement $e) {
      // Determines the implied photo of a microformat root.
      // Returns the resolved URL, or false when no rule applies.

      // img.h-x[src] - the attribute must be present, mirroring the object[data] rule below;
      // otherwise an empty string would be resolved and reported as the photo
      if ($e->tagName == 'img' && $e->hasAttribute('src')) {
          return $this->resolveUrl($e->getAttribute('src'));
      }

      // object.h-x[data]
      if ($e->tagName == 'object' && $e->hasAttribute('data')) {
          return $this->resolveUrl($e->getAttribute('data'));
      }

      $xpaths = array(
          // .h-x>img[src]:only-of-type:not[.h-*]
          './img[not(contains(concat(" ", @class), " h-")) and count(../img) = 1 and @src]',
          // .h-x>object[data]:only-of-type:not[.h-*]
          './object[not(contains(concat(" ", @class), " h-")) and count(../object) = 1 and @data]',
          // .h-x>:only-child:not[.h-*]>img[src]:only-of-type:not[.h-*]
          './*[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and count(img) = 1]/img[not(contains(concat(" ", @class), " h-")) and @src]',
          // .h-x>:only-child:not[.h-*]>object[data]:only-of-type:not[.h-*]
          './*[not(contains(concat(" ", @class), " h-")) and count(../*) = 1 and count(object) = 1]/object[not(contains(concat(" ", @class), " h-")) and @data]',
      );

      // The first pattern with exactly one match wins
      foreach ($xpaths as $path) {
          $els = $this->xpath->query($path, $e);

          if ($els !== false && $els->length === 1) {
              $el = $els->item(0);

              if ($el->tagName == 'img') {
                  return $this->resolveUrl($el->getAttribute('src'));
              } else if ($el->tagName == 'object') {
                  return $this->resolveUrl($el->getAttribute('data'));
              }
          }
      }

      // no implied photo
      return false;
  }
  1056. /**
  1057. * Parse rels and alternates
  1058. *
  1059. * Returns [$rels, $rel_urls, $alternates].
  1060. * For $rels and $rel_urls, if they are empty and $this->jsonMode = true, they will be returned as stdClass,
  1061. * optimizing for JSON serialization. Otherwise they will be returned as an empty array.
  1062. * Note that $alternates is deprecated in the microformats spec in favor of $rel_urls. $alternates only appears
  1063. * in parsed results if $this->enableAlternates = true.
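   * @return array Three-element array: [$rels, $rel_urls, $alternates]; $rels and $rel_urls may each be a stdClass as described above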
  1065. */
  public function parseRelsAndAlternates() {
      // Collects rel information from every a, link and area element that has both rel and href.
      // Returns array($rels, $rel_urls, $alternates).
      $rels = array();
      $rel_urls = array();
      $alternates = array();

      // Attributes copied as-is from the link element when it has them (order matters for output)
      $copiedAttributes = array('media', 'hreflang', 'title', 'type');

      $links = $this->xpath->query('//a[@rel and @href] | //link[@rel and @href] | //area[@rel and @href]');

      foreach ($links as $link) {
          // Split the rel attribute on whitespace and drop blanks and duplicates
          $relValues = array_unique(array_filter(preg_split('/[\t\n\f\r ]/', $link->getAttribute('rel'))));
          if (count($relValues) === 0) {
              continue;
          }

          $url = $this->resolveUrl($link->getAttribute('href'));

          $attributes = array();
          foreach ($copiedAttributes as $attributeName) {
              if ($link->hasAttribute($attributeName)) {
                  $attributes[$attributeName] = $link->getAttribute($attributeName);
              }
          }
          if ($link->textContent !== '') {
              $attributes['text'] = $link->textContent;
          }

          // 'alternates' is only built on request, and only for links carrying rel=alternate
          if ($this->enableAlternates && in_array('alternate', $relValues)) {
              $otherRels = array_diff($relValues, array('alternate'));
              $alternates[] = array_merge(
                  $attributes,
                  array(
                      'url' => $url,
                      'rel' => implode(' ', $otherRels)
                  )
              );
          }

          // rel value => list of distinct URLs
          foreach ($relValues as $relValue) {
              if (!isset($rels[$relValue])) {
                  $rels[$relValue] = array();
              }
              if (!in_array($url, $rels[$relValue])) {
                  $rels[$relValue][] = $url;
              }
          }

          // URL => attributes + rels; attributes recorded by an earlier link for this URL take precedence
          $entry = array_key_exists($url, $rel_urls) ? $rel_urls[$url] : array('rels' => array());
          $entry = array_merge($attributes, $entry);
          $entry['rels'] = array_merge($entry['rels'], $relValues);
          $rel_urls[$url] = $entry;
      }

      // Alphabetically sort the rels arrays after removing duplicates
      foreach (array_keys($rel_urls) as $url) {
          $distinct = array_unique($rel_urls[$url]['rels']);
          sort($distinct);
          $rel_urls[$url]['rels'] = $distinct;
      }

      // Empty collections become objects in JSON mode
      if (empty($rels) and $this->jsonMode) {
          $rels = new stdClass();
      }
      if (empty($rel_urls) and $this->jsonMode) {
          $rel_urls = new stdClass();
      }

      return array($rels, $rel_urls, $alternates);
  }
  1141. /**
  1142. * Find rel=tag elements that don't have class=category and have an href.
  1143. * For each element, get the last non-empty URL segment. Append a <data>
  1144. * element with that value as the category. Uses the mf1 class 'category'
  1145. * which will then be upgraded to p-category during backcompat.
  1146. * @param DOMElement $el
  1147. */
  public function upgradeRelTagToCategory(DOMElement $el) {
      // For every rel=tag link inside $el that has an href and no "category" class, appends a
      // <data class="category" value="..."> child to $el whose value is the last non-empty path
      // segment of the link's URL. Links without such a segment are skipped.
      $rel_tag = $this->xpath->query('.//a[contains(concat(" ",normalize-space(@rel)," ")," tag ") and not(contains(concat(" ", normalize-space(@class), " "), " category ")) and @href]', $el);

      foreach ($rel_tag as $tempEl) {
          // parse_url() gives null when the URL has no path component and false when it is
          // seriously malformed; neither may be passed on to trim()
          $path = parse_url($tempEl->getAttribute('href'), PHP_URL_PATH);
          if (!is_string($path)) {
              continue;
          }

          // Stripping surrounding slashes first makes the last segment the last non-empty one
          $segments = explode('/', trim($path, ' /'));
          $value = array_pop($segments);

          // e.g. href="http://example.com/" - there is no segment to use as a category
          if ($value === '') {
              continue;
          }

          # build the <data> element
          $dataEl = $tempEl->ownerDocument->createElement('data');
          $dataEl->setAttribute('class', 'category');
          $dataEl->setAttribute('value', $value);

          # append as child of input element. this should ensure added element does get parsed inside e-*
          $el->appendChild($dataEl);
      }
  }
  1164. /**
  1165. * Kicks off the parsing routine
  1166. * @param bool $convertClassic whether to do backcompat parsing on microformats1. Defaults to true.
  1167. * @param DOMElement $context optionally specify an element from which to parse microformats
  1168. * @return array An array containing all the microformats found in the current document
  1169. */
  public function parse($convertClassic = true, DOMElement $context = null) {
      // Entry point: parses microformats (below $context when given) plus the rels,
      // and assembles the top-level result array.
      $this->convertClassic = $convertClassic;

      $microformats = $this->parse_recursive($context);

      // Parse rels; the result is array($rels, $rel_urls, $alternates)
      $relData = $this->parseRelsAndAlternates();
      $alternates = $relData[2];

      $result = array(
          'items' => array_values(array_filter($microformats)),
          'rels' => $relData[0],
          'rel-urls' => $relData[1],
      );

      // 'alternates' is reported only when enabled and non-empty
      if ($this->enableAlternates && count($alternates)) {
          $result['alternates'] = $alternates;
      }

      return $result;
  }
  1185. /**
  1186. * Parse microformats recursively
  1187. * Keeps track of whether inside a backcompat root or not
  1188. * @param DOMElement $context: node to start with
  1189. * @param int $depth: recursion depth
  1190. * @return array
  1191. */
  public function parse_recursive(DOMElement $context = null, $depth = 0) {
      // Walks the microformat roots found below $context (or in the whole document), parsing each
      // root after its own nested roots. At depth 0 results are appended as list items; deeper
      // results are filed under 'properties' or 'children' of the enclosing microformat.
      $collected = array();

      foreach ($this->getRootMF($context) as $root) {
          $is_backcompat = !$this->hasRootMf2($root);

          if ($this->convertClassic && $is_backcompat) {
              $this->backcompat($root);
          }

          // Nested microformats are parsed first; the (possibly empty) result doubles as the
          // "has nested microformats" flag handed to parseH()
          $nested = $this->parse_recursive($root, $depth + 1);
          $parsed = $this->parseH($root, $is_backcompat, $nested);

          // TODO: Determine if clearing this is required?
          foreach (array('h', 'p', 'u', 'dt', 'e') as $prefix) {
              $this->elementPrefixParsed($root, $prefix);
          }

          // parseH produced nothing for this root
          if (!$parsed) {
              continue;
          }

          // merge recursive results into current results
          if ($nested) {
              $parsed = array_merge_recursive($parsed, $nested);
          }

          // top-level mf
          if ($depth <= 0) {
              $collected[] = $parsed;
              continue;
          }

          // currently a nested mf; check if node is an mf property of parent
          $propertyPrefixes = nestedMfPropertyNamesFromElement($root);

          // not a property of the parent; set up in 'children'
          if (empty($propertyPrefixes)) {
              $collected['children'][] = $parsed;
              continue;
          }

          // properties found; set up parsed result in 'properties'
          foreach ($propertyPrefixes as $property => $prefixes) {
              // Note: handling microformat nesting under multiple conflicting prefixes is not currently specified by the mf2 parsing spec.
              $entry = $parsed;

              if (in_array('p-', $prefixes)) {
                  $hasName = is_array($entry['properties']) && !empty($entry['properties']['name'][0]);
                  $entry['value'] = $hasName ? $entry['properties']['name'][0] : $this->parseP($root);
              } elseif (in_array('e-', $prefixes)) {
                  $embedded = $this->parseE($root);
                  $entry['html'] = $embedded['html'];
                  $entry['value'] = $embedded['value'];
              } elseif (in_array('u-', $prefixes)) {
                  $hasUrl = is_array($parsed['properties']) && !empty($parsed['properties']['url']);
                  $entry['value'] = $hasUrl ? reset($parsed['properties']['url']) : $this->parseU($root);
              } elseif (in_array('dt-', $prefixes)) {
                  $dtValue = $this->parseDT($root);
                  $entry['value'] = $dtValue ? $dtValue : '';
              }

              $collected['properties'][$property][] = $entry;
          }
      }

      return $collected;
  }
  1251. /**
  1252. * Parse From ID
  1253. *
  1254. * Given an ID, parse all microformats which are children of the element with
  1255. * that ID.
  1256. *
  1257. * Note that rel values are still document-wide.
  1258. *
  1259. * If an element with the ID is not found, an empty skeleton mf2 array structure
  1260. * will be returned.
  1261. *
  1262. * @param string $id
   * @param bool $convertClassic whether to do backcompat parsing on microformats1. Defaults to true.
  1264. * @return array
  1265. */
  public function parseFromId($id, $convertClassic=true) {
      // Parses the microformats found below the element whose id attribute equals $id.
      // Returns an empty skeleton result when there is no such element.
      $id = (string) $id;

      // XPath 1.0 string literals have no escape syntax, so choose a quoting style the id
      // cannot break out of; an id containing both quote kinds is spliced together with concat()
      if (strpos($id, "'") === false) {
          $idLiteral = "'" . $id . "'";
      } elseif (strpos($id, '"') === false) {
          $idLiteral = '"' . $id . '"';
      } else {
          $idLiteral = "concat('" . str_replace("'", "', \"'\", '", $id) . "')";
      }

      $matches = $this->xpath->query('//*[@id=' . $idLiteral . ']');

      // query() returns a DOMNodeList object, for which empty() is never true, so the length has to
      // be checked explicitly; otherwise a missing id ends up parsing the whole document
      if ($matches === false || $matches->length === 0) {
          return array('items' => array(), 'rels' => array(), 'rel-urls' => array(), 'alternates' => array());
      }

      return $this->parse($convertClassic, $matches->item(0));
  }
  1272. /**
  1273. * Get the root microformat elements
  1274. * @param DOMElement $context
  1275. * @return DOMNodeList
  1276. */
  public function getRootMF(DOMElement $context = null) {
      // Builds one XPath query matching any element with an mf2 root class (h-*) or one of the
      // classic root class names listed as keys of $this->classicRootMap, then runs it against
      // the whole document or, when $context is given, relative to that element.

      // mf2 root class name test comes first
      $conditions = array('contains(concat(" ",normalize-space(@class)), " h-")');

      // one test per mf1 root class name
      foreach (array_keys($this->classicRootMap) as $classicRoot) {
          $conditions[] = '( contains(concat(" ",normalize-space(@class), " "), " ' . $classicRoot . ' ") )';
      }

      // final xpath with OR
      $query = '//*[' . implode(' or ', $conditions) . ']';

      if ($context === null) {
          return $this->xpath->query($query);
      }

      // the leading "." anchors the search at $context
      return $this->xpath->query('.' . $query, $context);
  }
  1293. /**
  1294. * Apply the backcompat algorithm to upgrade mf1 classes to mf2.
  1295. * This method is called recursively.
  1296. * @param DOMElement $el
  1297. * @param string $context
  1298. * @param bool $isParentMf2
  1299. * @see http://microformats.org/wiki/microformats2-parsing#algorithm
  1300. */
  public function backcompat(DOMElement $el, $context = '', $isParentMf2 = false) {
      // Upgrades classic (mf1) class names on $el and its descendants to mf2 class names.
      //
      // $el          element to upgrade
      // $context     when non-empty, the single mf1 root class to treat $el as; otherwise the mf1
      //              root classes are read from $el's class attribute
      // $isParentMf2 when true, property elements are recorded as upgraded but not rewritten
      if ( $context ) {
          $mf1Classes = array($context);
      } else {
          // Split on any HTML whitespace (tab, LF, FF, CR, space), as the rel parsing does
          $classes = preg_split('/[\t\n\f\r ]+/', $el->getAttribute('class'), -1, PREG_SPLIT_NO_EMPTY);
          $mf1Classes = array_intersect($classes, array_keys($this->classicRootMap));
      }

      $elHasMf2 = $this->hasRootMf2($el);

      foreach ($mf1Classes as $classname) {
          // special handling for specific properties
          switch ( $classname ) {
              case 'hentry':
                  $this->upgradeRelTagToCategory($el);

                  $rel_bookmark = $this->xpath->query('.//a[contains(concat(" ",normalize-space(@rel)," ")," bookmark ") and @href]', $el);

                  foreach ( $rel_bookmark as $tempEl ) {
                      $this->addMfClasses($tempEl, 'u-url');
                      $this->addUpgraded($tempEl, array('bookmark'));
                  }
                  break;

              case 'hreview':
                  $item_and_vcard = $this->xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " item ") and contains(concat(" ", normalize-space(@class), " "), " vcard ")]', $el);

                  foreach ( $item_and_vcard as $tempEl ) {
                      if ( !$this->hasRootMf2($tempEl) ) {
                          $this->backcompat($tempEl, 'vcard');
                          $this->addMfClasses($tempEl, 'p-item h-card');
                          $this->addUpgraded($tempEl, array('item', 'vcard'));
                      }
                  }

                  $item_and_vevent = $this->xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " item ") and contains(concat(" ", normalize-space(@class), " "), " vevent ")]', $el);

                  foreach ( $item_and_vevent as $tempEl ) {
                      if ( !$this->hasRootMf2($tempEl) ) {
                          $this->addMfClasses($tempEl, 'p-item h-event');
                          $this->backcompat($tempEl, 'vevent');
                          $this->addUpgraded($tempEl, array('item', 'vevent'));
                      }
                  }

                  $item_and_hproduct = $this->xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " item ") and contains(concat(" ", normalize-space(@class), " "), " hproduct ")]', $el);

                  foreach ( $item_and_hproduct as $tempEl ) {
                      if ( !$this->hasRootMf2($tempEl) ) {
                          $this->addMfClasses($tempEl, 'p-item h-product');
                          // Upgrade the item with the hproduct rules; this used to pass 'vevent',
                          // a copy-paste slip from the branch above
                          $this->backcompat($tempEl, 'hproduct');
                          $this->addUpgraded($tempEl, array('item', 'hproduct'));
                      }
                  }

                  $this->upgradeRelTagToCategory($el);
                  break;

              case 'vevent':
                  $location = $this->xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " location ")]', $el);

                  foreach ( $location as $tempEl ) {
                      if ( !$this->hasRootMf2($tempEl) ) {
                          $this->addMfClasses($tempEl, 'h-card');
                          $this->backcompat($tempEl, 'vcard');
                      }
                  }
                  break;
          }

          // root class has mf1 properties to be upgraded
          if ( isset($this->classicPropertyMap[$classname]) ) {
              // loop through each property of the mf1 root
              foreach ( $this->classicPropertyMap[$classname] as $property => $data ) {
                  $propertyElements = $this->xpath->query('.//*[contains(concat(" ", normalize-space(@class), " "), " ' . $property . ' ")]', $el);

                  // loop through each element with the property
                  foreach ( $propertyElements as $propertyEl ) {
                      $hasRootMf2 = $this->hasRootMf2($propertyEl);

                      // if the element has not been upgraded and we're not inside an mf2 root, recurse
                      if ( !$this->isElementUpgraded($propertyEl, $property) && !$isParentMf2 ) {
                          $temp_context = ( isset($data['context']) ) ? $data['context'] : null;
                          $this->backcompat($propertyEl, $temp_context, $hasRootMf2);
                          $this->addMfClasses($propertyEl, $data['replace']);
                      }

                      $this->addUpgraded($propertyEl, $property);
                  }
              }
          }

          // Only a root read from the element's own class list (no explicit context) receives the
          // mf2 root class, and only when the element has no mf2 root class already
          if ( empty($context) && isset($this->classicRootMap[$classname]) && !$elHasMf2 ) {
              $this->addMfClasses($el, $this->classicRootMap[$classname]);
          }
      }
  }
  /**
   * Record that legacy properties on an element have been upgraded during backcompat.
   *
   * A single property name is wrapped in an array first. An element that is
   * not yet present in $this->upgraded is attached together with the property
   * list; for an element that is already present, the new names are appended
   * to the list stored for it (duplicates are not removed).
   *
   * @param DOMElement $el the element whose properties were upgraded
   * @param string|array $property one property name, or a list of names
   */
  public function addUpgraded(DOMElement $el, $property) {
      $properties = is_array($property) ? $property : array($property);

      if ( $this->upgraded->contains($el) ) {
          // already tracked: extend the list recorded for this element
          $alreadyRecorded = $this->upgraded[$el];
          $this->upgraded[$el] = array_merge($alreadyRecorded, $properties);
          return;
      }

      // first time this element is seen: start tracking it
      $this->upgraded->attach($el, $properties);
  }
  /**
   * Append class names to an element's class attribute.
   *
   * Names that the element already carries are skipped, so no duplicate is
   * written. Tabs and newlines in the existing attribute are treated as
   * separators when working out which names are already present. The
   * attribute is left untouched when there is nothing to add.
   *
   * @param DOMElement $el
   * @param string $classes space-separated class names to add
   */
  public function addMfClasses(DOMElement $el, $classes) {
      $currentAttribute = $el->getAttribute('class');

      // tokenise the current attribute value
      $present = array_filter(
          explode(' ', str_replace(array("\t", "\n"), ' ', $currentAttribute))
      );

      $missing = array_diff(explode(' ', $classes), $present);
      if ( !$missing ) {
          return;
      }

      $el->setAttribute('class', $currentAttribute . ' ' . implode(' ', $missing));
  }
  /**
   * Report whether an element carries a class name beginning with "h-"
   * (an mf2 root class), typically to decide whether backcompat applies.
   *
   * Tabs and newlines in the class attribute are treated as separators.
   *
   * @param DOMElement $el
   * @return bool true if at least one class name starts with "h-"
   */
  public function hasRootMf2(\DOMElement $el) {
      $normalised = str_replace(array("\t", "\n"), ' ', $el->getAttribute('class'));

      foreach ( explode(' ', $normalised) as $token ) {
          // empty tokens (from repeated spaces) can never match the prefix
          if ( strncmp($token, 'h-', 2) === 0 ) {
              return true;
          }
      }

      return false;
  }
  /**
   * Convert Legacy Classnames
   *
   * Adds microformats2 classnames into a document containing only legacy
   * semantic classnames.
   *
   * First every element carrying a legacy root class from $classicRootMap
   * receives the matching mf2 root class. Then, for every legacy container
   * class in $classicPropertyMap, each descendant carrying one of its legacy
   * property classes receives the mapped 'replace' classes. Classes are added
   * through addMfClasses(), so names already present are never duplicated.
   *
   * @return Parser $this
   */
  public function convertLegacy() {
      $xp = new DOMXPath($this->doc);

      // Builds an XPath predicate matching a whole class token. normalize-space()
      // makes tab/newline separated class lists match as well, in line with the
      // queries used by backcompat().
      $hasClass = function ($classname) {
          return 'contains(concat(" ", normalize-space(@class), " "), " ' . $classname . ' ")';
      };

      // replace all roots
      foreach ($this->classicRootMap as $old => $new) {
          foreach ($xp->query('//*[' . $hasClass($old) . ']') as $el) {
              $this->addMfClasses($el, $new);
          }
      }

      // Upgrade the properties found inside each legacy container. The container
      // is matched by its legacy class only; no lookup in $classicRootMap is made
      // because some containers ('item', 'geo') have no entry there.
      foreach ($this->classicPropertyMap as $oldRoot => $properties) {
          foreach ($properties as $old => $data) {
              $expression = '//*[' . $hasClass($oldRoot) . ']//*[' . $hasClass($old) . ']';
              foreach ($xp->query($expression) as $el) {
                  $this->addMfClasses($el, $data['replace']);
              }
          }
      }

      return $this;
  }
  /**
   * XPath Query
   *
   * Evaluates an XPath expression against the current document by delegating
   * to the parser's DOMXPath instance; arguments and result are passed through
   * unchanged, exactly as with DOMXPath::query.
   *
   * @param string $expression the XPath expression to evaluate
   * @param DOMNode $context optional node the expression is evaluated relative to
   * @return DOMNodeList
   */
  public function query($expression, $context = null) {
      $matches = $this->xpath->query($expression, $context);

      return $matches;
  }
  /**
   * Classic Root Classname map
   *
   * Maps each legacy (mf1) root class name to the mf2 root class that is
   * added to elements carrying it.
   *
   * @var array
   */
  public $classicRootMap = [
      'vcard'    => 'h-card',
      'hfeed'    => 'h-feed',
      'hentry'   => 'h-entry',
      'hrecipe'  => 'h-recipe',
      'hresume'  => 'h-resume',
      'vevent'   => 'h-event',
      'hreview'  => 'h-review',
      'hproduct' => 'h-product',
      'adr'      => 'h-adr',
  ];
  /**
   * Mapping of mf1 properties to mf2 and the context they're parsed with
   *
   * Outer keys are legacy container class names; inner keys are the legacy
   * property class names looked up inside that container. For each property:
   *  - 'replace' is the space-separated list of mf2 classes added to the element
   *  - 'context' (optional) is the legacy context handed to backcompat() when
   *    the property element is itself processed recursively
   *
   * @var array
   */
  public $classicPropertyMap = [
      'vcard' => [
          'fn'                => ['replace' => 'p-name'],
          'honorific-prefix'  => ['replace' => 'p-honorific-prefix'],
          'given-name'        => ['replace' => 'p-given-name'],
          'additional-name'   => ['replace' => 'p-additional-name'],
          'family-name'       => ['replace' => 'p-family-name'],
          'honorific-suffix'  => ['replace' => 'p-honorific-suffix'],
          'nickname'          => ['replace' => 'p-nickname'],
          'email'             => ['replace' => 'u-email'],
          'logo'              => ['replace' => 'u-logo'],
          'photo'             => ['replace' => 'u-photo'],
          'url'               => ['replace' => 'u-url'],
          'uid'               => ['replace' => 'u-uid'],
          'category'          => ['replace' => 'p-category'],
          'adr'               => ['replace' => 'p-adr'],
          'extended-address'  => ['replace' => 'p-extended-address'],
          'street-address'    => ['replace' => 'p-street-address'],
          'locality'          => ['replace' => 'p-locality'],
          'region'            => ['replace' => 'p-region'],
          'postal-code'       => ['replace' => 'p-postal-code'],
          'country-name'      => ['replace' => 'p-country-name'],
          'label'             => ['replace' => 'p-label'],
          'geo'               => ['replace' => 'p-geo h-geo'],
          'latitude'          => ['replace' => 'p-latitude'],
          'longitude'         => ['replace' => 'p-longitude'],
          'tel'               => ['replace' => 'p-tel'],
          'note'              => ['replace' => 'p-note'],
          'bday'              => ['replace' => 'dt-bday'],
          'key'               => ['replace' => 'u-key'],
          'org'               => ['replace' => 'p-org'],
          'organization-name' => ['replace' => 'p-organization-name'],
          'organization-unit' => ['replace' => 'p-organization-unit'],
          'title'             => ['replace' => 'p-job-title'],
          'role'              => ['replace' => 'p-role'],
          'tz'                => ['replace' => 'p-tz'],
          'rev'               => ['replace' => 'dt-rev'],
      ],
      'hfeed' => [
          # nothing currently
      ],
      'hentry' => [
          'entry-title'   => ['replace' => 'p-name'],
          'entry-summary' => ['replace' => 'p-summary'],
          'entry-content' => ['replace' => 'e-content'],
          'published'     => ['replace' => 'dt-published'],
          'updated'       => ['replace' => 'dt-updated'],
          'author'        => ['replace' => 'p-author h-card', 'context' => 'vcard'],
          'category'      => ['replace' => 'p-category'],
      ],
      'hrecipe' => [
          'fn'           => ['replace' => 'p-name'],
          /**
           * TODO: hRecipe 'value' and 'type' child mf not parsing correctly currently.
           * Per http://microformats.org/wiki/hRecipe#Property_details, they're experimental.
           */
          'ingredient'   => ['replace' => 'p-ingredient'],
          'yield'        => ['replace' => 'p-yield'],
          'instructions' => ['replace' => 'e-instructions'],
          'duration'     => ['replace' => 'dt-duration'],
          'photo'        => ['replace' => 'u-photo'],
          'summary'      => ['replace' => 'p-summary'],
          'author'       => ['replace' => 'p-author h-card', 'context' => 'vcard'],
          'nutrition'    => ['replace' => 'p-nutrition'],
          'category'     => ['replace' => 'p-category'],
      ],
      'hresume' => [
          'summary'     => ['replace' => 'p-summary'],
          'contact'     => ['replace' => 'p-contact h-card', 'context' => 'vcard'],
          'education'   => ['replace' => 'p-education h-event', 'context' => 'vevent'],
          'experience'  => ['replace' => 'p-experience h-event', 'context' => 'vevent'],
          'skill'       => ['replace' => 'p-skill'],
          'affiliation' => ['replace' => 'p-affiliation h-card', 'context' => 'vcard'],
      ],
      'vevent' => [
          'summary'     => ['replace' => 'p-name'],
          'dtstart'     => ['replace' => 'dt-start'],
          'dtend'       => ['replace' => 'dt-end'],
          'duration'    => ['replace' => 'dt-duration'],
          'description' => ['replace' => 'p-description'],
          'url'         => ['replace' => 'u-url'],
          'category'    => ['replace' => 'p-category'],
          'location'    => ['replace' => 'h-card', 'context' => 'vcard'],
          'geo'         => ['replace' => 'p-location h-geo'],
      ],
      'hreview' => [
          'summary'    => ['replace' => 'p-name'],
          # fn: see item.fn below
          # photo: see item.photo below
          # url: see item.url below
          'item'       => ['replace' => 'p-item h-item', 'context' => 'item'],
          'reviewer'   => ['replace' => 'p-author h-card', 'context' => 'vcard'],
          'dtreviewed' => ['replace' => 'dt-published'],
          'rating'     => ['replace' => 'p-rating'],
          'best'       => ['replace' => 'p-best'],
          'worst'      => ['replace' => 'p-worst'],
          'description' => ['replace' => 'e-content'],
          'category'   => ['replace' => 'p-category'],
      ],
      'hproduct' => [
          'fn'          => ['replace' => 'p-name'],
          'photo'       => ['replace' => 'u-photo'],
          'brand'       => ['replace' => 'p-brand'],
          'category'    => ['replace' => 'p-category'],
          'description' => ['replace' => 'p-description'],
          'identifier'  => ['replace' => 'u-identifier'],
          'url'         => ['replace' => 'u-url'],
          'review'      => ['replace' => 'p-review h-review'],
          'price'       => ['replace' => 'p-price'],
      ],
      'item' => [
          'fn'    => ['replace' => 'p-name'],
          'url'   => ['replace' => 'u-url'],
          'photo' => ['replace' => 'u-photo'],
      ],
      'adr' => [
          'post-office-box'  => ['replace' => 'p-post-office-box'],
          'extended-address' => ['replace' => 'p-extended-address'],
          'street-address'   => ['replace' => 'p-street-address'],
          'locality'         => ['replace' => 'p-locality'],
          'region'           => ['replace' => 'p-region'],
          'postal-code'      => ['replace' => 'p-postal-code'],
          'country-name'     => ['replace' => 'p-country-name'],
      ],
      'geo' => [
          'latitude'  => ['replace' => 'p-latitude'],
          'longitude' => ['replace' => 'p-longitude'],
      ],
  ];
  1829. }
  /**
   * Split a URI into the five RFC 3986 components.
   *
   * The authority is rebuilt from the user, password, host and port reported
   * by parse_url() as "user:pass@host:port", and is only set when a host is
   * present. Components that are absent from the URI stay null.
   *
   * @param string $uri
   * @return array with keys 'scheme', 'authority', 'path', 'query' and
   *               'fragment'; all null when the URI cannot be parsed
   */
  function parseUriToComponents($uri) {
      $result = array(
          'scheme' => null,
          'authority' => null,
          'path' => null,
          'query' => null,
          'fragment' => null
      );

      $u = @parse_url($uri);

      # parse_url() returns false for seriously malformed URLs; report every
      # component as undefined instead of probing a non-array for keys
      if(!is_array($u))
          return $result;

      if(array_key_exists('scheme', $u))
          $result['scheme'] = $u['scheme'];

      if(array_key_exists('host', $u)) {
          $hasUser = array_key_exists('user', $u);
          $hasPass = array_key_exists('pass', $u);

          $authority = '';
          if($hasUser)
              $authority .= $u['user'];
          if($hasPass)
              $authority .= ':' . $u['pass'];
          if($hasUser || $hasPass)
              $authority .= '@';
          $authority .= $u['host'];
          if(array_key_exists('port', $u))
              $authority .= ':' . $u['port'];

          $result['authority'] = $authority;
      }

      # path, query and fragment are copied over unchanged
      foreach(array('path', 'query', 'fragment') as $component) {
          if(array_key_exists($component, $u))
              $result[$component] = $u[$component];
      }

      return $result;
  }
  /**
   * Resolve a URI reference against a base URI (RFC 3986 section 5.2).
   *
   * Both URIs are split with parseUriToComponents(), the reference is
   * transformed following section 5.2.2, and the target is recomposed
   * following section 5.3. A component counts as defined when it is neither
   * null nor the empty string, so the literal "0" (e.g. a "?0" query or a
   * "#0" fragment) is preserved.
   *
   * @param string $baseURI
   * @param string $referenceURI
   * @return string the resolved URI
   */
  function resolveUrl($baseURI, $referenceURI) {
      $target = array(
          'scheme' => null,
          'authority' => null,
          'path' => null,
          'query' => null,
          'fragment' => null
      );

      # A plain truthiness test would wrongly treat the string "0" as undefined
      $isDefined = function ($component) {
          return (string) $component !== '';
      };

      # 5.2.1 Pre-parse the Base URI
      # The base URI (Base) is established according to the procedure of
      # Section 5.1 and parsed into the five main components described in
      # Section 3
      $base = parseUriToComponents($baseURI);

      # If base path is blank (http://example.com) then set it to /
      # (I can't tell if this is actually in the RFC or not, but seems like it makes sense)
      if($base['path'] == null)
          $base['path'] = '/';

      # 5.2.2. Transform References
      # The URI reference is parsed into the five URI components
      # (R.scheme, R.authority, R.path, R.query, R.fragment) = parse(R);
      $reference = parseUriToComponents($referenceURI);

      # A non-strict parser may ignore a scheme in the reference
      # if it is identical to the base URI's scheme.
      # TODO

      if($isDefined($reference['scheme'])) {
          # absolute reference: only dot segments need to be cleaned up
          $target['scheme'] = $reference['scheme'];
          $target['authority'] = $reference['authority'];
          $target['path'] = removeDotSegments($reference['path']);
          $target['query'] = $reference['query'];
      } elseif($isDefined($reference['authority'])) {
          # network-path reference: inherit the scheme only
          $target['scheme'] = $base['scheme'];
          $target['authority'] = $reference['authority'];
          $target['path'] = removeDotSegments($reference['path']);
          $target['query'] = $reference['query'];
      } else {
          $target['scheme'] = $base['scheme'];
          $target['authority'] = $base['authority'];

          if(!$isDefined($reference['path'])) {
              # empty path: keep the base path, and the base query unless the
              # reference supplies its own
              $target['path'] = $base['path'];
              $target['query'] = $isDefined($reference['query'])
                  ? $reference['query']
                  : $base['query'];
          } else {
              if(substr($reference['path'], 0, 1) == '/') {
                  $target['path'] = removeDotSegments($reference['path']);
              } else {
                  $target['path'] = removeDotSegments(mergePaths($base, $reference));
              }
              $target['query'] = $reference['query'];
          }
      }
      $target['fragment'] = $reference['fragment'];

      # 5.3 Component Recomposition
      $result = '';
      if($isDefined($target['scheme'])) {
          $result .= $target['scheme'] . ':';
      }
      if($isDefined($target['authority'])) {
          $result .= '//' . $target['authority'];
      }
      $result .= $target['path'];
      if($isDefined($target['query'])) {
          $result .= '?' . $target['query'];
      }
      if($isDefined($target['fragment'])) {
          $result .= '#' . $target['fragment'];
      } elseif($referenceURI == '#') {
          # a bare "#" reference carries no fragment text but must keep its "#"
          $result .= '#';
      }
      return $result;
  }
  # 5.2.3 Merge Paths
  #
  # Merges a relative reference path with the base path.
  # $base and $reference are component arrays as produced by
  # parseUriToComponents(); 'authority' and 'path' are read from $base and
  # 'path' from $reference. Returns the merged path as a string.
  function mergePaths($base, $reference) {
      $basePath = (string) $base['path'];

      # If the base URI has a defined authority component and an empty
      # path,
      if((string) $base['authority'] !== '' && $basePath === '') {
          # then return a string consisting of "/" concatenated with the
          # reference's path; otherwise,
          return '/' . $reference['path'];
      }

      $lastSlash = strrpos($basePath, '/');
      if($lastSlash === false) {
          # the base path contains no "/" characters, so the entire base path
          # is excluded and only the reference's path remains
          return (string) $reference['path'];
      }

      # return a string consisting of the reference's path component
      # appended to all but the last segment of the base URI's path (i.e.,
      # excluding any characters after the right-most "/" in the base URI
      # path)
      return substr($basePath, 0, $lastSlash + 1) . $reference['path'];
  }
  # 5.2.4.A Remove leading ../ or ./
  #
  # Strips one leading "../" or, failing that, one leading "./" from $input
  # (modified in place). Any other input is left unchanged.
  function removeLeadingDotSlash(&$input) {
      foreach(array('../', './') as $prefix) {
          $prefixLength = strlen($prefix);
          if(substr($input, 0, $prefixLength) == $prefix) {
              $input = substr($input, $prefixLength);
              return;
          }
      }
  }
  # 5.2.4.B Replace leading /. with /
  #
  # Replaces a leading "/./" in $input (modified in place) with "/". For any
  # other input the first two characters are replaced with "/", which covers
  # the bare "/." case.
  function removeLeadingSlashDot(&$input) {
      $prefixLength = (substr($input, 0, 3) == '/./') ? 3 : 2;
      $input = '/' . substr($input, $prefixLength);
  }
  # 5.2.4.C Given leading /../ remove component from output buffer
  #
  # Replaces the leading "/../" (or, for any other input, the first three
  # characters, covering a bare "/..") of $input with "/", then cuts $output
  # back to what precedes its right-most "/". Both arguments are modified in
  # place.
  function removeOneDirLevel(&$input, &$output) {
      $prefixLength = (substr($input, 0, 4) == '/../') ? 4 : 3;
      $input = '/' . substr($input, $prefixLength);

      # drop the last segment together with its preceding slash
      $lastSlash = strrpos($output, '/');
      $output = substr($output, 0, $lastSlash);
  }
  # 5.2.4.D Remove . and .. if it's the only thing in the input
  #
  # Drops the first character of $input (modified in place) when it is exactly
  # ".", and the first two characters otherwise (the ".." case).
  function removeLoneDotDot(&$input) {
      $charsToDrop = ($input == '.') ? 1 : 2;
      $input = substr($input, $charsToDrop);
  }
  # 5.2.4.E Move one segment from input to output
  #
  # Appends the first path segment of $input (including its leading "/", if
  # any) to $output and removes it from $input. Both arguments are modified in
  # place. When no further "/" follows, the whole of $input is moved.
  function moveOneSegmentFromInput(&$input, &$output) {
      # a leading slash belongs to the segment, so search for the next one after it
      $searchFrom = (substr($input, 0, 1) != '/') ? 0 : 1;
      $pos = strpos($input, '/', $searchFrom);

      if($pos !== false) {
          $output .= substr($input, 0, $pos);
          $input = substr($input, $pos);
          return;
      }

      # last segment: everything that is left moves across
      $output .= $input;
      $input = '';
  }
  # 5.2.4 Remove Dot Segments
  #
  # Removes the "." and ".." segments from $path following the algorithm of
  # RFC 3986 section 5.2.4 and returns the cleaned path as a string. A null or
  # empty path yields the empty string.
  function removeDotSegments($path) {
      # 1.  The input buffer is initialized with the now-appended path
      #     components and the output buffer is initialized to the empty
      #     string.
      $input = (string) $path;
      $output = '';

      # 2.  While the input buffer is not empty, loop as follows:
      # (compared as a string: a remaining buffer of "0" is not empty and must
      # still be moved to the output, and a helper may leave a non-string empty
      # value behind on older PHP versions)
      while((string) $input !== '') {
          if(substr($input, 0, 3) == '../' || substr($input, 0, 2) == './') {
              # A.  If the input buffer begins with a prefix of "../" or "./",
              #     then remove that prefix from the input buffer; otherwise,
              removeLeadingDotSlash($input);
          } elseif(substr($input, 0, 3) == '/./' || $input == '/.') {
              # B.  if the input buffer begins with a prefix of "/./" or "/.",
              #     where "." is a complete path segment, then replace that
              #     prefix with "/" in the input buffer; otherwise,
              removeLeadingSlashDot($input);
          } elseif(substr($input, 0, 4) == '/../' || $input == '/..') {
              # C.  if the input buffer begins with a prefix of "/../" or "/..",
              #     where ".." is a complete path segment, then replace that
              #     prefix with "/" in the input buffer and remove the last
              #     segment and its preceding "/" (if any) from the output
              #     buffer; otherwise,
              removeOneDirLevel($input, $output);
          } elseif($input == '.' || $input == '..') {
              # D.  if the input buffer consists only of "." or "..", then remove
              #     that from the input buffer; otherwise,
              removeLoneDotDot($input);
          } else {
              # E.  move the first path segment in the input buffer to the end of
              #     the output buffer and any subsequent characters up to, but not including,
              #     the next "/" character or the end of the input buffer
              moveOneSegmentFromInput($input, $output);
          }
      }

      return $output;
  }