12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585 |
- <?php
- /**
- * An implementation of the tree building portion of the HTML5 parsing
- * spec.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- * http://www.gnu.org/copyleft/gpl.html
- *
- * @file
- * @ingroup Parser
- * @since 1.27
- * @author C. Scott Ananian, 2016
- */
- namespace MediaWiki\Tidy;
- use ExplodeIterator;
- use IteratorAggregate;
- use ReverseArrayIterator;
- use Sanitizer;
- use Wikimedia\Assert\Assert;
- use Wikimedia\Assert\ParameterAssertionException;
- // A note for future librarization[1] -- this file is a good candidate
- // for splitting into an independent library, except that it is currently
- // highly optimized for MediaWiki use. It only implements the portions
- // of the HTML5 tree builder used by tags supported by MediaWiki, and
- // does not contain a true tokenizer pass, instead relying on
- // comment stripping, attribute normalization, and escaping done by
- // the MediaWiki Sanitizer. It also deliberately avoids building
- // a true DOM in memory, instead serializing elements to an output string
- // as soon as possible (usually as soon as the tag is closed) to reduce
- // its memory footprint.
- // We've been gradually lifting some of these restrictions to handle
- // non-sanitized output generated by extensions, but we shortcut the tokenizer
- // for speed (primarily by splitting on `<`) and so rely on syntactic
- // well-formedness.
- // On the other hand, I've been pretty careful to note with comments in the
- // code the places where this implementation omits features of the spec or
- // depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
- // implement the missing pieces and make this a standalone PHP HTML5 parser.
- // In order to do so, some sort of MediaWiki-specific API will need
- // to be added to (a) allow the Balancer to bypass the tokenizer,
- // and (b) support on-the-fly flattening instead of DOM node creation.
- // [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
/**
 * Constants and lookup tables shared by the HTML5 tree building code.
 *
 * Every "set" is an associative array keyed first by namespace URI and
 * then by lower-cased tag name; the stored value is always `true`, so
 * membership is tested with isset().
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceSets {
	const HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml';
	const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
	const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';

	/** @var array HTML tags listed as unsupported. */
	public static $unsupportedSet = [
		self::HTML_NAMESPACE => [
			'html' => true,
			'head' => true,
			'body' => true,
			'frameset' => true,
			'frame' => true,
			'plaintext' => true,
			'xmp' => true,
			'iframe' => true,
			'noembed' => true,
			'noscript' => true,
			'script' => true,
			'title' => true
		]
	];

	/**
	 * @var array HTML elements that never have children;
	 *  BalanceElement::__toString() emits these in `<tag />` form.
	 */
	public static $emptyElementSet = [
		self::HTML_NAMESPACE => [
			'area' => true, 'base' => true, 'basefont' => true, 'bgsound' => true,
			'br' => true, 'col' => true, 'command' => true, 'embed' => true,
			'frame' => true, 'hr' => true, 'img' => true, 'input' => true,
			'keygen' => true, 'link' => true, 'meta' => true, 'param' => true,
			'source' => true, 'track' => true, 'wbr' => true
		]
	];

	/**
	 * @var array Elements for which BalanceElement::flatten() doubles a
	 *  leading linefeed when running in tidy-compatibility mode.
	 */
	public static $extraLinefeedSet = [
		self::HTML_NAMESPACE => [
			'pre' => true,
			'textarea' => true,
			'listing' => true,
		]
	];

	/** @var array The HTML heading elements h1 through h6. */
	public static $headingSet = [
		self::HTML_NAMESPACE => [
			'h1' => true, 'h2' => true,
			'h3' => true, 'h4' => true,
			'h5' => true, 'h6' => true
		]
	];

	/** @var array The "special" elements, across all three namespaces. */
	public static $specialSet = [
		self::HTML_NAMESPACE => [
			'address' => true, 'applet' => true, 'area' => true, 'article' => true,
			'aside' => true,
			'base' => true, 'basefont' => true, 'bgsound' => true,
			'blockquote' => true, 'body' => true, 'br' => true, 'button' => true,
			'caption' => true, 'center' => true, 'col' => true, 'colgroup' => true,
			'dd' => true, 'details' => true, 'dir' => true, 'div' => true,
			'dl' => true, 'dt' => true,
			'embed' => true,
			'fieldset' => true, 'figcaption' => true, 'figure' => true,
			'footer' => true, 'form' => true, 'frame' => true, 'frameset' => true,
			'h1' => true, 'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true,
			'h6' => true, 'head' => true, 'header' => true, 'hgroup' => true,
			'hr' => true, 'html' => true,
			'iframe' => true, 'img' => true, 'input' => true,
			'li' => true, 'link' => true, 'listing' => true,
			'main' => true, 'marquee' => true, 'menu' => true, 'meta' => true,
			'nav' => true, 'noembed' => true, 'noframes' => true, 'noscript' => true,
			'object' => true, 'ol' => true,
			'p' => true, 'param' => true, 'plaintext' => true, 'pre' => true,
			'script' => true, 'section' => true, 'select' => true, 'source' => true,
			'style' => true, 'summary' => true,
			'table' => true, 'tbody' => true, 'td' => true, 'template' => true,
			'textarea' => true, 'tfoot' => true, 'th' => true, 'thead' => true,
			'title' => true, 'tr' => true, 'track' => true,
			'ul' => true,
			'wbr' => true,
			'xmp' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true,
			'ms' => true, 'mtext' => true, 'annotation-xml' => true
		]
	];

	/** @var array The address, div and p elements. */
	public static $addressDivPSet = [
		self::HTML_NAMESPACE => [
			'address' => true,
			'div' => true,
			'p' => true
		]
	];

	/** @var array The table element plus its section and row elements. */
	public static $tableSectionRowSet = [
		self::HTML_NAMESPACE => [
			'table' => true,
			'thead' => true,
			'tbody' => true,
			'tfoot' => true,
			'tr' => true
		]
	];

	/** @var array Elements listed as having implied end tags. */
	public static $impliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'dd' => true, 'dt' => true, 'li' => true, 'menuitem' => true,
			'optgroup' => true, 'option' => true, 'p' => true,
			'rb' => true, 'rp' => true, 'rt' => true, 'rtc' => true
		]
	];

	/** @var array The larger "thorough" list of implied-end-tag elements. */
	public static $thoroughImpliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'caption' => true, 'colgroup' => true,
			'dd' => true, 'dt' => true, 'li' => true,
			'optgroup' => true, 'option' => true, 'p' => true,
			'rb' => true, 'rp' => true, 'rt' => true, 'rtc' => true,
			'tbody' => true, 'td' => true, 'tfoot' => true,
			'th' => true, 'thead' => true, 'tr' => true
		]
	];

	/** @var array The table cell elements td and th. */
	public static $tableCellSet = [
		self::HTML_NAMESPACE => [
			'td' => true,
			'th' => true
		]
	];

	/** @var array Elements delimiting a table context. */
	public static $tableContextSet = [
		self::HTML_NAMESPACE => [
			'table' => true,
			'template' => true,
			'html' => true
		]
	];

	/** @var array Elements delimiting a table body context. */
	public static $tableBodyContextSet = [
		self::HTML_NAMESPACE => [
			'tbody' => true,
			'tfoot' => true,
			'thead' => true,
			'template' => true,
			'html' => true
		]
	];

	/** @var array Elements delimiting a table row context. */
	public static $tableRowContextSet = [
		self::HTML_NAMESPACE => [
			'tr' => true,
			'template' => true,
			'html' => true
		]
	];

	// See https://html.spec.whatwg.org/multipage/forms.html#form-associated-element
	/** @var array Form-associated elements. */
	public static $formAssociatedSet = [
		self::HTML_NAMESPACE => [
			'button' => true, 'fieldset' => true,
			'input' => true, 'keygen' => true,
			'object' => true, 'output' => true,
			'select' => true, 'textarea' => true,
			'img' => true
		]
	];

	/**
	 * @var array Base scope-delimiting elements; the list-item and button
	 *  scope sets are derived from this one.
	 */
	public static $inScopeSet = [
		self::HTML_NAMESPACE => [
			'applet' => true, 'caption' => true,
			'html' => true, 'marquee' => true,
			'object' => true, 'table' => true,
			'td' => true, 'template' => true,
			'th' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true,
			'ms' => true, 'mtext' => true, 'annotation-xml' => true
		]
	];

	/** @var array|null Cache for inListItemScopeSet(); null until first use. */
	private static $inListItemScopeSet = null;

	/**
	 * Return $inScopeSet extended with the HTML `ol` and `ul` elements.
	 * The result is built on first call and cached afterwards.
	 *
	 * @return array
	 */
	public static function inListItemScopeSet() {
		if ( self::$inListItemScopeSet !== null ) {
			return self::$inListItemScopeSet;
		}
		$scope = self::$inScopeSet;
		$scope[self::HTML_NAMESPACE]['ol'] = true;
		$scope[self::HTML_NAMESPACE]['ul'] = true;
		self::$inListItemScopeSet = $scope;
		return $scope;
	}

	/** @var array|null Cache for inButtonScopeSet(); null until first use. */
	private static $inButtonScopeSet = null;

	/**
	 * Return $inScopeSet extended with the HTML `button` element.
	 * The result is built on first call and cached afterwards.
	 *
	 * @return array
	 */
	public static function inButtonScopeSet() {
		if ( self::$inButtonScopeSet !== null ) {
			return self::$inButtonScopeSet;
		}
		$scope = self::$inScopeSet;
		$scope[self::HTML_NAMESPACE]['button'] = true;
		self::$inButtonScopeSet = $scope;
		return $scope;
	}

	/** @var array Scope-delimiting elements for table scope. */
	public static $inTableScopeSet = [
		self::HTML_NAMESPACE => [
			'html' => true,
			'table' => true,
			'template' => true
		]
	];

	/** @var array The option and optgroup elements. */
	public static $inInvertedSelectScopeSet = [
		self::HTML_NAMESPACE => [
			'option' => true,
			'optgroup' => true
		]
	];

	/**
	 * @var array MathML text integration points; consulted by
	 *  BalanceElement::isMathmlTextIntegrationPoint().
	 */
	public static $mathmlTextIntegrationPointSet = [
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true,
			'ms' => true, 'mtext' => true
		]
	];

	/**
	 * @var array SVG elements that are HTML integration points; consulted
	 *  by BalanceElement::isHtmlIntegrationPoint().
	 */
	public static $htmlIntegrationPointSet = [
		self::SVG_NAMESPACE => [
			'foreignobject' => true, 'desc' => true, 'title' => true
		]
	];

	// For tidy compatibility.
	/** @var array Elements listed for tidy-style paragraph wrapping. */
	public static $tidyPWrapSet = [
		self::HTML_NAMESPACE => [
			'body' => true,
			'blockquote' => true,
			// We parse with <body> as the fragment context, but the top-level
			// element on the stack is actually <html>. We could use the
			// "adjusted current node" everywhere to work around this, but it's
			// easier just to add <html> to the p-wrap set.
			'html' => true,
		],
	];

	/** @var array Elements listed as inline for tidy compatibility. */
	public static $tidyInlineSet = [
		self::HTML_NAMESPACE => [
			'a' => true, 'abbr' => true, 'acronym' => true, 'applet' => true,
			'b' => true, 'basefont' => true, 'bdo' => true, 'big' => true,
			'br' => true, 'button' => true,
			'cite' => true, 'code' => true,
			'dfn' => true,
			'em' => true,
			'font' => true,
			'i' => true, 'iframe' => true, 'img' => true, 'input' => true,
			'kbd' => true,
			'label' => true, 'legend' => true,
			'map' => true,
			'object' => true,
			'param' => true,
			'q' => true,
			'rb' => true, 'rbc' => true, 'rp' => true, 'rt' => true,
			'rtc' => true, 'ruby' => true,
			's' => true, 'samp' => true, 'select' => true, 'small' => true,
			'span' => true, 'strike' => true, 'strong' => true, 'sub' => true,
			'sup' => true,
			'textarea' => true, 'tt' => true,
			'u' => true,
			'var' => true,
			// Those defined in tidy.conf
			'video' => true, 'audio' => true, 'bdi' => true,
			'data' => true, 'time' => true, 'mark' => true,
		],
	];
}
- /**
- * A BalanceElement is a simplified version of a DOM Node. The main
- * difference is that we only keep BalanceElements around for nodes
- * currently on the BalanceStack of open elements. As soon as an
- * element is closed, with some minor exceptions relating to the
- * tree builder "adoption agency algorithm", the element and all its
- * children are serialized to a string using the flatten() method.
- * This keeps our memory usage low.
- *
- * @ingroup Parser
- * @since 1.27
- */
- class BalanceElement {
- /**
- * The namespace of the element.
- * @var string $namespaceURI
- */
- public $namespaceURI;
- /**
- * The lower-cased name of the element.
- * @var string $localName
- */
- public $localName;
- /**
- * Attributes for the element, in array form
- * @var array $attribs
- */
- public $attribs;
- /**
- * Parent of this element, or the string "flat" if this element has
- * already been flattened into its parent.
- * @var BalanceElement|string|null $parent
- */
- public $parent;
- /**
- * An array of children of this element. Typically only the last
- * child will be an actual BalanceElement object; the rest will
- * be strings, representing either text nodes or flattened
- * BalanceElement objects.
- * @var BalanceElement[]|string[] $children
- */
- public $children;
- /**
- * A unique string identifier for Noah's Ark purposes, lazy initialized
- */
- private $noahKey;
- /**
- * The next active formatting element in the list, or null if this is the
- * end of the AFE list or if the element is not in the AFE list.
- */
- public $nextAFE;
- /**
- * The previous active formatting element in the list, or null if this is
- * the start of the list or if the element is not in the AFE list.
- */
- public $prevAFE;
- /**
- * The next element in the Noah's Ark species bucket.
- */
- public $nextNoah;
- /**
- * Make a new BalanceElement corresponding to the HTML DOM Element
- * with the given localname, namespace, and attributes.
- *
- * @param string $namespaceURI The namespace of the element.
- * @param string $localName The lowercased name of the tag.
- * @param array $attribs Attributes of the element
- */
- public function __construct( $namespaceURI, $localName, array $attribs ) {
- $this->localName = $localName;
- $this->namespaceURI = $namespaceURI;
- $this->attribs = $attribs;
- $this->contents = '';
- $this->parent = null;
- $this->children = [];
- }
- /**
- * Remove the given child from this element.
- * @param BalanceElement $elt
- */
- private function removeChild( BalanceElement $elt ) {
- Assert::precondition(
- $this->parent !== 'flat', "Can't removeChild after flattening $this"
- );
- Assert::parameter(
- $elt->parent === $this, 'elt', 'must have $this as a parent'
- );
- $idx = array_search( $elt, $this->children, true );
- Assert::parameter( $idx !== false, '$elt', 'must be a child of $this' );
- $elt->parent = null;
- array_splice( $this->children, $idx, 1 );
- }
- /**
- * Find $a in the list of children and insert $b before it.
- * @param BalanceElement $a
- * @param BalanceElement|string $b
- */
- public function insertBefore( BalanceElement $a, $b ) {
- Assert::precondition(
- $this->parent !== 'flat', "Can't insertBefore after flattening."
- );
- $idx = array_search( $a, $this->children, true );
- Assert::parameter( $idx !== false, '$a', 'must be a child of $this' );
- if ( is_string( $b ) ) {
- array_splice( $this->children, $idx, 0, [ $b ] );
- } else {
- Assert::parameter( $b->parent !== 'flat', '$b', "Can't be flat" );
- if ( $b->parent !== null ) {
- $b->parent->removeChild( $b );
- }
- array_splice( $this->children, $idx, 0, [ $b ] );
- $b->parent = $this;
- }
- }
- /**
- * Append $elt to the end of the list of children.
- * @param BalanceElement|string $elt
- */
- public function appendChild( $elt ) {
- Assert::precondition(
- $this->parent !== 'flat', "Can't appendChild after flattening."
- );
- if ( is_string( $elt ) ) {
- array_push( $this->children, $elt );
- return;
- }
- // Remove $elt from parent, if it had one.
- if ( $elt->parent !== null ) {
- $elt->parent->removeChild( $elt );
- }
- array_push( $this->children, $elt );
- $elt->parent = $this;
- }
- /**
- * Transfer all of the children of $elt to $this.
- * @param BalanceElement $elt
- */
- public function adoptChildren( BalanceElement $elt ) {
- Assert::precondition(
- $elt->parent !== 'flat', "Can't adoptChildren after flattening."
- );
- foreach ( $elt->children as $child ) {
- if ( !is_string( $child ) ) {
- // This is an optimization which avoids an O(n^2) set of
- // array_splice operations.
- $child->parent = null;
- }
- $this->appendChild( $child );
- }
- $elt->children = [];
- }
- /**
- * Flatten this node and all of its children into a string, as specified
- * by the HTML serialization specification, and replace this node
- * in its parent by that string.
- *
- * @param array $config Balancer configuration; see Balancer::__construct().
- * @return string
- *
- * @see __toString()
- */
- public function flatten( array $config ) {
- Assert::parameter( $this->parent !== null, '$this', 'must be a child' );
- Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' );
- $idx = array_search( $this, $this->parent->children, true );
- Assert::parameter(
- $idx !== false, '$this', 'must be a child of its parent'
- );
- $tidyCompat = $config['tidyCompat'];
- if ( $tidyCompat ) {
- $blank = true;
- foreach ( $this->children as $elt ) {
- if ( !is_string( $elt ) ) {
- $elt = $elt->flatten( $config );
- }
- if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) {
- $blank = false;
- }
- }
- if ( $this->isHtmlNamed( 'mw:p-wrap' ) ) {
- $this->localName = 'p';
- } elseif ( $blank ) {
- // Add 'mw-empty-elt' class so elements can be hidden via CSS
- // for compatibility with legacy tidy.
- if ( !count( $this->attribs ) &&
- ( $this->localName === 'tr' || $this->localName === 'li' )
- ) {
- $this->attribs = [ 'class' => "mw-empty-elt" ];
- }
- $blank = false;
- } elseif (
- $this->isA( BalanceSets::$extraLinefeedSet ) &&
- count( $this->children ) > 0 &&
- substr( $this->children[0], 0, 1 ) == "\n"
- ) {
- // Double the linefeed after pre/listing/textarea
- // according to the (old) HTML5 fragment serialization
- // algorithm (see https://github.com/whatwg/html/issues/944)
- // to ensure this will round-trip.
- array_unshift( $this->children, "\n" );
- }
- $flat = $blank ? '' : "{$this}";
- } else {
- $flat = "{$this}";
- }
- $this->parent->children[$idx] = $flat;
- $this->parent = 'flat'; // for assertion checking
- return $flat;
- }
- /**
- * Serialize this node and all of its children to a string, as specified
- * by the HTML serialization specification.
- *
- * @return string The serialization of the BalanceElement
- * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
- */
- public function __toString() {
- $encAttribs = '';
- foreach ( $this->attribs as $name => $value ) {
- $encValue = Sanitizer::encodeAttribute( $value );
- $encAttribs .= " $name=\"$encValue\"";
- }
- if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
- $out = "<{$this->localName}{$encAttribs}>";
- $len = strlen( $out );
- // flatten children
- foreach ( $this->children as $elt ) {
- $out .= "{$elt}";
- }
- $out .= "</{$this->localName}>";
- } else {
- $out = "<{$this->localName}{$encAttribs} />";
- Assert::invariant(
- count( $this->children ) === 0,
- "Empty elements shouldn't have children."
- );
- }
- return $out;
- }
- // Utility functions on BalanceElements.
- /**
- * Determine if $this represents a specific HTML tag, is a member of
- * a tag set, or is equal to another BalanceElement.
- *
- * @param BalanceElement|array|string $set The target BalanceElement,
- * set (from the BalanceSets class), or string (HTML tag name).
- * @return bool
- */
- public function isA( $set ) {
- if ( $set instanceof BalanceElement ) {
- return $this === $set;
- } elseif ( is_array( $set ) ) {
- return isset( $set[$this->namespaceURI] ) &&
- isset( $set[$this->namespaceURI][$this->localName] );
- } else {
- // assume this is an HTML element name.
- return $this->isHtml() && $this->localName === $set;
- }
- }
/**
 * Check whether this element lives in the HTML namespace and has
 * exactly the given local name.
 * @param string $tagName
 * @return bool
 */
public function isHtmlNamed( $tagName ) {
    if ( $this->namespaceURI !== BalanceSets::HTML_NAMESPACE ) {
        return false;
    }
    return $this->localName === $tagName;
}
/**
 * Check whether this element's namespace is the HTML namespace.
 *
 * @return bool
 */
public function isHtml() {
    $htmlNamespace = BalanceSets::HTML_NAMESPACE;
    return $this->namespaceURI === $htmlNamespace;
}
/**
 * Check whether this element is a MathML text integration point, i.e.
 * a member of BalanceSets::$mathmlTextIntegrationPointSet.
 *
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point
 */
public function isMathmlTextIntegrationPoint() {
    $integrationPoints = BalanceSets::$mathmlTextIntegrationPointSet;
    return $this->isA( $integrationPoints );
}
/**
 * Check whether this element is an HTML integration point.
 *
 * That is the case when it belongs to
 * BalanceSets::$htmlIntegrationPointSet, or when it is a MathML
 * `annotation-xml` element whose `encoding` attribute equals
 * "text/html" or "application/xhtml+xml", compared case-insensitively.
 *
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point
 */
public function isHtmlIntegrationPoint() {
    if ( $this->isA( BalanceSets::$htmlIntegrationPointSet ) ) {
        return true;
    }
    if ( $this->namespaceURI !== BalanceSets::MATHML_NAMESPACE ) {
        return false;
    }
    if ( $this->localName !== 'annotation-xml' ) {
        return false;
    }
    if ( !isset( $this->attribs['encoding'] ) ) {
        return false;
    }
    return strcasecmp( $this->attribs['encoding'], 'text/html' ) == 0 ||
        strcasecmp( $this->attribs['encoding'], 'application/xhtml+xml' ) == 0;
}
/**
 * Get a string key for the Noah's Ark algorithm.
 *
 * The key is the serialization of the namespace, the local name and the
 * attributes sorted by attribute name. It is computed on first use and
 * cached in $this->noahKey.
 * @return string
 */
public function getNoahKey() {
    if ( $this->noahKey !== null ) {
        return $this->noahKey;
    }
    // Sort a copy so that attribute order does not affect the key.
    $sortedAttribs = $this->attribs;
    ksort( $sortedAttribs );
    $this->noahKey = serialize(
        [ $this->namespaceURI, $this->localName, $sortedAttribs ]
    );
    return $this->noahKey;
}
- }
- /**
- * The "stack of open elements" as defined in the HTML5 tree builder
- * spec. This contains methods to ensure that content (start tags, text)
- * are inserted at the correct place in the output string, and to
- * flatten BalanceElements as they are closed to avoid holding onto
- * a complete DOM tree for the document in memory.
- *
- * The stack defines a PHP iterator to traverse it in "reverse order",
- * that is, the most-recently-added element is visited first in a
- * foreach loop.
- *
- * @ingroup Parser
- * @since 1.27
- * @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
- */
- class BalanceStack implements IteratorAggregate {
- /**
- * Backing storage for the stack.
- * @var BalanceElement[] $elements
- */
- private $elements = [];
- /**
- * Foster parent mode determines how nodes are inserted into the
- * stack.
- * @var bool $fosterParentMode
- * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
- */
- public $fosterParentMode = false;
- /**
- * Configuration options governing flattening.
- * @var array $config
- * @see Balancer::__construct()
- */
- private $config;
- /**
- * Reference to the current element
- */
- public $currentNode;
/**
 * Create a new BalanceStack whose only entry is a BalanceElement
 * representing the root <html> node; that element also becomes the
 * current node.
 * @param array $config Balancer configuration; see Balancer::__construct().
 */
public function __construct( array $config ) {
    $this->config = $config;
    // There is always a root <html> element at the bottom of the stack.
    $root = new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] );
    $this->elements[] = $root;
    $this->currentNode = $root;
}
/**
 * Build the output of the tree builder by concatenating all children
 * of the root <html> node. String children are used verbatim; any other
 * child is flattened with the stack's configuration first.
 * @return string
 */
public function getOutput() {
    // The outer '<html>....</html>' wrapper is deliberately left out.
    $out = '';
    foreach ( $this->elements[0]->children as $child ) {
        if ( is_string( $child ) ) {
            $out .= $child;
        } else {
            $out .= $child->flatten( $this->config );
        }
    }
    return $out;
}
/**
 * Insert a comment at the appropriate place for inserting a node.
 *
 * The value is wrapped in comment delimiters and handed to insertText()
 * with the comment flag set, which exempts it from tidy p-wrapping.
 * @param string $value Content of the comment.
 * @return mixed Whatever insertText() returns.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-comment
 */
public function insertComment( $value ) {
    $comment = '<!--' . $value . '-->';
    return $this->insertText( $comment, true );
}
/**
 * Insert text at the appropriate place for inserting a node.
 *
 * Three cases, checked in order:
 *  1. foster-parent mode is on and the current node is in the
 *     table/section/row set: the text is foster parented;
 *  2. tidy compatibility is on, the text is not a comment, and the
 *     current node is in the tidy p-wrap set: an `mw:p-wrap` element is
 *     opened and the insertion is retried inside it;
 *  3. otherwise the text is appended to the current node.
 *
 * @param string $value
 * @param bool $isComment
 * @return mixed Only case 2 returns explicitly, forwarding the result
 *   of the recursive call; the other cases return nothing.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertText( $value, $isComment = false ) {
    $needsFostering = $this->fosterParentMode &&
        $this->currentNode->isA( BalanceSets::$tableSectionRowSet );
    if ( $needsFostering ) {
        $this->fosterParent( $value );
        return;
    }
    $needsPWrap = $this->config['tidyCompat'] && !$isComment &&
        $this->currentNode->isA( BalanceSets::$tidyPWrapSet );
    if ( $needsPWrap ) {
        $this->insertHTMLElement( 'mw:p-wrap', [] );
        return $this->insertText( $value );
    }
    $this->currentNode->appendChild( $value );
}
/**
 * Create a BalanceElement in the given namespace and insert it at the
 * appropriate place, pushing it on to the open elements stack.
 * @param string $namespaceURI The element namespace
 * @param string $tag The tag name
 * @param array $attribs Attributes, passed unchanged to the
 *   BalanceElement constructor.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element
 */
public function insertForeignElement( $namespaceURI, $tag, $attribs ) {
    $element = new BalanceElement( $namespaceURI, $tag, $attribs );
    return $this->insertElement( $element );
}
/**
 * Create an element in the HTML namespace and insert it at the
 * appropriate place, pushing it on to the open elements stack.
 * @param string $tag The tag name
 * @param array $attribs Attributes, passed unchanged to the
 *   BalanceElement constructor.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element
 */
public function insertHTMLElement( $tag, $attribs ) {
    $namespace = BalanceSets::HTML_NAMESPACE;
    return $this->insertForeignElement( $namespace, $tag, $attribs );
}
/**
 * Insert an element at the appropriate place, push it on to the open
 * elements stack and make it the current node.
 *
 * If the current node is an `mw:p-wrap` and the new element is not in
 * the tidy inline set, the wrapper is popped first. The element is then
 * either foster parented (foster-parent mode with a table/section/row
 * current node) or appended to the current node. Note that fostering
 * may hand back a different element than the one passed in.
 *
 * @param BalanceElement $elt
 * @return BalanceElement The element that ended up on the stack.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertElement( BalanceElement $elt ) {
    $closesPWrap = $this->currentNode->isHtmlNamed( 'mw:p-wrap' ) &&
        !$elt->isA( BalanceSets::$tidyInlineSet );
    if ( $closesPWrap ) {
        // Tidy compatibility.
        $this->pop();
    }
    // Evaluated after the pop above, which may change the current node.
    $needsFostering = $this->fosterParentMode &&
        $this->currentNode->isA( BalanceSets::$tableSectionRowSet );
    if ( $needsFostering ) {
        $elt = $this->fosterParent( $elt );
    } else {
        $this->currentNode->appendChild( $elt );
    }
    Assert::invariant( $elt->parent !== null, "$elt must be in tree" );
    Assert::invariant( $elt->parent !== 'flat', "$elt must not have been previous flattened" );
    $this->elements[] = $elt;
    $this->currentNode = $elt;
    return $elt;
}
/**
 * Determine if the stack has $tag in scope, using
 * BalanceSets::$inScopeSet as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope
 */
public function inScope( $tag ) {
    $boundary = BalanceSets::$inScopeSet;
    return $this->inSpecificScope( $tag, $boundary );
}
/**
 * Determine if the stack has $tag in button scope, using the set
 * returned by BalanceSets::inButtonScopeSet() as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope
 */
public function inButtonScope( $tag ) {
    $boundary = BalanceSets::inButtonScopeSet();
    return $this->inSpecificScope( $tag, $boundary );
}
/**
 * Determine if the stack has $tag in list item scope, using the set
 * returned by BalanceSets::inListItemScopeSet() as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope
 */
public function inListItemScope( $tag ) {
    $boundary = BalanceSets::inListItemScopeSet();
    return $this->inSpecificScope( $tag, $boundary );
}
/**
 * Determine if the stack has $tag in table scope, using
 * BalanceSets::$inTableScopeSet as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope
 */
public function inTableScope( $tag ) {
    $boundary = BalanceSets::$inTableScopeSet;
    return $this->inSpecificScope( $tag, $boundary );
}
/**
 * Determine if the stack has $tag in select scope.
 *
 * Walks the stack from the current node towards the root. The search
 * succeeds at the first element matching $tag and fails at the first
 * non-matching element that is not in
 * BalanceSets::$inInvertedSelectScopeSet, or when the stack is
 * exhausted.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-select-scope
 */
public function inSelectScope( $tag ) {
    // inSpecificScope() cannot be reused here because select scope is
    // defined by an *inverted* set of tags.
    $transparent = BalanceSets::$inInvertedSelectScopeSet;
    foreach ( $this as $candidate ) {
        if ( $candidate->isA( $tag ) ) {
            return true;
        }
        $blocksScope = !$candidate->isA( $transparent );
        if ( $blocksScope ) {
            return false;
        }
    }
    return false;
}
/**
 * Determine if the stack has $tag in a specific scope, $set.
 *
 * Walks the stack from the current node towards the root, succeeding at
 * the first element matching $tag and failing at the first non-matching
 * element that matches the boundary $set, or when the stack is
 * exhausted.
 * @param BalanceElement|array|string $tag
 * @param BalanceElement|array|string $set
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope
 */
public function inSpecificScope( $tag, $set ) {
    foreach ( $this as $candidate ) {
        // The target check comes first: an element that is both target
        // and boundary counts as found.
        if ( $candidate->isA( $tag ) ) {
            return true;
        }
        if ( $candidate->isA( $set ) ) {
            break;
        }
    }
    return false;
}
/**
 * Generate implied end tags: keep popping the current node while it is
 * a member of the implied-end-tag set and is not the excluded tag.
 * @param string|null $butnot HTML tag name that must not be popped, or
 *   null for no exclusion.
 * @param bool $thorough True to use the thorough implied-end-tag set
 *   rather than the regular one.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags
 */
public function generateImpliedEndTags( $butnot = null, $thorough = false ) {
    if ( $thorough ) {
        $impliedSet = BalanceSets::$thoroughImpliedEndTagsSet;
    } else {
        $impliedSet = BalanceSets::$impliedEndTagsSet;
    }
    while ( $this->currentNode ) {
        $node = $this->currentNode;
        $isExcluded = $butnot !== null && $node->isHtmlNamed( $butnot );
        if ( $isExcluded || !$node->isA( $impliedSet ) ) {
            return;
        }
        $this->pop();
    }
}
/**
 * Return the adjusted current node: the fragment context when one is
 * given and the stack holds exactly one element, otherwise the current
 * node.
 * @param mixed $fragmentContext Fragment context, or a falsy value if
 *   there is none.
 * @return mixed Either $fragmentContext or $this->currentNode.
 */
public function adjustedCurrentNode( $fragmentContext ) {
    if ( $fragmentContext && count( $this->elements ) === 1 ) {
        return $fragmentContext;
    }
    return $this->currentNode;
}
/**
 * Return an iterator over this stack which visits the current node
 * first, and the root node last. Implemented by wrapping the backing
 * array in a ReverseArrayIterator.
 * @return \Iterator
 */
public function getIterator() {
    $reversed = new ReverseArrayIterator( $this->elements );
    return $reversed;
}
/**
 * Look up the BalanceElement stored at stack position $idx. Position 0
 * is the root element.
 * @param int $idx
 * @return BalanceElement
 */
public function node( $idx ) {
    $element = $this->elements[ $idx ];
    return $element;
}
/**
 * Replace the element at position $idx in the BalanceStack with $elt.
 * Neither the old nor the new element may already have been flattened.
 * If $idx is the top of the stack, $elt also becomes the current node.
 * @param int $idx
 * @param BalanceElement $elt
 */
public function replaceAt( $idx, BalanceElement $elt ) {
    $outgoing = $this->elements[$idx];
    Assert::precondition(
        $outgoing->parent !== 'flat',
        'Replaced element should not have already been flattened.'
    );
    Assert::precondition(
        $elt->parent !== 'flat',
        'New element should not have already been flattened.'
    );
    $this->elements[$idx] = $elt;
    $topIndex = count( $this->elements ) - 1;
    if ( $idx === $topIndex ) {
        $this->currentNode = $elt;
    }
}
/**
 * Return the position of the given BalanceElement, set, or HTML tag
 * name string in the BalanceStack. The search runs from the top of the
 * stack downwards, so the most recently pushed match wins.
 * @param BalanceElement|array|string $tag
 * @return int The stack position, or -1 if nothing matches.
 */
public function indexOf( $tag ) {
    $pos = count( $this->elements );
    while ( $pos-- > 0 ) {
        if ( $this->elements[$pos]->isA( $tag ) ) {
            return $pos;
        }
    }
    return -1;
}
/**
 * Count the elements currently held in the BalanceStack.
 * @return int
 */
public function length() {
    $size = count( $this->elements );
    return $size;
}
/**
 * Remove the current node from the BalanceStack and make the element
 * below it (or null, if the stack is now empty) the current node.
 * The removed element is flattened in the process, unless it is an
 * `mw:p-wrap` element.
 */
public function pop() {
    $popped = array_pop( $this->elements );
    $remaining = count( $this->elements );
    $this->currentNode = $remaining ? $this->elements[$remaining - 1] : null;
    if ( $popped->isHtmlNamed( 'mw:p-wrap' ) ) {
        return;
    }
    $popped->flatten( $this->config );
}
/**
 * Pop nodes off the top of the BalanceStack until only $idx elements
 * remain, so the element at position $idx is itself removed. Each node
 * goes through pop() and is flattened as described there.
 * @param int $idx
 */
public function popTo( $idx ) {
    // Every pop() removes exactly one element, so compare against the
    // live size of the stack.
    while ( count( $this->elements ) > $idx ) {
        $this->pop();
    }
}
/**
 * Pop elements off the stack up to and including the first element
 * matching $tag (an HTML tag name, a set, or a specific element). If
 * nothing matches, popping continues until there is no current node.
 * @param BalanceElement|array|string $tag
 */
public function popTag( $tag ) {
    while ( $this->currentNode ) {
        // Test before popping: pop() changes the current node.
        $isTarget = $this->currentNode->isA( $tag );
        $this->pop();
        if ( $isTarget ) {
            return;
        }
    }
}
/**
 * Pop elements off the stack until the current node is a member of
 * $set; that element itself stays on the stack.
 * @param BalanceElement|array|string $set
 */
public function clearToContext( $set ) {
    // Stop at one remaining element: the <html> root is never popped.
    while (
        count( $this->elements ) > 1 &&
        !$this->currentNode->isA( $set )
    ) {
        $this->pop();
    }
}
/**
 * Remove the given $elt from the BalanceStack, optionally flattening
 * it in the process. Neither $elt nor its parent may already have been
 * flattened, and $elt must be on the stack. If $elt was the top of the
 * stack, the element below it becomes the current node.
 * @param BalanceElement $elt The element to remove.
 * @param bool $flatten Whether to flatten the removed element.
 */
public function removeElement( BalanceElement $elt, $flatten = true ) {
    Assert::parameter(
        $elt->parent !== 'flat',
        '$elt',
        '$elt should not already have been flattened.'
    );
    Assert::parameter(
        $elt->parent->parent !== 'flat',
        '$elt',
        'The parent of $elt should not already have been flattened.'
    );
    $position = array_search( $elt, $this->elements, true );
    Assert::parameter( $position !== false, '$elt', 'must be in stack' );
    array_splice( $this->elements, $position, 1 );
    // After the splice, an index equal to the new size means the removed
    // element was the top of the stack.
    $wasTop = ( $position === count( $this->elements ) );
    if ( $wasTop ) {
        $this->currentNode = $this->elements[$position - 1];
    }
    if ( $flatten ) {
        // Serialize $elt into its parent now. Without this it would
        // still be serialized when its parent is; its object tree would
        // just stay in memory a little longer.
        $elt->flatten( $this->config );
    }
    Assert::postcondition(
        array_search( $elt, $this->elements, true ) === false,
        '$elt should no longer be in open elements stack'
    );
}
/**
 * Find $a in the BalanceStack and insert $b directly above it. If $a
 * is the top of the stack, $b becomes the new current node.
 * @param BalanceElement $a Element that must already be on the stack.
 * @param BalanceElement $b Element to insert.
 */
public function insertAfter( BalanceElement $a, BalanceElement $b ) {
    $idx = $this->indexOf( $a );
    // indexOf() signals "not found" with -1, never with false, so the
    // check has to be on the sign. Otherwise a missing $a would slip
    // through and $b would be spliced in at the bottom of the stack.
    Assert::parameter( $idx >= 0, '$a', 'must be in stack' );
    if ( $idx === count( $this->elements ) - 1 ) {
        $this->elements[] = $b;
        $this->currentNode = $b;
    } else {
        array_splice( $this->elements, $idx + 1, 0, [ $b ] );
    }
}
- // Fostering and adoption.
/**
 * Foster parent the given $elt in the stack of open elements.
 *
 * The foster parent is chosen as follows:
 *  - the last `template` on the stack, if there is one and it sits
 *    above the last `table` (or there is no table);
 *  - otherwise the parent of the last `table`, with the new node going
 *    immediately before that table;
 *  - otherwise the root `html` element.
 *
 * In tidy-compat mode, text fostered into a parent from the tidy p-wrap
 * set is instead inserted through a fresh `mw:p-wrap` element, and a
 * fostered `mw:p-wrap` element is dropped in favour of an existing
 * `mw:p-wrap` that directly precedes the insertion point.
 *
 * @param BalanceElement|string $elt
 * @return BalanceElement|string The node that was inserted, or the
 *   existing p-wrapper that is re-used in its place.
 *
 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
 */
private function fosterParent( $elt ) {
    $tableIdx = $this->indexOf( 'table' );
    $templateIdx = $this->indexOf( 'template' );
    $before = null;
    if ( $templateIdx >= 0 && ( $tableIdx < 0 || $templateIdx > $tableIdx ) ) {
        $parent = $this->elements[$templateIdx];
    } elseif ( $tableIdx >= 0 ) {
        $before = $this->elements[$tableIdx];
        $parent = $before->parent;
        // Assume all tables have parents, since we're not running scripts!
        Assert::invariant(
            $parent !== null, "All tables should have parents"
        );
    } else {
        // Fall back to the `html` element.
        $parent = $this->elements[0];
    }

    if ( $this->config['tidyCompat'] ) {
        if ( is_string( $elt ) ) {
            // Fostering text: does it need a p-wrapper?
            if ( $parent->isA( BalanceSets::$tidyPWrapSet ) ) {
                $this->insertHTMLElement( 'mw:p-wrap', [] );
                $this->insertText( $elt );
                return $elt;
            }
        } elseif ( $elt->isHtmlNamed( 'mw:p-wrap' ) ) {
            // Fostering a p-wrapper: can it be merged into the sibling
            // that precedes the insertion point?
            $insertAt = $before ?
                array_search( $before, $parent->children, true ) :
                count( $parent->children );
            $preceding = $insertAt > 0 ? $parent->children[$insertAt - 1] : '';
            if (
                $preceding instanceof BalanceElement &&
                $preceding->isHtmlNamed( 'mw:p-wrap' )
            ) {
                // Re-use the existing p-wrapper.
                return $preceding;
            }
        }
    }

    if ( $before ) {
        $parent->insertBefore( $before, $elt );
    } else {
        $parent->appendChild( $elt );
    }
    return $elt;
}
/**
 * Run the "adoption agency algorithm" (AAA) for the given subject
 * tag name.
 * @param string $tag The subject tag name.
 * @param BalanceActiveFormattingElements $afe The current
 *   active formatting elements list.
 * @return bool True if the adoption agency algorithm "did something",
 *   false if more processing is required by the caller.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
 */
public function adoptionAgency( $tag, $afe ) {
    // Shortcut: the current node is an HTML element named $tag and is
    // not in the list of active formatting elements. Pop it and stop.
    if (
        $this->currentNode->isHtmlNamed( $tag ) &&
        !$afe->isInList( $this->currentNode )
    ) {
        $this->pop();
        return true; // no more handling required
    }

    // Outer loop: at most eight iterations.
    for ( $pass = 0; $pass < 8; $pass++ ) {
        // The formatting element is the last entry in the active
        // formatting list, after the last marker (if any), whose tag
        // name equals the token's.
        $formatting = $afe->findElementByTag( $tag );

        // No such entry: let the caller fall back to the
        // "any other end tag" handling.
        if ( !$formatting ) {
            return false;
        }

        // Entry exists but is not on the stack of open elements: parse
        // error; drop it from the list and stop.
        $formattingIdx = $this->indexOf( $formatting );
        if ( $formattingIdx < 0 ) {
            $afe->remove( $formatting );
            return true; // no more handling required
        }

        // Entry is on the stack but not in scope: parse error; ignore
        // the token.
        if ( !$this->inScope( $formatting ) ) {
            return true;
        }

        // The furthest block is the first element above the formatting
        // element on the stack that belongs to the special category.
        // There might not be one.
        $furthest = null;
        $furthestIdx = -1;
        $depth = $this->length();
        for ( $i = $formattingIdx + 1; $i < $depth; $i++ ) {
            $candidate = $this->node( $i );
            if ( $candidate->isA( BalanceSets::$specialSet ) ) {
                $furthest = $candidate;
                $furthestIdx = $i;
                break;
            }
        }

        // Without a furthest block, pop everything from the current
        // node up to and including the formatting element, and drop the
        // formatting element from the active formatting list.
        if ( !$furthest ) {
            $this->popTag( $formatting );
            $afe->remove( $formatting );
            return true;
        }

        // The common ancestor sits immediately below the formatting
        // element on the stack.
        $commonAncestor = $this->node( $formattingIdx - 1 );

        // The bookmark remembers the formatting element's position in
        // the active formatting list relative to its neighbours.
        $bookmark = new BalanceElement( '[bookmark]', '[bookmark]', [] );
        $afe->insertAfter( $formatting, $bookmark );

        // Walk down the stack from the furthest block.
        $last = $furthest;
        $cursor = $furthestIdx;

        // Inner loop
        for ( $step = 1; ; $step++ ) {
            // Move one stack slot towards the root. When the previous
            // node was removed from the stack, the elements above it
            // shifted down by one, so decrementing still lands on the
            // element that used to sit just below it.
            $node = $this->node( --$cursor );

            // Reached the formatting element: the inner loop is done.
            if ( $node === $formatting ) {
                break;
            }

            // After three iterations, nodes found in the active
            // formatting list are removed from that list.
            $listed = $afe->isInList( $node );
            if ( $step > 3 && $listed ) {
                $afe->remove( $node );
                $listed = false;
            }

            // Nodes not in the active formatting list are taken off the
            // stack of open elements, and the inner loop restarts.
            if ( !$listed ) {
                // No flattening here: parts of this node are about to
                // be relocated.
                $this->removeElement( $node, false );
                continue;
            }

            // Clone the node and substitute the clone for it both in
            // the active formatting list and on the stack.
            $clone = new BalanceElement(
                $node->namespaceURI, $node->localName, $node->attribs );
            $afe->replace( $node, $clone );
            $this->replaceAt( $cursor, $clone );

            // While the last node is still the furthest block, the
            // bookmark moves to immediately after the clone.
            if ( $last === $furthest ) {
                $afe->remove( $bookmark );
                $afe->insertAfter( $clone, $bookmark );
            }

            // Re-parent the last node under the clone, then carry on
            // with the clone as the last node.
            $clone->appendChild( $last );
            $last = $clone;
        }

        // Place whatever the last node ended up being: foster parent it
        // when the common ancestor is a table, tbody, tfoot, thead, or
        // tr element, otherwise append it to the common ancestor.
        if (
            $this->fosterParentMode &&
            $commonAncestor->isA( BalanceSets::$tableSectionRowSet )
        ) {
            $this->fosterParent( $last );
        } else {
            $commonAncestor->appendChild( $last );
        }

        // Build a replacement for the formatting element, move all of
        // the furthest block's children into it, and make it the
        // furthest block's child.
        $replacement = new BalanceElement(
            $formatting->namespaceURI, $formatting->localName, $formatting->attribs );
        $replacement->adoptChildren( $furthest );
        $furthest->appendChild( $replacement );

        // In the active formatting list, the replacement takes the
        // bookmark's position and the formatting element is removed.
        $afe->remove( $formatting );
        $afe->replace( $bookmark, $replacement );

        // On the stack of open elements, the formatting element is
        // removed and the replacement goes in immediately after the
        // furthest block.
        $this->removeElement( $formatting );
        $this->insertAfter( $furthest, $replacement );
    }

    return true;
}
/**
 * Describe the open elements stack for debugging: the local names of
 * all elements, from the root upwards, separated by single spaces.
 * @return string
 */
public function __toString() {
    $names = [];
    foreach ( $this->elements as $element ) {
        $names[] = $element->localName;
    }
    return implode( ' ', $names );
}
- }
/**
 * A pseudo-element used as a marker in the list of active formatting elements
 *
 * Markers are linked into the same doubly-linked list as the formatting
 * elements, so they carry the same link properties.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceMarker {
    /** Next (more recent) entry in the active formatting elements list */
    public $nextAFE;
    /** Previous (less recent) entry in the active formatting elements list */
    public $prevAFE;
    /**
     * Declared because BalanceActiveFormattingElements::__destruct()
     * assigns null to this property on every list entry, markers
     * included; without the declaration that assignment creates a
     * dynamic property.
     */
    public $nextNoah;
}
- /**
- * The list of active formatting elements, which is used to handle
- * mis-nested formatting element tags in the HTML5 tree builder
- * specification.
- *
- * @ingroup Parser
- * @since 1.27
- * @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements
- */
- class BalanceActiveFormattingElements {
- /** The last (most recent) element in the list */
- private $tail;
- /** The first (least recent) element in the list */
- private $head;
- /**
- * An array of arrays representing the population of elements in each bucket
- * according to the Noah's Ark clause. The outer array is stack-like, with each
- * integer-indexed element representing a segment of the list, bounded by
- * markers. The first element represents the segment of the list before the
- * first marker.
- *
- * The inner arrays are indexed by "Noah key", which is a string which uniquely
- * identifies each bucket according to the rules in the spec. The value in
- * the inner array is the first (least recently inserted) element in the bucket,
- * and subsequent members of the bucket can be found by iterating through the
- * singly-linked list via $node->nextNoah.
- *
- * This is optimised for the most common case of inserting into a bucket
- * with zero members, and deleting a bucket containing one member. In the
- * worst case, iteration through the list is still O(1) in the document
- * size, since each bucket can have at most 3 members.
- */
- private $noahTableStack = [ [] ];
/**
 * Break every link between list entries, then drop the list's own
 * references to the head, the tail and the Noah table stack.
 */
public function __destruct() {
    $node = $this->head;
    while ( $node ) {
        // Remember the successor before its link is cleared.
        $following = $node->nextAFE;
        $node->nextNoah = null;
        $node->nextAFE = null;
        $node->prevAFE = null;
        $node = $following;
    }
    $this->noahTableStack = null;
    $this->tail = null;
    $this->head = null;
}
/**
 * Append a new BalanceMarker to the end of the list and start a fresh,
 * empty Noah table for the segment that follows it.
 */
public function insertMarker() {
    $marker = new BalanceMarker;
    $previousTail = $this->tail;
    if ( $previousTail ) {
        $previousTail->nextAFE = $marker;
        $marker->prevAFE = $previousTail;
    } else {
        // Empty list: the marker is also the head.
        $this->head = $marker;
    }
    $this->tail = $marker;
    array_push( $this->noahTableStack, [] );
}
/**
 * Follow the steps required when the spec requires us to "push onto the
 * list of active formatting elements".
 *
 * The element is registered in the Noah bucket for its key in the
 * current segment. If that bucket already holds three or more elements,
 * the oldest one is removed from the list first. The element is then
 * appended to the end of the list.
 * @param BalanceElement $elt
 * @throws ParameterAssertionException If $elt is already in the list.
 */
public function push( BalanceElement $elt ) {
    // Must not be in the list already
    if ( $elt->prevAFE !== null || $this->head === $elt ) {
        throw new ParameterAssertionException( '$elt',
            'Cannot insert a node into the AFE list twice' );
    }

    // "Noah's Ark clause": at most three copies of an element may exist
    // in the current segment.
    $noahKey = $elt->getNoahKey();
    $table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
    if ( isset( $table[$noahKey] ) ) {
        $oldest = $table[$noahKey];
        $newest = $oldest;
        $population = 1;
        while ( $newest->nextNoah ) {
            $newest = $newest->nextNoah;
            $population++;
        }
        if ( $population >= 3 ) {
            $this->remove( $oldest );
        }
        $newest->nextNoah = $elt;
    } else {
        // First element of this bucket.
        $table[$noahKey] = $elt;
    }

    // Append to the main AFE list
    $previousTail = $this->tail;
    if ( $previousTail ) {
        $previousTail->nextAFE = $elt;
        $elt->prevAFE = $previousTail;
    } else {
        $this->head = $elt;
    }
    $this->tail = $elt;
}
/**
 * Follow the steps required when the spec asks us to "clear the list of
 * active formatting elements up to the last marker".
 *
 * Entries are unlinked from the tail backwards until a marker is hit or
 * the list runs out. A marker, if found, is unlinked as well and its
 * Noah table is popped; otherwise the single top-level Noah table is
 * emptied.
 */
public function clearToMarker() {
    // Walk backwards from the tail, unlinking as we go.
    $cursor = $this->tail;
    while ( $cursor && !( $cursor instanceof BalanceMarker ) ) {
        $earlier = $cursor->prevAFE;
        $cursor->prevAFE = null;
        if ( $earlier ) {
            $earlier->nextAFE = null;
        }
        $cursor->nextNoah = null;
        $cursor = $earlier;
    }

    if ( $cursor ) {
        // Stopped on a marker: unlink it and drop its Noah table.
        $earlier = $cursor->prevAFE;
        if ( $earlier ) {
            $earlier->nextAFE = null;
        }
        $cursor = $earlier;
        array_pop( $this->noahTableStack );
    } else {
        // No marker: wipe the top-level Noah table (which is the only one)
        $this->noahTableStack[0] = [];
    }

    // Nothing left means there is no head any more.
    if ( !$cursor ) {
        $this->head = null;
    }
    $this->tail = $cursor;
}
/**
 * Find and return the last element with the specified tag between the
 * end of the list and the last marker on the list.
 * Used when parsing <a> "in body mode".
 * @param string $tag Local name to look for.
 * @return BalanceElement|null The matching entry, or null if a marker
 *   or the start of the list is reached first.
 */
public function findElementByTag( $tag ) {
    for (
        $entry = $this->tail;
        $entry && !( $entry instanceof BalanceMarker );
        $entry = $entry->prevAFE
    ) {
        if ( $entry->localName === $tag ) {
            return $entry;
        }
    }
    return null;
}
/**
 * Determine whether an element is in the list of formatting elements:
 * it is either the head of the list or has a predecessor link.
 * @param BalanceElement $elt
 * @return bool
 */
public function isInList( BalanceElement $elt ) {
    if ( $this->head === $elt ) {
        return true;
    }
    return (bool)$elt->prevAFE;
}
/**
 * Unlink the element $elt from the list and from its Noah bucket.
 * Used when parsing <a> in body mode.
 *
 * @param BalanceElement $elt
 * @throws ParameterAssertionException If $elt is not in the list.
 */
public function remove( BalanceElement $elt ) {
    $before = $elt->prevAFE;
    $after = $elt->nextAFE;
    if ( $this->head !== $elt && !$before ) {
        throw new ParameterAssertionException( '$elt',
            "Attempted to remove an element which is not in the AFE list" );
    }

    // Bridge the neighbours over the removed element.
    if ( $before ) {
        $before->nextAFE = $after;
    }
    if ( $after ) {
        $after->prevAFE = $before;
    }

    // Move the head and tail pointers off the removed element.
    if ( $this->head === $elt ) {
        $this->head = $after;
    }
    if ( $this->tail === $elt ) {
        $this->tail = $before;
    }

    // Clear pointers so that isInList() etc. will work
    $elt->nextAFE = null;
    $elt->prevAFE = null;

    // Update Noah list
    $this->removeFromNoahList( $elt );
}
/**
 * Register $elt in the Noah table of the current (topmost) segment.
 * It becomes the bucket's first element if the bucket is empty, and is
 * otherwise linked after the bucket's last element.
 * @param BalanceElement $elt
 */
private function addToNoahList( BalanceElement $elt ) {
    $key = $elt->getNoahKey();
    $table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
    if ( !isset( $table[$key] ) ) {
        $table[$key] = $elt;
        return;
    }
    // Walk to the end of the bucket's singly-linked chain.
    $last = $table[$key];
    while ( $last->nextNoah ) {
        $last = $last->nextNoah;
    }
    $last->nextNoah = $elt;
}
/**
 * Unlink $elt from its bucket in the Noah table of the current
 * (topmost) segment.
 *
 * If the bucket does not exist in that table, or $elt is not part of
 * the bucket's chain, nothing is changed.
 * @param BalanceElement $elt
 */
private function removeFromNoahList( BalanceElement $elt ) {
    $table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
    $key = $elt->getNoahKey();
    if ( !isset( $table[$key] ) ) {
        // No bucket for this key in the current table. Return early
        // instead of reading an undefined array key and then a property
        // of null.
        return;
    }
    $noahElt = $table[$key];
    if ( $noahElt === $elt ) {
        // $elt heads the bucket: promote its successor, or drop the
        // bucket if there is none.
        if ( $noahElt->nextNoah ) {
            $table[$key] = $noahElt->nextNoah;
            $noahElt->nextNoah = null;
        } else {
            unset( $table[$key] );
        }
        return;
    }
    // Otherwise search the chain for the link that points at $elt.
    for ( $prev = $noahElt; $prev->nextNoah; $prev = $prev->nextNoah ) {
        if ( $prev->nextNoah === $elt ) {
            // Found it, unlink
            $prev->nextNoah = $elt->nextNoah;
            $elt->nextNoah = null;
            break;
        }
    }
}
/**
 * Put element $b in the list position currently held by element $a.
 * $a ends up fully unlinked, and the Noah table is updated for both.
 *
 * @param BalanceElement $a Element to replace; must be in the list.
 * @param BalanceElement $b Replacement element.
 * @throws ParameterAssertionException If $a is not in the list.
 */
public function replace( BalanceElement $a, BalanceElement $b ) {
    $before = $a->prevAFE;
    $after = $a->nextAFE;
    if ( $this->head !== $a && !$before ) {
        throw new ParameterAssertionException( '$a',
            "Attempted to replace an element which is not in the AFE list" );
    }

    // Redirect the head and tail pointers.
    if ( $this->head === $a ) {
        $this->head = $b;
    }
    if ( $this->tail === $a ) {
        $this->tail = $b;
    }

    // Redirect the neighbours to the replacement.
    if ( $before ) {
        $before->nextAFE = $b;
    }
    if ( $after ) {
        $after->prevAFE = $b;
    }

    // Give the replacement $a's links, then detach $a.
    $b->prevAFE = $before;
    $b->nextAFE = $after;
    $a->nextAFE = null;
    $a->prevAFE = null;

    // Update Noah list
    $this->removeFromNoahList( $a );
    $this->addToNoahList( $b );
}
- /**
-  * Link element $b into the list immediately after element $a.
-  *
-  * The tail pointer moves to $b when $a was the last entry, and $b is
-  * added to the Noah list.
-  *
-  * @param BalanceElement $a Element already in the list
-  * @param BalanceElement $b Element to insert
-  * @throws ParameterAssertionException if $a is not in the AFE list
-  */
- public function insertAfter( BalanceElement $a, BalanceElement $b ) {
-     // $a is in the list if it is the head or has a predecessor.
-     $inList = ( $this->head === $a ) || $a->prevAFE;
-     if ( !$inList ) {
-         throw new ParameterAssertionException( '$a',
-             "Attempted to insert after an element which is not in the AFE list" );
-     }
-     $follower = $a->nextAFE;
-     $b->prevAFE = $a;
-     $b->nextAFE = $follower;
-     $a->nextAFE = $b;
-     if ( $follower ) {
-         $follower->prevAFE = $b;
-     }
-     if ( $this->tail === $a ) {
-         $this->tail = $b;
-     }
-     $this->addToNoahList( $b );
- }
- /**
-  * Reconstruct the active formatting elements.
-  *
-  * Starting from the end of the list, finds the run of trailing entries
-  * that are neither markers nor present on the stack of open elements,
-  * then re-creates each of them (oldest first) on the stack and replaces
-  * the list entry with the newly created element.
-  *
-  * @param BalanceStack $stack The open elements stack
-  * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements
-  */
- public function reconstruct( $stack ) {
-     $start = $this->tail;
-     // Nothing to do when the list is empty, or when its last entry is a
-     // marker or is already an open element.
-     if (
-         !$start ||
-         $start instanceof BalanceMarker ||
-         $stack->indexOf( $start ) >= 0
-     ) {
-         return;
-     }
-     // Rewind: move $start backward while the preceding entry is neither a
-     // marker nor an open element. When the loop ends, $start is the entry
-     // just after such a boundary, or the first entry of the list.
-     while ( $start->prevAFE ) {
-         $candidate = $start->prevAFE;
-         if ( $candidate instanceof BalanceMarker || $stack->indexOf( $candidate ) >= 0 ) {
-             break;
-         }
-         $start = $candidate;
-     }
-     // Advance: re-create each entry on the stack of open elements and put
-     // the new element in the old entry's place in the list.
-     while ( $start ) {
-         $clone = $stack->insertHTMLElement( $start->localName, $start->attribs );
-         $this->replace( $start, $clone );
-         $start = $clone->nextAFE;
-     }
- }
- /**
-  * Get a string representation of the AFE list, for debugging.
-  *
-  * One line is produced per entry: "MARKER" for markers, otherwise the
-  * local name plus a short hash identifying the object, annotated with
-  * the entry's Noah sibling (if any). Inconsistent back links and a wrong
-  * tail pointer are flagged in the output.
-  *
-  * @return string
-  */
- public function __toString() {
-     $out = '';
-     $last = null;
-     $node = $this->head;
-     while ( $node ) {
-         if ( $node instanceof BalanceMarker ) {
-             $out .= "MARKER\n";
-         } else {
-             $line = $node->localName . '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 );
-             $sibling = $node->nextNoah;
-             if ( $sibling ) {
-                 $line .= ' (noah sibling: ' . $sibling->localName . '#' .
-                     substr( md5( spl_object_hash( $sibling ) ), 0, 8 ) .
-                     ')';
-             }
-             $next = $node->nextAFE;
-             if ( $next && $next->prevAFE !== $node ) {
-                 $line .= " (reverse link is wrong!)";
-             }
-             $out .= $line . "\n";
-         }
-         $last = $node;
-         $node = $node->nextAFE;
-     }
-     // The last entry reached by following nextAFE should be the tail.
-     if ( $last !== $this->tail ) {
-         $out .= "(tail pointer is wrong!)\n";
-     }
-     return $out;
- }
- }
- /**
- * An implementation of the tree building portion of the HTML5 parsing
- * spec.
- *
- * This is used to balance and tidy output so that the result can
- * always be cleanly serialized/deserialized by an HTML5 parser. It
- * does *not* guarantee "conforming" output -- the HTML5 spec contains
- * a number of constraints which are not enforced by the HTML5 parsing
- * process. But the result will be free of gross errors: misnested or
- * unclosed tags, for example, and will be unchanged by spec-compliant
- * parsing followed by serialization.
- *
- * The tree building stage is structured as a state machine.
- * When comparing the implementation to
- * https://www.w3.org/TR/html5/syntax.html#tree-construction
- * note that each state is implemented as a function with a
- * name ending in `Mode` (because the HTML spec refers to them
- * as insertion modes). The current insertion mode is held by
- * the $parseMode property.
- *
- * The following simplifications have been made:
- * - We handle body content only (ie, we start `in body`.)
- * - The document is never in "quirks mode".
- * - All occurrences of < and > have been entity escaped, so we
- * can parse tags by simply splitting on those two characters.
- * (This also simplifies the handling of < inside <textarea>.)
- * The character < must not appear inside comments.
- * Similarly, all attributes have been "cleaned" and are double-quoted
- * and escaped.
- * - All null characters are assumed to have been removed.
- * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
- * <frame>, <plaintext>, <xmp>, <iframe>,
- * <noembed>, <noscript>, <script>, <title>. As a result,
- * further simplifications can be made:
- * - `frameset-ok` is not tracked.
- * - `head element pointer` is not tracked (but presumed non-null)
- * - Tokenizer has only a single mode. (<textarea> wants RCDATA and
- * <style>/<noframes> want RAWTEXT modes which we only loosely emulate.)
- *
- * We generally mark places where we omit cases from the spec due to
- * disallowed elements with a comment: `// OMITTED: <element-name>`.
- *
- * The HTML spec keeps a flag during the parsing process to track
- * whether or not a "parse error" has been encountered. We don't
- * bother to track that flag, we just implement the error-handling
- * process as specified.
- *
- * @ingroup Parser
- * @since 1.27
- * @see https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
- */
- class Balancer {
- /** @var string Name of the method implementing the current insertion mode */
- private $parseMode;
- /** @var \Iterator Pieces of the input text, split on '<' */
- private $bitsIterator;
- /** @var array|null Allowed tag names as array keys, or null for no sanitization */
- private $allowedHtmlElements;
- /** @var BalanceActiveFormattingElements */
- private $afe;
- /** @var BalanceStack */
- private $stack;
- /** @var bool Value of the 'strict' configuration option */
- private $strict;
- /** @var bool Value of the 'allowComments' configuration option */
- private $allowComments;
- /** @var array Configuration merged with defaults; also passed to BalanceStack */
- private $config;
- /** @var bool Reset to false at the start of each balance() run */
- private $textIntegrationMode;
- /** NOTE(review): presumably text buffered by the table modes -- confirm */
- private $pendingTableText;
- /** @var string Insertion mode to return to once inTextMode is done */
- private $originalInsertionMode;
- /** @var BalanceElement|null Context element for fragment parsing */
- private $fragmentContext;
- /** @var BalanceElement|null The currently open <form> element, if any */
- private $formElementPointer;
- /** @var bool Whether a linefeed starting the next text token is dropped */
- private $ignoreLinefeed;
- /** @var string|bool Tag name whose RCDATA content is being read, or false */
- private $inRCDATA;
- /** @var string|bool Tag name whose RAWTEXT content is being read, or false */
- private $inRAWTEXT;
- /**
-  * Stack of insertion mode names: one is pushed when a <template> start
-  * tag is processed and popped again on the matching end tag.
-  * @var string[]
-  */
- private $templateInsertionModes = [];
- /** @var callable|null */
- private $processingCallback;
- /** @var array */
- private $processingArgs;
- /**
- * Valid HTML5 comments.
- * Regex borrowed from Tim Starling's "remex-html" project.
- *
- * The pattern is applied to one piece of the input as split on '<', so
- * it starts with the "!--" that follows the '<' of a comment opener.
- * Capture groups, as numbered inside the pattern:
- * 1. Everything after "!--" up to and including the comment close
- * 2. The comment contents
- * 3. The comment close
- * 4. Set only for closes which are valid solely at the end of input;
- * callers must verify that no further input follows in that case
- * 5. Non-tag text following the comment
- */
- const VALID_COMMENT_REGEX = "~ !--
- ( # 1. Comment match detector
- > | -> | # Invalid short close
- ( # 2. Comment contents
- (?:
- (?! --> )
- (?! --!> )
- (?! --! \z )
- (?! -- \z )
- (?! - \z )
- .
- )*+
- )
- ( # 3. Comment close
- --> | # Normal close
- --!> | # Comment end bang
- ( # 4. Indicate matches requiring EOF
- --! | # EOF in comment end bang state
- -- | # EOF in comment end state
- - | # EOF in comment end dash state
- (?#nothing) # EOF in comment state
- )
- )
- )
- ([^<]*) \z # 5. Non-tag text after the comment
- ~xs";
- /**
-  * Create a new Balancer.
-  * @param array $config Balancer configuration. Includes:
-  *     'strict' : boolean, defaults to false.
-  *         When true, enforces syntactic constraints on input:
-  *         all non-tag '<' must be escaped, all attributes must be
-  *         separated by a single space and double-quoted. This is
-  *         consistent with the output of the Sanitizer.
-  *     'allowedHtmlElements' : array, defaults to null.
-  *         When present, the keys of this associative array give
-  *         the acceptable HTML tag names. When not present, no
-  *         tag sanitization is done.
-  *     'tidyCompat' : boolean, defaults to false.
-  *         When true, the serialization algorithm is tweaked to
-  *         provide historical compatibility with the old "tidy"
-  *         program: <p>-wrapping is done to the children of
-  *         <body> and <blockquote> elements, and empty elements
-  *         are removed. The <pre>/<listing>/<textarea> serialization
-  *         is also tweaked to allow lossless round trips.
-  *         (See: https://github.com/whatwg/html/issues/944)
-  *     'allowComments': boolean, defaults to true.
-  *         When true, allows HTML comments in the input.
-  *         The Sanitizer generally strips all comments, so if you
-  *         are running on sanitized output you can set this to
-  *         false to get a bit more performance.
-  * @throws ParameterAssertionException if 'allowedHtmlElements' names an
-  *     element which the balancer does not support
-  */
- public function __construct( array $config = [] ) {
-     // Caller-supplied options win; missing ones get their defaults.
-     $this->config = $config = $config + [
-         'strict' => false,
-         'allowedHtmlElements' => null,
-         'tidyCompat' => false,
-         'allowComments' => true,
-     ];
-     $this->allowedHtmlElements = $config['allowedHtmlElements'];
-     $this->strict = $config['strict'];
-     $this->allowComments = $config['allowComments'];
-     if ( $this->allowedHtmlElements === null ) {
-         return;
-     }
-     // Sanity check: none of the allowed tag names may be one of the
-     // unsupported elements. Only the keys (tag names) matter here.
-     $bad = array_intersect_key(
-         $this->allowedHtmlElements,
-         BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE]
-     );
-     if ( count( $bad ) > 0 ) {
-         $badstr = implode( ',', array_keys( $bad ) );
-         throw new ParameterAssertionException(
-             '$config',
-             'Balance attempted with sanitization including ' .
-             "unsupported elements: {$badstr}"
-         );
-     }
- }
- /**
-  * Return a balanced HTML string for the HTML fragment given by $text,
-  * subject to the caveats listed in the class description. The result
-  * will typically be idempotent -- that is, rebalancing the output
-  * would result in no change.
-  *
-  * The input is split on '<'; the first piece is plain text and every
-  * following piece is handed to advance() to be tokenized. Any '>' found
-  * in text is emitted as the entity `&gt;`.
-  *
-  * @param string $text The markup to be balanced
-  * @param callable|null $processingCallback Callback to do any variable or
-  *     parameter replacements in HTML attributes values
-  * @param array|bool $processingArgs Arguments for the processing callback
-  * @return string The balanced markup
-  */
- public function balance( $text, $processingCallback = null, $processingArgs = [] ) {
-     $this->parseMode = 'inBodyMode';
-     $this->bitsIterator = new ExplodeIterator( '<', $text );
-     $this->afe = new BalanceActiveFormattingElements();
-     $this->stack = new BalanceStack( $this->config );
-     $this->processingCallback = $processingCallback;
-     $this->processingArgs = $processingArgs;
-     $this->textIntegrationMode =
-         $this->ignoreLinefeed =
-         $this->inRCDATA =
-         $this->inRAWTEXT = false;
-     // Start every run with an empty template insertion mode stack, so
-     // that nothing left over from an earlier run can leak into this one.
-     $this->templateInsertionModes = [];
-     // The stack is constructed with an <html> element already on it.
-     // Set this up as a fragment parsed with <body> as the context.
-     $this->fragmentContext =
-         new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
-     $this->resetInsertionMode();
-     // The form element pointer is the nearest <form> at or above the
-     // context element, if there is one.
-     $this->formElementPointer = null;
-     for ( $e = $this->fragmentContext; $e !== null; $e = $e->parent ) {
-         if ( $e->isHtmlNamed( 'form' ) ) {
-             $this->formElementPointer = $e;
-             break;
-         }
-     }
-     // First element is text not tag
-     $x = $this->bitsIterator->current();
-     $this->bitsIterator->next();
-     $this->insertToken( 'text', str_replace( '>', '&gt;', $x ) );
-     // Now process each tag.
-     while ( $this->bitsIterator->valid() ) {
-         $this->advance();
-     }
-     $this->insertToken( 'eof', null );
-     $result = $this->stack->getOutput();
-     // Free memory before returning.
-     $this->bitsIterator = null;
-     $this->afe = null;
-     $this->stack = null;
-     $this->fragmentContext = null;
-     $this->formElementPointer = null;
-     return $result;
- }
- /**
-  * Pass a token to the tree builder.
-  *
-  * Unsupported tags are rejected, empty text is dropped, and a pending
-  * "ignore initial linefeed" request is applied. The token is then
-  * dispatched either to the foreign content handler or to the method
-  * implementing the current insertion mode.
-  *
-  * @param string $token One of 'tag', 'endtag', 'text', 'comment' or 'eof'
-  * @param string|null $value Tag name, text or comment contents
-  * @param array|null $attribs Attributes, for tag tokens
-  * @param bool $selfClose Whether a tag token was self-closing
-  * @return bool False if an unsupported tag was rejected; true if the
-  *     token was dropped here; otherwise whatever the handler returned
-  */
- private function insertToken( $token, $value, $attribs = null, $selfClose = false ) {
-     $isTagToken = ( $token === 'tag' || $token === 'endtag' );
-     if ( $isTagToken ) {
-         // validate tags against $unsupportedSet
-         if ( isset( BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE][$value] ) ) {
-             // As described in "simplifications" above, these tags are
-             // not supported in the balancer.
-             Assert::invariant(
-                 !$this->strict,
-                 "Unsupported $token <$value> found."
-             );
-             return false;
-         }
-     } elseif ( $token === 'text' && $value === '' ) {
-         // Don't actually inject the empty string as a text token.
-         return true;
-     }
-     // Support pre/listing/textarea by suppressing initial linefeed.
-     // The request only ever applies to the very next token.
-     if ( $this->ignoreLinefeed ) {
-         $this->ignoreLinefeed = false;
-         if ( $token === 'text' && $value[0] === "\n" ) {
-             if ( $value === "\n" ) {
-                 // Nothing would be left, don't inject the empty string.
-                 return true;
-             }
-             $value = substr( $value, 1 );
-         }
-     }
-     // Some hoops we have to jump through
-     $adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );
-     // The spec calls this the "tree construction dispatcher": decide
-     // whether the token is processed by the HTML insertion mode.
-     if (
-         $this->stack->length() === 0 ||
-         $adjusted->isHtml() ||
-         $token === 'eof'
-     ) {
-         $useHtmlRules = true;
-     } elseif ( $adjusted->isMathmlTextIntegrationPoint() ) {
-         $useHtmlRules = $token === 'text' || (
-             $token === 'tag' &&
-             $value !== 'mglyph' && $value !== 'malignmark'
-         );
-     } elseif (
-         $adjusted->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
-         $adjusted->localName === 'annotation-xml' &&
-         $token === 'tag' && $value === 'svg'
-     ) {
-         $useHtmlRules = true;
-     } else {
-         $useHtmlRules = $adjusted->isHtmlIntegrationPoint() &&
-             ( $token === 'tag' || $token === 'text' );
-     }
-     if ( !$useHtmlRules ) {
-         return $this->insertForeignToken( $token, $value, $attribs, $selfClose );
-     }
-     // The insertion mode is the name of the method which handles it.
-     $handler = $this->parseMode;
-     return $this->$handler( $token, $value, $attribs, $selfClose );
- }
- /**
-  * Process a token according to the rules for foreign (non-HTML) content.
-  *
-  * Text and comments are inserted directly. Certain HTML start tags break
-  * out of foreign content: unless a fragment context is set, elements are
-  * popped until an integration point or HTML element is current, and the
-  * token is then reprocessed. Any other start tag is inserted in the
-  * namespace of the adjusted current node. End tags close the nearest
-  * matching foreign element, or are handed to the current insertion mode
-  * once an HTML element is reached while walking down the stack.
-  *
-  * @param string $token Token type
-  * @param string|null $value Tag name, text or comment contents
-  * @param array|null $attribs Attributes, for tag tokens
-  * @param bool $selfClose Whether a tag token was self-closing
-  * @return bool|null Result of processing the token; nothing is returned
-  *     for token types not handled here
-  * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
-  */
- private function insertForeignToken( $token, $value, $attribs = null, $selfClose = false ) {
-     if ( $token === 'text' ) {
-         $this->stack->insertText( $value );
-         return true;
-     } elseif ( $token === 'comment' ) {
-         $this->stack->insertComment( $value );
-         return true;
-     } elseif ( $token === 'tag' ) {
-         switch ( $value ) {
-             case 'font':
-                 // Per the spec, <font> only breaks out of foreign content
-                 // when it carries a color, face or size attribute; a plain
-                 // <font> is handled as "any other start tag" below.
-                 if ( !isset( $attribs['color'] )
-                     && !isset( $attribs['face'] )
-                     && !isset( $attribs['size'] )
-                 ) {
-                     break;
-                 }
-                 // otherwise, fall through
-             case 'b':
-             case 'big':
-             case 'blockquote':
-             case 'body':
-             case 'br':
-             case 'center':
-             case 'code':
-             case 'dd':
-             case 'div':
-             case 'dl':
-             case 'dt':
-             case 'em':
-             case 'embed':
-             case 'h1':
-             case 'h2':
-             case 'h3':
-             case 'h4':
-             case 'h5':
-             case 'h6':
-             case 'head':
-             case 'hr':
-             case 'i':
-             case 'img':
-             case 'li':
-             case 'listing':
-             case 'menu':
-             case 'meta':
-             case 'nobr':
-             case 'ol':
-             case 'p':
-             case 'pre':
-             case 'ruby':
-             case 's':
-             case 'small':
-             case 'span':
-             case 'strong':
-             case 'strike':
-             case 'sub':
-             case 'sup':
-             case 'table':
-             case 'tt':
-             case 'u':
-             case 'ul':
-             case 'var':
-                 if ( $this->fragmentContext ) {
-                     // Fragment case: treat as "any other start tag".
-                     break;
-                 }
-                 // Pop until we are back in HTML content (or at an
-                 // integration point), then reprocess the token there.
-                 while ( true ) {
-                     $this->stack->pop();
-                     $node = $this->stack->currentNode;
-                     if (
-                         $node->isMathmlTextIntegrationPoint() ||
-                         $node->isHtmlIntegrationPoint() ||
-                         $node->isHtml()
-                     ) {
-                         break;
-                     }
-                 }
-                 return $this->insertToken( $token, $value, $attribs, $selfClose );
-         }
-         // "Any other start tag"
-         $adjusted = ( $this->fragmentContext && $this->stack->length() === 1 ) ?
-             $this->fragmentContext : $this->stack->currentNode;
-         $this->stack->insertForeignElement(
-             $adjusted->namespaceURI, $value, $attribs
-         );
-         if ( $selfClose ) {
-             $this->stack->pop();
-         }
-         return true;
-     } elseif ( $token === 'endtag' ) {
-         $first = true;
-         foreach ( $this->stack as $i => $node ) {
-             if ( $node->isHtml() && !$first ) {
-                 // process the end tag as HTML
-                 $func = $this->parseMode;
-                 return $this->$func( $token, $value, $attribs, $selfClose );
-             } elseif ( $i === 0 ) {
-                 // Reached the entry with key 0 without a match: ignore.
-                 return true;
-             } elseif ( $node->localName === $value ) {
-                 $this->stack->popTag( $node );
-                 return true;
-             }
-             $first = false;
-         }
-     }
- }
- /**
-  * Grab the next "token" from $bitsIterator. This is either a open/close
-  * tag or text or a comment, depending on whether the Sanitizer approves.
-  *
-  * The piece taken from the iterator is the text which followed one '<'
-  * in the input. If it is a valid comment, or a tag which passes all the
-  * checks, the corresponding token is inserted, followed by a text token
-  * for whatever trails it. Otherwise the entire piece is inserted as
-  * text: verbatim while emulating RAWTEXT, entity-escaped in all other
-  * cases.
-  */
- private function advance() {
-     $x = $this->bitsIterator->current();
-     $this->bitsIterator->next();
-     $regs = [];
-     // Handle comments. These won't be generated by mediawiki (they
-     // are stripped in the Sanitizer) but may be generated by extensions.
-     if (
-         $this->allowComments &&
-         !( $this->inRCDATA || $this->inRAWTEXT ) &&
-         preg_match( self::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
-         // verify EOF condition where necessary
-         ( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
-     ) {
-         $contents = $regs[2][0];
-         $rest = $regs[5][0];
-         $this->insertToken( 'comment', $contents );
-         $this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
-         return;
-     }
-     // $slash: Does the current element start with a '/'?
-     // $t: Current element name
-     // $attribStr: String between element name and >
-     // $brace: Ending '>' or '/>'
-     // $rest: Everything until the next element from the $bitsIterator
-     if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $x, $regs ) ) {
-         list( /* $qbar */, $slash, $t, $attribStr, $brace, $rest ) = $regs;
-         $t = strtolower( $t );
-         if ( $this->strict ) {
-             // Verify that attributes are all properly double-quoted
-             Assert::invariant(
-                 preg_match(
-                     '/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
-                 ),
-                 "Bad attribute string found"
-             );
-         }
-     } else {
-         Assert::invariant(
-             !$this->strict, "< found which does not start a valid tag"
-         );
-         $slash = $t = $attribStr = $brace = $rest = null;
-     }
-     $goodTag = $t;
-     if ( $this->inRCDATA ) {
-         if ( $slash && $t === $this->inRCDATA ) {
-             $this->inRCDATA = false;
-         } else {
-             // No tags allowed; this emulates the "rcdata" tokenizer mode.
-             $goodTag = false;
-         }
-     }
-     if ( $this->inRAWTEXT ) {
-         if ( $slash && $t === $this->inRAWTEXT ) {
-             $this->inRAWTEXT = false;
-         } else {
-             // No tags allowed, no entity-escaping done.
-             $goodTag = false;
-         }
-     }
-     $sanitize = $this->allowedHtmlElements !== null;
-     if ( $sanitize ) {
-         // Narrow, never widen: a tag already rejected by the RCDATA or
-         // RAWTEXT emulation above must stay rejected even if its name
-         // is in the allowed list.
-         $goodTag = $goodTag && isset( $this->allowedHtmlElements[$t] );
-     }
-     if ( $goodTag ) {
-         if ( is_callable( $this->processingCallback ) ) {
-             // The callback receives the attribute string by reference.
-             call_user_func_array( $this->processingCallback, [ &$attribStr, $this->processingArgs ] );
-         }
-         if ( $sanitize ) {
-             $goodTag = Sanitizer::validateTag( $attribStr, $t );
-         }
-     }
-     if ( $goodTag ) {
-         $attribs = Sanitizer::decodeTagAttributes( $attribStr );
-         if ( $sanitize ) {
-             $attribs = Sanitizer::validateTagAttributes( $attribs, $t );
-         }
-         // The tree builder may still reject the tag (unsupported element).
-         $goodTag = $this->insertToken(
-             $slash ? 'endtag' : 'tag', $t, $attribs, $brace === '/>'
-         );
-     }
-     if ( $goodTag ) {
-         $this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
-     } elseif ( $this->inRAWTEXT ) {
-         // Raw text is passed through unchanged, restoring the '<' that
-         // the input was split on.
-         $this->insertToken( 'text', "<$x" );
-     } else {
-         // bad tag; serialize entire thing as text.
-         $this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $x ) );
-     }
- }
- /**
-  * Change the current insertion mode.
-  *
-  * @param string $mode Name of the new mode; must end in "Mode"
-  * @return string Name of the mode which was current before the switch
-  */
- private function switchMode( $mode ) {
-     $hasModeSuffix = substr( $mode, -4 ) === 'Mode';
-     Assert::parameter( $hasModeSuffix, '$mode', 'should end in Mode' );
-     $previous = $this->parseMode;
-     $this->parseMode = $mode;
-     return $previous;
- }
- /**
-  * Change the insertion mode, then feed the given token through the
-  * tree builder again so that it is handled under the new mode.
-  *
-  * @param string $mode Name of the new mode; must end in "Mode"
-  * @param string $token Token type
-  * @param string|null $value Token value
-  * @param array|null $attribs Attributes, for tag tokens
-  * @param bool $selfClose Whether a tag token was self-closing
-  * @return bool Result of reprocessing the token
-  */
- private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfClose ) {
-     // The previous mode returned by switchMode() is not needed here.
-     $this->switchMode( $mode );
-     $handled = $this->insertToken( $token, $value, $attribs, $selfClose );
-     return $handled;
- }
- /**
-  * Choose the insertion mode which matches the current state of the stack
-  * of open elements.
-  *
-  * The stack is walked in iteration order until a node decides the mode.
-  * The entry with key 0 is treated as the last one, and is replaced by
-  * the fragment context element when one is set. If nothing else matched
-  * by then, the mode becomes `inBodyMode`.
-  */
- private function resetInsertionMode() {
-     // Tag names which map straight onto an insertion mode.
-     $directModes = [
-         'tr' => 'inRowMode',
-         'tbody' => 'inTableBodyMode',
-         'tfoot' => 'inTableBodyMode',
-         'thead' => 'inTableBodyMode',
-         'caption' => 'inCaptionMode',
-         'colgroup' => 'inColumnGroupMode',
-         'table' => 'inTableMode',
-         'body' => 'inBodyMode',
-         // OMITTED: <frameset>
-         // OMITTED: <html>
-         // OMITTED: <head>
-     ];
-     $last = false;
-     foreach ( $this->stack as $i => $node ) {
-         if ( $i === 0 ) {
-             $last = true;
-             if ( $this->fragmentContext ) {
-                 $node = $this->fragmentContext;
-             }
-         }
-         if ( $node->isHtml() ) {
-             $name = $node->localName;
-             if ( $name === 'select' ) {
-                 // A <select> with a <table> ancestor (and no <template>
-                 // in between) uses the "in select in table" mode.
-                 // NOTE(review): the index arithmetic below depends on how
-                 // BalanceStack numbers its entries -- confirm that it
-                 // really visits the ancestors of this node.
-                 $stackLength = $this->stack->length();
-                 for ( $j = $i + 1; $j < $stackLength - 1; $j++ ) {
-                     $ancestor = $this->stack->node( $stackLength - $j - 1 );
-                     if ( $ancestor->isHtmlNamed( 'template' ) ) {
-                         break;
-                     }
-                     if ( $ancestor->isHtmlNamed( 'table' ) ) {
-                         $this->switchMode( 'inSelectInTableMode' );
-                         return;
-                     }
-                 }
-                 $this->switchMode( 'inSelectMode' );
-                 return;
-             }
-             if ( $name === 'template' ) {
-                 // Use the most recently pushed template insertion mode.
-                 $this->switchMode(
-                     array_slice( $this->templateInsertionModes, -1 )[0]
-                 );
-                 return;
-             }
-             if ( isset( $directModes[$name] ) ) {
-                 $this->switchMode( $directModes[$name] );
-                 return;
-             }
-             // OMITTED: <head>
-             if ( !$last && $node->isA( BalanceSets::$tableCellSet ) ) {
-                 $this->switchMode( 'inCellMode' );
-                 return;
-             }
-         }
-         if ( $last ) {
-             $this->switchMode( 'inBodyMode' );
-             return;
-         }
-     }
- }
- /**
-  * Finish parsing.
-  *
-  * Most of the spec's "stop parsing" steps are inapplicable here, other
-  * than step 2: "pop all the nodes off the stack of open elements". The
-  * top-most <html> element is kept on the stack, though.
-  */
- private function stopParsing() {
-     // Clear the AFE list first, otherwise the element objects will stay
-     // live during serialization, potentially using O(N^2) memory. This
-     // is safe because popping the stack will never result in
-     // reconstructing the active formatting elements.
-     $this->afe = null;
-     // Unwind the stack down to a single remaining entry.
-     $this->stack->popTo( 1 );
- }
- /**
-  * Insert an element whose content is raw text, and switch to
-  * `inTextMode` until that content has been consumed.
-  *
-  * @param string $value Tag name of the element
-  * @param array|null $attribs Attributes of the element
-  * @return bool Always true
-  */
- private function parseRawText( $value, $attribs = null ) {
-     $this->stack->insertHTMLElement( $value, $attribs );
-     // Remember which end tag terminates the raw text.
-     $this->inRAWTEXT = $value;
-     // Remember the mode to go back to once the text has been read.
-     $previousMode = $this->switchMode( 'inTextMode' );
-     $this->originalInsertionMode = $previousMode;
-     return true;
- }
- /**
-  * Insertion mode used while reading the content of a raw text element.
-  *
-  * Text is inserted as-is. An end tag pops the current element and
-  * restores the original insertion mode. End of input does the same and
-  * then reprocesses the token under the restored mode. Every other token
-  * is ignored.
-  *
-  * @param string $token Token type
-  * @param string|null $value Token value
-  * @param array|null $attribs Attributes, for tag tokens
-  * @param bool $selfClose Whether a tag token was self-closing
-  * @return bool
-  */
- private function inTextMode( $token, $value, $attribs = null, $selfClose = false ) {
-     switch ( $token ) {
-         case 'text':
-             $this->stack->insertText( $value );
-             return true;
-         case 'eof':
-             $this->stack->pop();
-             return $this->switchModeAndReprocess(
-                 $this->originalInsertionMode, $token, $value, $attribs, $selfClose
-             );
-         case 'endtag':
-             $this->stack->pop();
-             $this->switchMode( $this->originalInsertionMode );
-             return true;
-         default:
-             return true;
-     }
- }
- /**
-  * The "in head" insertion mode.
-  *
-  * Handles leading whitespace, the void metadata elements, raw text
-  * elements, <template> start and end tags, and comments. Anything not
-  * handled is preceded by a synthetic </head> end tag and then passed
-  * through the tree builder again.
-  *
-  * @param string $token Token type
-  * @param string|null $value Token value
-  * @param array|null $attribs Attributes, for tag tokens
-  * @param bool $selfClose Whether a tag token was self-closing
-  * @return bool
-  */
- private function inHeadMode( $token, $value, $attribs = null, $selfClose = false ) {
-     switch ( $token ) {
-         case 'text':
-             // Leading whitespace is inserted here; the rest (if any) is
-             // handled at the bottom of the function.
-             if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) {
-                 $this->stack->insertText( $matches[0] );
-                 $value = substr( $value, strlen( $matches[0] ) );
-             }
-             if ( strlen( $value ) === 0 ) {
-                 return true; // All text handled.
-             }
-             break;
-         case 'tag':
-             switch ( $value ) {
-                 case 'meta':
-                     // OMITTED: in a full HTML parser, this might change the encoding.
-                     // falls through
-                 // OMITTED: <html>
-                 case 'base':
-                 case 'basefont':
-                 case 'bgsound':
-                 case 'link':
-                     // Insert and immediately close the element.
-                     $this->stack->insertHTMLElement( $value, $attribs );
-                     $this->stack->pop();
-                     return true;
-                 // OMITTED: <title>
-                 // OMITTED: <noscript>
-                 case 'noframes':
-                 case 'style':
-                     return $this->parseRawText( $value, $attribs );
-                 // OMITTED: <script>
-                 case 'template':
-                     $this->stack->insertHTMLElement( $value, $attribs );
-                     $this->afe->insertMarker();
-                     // OMITTED: frameset_ok
-                     $this->switchMode( 'inTemplateMode' );
-                     $this->templateInsertionModes[] = $this->parseMode;
-                     return true;
-                 // OMITTED: <head>
-             }
-             break;
-         case 'endtag':
-             switch ( $value ) {
-                 // OMITTED: <head>
-                 // OMITTED: <body>
-                 // OMITTED: <html>
-                 case 'br':
-                     break; // handle at the bottom of the function
-                 case 'template':
-                     if ( $this->stack->indexOf( $value ) < 0 ) {
-                         return true; // Ignore the token.
-                     }
-                     $this->stack->generateImpliedEndTags( null, true /* thorough */ );
-                     $this->stack->popTag( $value );
-                     $this->afe->clearToMarker();
-                     array_pop( $this->templateInsertionModes );
-                     $this->resetInsertionMode();
-                     return true;
-                 default:
-                     // ignore any other end tag
-                     return true;
-             }
-             break;
-         case 'comment':
-             $this->stack->insertComment( $value );
-             return true;
-     }
-     // If not handled above
-     $this->inHeadMode( 'endtag', 'head' ); // synthetic </head>
-     // Then redo this one
-     return $this->insertToken( $token, $value, $attribs, $selfClose );
- }
- private function inBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
- if ( $token === 'text' ) {
- $this->afe->reconstruct( $this->stack );
- $this->stack->insertText( $value );
- return true;
- } elseif ( $token === 'eof' ) {
- if ( !empty( $this->templateInsertionModes ) ) {
- return $this->inTemplateMode( $token, $value, $attribs, $selfClose );
- }
- $this->stopParsing();
- return true;
- } elseif ( $token === 'tag' ) {
- switch ( $value ) {
- // OMITTED: <html>
- case 'base':
- case 'basefont':
- case 'bgsound':
- case 'link':
- case 'meta':
- case 'noframes':
- // OMITTED: <script>
- case 'style':
- case 'template':
- // OMITTED: <title>
- return $this->inHeadMode( $token, $value, $attribs, $selfClose );
- // OMITTED: <body>
- // OMITTED: <frameset>
- case 'address':
- case 'article':
- case 'aside':
- case 'blockquote':
- case 'center':
- case 'details':
- case 'dialog':
- case 'dir':
- case 'div':
- case 'dl':
- case 'fieldset':
- case 'figcaption':
- case 'figure':
- case 'footer':
- case 'header':
- case 'hgroup':
- case 'main':
- case 'nav':
- case 'ol':
- case 'p':
- case 'section':
- case 'summary':
- case 'ul':
- if ( $this->stack->inButtonScope( 'p' ) ) {
- $this->inBodyMode( 'endtag', 'p' );
- }
- $this->stack->insertHTMLElement( $value, $attribs );
- return true;
- case 'menu':
- if ( $this->stack->inButtonScope( "p" ) ) {
- $this->inBodyMode( 'endtag', 'p' );
- }
- if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
- $this->stack->pop();
- }
- $this->stack->insertHTMLElement( $value, $attribs );
- return true;
- case 'h1':
- case 'h2':
- case 'h3':
- case 'h4':
- case 'h5':
- case 'h6':
- if ( $this->stack->inButtonScope( 'p' ) ) {
- $this->inBodyMode( 'endtag', 'p' );
- }
- if ( $this->stack->currentNode->isA( BalanceSets::$headingSet ) ) {
- $this->stack->pop();
- }
- $this->stack->insertHTMLElement( $value, $attribs );
- return true;
- case 'pre':
- case 'listing':
- if ( $this->stack->inButtonScope( 'p' ) ) {
- $this->inBodyMode( 'endtag', 'p' );
- }
- $this->stack->insertHTMLElement( $value, $attribs );
- $this->ignoreLinefeed = true;
- // OMITTED: frameset_ok
- return true;
- case 'form':
- if (
- $this->formElementPointer &&
- $this->stack->indexOf( 'template' ) < 0
- ) {
- return true; // in a form, not in a template.
- }
- if ( $this->stack->inButtonScope( "p" ) ) {
- $this->inBodyMode( 'endtag', 'p' );
- }
- $elt = $this->stack->insertHTMLElement( $value, $attribs );
- if ( $this->stack->indexOf( 'template' ) < 0 ) {
- $this->formElementPointer = $elt;
- }
- return true;
- case 'li':
- // OMITTED: frameset_ok
- foreach ( $this->stack as $node ) {
- if ( $node->isHtmlNamed( 'li' ) ) {
- $this->inBodyMode( 'endtag', 'li' );
- break;
- }
- if (
- $node->isA( BalanceSets::$specialSet ) &&
- !$node->isA( BalanceSets::$addressDivPSet )
- ) {
- break;
- }
- }
- if ( $this->stack->inButtonScope( 'p' ) ) {
- $this->inBodyMode( 'endtag', 'p' );
- }
- $this->stack->insertHTMLElement( $value, $attribs );
- return true;
- case 'dd':
- case 'dt':
- // OMITTED: frameset_ok
- foreach ( $this->stack as $node ) {
- if ( $node->isHtmlNamed( 'dd' ) ) {
- $this->inBodyMode( 'endtag', 'dd' );
- break;
- }
- if ( $node->isHtmlNamed( 'dt' ) ) {
- $this->inBodyMode( 'endtag', 'dt' );
- break;
- }
- if (
- $node->isA( BalanceSets::$specialSet ) &&
- !$node->isA( BalanceSets::$addressDivPSet )
- ) {
- break;
- }
- }
- if ( $this->stack->inButtonScope( 'p' ) ) {
- $this->inBodyMode( 'endtag', 'p' );
- }
- $this->stack->insertHTMLElement( $value, $attribs );
- return true;
- // OMITTED: <plaintext>
- case 'button':
- if ( $this->stack->inScope( 'button' ) ) {
- $this->inBodyMode( 'endtag', 'button' );
- return $this->insertToken( $token, $value, $attribs, $selfClose );
- }
- $this->afe->reconstruct( $this->stack );
- $this->stack->insertHTMLElement( $value, $attribs );
- return true;
- case 'a':
- $activeElement = $this->afe->findElementByTag( 'a' );
- if ( $activeElement ) {
- $this->inBodyMode( 'endtag', 'a' );
- if ( $this->afe->isInList( $activeElement ) ) {
- $this->afe->remove( $activeElement );
- // Don't flatten here, since when we fall
- // through below we might foster parent
- // the new <a> tag inside this one.
- $this->stack->removeElement( $activeElement, false );
- }
- }
- // Falls through
- case 'b':
- case 'big':
- case 'code':
- case 'em':
- case 'font':
- case 'i':
- case 's':
- case 'small':
- case 'strike':
- case 'strong':
- case 'tt':
- case 'u':
- $this->afe->reconstruct( $this->stack );
- $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) );
- return true;
- case 'nobr':
- $this->afe->reconstruct( $this->stack );
- if ( $this->stack->inScope( 'nobr' ) ) {
- $this->inBodyMode( 'endtag', 'nobr' );
- $this->afe->reconstruct( $this->stack );
- }
- $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) );
- return true;
- case 'applet':
- case 'marquee':
- case 'object':
- $this->afe->reconstruct( $this->stack );
- $this->stack->insertHTMLElement( $value, $attribs );
- $this->afe->insertMarker();
- // OMITTED: frameset_ok
- return true;
- case 'table':
- // The document is never in "quirks mode"; see simplifications
- // above.
- if ( $this->stack->inButtonScope( 'p' ) ) {
- $this->inBodyMode( 'endtag', 'p' );
- }
- $this->stack->insertHTMLElement( $value, $attribs );
- // OMITTED: frameset_ok
- $this->switchMode( 'inTableMode' );
- return true;
- case 'area':
- case 'br':
- case 'embed':
- case 'img':
- case 'keygen':
- case 'wbr':
- $this->afe->reconstruct( $this->stack );
- $this->stack->insertHTMLElement( $value, $attribs );
- $this->stack->pop();
- // OMITTED: frameset_ok
- return true;
- case 'input':
- $this->afe->reconstruct( $this->stack );
- $this->stack->insertHTMLElement( $value, $attribs );
- $this->stack->pop();
- // OMITTED: frameset_ok
- // (hence we don't need to examine the tag's "type" attribute)
- return true;
- case 'param':
- case 'source':
- case 'track':
- $this->stack->insertHTMLElement( $value, $attribs );
- $this->stack->pop();
- return true;
- case 'hr':
- if ( $this->stack->inButtonScope( 'p' ) ) {
- $this->inBodyMode( 'endtag', 'p' );
- }
- if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
- $this->stack->pop();
- }
- $this->stack->insertHTMLElement( $value, $attribs );
- $this->stack->pop();
- return true;
- case 'image':
- // warts!
- return $this->inBodyMode( $token, 'img', $attribs, $selfClose );
- case 'textarea':
- $this->stack->insertHTMLElement( $value, $attribs );
- $this->ignoreLinefeed = true;
- $this->inRCDATA = $value; // emulate rcdata tokenizer mode
- // OMITTED: frameset_ok
- return true;
- // OMITTED: <xmp>
- // OMITTED: <iframe>
- // OMITTED: <noembed>
- // OMITTED: <noscript>
- case 'select':
- $this->afe->reconstruct( $this->stack );
- $this->stack->insertHTMLElement( $value, $attribs );
- switch ( $this->parseMode ) {
- case 'inTableMode':
- case 'inCaptionMode':
- case 'inTableBodyMode':
- case 'inRowMode':
- case 'inCellMode':
- $this->switchMode( 'inSelectInTableMode' );
- return true;
- default:
- $this->switchMode( 'inSelectMode' );
- return true;
- }
- case 'optgroup':
- case 'option':
- if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
- $this->inBodyMode( 'endtag', 'option' );
- }
- $this->afe->reconstruct( $this->stack );
- $this->stack->insertHTMLElement( $value, $attribs );
- return true;
- case 'menuitem':
- if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
- $this->stack->pop();
- }
- $this->afe->reconstruct( $this->stack );
- $this->stack->insertHTMLElement( $value, $attribs );
- return true;
- case 'rb':
- case 'rtc':
- if ( $this->stack->inScope( 'ruby' ) ) {
- $this->stack->generateImpliedEndTags();
- }
- $this->stack->insertHTMLElement( $value, $attribs );
- return true;
- case 'rp':
- case 'rt':
- if ( $this->stack->inScope( 'ruby' ) ) {
- $this->stack->generateImpliedEndTags( 'rtc' );
- }
- $this->stack->insertHTMLElement( $value, $attribs );
- return true;
- case 'math':
- $this->afe->reconstruct( $this->stack );
- // We skip the spec's "adjust MathML attributes" and
- // "adjust foreign attributes" steps, since the browser will
- // do this later when it parses the output and it doesn't affect
- // balancing.
- $this->stack->insertForeignElement(
- BalanceSets::MATHML_NAMESPACE, $value, $attribs
- );
- if ( $selfClose ) {
- // emit explicit </math> tag.
- $this->stack->pop();
- }
- return true;
- case 'svg':
- $this->afe->reconstruct( $this->stack );
- // We skip the spec's "adjust SVG attributes" and
- // "adjust foreign attributes" steps, since the browser will
- // do this later when it parses the output and it doesn't affect
- // balancing.
- $this->stack->insertForeignElement(
- BalanceSets::SVG_NAMESPACE, $value, $attribs
- );
- if ( $selfClose ) {
- // emit explicit </svg> tag.
- $this->stack->pop();
- }
- return true;
- case 'caption':
- case 'col':
- case 'colgroup':
- // OMITTED: <frame>
- case 'head':
- case 'tbody':
- case 'td':
- case 'tfoot':
- case 'th':
- case 'thead':
- case 'tr':
- // Ignore table tags if we're not inTableMode
- return true;
- }
- // Handle any other start tag here
- $this->afe->reconstruct( $this->stack );
- $this->stack->insertHTMLElement( $value, $attribs );
- return true;
- } elseif ( $token === 'endtag' ) {
- switch ( $value ) {
- // </body>,</html> are unsupported.
- case 'template':
- return $this->inHeadMode( $token, $value, $attribs, $selfClose );
- case 'address':
- case 'article':
- case 'aside':
- case 'blockquote':
- case 'button':
- case 'center':
- case 'details':
- case 'dialog':
- case 'dir':
- case 'div':
- case 'dl':
- case 'fieldset':
- case 'figcaption':
- case 'figure':
- case 'footer':
- case 'header':
- case 'hgroup':
- case 'listing':
- case 'main':
- case 'menu':
- case 'nav':
- case 'ol':
- case 'pre':
- case 'section':
- case 'summary':
- case 'ul':
- // Ignore if there is not a matching open tag
- if ( !$this->stack->inScope( $value ) ) {
- return true;
- }
- $this->stack->generateImpliedEndTags();
- $this->stack->popTag( $value );
- return true;
- case 'form':
- if ( $this->stack->indexOf( 'template' ) < 0 ) {
- $openform = $this->formElementPointer;
- $this->formElementPointer = null;
- if ( !$openform || !$this->stack->inScope( $openform ) ) {
- return true;
- }
- $this->stack->generateImpliedEndTags();
- // Don't flatten yet if we're removing a <form> element
- // out-of-order. (eg. `<form><div></form>`)
- $flatten = ( $this->stack->currentNode === $openform );
- $this->stack->removeElement( $openform, $flatten );
- } else {
- if ( !$this->stack->inScope( 'form' ) ) {
- return true;
- }
- $this->stack->generateImpliedEndTags();
- $this->stack->popTag( 'form' );
- }
- return true;
- case 'p':
- if ( !$this->stack->inButtonScope( 'p' ) ) {
- $this->inBodyMode( 'tag', 'p', [] );
- return $this->insertToken( $token, $value, $attribs, $selfClose );
- }
- $this->stack->generateImpliedEndTags( $value );
- $this->stack->popTag( $value );
- return true;
- case 'li':
- if ( !$this->stack->inListItemScope( $value ) ) {
- return true; // ignore
- }
- $this->stack->generateImpliedEndTags( $value );
- $this->stack->popTag( $value );
- return true;
- case 'dd':
- case 'dt':
- if ( !$this->stack->inScope( $value ) ) {
- return true; // ignore
- }
- $this->stack->generateImpliedEndTags( $value );
- $this->stack->popTag( $value );
- return true;
- case 'h1':
- case 'h2':
- case 'h3':
- case 'h4':
- case 'h5':
- case 'h6':
- if ( !$this->stack->inScope( BalanceSets::$headingSet ) ) {
- return true; // ignore
- }
- $this->stack->generateImpliedEndTags();
- $this->stack->popTag( BalanceSets::$headingSet );
- return true;
- case 'sarcasm':
- // Take a deep breath, then:
- break;
- case 'a':
- case 'b':
- case 'big':
- case 'code':
- case 'em':
- case 'font':
- case 'i':
- case 'nobr':
- case 's':
- case 'small':
- case 'strike':
- case 'strong':
- case 'tt':
- case 'u':
- if ( $this->stack->adoptionAgency( $value, $this->afe ) ) {
- return true; // If we did something, we're done.
- }
- break; // Go to the "any other end tag" case.
- case 'applet':
- case 'marquee':
- case 'object':
- if ( !$this->stack->inScope( $value ) ) {
- return true; // ignore
- }
- $this->stack->generateImpliedEndTags();
- $this->stack->popTag( $value );
- $this->afe->clearToMarker();
- return true;
- case 'br':
- // Turn </br> into <br>
- return $this->inBodyMode( 'tag', $value, [] );
- }
- // Any other end tag goes here
- foreach ( $this->stack as $i => $node ) {
- if ( $node->isHtmlNamed( $value ) ) {
- $this->stack->generateImpliedEndTags( $value );
- $this->stack->popTo( $i ); // including $i
- break;
- } elseif ( $node->isA( BalanceSets::$specialSet ) ) {
- return true; // ignore this close token.
- }
- }
- return true;
- } elseif ( $token === 'comment' ) {
- $this->stack->insertComment( $value );
- return true;
- } else {
- Assert::invariant( false, "Bad token type: $token" );
- }
- }
/**
 * Handle one token while in the "in table" insertion mode.
 *
 * Table-structure tags are dealt with here.  Every token that is not
 * handled by one of the explicit cases is handed to inBodyMode() with
 * foster parenting switched on for the duration of that call.
 *
 * @param string $token One of 'text', 'eof', 'tag', 'endtag', 'comment'
 * @param string $value Tag name, or the text/comment payload
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Self-closing flag, passed along when reprocessing
 * @return bool True when handled here, otherwise the result of the
 *   handler the token was delegated to
 */
private function inTableMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		if ( $this->textIntegrationMode ) {
			return $this->inBodyMode( $token, $value, $attribs, $selfClose );
		} elseif ( $this->stack->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) {
			// Start buffering character data; inTableTextMode decides
			// what to do with it once a non-text token shows up.
			$this->pendingTableText = '';
			$this->originalInsertionMode = $this->parseMode;
			return $this->switchModeAndReprocess( 'inTableTextMode',
				$token, $value, $attribs, $selfClose );
		}
		// fall through to default case.
	} elseif ( $token === 'eof' ) {
		$this->stopParsing();
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'caption':
			// Pop back to the table context before opening the caption,
			// exactly as the colgroup and tbody/tfoot/thead cases below
			// do and as the spec's "in table" rules require.  Without
			// this the caption would be inserted under whatever element
			// happens to be the current node.
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->afe->insertMarker();
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inCaptionMode' );
			return true;
		case 'colgroup':
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inColumnGroupMode' );
			return true;
		case 'col':
			// Synthesize the missing <colgroup>, then reprocess <col>.
			$this->inTableMode( 'tag', 'colgroup', [] );
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		case 'tbody':
		case 'tfoot':
		case 'thead':
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inTableBodyMode' );
			return true;
		case 'td':
		case 'th':
		case 'tr':
			// Synthesize the missing <tbody>, then reprocess the token.
			$this->inTableMode( 'tag', 'tbody', [] );
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		case 'table':
			if ( !$this->stack->inTableScope( $value ) ) {
				return true; // Ignore this tag.
			}
			// A nested <table> start tag closes the open table first.
			$this->inTableMode( 'endtag', $value );
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		case 'style':
		// OMITTED: <script>
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		case 'input':
			if ( !isset( $attribs['type'] ) || strcasecmp( $attribs['type'], 'hidden' ) !== 0 ) {
				break; // Handle this as "everything else"
			}
			// <input type=hidden> is inserted in place and closed at once.
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;
		case 'form':
			if (
				$this->formElementPointer ||
				$this->stack->indexOf( 'template' ) >= 0
			) {
				return true; // ignore this token
			}
			// The form is recorded as the form element pointer and then
			// immediately taken off the stack again.
			$this->formElementPointer =
				$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->popTag( $this->formElementPointer );
			return true;
		}
		// Fall through for "anything else" clause.
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'table':
			if ( !$this->stack->inTableScope( $value ) ) {
				return true; // Ignore.
			}
			$this->stack->popTag( $value );
			$this->resetInsertionMode();
			return true;
		// OMITTED: <body>
		case 'caption':
		case 'col':
		case 'colgroup':
		// OMITTED: <html>
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			return true; // Ignore the token.
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
		// Fall through for "anything else" clause.
	} elseif ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	// This is the "anything else" case: process the token with the
	// in-body rules while foster parenting is enabled.
	$this->stack->fosterParentMode = true;
	$this->inBodyMode( $token, $value, $attribs, $selfClose );
	$this->stack->fosterParentMode = false;
	return true;
}
/**
 * Handle one token while in the "in table text" insertion mode.
 *
 * Text tokens are only accumulated.  The first token of any other type
 * flushes the accumulated text, restores the insertion mode that was
 * active before, and is then reprocessed in that mode.
 *
 * @param string $token Token type
 * @param string $value Text payload or tag name
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Self-closing flag, passed along when reprocessing
 * @return bool True for buffered text, otherwise the result of
 *   reprocessing the token in the restored mode
 */
private function inTableTextMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		// Keep collecting; nothing is emitted yet.
		$this->pendingTableText .= $value;
		return true;
	}

	// Any other token: take the buffer and reset it before flushing.
	$buffered = $this->pendingTableText;
	$this->pendingTableText = '';

	$onlyWhitespace = !preg_match( '/[^\x09\x0A\x0C\x0D\x20]/', $buffered );
	if ( $onlyWhitespace ) {
		// Whitespace-only text can stay where it is.
		$this->stack->insertText( $buffered );
	} else {
		// Mirrors the "anything else" handling of inTableMode.
		$this->stack->fosterParentMode = true;
		$this->inBodyMode( 'text', $buffered );
		$this->stack->fosterParentMode = false;
	}

	return $this->switchModeAndReprocess(
		$this->originalInsertionMode, $token, $value, $attribs, $selfClose
	);
}
/**
 * Helper for inCaptionMode: close the open <caption>, if there is one
 * in table scope, and go back to inTableMode.
 *
 * @return bool True if a caption was closed, false if there was none
 */
private function endCaption() {
	if ( $this->stack->inTableScope( 'caption' ) ) {
		$this->stack->generateImpliedEndTags();
		$this->stack->popTag( 'caption' );
		$this->afe->clearToMarker();
		$this->switchMode( 'inTableMode' );
		return true;
	}
	// No caption to close.
	return false;
}
/**
 * Handle one token while in the "in caption" insertion mode.
 *
 * Table-structure tags close the caption (and are then reprocessed);
 * everything else is treated with the in-body rules.
 *
 * @param string $token Token type
 * @param string $value Tag name, or the text/comment payload
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Self-closing flag, passed along when reprocessing
 * @return bool True when handled here, otherwise the result of inBodyMode()
 */
private function inCaptionMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'tag' ) {
		// Start tags that implicitly end the caption.
		$captionEnders = [
			'caption', 'col', 'colgroup', 'tbody', 'td',
			'tfoot', 'th', 'thead', 'tr',
		];
		if ( in_array( $value, $captionEnders, true ) ) {
			if ( $this->endCaption() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'caption' ) {
			$this->endCaption();
			return true;
		}
		if ( $value === 'table' ) {
			// Close the caption, then let the table handle </table>.
			if ( $this->endCaption() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
		// OMITTED: <html>
		$ignoredEndTags = [
			'body', 'col', 'colgroup', 'tbody', 'td',
			'tfoot', 'th', 'thead', 'tr',
		];
		if ( in_array( $value, $ignoredEndTags, true ) ) {
			// Ignore the token
			return true;
		}
	}

	// The Anything Else case
	return $this->inBodyMode( $token, $value, $attribs, $selfClose );
}
/**
 * Handle one token while in the "in column group" insertion mode.
 *
 * @param string $token Token type
 * @param string $value Tag name, or the text/comment payload
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Self-closing flag, passed along when reprocessing
 * @return bool True when handled here, otherwise the result of the
 *   handler the token was delegated to
 */
private function inColumnGroupMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	if ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	}

	if ( $token === 'text' ) {
		// Leading whitespace is kept in place; whatever follows it is
		// treated as "anything else" below.
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) {
			$leading = $matches[0];
			$this->stack->insertText( $leading );
			$value = substr( $value, strlen( $leading ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // All text handled.
		}
	} elseif ( $token === 'tag' ) {
		// OMITTED: <html>
		if ( $value === 'col' ) {
			// <col> is inserted and closed right away.
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;
		}
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		case 'col':
			return true; // Ignore the token.
		case 'colgroup':
			if ( $this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
				$this->stack->pop();
				$this->switchMode( 'inTableMode' );
			}
			// Otherwise the token is ignored.
			return true;
		}
	}

	// Anything else: close the colgroup (if it is the current node) and
	// reprocess the token; otherwise ignore it.
	if ( $this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
		$this->inColumnGroupMode( 'endtag', 'colgroup' );
		return $this->insertToken( $token, $value, $attribs, $selfClose );
	}
	return true;
}
/**
 * Helper for inTableBodyMode: close the open table section
 * (<tbody>, <thead> or <tfoot>) and go back to inTableMode.
 *
 * @return bool True if a section was closed, false if none of the three
 *   section elements is in table scope
 */
private function endSection() {
	$sectionInScope = false;
	foreach ( [ 'tbody', 'thead', 'tfoot' ] as $sectionName ) {
		if ( $this->stack->inTableScope( $sectionName ) ) {
			$sectionInScope = true;
			break;
		}
	}
	if ( !$sectionInScope ) {
		return false;
	}

	// Pop back to the section element, then pop the section itself.
	$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
	$this->stack->pop();
	$this->switchMode( 'inTableMode' );
	return true;
}
/**
 * Handle one token while in the "in table body" insertion mode.
 *
 * @param string $token Token type
 * @param string $value Tag name, or the text/comment payload
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Self-closing flag, passed along when reprocessing
 * @return bool True when handled here, otherwise the result of inTableMode()
 */
private function inTableBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
	$isStartTag = ( $token === 'tag' );
	$isEndTag = ( $token === 'endtag' );

	if ( $isStartTag && $value === 'tr' ) {
		$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
		$this->stack->insertHTMLElement( $value, $attribs );
		$this->switchMode( 'inRowMode' );
		return true;
	}

	if ( $isStartTag && ( $value === 'th' || $value === 'td' ) ) {
		// Synthesize the missing <tr>, then reprocess the cell tag.
		$this->inTableBodyMode( 'tag', 'tr', [] );
		$this->insertToken( $token, $value, $attribs, $selfClose );
		return true;
	}

	// Start tags that end the current section, plus </table>: close the
	// section and reprocess, or ignore the token if no section is open.
	$sectionEnders = [ 'caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead' ];
	$endsSection = ( $isStartTag && in_array( $value, $sectionEnders, true ) )
		|| ( $isEndTag && $value === 'table' );
	if ( $endsSection ) {
		if ( $this->endSection() ) {
			$this->insertToken( $token, $value, $attribs, $selfClose );
		}
		return true;
	}

	if ( $isEndTag ) {
		switch ( $value ) {
		case 'tbody':
		case 'tfoot':
		case 'thead':
			// Only meaningful if that very section is in table scope.
			if ( $this->stack->inTableScope( $value ) ) {
				$this->endSection();
			}
			return true;
		// OMITTED: <body>
		// OMITTED: <html>
		case 'caption':
		case 'col':
		case 'colgroup':
		case 'td':
		case 'th':
		case 'tr':
			return true; // Ignore the token.
		}
	}

	// Anything else:
	return $this->inTableMode( $token, $value, $attribs, $selfClose );
}
/**
 * Helper for inRowMode: close the open <tr>, if there is one in table
 * scope, and go back to inTableBodyMode.
 *
 * @return bool True if a row was closed, false if there was none
 */
private function endRow() {
	if ( $this->stack->inTableScope( 'tr' ) ) {
		// Pop back to the row element, then pop the row itself.
		$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
		$this->stack->pop();
		$this->switchMode( 'inTableBodyMode' );
		return true;
	}
	return false;
}
/**
 * Handle one token while in the "in row" insertion mode.
 *
 * @param string $token Token type
 * @param string $value Tag name, or the text/comment payload
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Self-closing flag, passed along when reprocessing
 * @return bool True when handled here, otherwise the result of inTableMode()
 */
private function inRowMode( $token, $value, $attribs = null, $selfClose = false ) {
	$isStartTag = ( $token === 'tag' );
	$isEndTag = ( $token === 'endtag' );

	if ( $isStartTag && ( $value === 'th' || $value === 'td' ) ) {
		// Open a new cell and fence off the active formatting elements.
		$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
		$this->stack->insertHTMLElement( $value, $attribs );
		$this->switchMode( 'inCellMode' );
		$this->afe->insertMarker();
		return true;
	}

	// Start tags that end the row, plus </table>: close the row and
	// reprocess, or ignore the token if no row is open.
	$rowEnders = [ 'caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr' ];
	$endsRow = ( $isStartTag && in_array( $value, $rowEnders, true ) )
		|| ( $isEndTag && $value === 'table' );
	if ( $endsRow ) {
		if ( $this->endRow() ) {
			$this->insertToken( $token, $value, $attribs, $selfClose );
		}
		return true;
	}

	if ( $isEndTag ) {
		switch ( $value ) {
		case 'tr':
			$this->endRow();
			return true;
		case 'tbody':
		case 'tfoot':
		case 'thead':
			// Needs both the named section and a row in table scope.
			if ( $this->stack->inTableScope( $value ) ) {
				if ( $this->endRow() ) {
					$this->insertToken( $token, $value, $attribs, $selfClose );
				}
			}
			return true;
		// OMITTED: <body>
		// OMITTED: <html>
		case 'caption':
		case 'col':
		case 'colgroup':
		case 'td':
		case 'th':
			return true; // Ignore the token.
		}
	}

	// Anything else:
	return $this->inTableMode( $token, $value, $attribs, $selfClose );
}
/**
 * Helper for inCellMode: close the open table cell by feeding the
 * matching end tag to inCellMode().  <td> is tried before <th>.
 *
 * @return bool True if a cell end tag was issued, false if neither
 *   <td> nor <th> is in table scope
 */
private function endCell() {
	foreach ( [ 'td', 'th' ] as $cellName ) {
		if ( $this->stack->inTableScope( $cellName ) ) {
			$this->inCellMode( 'endtag', $cellName );
			return true;
		}
	}
	return false;
}
/**
 * Handle one token while in the "in cell" insertion mode.
 *
 * @param string $token Token type
 * @param string $value Tag name, or the text/comment payload
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Self-closing flag, passed along when reprocessing
 * @return bool True when handled here, otherwise the result of inBodyMode()
 */
private function inCellMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'tag' ) {
		// Table-structure start tags close the cell and are reprocessed.
		$cellEnders = [
			'caption', 'col', 'colgroup', 'tbody', 'td',
			'tfoot', 'th', 'thead', 'tr',
		];
		if ( in_array( $value, $cellEnders, true ) ) {
			if ( $this->endCell() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'td' || $value === 'th' ) {
			if ( !$this->stack->inTableScope( $value ) ) {
				return true; // No such cell is open: ignore.
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( $value );
			$this->afe->clearToMarker();
			$this->switchMode( 'inRowMode' );
			return true;
		}

		// OMITTED: <body>
		// OMITTED: <html>
		if ( in_array( $value, [ 'caption', 'col', 'colgroup' ], true ) ) {
			return true;
		}

		$outerTableTags = [ 'table', 'tbody', 'tfoot', 'thead', 'tr' ];
		if ( in_array( $value, $outerTableTags, true ) ) {
			if ( !$this->stack->inTableScope( $value ) ) {
				return true; // Nothing of that name to close: ignore.
			}
			// Close whichever cell is open, then reprocess the end tag.
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( BalanceSets::$tableCellSet );
			$this->afe->clearToMarker();
			$this->switchMode( 'inRowMode' );
			$this->insertToken( $token, $value, $attribs, $selfClose );
			return true;
		}
	}

	// Anything else:
	return $this->inBodyMode( $token, $value, $attribs, $selfClose );
}
/**
 * Handle one token while in the "in select" insertion mode.
 *
 * Tokens without an explicit rule here are silently ignored.
 *
 * @param string $token Token type
 * @param string $value Tag name, or the text/comment payload
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Self-closing flag, passed along when reprocessing
 * @return bool True when handled or ignored here, otherwise the result
 *   of the handler the token was delegated to
 */
private function inSelectMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	}
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	if ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	}

	if ( $token === 'tag' ) {
		switch ( $value ) {
		// OMITTED: <html>
		case 'option':
		case 'optgroup':
			// An open <option> is always closed first; a new <optgroup>
			// additionally closes an open <optgroup>.
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->stack->pop();
			}
			if (
				$value === 'optgroup' &&
				$this->stack->currentNode->isHtmlNamed( 'optgroup' )
			) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;
		case 'select':
			// A nested <select> start tag is treated like </select>.
			$this->inSelectMode( 'endtag', $value );
			return true;
		case 'input':
		case 'keygen':
		case 'textarea':
			if ( $this->stack->inSelectScope( 'select' ) ) {
				// Close the select, then reprocess the token.
				$this->inSelectMode( 'endtag', 'select' );
				return $this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true; // ignore token (fragment case)
		case 'script':
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		case 'select':
			if ( $this->stack->inSelectScope( $value ) ) {
				$this->stack->popTag( $value );
				$this->resetInsertionMode();
			}
			// else: fragment case, nothing to do.
			return true;
		case 'option':
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->stack->pop();
			}
			return true;
		case 'optgroup':
			// An <option> directly inside an <optgroup> is closed first.
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				if ( $this->stack->length() >= 2 ) {
					$parentIndex = $this->stack->length() - 2;
					if ( $this->stack->node( $parentIndex )->isHtmlNamed( 'optgroup' ) ) {
						$this->stack->pop();
					}
				}
			}
			if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) {
				$this->stack->pop();
			}
			return true;
		}
	}

	// anything else: just ignore the token
	return true;
}
/**
 * Handle one token while in the "in select in table" insertion mode.
 *
 * Table-structure tags close the open <select> and are then
 * reprocessed; every other token is handled by inSelectMode().
 *
 * @param string $token Token type
 * @param string $value Tag name, or the text/comment payload
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Self-closing flag, passed along when reprocessing
 * @return bool True when ignored here, otherwise the result of the
 *   handler the token was delegated to
 */
private function inSelectInTableMode( $token, $value, $attribs = null, $selfClose = false ) {
	$tableTags = [
		'caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th',
	];
	$isTagToken = ( $token === 'tag' || $token === 'endtag' );

	if ( $isTagToken && in_array( $value, $tableTags, true ) ) {
		// An end tag only counts if the element it names is actually
		// open in table scope; otherwise it is ignored.
		if ( $token === 'endtag' && !$this->stack->inTableScope( $value ) ) {
			return true;
		}
		// Close the select, then reprocess the token.
		$this->inSelectInTableMode( 'endtag', 'select' );
		return $this->insertToken( $token, $value, $attribs, $selfClose );
	}

	// anything else
	return $this->inSelectMode( $token, $value, $attribs, $selfClose );
}
/**
 * Handle one token while in the "in template" insertion mode.
 *
 * Start tags pick the insertion mode that suits them and are reprocessed
 * there; end tags other than </template> are ignored.
 *
 * NOTE(review): the mode switches below do not visibly touch
 * $this->templateInsertionModes; confirm whether switchModeAndReprocess()
 * is expected to keep that stack in sync.
 *
 * @param string $token Token type
 * @param string $value Tag name, or the text/comment payload
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfClose Self-closing flag, passed along when reprocessing
 * @return bool|null True when handled here, otherwise the result of the
 *   handler the token was delegated to
 */
private function inTemplateMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' || $token === 'comment' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	}

	if ( $token === 'eof' ) {
		if ( $this->stack->indexOf( 'template' ) >= 0 ) {
			// Unclosed template: close it, drop its insertion mode and
			// let the eof be handled again in the resulting mode.
			$this->stack->popTag( 'template' );
			$this->afe->clearToMarker();
			array_pop( $this->templateInsertionModes );
			$this->resetInsertionMode();
			$this->insertToken( $token, $value, $attribs, $selfClose );
		} else {
			$this->stopParsing();
		}
		return true;
	}

	if ( $token === 'tag' ) {
		// OMITTED: <script>
		// OMITTED: <title>
		$headTags = [
			'base', 'basefont', 'bgsound', 'link', 'meta',
			'noframes', 'style', 'template',
		];
		if ( in_array( $value, $headTags, true ) ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}

		// Table-related start tags select a table mode; anything else
		// is processed with the in-body rules.
		$modeForTag = [
			'caption' => 'inTableMode',
			'colgroup' => 'inTableMode',
			'tbody' => 'inTableMode',
			'tfoot' => 'inTableMode',
			'thead' => 'inTableMode',
			'col' => 'inColumnGroupMode',
			'tr' => 'inTableBodyMode',
			'td' => 'inRowMode',
			'th' => 'inRowMode',
		];
		$nextMode = isset( $modeForTag[$value] ) ? $modeForTag[$value] : 'inBodyMode';
		return $this->switchModeAndReprocess(
			$nextMode, $token, $value, $attribs, $selfClose
		);
	}

	if ( $token === 'endtag' ) {
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
		return true; // Every other end tag is ignored.
	}

	Assert::invariant( false, "Bad token type: $token" );
}
- }
|