YarrInterpreter.cpp 81 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960
  1. /*
  2. * Copyright (C) 2009 Apple Inc. All rights reserved.
  3. * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. *
  14. * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  15. * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  17. * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
  18. * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  19. * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  20. * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  21. * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  22. * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include "config.h"
  27. #include "YarrInterpreter.h"
  28. #include "Yarr.h"
  29. #include "YarrCanonicalizeUCS2.h"
  30. #include <wtf/BumpPointerAllocator.h>
  31. #include <wtf/DataLog.h>
  32. #include <wtf/text/CString.h>
  33. #include <wtf/text/WTFString.h>
  34. #ifndef NDEBUG
  35. #include <stdio.h>
  36. #endif
  37. using namespace WTF;
  38. namespace JSC { namespace Yarr {
  39. template<typename CharType>
  40. class Interpreter {
  41. public:
  42. struct ParenthesesDisjunctionContext;
  43. struct BackTrackInfoPatternCharacter {
  44. uintptr_t matchAmount;
  45. };
  46. struct BackTrackInfoCharacterClass {
  47. uintptr_t matchAmount;
  48. };
  49. struct BackTrackInfoBackReference {
  50. uintptr_t begin; // Not really needed for greedy quantifiers.
  51. uintptr_t matchAmount; // Not really needed for fixed quantifiers.
  52. };
  53. struct BackTrackInfoAlternative {
  54. uintptr_t offset;
  55. };
  56. struct BackTrackInfoParentheticalAssertion {
  57. uintptr_t begin;
  58. };
  59. struct BackTrackInfoParenthesesOnce {
  60. uintptr_t begin;
  61. };
  62. struct BackTrackInfoParenthesesTerminal {
  63. uintptr_t begin;
  64. };
  65. struct BackTrackInfoParentheses {
  66. uintptr_t matchAmount;
  67. ParenthesesDisjunctionContext* lastContext;
  68. };
  69. static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
  70. {
  71. context->next = backTrack->lastContext;
  72. backTrack->lastContext = context;
  73. ++backTrack->matchAmount;
  74. }
  75. static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
  76. {
  77. RELEASE_ASSERT(backTrack->matchAmount);
  78. RELEASE_ASSERT(backTrack->lastContext);
  79. backTrack->lastContext = backTrack->lastContext->next;
  80. --backTrack->matchAmount;
  81. }
  82. struct DisjunctionContext
  83. {
  84. DisjunctionContext()
  85. : term(0)
  86. {
  87. }
  88. void* operator new(size_t, void* where)
  89. {
  90. return where;
  91. }
  92. int term;
  93. unsigned matchBegin;
  94. unsigned matchEnd;
  95. uintptr_t frame[1];
  96. };
  97. DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
  98. {
  99. size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
  100. allocatorPool = allocatorPool->ensureCapacity(size);
  101. RELEASE_ASSERT(allocatorPool);
  102. return new (allocatorPool->alloc(size)) DisjunctionContext();
  103. }
  104. void freeDisjunctionContext(DisjunctionContext* context)
  105. {
  106. allocatorPool = allocatorPool->dealloc(context);
  107. }
  108. struct ParenthesesDisjunctionContext
  109. {
  110. ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term)
  111. : next(0)
  112. {
  113. unsigned firstSubpatternId = term.atom.subpatternId;
  114. unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
  115. for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
  116. subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
  117. output[(firstSubpatternId << 1) + i] = offsetNoMatch;
  118. }
  119. new (getDisjunctionContext(term)) DisjunctionContext();
  120. }
  121. void* operator new(size_t, void* where)
  122. {
  123. return where;
  124. }
  125. void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
  126. {
  127. for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
  128. output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
  129. }
  130. DisjunctionContext* getDisjunctionContext(ByteTerm& term)
  131. {
  132. return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
  133. }
  134. ParenthesesDisjunctionContext* next;
  135. unsigned subpatternBackup[1];
  136. };
  137. ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term)
  138. {
  139. size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(unsigned) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
  140. allocatorPool = allocatorPool->ensureCapacity(size);
  141. RELEASE_ASSERT(allocatorPool);
  142. return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
  143. }
  144. void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
  145. {
  146. allocatorPool = allocatorPool->dealloc(context);
  147. }
  148. class InputStream {
  149. public:
  150. InputStream(const CharType* input, unsigned start, unsigned length)
  151. : input(input)
  152. , pos(start)
  153. , length(length)
  154. {
  155. }
  156. void next()
  157. {
  158. ++pos;
  159. }
  160. void rewind(unsigned amount)
  161. {
  162. ASSERT(pos >= amount);
  163. pos -= amount;
  164. }
  165. int read()
  166. {
  167. ASSERT(pos < length);
  168. if (pos < length)
  169. return input[pos];
  170. return -1;
  171. }
  172. int readPair()
  173. {
  174. ASSERT(pos + 1 < length);
  175. return input[pos] | input[pos + 1] << 16;
  176. }
  177. int readChecked(unsigned negativePositionOffest)
  178. {
  179. RELEASE_ASSERT(pos >= negativePositionOffest);
  180. unsigned p = pos - negativePositionOffest;
  181. ASSERT(p < length);
  182. return input[p];
  183. }
  184. int reread(unsigned from)
  185. {
  186. ASSERT(from < length);
  187. return input[from];
  188. }
  189. int prev()
  190. {
  191. ASSERT(!(pos > length));
  192. if (pos && length)
  193. return input[pos - 1];
  194. return -1;
  195. }
  196. unsigned getPos()
  197. {
  198. return pos;
  199. }
  200. void setPos(unsigned p)
  201. {
  202. pos = p;
  203. }
  204. bool atStart()
  205. {
  206. return pos == 0;
  207. }
  208. bool atEnd()
  209. {
  210. return pos == length;
  211. }
  212. unsigned end()
  213. {
  214. return length;
  215. }
  216. bool checkInput(unsigned count)
  217. {
  218. if (((pos + count) <= length) && ((pos + count) >= pos)) {
  219. pos += count;
  220. return true;
  221. }
  222. return false;
  223. }
  224. void uncheckInput(unsigned count)
  225. {
  226. RELEASE_ASSERT(pos >= count);
  227. pos -= count;
  228. }
  229. bool atStart(unsigned negativePositionOffest)
  230. {
  231. return pos == negativePositionOffest;
  232. }
  233. bool atEnd(unsigned negativePositionOffest)
  234. {
  235. RELEASE_ASSERT(pos >= negativePositionOffest);
  236. return (pos - negativePositionOffest) == length;
  237. }
  238. bool isAvailableInput(unsigned offset)
  239. {
  240. return (((pos + offset) <= length) && ((pos + offset) >= pos));
  241. }
  242. private:
  243. const CharType* input;
  244. unsigned pos;
  245. unsigned length;
  246. };
  247. bool testCharacterClass(CharacterClass* characterClass, int ch)
  248. {
  249. if (ch & 0xFF80) {
  250. for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
  251. if (ch == characterClass->m_matchesUnicode[i])
  252. return true;
  253. for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
  254. if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end))
  255. return true;
  256. } else {
  257. for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
  258. if (ch == characterClass->m_matches[i])
  259. return true;
  260. for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
  261. if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end))
  262. return true;
  263. }
  264. return false;
  265. }
  266. bool checkCharacter(int testChar, unsigned negativeInputOffset)
  267. {
  268. return testChar == input.readChecked(negativeInputOffset);
  269. }
  270. bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset)
  271. {
  272. int ch = input.readChecked(negativeInputOffset);
  273. return (loChar == ch) || (hiChar == ch);
  274. }
  275. bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset)
  276. {
  277. bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset));
  278. return invert ? !match : match;
  279. }
  280. bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset)
  281. {
  282. unsigned matchSize = (unsigned)(matchEnd - matchBegin);
  283. if (!input.checkInput(matchSize))
  284. return false;
  285. if (pattern->m_ignoreCase) {
  286. for (unsigned i = 0; i < matchSize; ++i) {
  287. int oldCh = input.reread(matchBegin + i);
  288. int ch = input.readChecked(negativeInputOffset + matchSize - i);
  289. if (oldCh == ch)
  290. continue;
  291. // The definition for canonicalize (see ES 5.1, 15.10.2.8) means that
  292. // unicode values are never allowed to match against ascii ones.
  293. if (isASCII(oldCh) || isASCII(ch)) {
  294. if (toASCIIUpper(oldCh) == toASCIIUpper(ch))
  295. continue;
  296. } else if (areCanonicallyEquivalent(oldCh, ch))
  297. continue;
  298. input.uncheckInput(matchSize);
  299. return false;
  300. }
  301. } else {
  302. for (unsigned i = 0; i < matchSize; ++i) {
  303. if (!checkCharacter(input.reread(matchBegin + i), negativeInputOffset + matchSize - i)) {
  304. input.uncheckInput(matchSize);
  305. return false;
  306. }
  307. }
  308. }
  309. return true;
  310. }
  311. bool matchAssertionBOL(ByteTerm& term)
  312. {
  313. return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1)));
  314. }
  315. bool matchAssertionEOL(ByteTerm& term)
  316. {
  317. if (term.inputPosition)
  318. return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
  319. return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read()));
  320. }
  321. bool matchAssertionWordBoundary(ByteTerm& term)
  322. {
  323. bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition + 1));
  324. bool readIsWordchar;
  325. if (term.inputPosition)
  326. readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
  327. else
  328. readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
  329. bool wordBoundary = prevIsWordchar != readIsWordchar;
  330. return term.invert() ? !wordBoundary : wordBoundary;
  331. }
  332. bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
  333. {
  334. BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
  335. switch (term.atom.quantityType) {
  336. case QuantifierFixedCount:
  337. break;
  338. case QuantifierGreedy:
  339. if (backTrack->matchAmount) {
  340. --backTrack->matchAmount;
  341. input.uncheckInput(1);
  342. return true;
  343. }
  344. break;
  345. case QuantifierNonGreedy:
  346. if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
  347. ++backTrack->matchAmount;
  348. if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1))
  349. return true;
  350. }
  351. input.uncheckInput(backTrack->matchAmount);
  352. break;
  353. }
  354. return false;
  355. }
  356. bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
  357. {
  358. BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
  359. switch (term.atom.quantityType) {
  360. case QuantifierFixedCount:
  361. break;
  362. case QuantifierGreedy:
  363. if (backTrack->matchAmount) {
  364. --backTrack->matchAmount;
  365. input.uncheckInput(1);
  366. return true;
  367. }
  368. break;
  369. case QuantifierNonGreedy:
  370. if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
  371. ++backTrack->matchAmount;
  372. if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1))
  373. return true;
  374. }
  375. input.uncheckInput(backTrack->matchAmount);
  376. break;
  377. }
  378. return false;
  379. }
  380. bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
  381. {
  382. ASSERT(term.type == ByteTerm::TypeCharacterClass);
  383. BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
  384. switch (term.atom.quantityType) {
  385. case QuantifierFixedCount: {
  386. for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
  387. if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount))
  388. return false;
  389. }
  390. return true;
  391. }
  392. case QuantifierGreedy: {
  393. unsigned matchAmount = 0;
  394. while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
  395. if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
  396. input.uncheckInput(1);
  397. break;
  398. }
  399. ++matchAmount;
  400. }
  401. backTrack->matchAmount = matchAmount;
  402. return true;
  403. }
  404. case QuantifierNonGreedy:
  405. backTrack->matchAmount = 0;
  406. return true;
  407. }
  408. RELEASE_ASSERT_NOT_REACHED();
  409. return false;
  410. }
  411. bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
  412. {
  413. ASSERT(term.type == ByteTerm::TypeCharacterClass);
  414. BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
  415. switch (term.atom.quantityType) {
  416. case QuantifierFixedCount:
  417. break;
  418. case QuantifierGreedy:
  419. if (backTrack->matchAmount) {
  420. --backTrack->matchAmount;
  421. input.uncheckInput(1);
  422. return true;
  423. }
  424. break;
  425. case QuantifierNonGreedy:
  426. if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
  427. ++backTrack->matchAmount;
  428. if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1))
  429. return true;
  430. }
  431. input.uncheckInput(backTrack->matchAmount);
  432. break;
  433. }
  434. return false;
  435. }
  436. bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
  437. {
  438. ASSERT(term.type == ByteTerm::TypeBackReference);
  439. BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
  440. unsigned matchBegin = output[(term.atom.subpatternId << 1)];
  441. unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
  442. // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
  443. // In this case the result of match is empty string like when it references to a parentheses with zero-width match.
  444. // Eg.: /(a\1)/
  445. if (matchEnd == offsetNoMatch)
  446. return true;
  447. if (matchBegin == offsetNoMatch)
  448. return true;
  449. ASSERT(matchBegin <= matchEnd);
  450. if (matchBegin == matchEnd)
  451. return true;
  452. switch (term.atom.quantityType) {
  453. case QuantifierFixedCount: {
  454. backTrack->begin = input.getPos();
  455. for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
  456. if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
  457. input.setPos(backTrack->begin);
  458. return false;
  459. }
  460. }
  461. return true;
  462. }
  463. case QuantifierGreedy: {
  464. unsigned matchAmount = 0;
  465. while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
  466. ++matchAmount;
  467. backTrack->matchAmount = matchAmount;
  468. return true;
  469. }
  470. case QuantifierNonGreedy:
  471. backTrack->begin = input.getPos();
  472. backTrack->matchAmount = 0;
  473. return true;
  474. }
  475. RELEASE_ASSERT_NOT_REACHED();
  476. return false;
  477. }
  478. bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
  479. {
  480. ASSERT(term.type == ByteTerm::TypeBackReference);
  481. BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
  482. unsigned matchBegin = output[(term.atom.subpatternId << 1)];
  483. unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
  484. if (matchBegin == offsetNoMatch)
  485. return false;
  486. ASSERT(matchBegin <= matchEnd);
  487. if (matchBegin == matchEnd)
  488. return false;
  489. switch (term.atom.quantityType) {
  490. case QuantifierFixedCount:
  491. // for quantityCount == 1, could rewind.
  492. input.setPos(backTrack->begin);
  493. break;
  494. case QuantifierGreedy:
  495. if (backTrack->matchAmount) {
  496. --backTrack->matchAmount;
  497. input.rewind(matchEnd - matchBegin);
  498. return true;
  499. }
  500. break;
  501. case QuantifierNonGreedy:
  502. if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
  503. ++backTrack->matchAmount;
  504. return true;
  505. }
  506. input.setPos(backTrack->begin);
  507. break;
  508. }
  509. return false;
  510. }
  511. void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
  512. {
  513. if (term.capture()) {
  514. unsigned subpatternId = term.atom.subpatternId;
  515. output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
  516. output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
  517. }
  518. }
  519. void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
  520. {
  521. unsigned firstSubpatternId = term.atom.subpatternId;
  522. unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
  523. context->restoreOutput(output, firstSubpatternId, count);
  524. }
  525. JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
  526. {
  527. while (backTrack->matchAmount) {
  528. ParenthesesDisjunctionContext* context = backTrack->lastContext;
  529. JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
  530. if (result == JSRegExpMatch)
  531. return JSRegExpMatch;
  532. resetMatches(term, context);
  533. popParenthesesDisjunctionContext(backTrack);
  534. freeParenthesesDisjunctionContext(context);
  535. if (result != JSRegExpNoMatch)
  536. return result;
  537. }
  538. return JSRegExpNoMatch;
  539. }
  540. bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
  541. {
  542. ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
  543. ASSERT(term.atom.quantityCount == 1);
  544. BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
  545. switch (term.atom.quantityType) {
  546. case QuantifierGreedy: {
  547. // set this speculatively; if we get to the parens end this will be true.
  548. backTrack->begin = input.getPos();
  549. break;
  550. }
  551. case QuantifierNonGreedy: {
  552. backTrack->begin = notFound;
  553. context->term += term.atom.parenthesesWidth;
  554. return true;
  555. }
  556. case QuantifierFixedCount:
  557. break;
  558. }
  559. if (term.capture()) {
  560. unsigned subpatternId = term.atom.subpatternId;
  561. output[(subpatternId << 1)] = input.getPos() - term.inputPosition;
  562. }
  563. return true;
  564. }
  565. bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
  566. {
  567. ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
  568. ASSERT(term.atom.quantityCount == 1);
  569. if (term.capture()) {
  570. unsigned subpatternId = term.atom.subpatternId;
  571. output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
  572. }
  573. if (term.atom.quantityType == QuantifierFixedCount)
  574. return true;
  575. BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
  576. return backTrack->begin != input.getPos();
  577. }
  578. bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
  579. {
  580. ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
  581. ASSERT(term.atom.quantityCount == 1);
  582. BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
  583. if (term.capture()) {
  584. unsigned subpatternId = term.atom.subpatternId;
  585. output[(subpatternId << 1)] = offsetNoMatch;
  586. output[(subpatternId << 1) + 1] = offsetNoMatch;
  587. }
  588. switch (term.atom.quantityType) {
  589. case QuantifierGreedy:
  590. // if we backtrack to this point, there is another chance - try matching nothing.
  591. ASSERT(backTrack->begin != notFound);
  592. backTrack->begin = notFound;
  593. context->term += term.atom.parenthesesWidth;
  594. return true;
  595. case QuantifierNonGreedy:
  596. ASSERT(backTrack->begin != notFound);
  597. case QuantifierFixedCount:
  598. break;
  599. }
  600. return false;
  601. }
  602. bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
  603. {
  604. ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
  605. ASSERT(term.atom.quantityCount == 1);
  606. BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
  607. switch (term.atom.quantityType) {
  608. case QuantifierGreedy:
  609. if (backTrack->begin == notFound) {
  610. context->term -= term.atom.parenthesesWidth;
  611. return false;
  612. }
  613. case QuantifierNonGreedy:
  614. if (backTrack->begin == notFound) {
  615. backTrack->begin = input.getPos();
  616. if (term.capture()) {
  617. // Technically this access to inputPosition should be accessing the begin term's
  618. // inputPosition, but for repeats other than fixed these values should be
  619. // the same anyway! (We don't pre-check for greedy or non-greedy matches.)
  620. ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
  621. ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
  622. unsigned subpatternId = term.atom.subpatternId;
  623. output[subpatternId << 1] = input.getPos() + term.inputPosition;
  624. }
  625. context->term -= term.atom.parenthesesWidth;
  626. return true;
  627. }
  628. case QuantifierFixedCount:
  629. break;
  630. }
  631. return false;
  632. }
  633. bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
  634. {
  635. ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
  636. ASSERT(term.atom.quantityType == QuantifierGreedy);
  637. ASSERT(term.atom.quantityCount == quantifyInfinite);
  638. ASSERT(!term.capture());
  639. BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
  640. backTrack->begin = input.getPos();
  641. return true;
  642. }
  643. bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
  644. {
  645. ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
  646. BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
  647. // Empty match is a failed match.
  648. if (backTrack->begin == input.getPos())
  649. return false;
  650. // Successful match! Okay, what's next? - loop around and try to match moar!
  651. context->term -= (term.atom.parenthesesWidth + 1);
  652. return true;
  653. }
  654. bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
  655. {
  656. ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
  657. ASSERT(term.atom.quantityType == QuantifierGreedy);
  658. ASSERT(term.atom.quantityCount == quantifyInfinite);
  659. ASSERT(!term.capture());
  660. // If we backtrack to this point, we have failed to match this iteration of the parens.
  661. // Since this is greedy / zero minimum a failed is also accepted as a match!
  662. context->term += term.atom.parenthesesWidth;
  663. return true;
  664. }
  665. bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
  666. {
  667. // 'Terminal' parentheses are at the end of the regex, and as such a match past end
  668. // should always be returned as a successful match - we should never backtrack to here.
  669. RELEASE_ASSERT_NOT_REACHED();
  670. return false;
  671. }
  672. bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
  673. {
  674. ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
  675. ASSERT(term.atom.quantityCount == 1);
  676. BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
  677. backTrack->begin = input.getPos();
  678. return true;
  679. }
  680. bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
  681. {
  682. ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
  683. ASSERT(term.atom.quantityCount == 1);
  684. BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
  685. input.setPos(backTrack->begin);
  686. // We've reached the end of the parens; if they are inverted, this is failure.
  687. if (term.invert()) {
  688. context->term -= term.atom.parenthesesWidth;
  689. return false;
  690. }
  691. return true;
  692. }
  693. bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
  694. {
  695. ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
  696. ASSERT(term.atom.quantityCount == 1);
  697. // We've failed to match parens; if they are inverted, this is win!
  698. if (term.invert()) {
  699. context->term += term.atom.parenthesesWidth;
  700. return true;
  701. }
  702. return false;
  703. }
  704. bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
  705. {
  706. ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
  707. ASSERT(term.atom.quantityCount == 1);
  708. BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
  709. input.setPos(backTrack->begin);
  710. context->term -= term.atom.parenthesesWidth;
  711. return false;
  712. }
  713. JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
  714. {
  715. ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
  716. BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
  717. ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
  718. backTrack->matchAmount = 0;
  719. backTrack->lastContext = 0;
  720. switch (term.atom.quantityType) {
  721. case QuantifierFixedCount: {
  722. // While we haven't yet reached our fixed limit,
  723. while (backTrack->matchAmount < term.atom.quantityCount) {
  724. // Try to do a match, and it it succeeds, add it to the list.
  725. ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
  726. JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
  727. if (result == JSRegExpMatch)
  728. appendParenthesesDisjunctionContext(backTrack, context);
  729. else {
  730. // The match failed; try to find an alternate point to carry on from.
  731. resetMatches(term, context);
  732. freeParenthesesDisjunctionContext(context);
  733. if (result != JSRegExpNoMatch)
  734. return result;
  735. JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
  736. if (backtrackResult != JSRegExpMatch)
  737. return backtrackResult;
  738. }
  739. }
  740. ASSERT(backTrack->matchAmount == term.atom.quantityCount);
  741. ParenthesesDisjunctionContext* context = backTrack->lastContext;
  742. recordParenthesesMatch(term, context);
  743. return JSRegExpMatch;
  744. }
  745. case QuantifierGreedy: {
  746. while (backTrack->matchAmount < term.atom.quantityCount) {
  747. ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
  748. JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
  749. if (result == JSRegExpMatch)
  750. appendParenthesesDisjunctionContext(backTrack, context);
  751. else {
  752. resetMatches(term, context);
  753. freeParenthesesDisjunctionContext(context);
  754. if (result != JSRegExpNoMatch)
  755. return result;
  756. break;
  757. }
  758. }
  759. if (backTrack->matchAmount) {
  760. ParenthesesDisjunctionContext* context = backTrack->lastContext;
  761. recordParenthesesMatch(term, context);
  762. }
  763. return JSRegExpMatch;
  764. }
  765. case QuantifierNonGreedy:
  766. return JSRegExpMatch;
  767. }
  768. RELEASE_ASSERT_NOT_REACHED();
  769. return JSRegExpErrorNoMatch;
  770. }
  771. // Rules for backtracking differ depending on whether this is greedy or non-greedy.
  772. //
  773. // Greedy matches never should try just adding more - you should already have done
  774. // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
  775. // you backtrack an item off the list needs checking, since we'll never have matched
  776. // the one less case. Tracking forwards, still add as much as possible.
  777. //
  778. // Non-greedy, we've already done the one less case, so don't match on popping.
  779. // We haven't done the one more case, so always try to add that.
  780. //
  781. JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
  782. {
  783. ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
  784. BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
  785. ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
  786. switch (term.atom.quantityType) {
  787. case QuantifierFixedCount: {
  788. ASSERT(backTrack->matchAmount == term.atom.quantityCount);
  789. ParenthesesDisjunctionContext* context = 0;
  790. JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
  791. if (result != JSRegExpMatch)
  792. return result;
  793. // While we haven't yet reached our fixed limit,
  794. while (backTrack->matchAmount < term.atom.quantityCount) {
  795. // Try to do a match, and it it succeeds, add it to the list.
  796. context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
  797. result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
  798. if (result == JSRegExpMatch)
  799. appendParenthesesDisjunctionContext(backTrack, context);
  800. else {
  801. // The match failed; try to find an alternate point to carry on from.
  802. resetMatches(term, context);
  803. freeParenthesesDisjunctionContext(context);
  804. if (result != JSRegExpNoMatch)
  805. return result;
  806. JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
  807. if (backtrackResult != JSRegExpMatch)
  808. return backtrackResult;
  809. }
  810. }
  811. ASSERT(backTrack->matchAmount == term.atom.quantityCount);
  812. context = backTrack->lastContext;
  813. recordParenthesesMatch(term, context);
  814. return JSRegExpMatch;
  815. }
  816. case QuantifierGreedy: {
  817. if (!backTrack->matchAmount)
  818. return JSRegExpNoMatch;
  819. ParenthesesDisjunctionContext* context = backTrack->lastContext;
  820. JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
  821. if (result == JSRegExpMatch) {
  822. while (backTrack->matchAmount < term.atom.quantityCount) {
  823. ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
  824. JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
  825. if (parenthesesResult == JSRegExpMatch)
  826. appendParenthesesDisjunctionContext(backTrack, context);
  827. else {
  828. resetMatches(term, context);
  829. freeParenthesesDisjunctionContext(context);
  830. if (parenthesesResult != JSRegExpNoMatch)
  831. return parenthesesResult;
  832. break;
  833. }
  834. }
  835. } else {
  836. resetMatches(term, context);
  837. popParenthesesDisjunctionContext(backTrack);
  838. freeParenthesesDisjunctionContext(context);
  839. if (result != JSRegExpNoMatch)
  840. return result;
  841. }
  842. if (backTrack->matchAmount) {
  843. ParenthesesDisjunctionContext* context = backTrack->lastContext;
  844. recordParenthesesMatch(term, context);
  845. }
  846. return JSRegExpMatch;
  847. }
  848. case QuantifierNonGreedy: {
  849. // If we've not reached the limit, try to add one more match.
  850. if (backTrack->matchAmount < term.atom.quantityCount) {
  851. ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
  852. JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
  853. if (result == JSRegExpMatch) {
  854. appendParenthesesDisjunctionContext(backTrack, context);
  855. recordParenthesesMatch(term, context);
  856. return JSRegExpMatch;
  857. }
  858. resetMatches(term, context);
  859. freeParenthesesDisjunctionContext(context);
  860. if (result != JSRegExpNoMatch)
  861. return result;
  862. }
  863. // Nope - okay backtrack looking for an alternative.
  864. while (backTrack->matchAmount) {
  865. ParenthesesDisjunctionContext* context = backTrack->lastContext;
  866. JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
  867. if (result == JSRegExpMatch) {
  868. // successful backtrack! we're back in the game!
  869. if (backTrack->matchAmount) {
  870. context = backTrack->lastContext;
  871. recordParenthesesMatch(term, context);
  872. }
  873. return JSRegExpMatch;
  874. }
  875. // pop a match off the stack
  876. resetMatches(term, context);
  877. popParenthesesDisjunctionContext(backTrack);
  878. freeParenthesesDisjunctionContext(context);
  879. if (result != JSRegExpNoMatch)
  880. return result;
  881. }
  882. return JSRegExpNoMatch;
  883. }
  884. }
  885. RELEASE_ASSERT_NOT_REACHED();
  886. return JSRegExpErrorNoMatch;
  887. }
  888. bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context)
  889. {
  890. UNUSED_PARAM(term);
  891. unsigned matchBegin = context->matchBegin;
  892. if (matchBegin) {
  893. for (matchBegin--; true; matchBegin--) {
  894. if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) {
  895. ++matchBegin;
  896. break;
  897. }
  898. if (!matchBegin)
  899. break;
  900. }
  901. }
  902. unsigned matchEnd = input.getPos();
  903. for (; (matchEnd != input.end())
  904. && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { }
  905. if (((matchBegin && term.anchors.m_bol)
  906. || ((matchEnd != input.end()) && term.anchors.m_eol))
  907. && !pattern->m_multiline)
  908. return false;
  909. context->matchBegin = matchBegin;
  910. context->matchEnd = matchEnd;
  911. return true;
  912. }
  913. #define MATCH_NEXT() { ++context->term; goto matchAgain; }
  914. #define BACKTRACK() { --context->term; goto backtrack; }
  915. #define currentTerm() (disjunction->terms[context->term])
  916. JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
  917. {
  918. if (!--remainingMatchCount)
  919. return JSRegExpErrorHitLimit;
  920. if (btrack)
  921. BACKTRACK();
  922. context->matchBegin = input.getPos();
  923. context->term = 0;
  924. matchAgain:
  925. ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
  926. switch (currentTerm().type) {
  927. case ByteTerm::TypeSubpatternBegin:
  928. MATCH_NEXT();
  929. case ByteTerm::TypeSubpatternEnd:
  930. context->matchEnd = input.getPos();
  931. return JSRegExpMatch;
  932. case ByteTerm::TypeBodyAlternativeBegin:
  933. MATCH_NEXT();
  934. case ByteTerm::TypeBodyAlternativeDisjunction:
  935. case ByteTerm::TypeBodyAlternativeEnd:
  936. context->matchEnd = input.getPos();
  937. return JSRegExpMatch;
  938. case ByteTerm::TypeAlternativeBegin:
  939. MATCH_NEXT();
  940. case ByteTerm::TypeAlternativeDisjunction:
  941. case ByteTerm::TypeAlternativeEnd: {
  942. int offset = currentTerm().alternative.end;
  943. BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
  944. backTrack->offset = offset;
  945. context->term += offset;
  946. MATCH_NEXT();
  947. }
  948. case ByteTerm::TypeAssertionBOL:
  949. if (matchAssertionBOL(currentTerm()))
  950. MATCH_NEXT();
  951. BACKTRACK();
  952. case ByteTerm::TypeAssertionEOL:
  953. if (matchAssertionEOL(currentTerm()))
  954. MATCH_NEXT();
  955. BACKTRACK();
  956. case ByteTerm::TypeAssertionWordBoundary:
  957. if (matchAssertionWordBoundary(currentTerm()))
  958. MATCH_NEXT();
  959. BACKTRACK();
  960. case ByteTerm::TypePatternCharacterOnce:
  961. case ByteTerm::TypePatternCharacterFixed: {
  962. for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
  963. if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount))
  964. BACKTRACK();
  965. }
  966. MATCH_NEXT();
  967. }
  968. case ByteTerm::TypePatternCharacterGreedy: {
  969. BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
  970. unsigned matchAmount = 0;
  971. while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
  972. if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) {
  973. input.uncheckInput(1);
  974. break;
  975. }
  976. ++matchAmount;
  977. }
  978. backTrack->matchAmount = matchAmount;
  979. MATCH_NEXT();
  980. }
  981. case ByteTerm::TypePatternCharacterNonGreedy: {
  982. BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
  983. backTrack->matchAmount = 0;
  984. MATCH_NEXT();
  985. }
  986. case ByteTerm::TypePatternCasedCharacterOnce:
  987. case ByteTerm::TypePatternCasedCharacterFixed: {
  988. for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
  989. if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount))
  990. BACKTRACK();
  991. }
  992. MATCH_NEXT();
  993. }
  994. case ByteTerm::TypePatternCasedCharacterGreedy: {
  995. BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
  996. unsigned matchAmount = 0;
  997. while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
  998. if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) {
  999. input.uncheckInput(1);
  1000. break;
  1001. }
  1002. ++matchAmount;
  1003. }
  1004. backTrack->matchAmount = matchAmount;
  1005. MATCH_NEXT();
  1006. }
  1007. case ByteTerm::TypePatternCasedCharacterNonGreedy: {
  1008. BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
  1009. backTrack->matchAmount = 0;
  1010. MATCH_NEXT();
  1011. }
  1012. case ByteTerm::TypeCharacterClass:
  1013. if (matchCharacterClass(currentTerm(), context))
  1014. MATCH_NEXT();
  1015. BACKTRACK();
  1016. case ByteTerm::TypeBackReference:
  1017. if (matchBackReference(currentTerm(), context))
  1018. MATCH_NEXT();
  1019. BACKTRACK();
  1020. case ByteTerm::TypeParenthesesSubpattern: {
  1021. JSRegExpResult result = matchParentheses(currentTerm(), context);
  1022. if (result == JSRegExpMatch) {
  1023. MATCH_NEXT();
  1024. } else if (result != JSRegExpNoMatch)
  1025. return result;
  1026. BACKTRACK();
  1027. }
  1028. case ByteTerm::TypeParenthesesSubpatternOnceBegin:
  1029. if (matchParenthesesOnceBegin(currentTerm(), context))
  1030. MATCH_NEXT();
  1031. BACKTRACK();
  1032. case ByteTerm::TypeParenthesesSubpatternOnceEnd:
  1033. if (matchParenthesesOnceEnd(currentTerm(), context))
  1034. MATCH_NEXT();
  1035. BACKTRACK();
  1036. case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
  1037. if (matchParenthesesTerminalBegin(currentTerm(), context))
  1038. MATCH_NEXT();
  1039. BACKTRACK();
  1040. case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
  1041. if (matchParenthesesTerminalEnd(currentTerm(), context))
  1042. MATCH_NEXT();
  1043. BACKTRACK();
  1044. case ByteTerm::TypeParentheticalAssertionBegin:
  1045. if (matchParentheticalAssertionBegin(currentTerm(), context))
  1046. MATCH_NEXT();
  1047. BACKTRACK();
  1048. case ByteTerm::TypeParentheticalAssertionEnd:
  1049. if (matchParentheticalAssertionEnd(currentTerm(), context))
  1050. MATCH_NEXT();
  1051. BACKTRACK();
  1052. case ByteTerm::TypeCheckInput:
  1053. if (input.checkInput(currentTerm().checkInputCount))
  1054. MATCH_NEXT();
  1055. BACKTRACK();
  1056. case ByteTerm::TypeUncheckInput:
  1057. input.uncheckInput(currentTerm().checkInputCount);
  1058. MATCH_NEXT();
  1059. case ByteTerm::TypeDotStarEnclosure:
  1060. if (matchDotStarEnclosure(currentTerm(), context))
  1061. return JSRegExpMatch;
  1062. BACKTRACK();
  1063. }
  1064. // We should never fall-through to here.
  1065. RELEASE_ASSERT_NOT_REACHED();
  1066. backtrack:
  1067. ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
  1068. switch (currentTerm().type) {
  1069. case ByteTerm::TypeSubpatternBegin:
  1070. return JSRegExpNoMatch;
  1071. case ByteTerm::TypeSubpatternEnd:
  1072. RELEASE_ASSERT_NOT_REACHED();
  1073. case ByteTerm::TypeBodyAlternativeBegin:
  1074. case ByteTerm::TypeBodyAlternativeDisjunction: {
  1075. int offset = currentTerm().alternative.next;
  1076. context->term += offset;
  1077. if (offset > 0)
  1078. MATCH_NEXT();
  1079. if (input.atEnd())
  1080. return JSRegExpNoMatch;
  1081. input.next();
  1082. context->matchBegin = input.getPos();
  1083. if (currentTerm().alternative.onceThrough)
  1084. context->term += currentTerm().alternative.next;
  1085. MATCH_NEXT();
  1086. }
  1087. case ByteTerm::TypeBodyAlternativeEnd:
  1088. RELEASE_ASSERT_NOT_REACHED();
  1089. case ByteTerm::TypeAlternativeBegin:
  1090. case ByteTerm::TypeAlternativeDisjunction: {
  1091. int offset = currentTerm().alternative.next;
  1092. context->term += offset;
  1093. if (offset > 0)
  1094. MATCH_NEXT();
  1095. BACKTRACK();
  1096. }
  1097. case ByteTerm::TypeAlternativeEnd: {
  1098. // We should never backtrack back into an alternative of the main body of the regex.
  1099. BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
  1100. unsigned offset = backTrack->offset;
  1101. context->term -= offset;
  1102. BACKTRACK();
  1103. }
  1104. case ByteTerm::TypeAssertionBOL:
  1105. case ByteTerm::TypeAssertionEOL:
  1106. case ByteTerm::TypeAssertionWordBoundary:
  1107. BACKTRACK();
  1108. case ByteTerm::TypePatternCharacterOnce:
  1109. case ByteTerm::TypePatternCharacterFixed:
  1110. case ByteTerm::TypePatternCharacterGreedy:
  1111. case ByteTerm::TypePatternCharacterNonGreedy:
  1112. if (backtrackPatternCharacter(currentTerm(), context))
  1113. MATCH_NEXT();
  1114. BACKTRACK();
  1115. case ByteTerm::TypePatternCasedCharacterOnce:
  1116. case ByteTerm::TypePatternCasedCharacterFixed:
  1117. case ByteTerm::TypePatternCasedCharacterGreedy:
  1118. case ByteTerm::TypePatternCasedCharacterNonGreedy:
  1119. if (backtrackPatternCasedCharacter(currentTerm(), context))
  1120. MATCH_NEXT();
  1121. BACKTRACK();
  1122. case ByteTerm::TypeCharacterClass:
  1123. if (backtrackCharacterClass(currentTerm(), context))
  1124. MATCH_NEXT();
  1125. BACKTRACK();
  1126. case ByteTerm::TypeBackReference:
  1127. if (backtrackBackReference(currentTerm(), context))
  1128. MATCH_NEXT();
  1129. BACKTRACK();
  1130. case ByteTerm::TypeParenthesesSubpattern: {
  1131. JSRegExpResult result = backtrackParentheses(currentTerm(), context);
  1132. if (result == JSRegExpMatch) {
  1133. MATCH_NEXT();
  1134. } else if (result != JSRegExpNoMatch)
  1135. return result;
  1136. BACKTRACK();
  1137. }
  1138. case ByteTerm::TypeParenthesesSubpatternOnceBegin:
  1139. if (backtrackParenthesesOnceBegin(currentTerm(), context))
  1140. MATCH_NEXT();
  1141. BACKTRACK();
  1142. case ByteTerm::TypeParenthesesSubpatternOnceEnd:
  1143. if (backtrackParenthesesOnceEnd(currentTerm(), context))
  1144. MATCH_NEXT();
  1145. BACKTRACK();
  1146. case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
  1147. if (backtrackParenthesesTerminalBegin(currentTerm(), context))
  1148. MATCH_NEXT();
  1149. BACKTRACK();
  1150. case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
  1151. if (backtrackParenthesesTerminalEnd(currentTerm(), context))
  1152. MATCH_NEXT();
  1153. BACKTRACK();
  1154. case ByteTerm::TypeParentheticalAssertionBegin:
  1155. if (backtrackParentheticalAssertionBegin(currentTerm(), context))
  1156. MATCH_NEXT();
  1157. BACKTRACK();
  1158. case ByteTerm::TypeParentheticalAssertionEnd:
  1159. if (backtrackParentheticalAssertionEnd(currentTerm(), context))
  1160. MATCH_NEXT();
  1161. BACKTRACK();
  1162. case ByteTerm::TypeCheckInput:
  1163. input.uncheckInput(currentTerm().checkInputCount);
  1164. BACKTRACK();
  1165. case ByteTerm::TypeUncheckInput:
  1166. input.checkInput(currentTerm().checkInputCount);
  1167. BACKTRACK();
  1168. case ByteTerm::TypeDotStarEnclosure:
  1169. RELEASE_ASSERT_NOT_REACHED();
  1170. }
  1171. RELEASE_ASSERT_NOT_REACHED();
  1172. return JSRegExpErrorNoMatch;
  1173. }
  1174. JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
  1175. {
  1176. JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
  1177. if (result == JSRegExpMatch) {
  1178. while (context->matchBegin == context->matchEnd) {
  1179. result = matchDisjunction(disjunction, context, true);
  1180. if (result != JSRegExpMatch)
  1181. return result;
  1182. }
  1183. return JSRegExpMatch;
  1184. }
  1185. return result;
  1186. }
  1187. unsigned interpret()
  1188. {
  1189. if (!input.isAvailableInput(0))
  1190. return offsetNoMatch;
  1191. for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i)
  1192. output[i << 1] = offsetNoMatch;
  1193. allocatorPool = pattern->m_allocator->startAllocator();
  1194. RELEASE_ASSERT(allocatorPool);
  1195. DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
  1196. JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false);
  1197. if (result == JSRegExpMatch) {
  1198. output[0] = context->matchBegin;
  1199. output[1] = context->matchEnd;
  1200. }
  1201. freeDisjunctionContext(context);
  1202. pattern->m_allocator->stopAllocator();
  1203. ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch));
  1204. return output[0];
  1205. }
  1206. Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start)
  1207. : pattern(pattern)
  1208. , output(output)
  1209. , input(input, start, length)
  1210. , allocatorPool(0)
  1211. , remainingMatchCount(matchLimit)
  1212. {
  1213. }
  1214. private:
  1215. BytecodePattern* pattern;
  1216. unsigned* output;
  1217. InputStream input;
  1218. BumpPointerPool* allocatorPool;
  1219. unsigned remainingMatchCount;
  1220. };
  1221. class ByteCompiler {
  1222. struct ParenthesesStackEntry {
  1223. unsigned beginTerm;
  1224. unsigned savedAlternativeIndex;
  1225. ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
  1226. : beginTerm(beginTerm)
  1227. , savedAlternativeIndex(savedAlternativeIndex)
  1228. {
  1229. }
  1230. };
  1231. public:
  1232. ByteCompiler(YarrPattern& pattern)
  1233. : m_pattern(pattern)
  1234. {
  1235. m_currentAlternativeIndex = 0;
  1236. }
  1237. PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator)
  1238. {
  1239. regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
  1240. emitDisjunction(m_pattern.m_body);
  1241. regexEnd();
  1242. return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator));
  1243. }
  1244. void checkInput(unsigned count)
  1245. {
  1246. m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
  1247. }
  1248. void uncheckInput(unsigned count)
  1249. {
  1250. m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count));
  1251. }
  1252. void assertionBOL(unsigned inputPosition)
  1253. {
  1254. m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
  1255. }
  1256. void assertionEOL(unsigned inputPosition)
  1257. {
  1258. m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
  1259. }
  1260. void assertionWordBoundary(bool invert, unsigned inputPosition)
  1261. {
  1262. m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
  1263. }
  1264. void atomPatternCharacter(UChar ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
  1265. {
  1266. if (m_pattern.m_ignoreCase) {
  1267. UChar lo = Unicode::toLower(ch);
  1268. UChar hi = Unicode::toUpper(ch);
  1269. if (lo != hi) {
  1270. m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
  1271. return;
  1272. }
  1273. }
  1274. m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
  1275. }
  1276. void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
  1277. {
  1278. m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
  1279. m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet();
  1280. m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
  1281. m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
  1282. }
  1283. void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
  1284. {
  1285. ASSERT(subpatternId);
  1286. m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
  1287. m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet();
  1288. m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
  1289. m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
  1290. }
  1291. void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
  1292. {
  1293. int beginTerm = m_bodyDisjunction->terms.size();
  1294. m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
  1295. m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
  1296. m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
  1297. m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
  1298. m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
  1299. m_currentAlternativeIndex = beginTerm + 1;
  1300. }
  1301. void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
  1302. {
  1303. int beginTerm = m_bodyDisjunction->terms.size();
  1304. m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition));
  1305. m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
  1306. m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
  1307. m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
  1308. m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
  1309. m_currentAlternativeIndex = beginTerm + 1;
  1310. }
  1311. void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
  1312. {
  1313. // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
  1314. // then fix this up at the end! - simplifying this should make it much clearer.
  1315. // https://bugs.webkit.org/show_bug.cgi?id=50136
  1316. int beginTerm = m_bodyDisjunction->terms.size();
  1317. m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
  1318. m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
  1319. m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
  1320. m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
  1321. m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
  1322. m_currentAlternativeIndex = beginTerm + 1;
  1323. }
  1324. void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
  1325. {
  1326. int beginTerm = m_bodyDisjunction->terms.size();
  1327. m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0));
  1328. m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
  1329. m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
  1330. m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
  1331. m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
  1332. m_currentAlternativeIndex = beginTerm + 1;
  1333. }
  1334. void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
  1335. {
  1336. unsigned beginTerm = popParenthesesStack();
  1337. closeAlternative(beginTerm + 1);
  1338. unsigned endTerm = m_bodyDisjunction->terms.size();
  1339. ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
  1340. bool invert = m_bodyDisjunction->terms[beginTerm].invert();
  1341. unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
  1342. m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition));
  1343. m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
  1344. m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
  1345. m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
  1346. m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
  1347. m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
  1348. m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
  1349. m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
  1350. }
  1351. void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored)
  1352. {
  1353. m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored));
  1354. }
  1355. unsigned popParenthesesStack()
  1356. {
  1357. ASSERT(m_parenthesesStack.size());
  1358. int stackEnd = m_parenthesesStack.size() - 1;
  1359. unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
  1360. m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
  1361. m_parenthesesStack.shrink(stackEnd);
  1362. ASSERT(beginTerm < m_bodyDisjunction->terms.size());
  1363. ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
  1364. return beginTerm;
  1365. }
  1366. #ifndef NDEBUG
  1367. void dumpDisjunction(ByteDisjunction* disjunction)
  1368. {
  1369. dataLogF("ByteDisjunction(%p):\n\t", disjunction);
  1370. for (unsigned i = 0; i < disjunction->terms.size(); ++i)
  1371. dataLogF("{ %d } ", disjunction->terms[i].type);
  1372. dataLogF("\n");
  1373. }
  1374. #endif
  1375. void closeAlternative(int beginTerm)
  1376. {
  1377. int origBeginTerm = beginTerm;
  1378. ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
  1379. int endIndex = m_bodyDisjunction->terms.size();
  1380. unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
  1381. if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
  1382. m_bodyDisjunction->terms.remove(beginTerm);
  1383. else {
  1384. while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
  1385. beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
  1386. ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
  1387. m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
  1388. m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
  1389. }
  1390. m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
  1391. m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
  1392. m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
  1393. }
  1394. }
  1395. void closeBodyAlternative()
  1396. {
  1397. int beginTerm = 0;
  1398. int origBeginTerm = 0;
  1399. ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
  1400. int endIndex = m_bodyDisjunction->terms.size();
  1401. unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
  1402. while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
  1403. beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
  1404. ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
  1405. m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
  1406. m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
  1407. }
  1408. m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
  1409. m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
  1410. m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
  1411. }
  1412. void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
  1413. {
  1414. unsigned beginTerm = popParenthesesStack();
  1415. closeAlternative(beginTerm + 1);
  1416. unsigned endTerm = m_bodyDisjunction->terms.size();
  1417. ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
  1418. ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
  1419. bool capture = parenthesesBegin.capture();
  1420. unsigned subpatternId = parenthesesBegin.atom.subpatternId;
  1421. unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
  1422. OwnPtr<ByteDisjunction> parenthesesDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize));
  1423. unsigned firstTermInParentheses = beginTerm + 1;
  1424. parenthesesDisjunction->terms.reserveInitialCapacity(endTerm - firstTermInParentheses + 2);
  1425. parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
  1426. for (unsigned termInParentheses = firstTermInParentheses; termInParentheses < endTerm; ++termInParentheses)
  1427. parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
  1428. parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
  1429. m_bodyDisjunction->terms.shrink(beginTerm);
  1430. m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, inputPosition));
  1431. m_allParenthesesInfo.append(parenthesesDisjunction.release());
  1432. m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
  1433. m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
  1434. m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
  1435. }
  1436. void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
  1437. {
  1438. unsigned beginTerm = popParenthesesStack();
  1439. closeAlternative(beginTerm + 1);
  1440. unsigned endTerm = m_bodyDisjunction->terms.size();
  1441. ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
  1442. bool capture = m_bodyDisjunction->terms[beginTerm].capture();
  1443. unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
  1444. m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition));
  1445. m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
  1446. m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
  1447. m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
  1448. m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
  1449. m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
  1450. m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
  1451. m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
  1452. }
  1453. void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
  1454. {
  1455. unsigned beginTerm = popParenthesesStack();
  1456. closeAlternative(beginTerm + 1);
  1457. unsigned endTerm = m_bodyDisjunction->terms.size();
  1458. ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
  1459. bool capture = m_bodyDisjunction->terms[beginTerm].capture();
  1460. unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
  1461. m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition));
  1462. m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
  1463. m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
  1464. m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
  1465. m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
  1466. m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
  1467. m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
  1468. m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
  1469. }
  1470. void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
  1471. {
  1472. m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize));
  1473. m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
  1474. m_bodyDisjunction->terms[0].frameLocation = 0;
  1475. m_currentAlternativeIndex = 0;
  1476. }
  1477. void regexEnd()
  1478. {
  1479. closeBodyAlternative();
  1480. }
  1481. void alternativeBodyDisjunction(bool onceThrough)
  1482. {
  1483. int newAlternativeIndex = m_bodyDisjunction->terms.size();
  1484. m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
  1485. m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
  1486. m_currentAlternativeIndex = newAlternativeIndex;
  1487. }
  1488. void alternativeDisjunction()
  1489. {
  1490. int newAlternativeIndex = m_bodyDisjunction->terms.size();
  1491. m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
  1492. m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
  1493. m_currentAlternativeIndex = newAlternativeIndex;
  1494. }
  1495. void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0)
  1496. {
  1497. for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
  1498. unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
  1499. PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
  1500. if (alt) {
  1501. if (disjunction == m_pattern.m_body)
  1502. alternativeBodyDisjunction(alternative->onceThrough());
  1503. else
  1504. alternativeDisjunction();
  1505. }
  1506. unsigned minimumSize = alternative->m_minimumSize;
  1507. ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
  1508. unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
  1509. if (countToCheck) {
  1510. checkInput(countToCheck);
  1511. currentCountAlreadyChecked += countToCheck;
  1512. }
  1513. for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
  1514. PatternTerm& term = alternative->m_terms[i];
  1515. switch (term.type) {
  1516. case PatternTerm::TypeAssertionBOL:
  1517. assertionBOL(currentCountAlreadyChecked - term.inputPosition);
  1518. break;
  1519. case PatternTerm::TypeAssertionEOL:
  1520. assertionEOL(currentCountAlreadyChecked - term.inputPosition);
  1521. break;
  1522. case PatternTerm::TypeAssertionWordBoundary:
  1523. assertionWordBoundary(term.invert(), currentCountAlreadyChecked - term.inputPosition);
  1524. break;
  1525. case PatternTerm::TypePatternCharacter:
  1526. atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
  1527. break;
  1528. case PatternTerm::TypeCharacterClass:
  1529. atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked- term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
  1530. break;
  1531. case PatternTerm::TypeBackReference:
  1532. atomBackReference(term.backReferenceSubpatternId, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
  1533. break;
  1534. case PatternTerm::TypeForwardReference:
  1535. break;
  1536. case PatternTerm::TypeParenthesesSubpattern: {
  1537. unsigned disjunctionAlreadyCheckedCount = 0;
  1538. if (term.quantityCount == 1 && !term.parentheses.isCopy) {
  1539. unsigned alternativeFrameLocation = term.frameLocation;
  1540. // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
  1541. if (term.quantityType == QuantifierFixedCount)
  1542. disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
  1543. else
  1544. alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
  1545. unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
  1546. atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, alternativeFrameLocation);
  1547. emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
  1548. atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
  1549. } else if (term.parentheses.isTerminal) {
  1550. unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
  1551. atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce);
  1552. emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
  1553. atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
  1554. } else {
  1555. unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
  1556. atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, 0);
  1557. emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
  1558. atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
  1559. }
  1560. break;
  1561. }
  1562. case PatternTerm::TypeParentheticalAssertion: {
  1563. unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion;
  1564. ASSERT(currentCountAlreadyChecked >= static_cast<unsigned>(term.inputPosition));
  1565. unsigned positiveInputOffset = currentCountAlreadyChecked - static_cast<unsigned>(term.inputPosition);
  1566. unsigned uncheckAmount = 0;
  1567. if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) {
  1568. uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize;
  1569. uncheckInput(uncheckAmount);
  1570. currentCountAlreadyChecked -= uncheckAmount;
  1571. }
  1572. atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation);
  1573. emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount);
  1574. atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType);
  1575. if (uncheckAmount) {
  1576. checkInput(uncheckAmount);
  1577. currentCountAlreadyChecked += uncheckAmount;
  1578. }
  1579. break;
  1580. }
  1581. case PatternTerm::TypeDotStarEnclosure:
  1582. assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor);
  1583. break;
  1584. }
  1585. }
  1586. }
  1587. }
  1588. private:
  1589. YarrPattern& m_pattern;
  1590. OwnPtr<ByteDisjunction> m_bodyDisjunction;
  1591. unsigned m_currentAlternativeIndex;
  1592. Vector<ParenthesesStackEntry> m_parenthesesStack;
  1593. Vector<OwnPtr<ByteDisjunction> > m_allParenthesesInfo;
  1594. };
  1595. PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator)
  1596. {
  1597. return ByteCompiler(pattern).compile(allocator);
  1598. }
  1599. unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output)
  1600. {
  1601. if (input.is8Bit())
  1602. return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret();
  1603. return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret();
  1604. }
  1605. unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output)
  1606. {
  1607. return Interpreter<LChar>(bytecode, output, input, length, start).interpret();
  1608. }
  1609. unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output)
  1610. {
  1611. return Interpreter<UChar>(bytecode, output, input, length, start).interpret();
  1612. }
  1613. // These should be the same for both UChar & LChar.
  1614. COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
  1615. COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
  1616. COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
  1617. COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
  1618. COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
  1619. COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
  1620. COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
  1621. } }