onepass.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582
  1. // Copyright 2014 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package regexp
  5. import (
  6. "bytes"
  7. "regexp/syntax"
  8. "sort"
  9. "unicode"
  10. )
  11. // "One-pass" regexp execution.
  12. // Some regexps can be analyzed to determine that they never need
  13. // backtracking: they are guaranteed to run in one pass over the string
  14. // without bothering to save all the usual NFA state.
  15. // Detect those and execute them more quickly.
  16. // A onePassProg is a compiled one-pass regular expression program.
  17. // It is the same as syntax.Prog except for the use of onePassInst.
  18. type onePassProg struct {
  19. Inst []onePassInst
  20. Start int // index of start instruction
  21. NumCap int // number of InstCapture insts in re
  22. }
  23. // A onePassInst is a single instruction in a one-pass regular expression program.
  24. // It is the same as syntax.Inst except for the new 'Next' field.
  25. type onePassInst struct {
  26. syntax.Inst
  27. Next []uint32
  28. }
  29. // OnePassPrefix returns a literal string that all matches for the
  30. // regexp must start with. Complete is true if the prefix
  31. // is the entire match. Pc is the index of the last rune instruction
  32. // in the string. The OnePassPrefix skips over the mandatory
  33. // EmptyBeginText
  34. func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
  35. i := &p.Inst[p.Start]
  36. if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
  37. return "", i.Op == syntax.InstMatch, uint32(p.Start)
  38. }
  39. pc = i.Out
  40. i = &p.Inst[pc]
  41. for i.Op == syntax.InstNop {
  42. pc = i.Out
  43. i = &p.Inst[pc]
  44. }
  45. // Avoid allocation of buffer if prefix is empty.
  46. if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
  47. return "", i.Op == syntax.InstMatch, uint32(p.Start)
  48. }
  49. // Have prefix; gather characters.
  50. var buf bytes.Buffer
  51. for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
  52. buf.WriteRune(i.Rune[0])
  53. pc, i = i.Out, &p.Inst[i.Out]
  54. }
  55. return buf.String(), i.Op == syntax.InstEmptyWidth && (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText != 0, pc
  56. }
  57. // OnePassNext selects the next actionable state of the prog, based on the input character.
  58. // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
  59. // One of the alternates may ultimately lead without input to end of line. If the instruction
  60. // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
  61. func onePassNext(i *onePassInst, r rune) uint32 {
  62. next := i.MatchRunePos(r)
  63. if next >= 0 {
  64. return i.Next[next]
  65. }
  66. if i.Op == syntax.InstAltMatch {
  67. return i.Out
  68. }
  69. return 0
  70. }
  71. func iop(i *syntax.Inst) syntax.InstOp {
  72. op := i.Op
  73. switch op {
  74. case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
  75. op = syntax.InstRune
  76. }
  77. return op
  78. }
  79. // Sparse Array implementation is used as a queueOnePass.
  80. type queueOnePass struct {
  81. sparse []uint32
  82. dense []uint32
  83. size, nextIndex uint32
  84. }
  85. func (q *queueOnePass) empty() bool {
  86. return q.nextIndex >= q.size
  87. }
  88. func (q *queueOnePass) next() (n uint32) {
  89. n = q.dense[q.nextIndex]
  90. q.nextIndex++
  91. return
  92. }
  93. func (q *queueOnePass) clear() {
  94. q.size = 0
  95. q.nextIndex = 0
  96. }
  97. func (q *queueOnePass) reset() {
  98. q.nextIndex = 0
  99. }
  100. func (q *queueOnePass) contains(u uint32) bool {
  101. if u >= uint32(len(q.sparse)) {
  102. return false
  103. }
  104. return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
  105. }
  106. func (q *queueOnePass) insert(u uint32) {
  107. if !q.contains(u) {
  108. q.insertNew(u)
  109. }
  110. }
  111. func (q *queueOnePass) insertNew(u uint32) {
  112. if u >= uint32(len(q.sparse)) {
  113. return
  114. }
  115. q.sparse[u] = q.size
  116. q.dense[q.size] = u
  117. q.size++
  118. }
  119. func newQueue(size int) (q *queueOnePass) {
  120. return &queueOnePass{
  121. sparse: make([]uint32, size),
  122. dense: make([]uint32, size),
  123. }
  124. }
  125. // mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
  126. // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
  127. // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
  128. // NextIp array with the single element mergeFailed is returned.
  129. // The code assumes that both inputs contain ordered and non-intersecting rune pairs.
  130. const mergeFailed = uint32(0xffffffff)
  131. var (
  132. noRune = []rune{}
  133. noNext = []uint32{mergeFailed}
  134. )
  135. func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
  136. leftLen := len(*leftRunes)
  137. rightLen := len(*rightRunes)
  138. if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
  139. panic("mergeRuneSets odd length []rune")
  140. }
  141. var (
  142. lx, rx int
  143. )
  144. merged := make([]rune, 0)
  145. next := make([]uint32, 0)
  146. ok := true
  147. defer func() {
  148. if !ok {
  149. merged = nil
  150. next = nil
  151. }
  152. }()
  153. ix := -1
  154. extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
  155. if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
  156. return false
  157. }
  158. merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
  159. *newLow += 2
  160. ix += 2
  161. next = append(next, pc)
  162. return true
  163. }
  164. for lx < leftLen || rx < rightLen {
  165. switch {
  166. case rx >= rightLen:
  167. ok = extend(&lx, leftRunes, leftPC)
  168. case lx >= leftLen:
  169. ok = extend(&rx, rightRunes, rightPC)
  170. case (*rightRunes)[rx] < (*leftRunes)[lx]:
  171. ok = extend(&rx, rightRunes, rightPC)
  172. default:
  173. ok = extend(&lx, leftRunes, leftPC)
  174. }
  175. if !ok {
  176. return noRune, noNext
  177. }
  178. }
  179. return merged, next
  180. }
  181. // cleanupOnePass drops working memory, and restores certain shortcut instructions.
  182. func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
  183. for ix, instOriginal := range original.Inst {
  184. switch instOriginal.Op {
  185. case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
  186. case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
  187. prog.Inst[ix].Next = nil
  188. case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
  189. prog.Inst[ix].Next = nil
  190. prog.Inst[ix] = onePassInst{Inst: instOriginal}
  191. }
  192. }
  193. }
  194. // onePassCopy creates a copy of the original Prog, as we'll be modifying it
  195. func onePassCopy(prog *syntax.Prog) *onePassProg {
  196. p := &onePassProg{
  197. Start: prog.Start,
  198. NumCap: prog.NumCap,
  199. }
  200. for _, inst := range prog.Inst {
  201. p.Inst = append(p.Inst, onePassInst{Inst: inst})
  202. }
  203. // rewrites one or more common Prog constructs that enable some otherwise
  204. // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
  205. // ip A, that points to ips B & C.
  206. // A:BC + B:DA => A:BC + B:CD
  207. // A:BC + B:DC => A:DC + B:DC
  208. for pc := range p.Inst {
  209. switch p.Inst[pc].Op {
  210. default:
  211. continue
  212. case syntax.InstAlt, syntax.InstAltMatch:
  213. // A:Bx + B:Ay
  214. p_A_Other := &p.Inst[pc].Out
  215. p_A_Alt := &p.Inst[pc].Arg
  216. // make sure a target is another Alt
  217. instAlt := p.Inst[*p_A_Alt]
  218. if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
  219. p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
  220. instAlt = p.Inst[*p_A_Alt]
  221. if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
  222. continue
  223. }
  224. }
  225. instOther := p.Inst[*p_A_Other]
  226. // Analyzing both legs pointing to Alts is for another day
  227. if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
  228. // too complicated
  229. continue
  230. }
  231. // simple empty transition loop
  232. // A:BC + B:DA => A:BC + B:DC
  233. p_B_Alt := &p.Inst[*p_A_Alt].Out
  234. p_B_Other := &p.Inst[*p_A_Alt].Arg
  235. patch := false
  236. if instAlt.Out == uint32(pc) {
  237. patch = true
  238. } else if instAlt.Arg == uint32(pc) {
  239. patch = true
  240. p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
  241. }
  242. if patch {
  243. *p_B_Alt = *p_A_Other
  244. }
  245. // empty transition to common target
  246. // A:BC + B:DC => A:DC + B:DC
  247. if *p_A_Other == *p_B_Alt {
  248. *p_A_Alt = *p_B_Other
  249. }
  250. }
  251. }
  252. return p
  253. }
  254. // runeSlice exists to permit sorting the case-folded rune sets.
  255. type runeSlice []rune
  256. func (p runeSlice) Len() int { return len(p) }
  257. func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
  258. func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
  259. // Sort is a convenience method.
  260. func (p runeSlice) Sort() {
  261. sort.Sort(p)
  262. }
  263. var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
  264. var anyRune = []rune{0, unicode.MaxRune}
  265. // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
  266. // the match engine can always tell which branch to take. The routine may modify
  267. // p if it is turned into a onepass Prog. If it isn't possible for this to be a
  268. // onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
  269. // to the size of the Prog.
  270. func makeOnePass(p *onePassProg) *onePassProg {
  271. // If the machine is very long, it's not worth the time to check if we can use one pass.
  272. if len(p.Inst) >= 1000 {
  273. return notOnePass
  274. }
  275. var (
  276. instQueue = newQueue(len(p.Inst))
  277. visitQueue = newQueue(len(p.Inst))
  278. build func(uint32, *queueOnePass)
  279. check func(uint32, map[uint32]bool) bool
  280. onePassRunes = make([][]rune, len(p.Inst))
  281. )
  282. build = func(pc uint32, q *queueOnePass) {
  283. if q.contains(pc) {
  284. return
  285. }
  286. inst := p.Inst[pc]
  287. switch inst.Op {
  288. case syntax.InstAlt, syntax.InstAltMatch:
  289. q.insert(inst.Out)
  290. build(inst.Out, q)
  291. q.insert(inst.Arg)
  292. case syntax.InstMatch, syntax.InstFail:
  293. default:
  294. q.insert(inst.Out)
  295. }
  296. }
  297. // check that paths from Alt instructions are unambiguous, and rebuild the new
  298. // program as a onepass program
  299. check = func(pc uint32, m map[uint32]bool) (ok bool) {
  300. ok = true
  301. inst := &p.Inst[pc]
  302. if visitQueue.contains(pc) {
  303. return
  304. }
  305. visitQueue.insert(pc)
  306. switch inst.Op {
  307. case syntax.InstAlt, syntax.InstAltMatch:
  308. ok = check(inst.Out, m) && check(inst.Arg, m)
  309. // check no-input paths to InstMatch
  310. matchOut := m[inst.Out]
  311. matchArg := m[inst.Arg]
  312. if matchOut && matchArg {
  313. ok = false
  314. break
  315. }
  316. // Match on empty goes in inst.Out
  317. if matchArg {
  318. inst.Out, inst.Arg = inst.Arg, inst.Out
  319. matchOut, matchArg = matchArg, matchOut
  320. }
  321. if matchOut {
  322. m[pc] = true
  323. inst.Op = syntax.InstAltMatch
  324. }
  325. // build a dispatch operator from the two legs of the alt.
  326. onePassRunes[pc], inst.Next = mergeRuneSets(
  327. &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
  328. if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
  329. ok = false
  330. break
  331. }
  332. case syntax.InstCapture, syntax.InstNop:
  333. ok = check(inst.Out, m)
  334. m[pc] = m[inst.Out]
  335. // pass matching runes back through these no-ops.
  336. onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
  337. inst.Next = []uint32{}
  338. for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
  339. inst.Next = append(inst.Next, inst.Out)
  340. }
  341. case syntax.InstEmptyWidth:
  342. ok = check(inst.Out, m)
  343. m[pc] = m[inst.Out]
  344. onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
  345. inst.Next = []uint32{}
  346. for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
  347. inst.Next = append(inst.Next, inst.Out)
  348. }
  349. case syntax.InstMatch, syntax.InstFail:
  350. m[pc] = inst.Op == syntax.InstMatch
  351. break
  352. case syntax.InstRune:
  353. ok = check(inst.Out, m)
  354. m[pc] = false
  355. if len(inst.Next) > 0 {
  356. break
  357. }
  358. if len(inst.Rune) == 0 {
  359. onePassRunes[pc] = []rune{}
  360. inst.Next = []uint32{inst.Out}
  361. break
  362. }
  363. runes := make([]rune, 0)
  364. if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
  365. r0 := inst.Rune[0]
  366. runes = append(runes, r0, r0)
  367. for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
  368. runes = append(runes, r1, r1)
  369. }
  370. sort.Sort(runeSlice(runes))
  371. } else {
  372. runes = append(runes, inst.Rune...)
  373. }
  374. onePassRunes[pc] = runes
  375. inst.Next = []uint32{}
  376. for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
  377. inst.Next = append(inst.Next, inst.Out)
  378. }
  379. inst.Op = syntax.InstRune
  380. case syntax.InstRune1:
  381. ok = check(inst.Out, m)
  382. m[pc] = false
  383. if len(inst.Next) > 0 {
  384. break
  385. }
  386. runes := []rune{}
  387. // expand case-folded runes
  388. if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
  389. r0 := inst.Rune[0]
  390. runes = append(runes, r0, r0)
  391. for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
  392. runes = append(runes, r1, r1)
  393. }
  394. sort.Sort(runeSlice(runes))
  395. } else {
  396. runes = append(runes, inst.Rune[0], inst.Rune[0])
  397. }
  398. onePassRunes[pc] = runes
  399. inst.Next = []uint32{}
  400. for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
  401. inst.Next = append(inst.Next, inst.Out)
  402. }
  403. inst.Op = syntax.InstRune
  404. case syntax.InstRuneAny:
  405. ok = check(inst.Out, m)
  406. m[pc] = false
  407. if len(inst.Next) > 0 {
  408. break
  409. }
  410. onePassRunes[pc] = append([]rune{}, anyRune...)
  411. inst.Next = []uint32{inst.Out}
  412. case syntax.InstRuneAnyNotNL:
  413. ok = check(inst.Out, m)
  414. m[pc] = false
  415. if len(inst.Next) > 0 {
  416. break
  417. }
  418. onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
  419. inst.Next = []uint32{}
  420. for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
  421. inst.Next = append(inst.Next, inst.Out)
  422. }
  423. }
  424. return
  425. }
  426. instQueue.clear()
  427. instQueue.insert(uint32(p.Start))
  428. m := make(map[uint32]bool, len(p.Inst))
  429. for !instQueue.empty() {
  430. pc := instQueue.next()
  431. inst := p.Inst[pc]
  432. visitQueue.clear()
  433. if !check(uint32(pc), m) {
  434. p = notOnePass
  435. break
  436. }
  437. switch inst.Op {
  438. case syntax.InstAlt, syntax.InstAltMatch:
  439. instQueue.insert(inst.Out)
  440. instQueue.insert(inst.Arg)
  441. case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop:
  442. instQueue.insert(inst.Out)
  443. case syntax.InstMatch:
  444. case syntax.InstFail:
  445. case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
  446. default:
  447. }
  448. }
  449. if p != notOnePass {
  450. for i := range p.Inst {
  451. p.Inst[i].Rune = onePassRunes[i]
  452. }
  453. }
  454. return p
  455. }
  456. // walk visits each Inst in the prog once, and applies the argument
  457. // function(ip, next), in pre-order.
  458. func walk(prog *syntax.Prog, funcs ...func(ip, next uint32)) {
  459. var walk1 func(uint32)
  460. progQueue := newQueue(len(prog.Inst))
  461. walk1 = func(ip uint32) {
  462. if progQueue.contains(ip) {
  463. return
  464. }
  465. progQueue.insert(ip)
  466. inst := prog.Inst[ip]
  467. switch inst.Op {
  468. case syntax.InstAlt, syntax.InstAltMatch:
  469. for _, f := range funcs {
  470. f(ip, inst.Out)
  471. f(ip, inst.Arg)
  472. }
  473. walk1(inst.Out)
  474. walk1(inst.Arg)
  475. default:
  476. for _, f := range funcs {
  477. f(ip, inst.Out)
  478. }
  479. walk1(inst.Out)
  480. }
  481. }
  482. walk1(uint32(prog.Start))
  483. }
  484. // find returns the Insts that match the argument predicate function
  485. func find(prog *syntax.Prog, f func(*syntax.Prog, int) bool) (matches []uint32) {
  486. matches = []uint32{}
  487. for ip := range prog.Inst {
  488. if f(prog, ip) {
  489. matches = append(matches, uint32(ip))
  490. }
  491. }
  492. return
  493. }
  494. var notOnePass *onePassProg = nil
  495. // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
  496. // can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the
  497. // Prog cannot be converted. For a one pass prog, the fundamental condition that must
  498. // be true is: at any InstAlt, there must be no ambiguity about what branch to take.
  499. func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
  500. if prog.Start == 0 {
  501. return notOnePass
  502. }
  503. // onepass regexp is anchored
  504. if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
  505. syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
  506. return notOnePass
  507. }
  508. // every instruction leading to InstMatch must be EmptyEndText
  509. for _, inst := range prog.Inst {
  510. opOut := prog.Inst[inst.Out].Op
  511. switch inst.Op {
  512. default:
  513. if opOut == syntax.InstMatch {
  514. return notOnePass
  515. }
  516. case syntax.InstAlt, syntax.InstAltMatch:
  517. if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
  518. return notOnePass
  519. }
  520. case syntax.InstEmptyWidth:
  521. if opOut == syntax.InstMatch {
  522. if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
  523. continue
  524. }
  525. return notOnePass
  526. }
  527. }
  528. }
  529. // Creates a slightly optimized copy of the original Prog
  530. // that cleans up some Prog idioms that block valid onepass programs
  531. p = onePassCopy(prog)
  532. // checkAmbiguity on InstAlts, build onepass Prog if possible
  533. p = makeOnePass(p)
  534. if p != notOnePass {
  535. cleanupOnePass(p, prog)
  536. }
  537. return p
  538. }