prog.go 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package syntax
  5. import (
  6. "bytes"
  7. "strconv"
  8. "unicode"
  9. )
  10. // Compiled program.
  11. // May not belong in this package, but convenient for now.
  12. // A Prog is a compiled regular expression program.
  13. type Prog struct {
  14. Inst []Inst
  15. Start int // index of start instruction
  16. NumCap int // number of InstCapture insts in re
  17. }
  18. // An InstOp is an instruction opcode.
  19. type InstOp uint8
  20. const (
  21. InstAlt InstOp = iota
  22. InstAltMatch
  23. InstCapture
  24. InstEmptyWidth
  25. InstMatch
  26. InstFail
  27. InstNop
  28. InstRune
  29. InstRune1
  30. InstRuneAny
  31. InstRuneAnyNotNL
  32. )
  33. var instOpNames = []string{
  34. "InstAlt",
  35. "InstAltMatch",
  36. "InstCapture",
  37. "InstEmptyWidth",
  38. "InstMatch",
  39. "InstFail",
  40. "InstNop",
  41. "InstRune",
  42. "InstRune1",
  43. "InstRuneAny",
  44. "InstRuneAnyNotNL",
  45. }
  46. func (i InstOp) String() string {
  47. if uint(i) >= uint(len(instOpNames)) {
  48. return ""
  49. }
  50. return instOpNames[i]
  51. }
  52. // An EmptyOp specifies a kind or mixture of zero-width assertions.
  53. type EmptyOp uint8
  54. const (
  55. EmptyBeginLine EmptyOp = 1 << iota
  56. EmptyEndLine
  57. EmptyBeginText
  58. EmptyEndText
  59. EmptyWordBoundary
  60. EmptyNoWordBoundary
  61. )
  62. // EmptyOpContext returns the zero-width assertions
  63. // satisfied at the position between the runes r1 and r2.
  64. // Passing r1 == -1 indicates that the position is
  65. // at the beginning of the text.
  66. // Passing r2 == -1 indicates that the position is
  67. // at the end of the text.
  68. func EmptyOpContext(r1, r2 rune) EmptyOp {
  69. var op EmptyOp = EmptyNoWordBoundary
  70. var boundary byte
  71. switch {
  72. case IsWordChar(r1):
  73. boundary = 1
  74. case r1 == '\n':
  75. op |= EmptyBeginLine
  76. case r1 < 0:
  77. op |= EmptyBeginText | EmptyBeginLine
  78. }
  79. switch {
  80. case IsWordChar(r2):
  81. boundary ^= 1
  82. case r2 == '\n':
  83. op |= EmptyEndLine
  84. case r2 < 0:
  85. op |= EmptyEndText | EmptyEndLine
  86. }
  87. if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
  88. op ^= (EmptyWordBoundary | EmptyNoWordBoundary)
  89. }
  90. return op
  91. }
  92. // IsWordChar reports whether r is consider a ``word character''
  93. // during the evaluation of the \b and \B zero-width assertions.
  94. // These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
  95. func IsWordChar(r rune) bool {
  96. return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
  97. }
  98. // An Inst is a single instruction in a regular expression program.
  99. type Inst struct {
  100. Op InstOp
  101. Out uint32 // all but InstMatch, InstFail
  102. Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
  103. Rune []rune
  104. }
  105. func (p *Prog) String() string {
  106. var b bytes.Buffer
  107. dumpProg(&b, p)
  108. return b.String()
  109. }
  110. // skipNop follows any no-op or capturing instructions
  111. // and returns the resulting pc.
  112. func (p *Prog) skipNop(pc uint32) (*Inst, uint32) {
  113. i := &p.Inst[pc]
  114. for i.Op == InstNop || i.Op == InstCapture {
  115. pc = i.Out
  116. i = &p.Inst[pc]
  117. }
  118. return i, pc
  119. }
  120. // op returns i.Op but merges all the Rune special cases into InstRune
  121. func (i *Inst) op() InstOp {
  122. op := i.Op
  123. switch op {
  124. case InstRune1, InstRuneAny, InstRuneAnyNotNL:
  125. op = InstRune
  126. }
  127. return op
  128. }
  129. // Prefix returns a literal string that all matches for the
  130. // regexp must start with. Complete is true if the prefix
  131. // is the entire match.
  132. func (p *Prog) Prefix() (prefix string, complete bool) {
  133. i, _ := p.skipNop(uint32(p.Start))
  134. // Avoid allocation of buffer if prefix is empty.
  135. if i.op() != InstRune || len(i.Rune) != 1 {
  136. return "", i.Op == InstMatch
  137. }
  138. // Have prefix; gather characters.
  139. var buf bytes.Buffer
  140. for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
  141. buf.WriteRune(i.Rune[0])
  142. i, _ = p.skipNop(i.Out)
  143. }
  144. return buf.String(), i.Op == InstMatch
  145. }
  146. // StartCond returns the leading empty-width conditions that must
  147. // be true in any match. It returns ^EmptyOp(0) if no matches are possible.
  148. func (p *Prog) StartCond() EmptyOp {
  149. var flag EmptyOp
  150. pc := uint32(p.Start)
  151. i := &p.Inst[pc]
  152. Loop:
  153. for {
  154. switch i.Op {
  155. case InstEmptyWidth:
  156. flag |= EmptyOp(i.Arg)
  157. case InstFail:
  158. return ^EmptyOp(0)
  159. case InstCapture, InstNop:
  160. // skip
  161. default:
  162. break Loop
  163. }
  164. pc = i.Out
  165. i = &p.Inst[pc]
  166. }
  167. return flag
  168. }
  169. const noMatch = -1
  170. // MatchRune returns true if the instruction matches (and consumes) r.
  171. // It should only be called when i.Op == InstRune.
  172. func (i *Inst) MatchRune(r rune) bool {
  173. return i.MatchRunePos(r) != noMatch
  174. }
  175. // MatchRunePos checks whether the instruction matches (and consumes) r.
  176. // If so, MatchRunePos returns the index of the matching rune pair
  177. // (or, when len(i.Rune) == 1, rune singleton).
  178. // If not, MatchRunePos returns -1.
  179. // MatchRunePos should only be called when i.Op == InstRune.
  180. func (i *Inst) MatchRunePos(r rune) int {
  181. rune := i.Rune
  182. // Special case: single-rune slice is from literal string, not char class.
  183. if len(rune) == 1 {
  184. r0 := rune[0]
  185. if r == r0 {
  186. return 0
  187. }
  188. if Flags(i.Arg)&FoldCase != 0 {
  189. for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
  190. if r == r1 {
  191. return 0
  192. }
  193. }
  194. }
  195. return noMatch
  196. }
  197. // Peek at the first few pairs.
  198. // Should handle ASCII well.
  199. for j := 0; j < len(rune) && j <= 8; j += 2 {
  200. if r < rune[j] {
  201. return noMatch
  202. }
  203. if r <= rune[j+1] {
  204. return j / 2
  205. }
  206. }
  207. // Otherwise binary search.
  208. lo := 0
  209. hi := len(rune) / 2
  210. for lo < hi {
  211. m := lo + (hi-lo)/2
  212. if c := rune[2*m]; c <= r {
  213. if r <= rune[2*m+1] {
  214. return m
  215. }
  216. lo = m + 1
  217. } else {
  218. hi = m
  219. }
  220. }
  221. return noMatch
  222. }
  223. // As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char.
  224. // Since we act on runes, it would be easy to support Unicode here.
  225. func wordRune(r rune) bool {
  226. return r == '_' ||
  227. ('A' <= r && r <= 'Z') ||
  228. ('a' <= r && r <= 'z') ||
  229. ('0' <= r && r <= '9')
  230. }
  231. // MatchEmptyWidth returns true if the instruction matches
  232. // an empty string between the runes before and after.
  233. // It should only be called when i.Op == InstEmptyWidth.
  234. func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
  235. switch EmptyOp(i.Arg) {
  236. case EmptyBeginLine:
  237. return before == '\n' || before == -1
  238. case EmptyEndLine:
  239. return after == '\n' || after == -1
  240. case EmptyBeginText:
  241. return before == -1
  242. case EmptyEndText:
  243. return after == -1
  244. case EmptyWordBoundary:
  245. return wordRune(before) != wordRune(after)
  246. case EmptyNoWordBoundary:
  247. return wordRune(before) == wordRune(after)
  248. }
  249. panic("unknown empty width arg")
  250. }
  251. func (i *Inst) String() string {
  252. var b bytes.Buffer
  253. dumpInst(&b, i)
  254. return b.String()
  255. }
  256. func bw(b *bytes.Buffer, args ...string) {
  257. for _, s := range args {
  258. b.WriteString(s)
  259. }
  260. }
  261. func dumpProg(b *bytes.Buffer, p *Prog) {
  262. for j := range p.Inst {
  263. i := &p.Inst[j]
  264. pc := strconv.Itoa(j)
  265. if len(pc) < 3 {
  266. b.WriteString(" "[len(pc):])
  267. }
  268. if j == p.Start {
  269. pc += "*"
  270. }
  271. bw(b, pc, "\t")
  272. dumpInst(b, i)
  273. bw(b, "\n")
  274. }
  275. }
  276. func u32(i uint32) string {
  277. return strconv.FormatUint(uint64(i), 10)
  278. }
  279. func dumpInst(b *bytes.Buffer, i *Inst) {
  280. switch i.Op {
  281. case InstAlt:
  282. bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
  283. case InstAltMatch:
  284. bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
  285. case InstCapture:
  286. bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
  287. case InstEmptyWidth:
  288. bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
  289. case InstMatch:
  290. bw(b, "match")
  291. case InstFail:
  292. bw(b, "fail")
  293. case InstNop:
  294. bw(b, "nop -> ", u32(i.Out))
  295. case InstRune:
  296. if i.Rune == nil {
  297. // shouldn't happen
  298. bw(b, "rune <nil>")
  299. }
  300. bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
  301. if Flags(i.Arg)&FoldCase != 0 {
  302. bw(b, "/i")
  303. }
  304. bw(b, " -> ", u32(i.Out))
  305. case InstRune1:
  306. bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
  307. case InstRuneAny:
  308. bw(b, "any -> ", u32(i.Out))
  309. case InstRuneAnyNotNL:
  310. bw(b, "anynotnl -> ", u32(i.Out))
  311. }
  312. }