regexp.go 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121
  1. // Copyright 2009 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package regexp implements regular expression search.
  5. //
  6. // The syntax of the regular expressions accepted is the same
  7. // general syntax used by Perl, Python, and other languages.
  8. // More precisely, it is the syntax accepted by RE2 and described at
  9. // http://code.google.com/p/re2/wiki/Syntax, except for \C.
  10. // For an overview of the syntax, run
  11. // godoc regexp/syntax
  12. //
  13. // The regexp implementation provided by this package is
  14. // guaranteed to run in time linear in the size of the input.
  15. // (This is a property not guaranteed by most open source
  16. // implementations of regular expressions.) For more information
  17. // about this property, see
  18. // http://swtch.com/~rsc/regexp/regexp1.html
  19. // or any book about automata theory.
  20. //
  21. // All characters are UTF-8-encoded code points.
  22. //
  23. // There are 16 methods of Regexp that match a regular expression and identify
  24. // the matched text. Their names are matched by this regular expression:
  25. //
  26. // Find(All)?(String)?(Submatch)?(Index)?
  27. //
  28. // If 'All' is present, the routine matches successive non-overlapping
  29. // matches of the entire expression. Empty matches abutting a preceding
  30. // match are ignored. The return value is a slice containing the successive
  31. // return values of the corresponding non-'All' routine. These routines take
  32. // an extra integer argument, n; if n >= 0, the function returns at most n
  33. // matches/submatches.
  34. //
  35. // If 'String' is present, the argument is a string; otherwise it is a slice
  36. // of bytes; return values are adjusted as appropriate.
  37. //
  38. // If 'Submatch' is present, the return value is a slice identifying the
  39. // successive submatches of the expression. Submatches are matches of
  40. // parenthesized subexpressions (also known as capturing groups) within the
  41. // regular expression, numbered from left to right in order of opening
  42. // parenthesis. Submatch 0 is the match of the entire expression, submatch 1
  43. // the match of the first parenthesized subexpression, and so on.
  44. //
  45. // If 'Index' is present, matches and submatches are identified by byte index
  46. // pairs within the input string: result[2*n:2*n+1] identifies the indexes of
  47. // the nth submatch. The pair for n==0 identifies the match of the entire
  48. // expression. If 'Index' is not present, the match is identified by the
  49. // text of the match/submatch. If an index is negative, it means that
  50. // subexpression did not match any string in the input.
  51. //
  52. // There is also a subset of the methods that can be applied to text read
  53. // from a RuneReader:
  54. //
  55. // MatchReader, FindReaderIndex, FindReaderSubmatchIndex
  56. //
  57. // This set may grow. Note that regular expression matches may need to
  58. // examine text beyond the text returned by a match, so the methods that
  59. // match text from a RuneReader may read arbitrarily far into the input
  60. // before returning.
  61. //
  62. // (There are a few other methods that do not match this pattern.)
  63. //
  64. package regexp
  65. import (
  66. "bytes"
  67. "io"
  68. "regexp/syntax"
  69. "strconv"
  70. "strings"
  71. "sync"
  72. "unicode"
  73. "unicode/utf8"
  74. )
  75. var debug = false
  76. // Regexp is the representation of a compiled regular expression.
  77. // A Regexp is safe for concurrent use by multiple goroutines.
  78. type Regexp struct {
  79. // read-only after Compile
  80. expr string // as passed to Compile
  81. prog *syntax.Prog // compiled program
  82. onepass *onePassProg // onpass program or nil
  83. prefix string // required prefix in unanchored matches
  84. prefixBytes []byte // prefix, as a []byte
  85. prefixComplete bool // prefix is the entire regexp
  86. prefixRune rune // first rune in prefix
  87. prefixEnd uint32 // pc for last rune in prefix
  88. cond syntax.EmptyOp // empty-width conditions required at start of match
  89. numSubexp int
  90. subexpNames []string
  91. longest bool
  92. // cache of machines for running regexp
  93. mu sync.Mutex
  94. machine []*machine
  95. }
  96. // String returns the source text used to compile the regular expression.
  97. func (re *Regexp) String() string {
  98. return re.expr
  99. }
  100. // Compile parses a regular expression and returns, if successful,
  101. // a Regexp object that can be used to match against text.
  102. //
  103. // When matching against text, the regexp returns a match that
  104. // begins as early as possible in the input (leftmost), and among those
  105. // it chooses the one that a backtracking search would have found first.
  106. // This so-called leftmost-first matching is the same semantics
  107. // that Perl, Python, and other implementations use, although this
  108. // package implements it without the expense of backtracking.
  109. // For POSIX leftmost-longest matching, see CompilePOSIX.
  110. func Compile(expr string) (*Regexp, error) {
  111. return compile(expr, syntax.Perl, false)
  112. }
  113. // CompilePOSIX is like Compile but restricts the regular expression
  114. // to POSIX ERE (egrep) syntax and changes the match semantics to
  115. // leftmost-longest.
  116. //
  117. // That is, when matching against text, the regexp returns a match that
  118. // begins as early as possible in the input (leftmost), and among those
  119. // it chooses a match that is as long as possible.
  120. // This so-called leftmost-longest matching is the same semantics
  121. // that early regular expression implementations used and that POSIX
  122. // specifies.
  123. //
  124. // However, there can be multiple leftmost-longest matches, with different
  125. // submatch choices, and here this package diverges from POSIX.
  126. // Among the possible leftmost-longest matches, this package chooses
  127. // the one that a backtracking search would have found first, while POSIX
  128. // specifies that the match be chosen to maximize the length of the first
  129. // subexpression, then the second, and so on from left to right.
  130. // The POSIX rule is computationally prohibitive and not even well-defined.
  131. // See http://swtch.com/~rsc/regexp/regexp2.html#posix for details.
  132. func CompilePOSIX(expr string) (*Regexp, error) {
  133. return compile(expr, syntax.POSIX, true)
  134. }
  135. // Longest makes future searches prefer the leftmost-longest match.
  136. // That is, when matching against text, the regexp returns a match that
  137. // begins as early as possible in the input (leftmost), and among those
  138. // it chooses a match that is as long as possible.
  139. func (re *Regexp) Longest() {
  140. re.longest = true
  141. }
  142. func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
  143. re, err := syntax.Parse(expr, mode)
  144. if err != nil {
  145. return nil, err
  146. }
  147. maxCap := re.MaxCap()
  148. capNames := re.CapNames()
  149. re = re.Simplify()
  150. prog, err := syntax.Compile(re)
  151. if err != nil {
  152. return nil, err
  153. }
  154. regexp := &Regexp{
  155. expr: expr,
  156. prog: prog,
  157. onepass: compileOnePass(prog),
  158. numSubexp: maxCap,
  159. subexpNames: capNames,
  160. cond: prog.StartCond(),
  161. longest: longest,
  162. }
  163. if regexp.onepass == notOnePass {
  164. regexp.prefix, regexp.prefixComplete = prog.Prefix()
  165. } else {
  166. regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog)
  167. }
  168. if regexp.prefix != "" {
  169. // TODO(rsc): Remove this allocation by adding
  170. // IndexString to package bytes.
  171. regexp.prefixBytes = []byte(regexp.prefix)
  172. regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
  173. }
  174. return regexp, nil
  175. }
  176. // get returns a machine to use for matching re.
  177. // It uses the re's machine cache if possible, to avoid
  178. // unnecessary allocation.
  179. func (re *Regexp) get() *machine {
  180. re.mu.Lock()
  181. if n := len(re.machine); n > 0 {
  182. z := re.machine[n-1]
  183. re.machine = re.machine[:n-1]
  184. re.mu.Unlock()
  185. return z
  186. }
  187. re.mu.Unlock()
  188. z := progMachine(re.prog, re.onepass)
  189. z.re = re
  190. return z
  191. }
  192. // put returns a machine to the re's machine cache.
  193. // There is no attempt to limit the size of the cache, so it will
  194. // grow to the maximum number of simultaneous matches
  195. // run using re. (The cache empties when re gets garbage collected.)
  196. func (re *Regexp) put(z *machine) {
  197. re.mu.Lock()
  198. re.machine = append(re.machine, z)
  199. re.mu.Unlock()
  200. }
  201. // MustCompile is like Compile but panics if the expression cannot be parsed.
  202. // It simplifies safe initialization of global variables holding compiled regular
  203. // expressions.
  204. func MustCompile(str string) *Regexp {
  205. regexp, error := Compile(str)
  206. if error != nil {
  207. panic(`regexp: Compile(` + quote(str) + `): ` + error.Error())
  208. }
  209. return regexp
  210. }
  211. // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed.
  212. // It simplifies safe initialization of global variables holding compiled regular
  213. // expressions.
  214. func MustCompilePOSIX(str string) *Regexp {
  215. regexp, error := CompilePOSIX(str)
  216. if error != nil {
  217. panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error())
  218. }
  219. return regexp
  220. }
  221. func quote(s string) string {
  222. if strconv.CanBackquote(s) {
  223. return "`" + s + "`"
  224. }
  225. return strconv.Quote(s)
  226. }
  227. // NumSubexp returns the number of parenthesized subexpressions in this Regexp.
  228. func (re *Regexp) NumSubexp() int {
  229. return re.numSubexp
  230. }
  231. // SubexpNames returns the names of the parenthesized subexpressions
  232. // in this Regexp. The name for the first sub-expression is names[1],
  233. // so that if m is a match slice, the name for m[i] is SubexpNames()[i].
  234. // Since the Regexp as a whole cannot be named, names[0] is always
  235. // the empty string. The slice should not be modified.
  236. func (re *Regexp) SubexpNames() []string {
  237. return re.subexpNames
  238. }
  239. const endOfText rune = -1
  240. // input abstracts different representations of the input text. It provides
  241. // one-character lookahead.
  242. type input interface {
  243. step(pos int) (r rune, width int) // advance one rune
  244. canCheckPrefix() bool // can we look ahead without losing info?
  245. hasPrefix(re *Regexp) bool
  246. index(re *Regexp, pos int) int
  247. context(pos int) syntax.EmptyOp
  248. }
  249. // inputString scans a string.
  250. type inputString struct {
  251. str string
  252. }
  253. func (i *inputString) step(pos int) (rune, int) {
  254. if pos < len(i.str) {
  255. c := i.str[pos]
  256. if c < utf8.RuneSelf {
  257. return rune(c), 1
  258. }
  259. return utf8.DecodeRuneInString(i.str[pos:])
  260. }
  261. return endOfText, 0
  262. }
  263. func (i *inputString) canCheckPrefix() bool {
  264. return true
  265. }
  266. func (i *inputString) hasPrefix(re *Regexp) bool {
  267. return strings.HasPrefix(i.str, re.prefix)
  268. }
  269. func (i *inputString) index(re *Regexp, pos int) int {
  270. return strings.Index(i.str[pos:], re.prefix)
  271. }
  272. func (i *inputString) context(pos int) syntax.EmptyOp {
  273. r1, r2 := endOfText, endOfText
  274. if pos > 0 && pos <= len(i.str) {
  275. r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
  276. }
  277. if pos < len(i.str) {
  278. r2, _ = utf8.DecodeRuneInString(i.str[pos:])
  279. }
  280. return syntax.EmptyOpContext(r1, r2)
  281. }
  282. // inputBytes scans a byte slice.
  283. type inputBytes struct {
  284. str []byte
  285. }
  286. func (i *inputBytes) step(pos int) (rune, int) {
  287. if pos < len(i.str) {
  288. c := i.str[pos]
  289. if c < utf8.RuneSelf {
  290. return rune(c), 1
  291. }
  292. return utf8.DecodeRune(i.str[pos:])
  293. }
  294. return endOfText, 0
  295. }
  296. func (i *inputBytes) canCheckPrefix() bool {
  297. return true
  298. }
  299. func (i *inputBytes) hasPrefix(re *Regexp) bool {
  300. return bytes.HasPrefix(i.str, re.prefixBytes)
  301. }
  302. func (i *inputBytes) index(re *Regexp, pos int) int {
  303. return bytes.Index(i.str[pos:], re.prefixBytes)
  304. }
  305. func (i *inputBytes) context(pos int) syntax.EmptyOp {
  306. r1, r2 := endOfText, endOfText
  307. if pos > 0 && pos <= len(i.str) {
  308. r1, _ = utf8.DecodeLastRune(i.str[:pos])
  309. }
  310. if pos < len(i.str) {
  311. r2, _ = utf8.DecodeRune(i.str[pos:])
  312. }
  313. return syntax.EmptyOpContext(r1, r2)
  314. }
  315. // inputReader scans a RuneReader.
  316. type inputReader struct {
  317. r io.RuneReader
  318. atEOT bool
  319. pos int
  320. }
  321. func (i *inputReader) step(pos int) (rune, int) {
  322. if !i.atEOT && pos != i.pos {
  323. return endOfText, 0
  324. }
  325. r, w, err := i.r.ReadRune()
  326. if err != nil {
  327. i.atEOT = true
  328. return endOfText, 0
  329. }
  330. i.pos += w
  331. return r, w
  332. }
  333. func (i *inputReader) canCheckPrefix() bool {
  334. return false
  335. }
  336. func (i *inputReader) hasPrefix(re *Regexp) bool {
  337. return false
  338. }
  339. func (i *inputReader) index(re *Regexp, pos int) int {
  340. return -1
  341. }
  342. func (i *inputReader) context(pos int) syntax.EmptyOp {
  343. return 0
  344. }
  345. // LiteralPrefix returns a literal string that must begin any match
  346. // of the regular expression re. It returns the boolean true if the
  347. // literal string comprises the entire regular expression.
  348. func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
  349. return re.prefix, re.prefixComplete
  350. }
  351. // MatchReader reports whether the Regexp matches the text read by the
  352. // RuneReader.
  353. func (re *Regexp) MatchReader(r io.RuneReader) bool {
  354. return re.doExecute(r, nil, "", 0, 0) != nil
  355. }
  356. // MatchString reports whether the Regexp matches the string s.
  357. func (re *Regexp) MatchString(s string) bool {
  358. return re.doExecute(nil, nil, s, 0, 0) != nil
  359. }
  360. // Match reports whether the Regexp matches the byte slice b.
  361. func (re *Regexp) Match(b []byte) bool {
  362. return re.doExecute(nil, b, "", 0, 0) != nil
  363. }
  364. // MatchReader checks whether a textual regular expression matches the text
  365. // read by the RuneReader. More complicated queries need to use Compile and
  366. // the full Regexp interface.
  367. func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) {
  368. re, err := Compile(pattern)
  369. if err != nil {
  370. return false, err
  371. }
  372. return re.MatchReader(r), nil
  373. }
  374. // MatchString checks whether a textual regular expression
  375. // matches a string. More complicated queries need
  376. // to use Compile and the full Regexp interface.
  377. func MatchString(pattern string, s string) (matched bool, err error) {
  378. re, err := Compile(pattern)
  379. if err != nil {
  380. return false, err
  381. }
  382. return re.MatchString(s), nil
  383. }
  384. // Match checks whether a textual regular expression
  385. // matches a byte slice. More complicated queries need
  386. // to use Compile and the full Regexp interface.
  387. func Match(pattern string, b []byte) (matched bool, err error) {
  388. re, err := Compile(pattern)
  389. if err != nil {
  390. return false, err
  391. }
  392. return re.Match(b), nil
  393. }
  394. // ReplaceAllString returns a copy of src, replacing matches of the Regexp
  395. // with the replacement string repl. Inside repl, $ signs are interpreted as
  396. // in Expand, so for instance $1 represents the text of the first submatch.
  397. func (re *Regexp) ReplaceAllString(src, repl string) string {
  398. n := 2
  399. if strings.Index(repl, "$") >= 0 {
  400. n = 2 * (re.numSubexp + 1)
  401. }
  402. b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
  403. return re.expand(dst, repl, nil, src, match)
  404. })
  405. return string(b)
  406. }
  407. // ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp
  408. // with the replacement string repl. The replacement repl is substituted directly,
  409. // without using Expand.
  410. func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
  411. return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
  412. return append(dst, repl...)
  413. }))
  414. }
  415. // ReplaceAllStringFunc returns a copy of src in which all matches of the
  416. // Regexp have been replaced by the return value of function repl applied
  417. // to the matched substring. The replacement returned by repl is substituted
  418. // directly, without using Expand.
  419. func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
  420. b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
  421. return append(dst, repl(src[match[0]:match[1]])...)
  422. })
  423. return string(b)
  424. }
  425. func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
  426. lastMatchEnd := 0 // end position of the most recent match
  427. searchPos := 0 // position where we next look for a match
  428. var buf []byte
  429. var endPos int
  430. if bsrc != nil {
  431. endPos = len(bsrc)
  432. } else {
  433. endPos = len(src)
  434. }
  435. for searchPos <= endPos {
  436. a := re.doExecute(nil, bsrc, src, searchPos, nmatch)
  437. if len(a) == 0 {
  438. break // no more matches
  439. }
  440. // Copy the unmatched characters before this match.
  441. if bsrc != nil {
  442. buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
  443. } else {
  444. buf = append(buf, src[lastMatchEnd:a[0]]...)
  445. }
  446. // Now insert a copy of the replacement string, but not for a
  447. // match of the empty string immediately after another match.
  448. // (Otherwise, we get double replacement for patterns that
  449. // match both empty and nonempty strings.)
  450. if a[1] > lastMatchEnd || a[0] == 0 {
  451. buf = repl(buf, a)
  452. }
  453. lastMatchEnd = a[1]
  454. // Advance past this match; always advance at least one character.
  455. var width int
  456. if bsrc != nil {
  457. _, width = utf8.DecodeRune(bsrc[searchPos:])
  458. } else {
  459. _, width = utf8.DecodeRuneInString(src[searchPos:])
  460. }
  461. if searchPos+width > a[1] {
  462. searchPos += width
  463. } else if searchPos+1 > a[1] {
  464. // This clause is only needed at the end of the input
  465. // string. In that case, DecodeRuneInString returns width=0.
  466. searchPos++
  467. } else {
  468. searchPos = a[1]
  469. }
  470. }
  471. // Copy the unmatched characters after the last match.
  472. if bsrc != nil {
  473. buf = append(buf, bsrc[lastMatchEnd:]...)
  474. } else {
  475. buf = append(buf, src[lastMatchEnd:]...)
  476. }
  477. return buf
  478. }
  479. // ReplaceAll returns a copy of src, replacing matches of the Regexp
  480. // with the replacement text repl. Inside repl, $ signs are interpreted as
  481. // in Expand, so for instance $1 represents the text of the first submatch.
  482. func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
  483. n := 2
  484. if bytes.IndexByte(repl, '$') >= 0 {
  485. n = 2 * (re.numSubexp + 1)
  486. }
  487. srepl := ""
  488. b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
  489. if len(srepl) != len(repl) {
  490. srepl = string(repl)
  491. }
  492. return re.expand(dst, srepl, src, "", match)
  493. })
  494. return b
  495. }
  496. // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
  497. // with the replacement bytes repl. The replacement repl is substituted directly,
  498. // without using Expand.
  499. func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
  500. return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
  501. return append(dst, repl...)
  502. })
  503. }
  504. // ReplaceAllFunc returns a copy of src in which all matches of the
  505. // Regexp have been replaced by the return value of function repl applied
  506. // to the matched byte slice. The replacement returned by repl is substituted
  507. // directly, without using Expand.
  508. func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
  509. return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
  510. return append(dst, repl(src[match[0]:match[1]])...)
  511. })
  512. }
  513. var specialBytes = []byte(`\.+*?()|[]{}^$`)
  514. func special(b byte) bool {
  515. return bytes.IndexByte(specialBytes, b) >= 0
  516. }
  517. // QuoteMeta returns a string that quotes all regular expression metacharacters
  518. // inside the argument text; the returned string is a regular expression matching
  519. // the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
  520. func QuoteMeta(s string) string {
  521. b := make([]byte, 2*len(s))
  522. // A byte loop is correct because all metacharacters are ASCII.
  523. j := 0
  524. for i := 0; i < len(s); i++ {
  525. if special(s[i]) {
  526. b[j] = '\\'
  527. j++
  528. }
  529. b[j] = s[i]
  530. j++
  531. }
  532. return string(b[0:j])
  533. }
  534. // The number of capture values in the program may correspond
  535. // to fewer capturing expressions than are in the regexp.
  536. // For example, "(a){0}" turns into an empty program, so the
  537. // maximum capture in the program is 0 but we need to return
  538. // an expression for \1. Pad appends -1s to the slice a as needed.
  539. func (re *Regexp) pad(a []int) []int {
  540. if a == nil {
  541. // No match.
  542. return nil
  543. }
  544. n := (1 + re.numSubexp) * 2
  545. for len(a) < n {
  546. a = append(a, -1)
  547. }
  548. return a
  549. }
  550. // Find matches in slice b if b is non-nil, otherwise find matches in string s.
  551. func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
  552. var end int
  553. if b == nil {
  554. end = len(s)
  555. } else {
  556. end = len(b)
  557. }
  558. for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
  559. matches := re.doExecute(nil, b, s, pos, re.prog.NumCap)
  560. if len(matches) == 0 {
  561. break
  562. }
  563. accept := true
  564. if matches[1] == pos {
  565. // We've found an empty match.
  566. if matches[0] == prevMatchEnd {
  567. // We don't allow an empty match right
  568. // after a previous match, so ignore it.
  569. accept = false
  570. }
  571. var width int
  572. // TODO: use step()
  573. if b == nil {
  574. _, width = utf8.DecodeRuneInString(s[pos:end])
  575. } else {
  576. _, width = utf8.DecodeRune(b[pos:end])
  577. }
  578. if width > 0 {
  579. pos += width
  580. } else {
  581. pos = end + 1
  582. }
  583. } else {
  584. pos = matches[1]
  585. }
  586. prevMatchEnd = matches[1]
  587. if accept {
  588. deliver(re.pad(matches))
  589. i++
  590. }
  591. }
  592. }
  593. // Find returns a slice holding the text of the leftmost match in b of the regular expression.
  594. // A return value of nil indicates no match.
  595. func (re *Regexp) Find(b []byte) []byte {
  596. a := re.doExecute(nil, b, "", 0, 2)
  597. if a == nil {
  598. return nil
  599. }
  600. return b[a[0]:a[1]]
  601. }
  602. // FindIndex returns a two-element slice of integers defining the location of
  603. // the leftmost match in b of the regular expression. The match itself is at
  604. // b[loc[0]:loc[1]].
  605. // A return value of nil indicates no match.
  606. func (re *Regexp) FindIndex(b []byte) (loc []int) {
  607. a := re.doExecute(nil, b, "", 0, 2)
  608. if a == nil {
  609. return nil
  610. }
  611. return a[0:2]
  612. }
  613. // FindString returns a string holding the text of the leftmost match in s of the regular
  614. // expression. If there is no match, the return value is an empty string,
  615. // but it will also be empty if the regular expression successfully matches
  616. // an empty string. Use FindStringIndex or FindStringSubmatch if it is
  617. // necessary to distinguish these cases.
  618. func (re *Regexp) FindString(s string) string {
  619. a := re.doExecute(nil, nil, s, 0, 2)
  620. if a == nil {
  621. return ""
  622. }
  623. return s[a[0]:a[1]]
  624. }
  625. // FindStringIndex returns a two-element slice of integers defining the
  626. // location of the leftmost match in s of the regular expression. The match
  627. // itself is at s[loc[0]:loc[1]].
  628. // A return value of nil indicates no match.
  629. func (re *Regexp) FindStringIndex(s string) (loc []int) {
  630. a := re.doExecute(nil, nil, s, 0, 2)
  631. if a == nil {
  632. return nil
  633. }
  634. return a[0:2]
  635. }
  636. // FindReaderIndex returns a two-element slice of integers defining the
  637. // location of the leftmost match of the regular expression in text read from
  638. // the RuneReader. The match text was found in the input stream at
  639. // byte offset loc[0] through loc[1]-1.
  640. // A return value of nil indicates no match.
  641. func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
  642. a := re.doExecute(r, nil, "", 0, 2)
  643. if a == nil {
  644. return nil
  645. }
  646. return a[0:2]
  647. }
  648. // FindSubmatch returns a slice of slices holding the text of the leftmost
  649. // match of the regular expression in b and the matches, if any, of its
  650. // subexpressions, as defined by the 'Submatch' descriptions in the package
  651. // comment.
  652. // A return value of nil indicates no match.
  653. func (re *Regexp) FindSubmatch(b []byte) [][]byte {
  654. a := re.doExecute(nil, b, "", 0, re.prog.NumCap)
  655. if a == nil {
  656. return nil
  657. }
  658. ret := make([][]byte, 1+re.numSubexp)
  659. for i := range ret {
  660. if 2*i < len(a) && a[2*i] >= 0 {
  661. ret[i] = b[a[2*i]:a[2*i+1]]
  662. }
  663. }
  664. return ret
  665. }
  666. // Expand appends template to dst and returns the result; during the
  667. // append, Expand replaces variables in the template with corresponding
  668. // matches drawn from src. The match slice should have been returned by
  669. // FindSubmatchIndex.
  670. //
  671. // In the template, a variable is denoted by a substring of the form
  672. // $name or ${name}, where name is a non-empty sequence of letters,
  673. // digits, and underscores. A purely numeric name like $1 refers to
  674. // the submatch with the corresponding index; other names refer to
  675. // capturing parentheses named with the (?P<name>...) syntax. A
  676. // reference to an out of range or unmatched index or a name that is not
  677. // present in the regular expression is replaced with an empty slice.
  678. //
  679. // In the $name form, name is taken to be as long as possible: $1x is
  680. // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
  681. //
  682. // To insert a literal $ in the output, use $$ in the template.
  683. func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
  684. return re.expand(dst, string(template), src, "", match)
  685. }
  686. // ExpandString is like Expand but the template and source are strings.
  687. // It appends to and returns a byte slice in order to give the calling
  688. // code control over allocation.
  689. func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
  690. return re.expand(dst, template, nil, src, match)
  691. }
  692. func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
  693. for len(template) > 0 {
  694. i := strings.Index(template, "$")
  695. if i < 0 {
  696. break
  697. }
  698. dst = append(dst, template[:i]...)
  699. template = template[i:]
  700. if len(template) > 1 && template[1] == '$' {
  701. // Treat $$ as $.
  702. dst = append(dst, '$')
  703. template = template[2:]
  704. continue
  705. }
  706. name, num, rest, ok := extract(template)
  707. if !ok {
  708. // Malformed; treat $ as raw text.
  709. dst = append(dst, '$')
  710. template = template[1:]
  711. continue
  712. }
  713. template = rest
  714. if num >= 0 {
  715. if 2*num+1 < len(match) && match[2*num] >= 0 {
  716. if bsrc != nil {
  717. dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
  718. } else {
  719. dst = append(dst, src[match[2*num]:match[2*num+1]]...)
  720. }
  721. }
  722. } else {
  723. for i, namei := range re.subexpNames {
  724. if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
  725. if bsrc != nil {
  726. dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
  727. } else {
  728. dst = append(dst, src[match[2*i]:match[2*i+1]]...)
  729. }
  730. break
  731. }
  732. }
  733. }
  734. }
  735. dst = append(dst, template...)
  736. return dst
  737. }
  738. // extract returns the name from a leading "$name" or "${name}" in str.
  739. // If it is a number, extract returns num set to that number; otherwise num = -1.
  740. func extract(str string) (name string, num int, rest string, ok bool) {
  741. if len(str) < 2 || str[0] != '$' {
  742. return
  743. }
  744. brace := false
  745. if str[1] == '{' {
  746. brace = true
  747. str = str[2:]
  748. } else {
  749. str = str[1:]
  750. }
  751. i := 0
  752. for i < len(str) {
  753. rune, size := utf8.DecodeRuneInString(str[i:])
  754. if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' {
  755. break
  756. }
  757. i += size
  758. }
  759. if i == 0 {
  760. // empty name is not okay
  761. return
  762. }
  763. name = str[:i]
  764. if brace {
  765. if i >= len(str) || str[i] != '}' {
  766. // missing closing brace
  767. return
  768. }
  769. i++
  770. }
  771. // Parse number.
  772. num = 0
  773. for i := 0; i < len(name); i++ {
  774. if name[i] < '0' || '9' < name[i] || num >= 1e8 {
  775. num = -1
  776. break
  777. }
  778. num = num*10 + int(name[i]) - '0'
  779. }
  780. // Disallow leading zeros.
  781. if name[0] == '0' && len(name) > 1 {
  782. num = -1
  783. }
  784. rest = str[i:]
  785. ok = true
  786. return
  787. }
  788. // FindSubmatchIndex returns a slice holding the index pairs identifying the
  789. // leftmost match of the regular expression in b and the matches, if any, of
  790. // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
  791. // in the package comment.
  792. // A return value of nil indicates no match.
  793. func (re *Regexp) FindSubmatchIndex(b []byte) []int {
  794. return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap))
  795. }
  796. // FindStringSubmatch returns a slice of strings holding the text of the
  797. // leftmost match of the regular expression in s and the matches, if any, of
  798. // its subexpressions, as defined by the 'Submatch' description in the
  799. // package comment.
  800. // A return value of nil indicates no match.
  801. func (re *Regexp) FindStringSubmatch(s string) []string {
  802. a := re.doExecute(nil, nil, s, 0, re.prog.NumCap)
  803. if a == nil {
  804. return nil
  805. }
  806. ret := make([]string, 1+re.numSubexp)
  807. for i := range ret {
  808. if 2*i < len(a) && a[2*i] >= 0 {
  809. ret[i] = s[a[2*i]:a[2*i+1]]
  810. }
  811. }
  812. return ret
  813. }
  814. // FindStringSubmatchIndex returns a slice holding the index pairs
  815. // identifying the leftmost match of the regular expression in s and the
  816. // matches, if any, of its subexpressions, as defined by the 'Submatch' and
  817. // 'Index' descriptions in the package comment.
  818. // A return value of nil indicates no match.
  819. func (re *Regexp) FindStringSubmatchIndex(s string) []int {
  820. return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap))
  821. }
  822. // FindReaderSubmatchIndex returns a slice holding the index pairs
  823. // identifying the leftmost match of the regular expression of text read by
  824. // the RuneReader, and the matches, if any, of its subexpressions, as defined
  825. // by the 'Submatch' and 'Index' descriptions in the package comment. A
  826. // return value of nil indicates no match.
  827. func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
  828. return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap))
  829. }
  830. const startSize = 10 // The size at which to start a slice in the 'All' routines.
  831. // FindAll is the 'All' version of Find; it returns a slice of all successive
  832. // matches of the expression, as defined by the 'All' description in the
  833. // package comment.
  834. // A return value of nil indicates no match.
  835. func (re *Regexp) FindAll(b []byte, n int) [][]byte {
  836. if n < 0 {
  837. n = len(b) + 1
  838. }
  839. result := make([][]byte, 0, startSize)
  840. re.allMatches("", b, n, func(match []int) {
  841. result = append(result, b[match[0]:match[1]])
  842. })
  843. if len(result) == 0 {
  844. return nil
  845. }
  846. return result
  847. }
  848. // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
  849. // successive matches of the expression, as defined by the 'All' description
  850. // in the package comment.
  851. // A return value of nil indicates no match.
  852. func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
  853. if n < 0 {
  854. n = len(b) + 1
  855. }
  856. result := make([][]int, 0, startSize)
  857. re.allMatches("", b, n, func(match []int) {
  858. result = append(result, match[0:2])
  859. })
  860. if len(result) == 0 {
  861. return nil
  862. }
  863. return result
  864. }
  865. // FindAllString is the 'All' version of FindString; it returns a slice of all
  866. // successive matches of the expression, as defined by the 'All' description
  867. // in the package comment.
  868. // A return value of nil indicates no match.
  869. func (re *Regexp) FindAllString(s string, n int) []string {
  870. if n < 0 {
  871. n = len(s) + 1
  872. }
  873. result := make([]string, 0, startSize)
  874. re.allMatches(s, nil, n, func(match []int) {
  875. result = append(result, s[match[0]:match[1]])
  876. })
  877. if len(result) == 0 {
  878. return nil
  879. }
  880. return result
  881. }
  882. // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
  883. // slice of all successive matches of the expression, as defined by the 'All'
  884. // description in the package comment.
  885. // A return value of nil indicates no match.
  886. func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
  887. if n < 0 {
  888. n = len(s) + 1
  889. }
  890. result := make([][]int, 0, startSize)
  891. re.allMatches(s, nil, n, func(match []int) {
  892. result = append(result, match[0:2])
  893. })
  894. if len(result) == 0 {
  895. return nil
  896. }
  897. return result
  898. }
  899. // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
  900. // of all successive matches of the expression, as defined by the 'All'
  901. // description in the package comment.
  902. // A return value of nil indicates no match.
  903. func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
  904. if n < 0 {
  905. n = len(b) + 1
  906. }
  907. result := make([][][]byte, 0, startSize)
  908. re.allMatches("", b, n, func(match []int) {
  909. slice := make([][]byte, len(match)/2)
  910. for j := range slice {
  911. if match[2*j] >= 0 {
  912. slice[j] = b[match[2*j]:match[2*j+1]]
  913. }
  914. }
  915. result = append(result, slice)
  916. })
  917. if len(result) == 0 {
  918. return nil
  919. }
  920. return result
  921. }
  922. // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
  923. // a slice of all successive matches of the expression, as defined by the
  924. // 'All' description in the package comment.
  925. // A return value of nil indicates no match.
  926. func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
  927. if n < 0 {
  928. n = len(b) + 1
  929. }
  930. result := make([][]int, 0, startSize)
  931. re.allMatches("", b, n, func(match []int) {
  932. result = append(result, match)
  933. })
  934. if len(result) == 0 {
  935. return nil
  936. }
  937. return result
  938. }
  939. // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
  940. // returns a slice of all successive matches of the expression, as defined by
  941. // the 'All' description in the package comment.
  942. // A return value of nil indicates no match.
  943. func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
  944. if n < 0 {
  945. n = len(s) + 1
  946. }
  947. result := make([][]string, 0, startSize)
  948. re.allMatches(s, nil, n, func(match []int) {
  949. slice := make([]string, len(match)/2)
  950. for j := range slice {
  951. if match[2*j] >= 0 {
  952. slice[j] = s[match[2*j]:match[2*j+1]]
  953. }
  954. }
  955. result = append(result, slice)
  956. })
  957. if len(result) == 0 {
  958. return nil
  959. }
  960. return result
  961. }
  962. // FindAllStringSubmatchIndex is the 'All' version of
  963. // FindStringSubmatchIndex; it returns a slice of all successive matches of
  964. // the expression, as defined by the 'All' description in the package
  965. // comment.
  966. // A return value of nil indicates no match.
  967. func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
  968. if n < 0 {
  969. n = len(s) + 1
  970. }
  971. result := make([][]int, 0, startSize)
  972. re.allMatches(s, nil, n, func(match []int) {
  973. result = append(result, match)
  974. })
  975. if len(result) == 0 {
  976. return nil
  977. }
  978. return result
  979. }
  980. // Split slices s into substrings separated by the expression and returns a slice of
  981. // the substrings between those expression matches.
  982. //
  983. // The slice returned by this method consists of all the substrings of s
  984. // not contained in the slice returned by FindAllString. When called on an expression
  985. // that contains no metacharacters, it is equivalent to strings.SplitN.
  986. //
  987. // Example:
  988. // s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5)
  989. // // s: ["", "b", "b", "c", "cadaaae"]
  990. //
  991. // The count determines the number of substrings to return:
  992. // n > 0: at most n substrings; the last substring will be the unsplit remainder.
  993. // n == 0: the result is nil (zero substrings)
  994. // n < 0: all substrings
  995. func (re *Regexp) Split(s string, n int) []string {
  996. if n == 0 {
  997. return nil
  998. }
  999. if len(re.expr) > 0 && len(s) == 0 {
  1000. return []string{""}
  1001. }
  1002. matches := re.FindAllStringIndex(s, n)
  1003. strings := make([]string, 0, len(matches))
  1004. beg := 0
  1005. end := 0
  1006. for _, match := range matches {
  1007. if n > 0 && len(strings) >= n-1 {
  1008. break
  1009. }
  1010. end = match[0]
  1011. if match[1] != 0 {
  1012. strings = append(strings, s[beg:end])
  1013. }
  1014. beg = match[1]
  1015. }
  1016. if end != len(s) {
  1017. strings = append(strings, s[beg:])
  1018. }
  1019. return strings
  1020. }