// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package syntax

import "testing"

  6. var simplifyTests = []struct {
  7. Regexp string
  8. Simple string
  9. }{
  10. // Already-simple constructs
  11. {`a`, `a`},
  12. {`ab`, `ab`},
  13. {`a|b`, `[a-b]`},
  14. {`ab|cd`, `ab|cd`},
  15. {`(ab)*`, `(ab)*`},
  16. {`(ab)+`, `(ab)+`},
  17. {`(ab)?`, `(ab)?`},
  18. {`.`, `(?s:.)`},
  19. {`^`, `^`},
  20. {`$`, `$`},
  21. {`[ac]`, `[ac]`},
  22. {`[^ac]`, `[^ac]`},
  23. // Posix character classes
  24. {`[[:alnum:]]`, `[0-9A-Za-z]`},
  25. {`[[:alpha:]]`, `[A-Za-z]`},
  26. {`[[:blank:]]`, `[\t ]`},
  27. {`[[:cntrl:]]`, `[\x00-\x1f\x7f]`},
  28. {`[[:digit:]]`, `[0-9]`},
  29. {`[[:graph:]]`, `[!-~]`},
  30. {`[[:lower:]]`, `[a-z]`},
  31. {`[[:print:]]`, `[ -~]`},
  32. {`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"},
  33. {`[[:space:]]`, `[\t-\r ]`},
  34. {`[[:upper:]]`, `[A-Z]`},
  35. {`[[:xdigit:]]`, `[0-9A-Fa-f]`},
  36. // Perl character classes
  37. {`\d`, `[0-9]`},
  38. {`\s`, `[\t-\n\f-\r ]`},
  39. {`\w`, `[0-9A-Z_a-z]`},
  40. {`\D`, `[^0-9]`},
  41. {`\S`, `[^\t-\n\f-\r ]`},
  42. {`\W`, `[^0-9A-Z_a-z]`},
  43. {`[\d]`, `[0-9]`},
  44. {`[\s]`, `[\t-\n\f-\r ]`},
  45. {`[\w]`, `[0-9A-Z_a-z]`},
  46. {`[\D]`, `[^0-9]`},
  47. {`[\S]`, `[^\t-\n\f-\r ]`},
  48. {`[\W]`, `[^0-9A-Z_a-z]`},
  49. // Posix repetitions
  50. {`a{1}`, `a`},
  51. {`a{2}`, `aa`},
  52. {`a{5}`, `aaaaa`},
  53. {`a{0,1}`, `a?`},
  54. // The next three are illegible because Simplify inserts (?:)
  55. // parens instead of () parens to avoid creating extra
  56. // captured subexpressions. The comments show a version with fewer parens.
  57. {`(a){0,2}`, `(?:(a)(a)?)?`}, // (aa?)?
  58. {`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // (a(a(aa?)?)?)?
  59. {`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)?
  60. {`a{0,2}`, `(?:aa?)?`}, // (aa?)?
  61. {`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`}, // (a(a(aa?)?)?)?
  62. {`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`}, // aa(a(a(aa?)?)?)?
  63. {`a{0,}`, `a*`},
  64. {`a{1,}`, `a+`},
  65. {`a{2,}`, `aa+`},
  66. {`a{5,}`, `aaaaa+`},
  67. // Test that operators simplify their arguments.
  68. {`(?:a{1,}){1,}`, `a+`},
  69. {`(a{1,}b{1,})`, `(a+b+)`},
  70. {`a{1,}|b{1,}`, `a+|b+`},
  71. {`(?:a{1,})*`, `(?:a+)*`},
  72. {`(?:a{1,})+`, `a+`},
  73. {`(?:a{1,})?`, `(?:a+)?`},
  74. {``, `(?:)`},
  75. {`a{0}`, `(?:)`},
  76. // Character class simplification
  77. {`[ab]`, `[a-b]`},
  78. {`[a-za-za-z]`, `[a-z]`},
  79. {`[A-Za-zA-Za-z]`, `[A-Za-z]`},
  80. {`[ABCDEFGH]`, `[A-H]`},
  81. {`[AB-CD-EF-GH]`, `[A-H]`},
  82. {`[W-ZP-XE-R]`, `[E-Z]`},
  83. {`[a-ee-gg-m]`, `[a-m]`},
  84. {`[a-ea-ha-m]`, `[a-m]`},
  85. {`[a-ma-ha-e]`, `[a-m]`},
  86. {`[a-zA-Z0-9 -~]`, `[ -~]`},
  87. // Empty character classes
  88. {`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`},
  89. // Full character classes
  90. {`[[:cntrl:][:^cntrl:]]`, `(?s:.)`},
  91. // Unicode case folding.
  92. {`(?i)A`, `(?i:A)`},
  93. {`(?i)a`, `(?i:A)`},
  94. {`(?i)[A]`, `(?i:A)`},
  95. {`(?i)[a]`, `(?i:A)`},
  96. {`(?i)K`, `(?i:K)`},
  97. {`(?i)k`, `(?i:K)`},
  98. {`(?i)\x{212a}`, "(?i:K)"},
  99. {`(?i)[K]`, "[Kk\u212A]"},
  100. {`(?i)[k]`, "[Kk\u212A]"},
  101. {`(?i)[\x{212a}]`, "[Kk\u212A]"},
  102. {`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"},
  103. {`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"},
  104. {`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`},
  105. // Empty string as a regular expression.
  106. // The empty string must be preserved inside parens in order
  107. // to make submatches work right, so these tests are less
  108. // interesting than they might otherwise be. String inserts
  109. // explicit (?:) in place of non-parenthesized empty strings,
  110. // to make them easier to spot for other parsers.
  111. {`(a|b|)`, `([a-b]|(?:))`},
  112. {`(|)`, `()`},
  113. {`a()`, `a()`},
  114. {`(()|())`, `(()|())`},
  115. {`(a|)`, `(a|(?:))`},
  116. {`ab()cd()`, `ab()cd()`},
  117. {`()`, `()`},
  118. {`()*`, `()*`},
  119. {`()+`, `()+`},
  120. {`()?`, `()?`},
  121. {`(){0}`, `(?:)`},
  122. {`(){1}`, `()`},
  123. {`(){1,}`, `()+`},
  124. {`(){0,2}`, `(?:()()?)?`},
  125. }
  126. func TestSimplify(t *testing.T) {
  127. for _, tt := range simplifyTests {
  128. re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine)
  129. if err != nil {
  130. t.Errorf("Parse(%#q) = error %v", tt.Regexp, err)
  131. continue
  132. }
  133. s := re.Simplify().String()
  134. if s != tt.Simple {
  135. t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple)
  136. }
  137. }
  138. }