123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176 |
- #!/usr/bin/ruby
- #
- ## https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
- #
- # Translation of the Python example.
- func match_length(Array S, idx1, idx2) {
- if (idx1 == idx2) {
- return (S.len - idx1)
- }
- var match_count = 0
- while ((idx1 < S.len) && (idx2 < S.len) && (S[idx1] == S[idx2])) {
- ++match_count
- ++idx1
- ++idx2
- }
- return match_count
- }
- func fundamental_preprocess(Array S) {
- if (S.len == 0) { # Handles case of empty string
- return []
- }
- if (S.len == 1) { # Handles case of single-character string
- return [1]
- }
- var z = S.len.of(0)
- z[0] = S.len
- z[1] = match_length(S, 0, 1)
- for i in (2 .. z[1]) { # Optimization from exercise 1-5
- z[i] = (z[1] - i + 1)
- }
- # Defines lower and upper limits of z-box
- var l = 0
- var r = 0
- for i in (2+z[1] .. S.end) {
- if (i <= r) { # i falls within existing z-box
- var k = i-l
- var b = z[k]
- var a = (r - i + 1)
- if (b < a) { # b ends within existing z-box
- z[i] = b
- }
- else { # b ends at or after the end of the z-box, we need to do an explicit match to the right of the z-box
- z[i] = a+match_length(S, a, r+1)
- l = i
- r = (i + z[i] - 1)
- }
- }
- else { # i does not reside within existing z-box
- z[i] = match_length(S, 0, i)
- if (z[i] > 0) {
- l = i
- r = (i + z[i] - 1)
- }
- }
- }
- return z
- }
- func bad_character_table(Array S) {
- if (S.len == 0) {
- return 256.of { [] }
- }
- var R = 256.of { [-1] }
- var alpha = 256.of(-1)
- S.each_kv { |i,c|
- alpha[c] = i
- alpha.each_kv { |j, a|
- R[j].append(a)
- }
- }
- return R
- }
- func good_suffix_table(S) {
- var L = S.len.of(-1)
- var N = fundamental_preprocess(S.flip).flip
- for j in (0 .. S.end) { # should the range be exclusive?
- var i = (S.len - N[j])
- if (i != S.len) {
- L[i] = j
- }
- }
- return L
- }
- func full_shift_table(S) {
- var F = S.len.of(0)
- var Z = fundamental_preprocess(S)
- var longest = 0
- Z.flip.each_kv { |i,zv|
- longest = (zv == i+1 ? (zv `max` longest) : longest)
- F[-i - 1] = longest
- }
- return F
- }
- func string_search(Array P, Array T) {
- if ((P.len == 0) || (T.len == 0) || (T.len < P.len)) {
- return []
- }
- var matches = []
- # Preprocessing
- var R = bad_character_table(P)
- var L = good_suffix_table(P)
- var F = full_shift_table(P)
- var k = P.end # Represents alignment of end of P relative to T
- var previous_k = -1 # Represents alignment in previous phase (Galil's rule)
- while (k < T.len) {
- var i = P.end # Character to compare in P
- var h = k # Character to compare in T
- while ((i >= 0) && (h > previous_k) && (P[i] == T[h])) { # Matches starting from end of P
- --i
- --h
- }
- if ((i == -1) || (h == previous_k)) { # Match has been found (Galil's rule)
- matches.append(k - P.len + 1)
- k += (P.len > 1 ? (P.len - F[1]) : 1)
- }
- else { # No match, shift by max of bad character and good suffix rules
- var char_shift = (i - R[T[h]][i])
- var suffix_shift = given (i+1) { |t|
- case (t == P.len) { 1 } # Mismatch happened on first attempt
- case (L[t] == -1) { P.len - F[t] } # Matched suffix does not appear anywhere in P
- default { P.len - L[t] } # Matched suffix appears in P
- }
- var shift = (char_shift `max` suffix_shift)
- previous_k = k if (shift >= i+1) # Galil's rule
- k += shift
- }
- }
- return matches
- }
- func string_search(String P, String T) {
- string_search(P.bytes, T.bytes)
- }
- say(string_search("bar", "foobartestbarend")) #=> [3, 10]
- say(string_search("bar", "foo-bar-test-bar-end")) #=> [4, 13]
- say(string_search("Bar", "foo-bar-test-Bar-end-Bar")) #=> [13, 21]
- say(string_search("-test-", "foo-bar-test-Bar-end-Bar")) #=> [7]
- say(string_search("-Bar-end", "foo-Bar-test-Bar-end-Bar")) #=> [12]
- say(string_search("-Bar", "foo-Bar-test-Bar-end-Bar")) #=> [3, 12, 20]
|