boyer-moore_string_search_algorithm.sf 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176
  1. #!/usr/bin/ruby
  2. #
  3. ## https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  4. #
  5. # Translation of the Python example.
  6. func match_length(Array S, idx1, idx2) {
  7. if (idx1 == idx2) {
  8. return (S.len - idx1)
  9. }
  10. var match_count = 0
  11. while ((idx1 < S.len) && (idx2 < S.len) && (S[idx1] == S[idx2])) {
  12. ++match_count
  13. ++idx1
  14. ++idx2
  15. }
  16. return match_count
  17. }
  18. func fundamental_preprocess(Array S) {
  19. if (S.len == 0) { # Handles case of empty string
  20. return []
  21. }
  22. if (S.len == 1) { # Handles case of single-character string
  23. return [1]
  24. }
  25. var z = S.len.of(0)
  26. z[0] = S.len
  27. z[1] = match_length(S, 0, 1)
  28. for i in (2 .. z[1]) { # Optimization from exercise 1-5
  29. z[i] = (z[1] - i + 1)
  30. }
  31. # Defines lower and upper limits of z-box
  32. var l = 0
  33. var r = 0
  34. for i in (2+z[1] .. S.end) {
  35. if (i <= r) { # i falls within existing z-box
  36. var k = i-l
  37. var b = z[k]
  38. var a = (r - i + 1)
  39. if (b < a) { # b ends within existing z-box
  40. z[i] = b
  41. }
  42. else { # b ends at or after the end of the z-box, we need to do an explicit match to the right of the z-box
  43. z[i] = a+match_length(S, a, r+1)
  44. l = i
  45. r = (i + z[i] - 1)
  46. }
  47. }
  48. else { # i does not reside within existing z-box
  49. z[i] = match_length(S, 0, i)
  50. if (z[i] > 0) {
  51. l = i
  52. r = (i + z[i] - 1)
  53. }
  54. }
  55. }
  56. return z
  57. }
  58. func bad_character_table(Array S) {
  59. if (S.len == 0) {
  60. return 256.of { [] }
  61. }
  62. var R = 256.of { [-1] }
  63. var alpha = 256.of(-1)
  64. S.each_kv { |i,c|
  65. alpha[c] = i
  66. alpha.each_kv { |j, a|
  67. R[j].append(a)
  68. }
  69. }
  70. return R
  71. }
  72. func good_suffix_table(S) {
  73. var L = S.len.of(-1)
  74. var N = fundamental_preprocess(S.flip).flip
  75. for j in (0 .. S.end) { # should the range be exclusive?
  76. var i = (S.len - N[j])
  77. if (i != S.len) {
  78. L[i] = j
  79. }
  80. }
  81. return L
  82. }
  83. func full_shift_table(S) {
  84. var F = S.len.of(0)
  85. var Z = fundamental_preprocess(S)
  86. var longest = 0
  87. Z.flip.each_kv { |i,zv|
  88. longest = (zv == i+1 ? (zv `max` longest) : longest)
  89. F[-i - 1] = longest
  90. }
  91. return F
  92. }
  93. func string_search(Array P, Array T) {
  94. if ((P.len == 0) || (T.len == 0) || (T.len < P.len)) {
  95. return []
  96. }
  97. var matches = []
  98. # Preprocessing
  99. var R = bad_character_table(P)
  100. var L = good_suffix_table(P)
  101. var F = full_shift_table(P)
  102. var k = P.end # Represents alignment of end of P relative to T
  103. var previous_k = -1 # Represents alignment in previous phase (Galil's rule)
  104. while (k < T.len) {
  105. var i = P.end # Character to compare in P
  106. var h = k # Character to compare in T
  107. while ((i >= 0) && (h > previous_k) && (P[i] == T[h])) { # Matches starting from end of P
  108. --i
  109. --h
  110. }
  111. if ((i == -1) || (h == previous_k)) { # Match has been found (Galil's rule)
  112. matches.append(k - P.len + 1)
  113. k += (P.len > 1 ? (P.len - F[1]) : 1)
  114. }
  115. else { # No match, shift by max of bad character and good suffix rules
  116. var char_shift = (i - R[T[h]][i])
  117. var suffix_shift = given (i+1) { |t|
  118. case (t == P.len) { 1 } # Mismatch happened on first attempt
  119. case (L[t] == -1) { P.len - F[t] } # Matched suffix does not appear anywhere in P
  120. default { P.len - L[t] } # Matched suffix appears in P
  121. }
  122. var shift = (char_shift `max` suffix_shift)
  123. previous_k = k if (shift >= i+1) # Galil's rule
  124. k += shift
  125. }
  126. }
  127. return matches
  128. }
  129. func string_search(String P, String T) {
  130. string_search(P.bytes, T.bytes)
  131. }
  132. say(string_search("bar", "foobartestbarend")) #=> [3, 10]
  133. say(string_search("bar", "foo-bar-test-bar-end")) #=> [4, 13]
  134. say(string_search("Bar", "foo-bar-test-Bar-end-Bar")) #=> [13, 21]
  135. say(string_search("-test-", "foo-bar-test-Bar-end-Bar")) #=> [7]
  136. say(string_search("-Bar-end", "foo-Bar-test-Bar-end-Bar")) #=> [12]
  137. say(string_search("-Bar", "foo-Bar-test-Bar-end-Bar")) #=> [3, 12, 20]