burrows-wheeler_transform_symbolic.sf 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. #!/usr/bin/ruby
  2. # Author: Trizen
  3. # Date: 14 June 2023
  4. # Edit: 15 June 2023
  5. # https://github.com/trizen
  6. # Burrows–Wheeler transform, implemented to work on an array of symbols.
  7. # https://rosettacode.org/wiki/Burrows–Wheeler_transform
  8. # Reference:
  9. # Data Compression (Summer 2023) - Lecture 12 - The Burrows-Wheeler Transform (BWT)
  10. # https://youtube.com/watch?v=rQ7wwh4HRZM
  11. func bwt_cyclic (s) { # O(n) space
  12. var len = s.len
  13. gather { # check if all the symbols are the same
  14. take(true)
  15. s.each_cons(2, {|a,b|
  16. if (a != b) {
  17. take(false)
  18. break
  19. }
  20. })
  21. } == [true] && return @(^len)
  22. @(^len) -> sort {|i,j|
  23. while (s[i] == s[j]) {
  24. i %= len if (++i >= len)
  25. j %= len if (++j >= len)
  26. }
  27. s[i] <=> s[j]
  28. }
  29. }
  30. func bwt_encode_quadratic(Array s) { # O(n^2) space
  31. #var bwt = s.len.of{|i| s.slice(i) + s.slice(0, i) }.sort
  32. var bwt = s.len.of{|i| s.rotate(i) }.sort
  33. var ret = bwt.map { .last }
  34. var idx = bwt.index(s)
  35. return (ret, idx)
  36. }
  37. func bwt_encode(Array s) {
  38. var bwt = bwt_cyclic(s)
  39. var ret = bwt.map {|i| s.slice(i-1, 1)... }
  40. var idx = bwt.first_index_by { .is_zero }
  41. return (ret, idx)
  42. }
  43. func bwt_decode(Array bwt, Number idx) { # fast inversion
  44. var tail = bwt
  45. var head = bwt.sort
  46. var indices = []
  47. tail.each_kv {|i,v|
  48. indices[v] := [] << i
  49. }
  50. var table = []
  51. head.each_kv {|i,b|
  52. table[i] = indices[b].shift
  53. }
  54. var dec = []
  55. var i = idx
  56. head.len.times {
  57. dec << head[i]
  58. i = table[i]
  59. }
  60. return dec
  61. }
  62. var tests = [
  63. "banana", "appellee", "dogwood", "TOBEORNOTTOBEORTOBEORNOT"
  64. "SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES", "PINEAPPLE",
  65. "","a","aa","aabb","aaaaaaaaaaaa","aaaaaaaaaaaab",
  66. "baaaaaaaaaaaa","aaaaaabaaaaaa","aaaaaaabaaaaa",
  67. File(__FILE__).read(:raw),
  68. ]
  69. tests.each { |str|
  70. var (enc, idx) = bwt_encode(str.bytes)
  71. if (enc.len < 1024) {
  72. say "BWT(#{str.dump}) = (#{enc.join_bytes.dump}, #{idx})"
  73. }
  74. var dec = bwt_decode(enc, idx)
  75. assert_eq(str, dec.pack('C*'))
  76. }
  77. __END__
  78. BWT("banana") = ("nnbaaa", 3)
  79. BWT("appellee") = ("eelplepa", 0)
  80. BWT("dogwood") = ("odoodwg", 1)
  81. BWT("TOBEORNOTTOBEORTOBEORNOT") = ("OOOBBBRRTTTEEENNOOORTTOO", 20)
  82. BWT("SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES") = ("TEXYDST.E.IXIXIXXSSMPPS.B..E.S.EUSFXDIIOIIIT", 29)
  83. BWT("PINEAPPLE") = ("ENLPPIEPA", 6)