suffix-array.hpp 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387
  1. #pragma once
  2. #include <nall/array.hpp>
  3. #include <nall/counting-sort.hpp>
  4. #include <nall/induced-sort.hpp>
  5. #include <nall/range.hpp>
  6. #include <nall/view.hpp>
  7. namespace nall {
  8. /*
  9. input:
  10. data = "acaacatat"
  11. 0 "acaacatat"
  12. 1 "caacatat"
  13. 2 "aacatat"
  14. 3 "acatat"
  15. 4 "catat"
  16. 5 "atat"
  17. 6 "tat"
  18. 7 "at"
  19. 8 "t"
  20. 9 ""
  21. suffix_array:
  22. suffixes = [9,2,0,3,7,5,1,4,8,6] => input + suffixes:
  23. 9 ""
  24. 2 "aacatat"
  25. 0 "acaacatat"
  26. 3 "acatat"
  27. 7 "at"
  28. 5 "atat"
  29. 1 "caacatat"
  30. 4 "catat"
  31. 8 "t"
  32. 6 "tat"
  33. [auxiliary data structures to represent information lost from suffix trees]
  34. suffix_array_invert:
  35. inverted = [2,6,1,3,7,5,9,4,8,0] => input + suffixes[inverted]:
  36. 2 "acaacatat"
  37. 6 "caacatat"
  38. 1 "aacatat"
  39. 3 "acatat"
  40. 7 "catat"
  41. 5 "atat"
  42. 9 "tat"
  43. 4 "at"
  44. 8 "t"
  45. 0 ""
  46. suffix_array_phi:
  47. phi = [2,5,9,0,1,7,8,3,4,0]
  48. suffix_array_lcp:
  49. prefixes = [0,0,1,3,1,2,0,2,0,1] => lcp[n] == lcp(n, n-1)
  50. "" 0
  51. "aacatat" 0
  52. "acaacatat" 1 "a"
  53. "acatat" 3 "aca"
  54. "at" 1 "a"
  55. "atat" 2 "at"
  56. "caacatat" 0
  57. "catat" 2 "ca"
  58. "t" 0
  59. "tat" 1 "t"
  60. suffix_array_plcp:
  61. plcp = [1,0,0,3,2,2,1,1,0,0]
  62. suffix_array_lrcp:
  63. llcp = [0,0,0,3,0,2,0,2,0,0] => llcp[m] == lcp(l, m)
  64. rlcp = [0,1,1,1,0,0,0,0,0,0] => rlcp[m] == lcp(m, r)
  65. suffix_array_lpf:
  66. lengths = [0,0,1,3,2,1,0,2,1,0]
  67. offsets = [0,0,0,0,1,3,4,5,6,2]
  68. "acaacatat" (0,-)
  69. "caacatat" (0,-)
  70. "aacatat" (1,0) at 0, match "a"
  71. "acatat" (3,0) at 0, match "aca"
  72. "catat" (2,1) at 1, match "ca"
  73. "atat" (1,3) at 3, match "a"
  74. "tat" (0,-)
  75. "at" (2,5) at 5, match "at"
  76. "t" (1,6) at 6, match "t"
  77. "" (0,-)
  78. */
  79. // suffix array via induced sorting
  80. // O(n)
  81. inline auto suffix_array(array_view<uint8_t> input) -> vector<int> {
  82. return induced_sort(input);
  83. }
  84. // inverse
  85. // O(n)
  86. inline auto suffix_array_invert(array_view<int> sa) -> vector<int> {
  87. vector<int> isa;
  88. isa.reallocate(sa.size());
  89. for(int i : range(sa.size())) isa[sa[i]] = i;
  90. return isa;
  91. }
  92. // auxiliary data structure for plcp and lpf computation
  93. // O(n)
  94. inline auto suffix_array_phi(array_view<int> sa) -> vector<int> {
  95. vector<int> phi;
  96. phi.reallocate(sa.size());
  97. phi[sa[0]] = 0;
  98. for(int i : range(1, sa.size())) phi[sa[i]] = sa[i - 1];
  99. return phi;
  100. }
  101. // longest common prefix: lcp(l, r)
  102. // O(n)
  103. inline auto suffix_array_lcp(int l, int r, array_view<int> sa, array_view<uint8_t> input) -> int {
  104. int i = sa[l], j = sa[r], k = 0, size = input.size();
  105. while(i + k < size && j + k < size && input[i + k] == input[j + k]) k++;
  106. return k;
  107. }
  108. // longest common prefix: lcp(i, j, k)
  109. // O(n)
  110. inline auto suffix_array_lcp(int i, int j, int k, array_view<uint8_t> input) -> int {
  111. int size = input.size();
  112. while(i + k < size && j + k < size && input[i + k] == input[j + k]) k++;
  113. return k;
  114. }
  115. // longest common prefix: lcp[n] == lcp(n, n-1)
  116. // O(n)
  117. inline auto suffix_array_lcp(array_view<int> sa, array_view<int> isa, array_view<uint8_t> input) -> vector<int> {
  118. int k = 0, size = input.size();
  119. vector<int> lcp;
  120. lcp.reallocate(size + 1);
  121. for(int i : range(size)) {
  122. if(isa[i] == size) { k = 0; continue; } //the next substring is empty; ignore it
  123. int j = sa[isa[i] + 1];
  124. while(i + k < size && j + k < size && input[i + k] == input[j + k]) k++;
  125. lcp[1 + isa[i]] = k;
  126. if(k) k--;
  127. }
  128. lcp[0] = 0;
  129. return lcp;
  130. }
  131. // longest common prefix (from permuted longest common prefix)
  132. // O(n)
  133. inline auto suffix_array_lcp(array_view<int> plcp, array_view<int> sa) -> vector<int> {
  134. vector<int> lcp;
  135. lcp.reallocate(plcp.size());
  136. for(int i : range(plcp.size())) lcp[i] = plcp[sa[i]];
  137. return lcp;
  138. }
  139. // permuted longest common prefix
  140. // O(n)
  141. inline auto suffix_array_plcp(array_view<int> phi, array_view<uint8_t> input) -> vector<int> {
  142. vector<int> plcp;
  143. plcp.reallocate(phi.size());
  144. int k = 0, size = input.size();
  145. for(int i : range(size)) {
  146. int j = phi[i];
  147. while(i + k < size && j + k < size && input[i + k] == input[j + k]) k++;
  148. plcp[i] = k;
  149. if(k) k--;
  150. }
  151. return plcp;
  152. }
  153. // permuted longest common prefix (from longest common prefix)
  154. // O(n)
  155. inline auto suffix_array_plcp(array_view<int> lcp, array_view<int> sa) -> vector<int> {
  156. vector<int> plcp;
  157. plcp.reallocate(lcp.size());
  158. for(int i : range(lcp.size())) plcp[sa[i]] = lcp[i];
  159. return plcp;
  160. }
  161. // longest common prefixes - left + right
  162. // llcp[m] == lcp(l, m)
  163. // rlcp[m] == lcp(m, r)
  164. // O(n)
  165. // requires: lcp -or- plcp+sa
  166. inline auto suffix_array_lrcp(vector<int>& llcp, vector<int>& rlcp, array_view<int> lcp, array_view<int> plcp, array_view<int> sa, array_view<uint8_t> input) -> void {
  167. int size = input.size();
  168. llcp.reset(), llcp.reallocate(size + 1);
  169. rlcp.reset(), rlcp.reallocate(size + 1);
  170. function<int (int, int)> recurse = [&](int l, int r) -> int {
  171. if(l >= r - 1) {
  172. if(l >= size) return 0;
  173. if(lcp) return lcp[l];
  174. return plcp[sa[l]];
  175. }
  176. int m = l + r >> 1;
  177. llcp[m - 1] = recurse(l, m);
  178. rlcp[m - 1] = recurse(m, r);
  179. return min(llcp[m - 1], rlcp[m - 1]);
  180. };
  181. recurse(1, size + 1);
  182. llcp[0] = 0;
  183. rlcp[0] = 0;
  184. }
  185. // longest previous factor
  186. // O(n)
  187. // optional: plcp
  188. inline auto suffix_array_lpf(vector<int>& lengths, vector<int>& offsets, array_view<int> phi, array_view<int> plcp, array_view<uint8_t> input) -> void {
  189. int k = 0, size = input.size();
  190. lengths.reset(), lengths.resize(size + 1, -1);
  191. offsets.reset(), offsets.resize(size + 1, -1);
  192. function<void (int, int, int)> recurse = [&](int i, int j, int k) -> void {
  193. if(lengths[i] < 0) {
  194. lengths[i] = k;
  195. offsets[i] = j;
  196. } else if(lengths[i] < k) {
  197. if(offsets[i] > j) {
  198. recurse(offsets[i], j, lengths[i]);
  199. } else {
  200. recurse(j, offsets[i], lengths[i]);
  201. }
  202. lengths[i] = k;
  203. offsets[i] = j;
  204. } else {
  205. if(offsets[i] > j) {
  206. recurse(offsets[i], j, k);
  207. } else {
  208. recurse(j, offsets[i], k);
  209. }
  210. }
  211. };
  212. for(int i : range(size)) {
  213. int j = phi[i];
  214. if(plcp) k = plcp[i];
  215. else while(i + k < size && j + k < size && input[i + k] == input[j + k]) k++;
  216. if(i > j) {
  217. recurse(i, j, k);
  218. } else {
  219. recurse(j, i, k);
  220. }
  221. if(k) k--;
  222. }
  223. lengths[0] = 0;
  224. offsets[0] = 0;
  225. }
  226. // O(n log m)
  227. inline auto suffix_array_find(int& length, int& offset, array_view<int> sa, array_view<uint8_t> input, array_view<uint8_t> match) -> bool {
  228. length = 0, offset = 0;
  229. int l = 0, r = input.size();
  230. while(l < r - 1) {
  231. int m = l + r >> 1;
  232. int s = sa[m];
  233. int k = 0;
  234. while(k < match.size() && s + k < input.size()) {
  235. if(match[k] != input[s + k]) break;
  236. k++;
  237. }
  238. if(k > length) {
  239. length = k;
  240. offset = s;
  241. if(k == match.size()) return true;
  242. }
  243. if(k == match.size() || s + k == input.size()) k--;
  244. if(match[k] < input[s + k]) {
  245. r = m;
  246. } else {
  247. l = m;
  248. }
  249. }
  250. return false;
  251. }
  252. // O(n + log m)
  253. inline auto suffix_array_find(int& length, int& offset, array_view<int> llcp, array_view<int> rlcp, array_view<int> sa, array_view<uint8_t> input, array_view<uint8_t> match) -> bool {
  254. length = 0, offset = 0;
  255. int l = 0, r = input.size(), k = 0;
  256. while(l < r - 1) {
  257. int m = l + r >> 1;
  258. int s = sa[m];
  259. while(k < match.size() && s + k < input.size()) {
  260. if(match[k] != input[s + k]) break;
  261. k++;
  262. }
  263. if(k > length) {
  264. length = k;
  265. offset = s;
  266. if(k == match.size()) return true;
  267. }
  268. if(k == match.size() || s + k == input.size()) k--;
  269. if(match[k] < input[s + k]) {
  270. r = m;
  271. k = min(k, llcp[m]);
  272. } else {
  273. l = m;
  274. k = min(k, rlcp[m]);
  275. }
  276. }
  277. return false;
  278. }
  279. //
  280. //there are multiple strategies for building the required auxiliary structures for suffix arrays
  281. struct SuffixArray {
  282. using type = SuffixArray;
  283. //O(n)
  284. inline SuffixArray(array_view<uint8_t> input) : input(input) {
  285. sa = suffix_array(input);
  286. }
  287. //O(n)
  288. inline auto lrcp() -> type& {
  289. //if(!isa) isa = suffix_array_invert(sa);
  290. //if(!lcp) lcp = suffix_array_lcp(sa, isa, input);
  291. if(!phi) phi = suffix_array_phi(sa);
  292. if(!plcp) plcp = suffix_array_plcp(phi, input);
  293. //if(!lcp) lcp = suffix_array_lcp(plcp, sa);
  294. if(!llcp || !rlcp) suffix_array_lrcp(llcp, rlcp, lcp, plcp, sa, input);
  295. return *this;
  296. }
  297. //O(n)
  298. inline auto lpf() -> type& {
  299. if(!phi) phi = suffix_array_phi(sa);
  300. //if(!plcp) plcp = suffix_array_plcp(phi, input);
  301. if(!lengths || !offsets) suffix_array_lpf(lengths, offsets, phi, plcp, input);
  302. return *this;
  303. }
  304. inline auto operator[](int offset) const -> int {
  305. return sa[offset];
  306. }
  307. //O(n log m)
  308. //O(n + log m) with lrcp()
  309. inline auto find(int& length, int& offset, array_view<uint8_t> match) -> bool {
  310. if(!llcp || !rlcp) return suffix_array_find(length, offset, sa, input, match); //O(n log m)
  311. return suffix_array_find(length, offset, llcp, rlcp, sa, input, match); //O(n + log m)
  312. }
  313. //O(n) with lpf()
  314. inline auto previous(int& length, int& offset, int address) -> void {
  315. length = lengths[address];
  316. offset = offsets[address];
  317. }
  318. //non-owning reference: SuffixArray is invalidated if memory is freed
  319. array_view<uint8_t> input;
  320. //suffix array and auxiliary data structures
  321. vector<int> sa; //suffix array
  322. vector<int> isa; //inverted suffix array
  323. vector<int> phi; //phi
  324. vector<int> plcp; //permuted longest common prefixes
  325. vector<int> lcp; //longest common prefixes
  326. vector<int> llcp; //longest common prefixes - left
  327. vector<int> rlcp; //longest common prefixes - right
  328. vector<int> lengths; //longest previous factors
  329. vector<int> offsets; //longest previous factors
  330. };
  331. }