induced-sort.hpp 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. #pragma once
  2. //suffix array construction via induced sorting
  3. //many thanks to Screwtape for the thorough explanation of this algorithm
  4. //this implementation would not be possible without his help
  5. namespace nall {
  6. //note that induced_sort will return an array of size+1 characters,
  7. //where the first character is the empty suffix, equal to size
  8. template<typename T>
  9. inline auto induced_sort(array_view<T> data, const uint characters = 256) -> vector<int> {
  10. const uint size = data.size();
  11. if(size == 0) return vector<int>{0}; //required to avoid out-of-bounds accesses
  12. if(size == 1) return vector<int>{1, 0}; //not strictly necessary; but more performant
  13. vector<bool> types; //0 = S-suffix (sort before next suffix), 1 = L-suffix (sort after next suffix)
  14. types.resize(size + 1);
  15. types[size - 0] = 0; //empty suffix is always S-suffix
  16. types[size - 1] = 1; //last suffix is always L-suffix compared to empty suffix
  17. for(uint n : reverse(range(size - 1))) {
  18. if(data[n] < data[n + 1]) {
  19. types[n] = 0; //this suffix is smaller than the one after it
  20. } else if(data[n] > data[n + 1]) {
  21. types[n] = 1; //this suffix is larger than the one after it
  22. } else {
  23. types[n] = types[n + 1]; //this suffix will be the same as the one after it
  24. }
  25. }
  26. //left-most S-suffix
  27. auto isLMS = [&](int n) -> bool {
  28. if(n == 0) return 0; //no character to the left of the first suffix
  29. return !types[n] && types[n - 1]; //true if this is the start of a new S-suffix
  30. };
  31. //test if two LMS-substrings are equal
  32. auto isEqual = [&](int lhs, int rhs) -> bool {
  33. if(lhs == size || rhs == size) return false; //no other suffix can be equal to the empty suffix
  34. for(uint n = 0;; n++) {
  35. bool lhsLMS = isLMS(lhs + n);
  36. bool rhsLMS = isLMS(rhs + n);
  37. if(n && lhsLMS && rhsLMS) return true; //substrings are identical
  38. if(lhsLMS != rhsLMS) return false; //length mismatch: substrings cannot be identical
  39. if(data[lhs + n] != data[rhs + n]) return false; //character mismatch: substrings are different
  40. }
  41. };
  42. //determine the sizes of each bucket: one bucket per character
  43. vector<uint> counts;
  44. counts.resize(characters);
  45. for(uint n : range(size)) counts[data[n]]++;
  46. //bucket sorting start offsets
  47. vector<uint> heads;
  48. heads.resize(characters);
  49. uint headOffset;
  50. auto getHeads = [&] {
  51. headOffset = 1;
  52. for(uint n : range(characters)) {
  53. heads[n] = headOffset;
  54. headOffset += counts[n];
  55. }
  56. };
  57. //bucket sorting end offsets
  58. vector<uint> tails;
  59. tails.resize(characters);
  60. uint tailOffset;
  61. auto getTails = [&] {
  62. tailOffset = 1;
  63. for(uint n : range(characters)) {
  64. tailOffset += counts[n];
  65. tails[n] = tailOffset - 1;
  66. }
  67. };
  68. //inaccurate LMS bucket sort
  69. vector<int> suffixes;
  70. suffixes.resize(size + 1, (int)-1);
  71. getTails();
  72. for(uint n : range(size)) {
  73. if(!isLMS(n)) continue; //skip non-LMS-suffixes
  74. suffixes[tails[data[n]]--] = n; //advance from the tail of the bucket
  75. }
  76. suffixes[0] = size; //the empty suffix is always an LMS-suffix, and is the first suffix
  77. //sort all L-suffixes to the left of LMS-suffixes
  78. auto sortL = [&] {
  79. getHeads();
  80. for(uint n : range(size + 1)) {
  81. if(suffixes[n] == -1) continue; //offsets may not be known yet here ...
  82. auto l = suffixes[n] - 1;
  83. if(l < 0 || !types[l]) continue; //skip S-suffixes
  84. suffixes[heads[data[l]]++] = l; //advance from the head of the bucket
  85. }
  86. };
  87. auto sortS = [&] {
  88. getTails();
  89. for(uint n : reverse(range(size + 1))) {
  90. auto l = suffixes[n] - 1;
  91. if(l < 0 || types[l]) continue; //skip L-suffixes
  92. suffixes[tails[data[l]]--] = l; //advance from the tail of the bucket
  93. }
  94. };
  95. sortL();
  96. sortS();
  97. //analyze data for the summary suffix array
  98. vector<int> names;
  99. names.resize(size + 1, (int)-1);
  100. uint currentName = 0; //keep a count to tag each unique LMS-substring with unique IDs
  101. auto lastLMSOffset = suffixes[0]; //location in the original data of the last checked LMS suffix
  102. names[lastLMSOffset] = currentName; //the first LMS-substring is always the empty suffix entry, at position 0
  103. for(uint n : range(1, size + 1)) {
  104. auto offset = suffixes[n];
  105. if(!isLMS(offset)) continue; //only LMS suffixes are important
  106. //if this LMS suffix starts with a different LMS substring than the last suffix observed ...
  107. if(!isEqual(lastLMSOffset, offset)) currentName++; //then it gets a new name
  108. lastLMSOffset = offset; //keep track of the new most-recent LMS suffix
  109. names[lastLMSOffset] = currentName; //store the LMS suffix name where the suffix appears at in the original data
  110. }
  111. vector<int> summaryOffsets;
  112. vector<int> summaryData;
  113. for(uint n : range(size + 1)) {
  114. if(names[n] == -1) continue;
  115. summaryOffsets.append(n);
  116. summaryData.append(names[n]);
  117. }
  118. uint summaryCharacters = currentName + 1; //zero-indexed, so the total unique characters is currentName + 1
  119. //make the summary suffix array
  120. vector<int> summaries;
  121. if(summaryData.size() == summaryCharacters) {
  122. //simple bucket sort when every character in summaryData appears only once
  123. summaries.resize(summaryData.size() + 1, (int)-1);
  124. summaries[0] = summaryData.size(); //always include the empty suffix at the beginning
  125. for(int x : range(summaryData.size())) {
  126. int y = summaryData[x];
  127. summaries[y + 1] = x;
  128. }
  129. } else {
  130. //recurse until every character in summaryData is unique ...
  131. summaries = induced_sort<int>({summaryData.data(), summaryData.size()}, summaryCharacters);
  132. }
  133. suffixes.fill(-1); //reuse existing buffer for accurate sort
  134. //accurate LMS sort
  135. getTails();
  136. for(uint n : reverse(range(2, summaries.size()))) {
  137. auto index = summaryOffsets[summaries[n]];
  138. suffixes[tails[data[index]]--] = index; //advance from the tail of the bucket
  139. }
  140. suffixes[0] = size; //always include the empty suffix at the beginning
  141. sortL();
  142. sortS();
  143. return suffixes;
  144. }
  145. }