123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177 |
- #pragma once
- //suffix array construction via induced sorting
- //many thanks to Screwtape for the thorough explanation of this algorithm
- //this implementation would not be possible without his help
- namespace nall {
- //note that induced_sort will return an array of size+1 characters,
- //where the first character is the empty suffix, equal to size
- template<typename T>
- inline auto induced_sort(array_view<T> data, const uint characters = 256) -> vector<int> {
- const uint size = data.size();
- if(size == 0) return vector<int>{0}; //required to avoid out-of-bounds accesses
- if(size == 1) return vector<int>{1, 0}; //not strictly necessary; but more performant
- vector<bool> types; //0 = S-suffix (sort before next suffix), 1 = L-suffix (sort after next suffix)
- types.resize(size + 1);
- types[size - 0] = 0; //empty suffix is always S-suffix
- types[size - 1] = 1; //last suffix is always L-suffix compared to empty suffix
- for(uint n : reverse(range(size - 1))) {
- if(data[n] < data[n + 1]) {
- types[n] = 0; //this suffix is smaller than the one after it
- } else if(data[n] > data[n + 1]) {
- types[n] = 1; //this suffix is larger than the one after it
- } else {
- types[n] = types[n + 1]; //this suffix will be the same as the one after it
- }
- }
- //left-most S-suffix
- auto isLMS = [&](int n) -> bool {
- if(n == 0) return 0; //no character to the left of the first suffix
- return !types[n] && types[n - 1]; //true if this is the start of a new S-suffix
- };
- //test if two LMS-substrings are equal
- auto isEqual = [&](int lhs, int rhs) -> bool {
- if(lhs == size || rhs == size) return false; //no other suffix can be equal to the empty suffix
- for(uint n = 0;; n++) {
- bool lhsLMS = isLMS(lhs + n);
- bool rhsLMS = isLMS(rhs + n);
- if(n && lhsLMS && rhsLMS) return true; //substrings are identical
- if(lhsLMS != rhsLMS) return false; //length mismatch: substrings cannot be identical
- if(data[lhs + n] != data[rhs + n]) return false; //character mismatch: substrings are different
- }
- };
- //determine the sizes of each bucket: one bucket per character
- vector<uint> counts;
- counts.resize(characters);
- for(uint n : range(size)) counts[data[n]]++;
- //bucket sorting start offsets
- vector<uint> heads;
- heads.resize(characters);
- uint headOffset;
- auto getHeads = [&] {
- headOffset = 1;
- for(uint n : range(characters)) {
- heads[n] = headOffset;
- headOffset += counts[n];
- }
- };
- //bucket sorting end offsets
- vector<uint> tails;
- tails.resize(characters);
- uint tailOffset;
- auto getTails = [&] {
- tailOffset = 1;
- for(uint n : range(characters)) {
- tailOffset += counts[n];
- tails[n] = tailOffset - 1;
- }
- };
- //inaccurate LMS bucket sort
- vector<int> suffixes;
- suffixes.resize(size + 1, (int)-1);
- getTails();
- for(uint n : range(size)) {
- if(!isLMS(n)) continue; //skip non-LMS-suffixes
- suffixes[tails[data[n]]--] = n; //advance from the tail of the bucket
- }
- suffixes[0] = size; //the empty suffix is always an LMS-suffix, and is the first suffix
- //sort all L-suffixes to the left of LMS-suffixes
- auto sortL = [&] {
- getHeads();
- for(uint n : range(size + 1)) {
- if(suffixes[n] == -1) continue; //offsets may not be known yet here ...
- auto l = suffixes[n] - 1;
- if(l < 0 || !types[l]) continue; //skip S-suffixes
- suffixes[heads[data[l]]++] = l; //advance from the head of the bucket
- }
- };
- auto sortS = [&] {
- getTails();
- for(uint n : reverse(range(size + 1))) {
- auto l = suffixes[n] - 1;
- if(l < 0 || types[l]) continue; //skip L-suffixes
- suffixes[tails[data[l]]--] = l; //advance from the tail of the bucket
- }
- };
- sortL();
- sortS();
- //analyze data for the summary suffix array
- vector<int> names;
- names.resize(size + 1, (int)-1);
- uint currentName = 0; //keep a count to tag each unique LMS-substring with unique IDs
- auto lastLMSOffset = suffixes[0]; //location in the original data of the last checked LMS suffix
- names[lastLMSOffset] = currentName; //the first LMS-substring is always the empty suffix entry, at position 0
- for(uint n : range(1, size + 1)) {
- auto offset = suffixes[n];
- if(!isLMS(offset)) continue; //only LMS suffixes are important
- //if this LMS suffix starts with a different LMS substring than the last suffix observed ...
- if(!isEqual(lastLMSOffset, offset)) currentName++; //then it gets a new name
- lastLMSOffset = offset; //keep track of the new most-recent LMS suffix
- names[lastLMSOffset] = currentName; //store the LMS suffix name where the suffix appears at in the original data
- }
- vector<int> summaryOffsets;
- vector<int> summaryData;
- for(uint n : range(size + 1)) {
- if(names[n] == -1) continue;
- summaryOffsets.append(n);
- summaryData.append(names[n]);
- }
- uint summaryCharacters = currentName + 1; //zero-indexed, so the total unique characters is currentName + 1
- //make the summary suffix array
- vector<int> summaries;
- if(summaryData.size() == summaryCharacters) {
- //simple bucket sort when every character in summaryData appears only once
- summaries.resize(summaryData.size() + 1, (int)-1);
- summaries[0] = summaryData.size(); //always include the empty suffix at the beginning
- for(int x : range(summaryData.size())) {
- int y = summaryData[x];
- summaries[y + 1] = x;
- }
- } else {
- //recurse until every character in summaryData is unique ...
- summaries = induced_sort<int>({summaryData.data(), summaryData.size()}, summaryCharacters);
- }
- suffixes.fill(-1); //reuse existing buffer for accurate sort
- //accurate LMS sort
- getTails();
- for(uint n : reverse(range(2, summaries.size()))) {
- auto index = summaryOffsets[summaries[n]];
- suffixes[tails[data[index]]--] = index; //advance from the tail of the bucket
- }
- suffixes[0] = size; //always include the empty suffix at the beginning
- sortL();
- sortS();
- return suffixes;
- }
- }
|