fuzzy_search.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. /**************************************************************************/
  2. /* fuzzy_search.cpp */
  3. /**************************************************************************/
  4. /* This file is part of: */
  5. /* GODOT ENGINE */
  6. /* https://godotengine.org */
  7. /**************************************************************************/
  8. /* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
  9. /* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
  10. /* */
  11. /* Permission is hereby granted, free of charge, to any person obtaining */
  12. /* a copy of this software and associated documentation files (the */
  13. /* "Software"), to deal in the Software without restriction, including */
  14. /* without limitation the rights to use, copy, modify, merge, publish, */
  15. /* distribute, sublicense, and/or sell copies of the Software, and to */
  16. /* permit persons to whom the Software is furnished to do so, subject to */
  17. /* the following conditions: */
  18. /* */
  19. /* The above copyright notice and this permission notice shall be */
  20. /* included in all copies or substantial portions of the Software. */
  21. /* */
  22. /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
  23. /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
  24. /* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
  25. /* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
  26. /* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
  27. /* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
  28. /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
  29. /**************************************************************************/
  30. #include "fuzzy_search.h"
  31. constexpr float cull_factor = 0.1f;
  32. constexpr float cull_cutoff = 30.0f;
  33. const String boundary_chars = "/\\-_.";
  34. static bool _is_valid_interval(const Vector2i &p_interval) {
  35. // Empty intervals are represented as (-1, -1).
  36. return p_interval.x >= 0 && p_interval.y >= p_interval.x;
  37. }
  38. static Vector2i _extend_interval(const Vector2i &p_a, const Vector2i &p_b) {
  39. if (!_is_valid_interval(p_a)) {
  40. return p_b;
  41. }
  42. if (!_is_valid_interval(p_b)) {
  43. return p_a;
  44. }
  45. return Vector2i(MIN(p_a.x, p_b.x), MAX(p_a.y, p_b.y));
  46. }
  47. static bool _is_word_boundary(const String &p_str, int p_index) {
  48. if (p_index == -1 || p_index == p_str.size()) {
  49. return true;
  50. }
  51. return boundary_chars.find_char(p_str[p_index]) != -1;
  52. }
  53. bool FuzzySearchToken::try_exact_match(FuzzyTokenMatch &p_match, const String &p_target, int p_offset) const {
  54. p_match.token_idx = idx;
  55. p_match.token_length = string.length();
  56. int match_idx = p_target.find(string, p_offset);
  57. if (match_idx == -1) {
  58. return false;
  59. }
  60. p_match.add_substring(match_idx, string.length());
  61. return true;
  62. }
  63. bool FuzzySearchToken::try_fuzzy_match(FuzzyTokenMatch &p_match, const String &p_target, int p_offset, int p_miss_budget) const {
  64. p_match.token_idx = idx;
  65. p_match.token_length = string.length();
  66. int run_start = -1;
  67. int run_len = 0;
  68. // Search for the subsequence p_token in p_target starting from p_offset, recording each substring for
  69. // later scoring and display.
  70. for (int i = 0; i < string.length(); i++) {
  71. int new_offset = p_target.find_char(string[i], p_offset);
  72. if (new_offset < 0) {
  73. p_miss_budget--;
  74. if (p_miss_budget < 0) {
  75. return false;
  76. }
  77. } else {
  78. if (run_start == -1 || p_offset != new_offset) {
  79. if (run_start != -1) {
  80. p_match.add_substring(run_start, run_len);
  81. }
  82. run_start = new_offset;
  83. run_len = 1;
  84. } else {
  85. run_len += 1;
  86. }
  87. p_offset = new_offset + 1;
  88. }
  89. }
  90. if (run_start != -1) {
  91. p_match.add_substring(run_start, run_len);
  92. }
  93. return true;
  94. }
  95. void FuzzyTokenMatch::add_substring(int p_substring_start, int p_substring_length) {
  96. substrings.append(Vector2i(p_substring_start, p_substring_length));
  97. matched_length += p_substring_length;
  98. Vector2i substring_interval = { p_substring_start, p_substring_start + p_substring_length - 1 };
  99. interval = _extend_interval(interval, substring_interval);
  100. }
  101. bool FuzzyTokenMatch::intersects(const Vector2i &p_other_interval) const {
  102. if (!_is_valid_interval(interval) || !_is_valid_interval(p_other_interval)) {
  103. return false;
  104. }
  105. return interval.y >= p_other_interval.x && interval.x <= p_other_interval.y;
  106. }
  107. bool FuzzySearchResult::can_add_token_match(const FuzzyTokenMatch &p_match) const {
  108. if (p_match.get_miss_count() > miss_budget) {
  109. return false;
  110. }
  111. if (p_match.intersects(match_interval)) {
  112. if (token_matches.size() == 1) {
  113. return false;
  114. }
  115. for (const FuzzyTokenMatch &existing_match : token_matches) {
  116. if (existing_match.intersects(p_match.interval)) {
  117. return false;
  118. }
  119. }
  120. }
  121. return true;
  122. }
  123. bool FuzzyTokenMatch::is_case_insensitive(const String &p_original, const String &p_adjusted) const {
  124. for (const Vector2i &substr : substrings) {
  125. const int end = substr.x + substr.y;
  126. for (int i = substr.x; i < end; i++) {
  127. if (p_original[i] != p_adjusted[i]) {
  128. return true;
  129. }
  130. }
  131. }
  132. return false;
  133. }
  134. void FuzzySearchResult::score_token_match(FuzzyTokenMatch &p_match, bool p_case_insensitive) const {
  135. // This can always be tweaked more. The intuition is that exact matches should almost always
  136. // be prioritized over broken up matches, and other criteria more or less act as tie breakers.
  137. p_match.score = -20 * p_match.get_miss_count() - (p_case_insensitive ? 3 : 0);
  138. for (const Vector2i &substring : p_match.substrings) {
  139. // Score longer substrings higher than short substrings.
  140. int substring_score = substring.y * substring.y;
  141. // Score matches deeper in path higher than shallower matches
  142. if (substring.x > dir_index) {
  143. substring_score *= 2;
  144. }
  145. // Score matches on a word boundary higher than matches within a word
  146. if (_is_word_boundary(target, substring.x - 1) || _is_word_boundary(target, substring.x + substring.y)) {
  147. substring_score += 4;
  148. }
  149. // Score exact query matches higher than non-compact subsequence matches
  150. if (substring.y == p_match.token_length) {
  151. substring_score += 100;
  152. }
  153. p_match.score += substring_score;
  154. }
  155. }
  156. void FuzzySearchResult::maybe_apply_score_bonus() {
  157. // This adds a small bonus to results which match tokens in the same order they appear in the query.
  158. int *token_range_starts = (int *)alloca(sizeof(int) * token_matches.size());
  159. for (const FuzzyTokenMatch &match : token_matches) {
  160. token_range_starts[match.token_idx] = match.interval.x;
  161. }
  162. int last = token_range_starts[0];
  163. for (int i = 1; i < token_matches.size(); i++) {
  164. if (last > token_range_starts[i]) {
  165. return;
  166. }
  167. last = token_range_starts[i];
  168. }
  169. score += 1;
  170. }
  171. void FuzzySearchResult::add_token_match(const FuzzyTokenMatch &p_match) {
  172. score += p_match.score;
  173. match_interval = _extend_interval(match_interval, p_match.interval);
  174. miss_budget -= p_match.get_miss_count();
  175. token_matches.append(p_match);
  176. }
  177. void remove_low_scores(Vector<FuzzySearchResult> &p_results, float p_cull_score) {
  178. // Removes all results with score < p_cull_score in-place.
  179. int i = 0;
  180. int j = p_results.size() - 1;
  181. FuzzySearchResult *results = p_results.ptrw();
  182. while (true) {
  183. // Advances i to an element to remove and j to an element to keep.
  184. while (j >= i && results[j].score < p_cull_score) {
  185. j--;
  186. }
  187. while (i < j && results[i].score >= p_cull_score) {
  188. i++;
  189. }
  190. if (i >= j) {
  191. break;
  192. }
  193. results[i++] = results[j--];
  194. }
  195. p_results.resize(j + 1);
  196. }
  197. void FuzzySearch::sort_and_filter(Vector<FuzzySearchResult> &p_results) const {
  198. if (p_results.is_empty()) {
  199. return;
  200. }
  201. float avg_score = 0;
  202. float max_score = 0;
  203. for (const FuzzySearchResult &result : p_results) {
  204. avg_score += result.score;
  205. max_score = MAX(max_score, result.score);
  206. }
  207. // TODO: Tune scoring and culling here to display fewer subsequence soup matches when good matches
  208. // are available.
  209. avg_score /= p_results.size();
  210. float cull_score = MIN(cull_cutoff, Math::lerp(avg_score, max_score, cull_factor));
  211. remove_low_scores(p_results, cull_score);
  212. struct FuzzySearchResultComparator {
  213. bool operator()(const FuzzySearchResult &p_lhs, const FuzzySearchResult &p_rhs) const {
  214. // Sort on (score, length, alphanumeric) to ensure consistent ordering.
  215. if (p_lhs.score == p_rhs.score) {
  216. if (p_lhs.target.length() == p_rhs.target.length()) {
  217. return p_lhs.target < p_rhs.target;
  218. }
  219. return p_lhs.target.length() < p_rhs.target.length();
  220. }
  221. return p_lhs.score > p_rhs.score;
  222. }
  223. };
  224. SortArray<FuzzySearchResult, FuzzySearchResultComparator> sorter;
  225. if (p_results.size() > max_results) {
  226. sorter.partial_sort(0, p_results.size(), max_results, p_results.ptrw());
  227. p_results.resize(max_results);
  228. } else {
  229. sorter.sort(p_results.ptrw(), p_results.size());
  230. }
  231. }
  232. void FuzzySearch::set_query(const String &p_query) {
  233. tokens.clear();
  234. for (const String &string : p_query.split(" ", false)) {
  235. tokens.append({ static_cast<int>(tokens.size()), string });
  236. }
  237. case_sensitive = !p_query.is_lowercase();
  238. struct TokenComparator {
  239. bool operator()(const FuzzySearchToken &A, const FuzzySearchToken &B) const {
  240. if (A.string.length() == B.string.length()) {
  241. return A.idx < B.idx;
  242. }
  243. return A.string.length() > B.string.length();
  244. }
  245. };
  246. // Prioritize matching longer tokens before shorter ones since match overlaps are not accepted.
  247. tokens.sort_custom<TokenComparator>();
  248. }
  249. bool FuzzySearch::search(const String &p_target, FuzzySearchResult &p_result) const {
  250. p_result.target = p_target;
  251. p_result.dir_index = p_target.rfind_char('/');
  252. p_result.miss_budget = max_misses;
  253. String adjusted_target = case_sensitive ? p_target : p_target.to_lower();
  254. // For each token, eagerly generate subsequences starting from index 0 and keep the best scoring one
  255. // which does not conflict with prior token matches. This is not ensured to find the highest scoring
  256. // combination of matches, or necessarily the highest scoring single subsequence, as it only considers
  257. // eager subsequences for a given index, and likewise eagerly finds matches for each token in sequence.
  258. for (const FuzzySearchToken &token : tokens) {
  259. FuzzyTokenMatch best_match;
  260. int offset = start_offset;
  261. while (true) {
  262. FuzzyTokenMatch match;
  263. if (allow_subsequences) {
  264. if (!token.try_fuzzy_match(match, adjusted_target, offset, p_result.miss_budget)) {
  265. break;
  266. }
  267. } else {
  268. if (!token.try_exact_match(match, adjusted_target, offset)) {
  269. break;
  270. }
  271. }
  272. if (p_result.can_add_token_match(match)) {
  273. p_result.score_token_match(match, match.is_case_insensitive(p_target, adjusted_target));
  274. if (best_match.token_idx == -1 || best_match.score < match.score) {
  275. best_match = match;
  276. }
  277. }
  278. if (_is_valid_interval(match.interval)) {
  279. offset = match.interval.x + 1;
  280. } else {
  281. break;
  282. }
  283. }
  284. if (best_match.token_idx == -1) {
  285. return false;
  286. }
  287. p_result.add_token_match(best_match);
  288. }
  289. p_result.maybe_apply_score_bonus();
  290. return true;
  291. }
  292. void FuzzySearch::search_all(const PackedStringArray &p_targets, Vector<FuzzySearchResult> &p_results) const {
  293. p_results.clear();
  294. for (const String &target : p_targets) {
  295. FuzzySearchResult result;
  296. if (search(target, result)) {
  297. p_results.append(result);
  298. }
  299. }
  300. sort_and_filter(p_results);
  301. }