fts_fuzzy_match.h 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222
  1. // LICENSE
  2. //
  3. // This software is dual-licensed to the public domain and under the following
  4. // license: you are granted a perpetual, irrevocable license to copy, modify,
  5. // publish, and distribute this file as you see fit.
  6. //
  7. // VERSION
  8. // 0.2.0 (2017-02-18) Scored matches perform exhaustive search for best score
  9. // 0.1.0 (2016-03-28) Initial release
  10. //
  11. // AUTHOR
  12. // Forrest Smith
  13. //
  14. // NOTES
  15. // Compiling
  16. // You MUST add '#define FTS_FUZZY_MATCH_IMPLEMENTATION' before including this header in ONE source file to create implementation.
  17. //
  18. // fuzzy_match_simple(...)
  19. // Returns true if each character in pattern is found sequentially within str
  20. //
  21. // fuzzy_match(...)
  22. // Returns true if pattern is found AND calculates a score.
  23. // Performs exhaustive search via recursion to find all possible matches and match with highest score.
  24. // Scores values have no intrinsic meaning. Possible score range is not normalized and varies with pattern.
  25. // Recursion is limited internally (default=10) to prevent degenerate cases (pattern="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")
  26. // Uses uint8_t for match indices. Therefore patterns are limited to 256 characters.
  27. // Score system should be tuned for YOUR use case. Words, sentences, file names, or method names all prefer different tuning.
  28. #ifndef FTS_FUZZY_MATCH_H
  29. #define FTS_FUZZY_MATCH_H
  30. #include <cstdint> // uint8_t
  31. #include <ctype.h> // ::tolower, ::toupper
  32. #include <cstring> // memcpy
  33. #include <cstdio>
  34. // Public interface
  35. namespace fts {
  36. static bool fuzzy_match_simple(char const * pattern, char const * str);
  37. static bool fuzzy_match(char const * pattern, char const * str, int & outScore);
  38. static bool fuzzy_match(char const * pattern, char const * str, int & outScore, uint8_t * matches, int maxMatches);
  39. }
  40. #ifdef FTS_FUZZY_MATCH_IMPLEMENTATION
  41. namespace fts {
  42. // Forward declarations for "private" implementation
  43. namespace fuzzy_internal {
  44. static bool fuzzy_match_recursive(const char * pattern, const char * str, int & outScore, const char * strBegin,
  45. uint8_t const * srcMatches, uint8_t * newMatches, int maxMatches, int nextMatch,
  46. int & recursionCount, int recursionLimit);
  47. }
  48. // Public interface
  49. static bool fuzzy_match_simple(char const * pattern, char const * str) {
  50. while (*pattern != '\0' && *str != '\0') {
  51. if (tolower(*pattern) == tolower(*str))
  52. ++pattern;
  53. ++str;
  54. }
  55. return *pattern == '\0' ? true : false;
  56. }
  57. static bool fuzzy_match(char const * pattern, char const * str, int & outScore) {
  58. uint8_t matches[256];
  59. return fuzzy_match(pattern, str, outScore, matches, sizeof(matches));
  60. }
  61. static bool fuzzy_match(char const * pattern, char const * str, int & outScore, uint8_t * matches, int maxMatches) {
  62. int recursionCount = 0;
  63. int recursionLimit = 10;
  64. return fuzzy_internal::fuzzy_match_recursive(pattern, str, outScore, str, nullptr, matches, maxMatches, 0, recursionCount, recursionLimit);
  65. }
  66. // Private implementation
  67. static bool fuzzy_internal::fuzzy_match_recursive(const char * pattern, const char * str, int & outScore,
  68. const char * strBegin, uint8_t const * srcMatches, uint8_t * matches, int maxMatches,
  69. int nextMatch, int & recursionCount, int recursionLimit)
  70. {
  71. // Count recursions
  72. ++recursionCount;
  73. if (recursionCount >= recursionLimit)
  74. return false;
  75. // Detect end of strings
  76. if (*pattern == '\0' || *str == '\0')
  77. return false;
  78. // Recursion params
  79. bool recursiveMatch = false;
  80. uint8_t bestRecursiveMatches[256];
  81. int bestRecursiveScore = 0;
  82. // Loop through pattern and str looking for a match
  83. bool first_match = true;
  84. while (*pattern != '\0' && *str != '\0') {
  85. // Found match
  86. if (tolower(*pattern) == tolower(*str)) {
  87. // Supplied matches buffer was too short
  88. if (nextMatch >= maxMatches)
  89. return false;
  90. // "Copy-on-Write" srcMatches into matches
  91. if (first_match && srcMatches) {
  92. memcpy(matches, srcMatches, nextMatch);
  93. first_match = false;
  94. }
  95. // Recursive call that "skips" this match
  96. uint8_t recursiveMatches[256];
  97. int recursiveScore;
  98. if (fuzzy_match_recursive(pattern, str + 1, recursiveScore, strBegin, matches, recursiveMatches, sizeof(recursiveMatches), nextMatch, recursionCount, recursionLimit)) {
  99. // Pick best recursive score
  100. if (!recursiveMatch || recursiveScore > bestRecursiveScore) {
  101. memcpy(bestRecursiveMatches, recursiveMatches, 256);
  102. bestRecursiveScore = recursiveScore;
  103. }
  104. recursiveMatch = true;
  105. }
  106. // Advance
  107. matches[nextMatch++] = (uint8_t)(str - strBegin);
  108. ++pattern;
  109. }
  110. ++str;
  111. }
  112. // Determine if full pattern was matched
  113. bool matched = *pattern == '\0' ? true : false;
  114. // Calculate score
  115. if (matched) {
  116. const int sequential_bonus = 15; // bonus for adjacent matches
  117. const int separator_bonus = 30; // bonus if match occurs after a separator
  118. const int camel_bonus = 30; // bonus if match is uppercase and prev is lower
  119. const int first_letter_bonus = 15; // bonus if the first letter is matched
  120. const int leading_letter_penalty = -5; // penalty applied for every letter in str before the first match
  121. const int max_leading_letter_penalty = -15; // maximum penalty for leading letters
  122. const int unmatched_letter_penalty = -1; // penalty for every letter that doesn't matter
  123. // Iterate str to end
  124. while (*str != '\0')
  125. ++str;
  126. // Initialize score
  127. outScore = 100;
  128. // Apply leading letter penalty
  129. int penalty = leading_letter_penalty * matches[0];
  130. if (penalty < max_leading_letter_penalty)
  131. penalty = max_leading_letter_penalty;
  132. outScore += penalty;
  133. // Apply unmatched penalty
  134. int unmatched = (int)(str - strBegin) - nextMatch;
  135. outScore += unmatched_letter_penalty * unmatched;
  136. // Apply ordering bonuses
  137. for (int i = 0; i < nextMatch; ++i) {
  138. uint8_t currIdx = matches[i];
  139. if (i > 0) {
  140. uint8_t prevIdx = matches[i - 1];
  141. // Sequential
  142. if (currIdx == (prevIdx + 1))
  143. outScore += sequential_bonus;
  144. }
  145. // Check for bonuses based on neighbor character value
  146. if (currIdx > 0) {
  147. // Camel case
  148. char neighbor = strBegin[currIdx - 1];
  149. char curr = strBegin[currIdx];
  150. if (::islower(neighbor) && ::isupper(curr))
  151. outScore += camel_bonus;
  152. // Separator
  153. bool neighborSeparator = neighbor == '_' || neighbor == ' ';
  154. if (neighborSeparator)
  155. outScore += separator_bonus;
  156. }
  157. else {
  158. // First letter
  159. outScore += first_letter_bonus;
  160. }
  161. }
  162. }
  163. // Return best result
  164. if (recursiveMatch && (!matched || bestRecursiveScore > outScore)) {
  165. // Recursive score is better than "this"
  166. memcpy(matches, bestRecursiveMatches, maxMatches);
  167. outScore = bestRecursiveScore;
  168. return true;
  169. }
  170. else if (matched) {
  171. // "this" score is better than recursive
  172. return true;
  173. }
  174. else {
  175. // no match
  176. return false;
  177. }
  178. }
  179. } // namespace fts
  180. #endif // FTS_FUZZY_MATCH_IMPLEMENTATION
  181. #endif // FTS_FUZZY_MATCH_H