123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222 |
- // LICENSE
- //
- // This software is dual-licensed to the public domain and under the following
- // license: you are granted a perpetual, irrevocable license to copy, modify,
- // publish, and distribute this file as you see fit.
- //
- // VERSION
- // 0.2.0 (2017-02-18) Scored matches perform exhaustive search for best score
- // 0.1.0 (2016-03-28) Initial release
- //
- // AUTHOR
- // Forrest Smith
- //
- // NOTES
- // Compiling
- // You MUST add '#define FTS_FUZZY_MATCH_IMPLEMENTATION' before including this header in ONE source file to create implementation.
- //
- // fuzzy_match_simple(...)
- // Returns true if each character in pattern is found sequentially within str
- //
- // fuzzy_match(...)
- // Returns true if pattern is found AND calculates a score.
- // Performs exhaustive search via recursion to find all possible matches and match with highest score.
- // Scores values have no intrinsic meaning. Possible score range is not normalized and varies with pattern.
- // Recursion is limited internally (default=10) to prevent degenerate cases (pattern="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")
- // Uses uint8_t for match indices. Therefore patterns are limited to 256 characters.
- // Score system should be tuned for YOUR use case. Words, sentences, file names, or method names all prefer different tuning.
- #ifndef FTS_FUZZY_MATCH_H
- #define FTS_FUZZY_MATCH_H
- #include <cstdint> // uint8_t
- #include <ctype.h> // ::tolower, ::toupper
- #include <cstring> // memcpy
- #include <cstdio>
- // Public interface
- namespace fts {
- static bool fuzzy_match_simple(char const * pattern, char const * str);
- static bool fuzzy_match(char const * pattern, char const * str, int & outScore);
- static bool fuzzy_match(char const * pattern, char const * str, int & outScore, uint8_t * matches, int maxMatches);
- }
- #ifdef FTS_FUZZY_MATCH_IMPLEMENTATION
- namespace fts {
- // Forward declarations for "private" implementation
- namespace fuzzy_internal {
- static bool fuzzy_match_recursive(const char * pattern, const char * str, int & outScore, const char * strBegin,
- uint8_t const * srcMatches, uint8_t * newMatches, int maxMatches, int nextMatch,
- int & recursionCount, int recursionLimit);
- }
- // Public interface
- static bool fuzzy_match_simple(char const * pattern, char const * str) {
- while (*pattern != '\0' && *str != '\0') {
- if (tolower(*pattern) == tolower(*str))
- ++pattern;
- ++str;
- }
- return *pattern == '\0' ? true : false;
- }
- static bool fuzzy_match(char const * pattern, char const * str, int & outScore) {
-
- uint8_t matches[256];
- return fuzzy_match(pattern, str, outScore, matches, sizeof(matches));
- }
- static bool fuzzy_match(char const * pattern, char const * str, int & outScore, uint8_t * matches, int maxMatches) {
- int recursionCount = 0;
- int recursionLimit = 10;
- return fuzzy_internal::fuzzy_match_recursive(pattern, str, outScore, str, nullptr, matches, maxMatches, 0, recursionCount, recursionLimit);
- }
- // Private implementation
- static bool fuzzy_internal::fuzzy_match_recursive(const char * pattern, const char * str, int & outScore,
- const char * strBegin, uint8_t const * srcMatches, uint8_t * matches, int maxMatches,
- int nextMatch, int & recursionCount, int recursionLimit)
- {
- // Count recursions
- ++recursionCount;
- if (recursionCount >= recursionLimit)
- return false;
- // Detect end of strings
- if (*pattern == '\0' || *str == '\0')
- return false;
- // Recursion params
- bool recursiveMatch = false;
- uint8_t bestRecursiveMatches[256];
- int bestRecursiveScore = 0;
- // Loop through pattern and str looking for a match
- bool first_match = true;
- while (*pattern != '\0' && *str != '\0') {
-
- // Found match
- if (tolower(*pattern) == tolower(*str)) {
- // Supplied matches buffer was too short
- if (nextMatch >= maxMatches)
- return false;
-
- // "Copy-on-Write" srcMatches into matches
- if (first_match && srcMatches) {
- memcpy(matches, srcMatches, nextMatch);
- first_match = false;
- }
- // Recursive call that "skips" this match
- uint8_t recursiveMatches[256];
- int recursiveScore;
- if (fuzzy_match_recursive(pattern, str + 1, recursiveScore, strBegin, matches, recursiveMatches, sizeof(recursiveMatches), nextMatch, recursionCount, recursionLimit)) {
-
- // Pick best recursive score
- if (!recursiveMatch || recursiveScore > bestRecursiveScore) {
- memcpy(bestRecursiveMatches, recursiveMatches, 256);
- bestRecursiveScore = recursiveScore;
- }
- recursiveMatch = true;
- }
- // Advance
- matches[nextMatch++] = (uint8_t)(str - strBegin);
- ++pattern;
- }
- ++str;
- }
- // Determine if full pattern was matched
- bool matched = *pattern == '\0' ? true : false;
- // Calculate score
- if (matched) {
- const int sequential_bonus = 15; // bonus for adjacent matches
- const int separator_bonus = 30; // bonus if match occurs after a separator
- const int camel_bonus = 30; // bonus if match is uppercase and prev is lower
- const int first_letter_bonus = 15; // bonus if the first letter is matched
- const int leading_letter_penalty = -5; // penalty applied for every letter in str before the first match
- const int max_leading_letter_penalty = -15; // maximum penalty for leading letters
- const int unmatched_letter_penalty = -1; // penalty for every letter that doesn't matter
- // Iterate str to end
- while (*str != '\0')
- ++str;
- // Initialize score
- outScore = 100;
- // Apply leading letter penalty
- int penalty = leading_letter_penalty * matches[0];
- if (penalty < max_leading_letter_penalty)
- penalty = max_leading_letter_penalty;
- outScore += penalty;
- // Apply unmatched penalty
- int unmatched = (int)(str - strBegin) - nextMatch;
- outScore += unmatched_letter_penalty * unmatched;
- // Apply ordering bonuses
- for (int i = 0; i < nextMatch; ++i) {
- uint8_t currIdx = matches[i];
- if (i > 0) {
- uint8_t prevIdx = matches[i - 1];
- // Sequential
- if (currIdx == (prevIdx + 1))
- outScore += sequential_bonus;
- }
- // Check for bonuses based on neighbor character value
- if (currIdx > 0) {
- // Camel case
- char neighbor = strBegin[currIdx - 1];
- char curr = strBegin[currIdx];
- if (::islower(neighbor) && ::isupper(curr))
- outScore += camel_bonus;
- // Separator
- bool neighborSeparator = neighbor == '_' || neighbor == ' ';
- if (neighborSeparator)
- outScore += separator_bonus;
- }
- else {
- // First letter
- outScore += first_letter_bonus;
- }
- }
- }
- // Return best result
- if (recursiveMatch && (!matched || bestRecursiveScore > outScore)) {
- // Recursive score is better than "this"
- memcpy(matches, bestRecursiveMatches, maxMatches);
- outScore = bestRecursiveScore;
- return true;
- }
- else if (matched) {
- // "this" score is better than recursive
- return true;
- }
- else {
- // no match
- return false;
- }
- }
- } // namespace fts
- #endif // FTS_FUZZY_MATCH_IMPLEMENTATION
- #endif // FTS_FUZZY_MATCH_H
|