123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214 |
- /*
- writer : Opera Wang
- E-Mail : wangvisual AT sohu DOT com
- License: GPL
- */
- /* filename: distance.cc */
- /*
- http://www.merriampark.com/ld.htm
- What is Levenshtein Distance?
-
- Levenshtein distance (LD) is a measure of the similarity between two strings,
- which we will refer to as the source string (s) and the target string (t).
- The distance is the number of deletions, insertions, or substitutions required
- to transform s into t. For example,
-
- * If s is "test" and t is "test", then LD(s,t) = 0, because no transformations are needed.
- The strings are already identical.
- * If s is "test" and t is "tent", then LD(s,t) = 1, because one substitution
- (change "s" to "n") is sufficient to transform s into t.
-
- The greater the Levenshtein distance, the more different the strings are.
-
- Levenshtein distance is named after the Russian scientist Vladimir Levenshtein,
- who devised the algorithm in 1965. If you can't spell or pronounce Levenshtein,
- the metric is also sometimes called edit distance.
-
- The Levenshtein distance algorithm has been used in:
-
- * Spell checking
- * Speech recognition
- * DNA analysis
- * Plagiarism detection
- */
- #include <stdlib.h>
- #include <string.h>
- //#include <stdio.h>
- #include "distance.h"
- #define OPTIMIZE_ED
- /*
- Cover transposition, in addition to deletion,
- insertion and substitution. This step is taken from:
- Berghel, Hal ; Roach, David : "An Extension of Ukkonen's
- Enhanced Dynamic Programming ASM Algorithm"
- (http://www.acm.org/~hlb/publications/asm/asm.html)
- */
- #define COVER_TRANSPOSITION
- /****************************************/
- /*Implementation of Levenshtein distance*/
- /****************************************/
- EditDistance::EditDistance()
- {
- currentelements = 2500; // It's enough for most conditions :-)
- d = (int*)malloc(sizeof(int) * currentelements);
- }
- EditDistance::~EditDistance()
- {
- // printf("size:%d\n",currentelements);
- if (d)
- free(d);
- }
- #ifdef OPTIMIZE_ED
- int EditDistance::CalEditDistance(const gunichar *s, const gunichar *t, const int limit)
- /*Compute levenshtein distance between s and t, this is using QUICK algorithm*/
- {
- int n = 0, m = 0, iLenDif, k, i, j, cost;
- // Remove leftmost matching portion of strings
- while ( *s && (*s == *t) )
- {
- s++;
- t++;
- }
- while (s[n])
- {
- n++;
- }
- while (t[m])
- {
- m++;
- }
- // Remove rightmost matching portion of strings by decrement n and m.
- while ( n && m && (*(s + n - 1) == *(t + m - 1)) )
- {
- n--;
- m--;
- }
- if ( m == 0 || n == 0 || d == (int*)0 )
- return (m + n);
- if ( m < n )
- {
- const gunichar * temp = s;
- int itemp = n;
- s = t;
- t = temp;
- n = m;
- m = itemp;
- }
- iLenDif = m - n;
- if ( iLenDif >= limit )
- return iLenDif;
- // step 1
- n++;
- m++;
- // d=(int*)malloc(sizeof(int)*m*n);
- if ( m*n > currentelements )
- {
- currentelements = m * n * 2; // double the request
- d = (int*)realloc(d, sizeof(int) * currentelements);
- if ( (int*)0 == d )
- return (m + n);
- }
- // step 2, init matrix
- for (k = 0;k < n;k++)
- d[k] = k;
- for (k = 1;k < m;k++)
- d[k*n] = k;
- // step 3
- for (i = 1;i < n;i++)
- {
- // first calculate column, d(i,j)
- for ( j = 1;j < iLenDif + i;j++ )
- {
- cost = s[i - 1] == t[j - 1] ? 0 : 1;
- d[j*n + i] = minimum(d[(j - 1) * n + i] + 1, d[j * n + i - 1] + 1, d[(j - 1) * n + i - 1] + cost);
- #ifdef COVER_TRANSPOSITION
- if ( i >= 2 && j >= 2 && (d[j*n + i] - d[(j - 2)*n + i - 2] == 2)
- && (s[i - 2] == t[j - 1]) && (s[i - 1] == t[j - 2]) )
- d[j*n + i]--;
- #endif
- }
- // second calculate row, d(k,j)
- // now j==iLenDif+i;
- for ( k = 1;k <= i;k++ )
- {
- cost = s[k - 1] == t[j - 1] ? 0 : 1;
- d[j*n + k] = minimum(d[(j - 1) * n + k] + 1, d[j * n + k - 1] + 1, d[(j - 1) * n + k - 1] + cost);
- #ifdef COVER_TRANSPOSITION
- if ( k >= 2 && j >= 2 && (d[j*n + k] - d[(j - 2)*n + k - 2] == 2)
- && (s[k - 2] == t[j - 1]) && (s[k - 1] == t[j - 2]) )
- d[j*n + k]--;
- #endif
- }
- // test if d(i,j) limit gets equal or exceed
- if ( d[j*n + i] >= limit )
- {
- return d[j*n + i];
- }
- }
- // d(n-1,m-1)
- return d[n*m - 1];
- }
- #else
- int EditDistance::CalEditDistance(const char *s, const char *t, const int limit)
- {
- //Step 1
- int k, i, j, n, m, cost;
- n = strlen(s);
- m = strlen(t);
- if ( n != 0 && m != 0 && d != (int*)0 )
- {
- m++;
- n++;
- if ( m*n > currentelements )
- {
- currentelements = m * n * 2;
- d = (int*)realloc(d, sizeof(int) * currentelements);
- if ( (int*)0 == d )
- return (m + n);
- }
- //Step 2
- for (k = 0;k < n;k++)
- d[k] = k;
- for (k = 0;k < m;k++)
- d[k*n] = k;
- //Step 3 and 4
- for (i = 1;i < n;i++)
- for (j = 1;j < m;j++)
- {
- //Step 5
- if (s[i - 1] == t[j - 1])
- cost = 0;
- else
- cost = 1;
- //Step 6
- d[j*n + i] = minimum(d[(j - 1) * n + i] + 1, d[j * n + i - 1] + 1, d[(j - 1) * n + i - 1] + cost);
- #ifdef COVER_TRANSPOSITION
- if ( i >= 2 && j >= 2 && (d[j*n + i] - d[(j - 2)*n + i - 2] == 2)
- && (s[i - 2] == t[j - 1]) && (s[i - 1] == t[j - 2]) )
- d[j*n + i]--;
- #endif
- }
- return d[n*m - 1];
- }
- else
- return (n + m);
- }
- #endif
|