fuzzy-matching.lisp 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  1. ;; This software is Copyright (c) cage 2021
  2. ;; cage grants you the rights to distribute
  3. ;; and use this software as governed by the terms
  4. ;; of the Lisp Lesser GNU Public License
  5. ;; (http://opensource.franz.com/preamble.html),
  6. ;; known as the LLGPL
  7. (in-package :cl-i18n-utils)
  8. (defun fuzzy-match (template sequence
  9. &key
  10. (similarity-match 5)
  11. (similarity-mismatch -5)
  12. (penalty-weight 1)
  13. (char-comparison-fn #'char=))
  14. "Performs a Smith-Waterman affinity search.
  15. See: https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
  16. Returns multiple values, first a list of index in `sequence' that
  17. matches `template' (even if some deletion must be performed to get
  18. a perfect match.
  19. The other three return values are matrix cost and traceback costs,
  20. useful to perform additional searches.
  21. The list is sorted in ascending order and null values means a gap is
  22. present to match `template'
  23. Example:
  24. (fuzzy-match \"TAB\" \"TUUUAB\") -> (0 NIL NIL NIL 4 5)
  25. T AB
  26. | ||
  27. T---AB
  28. Note: length of `template' must be shorter or equal to `sequence'.
  29. "
  30. (labels ((initialize-cost-matrix ()
  31. (make-array (list (1+ (length sequence))
  32. (1+ (length template)))
  33. :element-type 'fixnum
  34. :initial-element 0
  35. :adjustable nil))
  36. (initialize-trace-matrix ()
  37. (initialize-cost-matrix))
  38. (similarity-value (i j)
  39. (if (funcall char-comparison-fn
  40. (elt sequence (1- i))
  41. (elt template (1- j)))
  42. similarity-match
  43. similarity-mismatch))
  44. (find-max (matrix)
  45. (let ((max-i -1)
  46. (max-j -1)
  47. (max 0))
  48. (loop for i from 0 below (array-dimension matrix 0) do
  49. (loop for j from 0 below (array-dimension matrix 1) do
  50. (when (> (aref matrix i j)
  51. max)
  52. (setf max (aref matrix i j))
  53. (setf max-i i)
  54. (setf max-j j))))
  55. (values max-i max-j max)))
  56. (traceback (costs trace-col trace-row i j &optional (res '()))
  57. (let ((new-i (aref trace-col i j))
  58. (new-j (aref trace-row i j))
  59. (new-cost (aref costs i j)))
  60. (cond
  61. ((= new-cost 0)
  62. (mapcar (lambda (a)
  63. (if (third a)
  64. nil
  65. (first a)))
  66. res))
  67. ((or (= j new-j)
  68. (= i new-i))
  69. (traceback costs
  70. trace-col trace-row
  71. new-i new-j
  72. (push (list new-i new-j t) res)))
  73. (t
  74. (traceback costs
  75. trace-col trace-row
  76. new-i new-j
  77. (push (list new-i new-j) res)))))))
  78. (let ((costs (initialize-cost-matrix))
  79. (trace-col (initialize-trace-matrix))
  80. (trace-row (initialize-trace-matrix)))
  81. (loop for ct-sequence from 0 below (length sequence) do
  82. (loop for ct-template from 0 below (length template) do
  83. (let* ((i (1+ ct-sequence)) ; y
  84. (j (1+ ct-template)) ; x
  85. (cost-similarity (+ (aref costs (1- i) (1- j))
  86. (similarity-value i j)))
  87. (cost-deletion-col (- (aref costs (1- i) j)
  88. penalty-weight))
  89. (cost-deletion-row (- (aref costs i (1- j))
  90. penalty-weight))
  91. (all-costs (list cost-similarity
  92. cost-deletion-row
  93. cost-deletion-col))
  94. (max -1e10)
  95. (max-pos -1))
  96. (loop for ct from 0 below 3 do
  97. (when (> (elt all-costs ct)
  98. max)
  99. (setf max (elt all-costs ct))
  100. (incf max-pos)))
  101. (setf max (max 0 max))
  102. (setf (aref costs i j) max)
  103. (cond
  104. ((= max-pos 0)
  105. (setf (aref trace-col i j) (1- i))
  106. (setf (aref trace-row i j) (1- j)))
  107. ((= max-pos 1)
  108. (setf (aref trace-col i j) i)
  109. (setf (aref trace-row i j) (1- j)))
  110. ((= max-pos 2)
  111. (setf (aref trace-col i j) (1- i))
  112. (setf (aref trace-row i j) j))))))
  113. (multiple-value-bind (start-i start-j)
  114. (find-max costs)
  115. (when (and (> start-i 0)
  116. (> start-j 0))
  117. (let ((trace (traceback costs trace-col trace-row start-i start-j)))
  118. (values trace
  119. costs trace-col trace-row)))))))