character-fold.el 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
  1. ;;; character-fold.el --- match unicode to similar ASCII -*- lexical-binding: t; -*-
  2. ;; Copyright (C) 2015 Free Software Foundation, Inc.
  3. ;; Maintainer: emacs-devel@gnu.org
  4. ;; Keywords: matching
  5. ;; This file is part of GNU Emacs.
  6. ;; GNU Emacs is free software: you can redistribute it and/or modify
  7. ;; it under the terms of the GNU General Public License as published by
  8. ;; the Free Software Foundation, either version 3 of the License, or
  9. ;; (at your option) any later version.
  10. ;; GNU Emacs is distributed in the hope that it will be useful,
  11. ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. ;; GNU General Public License for more details.
  14. ;; You should have received a copy of the GNU General Public License
  15. ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
  16. ;;; Code:
  17. ;;;###autoload
  (defvar character-fold-search nil
    "Whether searches treat similar characters as equivalent.
When this is non-nil, one character in the search string may match
a whole group of related characters.  For example, \" matches every
variant of the double quote, and the letter a matches all of its
accented versions (and a few more besides).")
  (defconst character-fold-table
    (eval-when-compile
      (let* ((equiv (make-char-table 'character-fold-table))
             (table (unicode-property-table-internal 'decomposition))
             (func (char-table-extra-slot table 1)))
        ;; Ensure the table is populated: for every range entry, call
        ;; the function kept in extra slot 1 of the decomposition table
        ;; on the first character of the range.
        ;; NOTE(review): presumably this forces compactly stored
        ;; entries to be expanded -- confirm against the char-table
        ;; implementation.
        (map-char-table
         (lambda (i v) (when (consp i) (funcall func (car i) v table)))
         table)

        ;; Compile a list of all complex characters that each simple
        ;; character should match.
        (map-char-table
         (lambda (i dec)
           (when (consp dec)
             ;; Discard a possible formatting tag.
             (when (symbolp (car dec))
               (setq dec (cdr dec)))
             ;; Skip trivial cases like ?a decomposing to (?a).
             (unless (and (eq i (car dec))
                          (not (cdr dec)))
               (let ((d dec)
                     (fold-decomp t)
                     k found)
                 ;; Walk the decomposition until the first letter or
                 ;; number; K ends up as the last element examined.
                 (while (and d (not found))
                   (setq k (pop d))
                   ;; Is k a number or letter, per unicode standard?
                   (setq found (memq (get-char-code-property k 'general-category)
                                     '(Lu Ll Lt Lm Lo Nd Nl No))))
                 (if found
                     ;; Check if the decomposition has more than one letter,
                     ;; because then we don't want the first letter to match
                     ;; the decomposition.
                     (dolist (k d)
                       (when (and fold-decomp
                                  (memq (get-char-code-property k 'general-category)
                                        '(Lu Ll Lt Lm Lo Nd Nl No)))
                         (setq fold-decomp nil)))
                   ;; If there's no number or letter on the
                   ;; decomposition, take the first character in it.
                   ;; NOTE(review): only FOUND is set to the first
                   ;; character here; K still holds the last element of
                   ;; the decomposition, and K is what indexes EQUIV
                   ;; below.  Confirm whether that is intended.
                   (setq found (car-safe dec)))
                 ;; Finally, we only fold multi-char decomposition if at
                 ;; least one of the chars is non-spacing (combining).
                 (when fold-decomp
                   (setq fold-decomp nil)
                   (dolist (k dec)
                     (when (and (not fold-decomp)
                                (> (get-char-code-property k 'canonical-combining-class) 0))
                       (setq fold-decomp t))))
                 ;; Add i to the list of characters that k can
                 ;; represent.  Also possibly add its decomposition, so we can
                 ;; match multi-char representations like (format "a%c" 769)
                 (when (and found (not (eq i k)))
                   (let ((chars (cons (char-to-string i) (aref equiv k))))
                     (aset equiv k
                           (if fold-decomp
                               (cons (apply #'string dec) chars)
                             chars))))))))
         table)

        ;; Add some manual entries.  The first string of the ?\" entry
        ;; is U+FF02 FULLWIDTH QUOTATION MARK and the invisible entry in
        ;; the ?' and ?` lists is U+E0022 TAG QUOTATION MARK; both are
        ;; written as escapes so they survive copy/paste and
        ;; re-encoding.
        (dolist (it '((?\" "\uFF02" "“" "”" "”" "„" "⹂" "〞" "‟" "‟" "❞" "❝" "❠" "“" "„" "〝" "〟" "🙷" "🙶" "🙸" "«" "»")
                      (?' "❟" "❛" "❜" "‘" "’" "‚" "‛" "‚" "\U000E0022" "❮" "❯" "‹" "›")
                      (?` "❛" "‘" "‛" "\U000E0022" "❮" "‹")))
          (let ((idx (car it))
                (chars (cdr it)))
            (aset equiv idx (append chars (aref equiv idx)))))

        ;; Convert the lists of characters we compiled into regexps.
        ;; Each regexp also matches the character itself.  A key may be
        ;; a (FROM . TO) range, in which case every character of the
        ;; range gets its own regexp; `char-to-string' must never be
        ;; handed the cons.
        (map-char-table
         (lambda (i v)
           (if (consp i)
               (let ((c (car i))
                     (last (cdr i)))
                 (while (<= c last)
                   (aset equiv c (regexp-opt (cons (char-to-string c) v)))
                   (setq c (1+ c))))
             (aset equiv i (regexp-opt (cons (char-to-string i) v)))))
         equiv)
        equiv))
    "Char-table used for folding characters of the same group during search.
Each non-nil entry is a regexp that matches the character itself
as well as the characters, and character sequences, that fold to
it.  The table is computed once, inside `eval-when-compile'.")
  98. ;;;###autoload
  (defun character-fold-to-regexp (string &optional lax)
    "Return a regexp matching anything that character-folds into STRING.
If `character-fold-search' is nil, return STRING passed through
`regexp-quote'.  Otherwise, any character in STRING that has an
entry in `character-fold-table' is replaced with that entry (which
is a regexp) and other characters are `regexp-quote'd.

If LAX is non-nil, each space, tab, carriage return or newline in
STRING is replaced with a regexp matching one or more whitespace
characters, so any single whitespace character is allowed to match
any number of times."
    (if (not character-fold-search)
        (regexp-quote string)
      (mapconcat
       (lambda (c)
         (if (and lax (memq c '(?\s ?\t ?\r ?\n)))
             ;; ASCII whitespace, U+00A0, the range U+2002..U+200A,
             ;; U+202F, U+205F and U+3000.  U+00A0 is spelled \u00a0
             ;; on purpose: a two-digit \xa0 escape is read as the raw
             ;; byte #xA0, which would never match NO-BREAK SPACE.
             "[ \t\n\r\u00a0\u2002-\u200a\u202f\u205f\u3000]+"
           ;; Use the folding regexp when the table has one; otherwise
           ;; match the character literally.
           (or (aref character-fold-table c)
               (regexp-quote (string c)))))
       string "")))
  115. ;;; character-fold.el ends here