tibet-util.el 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424
  1. ;;; tibet-util.el --- utilities for Tibetan -*- coding: iso-2022-7bit; -*-
  2. ;; Copyright (C) 1997, 2001-2012 Free Software Foundation, Inc.
  3. ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
  4. ;; 2005, 2006, 2007, 2008, 2009, 2010, 2011
  5. ;; National Institute of Advanced Industrial Science and Technology (AIST)
  6. ;; Registration Number H14PRO021
  7. ;; Author: Toru TOMABECHI <Toru.Tomabechi@orient.unil.ch>
  8. ;; Keywords: multilingual, Tibetan
  9. ;; Created: Feb. 17. 1997
  10. ;; This file is part of GNU Emacs.
  11. ;; GNU Emacs is free software: you can redistribute it and/or modify
  12. ;; it under the terms of the GNU General Public License as published by
  13. ;; the Free Software Foundation, either version 3 of the License, or
  14. ;; (at your option) any later version.
  15. ;; GNU Emacs is distributed in the hope that it will be useful,
  16. ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. ;; GNU General Public License for more details.
  19. ;; You should have received a copy of the GNU General Public License
  20. ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
  21. ;;; History:
  22. ;; 1997.03.13 Modification in treatment of text properties;
  23. ;; Support for some special signs and punctuation.
  24. ;; 1999.10.25 Modification for a new composition way by K.Handa.
  25. ;;; Commentary:
  26. ;;; Code:
  27. (defconst tibetan-obsolete-glyphs
  28. `(("$(7!=(B" . "$(7!=(B") ; 2 col <-> 1 col
  29. ("$(7!?(B" . "$(7!?(B")
  30. ("$(7!@(B" . "$(7!@(B")
  31. ("$(7!A(B" . "$(7!A(B")
  32. ("$(7"`(B" . "$(7"`(B")
  33. ("$(7!;(B" . "$(7!;(B")
  34. ("$(7!D(B" . "$(7!D(B")
  35. ;; Yes these are dirty. But ...
  36. ("$(7!>(B $(7!>(B" . ,(compose-string "$(7!>(B $(7!>(B" 0 3 [?$(7!>(B (Br . Bl) ? (Br . Bl) ?$(7!>(B]))
  37. ("$(7!4!5!5(B" . ,(compose-string
  38. "$(7#R#S#S#S(B" 0 4
  39. [?$(7#R(B (Br . Bl) ?$(7#S(B (Br . Bl) ?$(7#S(B (Br . Bl) ?$(7#S(B]))
  40. ("$(7!4!5(B" . ,(compose-string "$(7#R#S#S(B" 0 3 [?$(7#R(B (Br . Bl) ?$(7#S(B (Br . Bl) ?$(7#S(B]))
  41. ("$(7!6(B" . ,(compose-string "$(7#R#S!I(B" 0 3 [?$(7#R(B (Br . Bl) ?$(7#S(B (br . tr) ?$(7!I(B]))
  42. ("$(7!4(B" . ,(compose-string "$(7#R#S(B" 0 2 [?$(7#R(B (Br . Bl) ?$(7#S(B]))))
  43. ;;;###autoload
  44. (defun tibetan-char-p (ch)
  45. "Check if char CH is Tibetan character.
  46. Returns non-nil if CH is Tibetan. Otherwise, returns nil."
  47. (memq (char-charset ch) '(tibetan tibetan-1-column)))
  48. ;;; Functions for Tibetan <-> Tibetan-transcription.
  49. ;;;###autoload
  50. (defun tibetan-tibetan-to-transcription (str)
  51. "Transcribe Tibetan string STR and return the corresponding Roman string."
  52. (let (;; Accumulate transcriptions here in reverse order.
  53. (trans nil)
  54. (len (length str))
  55. (i 0)
  56. ch this-trans)
  57. (while (< i len)
  58. (let ((idx (string-match tibetan-precomposition-rule-regexp str i)))
  59. (if (eq idx i)
  60. ;; Ith character and the followings matches precomposable
  61. ;; Tibetan sequence.
  62. (setq i (match-end 0)
  63. this-trans
  64. (car (rassoc
  65. (cdr (assoc (match-string 0 str)
  66. tibetan-precomposition-rule-alist))
  67. tibetan-precomposed-transcription-alist)))
  68. (setq ch (substring str i (1+ i))
  69. i (1+ i)
  70. this-trans
  71. (car (or (rassoc ch tibetan-consonant-transcription-alist)
  72. (rassoc ch tibetan-vowel-transcription-alist)
  73. (rassoc ch tibetan-subjoined-transcription-alist)))))
  74. (setq trans (cons this-trans trans))))
  75. (apply 'concat (nreverse trans))))
  76. ;;;###autoload
  77. (defun tibetan-transcription-to-tibetan (str)
  78. "Convert Tibetan Roman string STR to Tibetan character string.
  79. The returned string has no composition information."
  80. (let (;; Case is significant.
  81. (case-fold-search nil)
  82. (idx 0)
  83. ;; Accumulate Tibetan strings here in reverse order.
  84. (t-str-list nil)
  85. i subtrans)
  86. (while (setq i (string-match tibetan-regexp str idx))
  87. (if (< idx i)
  88. ;; STR contains a pattern that doesn't match Tibetan
  89. ;; transcription. Include the pattern as is.
  90. (setq t-str-list (cons (substring str idx i) t-str-list)))
  91. (setq subtrans (match-string 0 str)
  92. idx (match-end 0))
  93. (let ((t-char (cdr (assoc subtrans
  94. tibetan-precomposed-transcription-alist))))
  95. (if t-char
  96. ;; SUBTRANS corresponds to a transcription for
  97. ;; precomposable Tibetan sequence.
  98. (setq t-char (car (rassoc t-char
  99. tibetan-precomposition-rule-alist)))
  100. (setq t-char
  101. (cdr
  102. (or (assoc subtrans tibetan-consonant-transcription-alist)
  103. (assoc subtrans tibetan-vowel-transcription-alist)
  104. (assoc subtrans tibetan-modifier-transcription-alist)
  105. (assoc subtrans tibetan-subjoined-transcription-alist)))))
  106. (setq t-str-list (cons t-char t-str-list))))
  107. (if (< idx (length str))
  108. (setq t-str-list (cons (substring str idx) t-str-list)))
  109. (apply 'concat (nreverse t-str-list))))
  110. ;;;
  111. ;;; Functions for composing/decomposing Tibetan sequence.
  112. ;;;
  113. ;;; A Tibetan syllable is typically structured as follows:
  114. ;;;
  115. ;;; [Prefix] C [C+] V [M] [Suffix [Post suffix]]
  116. ;;;
  117. ;;; where C's are all vertically stacked, V appears below or above
  118. ;;; consonant cluster and M is always put above the C[C+]V combination.
  119. ;;; (Sanskrit visarga, though it is a vowel modifier, is considered
  120. ;;; to be a punctuation.)
  121. ;;;
  122. ;;; Here are examples of the words "bsgrubs" and "hfauM"
  123. ;;;
  124. ;;; $(7"7"G###C"U"7"G(B $(7"H"R"U"_(B
  125. ;;;
  126. ;;; M
  127. ;;; b s b s h
  128. ;;; g fa
  129. ;;; r u
  130. ;;; u
  131. ;;;
  132. ;;; Consonants `'' ($(7"A(B), `w' ($(7">(B), `y' ($(7"B(B), `r' ($(7"C(B) take special
  133. ;;; forms when they are used as subjoined consonant. Consonant `r'
  134. ;;; takes another special form when used as superjoined in such a case
  135. ;;; as "rka", while it does not change its form when conjoined with
  136. ;;; subjoined `'', `w' or `y' as in "rwa", "rya".
  137. ;; Append a proper composition rule and glyph to COMPONENTS to compose
  138. ;; CHAR with a composition that has COMPONENTS.
  139. (defun tibetan-add-components (components char)
  140. (let ((last (last components))
  141. (stack-upper '(tc . bc))
  142. (stack-under '(bc . tc))
  143. rule comp-vowel tmp)
  144. ;; Special treatment for 'a chung.
  145. ;; If 'a follows a consonant, turn it into the subjoined form.
  146. ;; * Disabled by Tomabechi 2000/06/09 *
  147. ;; Because in Unicode, $(7"A(B may follow directly a consonant without
  148. ;; any intervening vowel, as in $(7"9"""Q"A!;(B=$(7"9(B $(7""(B $(7"A(B not $(7"9(B $(7""(B $(7"Q(B $(7"A(B
  149. ;;(if (and (= char ?$(7"A(B)
  150. ;; (aref (char-category-set (car last)) ?0))
  151. ;; (setq char ?$(7"R(B)) ;; modified for new font by Tomabechi 1999/12/10
  152. ;; Composite vowel signs are decomposed before being added
  153. ;; Added by Tomabechi 2000/06/08
  154. (if (memq char '(?$(7"T(B ?$(7"V(B ?$(7"W(B ?$(7"X(B ?$(7"Y(B ?$(7"Z(B ?$(7"b(B))
  155. (setq comp-vowel
  156. (copy-sequence
  157. (cddr (assoc (char-to-string char)
  158. tibetan-composite-vowel-alist)))
  159. char
  160. (cadr (assoc (char-to-string char)
  161. tibetan-composite-vowel-alist))))
  162. (cond
  163. ;; Compose upper vowel sign vertically over.
  164. ((aref (char-category-set char) ?2)
  165. (setq rule stack-upper))
  166. ;; Compose lower vowel sign vertically under.
  167. ((aref (char-category-set char) ?3)
  168. (if (or (eq char ?$(7"Q(B) ;; `$(7"Q(B' and `$,1FP(B' should not visible when composed.
  169. (eq char #xF70))
  170. (setq rule nil)
  171. (setq rule stack-under)))
  172. ;; Transform ra-mgo (superscribed r) if followed by a subjoined
  173. ;; consonant other than w, ', y, r.
  174. ((and (= (car last) ?$(7"C(B)
  175. (not (memq char '(?$(7#>(B ?$(7"R(B ?$(7#B(B ?$(7#C(B))))
  176. (setcar last ?$(7!"(B) ;; modified for newfont by Tomabechi 1999/12/10
  177. (setq rule stack-under))
  178. ;; Transform initial base consonant if followed by a subjoined
  179. ;; consonant but 'a.
  180. (t
  181. (let ((laststr (char-to-string (car last))))
  182. (if (and (/= char ?$(7"R(B) ;; modified for new font by Tomabechi
  183. (string-match "[$(7"!(B-$(7"="?"@"D(B-$(7"J"K(B]" laststr))
  184. (setcar last (string-to-char
  185. (cdr (assoc (char-to-string (car last))
  186. tibetan-base-to-subjoined-alist)))))
  187. (setq rule stack-under))))
  188. (if rule
  189. (setcdr last (list rule char)))
  190. ;; Added by Tomabechi 2000/06/08
  191. (if comp-vowel
  192. (nconc last comp-vowel))
  193. ))
  194. ;;;###autoload
  195. (defun tibetan-compose-string (str)
  196. "Compose Tibetan string STR."
  197. (let ((idx 0))
  198. ;; `$(7"A(B' is included in the pattern for subjoined consonants
  199. ;; because we treat it specially in tibetan-add-components.
  200. ;; (This feature is removed by Tomabechi 2000/06/08)
  201. (while (setq idx (string-match tibetan-composable-pattern str idx))
  202. (let ((from idx)
  203. (to (match-end 0))
  204. components)
  205. (if (eq (string-match tibetan-precomposition-rule-regexp str idx) idx)
  206. (setq idx (match-end 0)
  207. components
  208. (list (string-to-char
  209. (cdr
  210. (assoc (match-string 0 str)
  211. tibetan-precomposition-rule-alist)))))
  212. (setq components (list (aref str idx))
  213. idx (1+ idx)))
  214. (while (< idx to)
  215. (tibetan-add-components components (aref str idx))
  216. (setq idx (1+ idx)))
  217. (compose-string str from to components))))
  218. str)
  219. ;;;###autoload
  220. (defun tibetan-compose-region (beg end)
  221. "Compose Tibetan text the region BEG and END."
  222. (interactive "r")
  223. (let (str result chars)
  224. (save-excursion
  225. (save-restriction
  226. (narrow-to-region beg end)
  227. (goto-char (point-min))
  228. ;; `$(7"A(B' is included in the pattern for subjoined consonants
  229. ;; because we treat it specially in tibetan-add-components.
  230. ;; (This feature is removed by Tomabechi 2000/06/08)
  231. (while (re-search-forward tibetan-composable-pattern nil t)
  232. (let ((from (match-beginning 0))
  233. (to (match-end 0))
  234. components)
  235. (goto-char from)
  236. (if (looking-at tibetan-precomposition-rule-regexp)
  237. (progn
  238. (setq components
  239. (list (string-to-char
  240. (cdr
  241. (assoc (match-string 0)
  242. tibetan-precomposition-rule-alist)))))
  243. (goto-char (match-end 0)))
  244. (setq components (list (char-after from)))
  245. (forward-char 1))
  246. (while (< (point) to)
  247. (tibetan-add-components components (following-char))
  248. (forward-char 1))
  249. (compose-region from to components)))))))
  250. (defvar tibetan-decompose-precomposition-alist
  251. (mapcar (function (lambda (x) (cons (string-to-char (cdr x)) (car x))))
  252. tibetan-precomposition-rule-alist))
  253. ;;;###autoload
  254. (defun tibetan-decompose-region (from to)
  255. "Decompose Tibetan text in the region FROM and TO.
  256. This is different from decompose-region because precomposed Tibetan characters
  257. are decomposed into normal Tibetan character sequences."
  258. (interactive "r")
  259. (save-restriction
  260. (narrow-to-region from to)
  261. (decompose-region from to)
  262. (goto-char from)
  263. (while (not (eobp))
  264. (let* ((char (following-char))
  265. (slot (assq char tibetan-decompose-precomposition-alist)))
  266. (if slot
  267. (progn
  268. (delete-char 1)
  269. (insert (cdr slot)))
  270. (forward-char 1))))))
  271. ;;;###autoload
  272. (defun tibetan-decompose-string (str)
  273. "Decompose Tibetan string STR.
  274. This is different from decompose-string because precomposed Tibetan characters
  275. are decomposed into normal Tibetan character sequences."
  276. (let ((new "")
  277. (len (length str))
  278. (idx 0)
  279. char slot)
  280. (while (< idx len)
  281. (setq char (aref str idx)
  282. slot (assq (aref str idx) tibetan-decompose-precomposition-alist)
  283. new (concat new (if slot (cdr slot) (char-to-string char)))
  284. idx (1+ idx)))
  285. new))
  286. ;;;
  287. ;;; This variable is used to avoid repeated decomposition.
  288. ;;;
  289. (setq-default tibetan-decomposed nil)
  290. ;;;###autoload
  291. (defun tibetan-decompose-buffer ()
  292. "Decomposes Tibetan characters in the buffer into their components.
  293. See also the documentation of the function `tibetan-decompose-region'."
  294. (interactive)
  295. (make-local-variable 'tibetan-decomposed)
  296. (cond ((not tibetan-decomposed)
  297. (tibetan-decompose-region (point-min) (point-max))
  298. (setq tibetan-decomposed t))))
  299. ;;;###autoload
  300. (defun tibetan-compose-buffer ()
  301. "Composes Tibetan character components in the buffer.
  302. See also docstring of the function tibetan-compose-region."
  303. (interactive)
  304. (make-local-variable 'tibetan-decomposed)
  305. (tibetan-compose-region (point-min) (point-max))
  306. (setq tibetan-decomposed nil))
  307. ;;;###autoload
  308. (defun tibetan-post-read-conversion (len)
  309. (save-excursion
  310. (save-restriction
  311. (let ((buffer-modified-p (buffer-modified-p)))
  312. (narrow-to-region (point) (+ (point) len))
  313. (tibetan-compose-region (point-min) (point-max))
  314. (set-buffer-modified-p buffer-modified-p)
  315. (make-local-variable 'tibetan-decomposed)
  316. (setq tibetan-decomposed nil)
  317. (- (point-max) (point-min))))))
  318. ;;;###autoload
  319. (defun tibetan-pre-write-conversion (from to)
  320. (setq tibetan-decomposed-temp tibetan-decomposed)
  321. (let ((old-buf (current-buffer)))
  322. (set-buffer (generate-new-buffer " *temp*"))
  323. (if (stringp from)
  324. (insert from)
  325. (insert-buffer-substring old-buf from to))
  326. (if (not tibetan-decomposed-temp)
  327. (tibetan-decompose-region (point-min) (point-max)))
  328. ;; Should return nil as annotations.
  329. nil))
  330. ;;;
  331. ;;; Unicode-related definitions.
  332. ;;;
  333. (defvar tibetan-canonicalize-for-unicode-alist
  334. '(("$(7"Q(B" . "") ;; remove vowel a
  335. ("$(7"T(B" . "$(7"R"S(B") ;; decompose vowels whose use is ``discouraged'' in Unicode 3.0
  336. ("$(7"V(B" . "$(7"R"U(B")
  337. ("$(7"W(B" . "$(7#C"a(B")
  338. ("$(7"X(B" . "$(7#C"R"a(B")
  339. ("$(7"Y(B" . "$(7#D"a(B")
  340. ("$(7"Z(B" . "$(7#D"R"a(B")
  341. ("$(7"b(B" . "$(7"R"a(B"))
  342. "Rules for canonicalizing Tibetan vowels for Unicode.")
  343. (defvar tibetan-canonicalize-for-unicode-regexp
  344. "[$(7"Q"T"V"W"X"Y"Z"b(B]"
  345. "Regexp for Tibetan vowels to be canonicalized in Unicode.")
  346. (defun tibetan-canonicalize-for-unicode-region (from to)
  347. (save-restriction
  348. (narrow-to-region from to)
  349. (goto-char from)
  350. (while (re-search-forward tibetan-canonicalize-for-unicode-regexp nil t)
  351. (let (
  352. ;;(from (match-beginning 0))
  353. ;;(to (match-end 0))
  354. (canonical-form
  355. (cdr (assoc (match-string 0)
  356. tibetan-canonicalize-for-unicode-alist))))
  357. ;;(goto-char from)
  358. ;;(delete-region from to)
  359. ;;(insert canonical-form)
  360. (replace-match canonical-form)
  361. ))))
  362. (defvar tibetan-strict-unicode t
  363. "*Flag to control Tibetan canonicalizing for Unicode.
  364. If non-nil, the vowel a is removed and composite vowels are decomposed
  365. before writing buffer in Unicode. See also
  366. `tibetan-canonicalize-for-unicode-regexp' and
  367. `tibetan-canonicalize-for-unicode-alist'.")
  368. ;;;###autoload
  369. (defun tibetan-pre-write-canonicalize-for-unicode (from to)
  370. (let ((old-buf (current-buffer))
  371. (strict-unicode tibetan-strict-unicode))
  372. (set-buffer (generate-new-buffer " *temp*"))
  373. (if (stringp from)
  374. (insert from)
  375. (insert-buffer-substring old-buf from to))
  376. (if strict-unicode
  377. (tibetan-canonicalize-for-unicode-region (point-min) (point-max)))
  378. ;; Should return nil as annotations.
  379. nil))
  380. (provide 'tibet-util)
  381. ;;; tibet-util.el ends here