mbchar.c 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871
  1. /*-
  2. * Copyright (c) 1998, 2002-2008 Kiyoshi Matsui <kmatsui@t3.rim.or.jp>
  3. * All rights reserved.
  4. *
  5. * Some parts of this code are derived from the public domain software
  6. * DECUS cpp (1984,1985) written by Martin Minow.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. * 1. Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in the
  15. * documentation and/or other materials provided with the distribution.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
  18. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
  21. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27. * SUCH DAMAGE.
  28. */
  29. /*
  30. * M B C H A R . C
  31. * C h a r a c t e r h a n d l i n g R o u t i n e s
  32. *
  33. * Character handling and multi-byte character handling routines are
  34. * placed here.
  35. */
  36. #if PREPROCESSED
  37. #include "mcpp.H"
  38. #else
  39. #include "system.H"
  40. #include "internal.H"
  41. #endif
  42. /*
  43. * Tables of character types and multi-byte character types.
  44. *
  45. * Some of these character attributes will be overwritten by
  46. * execution time option '-@post' or '-@old'.
  47. * Warning on erroneous sequence will be issued from the caller routines:
  48. * scan_quote(), scan_id() or scan_number().
  49. */
  50. /* Non-ASCII characters are always checked by mb_read(). */
  51. #define NA 0x4000 /* Non-ASCII characters */
  52. /* Horizontal spaces (' ', '\t' and TOK_SEP) */
  53. #define HSPA (SPA | HSP)
  54. short * char_type; /* Pointer to one of the following type_*[]. */
  55. #define EJ1 0x100 /* 1st byte of EUC_JP */
  56. #define EJ2 0x200 /* 2nd byte of EUC_JP */
  57. #define GB1 0x400 /* 1st byte of GB2312 */
  58. #define GB2 0x800 /* 2nd byte of GB2312 */
  59. #define KS1 0x1000 /* 1st byte of KSC5601 */
  60. #define KS2 0x2000 /* 2nd byte of KSC5601 */
  61. #define EJ12 (EJ1 | EJ2) /* 1st byte or 2nd byte of EUC_JP */
  62. #define GB12 (GB1 | GB2)
  63. #define KS12 (KS1 | KS2)
  64. #define EJ1N (NA | EJ1)
  65. #define EU12N (NA | EJ12 | GB12 | KS12)
  66. /* 1st or 2nd byte of EUC_JP, GB2312 or KSC5601, or any other non-ASCII */
  67. static short type_euc[ UCHARMAX + 1] = {
  68. /*
  69. * For EUC_JP, GB2312, KSC5601 or other similar multi-byte char encodings.
  70. */
  71. /* Character type codes */
  72. /* 0, 1, 2, 3, 4, 5, 6, 7, */
  73. /* 8, 9, A, B, C, D, E, F, Hex */
  74. 000, 000, 000, 000, 000, 000, 000, 000, /* 00 */
  75. 000, HSPA, SPA, SPA, SPA, SPA, 000, 000, /* 08 */
  76. 000, 000, 000, 000, 000, 000, 000, 000, /* 10 */
  77. /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts() */
  78. 000, LET, LET, 000, 000, 000, 000, HSPA, /* 18 */
  79. HSPA, PUNC, QUO, PUNC, 000, PUNC, PUNC, QUO, /* 20 !"#$%&' */
  80. PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, DOT, PUNC, /* 28 ()*+,-./ */
  81. DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, /* 30 01234567 */
  82. DIG, DIG, PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, /* 38 89:;<=>? */
  83. 000, LET, LET, LET, LET, LET, LET, LET, /* 40 @ABCDEFG */
  84. LET, LET, LET, LET, LET, LET, LET, LET, /* 48 HIJKLMNO */
  85. LET, LET, LET, LET, LET, LET, LET, LET, /* 50 PQRSTUVW */
  86. LET, LET, LET, PUNC, 000, PUNC, PUNC, LET, /* 58 XYZ[\]^_ */
  87. 000, LET, LET, LET, LET, LET, LET, LET, /* 60 `abcdefg */
  88. LET, LET, LET, LET, LET, LET, LET, LET, /* 68 hijklmno */
  89. LET, LET, LET, LET, LET, LET, LET, LET, /* 70 pqrstuvw */
  90. LET, LET, LET, PUNC, PUNC, PUNC, PUNC, 000, /* 78 xyz{|}~ */
  91. NA, NA, NA, NA, NA, NA, NA, NA, /* 80 .. 87 */
  92. NA, NA, NA, NA, NA, NA, EJ1N, NA, /* 88 .. 8F */
  93. NA, NA, NA, NA, NA, NA, NA, NA, /* 90 .. 97 */
  94. NA, NA, NA, NA, NA, NA, NA, NA, /* 98 .. 9F */
  95. NA, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* A0 .. A7 */
  96. EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* A8 .. AF */
  97. EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* B0 .. B7 */
  98. EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* B8 .. BF */
  99. EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* C0 .. C7 */
  100. EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* C8 .. CF */
  101. EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* D0 .. D7 */
  102. EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* D8 .. DF */
  103. EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* E0 .. E7 */
  104. EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* E8 .. EF */
  105. EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* F0 .. F7 */
  106. EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, NA, /* F8 .. FF */
  107. };
  108. static short type_bsl[ UCHARMAX + 1] = {
  109. /*
  110. * For SJIS, BIGFIVE or other similar encodings which may have '\\' value as
  111. * the second byte of multi-byte character.
  112. */
  113. #define SJ1 0x100 /* 1st byte of SJIS */
  114. #define SJ2 0x200 /* 2nd byte of SJIS */
  115. #define BF1 0x400 /* 1st byte of BIGFIVE */
  116. #define BF2 0x800 /* 2nd byte of BIGFIVE */
  117. #define SB2 (SJ2 | BF2)
  118. #define SJ2N (NA | SJ2)
  119. #define SB2N (NA | SJ2 | BF2)
  120. #define SJ12N (NA | SJ1 | SJ2)
  121. #define BF12N (NA | BF1 | BF2)
  122. #define SB12N (NA | SJ1 | SJ2 | BF1 | BF2)
  123. #define S2B12N (NA | SJ2 | BF1 | BF2)
  124. #define LSB2 (LET | SB2)
  125. #define PSB2 (PUNC| SB2)
  126. /* Character type codes */
  127. /* 0, 1, 2, 3, 4, 5, 6, 7, */
  128. /* 8, 9, A, B, C, D, E, F, Hex */
  129. 000, 000, 000, 000, 000, 000, 000, 000, /* 00 */
  130. 000, HSPA, SPA, SPA, SPA, SPA, 000, 000, /* 08 */
  131. 000, 000, 000, 000, 000, 000, 000, 000, /* 10 */
  132. /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts() */
  133. 000, LET, LET, 000, 000, 000, 000, HSPA, /* 18 */
  134. HSPA, PUNC, QUO, PUNC, 000, PUNC, PUNC, QUO, /* 20 !"#$%&' */
  135. PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, DOT, PUNC, /* 28 ()*+,-./ */
  136. DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, /* 30 01234567 */
  137. DIG, DIG, PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, /* 38 89:;<=>? */
  138. SB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 40 @ABCDEFG */
  139. LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 48 HIJKLMNO */
  140. LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 50 PQRSTUVW */
  141. LSB2, LSB2, LSB2, PSB2, SB2, PSB2, PSB2, LSB2, /* 58 XYZ[\]^_ */
  142. SB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 60 `abcdefg */
  143. LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 68 hijklmno */
  144. LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 70 pqrstuvw */
  145. LSB2, LSB2, LSB2, PSB2, PSB2, PSB2, PSB2, 000, /* 78 xyz{|}~ */
  146. SB2N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, /* 80 .. 87 */
  147. SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, /* 88 .. 8F */
  148. SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, /* 90 .. 97 */
  149. SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, /* 98 .. 9F */
  150. SJ2N, S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* A0 .. A7 */
  151. S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* A8 .. AF */
  152. S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* B0 .. B7 */
  153. S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* B8 .. BF */
  154. S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* C0 .. C7 */
  155. S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* C8 .. CF */
  156. S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* D0 .. D7 */
  157. S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* D8 .. DF */
  158. SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, /* E0 .. E7 */
  159. SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, /* E8 .. EF */
  160. SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, /* F0 .. F7 */
  161. SB12N, SB12N, SB12N, SB12N, SB12N, BF12N, BF12N, NA, /* F8 .. FF */
  162. };
  163. /*
  164. * For ISO2022_JP multi-byte character encoding.
  165. */
  166. #define IS1 0x100 /* 1st byte of shift-sequence */
  167. #define IS2 0x200 /* 2nd byte of shift-sequence */
  168. #define IS3 0x400 /* 3rd byte of shift-sequence */
  169. #define IS4 0x800 /* 4th byte of shift-sequence */
  170. #define IJP 0x1000 /* 1st or 2nd byte of ISO-2022-JP (ISO-2022-JP1) */
  171. #define PIJP (PUNC | IJP)
  172. #define QIJP (QUO | IJP)
  173. #define DTJP (DOT | IJP)
  174. #define DGJP (DIG | IJP)
  175. #define LIJP (LET | IJP)
  176. #define JPS2 (IJP | IS2)
  177. #define PJPS23 (PIJP | IS2 | IS3)
  178. #define LJPS3 (LIJP | IS3)
  179. #define LJPS4 (LIJP | IS4)
  180. static short type_iso2022_jp[ UCHARMAX + 1] = {
  181. /* Character type codes */
  182. /* 0, 1, 2, 3, 4, 5, 6, 7, */
  183. /* 8, 9, A, B, C, D, E, F, Hex */
  184. 000, 000, 000, 000, 000, 000, 000, 000, /* 00 */
  185. 000, HSPA, SPA, SPA, SPA, SPA, 000, 000, /* 08 */
  186. 000, 000, 000, 000, 000, 000, 000, 000, /* 10 */
  187. /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts() */
  188. 000, LET, LET, IS1, 000, 000, 000, HSPA, /* 18 */
  189. HSPA, PIJP, QIJP, PIJP, JPS2, PIJP, PIJP, QIJP, /* 20 !"#$%&' */
  190. PJPS23,PIJP, PIJP, PIJP, PIJP, PIJP, DTJP, PIJP, /* 28 ()*+,-./ */
  191. DGJP, DGJP, DGJP, DGJP, DGJP, DGJP, DGJP, DGJP, /* 30 01234567 */
  192. DGJP, DGJP, PIJP, PIJP, PIJP, PIJP, PIJP, PIJP, /* 38 89:;<=>? */
  193. IJP, LIJP, LJPS3, LIJP, LJPS4, LIJP, LIJP, LIJP, /* 40 @ABCDEFG */
  194. LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 48 HIJKLMNO */
  195. LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 50 PQRSTUVW */
  196. LIJP, LIJP, LIJP, PIJP, IJP, PIJP, PIJP, LIJP, /* 58 XYZ[\]^_ */
  197. IJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 60 `abcdefg */
  198. LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 68 hijklmno */
  199. LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 70 pqrstuvw */
  200. LIJP, LIJP, LIJP, PIJP, PIJP, PIJP, PIJP, 000, /* 78 xyz{|}~ */
  201. NA, NA, NA, NA, NA, NA, NA, NA, /* 80 .. 87 */
  202. NA, NA, NA, NA, NA, NA, NA, NA, /* 88 .. 8F */
  203. NA, NA, NA, NA, NA, NA, NA, NA, /* 90 .. 97 */
  204. NA, NA, NA, NA, NA, NA, NA, NA, /* 98 .. 9F */
  205. NA, NA, NA, NA, NA, NA, NA, NA, /* A0 .. A7 */
  206. NA, NA, NA, NA, NA, NA, NA, NA, /* A8 .. AF */
  207. NA, NA, NA, NA, NA, NA, NA, NA, /* B0 .. B7 */
  208. NA, NA, NA, NA, NA, NA, NA, NA, /* B8 .. BF */
  209. NA, NA, NA, NA, NA, NA, NA, NA, /* C0 .. C7 */
  210. NA, NA, NA, NA, NA, NA, NA, NA, /* C8 .. CF */
  211. NA, NA, NA, NA, NA, NA, NA, NA, /* D0 .. D7 */
  212. NA, NA, NA, NA, NA, NA, NA, NA, /* D8 .. DF */
  213. NA, NA, NA, NA, NA, NA, NA, NA, /* E0 .. E7 */
  214. NA, NA, NA, NA, NA, NA, NA, NA, /* E8 .. EF */
  215. NA, NA, NA, NA, NA, NA, NA, NA, /* F0 .. F7 */
  216. NA, NA, NA, NA, NA, NA, NA, NA, /* F8 .. FF */
  217. };
  218. /*
  219. * For UTF8 multi-byte character encoding.
  220. */
  221. #define U2_1 0x100 /* 1st byte of 2-byte encoding of UTF8 */
  222. #define U3_1 0x200 /* 1st byte of 3-byte encoding of UTF8 */
  223. #define U4_1 0x400 /* 1st byte of 4-byte encoding of UTF8 */
  224. #define UCONT 0x800 /* Continuation of a 2, 3, or 4 byte UTF8 sequence */
  225. #define U2_1N (NA | U2_1)
  226. #define U3_1N (NA | U3_1)
  227. #define U4_1N (NA | U4_1)
  228. #define UCONTN (NA | UCONT)
  229. static short type_utf8[ UCHARMAX + 1] = {
  230. /* Character type codes */
  231. /* 0, 1, 2, 3, 4, 5, 6, 7, */
  232. /* 8, 9, A, B, C, D, E, F, Hex */
  233. 000, 000, 000, 000, 000, 000, 000, 000, /* 00 */
  234. 000, HSPA, SPA, SPA, SPA, SPA, 000, 000, /* 08 */
  235. 000, 000, 000, 000, 000, 000, 000, 000, /* 10 */
  236. /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts() */
  237. 000, LET, LET, 000, 000, 000, 000, HSPA, /* 18 */
  238. HSPA, PUNC, QUO, PUNC, 000, PUNC, PUNC, QUO, /* 20 !"#$%&' */
  239. PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, DOT, PUNC, /* 28 ()*+,-./ */
  240. DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, /* 30 01234567 */
  241. DIG, DIG, PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, /* 38 89:;<=>? */
  242. 000, LET, LET, LET, LET, LET, LET, LET, /* 40 @ABCDEFG */
  243. LET, LET, LET, LET, LET, LET, LET, LET, /* 48 HIJKLMNO */
  244. LET, LET, LET, LET, LET, LET, LET, LET, /* 50 PQRSTUVW */
  245. LET, LET, LET, PUNC, 000, PUNC, PUNC, LET, /* 58 XYZ[\]^_ */
  246. 000, LET, LET, LET, LET, LET, LET, LET, /* 60 `abcdefg */
  247. LET, LET, LET, LET, LET, LET, LET, LET, /* 68 hijklmno */
  248. LET, LET, LET, LET, LET, LET, LET, LET, /* 70 pqrstuvw */
  249. LET, LET, LET, PUNC, PUNC, PUNC, PUNC, 000, /* 78 xyz{|}~ */
  250. UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* 80 .. 87 */
  251. UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* 88 .. 8F */
  252. UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* 90 .. 97 */
  253. UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* 98 .. 9F */
  254. UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* A0 .. A7 */
  255. UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* A8 .. AF */
  256. UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* B0 .. B7 */
  257. UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* B8 .. BF */
  258. NA, NA, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, /* C0 .. C7 */
  259. U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, /* C8 .. CF */
  260. U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, /* D0 .. D7 */
  261. U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, /* D8 .. DF */
  262. U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, /* E0 .. E7 */
  263. U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, /* E8 .. EF */
  264. U4_1N, U4_1N, U4_1N, U4_1N, U4_1N, NA, NA, NA, /* F0 .. F7 */
  265. NA, NA, NA, NA, NA, NA, NA, NA, /* F8 .. FF */
  266. };
  267. #define SETLOCALE 2 /* #pragma setlocale (not __setlocale) */
  268. #define NUM_ENCODING 8
  269. #define NUM_ALIAS 6
  270. /* Names of encoding recognized. Table for search_encoding(). */
  271. static const char * const encoding_name[ NUM_ENCODING][ NUM_ALIAS] = {
  272. /* Visual C full, Visual C short
  273. , 4 miscellaneous */
  274. { "english", "c"
  275. , "c", "en", "latin", "iso8859"},
  276. { "", ""
  277. , "eucjp", "euc", "ujis", ""},
  278. { "chinesesimplified", "chs"
  279. , "gb2312", "cngb", "euccn", ""},
  280. { "korean", "kor"
  281. , "ksc5601", "ksx1001", "wansung", "euckr"},
  282. { "japanese", "jpn"
  283. , "sjis", "shiftjis", "mskanji", ""},
  284. { "chinesetraditional", "cht"
  285. , "bigfive", "big5", "cnbig5", "euctw"},
  286. { "", ""
  287. , "iso2022jp", "iso2022jp1", "jis", ""},
  288. { "", ""
  289. , "utf8", "utf", "", ""},
  290. };
  291. static int mbstart;
  292. static int mb2;
  293. static size_t mb_read_2byte( int c1, char ** in_pp, char ** out_pp);
  294. /* For 2-byte encodings of mbchar */
  295. static const char * search_encoding( char * norm, int alias);
  296. /* Search encoding_name[][] table */
  297. static void strip_bar( char * string);
  298. /* Remove '_', '-' or '.' in the string */
  299. static void conv_case( char * name, char * lim, int upper);
  300. /* Convert to upper/lower case */
  301. static size_t mb_read_iso2022_jp( int c1, char ** in_pp, char ** out_pp);
  302. /* For ISO2022_JP encoding */
  303. static size_t mb_read_utf8( int c1, char ** in_pp, char ** out_pp);
  304. /* For UTF8 mbchar encoding */
  305. #define NAMLEN 20
  306. #define UPPER 1 /* To upper */
  307. #define LOWER 0 /* To lower */
  308. const char * set_encoding(
  309. char * name, /* Name of encoding specified */
  310. char * env, /* Name of environment variable */
  311. int pragma
  312. /* 2: #pragma setlocale, 1: #pragma __setlocale, 0: not #pragma */
  313. )
  314. /*
  315. * Search the encoding specified and re-initialize mbchar settings.
  316. */
  317. {
  318. const char * unknown_encoding
  319. = "Unknown encoding: %s%.0ld%.0s"; /* _W1_ */
  320. const char * too_long
  321. = "Too long encoding name: %s%.0ld%.0s"; /* _E_ */
  322. const char * loc = "";
  323. int alias;
  324. char norm[ NAMLEN];
  325. memset(norm, 0, NAMLEN);
  326. /*
  327. * Normalized name (removed 'xxxxx.', stripped '_', '-', '.'
  328. * and lowered.
  329. */
  330. if (strlen( name) >= NAMLEN) {
  331. if ((env || pragma) && (warn_level & 1)) {
  332. cwarn( too_long, name, 0L, NULL);
  333. } else {
  334. mcpp_fprintf( ERR, too_long, name);
  335. mcpp_fputc( '\n', ERR);
  336. }
  337. }
  338. strcpy( norm, name);
  339. if (norm[ 5] == '.')
  340. memmove( norm, norm + 5, strlen( norm + 5) + 1);
  341. /* Remove initial 'xxxxx.' as 'ja_JP.', 'en_US.' or any other */
  342. conv_case( norm, norm + strlen( norm), LOWER);
  343. strip_bar( norm);
  344. if (strlen( name) == 0) { /* "" */
  345. mbchar = MBCHAR; /* Restore to the default encoding */
  346. } else if (memcmp( norm, "iso8859", 7) == 0 /* iso8859* */
  347. || memcmp( norm, "latin", 5) == 0 /* latin* */
  348. || memcmp( norm, "en", 2) == 0) { /* en* */
  349. mbchar = 0; /* No multi-byte character */
  350. } else {
  351. alias = 2;
  352. #if COMPILER == MSC
  353. if (pragma == SETLOCALE) /* #pragma setlocale */
  354. alias = 0;
  355. #endif
  356. loc = search_encoding( norm, alias); /* Search the name */
  357. }
  358. if (loc == NULL) {
  359. if ((env || pragma) && (warn_level & 1)) {
  360. cwarn( unknown_encoding, name, 0L, NULL);
  361. } else { /* -m option */
  362. mcpp_fprintf( ERR, unknown_encoding, name);
  363. mcpp_fputc( '\n', ERR);
  364. }
  365. } else {
  366. mb_init(); /* Re-initialize */
  367. }
  368. return loc;
  369. }
  370. static const char * search_encoding(
  371. char * norm, /* The name of encoding specified */
  372. int alias /* The number of alias to start searching */
  373. )
  374. {
  375. const char * loc;
  376. int lo, al;
  377. for (lo = 0; lo < NUM_ENCODING; lo++) {
  378. for (al = alias ; al < NUM_ALIAS; al++) {
  379. loc = encoding_name[ lo][ al];
  380. if (str_eq( loc, norm)) {
  381. switch (lo) {
  382. case 0 : mbchar = 0; break;
  383. case 1 : mbchar = EUC_JP; break;
  384. case 2 : mbchar = GB2312; break;
  385. case 3 : mbchar = KSC5601; break;
  386. case 4 : mbchar = SJIS; break;
  387. case 5 : mbchar = BIGFIVE; break;
  388. case 6 : mbchar = ISO2022_JP; break;
  389. case 7 : mbchar = UTF8; break;
  390. }
  391. return loc;
  392. }
  393. }
  394. }
  395. return NULL;
  396. }
  397. static void strip_bar(
  398. char * string
  399. )
  400. /*
  401. * Strip '_', '-' or '.' in the string.
  402. */
  403. {
  404. char * cp = string;
  405. while (*cp != EOS) {
  406. if (*cp == '_' || *cp == '-' || *cp == '.')
  407. memmove( cp, cp + 1, strlen( cp));
  408. else
  409. cp++;
  410. }
  411. }
  412. static void conv_case(
  413. char * name, /* (diretory) Name */
  414. char * lim, /* End of (directory) name */
  415. int upper /* TRUE if to upper */
  416. )
  417. /* Convert a string to upper-case letters or lower-case letters in-place */
  418. {
  419. int c;
  420. char * sp;
  421. for (sp = name; sp < lim; sp++) {
  422. c = *sp & UCHARMAX;
  423. #if MBCHAR
  424. if ((char_type[ c] & mbstart)) {
  425. char tmp[ PATHMAX+1];
  426. char * tp = tmp;
  427. *tp++ = *sp++;
  428. mb_read( c, &sp, &tp);
  429. } else
  430. #endif
  431. {
  432. if (upper)
  433. *sp = toupper( c);
  434. else
  435. *sp = tolower( c);
  436. }
  437. }
  438. }
  439. void mb_init( void)
  440. /*
  441. * Initialize multi-byte character settings.
  442. * First called prior to setting the 'mcpp_mode'.
  443. * Will be called again each time the multibyte character encoding is changed.
  444. */
  445. {
  446. /*
  447. * Select the character classification table, select the multi-byte
  448. * character reading routine and decide whether multi-byte character
  449. * may contain the byte of value 0x5c.
  450. */
  451. switch (mbchar) {
  452. case 0 :
  453. case EUC_JP :
  454. case GB2312 :
  455. case KSC5601 :
  456. char_type = type_euc;
  457. bsl_in_mbchar = FALSE;
  458. mb_read = mb_read_2byte;
  459. break;
  460. case SJIS :
  461. case BIGFIVE :
  462. char_type = type_bsl;
  463. bsl_in_mbchar = TRUE;
  464. mb_read = mb_read_2byte;
  465. break;
  466. case ISO2022_JP :
  467. char_type = type_iso2022_jp;
  468. bsl_in_mbchar = TRUE;
  469. mb_read = mb_read_iso2022_jp;
  470. break;
  471. case UTF8 :
  472. char_type = type_utf8;
  473. bsl_in_mbchar = FALSE;
  474. mb_read = mb_read_utf8;
  475. break;
  476. }
  477. /* Set the bit patterns for character classification. */
  478. switch (mbchar) {
  479. case 0 :
  480. mbstart = 0;
  481. break;
  482. case EUC_JP :
  483. mbstart = EJ1;
  484. mb2 = EJ2;
  485. break;
  486. case GB2312 :
  487. mbstart = GB1;
  488. mb2 = GB2;
  489. break;
  490. case KSC5601:
  491. mbstart = KS1;
  492. mb2 = KS2;
  493. break;
  494. case SJIS :
  495. mbstart = SJ1;
  496. mb2 = SJ2;
  497. break;
  498. case BIGFIVE:
  499. mbstart = BF1;
  500. mb2 = BF2;
  501. break;
  502. case ISO2022_JP :
  503. mbstart = IS1;
  504. break;
  505. case UTF8 :
  506. mbstart = (U2_1 | U3_1 | U4_1);
  507. break;
  508. }
  509. switch (mbchar) {
  510. case 0 :
  511. mbchk = 0;
  512. break;
  513. case EUC_JP :
  514. case GB2312 :
  515. case KSC5601:
  516. case SJIS :
  517. case BIGFIVE:
  518. case UTF8 :
  519. mbchk = NA;
  520. break;
  521. case ISO2022_JP :
  522. mbchk = (IS1 | NA);
  523. break;
  524. }
  525. /*
  526. * Set special handling for some encodings to supplement some compiler's
  527. * deficiency.
  528. */
  529. switch (mbchar) {
  530. case SJIS :
  531. #if ! SJIS_IS_ESCAPE_FREE
  532. bsl_need_escape = TRUE;
  533. #endif
  534. break;
  535. case BIGFIVE:
  536. #if ! BIGFIVE_IS_ESCAPE_FREE
  537. bsl_need_escape = TRUE;
  538. #endif
  539. break;
  540. case ISO2022_JP :
  541. #if ! ISO2022_JP_IS_ESCAPE_FREE
  542. bsl_need_escape = TRUE;
  543. #endif
  544. break;
  545. default :
  546. bsl_need_escape = FALSE;
  547. break;
  548. }
  549. /*
  550. * Modify magic characters in character type table.
  551. * char_type[] table should be rewritten in accordance with the 'mcpp_mode'
  552. * whenever the encoding is changed.
  553. */
  554. if (mcpp_mode) { /* If mcpp_mode is already set */
  555. char_type[ DEF_MAGIC] = standard ? LET : 0;
  556. char_type[ IN_SRC] = (mcpp_mode == STD) ? LET : 0;
  557. char_type[ TOK_SEP] = (mcpp_mode == STD || mcpp_mode == OLD_PREP)
  558. ? HSPA: 0; /* TOK_SEP equals to COM_SEP */
  559. }
  560. }
  561. static size_t mb_read_2byte(
  562. int c1, /* The 1st byte of mbchar sequence (already read) */
  563. char ** in_pp, /* Pointer to input */
  564. char ** out_pp /* Pointer to output */
  565. )
  566. /*
  567. * Multi-byte character reading routine for 2-byte encodings.
  568. */
  569. {
  570. int error = FALSE;
  571. size_t len = 0; /* Number of multi-byte characters read. */
  572. char * in_p = *in_pp;
  573. char * out_p = *out_pp;
  574. if (! (char_type[ c1 & UCHARMAX] & mbstart))
  575. return MB_ERROR; /* Not a multi-byte character */
  576. do {
  577. if (! (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & mb2)) {
  578. error = TRUE;
  579. break;
  580. }
  581. len++;
  582. } while (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & mbstart);
  583. *in_pp = --in_p;
  584. *(--out_p) = EOS;
  585. *out_pp = out_p;
  586. return error ? (len | MB_ERROR) : len;
  587. }
  588. static size_t mb_read_iso2022_jp(
  589. int c1, /* The 1st byte of the sequence already read (always 0x1b). */
  590. char ** in_pp,
  591. char ** out_pp
  592. )
  593. /*
  594. * Multi-byte character reading routine for ISO2022_JP.
  595. */
  596. {
  597. int error = FALSE;
  598. size_t len = 0;
  599. char * in_p = *in_pp;
  600. char * out_p = *out_pp;
  601. int c2, c3, c4;
  602. if (! (char_type[ c1 & UCHARMAX] & mbstart))
  603. return MB_ERROR;
  604. do {
  605. *out_p++ = c2 = *in_p++;
  606. if (! (char_type[ c2 & UCHARMAX] & IS2)) {
  607. error = TRUE;
  608. break;
  609. }
  610. *out_p++ = c3 = *in_p++;
  611. if (! (char_type[ c3 & UCHARMAX] & IS3)) {
  612. error = TRUE;
  613. break;
  614. }
  615. switch (c2) {
  616. case 0x24 :
  617. switch (c3) {
  618. case 0x42 : /* 0x1b 0x24 0x42: JIS X 0208-1983 */
  619. break;
  620. case 0x28 :
  621. *out_p++ = c4 = *in_p++;
  622. if (! (char_type[ c4 & UCHARMAX] & IS4))
  623. error = TRUE;
  624. /* else: 0x1b 0x24 0x28 0x44: JIS X 0212 */
  625. break;
  626. default :
  627. error = TRUE;
  628. }
  629. break;
  630. case 0x28 :
  631. switch (c3) {
  632. case 0x42 : /* 0x1b 0x28 0x42: ASCII */
  633. c1 = *out_p++ = *in_p++ & UCHARMAX;
  634. continue;
  635. default :
  636. error = TRUE;
  637. }
  638. break;
  639. }
  640. if (error)
  641. break;
  642. while (char_type[ c1 = *out_p++ = (*in_p++ & UCHARMAX)] & IJP) {
  643. if (! (char_type[ *out_p++ = (*in_p++ & UCHARMAX)] & IJP)) {
  644. error = TRUE;
  645. break;
  646. }
  647. len++; /* String of multi-byte characters */
  648. }
  649. if (error)
  650. break;
  651. } while (char_type[ c1] & IS1); /* 0x1b: start of shift-sequence */
  652. *in_pp = --in_p;
  653. *(--out_p) = EOS;
  654. *out_pp = out_p;
  655. return error ? (len | MB_ERROR) : len;
  656. }
  657. static size_t mb_read_utf8(
  658. int c1,
  659. char ** in_pp,
  660. char ** out_pp
  661. )
  662. /*
  663. * Multi-byte character reading routine for UTF8.
  664. */
  665. {
  666. int error = FALSE;
  667. size_t len = 0;
  668. char * in_p = *in_pp;
  669. char * out_p = *out_pp;
  670. if (! (char_type[ c1 & UCHARMAX] & mbstart))
  671. return MB_ERROR;
  672. do {
  673. unsigned int codepoint;
  674. int i, bytes;
  675. if ((char_type[ c1 & UCHARMAX] & U4_1) == U4_1)
  676. bytes = 4; /* 4-byte character */
  677. else if ((char_type[ c1 & UCHARMAX] & U3_1) == U3_1)
  678. bytes = 3; /* 3-byte character */
  679. else if ((char_type[ c1 & UCHARMAX] & U2_1) == U2_1)
  680. bytes = 2; /* 2-byte character */
  681. /* Must ensure that the sequence is not reserved as a surrogate */
  682. codepoint = ((2 << (6-bytes)) - 1) & c1; /* mask off top bits */
  683. /* All bytes left in the sequence must be in 0x80 - 0xBF */
  684. for (i = bytes - 1; i && !error; i--) {
  685. codepoint = (codepoint << 6) + ((*in_p) & 0x3fU);
  686. if (! (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & UCONT))
  687. error = TRUE;
  688. }
  689. /* Check for overlong/underlong sequences */
  690. if ((bytes == 2 && (codepoint < 0x80 || codepoint > 0x7FF))
  691. || (bytes == 3 && (codepoint < 0x800 || codepoint > 0xFFFF))
  692. || (bytes == 4 && (codepoint < 0x10000 || codepoint > 0x10FFFF)))
  693. error = TRUE;
  694. if ((codepoint >= 0xD800 && codepoint <= 0xDFFF)
  695. /* Check for reserved surrogate codepoints */
  696. || (codepoint >= 0xFFFE && codepoint <= 0xFFFF))
  697. /* Illegal */
  698. error = TRUE;
  699. #if 0
  700. printf( "codepoint:0x%x\n", codepoint);
  701. #endif
  702. if (error)
  703. break;
  704. len++;
  705. } while (char_type[ (*out_p++ = c1 = *in_p++) & UCHARMAX] & mbstart);
  706. /* Start of the next multi-byte character */
  707. *in_pp = --in_p;
  708. *(--out_p) = EOS;
  709. *out_pp = out_p;
  710. return error ? (len | MB_ERROR) : len;
  711. }
  712. uexpr_t mb_eval(
  713. char ** seq_pp
  714. )
  715. /*
  716. * Evaluate the value of a multi-byte character.
  717. * This routine does not check the legality of the sequence.
  718. * This routine is called from eval_char().
  719. * This routine is never called in POST_STD mode.
  720. */
  721. {
  722. char * seq = *seq_pp;
  723. uexpr_t val = 0;
  724. int c, c1;
  725. if (! (char_type[ c = *seq++ & UCHARMAX] & mbstart)) {
  726. *seq_pp = seq;
  727. return c; /* Not a multi-byte character */
  728. }
  729. switch (mbchar) {
  730. case EUC_JP :
  731. case GB2312 :
  732. case KSC5601:
  733. case SJIS :
  734. case BIGFIVE:
  735. val = (c << 8) + (*seq++ & UCHARMAX);
  736. /* Evaluate the 2-byte sequence */
  737. break;
  738. case ISO2022_JP :
  739. if (char_type[ c & UCHARMAX] & IS1) { /* Skip shift-sequence */
  740. if (char_type[ c = *seq++ & UCHARMAX] & IS2) {
  741. if (char_type[ c1 = *seq++ & UCHARMAX] & IS3) {
  742. if (c1 == 0x28)
  743. seq++;
  744. if (c == 0x28 && c1 == 0x42) { /* Shift-out sequence */
  745. val = 0;
  746. break;
  747. }
  748. c = *seq++ & UCHARMAX;
  749. }
  750. }
  751. }
  752. val = (c << 8) + (*seq++ & UCHARMAX); /* Evaluate the 2-bytes */
  753. break;
  754. case UTF8 : /* Evaluate the sequence of 2, 3 or 4 bytes as it is */
  755. val = (c << 8) + (*seq++ & UCHARMAX);
  756. if (char_type[ c & UCHARMAX] & U3_1) {
  757. val = (val << 8) + (*seq++ & UCHARMAX);
  758. } else if (char_type[ c & UCHARMAX] & U4_1) {
  759. val = (val << 8) + (*seq++ & UCHARMAX);
  760. val = (val << 8) + (*seq++ & UCHARMAX);
  761. }
  762. break;
  763. }
  764. *seq_pp = seq;
  765. return val;
  766. }
  767. int last_is_mbchar(
  768. const char * in, /* Input physical line */
  769. int len /* Length of the line minus 2 */
  770. )
  771. /*
  772. * Return 2, if the last char of the line is second byte of SJIS or BIGFIVE,
  773. * else return 0.
  774. */
  775. {
  776. const char * cp = in + len;
  777. const char * const endp = in + len; /* -> the char befor '\n' */
  778. if ((mbchar & (SJIS | BIGFIVE)) == 0)
  779. return 0;
  780. while (in <= --cp) { /* Search backwardly */
  781. if ((char_type[ *cp & UCHARMAX] & mbstart) == 0)
  782. break; /* Not the first byte of MBCHAR */
  783. }
  784. if ((endp - cp) & 1)
  785. return 0;
  786. else
  787. return 2;
  788. }