read.c 47 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851
  1. /* Copyright 1995-1997,1999-2001,2003-2004,2006-2012,2014-2021
  2. Free Software Foundation, Inc.
  3. This file is part of Guile.
  4. Guile is free software: you can redistribute it and/or modify it
  5. under the terms of the GNU Lesser General Public License as published
  6. by the Free Software Foundation, either version 3 of the License, or
  7. (at your option) any later version.
  8. Guile is distributed in the hope that it will be useful, but WITHOUT
  9. ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10. FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
  11. License for more details.
  12. You should have received a copy of the GNU Lesser General Public
  13. License along with Guile. If not, see
  14. <https://www.gnu.org/licenses/>. */
  15. #ifdef HAVE_CONFIG_H
  16. # include <config.h>
  17. #endif
  18. #include <alloca.h>
  19. #include <c-ctype.h>
  20. #include <c-strcase.h>
  21. #include <stdio.h>
  22. #include <string.h>
  23. #include <unicase.h>
  24. #include <unictype.h>
  25. #include <unistd.h>
  26. #include "alist.h"
  27. #include "arrays.h"
  28. #include "bitvectors.h"
  29. #include "boolean.h"
  30. #include "bytevectors.h"
  31. #include "chars.h"
  32. #include "eq.h"
  33. #include "eval.h"
  34. #include "fluids.h"
  35. #include "fports.h"
  36. #include "gsubr.h"
  37. #include "hash.h"
  38. #include "hashtab.h"
  39. #include "keywords.h"
  40. #include "modules.h"
  41. #include "numbers.h"
  42. #include "pairs.h"
  43. #include "ports-internal.h"
  44. #include "ports.h"
  45. #include "private-options.h"
  46. #include "procs.h"
  47. #include "srcprop.h"
  48. #include "srfi-13.h"
  49. #include "srfi-4.h"
  50. #include "strings.h"
  51. #include "strports.h"
  52. #include "symbols.h"
  53. #include "variable.h"
  54. #include "vectors.h"
  55. #include "read.h"
  56. SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  57. SCM_SYMBOL (sym_nil, "nil");
  58. SCM_SYMBOL (sym_ISO_8859_1, "ISO-8859-1");
  59. scm_t_option scm_read_opts[] =
  60. {
  61. { SCM_OPTION_BOOLEAN, "positions", 1,
  62. "Record positions of source code expressions." },
  63. { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  64. "Convert symbols to lower case."},
  65. { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  66. "Style of keyword recognition: #f, 'prefix or 'postfix."},
  67. { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  68. "Use R6RS variable-length character and string hex escapes."},
  69. { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  70. "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  71. { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  72. "In strings, consume leading whitespace after an escaped end-of-line."},
  73. { SCM_OPTION_BOOLEAN, "curly-infix", 0,
  74. "Support SRFI-105 curly infix expressions."},
  75. { SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
  76. "Support R7RS |...| symbol notation."},
  77. { 0, },
  78. };
  79. void
  80. scm_i_input_error (char const *function,
  81. SCM port, const char *message, SCM arg)
  82. {
  83. SCM fn = (scm_is_string (SCM_FILENAME(port))
  84. ? SCM_FILENAME(port)
  85. : scm_from_utf8_string ("#<unknown port>"));
  86. SCM string_port = scm_open_output_string ();
  87. SCM string = SCM_EOL;
  88. scm_simple_format (string_port,
  89. scm_from_utf8_string ("~A:~S:~S: ~A"),
  90. scm_list_4 (fn,
  91. scm_sum (scm_port_line (port), SCM_INUM1),
  92. scm_sum (scm_port_column (port), SCM_INUM1),
  93. scm_from_utf8_string (message)));
  94. string = scm_get_output_string (string_port);
  95. scm_close_output_port (string_port);
  96. scm_error_scm (scm_from_utf8_symbol ("read-error"),
  97. function? scm_from_utf8_string (function) : SCM_BOOL_F,
  98. string,
  99. arg,
  100. SCM_BOOL_F);
  101. }
  102. SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
  103. (SCM setting),
  104. "Option interface for the read options. Instead of using\n"
  105. "this procedure directly, use the procedures @code{read-enable},\n"
  106. "@code{read-disable}, @code{read-set!} and @code{read-options}.")
  107. #define FUNC_NAME s_scm_read_options
  108. {
  109. return scm_options (setting, scm_read_opts, FUNC_NAME);
  110. }
  111. #undef FUNC_NAME
  112. /* Token readers. */
  113. /* Size of the C buffer used to read symbols and numbers. */
  114. #define READER_BUFFER_SIZE 128
  115. /* Number of 32-bit codepoints in the buffer used to read strings. */
  116. #define READER_STRING_BUFFER_SIZE 128
  117. /* The maximum size of Scheme character names. */
  118. #define READER_CHAR_NAME_MAX_SIZE 50
  119. /* The maximum size of reader directive names. */
  120. #define READER_DIRECTIVE_NAME_MAX_SIZE 50
  121. /* `isblank' is only in C99. */
  122. #define CHAR_IS_BLANK_(_chr) \
  123. (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
  124. || ((_chr) == '\f') || ((_chr) == '\r'))
  125. #ifdef MSDOS
  126. # define CHAR_IS_BLANK(_chr) \
  127. ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
  128. #else
  129. # define CHAR_IS_BLANK CHAR_IS_BLANK_
  130. #endif
  131. /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
  132. structure''). */
  133. #define CHAR_IS_R5RS_DELIMITER(c) \
  134. (CHAR_IS_BLANK (c) \
  135. || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
  136. #define CHAR_IS_DELIMITER(c) \
  137. (CHAR_IS_R5RS_DELIMITER (c) \
  138. || (((c) == ']' || (c) == '[')))
  139. /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
  140. Structure''. */
  141. #define CHAR_IS_EXPONENT_MARKER(_chr) \
  142. (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
  143. || ((_chr) == 'd') || ((_chr) == 'l'))
  144. /* Read an SCSH block comment. */
  145. static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
  146. static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
  147. static SCM scm_read_commented_expression (scm_t_wchar, SCM);
  148. static SCM scm_read_shebang (scm_t_wchar, SCM);
  149. static SCM scm_get_hash_procedure (int);
  150. /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
  151. result in the pre-allocated buffer BUF. Return zero if the whole token has
  152. fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
  153. bytes actually read. */
  154. static int
  155. read_token (SCM port, char *buf, size_t buf_size, size_t *read)
  156. {
  157. *read = 0;
  158. while (*read < buf_size)
  159. {
  160. int chr;
  161. chr = scm_get_byte_or_eof (port);
  162. if (chr == EOF)
  163. return 0;
  164. else if (CHAR_IS_DELIMITER (chr))
  165. {
  166. scm_unget_byte (chr, port);
  167. return 0;
  168. }
  169. else
  170. {
  171. *buf = (char) chr;
  172. buf++, (*read)++;
  173. }
  174. }
  175. return 1;
  176. }
  177. /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
  178. if the token doesn't fit in BUFFER_SIZE bytes. */
  179. static char *
  180. read_complete_token (SCM port, char *buffer, size_t buffer_size, size_t *read)
  181. {
  182. int overflow = 0;
  183. size_t bytes_read, overflow_size = 0;
  184. char *overflow_buffer = NULL;
  185. do
  186. {
  187. overflow = read_token (port, buffer, buffer_size, &bytes_read);
  188. if (bytes_read == 0)
  189. break;
  190. if (overflow || overflow_size != 0)
  191. {
  192. if (overflow_size == 0)
  193. {
  194. overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
  195. memcpy (overflow_buffer, buffer, bytes_read);
  196. overflow_size = bytes_read;
  197. }
  198. else
  199. {
  200. char *new_buf =
  201. scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
  202. memcpy (new_buf, overflow_buffer, overflow_size);
  203. memcpy (new_buf + overflow_size, buffer, bytes_read);
  204. overflow_buffer = new_buf;
  205. overflow_size += bytes_read;
  206. }
  207. }
  208. }
  209. while (overflow);
  210. if (overflow_size)
  211. *read = overflow_size;
  212. else
  213. *read = bytes_read;
  214. return overflow_size > 0 ? overflow_buffer : buffer;
  215. }
  216. /* Skip whitespace from PORT and return the first non-whitespace character
  217. read. Raise an error on end-of-file. */
  218. static int
  219. flush_ws (SCM port, const char *eoferr)
  220. {
  221. scm_t_wchar c;
  222. while (1)
  223. switch (c = scm_getc (port))
  224. {
  225. case EOF:
  226. goteof:
  227. if (eoferr)
  228. {
  229. scm_i_input_error (eoferr,
  230. port,
  231. "end of file",
  232. SCM_EOL);
  233. }
  234. return c;
  235. case ';':
  236. lp:
  237. switch (c = scm_getc (port))
  238. {
  239. case EOF:
  240. goto goteof;
  241. default:
  242. goto lp;
  243. case SCM_LINE_INCREMENTORS:
  244. break;
  245. }
  246. break;
  247. case '#':
  248. switch (c = scm_getc (port))
  249. {
  250. case EOF:
  251. eoferr = "read_sharp";
  252. goto goteof;
  253. case '!':
  254. scm_read_shebang (c, port);
  255. break;
  256. case ';':
  257. scm_read_commented_expression (c, port);
  258. break;
  259. case '|':
  260. if (scm_is_false (scm_get_hash_procedure (c)))
  261. {
  262. scm_read_r6rs_block_comment (c, port);
  263. break;
  264. }
  265. /* fall through */
  266. default:
  267. scm_ungetc (c, port);
  268. return '#';
  269. }
  270. break;
  271. case SCM_LINE_INCREMENTORS:
  272. case SCM_SINGLE_SPACES:
  273. case '\t':
  274. break;
  275. default:
  276. return c;
  277. }
  278. return 0;
  279. }
  280. /* Token readers. */
  281. static SCM scm_read_expression (SCM port);
  282. static SCM scm_read_sharp (int chr, SCM port);
  283. static SCM
  284. scm_read_sexp (scm_t_wchar chr, SCM port)
  285. #define FUNC_NAME "scm_i_lreadparen"
  286. {
  287. int c;
  288. SCM tmp, tl, ans = SCM_EOL;
  289. const int terminating_char = (chr == '[') ? ']' : ')';
  290. c = flush_ws (port, FUNC_NAME);
  291. if (terminating_char == c)
  292. return SCM_EOL;
  293. scm_ungetc (c, port);
  294. tmp = scm_read_expression (port);
  295. /* Note that it is possible for scm_read_expression to return
  296. scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
  297. check that it's a real dot by checking `c'. */
  298. if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
  299. {
  300. ans = scm_read_expression (port);
  301. if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
  302. scm_i_input_error (FUNC_NAME, port, "missing close paren",
  303. SCM_EOL);
  304. return ans;
  305. }
  306. /* Build the head of the list structure. */
  307. ans = tl = scm_cons (tmp, SCM_EOL);
  308. while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
  309. {
  310. SCM new_tail;
  311. if (c == ')' || c == ']')
  312. scm_i_input_error (FUNC_NAME, port,
  313. "in pair: mismatched close paren: ~A",
  314. scm_list_1 (SCM_MAKE_CHAR (c)));
  315. scm_ungetc (c, port);
  316. tmp = scm_read_expression (port);
  317. /* See above note about scm_sym_dot. */
  318. if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
  319. {
  320. SCM_SETCDR (tl, scm_read_expression (port));
  321. c = flush_ws (port, FUNC_NAME);
  322. if (terminating_char != c)
  323. scm_i_input_error (FUNC_NAME, port,
  324. "in pair: missing close paren", SCM_EOL);
  325. break;
  326. }
  327. new_tail = scm_cons (tmp, SCM_EOL);
  328. SCM_SETCDR (tl, new_tail);
  329. tl = new_tail;
  330. }
  331. return ans;
  332. }
  333. #undef FUNC_NAME
  334. /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
  335. C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
  336. found. */
  337. #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
  338. do \
  339. { \
  340. scm_t_wchar a; \
  341. size_t i = 0; \
  342. c = 0; \
  343. while (i < ndigits) \
  344. { \
  345. a = scm_getc (port); \
  346. if (a == EOF) \
  347. goto str_eof; \
  348. if (terminator \
  349. && (a == (scm_t_wchar) terminator) \
  350. && (i > 0)) \
  351. break; \
  352. if ('0' <= a && a <= '9') \
  353. a -= '0'; \
  354. else if ('A' <= a && a <= 'F') \
  355. a = a - 'A' + 10; \
  356. else if ('a' <= a && a <= 'f') \
  357. a = a - 'a' + 10; \
  358. else \
  359. { \
  360. c = a; \
  361. goto bad_escaped; \
  362. } \
  363. c = c * 16 + a; \
  364. i ++; \
  365. } \
  366. } while (0)
  367. /* Read either a double-quoted string or an R7RS-style symbol delimited
  368. by vertical lines, depending on the value of 'chr' ('"' or '|').
  369. Regardless, the result is always returned as a string. */
  370. static SCM
  371. scm_read_string_like_syntax (int chr, SCM port)
  372. #define FUNC_NAME "scm_lreadr"
  373. {
  374. /* For strings smaller than C_STR, this function creates only one Scheme
  375. object (the string returned). */
  376. SCM str = SCM_EOL;
  377. size_t c_str_len = 0;
  378. scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
  379. while (chr != (c = scm_getc (port)))
  380. {
  381. if (c == EOF)
  382. {
  383. str_eof:
  384. scm_i_input_error (FUNC_NAME, port,
  385. (chr == '|'
  386. ? "end of file in symbol"
  387. : "end of file in string constant"),
  388. SCM_EOL);
  389. }
  390. if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
  391. {
  392. str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
  393. c_str_len = 0;
  394. }
  395. if (c == '\\')
  396. {
  397. switch (c = scm_getc (port))
  398. {
  399. case EOF:
  400. goto str_eof;
  401. case '|':
  402. case '\\':
  403. case '(': /* Accept "\(" for use at the beginning of lines
  404. in multiline strings to avoid confusing emacs
  405. lisp modes. */
  406. break;
  407. case '\n':
  408. continue;
  409. case '0':
  410. c = '\0';
  411. break;
  412. case 'f':
  413. c = '\f';
  414. break;
  415. case 'n':
  416. c = '\n';
  417. break;
  418. case 'r':
  419. c = '\r';
  420. break;
  421. case 't':
  422. c = '\t';
  423. break;
  424. case 'a':
  425. c = '\007';
  426. break;
  427. case 'v':
  428. c = '\v';
  429. break;
  430. case 'b':
  431. c = '\010';
  432. break;
  433. case 'x':
  434. SCM_READ_HEX_ESCAPE (2, '\0');
  435. break;
  436. case 'u':
  437. SCM_READ_HEX_ESCAPE (4, '\0');
  438. break;
  439. case 'U':
  440. SCM_READ_HEX_ESCAPE (6, '\0');
  441. break;
  442. default:
  443. if (c == chr)
  444. break;
  445. bad_escaped:
  446. scm_i_input_error (FUNC_NAME, port,
  447. "invalid character in escape sequence: ~S",
  448. scm_list_1 (SCM_MAKE_CHAR (c)));
  449. }
  450. }
  451. c_str[c_str_len++] = c;
  452. }
  453. if (scm_is_null (str))
  454. /* Fast path: we got a string that fits in C_STR. */
  455. str = scm_from_utf32_stringn (c_str, c_str_len);
  456. else
  457. {
  458. if (c_str_len > 0)
  459. str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
  460. str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
  461. }
  462. return str;
  463. }
  464. #undef FUNC_NAME
  465. static SCM
  466. scm_read_string (int chr, SCM port)
  467. {
  468. return scm_read_string_like_syntax (chr, port);
  469. }
  470. static SCM
  471. scm_read_number (scm_t_wchar chr, SCM port)
  472. {
  473. SCM result, str = SCM_EOL;
  474. char local_buffer[READER_BUFFER_SIZE], *buffer;
  475. size_t bytes_read;
  476. scm_ungetc (chr, port);
  477. buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
  478. &bytes_read);
  479. str = scm_from_port_stringn (buffer, bytes_read, port);
  480. result = scm_string_to_number (str, SCM_UNDEFINED);
  481. if (scm_is_false (result))
  482. {
  483. /* Return a symbol instead of a number */
  484. result = scm_string_to_symbol (str);
  485. }
  486. scm_set_port_column_x (port,
  487. scm_sum (scm_port_column (port),
  488. scm_string_length (str)));
  489. return result;
  490. }
  491. static SCM
  492. scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
  493. {
  494. SCM result;
  495. size_t bytes_read;
  496. char local_buffer[READER_BUFFER_SIZE], *buffer;
  497. SCM str;
  498. scm_ungetc (chr, port);
  499. buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
  500. &bytes_read);
  501. str = scm_from_port_stringn (buffer, bytes_read, port);
  502. result = scm_string_to_symbol (str);
  503. scm_set_port_column_x (port,
  504. scm_sum (scm_port_column (port),
  505. scm_string_length (str)));
  506. return result;
  507. }
  508. static SCM
  509. scm_read_number_and_radix (scm_t_wchar chr, SCM port)
  510. #define FUNC_NAME "scm_lreadr"
  511. {
  512. SCM result;
  513. size_t read;
  514. char local_buffer[READER_BUFFER_SIZE], *buffer;
  515. unsigned int radix;
  516. SCM str;
  517. switch (chr)
  518. {
  519. case 'B':
  520. case 'b':
  521. radix = 2;
  522. break;
  523. case 'o':
  524. case 'O':
  525. radix = 8;
  526. break;
  527. case 'd':
  528. case 'D':
  529. radix = 10;
  530. break;
  531. case 'x':
  532. case 'X':
  533. radix = 16;
  534. break;
  535. default:
  536. scm_ungetc (chr, port);
  537. scm_ungetc ('#', port);
  538. radix = 10;
  539. }
  540. buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
  541. &read);
  542. str = scm_from_port_stringn (buffer, read, port);
  543. result = scm_string_to_number (str, scm_from_uint (radix));
  544. scm_set_port_column_x (port,
  545. scm_sum (scm_port_column (port),
  546. scm_string_length (str)));
  547. if (scm_is_true (result))
  548. return result;
  549. scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
  550. return SCM_BOOL_F;
  551. }
  552. #undef FUNC_NAME
  553. static SCM
  554. scm_read_quote (int chr, SCM port)
  555. {
  556. SCM p;
  557. switch (chr)
  558. {
  559. case '`':
  560. p = scm_sym_quasiquote;
  561. break;
  562. case '\'':
  563. p = scm_sym_quote;
  564. break;
  565. case ',':
  566. {
  567. scm_t_wchar c;
  568. c = scm_getc (port);
  569. if ('@' == c)
  570. p = scm_sym_uq_splicing;
  571. else
  572. {
  573. scm_ungetc (c, port);
  574. p = scm_sym_unquote;
  575. }
  576. break;
  577. }
  578. default:
  579. fprintf (stderr, "%s: unhandled quote character (%i)\n",
  580. "scm_read_quote", chr);
  581. abort ();
  582. }
  583. return scm_cons2 (p, scm_read_expression (port), SCM_EOL);
  584. }
  585. SCM_SYMBOL (sym_syntax, "syntax");
  586. SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
  587. SCM_SYMBOL (sym_unsyntax, "unsyntax");
  588. SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
  589. static SCM
  590. scm_read_syntax (int chr, SCM port)
  591. {
  592. SCM p;
  593. switch (chr)
  594. {
  595. case '`':
  596. p = sym_quasisyntax;
  597. break;
  598. case '\'':
  599. p = sym_syntax;
  600. break;
  601. case ',':
  602. {
  603. int c;
  604. c = scm_getc (port);
  605. if ('@' == c)
  606. p = sym_unsyntax_splicing;
  607. else
  608. {
  609. scm_ungetc (c, port);
  610. p = sym_unsyntax;
  611. }
  612. break;
  613. }
  614. default:
  615. fprintf (stderr, "%s: unhandled syntax character (%i)\n",
  616. "scm_read_syntax", chr);
  617. abort ();
  618. }
  619. return scm_cons2 (p, scm_read_expression (port), SCM_EOL);
  620. }
  621. static SCM
  622. scm_read_nil (int chr, SCM port)
  623. {
  624. SCM id = scm_read_mixed_case_symbol (chr, port);
  625. if (!scm_is_eq (id, sym_nil))
  626. scm_i_input_error ("scm_read_nil", port,
  627. "unexpected input while reading #nil: ~a",
  628. scm_list_1 (id));
  629. return SCM_ELISP_NIL;
  630. }
  631. static SCM
  632. scm_read_semicolon_comment (int chr, SCM port)
  633. {
  634. int c;
  635. /* We use the get_byte here because there is no need to get the
  636. locale correct with comment input. This presumes that newline
  637. always represents itself no matter what the encoding is. */
  638. for (c = scm_get_byte_or_eof (port);
  639. (c != EOF) && (c != '\n');
  640. c = scm_get_byte_or_eof (port));
  641. return SCM_UNSPECIFIED;
  642. }
  643. /* If the EXPECTED_CHARS are the next ones available from PORT, then
  644. consume them and return 1. Otherwise leave the port position where
  645. it was and return 0. EXPECTED_CHARS should be all lowercase, and
  646. will be matched case-insensitively against the characters read from
  647. PORT. */
  648. static int
  649. try_read_ci_chars (SCM port, const char *expected_chars)
  650. {
  651. int num_chars_wanted = strlen (expected_chars);
  652. int num_chars_read = 0;
  653. char *chars_read = alloca (num_chars_wanted);
  654. int c;
  655. while (num_chars_read < num_chars_wanted)
  656. {
  657. c = scm_getc (port);
  658. if (c == EOF)
  659. break;
  660. else if (c_tolower (c) != expected_chars[num_chars_read])
  661. {
  662. scm_ungetc (c, port);
  663. break;
  664. }
  665. else
  666. chars_read[num_chars_read++] = c;
  667. }
  668. if (num_chars_read == num_chars_wanted)
  669. return 1;
  670. else
  671. {
  672. while (num_chars_read > 0)
  673. scm_ungetc (chars_read[--num_chars_read], port);
  674. return 0;
  675. }
  676. }
  677. /* Sharp readers, i.e. readers called after a `#' sign has been read. */
  678. static SCM
  679. scm_read_boolean (int chr, SCM port)
  680. {
  681. switch (chr)
  682. {
  683. case 't':
  684. case 'T':
  685. try_read_ci_chars (port, "rue");
  686. return SCM_BOOL_T;
  687. case 'f':
  688. case 'F':
  689. try_read_ci_chars (port, "alse");
  690. return SCM_BOOL_F;
  691. }
  692. return SCM_UNSPECIFIED;
  693. }
  694. static SCM
  695. scm_read_character (scm_t_wchar chr, SCM port)
  696. #define FUNC_NAME "scm_lreadr"
  697. {
  698. char buffer[READER_CHAR_NAME_MAX_SIZE];
  699. SCM charname;
  700. size_t charname_len, bytes_read;
  701. scm_t_wchar cp;
  702. int overflow;
  703. scm_t_port *pt;
  704. overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE,
  705. &bytes_read);
  706. if (overflow)
  707. scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
  708. if (bytes_read == 0)
  709. {
  710. chr = scm_getc (port);
  711. if (chr == EOF)
  712. scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
  713. "while reading character", SCM_EOL);
  714. /* CHR must be a token delimiter, like a whitespace. */
  715. return SCM_MAKE_CHAR (chr);
  716. }
  717. pt = SCM_PORT (port);
  718. /* Simple ASCII characters can be processed immediately. Also, simple
  719. ISO-8859-1 characters can be processed immediately if the encoding for this
  720. port is ISO-8859-1. */
  721. if (bytes_read == 1 &&
  722. ((unsigned char) buffer[0] <= 127
  723. || scm_is_eq (pt->encoding, sym_ISO_8859_1)))
  724. {
  725. scm_set_port_column_x (port, scm_sum (scm_port_column (port), SCM_INUM1));
  726. return SCM_MAKE_CHAR (buffer[0]);
  727. }
  728. /* Otherwise, convert the buffer into a proper scheme string for
  729. processing. */
  730. charname = scm_from_port_stringn (buffer, bytes_read, port);
  731. charname_len = scm_i_string_length (charname);
  732. scm_set_port_column_x (port,
  733. scm_sum (scm_port_column (port),
  734. scm_from_size_t (charname_len)));
  735. cp = scm_i_string_ref (charname, 0);
  736. if (charname_len == 1)
  737. return SCM_MAKE_CHAR (cp);
  738. /* Ignore dotted circles, which may be used to keep combining characters from
  739. combining with the backslash in #\charname. */
  740. if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
  741. return scm_c_make_char (scm_i_string_ref (charname, 1));
  742. if (cp >= '0' && cp < '8')
  743. {
  744. /* Dirk:FIXME:: This type of character syntax is not R5RS
  745. * compliant. Further, it should be verified that the constant
  746. * does only consist of octal digits. */
  747. SCM p = scm_string_to_number (charname, scm_from_uint (8));
  748. if (SCM_I_INUMP (p))
  749. {
  750. scm_t_wchar c = scm_to_uint32 (p);
  751. if (SCM_IS_UNICODE_CHAR (c))
  752. return SCM_MAKE_CHAR (c);
  753. else
  754. scm_i_input_error (FUNC_NAME, port,
  755. "out-of-range octal character escape: ~a",
  756. scm_list_1 (charname));
  757. }
  758. }
  759. if (cp == 'x' && (charname_len > 1))
  760. {
  761. SCM p;
  762. /* Convert from hex, skipping the initial 'x' character in CHARNAME */
  763. p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
  764. scm_from_uint (16));
  765. if (SCM_I_INUMP (p))
  766. {
  767. scm_t_wchar c = scm_to_uint32 (p);
  768. if (SCM_IS_UNICODE_CHAR (c))
  769. return SCM_MAKE_CHAR (c);
  770. else
  771. scm_i_input_error (FUNC_NAME, port,
  772. "out-of-range hex character escape: ~a",
  773. scm_list_1 (charname));
  774. }
  775. }
  776. /* The names of characters should never have non-Latin1
  777. characters. */
  778. if (scm_i_is_narrow_string (charname)
  779. || scm_i_try_narrow_string (charname))
  780. { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
  781. charname_len);
  782. if (scm_is_true (ch))
  783. return ch;
  784. }
  785. scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
  786. scm_list_1 (charname));
  787. return SCM_UNSPECIFIED;
  788. }
  789. #undef FUNC_NAME
  790. static SCM
  791. scm_read_keyword (int chr, SCM port)
  792. {
  793. SCM symbol;
  794. /* Read the symbol that comprises the keyword. Doing this instead of
  795. invoking a specific symbol reader function allows `scm_read_keyword ()'
  796. to adapt to the delimiters currently valid of symbols.
  797. XXX: This implementation allows sloppy syntaxes like `#: key'. */
  798. symbol = scm_read_expression (port);
  799. if (!scm_is_symbol (symbol))
  800. scm_i_input_error ("scm_read_keyword", port,
  801. "keyword prefix `~a' not followed by a symbol: ~s",
  802. scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
  803. return scm_symbol_to_keyword (symbol);
  804. }
  805. static SCM
  806. scm_read_vector (int chr, SCM port)
  807. {
  808. /* Note: We call `scm_read_sexp ()' rather than READER here in order to
  809. guarantee that it's going to do what we want. After all, this is an
  810. implementation detail of `scm_read_vector ()', not a desirable
  811. property. */
  812. return scm_vector (scm_read_sexp (chr, port));
  813. }
  814. /* Helper used by scm_read_array */
  815. static int
  816. read_decimal_integer (SCM port, int c, ssize_t *resp)
  817. {
  818. ssize_t sign = 1;
  819. ssize_t res = 0;
  820. int got_it = 0;
  821. if (c == '-')
  822. {
  823. sign = -1;
  824. c = scm_getc (port);
  825. }
  826. while ('0' <= c && c <= '9')
  827. {
  828. if (((SSIZE_MAX - (c-'0')) / 10) <= res)
  829. scm_i_input_error ("read_decimal_integer", port,
  830. "number too large", SCM_EOL);
  831. res = 10*res + c-'0';
  832. got_it = 1;
  833. c = scm_getc (port);
  834. }
  835. if (got_it)
  836. *resp = sign * res;
  837. return c;
  838. }
  839. /* Read an array. This function can also read vectors and uniform
  840. vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
  841. handled here.
  842. C is the first character read after the '#'. */
  843. static SCM
  844. scm_read_array (int c, SCM port)
  845. {
  846. ssize_t rank;
  847. scm_t_wchar tag_buf[8];
  848. int tag_len;
  849. SCM tag, shape = SCM_BOOL_F, elements;
  850. /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
  851. the array code can not deal with zero-length dimensions yet, and
  852. we want to allow zero-length vectors, of course. */
  853. if (c == '(')
  854. return scm_read_vector (c, port);
  855. /* Disambiguate between '#f' and uniform floating point vectors. */
  856. if (c == 'f')
  857. {
  858. c = scm_getc (port);
  859. if (c != '3' && c != '6')
  860. {
  861. if (c == 'a' && try_read_ci_chars (port, "lse"))
  862. return SCM_BOOL_F;
  863. else if (c != EOF)
  864. scm_ungetc (c, port);
  865. return SCM_BOOL_F;
  866. }
  867. rank = 1;
  868. tag_buf[0] = 'f';
  869. tag_len = 1;
  870. goto continue_reading_tag;
  871. }
  872. /* Read rank. */
  873. rank = 1;
  874. c = read_decimal_integer (port, c, &rank);
  875. if (rank < 0)
  876. scm_i_input_error (NULL, port, "array rank must be non-negative",
  877. SCM_EOL);
  878. /* Read tag. */
  879. tag_len = 0;
  880. continue_reading_tag:
  881. while (c != EOF && c != '(' && c != '@' && c != ':'
  882. && tag_len < sizeof tag_buf / sizeof tag_buf[0])
  883. {
  884. tag_buf[tag_len++] = c;
  885. c = scm_getc (port);
  886. }
  887. if (tag_len == 0)
  888. tag = SCM_BOOL_T;
  889. else
  890. {
  891. tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
  892. if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
  893. scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
  894. scm_list_1 (tag));
  895. }
  896. /* Read shape. */
  897. if (c == '@' || c == ':')
  898. {
  899. shape = SCM_EOL;
  900. do
  901. {
  902. ssize_t lbnd = 0, len = 0;
  903. SCM s;
  904. if (c == '@')
  905. {
  906. c = scm_getc (port);
  907. c = read_decimal_integer (port, c, &lbnd);
  908. }
  909. s = scm_from_ssize_t (lbnd);
  910. if (c == ':')
  911. {
  912. c = scm_getc (port);
  913. c = read_decimal_integer (port, c, &len);
  914. if (len < 0)
  915. scm_i_input_error (NULL, port,
  916. "array length must be non-negative",
  917. SCM_EOL);
  918. s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
  919. }
  920. shape = scm_cons (s, shape);
  921. } while (c == '@' || c == ':');
  922. shape = scm_reverse_x (shape, SCM_EOL);
  923. }
  924. /* Read nested lists of elements. */
  925. if (c != '(')
  926. scm_i_input_error (NULL, port,
  927. "missing '(' in vector or array literal",
  928. SCM_EOL);
  929. elements = scm_read_sexp (c, port);
  930. if (scm_is_false (shape))
  931. shape = scm_from_ssize_t (rank);
  932. else if (scm_ilength (shape) != rank)
  933. scm_i_input_error
  934. (NULL, port,
  935. "the number of shape specifications must match the array rank",
  936. SCM_EOL);
  937. /* Handle special print syntax of rank zero arrays; see
  938. scm_i_print_array for a rationale. */
  939. if (rank == 0)
  940. {
  941. if (!scm_is_pair (elements))
  942. scm_i_input_error (NULL, port,
  943. "too few elements in array literal, need 1",
  944. SCM_EOL);
  945. if (!scm_is_null (SCM_CDR (elements)))
  946. scm_i_input_error (NULL, port,
  947. "too many elements in array literal, want 1",
  948. SCM_EOL);
  949. elements = SCM_CAR (elements);
  950. }
  951. /* Construct array, annotate with source location, and return. */
  952. return scm_list_to_typed_array (tag, shape, elements);
  953. }
  954. static SCM
  955. scm_read_srfi4_vector (int chr, SCM port)
  956. {
  957. return scm_read_array (chr, port);
  958. }
  959. static SCM
  960. scm_read_bytevector (scm_t_wchar chr, SCM port)
  961. {
  962. chr = scm_getc (port);
  963. if (chr != 'u')
  964. goto syntax;
  965. chr = scm_getc (port);
  966. if (chr != '8')
  967. goto syntax;
  968. chr = scm_getc (port);
  969. if (chr != '(')
  970. goto syntax;
  971. return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
  972. syntax:
  973. scm_i_input_error ("read_bytevector", port,
  974. "invalid bytevector prefix",
  975. SCM_MAKE_CHAR (chr));
  976. return SCM_UNSPECIFIED;
  977. }
  978. static SCM
  979. scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
  980. {
  981. /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
  982. terribly inefficient but who cares? */
  983. SCM s_bits = SCM_EOL;
  984. for (chr = scm_getc (port);
  985. (chr != EOF) && ((chr == '0') || (chr == '1'));
  986. chr = scm_getc (port))
  987. {
  988. s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
  989. }
  990. if (chr != EOF)
  991. scm_ungetc (chr, port);
  992. return scm_list_to_bitvector (scm_reverse_x (s_bits, SCM_EOL));
  993. }
  994. static SCM
  995. scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
  996. {
  997. int bang_seen = 0;
  998. for (;;)
  999. {
  1000. int c = scm_getc (port);
  1001. if (c == EOF)
  1002. scm_i_input_error ("skip_block_comment", port,
  1003. "unterminated `#! ... !#' comment", SCM_EOL);
  1004. if (c == '!')
  1005. bang_seen = 1;
  1006. else if (c == '#' && bang_seen)
  1007. break;
  1008. else
  1009. bang_seen = 0;
  1010. }
  1011. return SCM_UNSPECIFIED;
  1012. }
  1013. static SCM
  1014. scm_read_shebang (scm_t_wchar chr, SCM port)
  1015. {
  1016. char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
  1017. int c;
  1018. int i = 0;
  1019. while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
  1020. {
  1021. c = scm_getc (port);
  1022. if (c == EOF)
  1023. scm_i_input_error ("skip_block_comment", port,
  1024. "unterminated `#! ... !#' comment", SCM_EOL);
  1025. else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
  1026. name[i++] = c;
  1027. else if (CHAR_IS_DELIMITER (c))
  1028. {
  1029. scm_ungetc (c, port);
  1030. name[i] = '\0';
  1031. if (0 == strcmp ("r6rs", name)
  1032. || 0 == strcmp ("fold-case", name)
  1033. || 0 == strcmp ("no-fold-case", name)
  1034. || 0 == strcmp ("curly-infix", name)
  1035. || 0 == strcmp ("curly-infix-and-bracket-lists", name))
  1036. scm_i_input_error ("skip_block_comment", port,
  1037. "unsupported directive: ~s",
  1038. scm_list_1 (scm_from_latin1_string (name)));
  1039. break;
  1040. }
  1041. else
  1042. {
  1043. scm_ungetc (c, port);
  1044. break;
  1045. }
  1046. }
  1047. while (i > 0)
  1048. scm_ungetc (name[--i], port);
  1049. return scm_read_scsh_block_comment (chr, port);
  1050. }
  1051. static SCM
  1052. scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
  1053. {
  1054. /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
  1055. nested. So care must be taken. */
  1056. int nesting_level = 1;
  1057. int a = scm_getc (port);
  1058. if (a == EOF)
  1059. scm_i_input_error ("scm_read_r6rs_block_comment", port,
  1060. "unterminated `#| ... |#' comment", SCM_EOL);
  1061. while (nesting_level > 0)
  1062. {
  1063. int b = scm_getc (port);
  1064. if (b == EOF)
  1065. scm_i_input_error ("scm_read_r6rs_block_comment", port,
  1066. "unterminated `#| ... |#' comment", SCM_EOL);
  1067. if (a == '|' && b == '#')
  1068. {
  1069. nesting_level--;
  1070. b = EOF;
  1071. }
  1072. else if (a == '#' && b == '|')
  1073. {
  1074. nesting_level++;
  1075. b = EOF;
  1076. }
  1077. a = b;
  1078. }
  1079. return SCM_UNSPECIFIED;
  1080. }
  1081. static SCM
  1082. scm_read_commented_expression (scm_t_wchar chr, SCM port)
  1083. {
  1084. scm_t_wchar c;
  1085. c = flush_ws (port, (char *) NULL);
  1086. if (EOF == c)
  1087. scm_i_input_error ("read_commented_expression", port,
  1088. "no expression after #; comment", SCM_EOL);
  1089. scm_ungetc (c, port);
  1090. scm_read_expression (port);
  1091. return SCM_UNSPECIFIED;
  1092. }
  1093. static SCM
  1094. scm_read_extended_symbol (scm_t_wchar chr, SCM port)
  1095. {
  1096. /* Guile's extended symbol read syntax looks like this:
  1097. #{This is all a symbol name}#
  1098. So here, CHR is expected to be `{'. */
  1099. int saw_brace = 0;
  1100. size_t len = 0;
  1101. SCM buf = scm_i_make_string (1024, NULL, 0);
  1102. /* No need to scm_i_string_start_writing (), as the string isn't
  1103. visible to any other thread. */
  1104. while ((chr = scm_getc (port)) != EOF)
  1105. {
  1106. if (saw_brace)
  1107. {
  1108. if (chr == '#')
  1109. {
  1110. break;
  1111. }
  1112. else
  1113. {
  1114. saw_brace = 0;
  1115. scm_i_string_set_x (buf, len++, '}');
  1116. }
  1117. }
  1118. if (chr == '}')
  1119. saw_brace = 1;
  1120. else if (chr == '\\')
  1121. {
  1122. /* It used to be that print.c would print extended-read-syntax
  1123. symbols with backslashes before "non-standard" chars, but
  1124. this routine wouldn't do anything with those escapes.
  1125. Bummer. What we've done is to change print.c to output
  1126. R6RS hex escapes for those characters, relying on the fact
  1127. that the extended read syntax would never put a `\' before
  1128. an `x'. For now, we just ignore other instances of
  1129. backslash in the string. */
  1130. switch ((chr = scm_getc (port)))
  1131. {
  1132. case EOF:
  1133. goto done;
  1134. case 'x':
  1135. {
  1136. scm_t_wchar c;
  1137. SCM_READ_HEX_ESCAPE (10, ';');
  1138. scm_i_string_set_x (buf, len++, c);
  1139. break;
  1140. str_eof:
  1141. chr = EOF;
  1142. goto done;
  1143. bad_escaped:
  1144. scm_i_string_stop_writing ();
  1145. scm_i_input_error ("scm_read_extended_symbol", port,
  1146. "illegal character in escape sequence: ~S",
  1147. scm_list_1 (SCM_MAKE_CHAR (c)));
  1148. break;
  1149. }
  1150. default:
  1151. scm_i_string_set_x (buf, len++, chr);
  1152. break;
  1153. }
  1154. }
  1155. else
  1156. scm_i_string_set_x (buf, len++, chr);
  1157. if (len >= scm_i_string_length (buf) - 2)
  1158. {
  1159. SCM addy;
  1160. addy = scm_i_make_string (1024, NULL, 0);
  1161. buf = scm_string_append (scm_list_2 (buf, addy));
  1162. len = 0;
  1163. }
  1164. }
  1165. done:
  1166. if (chr == EOF)
  1167. scm_i_input_error ("scm_read_extended_symbol", port,
  1168. "end of file while reading symbol", SCM_EOL);
  1169. return scm_string_to_symbol (scm_c_substring (buf, 0, len));
  1170. }
  1171. /* Top-level token readers, i.e., dispatchers. */
  1172. /* The reader for the sharp `#' character. It basically dispatches reads
  1173. among the above token readers. */
  1174. static SCM
  1175. scm_read_sharp (scm_t_wchar chr, SCM port)
  1176. #define FUNC_NAME "scm_lreadr"
  1177. {
  1178. chr = scm_getc (port);
  1179. switch (chr)
  1180. {
  1181. case '\\':
  1182. return scm_read_character (chr, port);
  1183. case '(':
  1184. return scm_read_vector (chr, port);
  1185. case 's':
  1186. case 'u':
  1187. case 'f':
  1188. case 'c':
  1189. /* This one may return either a boolean or an SRFI-4 vector. */
  1190. return scm_read_srfi4_vector (chr, port);
  1191. case 'v':
  1192. return scm_read_bytevector (chr, port);
  1193. case '*':
  1194. return scm_read_guile_bit_vector (chr, port);
  1195. case 't':
  1196. case 'T':
  1197. case 'F':
  1198. return scm_read_boolean (chr, port);
  1199. case ':':
  1200. return scm_read_keyword (chr, port);
  1201. case '0': case '1': case '2': case '3': case '4':
  1202. case '5': case '6': case '7': case '8': case '9':
  1203. case '@':
  1204. return scm_read_array (chr, port);
  1205. case 'i':
  1206. case 'e':
  1207. case 'b':
  1208. case 'B':
  1209. case 'o':
  1210. case 'O':
  1211. case 'd':
  1212. case 'D':
  1213. case 'x':
  1214. case 'X':
  1215. case 'I':
  1216. case 'E':
  1217. return scm_read_number_and_radix (chr, port);
  1218. case '{':
  1219. return scm_read_extended_symbol (chr, port);
  1220. case '!':
  1221. return scm_read_shebang (chr, port);
  1222. case ';':
  1223. return scm_read_commented_expression (chr, port);
  1224. case '`':
  1225. case '\'':
  1226. case ',':
  1227. return scm_read_syntax (chr, port);
  1228. case 'n':
  1229. return scm_read_nil (chr, port);
  1230. case '|':
  1231. return scm_read_r6rs_block_comment (chr, port);
  1232. default:
  1233. scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
  1234. scm_list_1 (SCM_MAKE_CHAR (chr)));
  1235. return SCM_UNSPECIFIED;
  1236. }
  1237. }
  1238. #undef FUNC_NAME
  1239. static SCM
  1240. scm_read_expression (SCM port)
  1241. #define FUNC_NAME "scm_read_expression"
  1242. {
  1243. while (1)
  1244. {
  1245. scm_t_wchar chr;
  1246. chr = scm_getc (port);
  1247. switch (chr)
  1248. {
  1249. case SCM_WHITE_SPACES:
  1250. case SCM_LINE_INCREMENTORS:
  1251. break;
  1252. case ';':
  1253. scm_read_semicolon_comment (chr, port);
  1254. break;
  1255. case '[':
  1256. case '(':
  1257. return scm_read_sexp (chr, port);
  1258. case '"':
  1259. return scm_read_string (chr, port);
  1260. case '\'':
  1261. case '`':
  1262. case ',':
  1263. return scm_read_quote (chr, port);
  1264. case '#':
  1265. {
  1266. SCM result = scm_read_sharp (chr, port);
  1267. if (scm_is_eq (result, SCM_UNSPECIFIED))
  1268. /* We read a comment or some such. */
  1269. break;
  1270. else
  1271. return result;
  1272. }
  1273. case ')':
  1274. scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
  1275. break;
  1276. case ']':
  1277. scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
  1278. break;
  1279. case EOF:
  1280. return SCM_EOF_VAL;
  1281. default:
  1282. {
  1283. if (((chr >= '0') && (chr <= '9'))
  1284. || (strchr ("+-.", chr)))
  1285. return scm_read_number (chr, port);
  1286. else
  1287. return scm_read_mixed_case_symbol (chr, port);
  1288. }
  1289. }
  1290. }
  1291. }
  1292. #undef FUNC_NAME
  1293. /* Actual reader. */
  1294. SCM_DEFINE (scm_primitive_read, "primitive-read", 0, 1, 0,
  1295. (SCM port),
  1296. "Read an s-expression from the input port @var{port}, or from\n"
  1297. "the current input port if @var{port} is not specified.\n"
  1298. "Any whitespace before the next token is discarded.")
  1299. #define FUNC_NAME s_scm_primitive_read
  1300. {
  1301. int c;
  1302. if (SCM_UNBNDP (port))
  1303. port = scm_current_input_port ();
  1304. SCM_VALIDATE_OPINPORT (1, port);
  1305. c = flush_ws (port, (char *) NULL);
  1306. if (EOF == c)
  1307. return SCM_EOF_VAL;
  1308. scm_ungetc (c, port);
  1309. return scm_read_expression (port);
  1310. }
  1311. #undef FUNC_NAME
  1312. static SCM scm_read_var;
  1313. SCM
  1314. scm_read (SCM port)
  1315. #define FUNC_NAME "read"
  1316. {
  1317. if (SCM_UNBNDP (port))
  1318. return scm_call_0 (scm_variable_ref (scm_read_var));
  1319. return scm_call_1 (scm_variable_ref (scm_read_var), port);
  1320. }
  1321. #undef FUNC_NAME
  1322. /* A fluid referring to an association list mapping extra hash
  1323. characters to procedures. */
  1324. static SCM *scm_i_read_hash_procedures;
  1325. static SCM
  1326. scm_i_read_hash_procedures_ref (void)
  1327. {
  1328. return scm_fluid_ref (*scm_i_read_hash_procedures);
  1329. }
  1330. static void
  1331. scm_i_read_hash_procedures_set_x (SCM value)
  1332. {
  1333. scm_fluid_set_x (*scm_i_read_hash_procedures, value);
  1334. }
  1335. /* Manipulate the read-hash-procedures alist. This could be written in
  1336. Scheme, but maybe it will also be used by C code during initialisation. */
  1337. SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
  1338. (SCM chr, SCM proc),
  1339. "Install the procedure @var{proc} for reading expressions\n"
  1340. "starting with the character sequence @code{#} and @var{chr}.\n"
  1341. "@var{proc} will be called with two arguments: the character\n"
  1342. "@var{chr} and the port to read further data from. The object\n"
  1343. "returned will be the return value of @code{read}. \n"
  1344. "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
  1345. )
  1346. #define FUNC_NAME s_scm_read_hash_extend
  1347. {
  1348. SCM this;
  1349. SCM prev;
  1350. SCM_VALIDATE_CHAR (1, chr);
  1351. SCM_ASSERT (scm_is_false (proc)
  1352. || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
  1353. proc, SCM_ARG2, FUNC_NAME);
  1354. /* Check if chr is already in the alist. */
  1355. this = scm_i_read_hash_procedures_ref ();
  1356. prev = SCM_BOOL_F;
  1357. while (1)
  1358. {
  1359. if (scm_is_null (this))
  1360. {
  1361. /* not found, so add it to the beginning. */
  1362. if (scm_is_true (proc))
  1363. {
  1364. SCM new = scm_cons (scm_cons (chr, proc),
  1365. scm_i_read_hash_procedures_ref ());
  1366. scm_i_read_hash_procedures_set_x (new);
  1367. }
  1368. break;
  1369. }
  1370. if (scm_is_eq (chr, SCM_CAAR (this)))
  1371. {
  1372. /* already in the alist. */
  1373. if (scm_is_false (proc))
  1374. {
  1375. /* remove it. */
  1376. if (scm_is_false (prev))
  1377. {
  1378. SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
  1379. scm_i_read_hash_procedures_set_x (rest);
  1380. }
  1381. else
  1382. scm_set_cdr_x (prev, SCM_CDR (this));
  1383. }
  1384. else
  1385. {
  1386. /* replace it. */
  1387. scm_set_cdr_x (SCM_CAR (this), proc);
  1388. }
  1389. break;
  1390. }
  1391. prev = this;
  1392. this = SCM_CDR (this);
  1393. }
  1394. return SCM_UNSPECIFIED;
  1395. }
  1396. #undef FUNC_NAME
  1397. /* Recover the read-hash procedure corresponding to char c. */
  1398. static SCM
  1399. scm_get_hash_procedure (int c)
  1400. {
  1401. SCM rest = scm_i_read_hash_procedures_ref ();
  1402. while (1)
  1403. {
  1404. if (scm_is_null (rest))
  1405. return SCM_BOOL_F;
  1406. if (SCM_CHAR (SCM_CAAR (rest)) == c)
  1407. return SCM_CDAR (rest);
  1408. rest = SCM_CDR (rest);
  1409. }
  1410. }
  1411. static int
  1412. is_encoding_char (char c)
  1413. {
  1414. if (c >= 'a' && c <= 'z') return 1;
  1415. if (c >= 'A' && c <= 'Z') return 1;
  1416. if (c >= '0' && c <= '9') return 1;
  1417. return strchr ("_-.:/,+=()", c) != NULL;
  1418. }
  1419. /* Maximum size of an encoding name. This is a bit more than the
  1420. longest name listed at
  1421. <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
  1422. characters.) */
  1423. #define ENCODING_NAME_MAX_SIZE 20
  1424. /* Number of bytes at the beginning or end of a file that are scanned
  1425. for a "coding:" declaration. */
  1426. #define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
  1427. /* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
  1428. coding declaration. Returns either NULL or a string whose storage
  1429. has been allocated with `scm_gc_malloc'. */
  1430. char *
  1431. scm_i_scan_for_encoding (SCM port)
  1432. {
  1433. scm_t_port *pt;
  1434. SCM buf;
  1435. char header[SCM_ENCODING_SEARCH_SIZE+1];
  1436. size_t cur, bytes_read, encoding_length, i;
  1437. char *encoding = NULL;
  1438. char *pos, *encoding_start;
  1439. int in_comment;
  1440. pt = SCM_PORT (port);
  1441. buf = pt->read_buf;
  1442. if (pt->rw_random)
  1443. scm_flush (port);
  1444. if (scm_port_buffer_can_take (buf, &cur) == 0)
  1445. {
  1446. /* We can use the read buffer, and thus avoid a seek. */
  1447. buf = scm_fill_input (port, 0, &cur, &bytes_read);
  1448. if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
  1449. bytes_read = SCM_ENCODING_SEARCH_SIZE;
  1450. if (bytes_read <= 1)
  1451. /* An unbuffered port -- don't scan. */
  1452. return NULL;
  1453. memcpy (header, scm_port_buffer_take_pointer (buf, cur), bytes_read);
  1454. header[bytes_read] = '\0';
  1455. }
  1456. else if (pt->rw_random)
  1457. {
  1458. /* The port is seekable. This is OK but grubbing in the read
  1459. buffer is better, so this case is just a fallback. */
  1460. bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
  1461. header[bytes_read] = '\0';
  1462. scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
  1463. }
  1464. else
  1465. /* No input available and not seekable; scan fails. */
  1466. return NULL;
  1467. /* search past "coding[:=]" */
  1468. pos = header;
  1469. while (1)
  1470. {
  1471. if ((pos = strstr(pos, "coding")) == NULL)
  1472. return NULL;
  1473. pos += strlen ("coding");
  1474. if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
  1475. (*pos == ':' || *pos == '='))
  1476. {
  1477. pos ++;
  1478. break;
  1479. }
  1480. }
  1481. /* skip spaces */
  1482. while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
  1483. (*pos == ' ' || *pos == '\t'))
  1484. pos ++;
  1485. if (pos - header >= SCM_ENCODING_SEARCH_SIZE - ENCODING_NAME_MAX_SIZE)
  1486. /* We found the "coding:" string, but there is probably not enough
  1487. room to store an encoding name in its entirety, so ignore it.
  1488. This makes sure we do not end up returning a truncated encoding
  1489. name. */
  1490. return NULL;
  1491. /* grab the next token */
  1492. encoding_start = pos;
  1493. i = 0;
  1494. while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
  1495. && encoding_start + i - header < bytes_read
  1496. && is_encoding_char (encoding_start[i]))
  1497. i++;
  1498. encoding_length = i;
  1499. if (encoding_length == 0)
  1500. return NULL;
  1501. encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
  1502. /* push backwards to make sure we were in a comment */
  1503. in_comment = 0;
  1504. pos = encoding_start;
  1505. while (pos >= header)
  1506. {
  1507. if (*pos == ';')
  1508. {
  1509. in_comment = 1;
  1510. break;
  1511. }
  1512. else if (*pos == '\n' || pos == header)
  1513. {
  1514. /* This wasn't in a semicolon comment. Check for a
  1515. hash-bang comment. */
  1516. char *beg = strstr (header, "#!");
  1517. char *end = strstr (header, "!#");
  1518. if (beg < encoding_start && encoding_start + encoding_length <= end)
  1519. in_comment = 1;
  1520. break;
  1521. }
  1522. else
  1523. {
  1524. pos --;
  1525. continue;
  1526. }
  1527. }
  1528. if (!in_comment)
  1529. /* This wasn't in a comment */
  1530. return NULL;
  1531. return encoding;
  1532. }
  1533. SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
  1534. (SCM port),
  1535. "Scans the port for an Emacs-like character coding declaration\n"
  1536. "near the top of the contents of a port with random-accessible contents.\n"
  1537. "The coding declaration is of the form\n"
  1538. "@code{coding: XXXXX} and must appear in a scheme comment.\n"
  1539. "\n"
  1540. "Returns a string containing the character encoding of the file\n"
  1541. "if a declaration was found, or @code{#f} otherwise.\n")
  1542. #define FUNC_NAME s_scm_file_encoding
  1543. {
  1544. char *enc;
  1545. SCM s_enc;
  1546. SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
  1547. enc = scm_i_scan_for_encoding (port);
  1548. if (enc == NULL)
  1549. return SCM_BOOL_F;
  1550. else
  1551. {
  1552. /* It's not obvious what encoding to use here, but latin1 has the
  1553. advantage of never causing a decoding error, and a valid
  1554. encoding name should be ASCII anyway. */
  1555. s_enc = scm_string_upcase (scm_from_latin1_string (enc));
  1556. return s_enc;
  1557. }
  1558. return SCM_BOOL_F;
  1559. }
  1560. #undef FUNC_NAME
  1561. void
  1562. scm_init_read ()
  1563. {
  1564. SCM read_hash_procs;
  1565. read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
  1566. scm_i_read_hash_procedures =
  1567. SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
  1568. scm_init_opts (scm_read_options, scm_read_opts);
  1569. #include "read.x"
  1570. scm_read_var = scm_c_define
  1571. ("read", scm_variable_ref (scm_c_lookup (s_scm_primitive_read)));
  1572. }