read.c 66 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372
  1. /* Copyright (C) 1995-1997, 1999-2001, 2003, 2004, 2006-2012, 2014
  2. * Free Software Foundation, Inc.
  3. *
  4. * This library is free software; you can redistribute it and/or
  5. * modify it under the terms of the GNU Lesser General Public License
  6. * as published by the Free Software Foundation; either version 3 of
  7. * the License, or (at your option) any later version.
  8. *
  9. * This library is distributed in the hope that it will be useful, but
  10. * WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. * Lesser General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU Lesser General Public
  15. * License along with this library; if not, write to the Free Software
  16. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17. * 02110-1301 USA
  18. */
  19. #ifdef HAVE_CONFIG_H
  20. # include <config.h>
  21. #endif
  22. #include <stdio.h>
  23. #include <string.h>
  24. #include <unistd.h>
  25. #include <unicase.h>
  26. #include <unictype.h>
  27. #include <c-strcase.h>
  28. #include <c-ctype.h>
  29. #include "libguile/_scm.h"
  30. #include "libguile/bytevectors.h"
  31. #include "libguile/chars.h"
  32. #include "libguile/eval.h"
  33. #include "libguile/arrays.h"
  34. #include "libguile/bitvectors.h"
  35. #include "libguile/keywords.h"
  36. #include "libguile/alist.h"
  37. #include "libguile/srcprop.h"
  38. #include "libguile/hashtab.h"
  39. #include "libguile/hash.h"
  40. #include "libguile/ports.h"
  41. #include "libguile/ports-internal.h"
  42. #include "libguile/fports.h"
  43. #include "libguile/root.h"
  44. #include "libguile/strings.h"
  45. #include "libguile/strports.h"
  46. #include "libguile/vectors.h"
  47. #include "libguile/validate.h"
  48. #include "libguile/srfi-4.h"
  49. #include "libguile/srfi-13.h"
  50. #include "libguile/read.h"
  51. #include "libguile/private-options.h"
  52. SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  53. SCM_SYMBOL (scm_keyword_prefix, "prefix");
  54. SCM_SYMBOL (scm_keyword_postfix, "postfix");
  55. SCM_SYMBOL (sym_nil, "nil");
  56. /* SRFI-105 curly infix expression support */
  57. SCM_SYMBOL (sym_nfx, "$nfx$");
  58. SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
  59. SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
  60. scm_t_option scm_read_opts[] =
  61. {
  62. { SCM_OPTION_BOOLEAN, "copy", 0,
  63. "Copy source code expressions." },
  64. { SCM_OPTION_BOOLEAN, "positions", 1,
  65. "Record positions of source code expressions." },
  66. { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  67. "Convert symbols to lower case."},
  68. { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  69. "Style of keyword recognition: #f, 'prefix or 'postfix."},
  70. { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  71. "Use R6RS variable-length character and string hex escapes."},
  72. { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  73. "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  74. { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  75. "In strings, consume leading whitespace after an escaped end-of-line."},
  76. { SCM_OPTION_BOOLEAN, "curly-infix", 0,
  77. "Support SRFI-105 curly infix expressions."},
  78. { SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
  79. "Support R7RS |...| symbol notation."},
  80. { 0, },
  81. };
  82. /* Internal read options structure. This is initialized by 'scm_read'
  83. from the global and per-port read options, and a pointer is passed
  84. down to all helper functions. */
  85. enum t_keyword_style
  86. {
  87. KEYWORD_STYLE_HASH_PREFIX,
  88. KEYWORD_STYLE_PREFIX,
  89. KEYWORD_STYLE_POSTFIX
  90. };
  91. struct t_read_opts
  92. {
  93. enum t_keyword_style keyword_style;
  94. unsigned int copy_source_p : 1;
  95. unsigned int record_positions_p : 1;
  96. unsigned int case_insensitive_p : 1;
  97. unsigned int r6rs_escapes_p : 1;
  98. unsigned int square_brackets_p : 1;
  99. unsigned int hungry_eol_escapes_p : 1;
  100. unsigned int curly_infix_p : 1;
  101. unsigned int neoteric_p : 1;
  102. unsigned int r7rs_symbols_p : 1;
  103. };
  104. typedef struct t_read_opts scm_t_read_opts;
  105. /*
  106. Give meaningful error messages for errors
  107. We use the format
  108. FILE:LINE:COL: MESSAGE
  109. This happened in ....
  110. This is not standard GNU format, but the test-suite likes the real
  111. message to be in front.
  112. */
  113. void
  114. scm_i_input_error (char const *function,
  115. SCM port, const char *message, SCM arg)
  116. {
  117. SCM fn = (scm_is_string (SCM_FILENAME(port))
  118. ? SCM_FILENAME(port)
  119. : scm_from_locale_string ("#<unknown port>"));
  120. SCM string_port = scm_open_output_string ();
  121. SCM string = SCM_EOL;
  122. scm_simple_format (string_port,
  123. scm_from_locale_string ("~A:~S:~S: ~A"),
  124. scm_list_4 (fn,
  125. scm_from_long (SCM_LINUM (port) + 1),
  126. scm_from_int (SCM_COL (port) + 1),
  127. scm_from_locale_string (message)));
  128. string = scm_get_output_string (string_port);
  129. scm_close_output_port (string_port);
  130. scm_error_scm (scm_from_latin1_symbol ("read-error"),
  131. function? scm_from_locale_string (function) : SCM_BOOL_F,
  132. string,
  133. arg,
  134. SCM_BOOL_F);
  135. }
  136. SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
  137. (SCM setting),
  138. "Option interface for the read options. Instead of using\n"
  139. "this procedure directly, use the procedures @code{read-enable},\n"
  140. "@code{read-disable}, @code{read-set!} and @code{read-options}.")
  141. #define FUNC_NAME s_scm_read_options
  142. {
  143. SCM ans = scm_options (setting,
  144. scm_read_opts,
  145. FUNC_NAME);
  146. if (SCM_COPY_SOURCE_P)
  147. SCM_RECORD_POSITIONS_P = 1;
  148. return ans;
  149. }
  150. #undef FUNC_NAME
  151. /* A fluid referring to an association list mapping extra hash
  152. characters to procedures. */
  153. static SCM *scm_i_read_hash_procedures;
  154. static SCM
  155. scm_i_read_hash_procedures_ref (void)
  156. {
  157. return scm_fluid_ref (*scm_i_read_hash_procedures);
  158. }
  159. static void
  160. scm_i_read_hash_procedures_set_x (SCM value)
  161. {
  162. scm_fluid_set_x (*scm_i_read_hash_procedures, value);
  163. }
  164. /* Token readers. */
  165. /* Size of the C buffer used to read symbols and numbers. */
  166. #define READER_BUFFER_SIZE 128
  167. /* Number of 32-bit codepoints in the buffer used to read strings. */
  168. #define READER_STRING_BUFFER_SIZE 128
  169. /* The maximum size of Scheme character names. */
  170. #define READER_CHAR_NAME_MAX_SIZE 50
  171. /* The maximum size of reader directive names. */
  172. #define READER_DIRECTIVE_NAME_MAX_SIZE 50
  173. /* `isblank' is only in C99. */
  174. #define CHAR_IS_BLANK_(_chr) \
  175. (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
  176. || ((_chr) == '\f') || ((_chr) == '\r'))
  177. #ifdef MSDOS
  178. # define CHAR_IS_BLANK(_chr) \
  179. ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
  180. #else
  181. # define CHAR_IS_BLANK CHAR_IS_BLANK_
  182. #endif
  183. /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
  184. structure''). */
  185. #define CHAR_IS_R5RS_DELIMITER(c) \
  186. (CHAR_IS_BLANK (c) \
  187. || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
  188. #define CHAR_IS_DELIMITER(c) \
  189. (CHAR_IS_R5RS_DELIMITER (c) \
  190. || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
  191. || opts->curly_infix_p)) \
  192. || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
  193. /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
  194. Structure''. */
  195. #define CHAR_IS_EXPONENT_MARKER(_chr) \
  196. (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
  197. || ((_chr) == 'd') || ((_chr) == 'l'))
  198. /* Read an SCSH block comment. */
  199. static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
  200. static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
  201. static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
  202. static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
  203. static SCM scm_get_hash_procedure (int);
  204. /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
  205. result in the pre-allocated buffer BUF. Return zero if the whole token has
  206. fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
  207. bytes actually read. */
  208. static int
  209. read_token (SCM port, scm_t_read_opts *opts,
  210. char *buf, size_t buf_size, size_t *read)
  211. {
  212. *read = 0;
  213. while (*read < buf_size)
  214. {
  215. int chr;
  216. chr = scm_get_byte_or_eof_unlocked (port);
  217. if (chr == EOF)
  218. return 0;
  219. else if (CHAR_IS_DELIMITER (chr))
  220. {
  221. scm_unget_byte_unlocked (chr, port);
  222. return 0;
  223. }
  224. else
  225. {
  226. *buf = (char) chr;
  227. buf++, (*read)++;
  228. }
  229. }
  230. return 1;
  231. }
  232. /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
  233. if the token doesn't fit in BUFFER_SIZE bytes. */
  234. static char *
  235. read_complete_token (SCM port, scm_t_read_opts *opts,
  236. char *buffer, size_t buffer_size, size_t *read)
  237. {
  238. int overflow = 0;
  239. size_t bytes_read, overflow_size = 0;
  240. char *overflow_buffer = NULL;
  241. do
  242. {
  243. overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
  244. if (bytes_read == 0)
  245. break;
  246. if (overflow || overflow_size != 0)
  247. {
  248. if (overflow_size == 0)
  249. {
  250. overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
  251. memcpy (overflow_buffer, buffer, bytes_read);
  252. overflow_size = bytes_read;
  253. }
  254. else
  255. {
  256. char *new_buf =
  257. scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
  258. memcpy (new_buf, overflow_buffer, overflow_size);
  259. memcpy (new_buf + overflow_size, buffer, bytes_read);
  260. overflow_buffer = new_buf;
  261. overflow_size += bytes_read;
  262. }
  263. }
  264. }
  265. while (overflow);
  266. if (overflow_size)
  267. *read = overflow_size;
  268. else
  269. *read = bytes_read;
  270. return (overflow_size > 0 ? overflow_buffer : buffer);
  271. }
  272. /* Skip whitespace from PORT and return the first non-whitespace character
  273. read. Raise an error on end-of-file. */
  274. static int
  275. flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
  276. {
  277. scm_t_wchar c;
  278. while (1)
  279. switch (c = scm_getc_unlocked (port))
  280. {
  281. case EOF:
  282. goteof:
  283. if (eoferr)
  284. {
  285. scm_i_input_error (eoferr,
  286. port,
  287. "end of file",
  288. SCM_EOL);
  289. }
  290. return c;
  291. case ';':
  292. lp:
  293. switch (c = scm_getc_unlocked (port))
  294. {
  295. case EOF:
  296. goto goteof;
  297. default:
  298. goto lp;
  299. case SCM_LINE_INCREMENTORS:
  300. break;
  301. }
  302. break;
  303. case '#':
  304. switch (c = scm_getc_unlocked (port))
  305. {
  306. case EOF:
  307. eoferr = "read_sharp";
  308. goto goteof;
  309. case '!':
  310. scm_read_shebang (c, port, opts);
  311. break;
  312. case ';':
  313. scm_read_commented_expression (c, port, opts);
  314. break;
  315. case '|':
  316. if (scm_is_false (scm_get_hash_procedure (c)))
  317. {
  318. scm_read_r6rs_block_comment (c, port);
  319. break;
  320. }
  321. /* fall through */
  322. default:
  323. scm_ungetc_unlocked (c, port);
  324. return '#';
  325. }
  326. break;
  327. case SCM_LINE_INCREMENTORS:
  328. case SCM_SINGLE_SPACES:
  329. case '\t':
  330. break;
  331. default:
  332. return c;
  333. }
  334. return 0;
  335. }
  336. /* Token readers. */
  337. static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
  338. static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
  339. long line, int column);
  340. static SCM
  341. maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
  342. long line, int column)
  343. {
  344. /* This condition can be caused by a user calling
  345. set-port-column!. */
  346. if (line < 0 || column < 0)
  347. return x;
  348. if (opts->record_positions_p)
  349. scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
  350. return x;
  351. }
  352. static SCM
  353. scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
  354. #define FUNC_NAME "scm_i_lreadparen"
  355. {
  356. int c;
  357. SCM tmp, tl, ans = SCM_EOL;
  358. const int curly_list_p = (chr == '{') && opts->curly_infix_p;
  359. const int terminating_char = ((chr == '{') ? '}'
  360. : ((chr == '[') ? ']'
  361. : ')'));
  362. /* Need to capture line and column numbers here. */
  363. long line = SCM_LINUM (port);
  364. int column = SCM_COL (port) - 1;
  365. c = flush_ws (port, opts, FUNC_NAME);
  366. if (terminating_char == c)
  367. return SCM_EOL;
  368. scm_ungetc_unlocked (c, port);
  369. tmp = scm_read_expression (port, opts);
  370. /* Note that it is possible for scm_read_expression to return
  371. scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
  372. check that it's a real dot by checking `c'. */
  373. if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
  374. {
  375. ans = scm_read_expression (port, opts);
  376. if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
  377. scm_i_input_error (FUNC_NAME, port, "missing close paren",
  378. SCM_EOL);
  379. return ans;
  380. }
  381. /* Build the head of the list structure. */
  382. ans = tl = scm_cons (tmp, SCM_EOL);
  383. while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
  384. {
  385. SCM new_tail;
  386. if (c == ')' || (c == ']' && opts->square_brackets_p)
  387. || ((c == '}' || c == ']') && opts->curly_infix_p))
  388. scm_i_input_error (FUNC_NAME, port,
  389. "in pair: mismatched close paren: ~A",
  390. scm_list_1 (SCM_MAKE_CHAR (c)));
  391. scm_ungetc_unlocked (c, port);
  392. tmp = scm_read_expression (port, opts);
  393. /* See above note about scm_sym_dot. */
  394. if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
  395. {
  396. SCM_SETCDR (tl, scm_read_expression (port, opts));
  397. c = flush_ws (port, opts, FUNC_NAME);
  398. if (terminating_char != c)
  399. scm_i_input_error (FUNC_NAME, port,
  400. "in pair: missing close paren", SCM_EOL);
  401. break;
  402. }
  403. new_tail = scm_cons (tmp, SCM_EOL);
  404. SCM_SETCDR (tl, new_tail);
  405. tl = new_tail;
  406. }
  407. if (curly_list_p)
  408. {
  409. /* In addition to finding the length, 'scm_ilength' checks for
  410. improper or circular lists, in which case it returns -1. */
  411. int len = scm_ilength (ans);
  412. /* The (len == 0) case is handled above */
  413. if (len == 1)
  414. /* Return directly to avoid re-annotating the element's source
  415. location with the position of the outer brace. Also, it
  416. might not be possible to annotate the element. */
  417. return scm_car (ans); /* {e} => e */
  418. else if (len == 2)
  419. ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
  420. else if (len >= 3 && (len & 1))
  421. {
  422. /* It's a proper list whose length is odd and at least 3. If
  423. the elements at odd indices (the infix operator positions)
  424. are all 'equal?', then it's a simple curly-infix list.
  425. Otherwise it's a mixed curly-infix list. */
  426. SCM op = scm_cadr (ans);
  427. /* Check to see if the elements at odd indices are 'equal?' */
  428. for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
  429. {
  430. if (scm_is_null (tl))
  431. {
  432. /* Convert simple curly-infix list to prefix:
  433. {a <op> b <op> ...} => (<op> a b ...) */
  434. tl = ans;
  435. while (scm_is_pair (scm_cdr (tl)))
  436. {
  437. tmp = scm_cddr (tl);
  438. SCM_SETCDR (tl, tmp);
  439. tl = tmp;
  440. }
  441. ans = scm_cons (op, ans);
  442. break;
  443. }
  444. else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
  445. {
  446. /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
  447. ans = scm_cons (sym_nfx, ans);
  448. break;
  449. }
  450. }
  451. }
  452. else
  453. /* Mixed curly-infix (possibly improper) list:
  454. {e . tail} => ($nfx$ e . tail) */
  455. ans = scm_cons (sym_nfx, ans);
  456. }
  457. return maybe_annotate_source (ans, port, opts, line, column);
  458. }
  459. #undef FUNC_NAME
  460. /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
  461. C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
  462. found. */
  463. #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
  464. do \
  465. { \
  466. scm_t_wchar a; \
  467. size_t i = 0; \
  468. c = 0; \
  469. while (i < ndigits) \
  470. { \
  471. a = scm_getc_unlocked (port); \
  472. if (a == EOF) \
  473. goto str_eof; \
  474. if (terminator \
  475. && (a == (scm_t_wchar) terminator) \
  476. && (i > 0)) \
  477. break; \
  478. if ('0' <= a && a <= '9') \
  479. a -= '0'; \
  480. else if ('A' <= a && a <= 'F') \
  481. a = a - 'A' + 10; \
  482. else if ('a' <= a && a <= 'f') \
  483. a = a - 'a' + 10; \
  484. else \
  485. { \
  486. c = a; \
  487. goto bad_escaped; \
  488. } \
  489. c = c * 16 + a; \
  490. i ++; \
  491. } \
  492. } while (0)
  493. static void
  494. skip_intraline_whitespace (SCM port)
  495. {
  496. scm_t_wchar c;
  497. do
  498. {
  499. c = scm_getc_unlocked (port);
  500. if (c == EOF)
  501. return;
  502. }
  503. while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
  504. scm_ungetc_unlocked (c, port);
  505. }
  506. /* Read either a double-quoted string or an R7RS-style symbol delimited
  507. by vertical lines, depending on the value of 'chr' ('"' or '|').
  508. Regardless, the result is always returned as a string. */
  509. static SCM
  510. scm_read_string_like_syntax (int chr, SCM port, scm_t_read_opts *opts)
  511. #define FUNC_NAME "scm_lreadr"
  512. {
  513. /* For strings smaller than C_STR, this function creates only one Scheme
  514. object (the string returned). */
  515. SCM str = SCM_EOL;
  516. size_t c_str_len = 0;
  517. scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
  518. /* Need to capture line and column numbers here. */
  519. long line = SCM_LINUM (port);
  520. int column = SCM_COL (port) - 1;
  521. while (chr != (c = scm_getc_unlocked (port)))
  522. {
  523. if (c == EOF)
  524. {
  525. str_eof:
  526. scm_i_input_error (FUNC_NAME, port,
  527. (chr == '|'
  528. ? "end of file in symbol"
  529. : "end of file in string constant"),
  530. SCM_EOL);
  531. }
  532. if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
  533. {
  534. str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
  535. c_str_len = 0;
  536. }
  537. if (c == '\\')
  538. {
  539. switch (c = scm_getc_unlocked (port))
  540. {
  541. case EOF:
  542. goto str_eof;
  543. case '|':
  544. case '\\':
  545. case '(': /* Accept "\(" for use at the beginning of lines
  546. in multiline strings to avoid confusing emacs
  547. lisp modes. */
  548. break;
  549. case '\n':
  550. if (opts->hungry_eol_escapes_p)
  551. skip_intraline_whitespace (port);
  552. continue;
  553. case '0':
  554. c = '\0';
  555. break;
  556. case 'f':
  557. c = '\f';
  558. break;
  559. case 'n':
  560. c = '\n';
  561. break;
  562. case 'r':
  563. c = '\r';
  564. break;
  565. case 't':
  566. c = '\t';
  567. break;
  568. case 'a':
  569. c = '\007';
  570. break;
  571. case 'v':
  572. c = '\v';
  573. break;
  574. case 'b':
  575. c = '\010';
  576. break;
  577. case 'x':
  578. if (opts->r6rs_escapes_p || chr == '|')
  579. SCM_READ_HEX_ESCAPE (10, ';');
  580. else
  581. SCM_READ_HEX_ESCAPE (2, '\0');
  582. break;
  583. case 'u':
  584. if (!opts->r6rs_escapes_p)
  585. {
  586. SCM_READ_HEX_ESCAPE (4, '\0');
  587. break;
  588. }
  589. case 'U':
  590. if (!opts->r6rs_escapes_p)
  591. {
  592. SCM_READ_HEX_ESCAPE (6, '\0');
  593. break;
  594. }
  595. default:
  596. if (c == chr)
  597. break;
  598. bad_escaped:
  599. scm_i_input_error (FUNC_NAME, port,
  600. "illegal character in escape sequence: ~S",
  601. scm_list_1 (SCM_MAKE_CHAR (c)));
  602. }
  603. }
  604. c_str[c_str_len++] = c;
  605. }
  606. if (scm_is_null (str))
  607. /* Fast path: we got a string that fits in C_STR. */
  608. str = scm_from_utf32_stringn (c_str, c_str_len);
  609. else
  610. {
  611. if (c_str_len > 0)
  612. str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
  613. str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
  614. }
  615. return maybe_annotate_source (str, port, opts, line, column);
  616. }
  617. #undef FUNC_NAME
  618. static SCM
  619. scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
  620. {
  621. return scm_read_string_like_syntax (chr, port, opts);
  622. }
  623. static SCM
  624. scm_read_r7rs_symbol (int chr, SCM port, scm_t_read_opts *opts)
  625. {
  626. return scm_string_to_symbol (scm_read_string_like_syntax (chr, port, opts));
  627. }
  628. static SCM
  629. scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
  630. {
  631. SCM result, str = SCM_EOL;
  632. char local_buffer[READER_BUFFER_SIZE], *buffer;
  633. size_t bytes_read;
  634. /* Need to capture line and column numbers here. */
  635. long line = SCM_LINUM (port);
  636. int column = SCM_COL (port) - 1;
  637. scm_ungetc_unlocked (chr, port);
  638. buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
  639. &bytes_read);
  640. str = scm_from_port_stringn (buffer, bytes_read, port);
  641. result = scm_string_to_number (str, SCM_UNDEFINED);
  642. if (scm_is_false (result))
  643. {
  644. /* Return a symbol instead of a number */
  645. if (opts->case_insensitive_p)
  646. str = scm_string_downcase_x (str);
  647. result = scm_string_to_symbol (str);
  648. }
  649. else if (SCM_NIMP (result))
  650. result = maybe_annotate_source (result, port, opts, line, column);
  651. SCM_COL (port) += scm_i_string_length (str);
  652. return result;
  653. }
  654. static SCM
  655. scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
  656. {
  657. SCM result;
  658. int ends_with_colon = 0;
  659. size_t bytes_read;
  660. int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
  661. char local_buffer[READER_BUFFER_SIZE], *buffer;
  662. SCM str;
  663. scm_ungetc_unlocked (chr, port);
  664. buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
  665. &bytes_read);
  666. if (bytes_read > 0)
  667. ends_with_colon = buffer[bytes_read - 1] == ':';
  668. if (postfix && ends_with_colon && (bytes_read > 1))
  669. {
  670. str = scm_from_port_stringn (buffer, bytes_read - 1, port);
  671. if (opts->case_insensitive_p)
  672. str = scm_string_downcase_x (str);
  673. result = scm_symbol_to_keyword (scm_string_to_symbol (str));
  674. }
  675. else
  676. {
  677. str = scm_from_port_stringn (buffer, bytes_read, port);
  678. if (opts->case_insensitive_p)
  679. str = scm_string_downcase_x (str);
  680. result = scm_string_to_symbol (str);
  681. }
  682. SCM_COL (port) += scm_i_string_length (str);
  683. return result;
  684. }
  685. static SCM
  686. scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
  687. #define FUNC_NAME "scm_lreadr"
  688. {
  689. SCM result;
  690. size_t read;
  691. char local_buffer[READER_BUFFER_SIZE], *buffer;
  692. unsigned int radix;
  693. SCM str;
  694. switch (chr)
  695. {
  696. case 'B':
  697. case 'b':
  698. radix = 2;
  699. break;
  700. case 'o':
  701. case 'O':
  702. radix = 8;
  703. break;
  704. case 'd':
  705. case 'D':
  706. radix = 10;
  707. break;
  708. case 'x':
  709. case 'X':
  710. radix = 16;
  711. break;
  712. default:
  713. scm_ungetc_unlocked (chr, port);
  714. scm_ungetc_unlocked ('#', port);
  715. radix = 10;
  716. }
  717. buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
  718. &read);
  719. str = scm_from_port_stringn (buffer, read, port);
  720. result = scm_string_to_number (str, scm_from_uint (radix));
  721. SCM_COL (port) += scm_i_string_length (str);
  722. if (scm_is_true (result))
  723. return result;
  724. scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
  725. return SCM_BOOL_F;
  726. }
  727. #undef FUNC_NAME
  728. static SCM
  729. scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
  730. {
  731. SCM p;
  732. long line = SCM_LINUM (port);
  733. int column = SCM_COL (port) - 1;
  734. switch (chr)
  735. {
  736. case '`':
  737. p = scm_sym_quasiquote;
  738. break;
  739. case '\'':
  740. p = scm_sym_quote;
  741. break;
  742. case ',':
  743. {
  744. scm_t_wchar c;
  745. c = scm_getc_unlocked (port);
  746. if ('@' == c)
  747. p = scm_sym_uq_splicing;
  748. else
  749. {
  750. scm_ungetc_unlocked (c, port);
  751. p = scm_sym_unquote;
  752. }
  753. break;
  754. }
  755. default:
  756. fprintf (stderr, "%s: unhandled quote character (%i)\n",
  757. "scm_read_quote", chr);
  758. abort ();
  759. }
  760. p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
  761. return maybe_annotate_source (p, port, opts, line, column);
  762. }
  763. SCM_SYMBOL (sym_syntax, "syntax");
  764. SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
  765. SCM_SYMBOL (sym_unsyntax, "unsyntax");
  766. SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
  767. static SCM
  768. scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
  769. {
  770. SCM p;
  771. long line = SCM_LINUM (port);
  772. int column = SCM_COL (port) - 1;
  773. switch (chr)
  774. {
  775. case '`':
  776. p = sym_quasisyntax;
  777. break;
  778. case '\'':
  779. p = sym_syntax;
  780. break;
  781. case ',':
  782. {
  783. int c;
  784. c = scm_getc_unlocked (port);
  785. if ('@' == c)
  786. p = sym_unsyntax_splicing;
  787. else
  788. {
  789. scm_ungetc_unlocked (c, port);
  790. p = sym_unsyntax;
  791. }
  792. break;
  793. }
  794. default:
  795. fprintf (stderr, "%s: unhandled syntax character (%i)\n",
  796. "scm_read_syntax", chr);
  797. abort ();
  798. }
  799. p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
  800. return maybe_annotate_source (p, port, opts, line, column);
  801. }
  802. static SCM
  803. scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
  804. {
  805. SCM id = scm_read_mixed_case_symbol (chr, port, opts);
  806. if (!scm_is_eq (id, sym_nil))
  807. scm_i_input_error ("scm_read_nil", port,
  808. "unexpected input while reading #nil: ~a",
  809. scm_list_1 (id));
  810. return SCM_ELISP_NIL;
  811. }
  812. static SCM
  813. scm_read_semicolon_comment (int chr, SCM port)
  814. {
  815. int c;
  816. /* We use the get_byte here because there is no need to get the
  817. locale correct with comment input. This presumes that newline
  818. always represents itself no matter what the encoding is. */
  819. for (c = scm_get_byte_or_eof_unlocked (port);
  820. (c != EOF) && (c != '\n');
  821. c = scm_get_byte_or_eof_unlocked (port));
  822. return SCM_UNSPECIFIED;
  823. }
  824. /* If the EXPECTED_CHARS are the next ones available from PORT, then
  825. consume them and return 1. Otherwise leave the port position where
  826. it was and return 0. EXPECTED_CHARS should be all lowercase, and
  827. will be matched case-insensitively against the characters read from
  828. PORT. */
  829. static int
  830. try_read_ci_chars (SCM port, const char *expected_chars)
  831. {
  832. int num_chars_wanted = strlen (expected_chars);
  833. int num_chars_read = 0;
  834. char *chars_read = alloca (num_chars_wanted);
  835. int c;
  836. while (num_chars_read < num_chars_wanted)
  837. {
  838. c = scm_getc_unlocked (port);
  839. if (c == EOF)
  840. break;
  841. else if (c_tolower (c) != expected_chars[num_chars_read])
  842. {
  843. scm_ungetc_unlocked (c, port);
  844. break;
  845. }
  846. else
  847. chars_read[num_chars_read++] = c;
  848. }
  849. if (num_chars_read == num_chars_wanted)
  850. return 1;
  851. else
  852. {
  853. while (num_chars_read > 0)
  854. scm_ungetc_unlocked (chars_read[--num_chars_read], port);
  855. return 0;
  856. }
  857. }
  858. /* Sharp readers, i.e. readers called after a `#' sign has been read. */
  859. static SCM
  860. scm_read_boolean (int chr, SCM port)
  861. {
  862. switch (chr)
  863. {
  864. case 't':
  865. case 'T':
  866. try_read_ci_chars (port, "rue");
  867. return SCM_BOOL_T;
  868. case 'f':
  869. case 'F':
  870. try_read_ci_chars (port, "alse");
  871. return SCM_BOOL_F;
  872. }
  873. return SCM_UNSPECIFIED;
  874. }
  875. static SCM
  876. scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
  877. #define FUNC_NAME "scm_lreadr"
  878. {
  879. char buffer[READER_CHAR_NAME_MAX_SIZE];
  880. SCM charname;
  881. size_t charname_len, bytes_read;
  882. scm_t_wchar cp;
  883. int overflow;
  884. scm_t_port_internal *pti;
  885. overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
  886. &bytes_read);
  887. if (overflow)
  888. scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
  889. if (bytes_read == 0)
  890. {
  891. chr = scm_getc_unlocked (port);
  892. if (chr == EOF)
  893. scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
  894. "while reading character", SCM_EOL);
  895. /* CHR must be a token delimiter, like a whitespace. */
  896. return (SCM_MAKE_CHAR (chr));
  897. }
  898. pti = SCM_PORT_GET_INTERNAL (port);
  899. /* Simple ASCII characters can be processed immediately. Also, simple
  900. ISO-8859-1 characters can be processed immediately if the encoding for this
  901. port is ISO-8859-1. */
  902. if (bytes_read == 1 &&
  903. ((unsigned char) buffer[0] <= 127
  904. || pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1))
  905. {
  906. SCM_COL (port) += 1;
  907. return SCM_MAKE_CHAR (buffer[0]);
  908. }
  909. /* Otherwise, convert the buffer into a proper scheme string for
  910. processing. */
  911. charname = scm_from_port_stringn (buffer, bytes_read, port);
  912. charname_len = scm_i_string_length (charname);
  913. SCM_COL (port) += charname_len;
  914. cp = scm_i_string_ref (charname, 0);
  915. if (charname_len == 1)
  916. return SCM_MAKE_CHAR (cp);
  917. /* Ignore dotted circles, which may be used to keep combining characters from
  918. combining with the backslash in #\charname. */
  919. if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
  920. return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
  921. if (cp >= '0' && cp < '8')
  922. {
  923. /* Dirk:FIXME:: This type of character syntax is not R5RS
  924. * compliant. Further, it should be verified that the constant
  925. * does only consist of octal digits. */
  926. SCM p = scm_string_to_number (charname, scm_from_uint (8));
  927. if (SCM_I_INUMP (p))
  928. {
  929. scm_t_wchar c = scm_to_uint32 (p);
  930. if (SCM_IS_UNICODE_CHAR (c))
  931. return SCM_MAKE_CHAR (c);
  932. else
  933. scm_i_input_error (FUNC_NAME, port,
  934. "out-of-range octal character escape: ~a",
  935. scm_list_1 (charname));
  936. }
  937. }
  938. if (cp == 'x' && (charname_len > 1))
  939. {
  940. SCM p;
  941. /* Convert from hex, skipping the initial 'x' character in CHARNAME */
  942. p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
  943. scm_from_uint (16));
  944. if (SCM_I_INUMP (p))
  945. {
  946. scm_t_wchar c = scm_to_uint32 (p);
  947. if (SCM_IS_UNICODE_CHAR (c))
  948. return SCM_MAKE_CHAR (c);
  949. else
  950. scm_i_input_error (FUNC_NAME, port,
  951. "out-of-range hex character escape: ~a",
  952. scm_list_1 (charname));
  953. }
  954. }
  955. /* The names of characters should never have non-Latin1
  956. characters. */
  957. if (scm_i_is_narrow_string (charname)
  958. || scm_i_try_narrow_string (charname))
  959. { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
  960. charname_len);
  961. if (scm_is_true (ch))
  962. return ch;
  963. }
  964. scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
  965. scm_list_1 (charname));
  966. return SCM_UNSPECIFIED;
  967. }
  968. #undef FUNC_NAME
  969. static SCM
  970. scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
  971. {
  972. SCM symbol;
  973. /* Read the symbol that comprises the keyword. Doing this instead of
  974. invoking a specific symbol reader function allows `scm_read_keyword ()'
  975. to adapt to the delimiters currently valid of symbols.
  976. XXX: This implementation allows sloppy syntaxes like `#: key'. */
  977. symbol = scm_read_expression (port, opts);
  978. if (!scm_is_symbol (symbol))
  979. scm_i_input_error ("scm_read_keyword", port,
  980. "keyword prefix `~a' not followed by a symbol: ~s",
  981. scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
  982. return (scm_symbol_to_keyword (symbol));
  983. }
  984. static SCM
  985. scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
  986. long line, int column)
  987. {
  988. /* Note: We call `scm_read_sexp ()' rather than READER here in order to
  989. guarantee that it's going to do what we want. After all, this is an
  990. implementation detail of `scm_read_vector ()', not a desirable
  991. property. */
  992. return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
  993. port, opts, line, column);
  994. }
  995. /* Helper used by scm_read_array */
  996. static int
  997. read_decimal_integer (SCM port, int c, ssize_t *resp)
  998. {
  999. ssize_t sign = 1;
  1000. ssize_t res = 0;
  1001. int got_it = 0;
  1002. if (c == '-')
  1003. {
  1004. sign = -1;
  1005. c = scm_getc_unlocked (port);
  1006. }
  1007. while ('0' <= c && c <= '9')
  1008. {
  1009. if (((SSIZE_MAX - (c-'0')) / 10) <= res)
  1010. scm_i_input_error ("read_decimal_integer", port,
  1011. "number too large", SCM_EOL);
  1012. res = 10*res + c-'0';
  1013. got_it = 1;
  1014. c = scm_getc_unlocked (port);
  1015. }
  1016. if (got_it)
  1017. *resp = sign * res;
  1018. return c;
  1019. }
  1020. /* Read an array. This function can also read vectors and uniform
  1021. vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
  1022. handled here.
  1023. C is the first character read after the '#'. */
  1024. static SCM
  1025. scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
  1026. {
  1027. ssize_t rank;
  1028. scm_t_wchar tag_buf[8];
  1029. int tag_len;
  1030. SCM tag, shape = SCM_BOOL_F, elements, array;
  1031. /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
  1032. the array code can not deal with zero-length dimensions yet, and
  1033. we want to allow zero-length vectors, of course. */
  1034. if (c == '(')
  1035. return scm_read_vector (c, port, opts, line, column);
  1036. /* Disambiguate between '#f' and uniform floating point vectors. */
  1037. if (c == 'f')
  1038. {
  1039. c = scm_getc_unlocked (port);
  1040. if (c != '3' && c != '6')
  1041. {
  1042. if (c == 'a' && try_read_ci_chars (port, "lse"))
  1043. return SCM_BOOL_F;
  1044. else if (c != EOF)
  1045. scm_ungetc_unlocked (c, port);
  1046. return SCM_BOOL_F;
  1047. }
  1048. rank = 1;
  1049. tag_buf[0] = 'f';
  1050. tag_len = 1;
  1051. goto continue_reading_tag;
  1052. }
  1053. /* Read rank. */
  1054. rank = 1;
  1055. c = read_decimal_integer (port, c, &rank);
  1056. if (rank < 0)
  1057. scm_i_input_error (NULL, port, "array rank must be non-negative",
  1058. SCM_EOL);
  1059. /* Read tag. */
  1060. tag_len = 0;
  1061. continue_reading_tag:
  1062. while (c != EOF && c != '(' && c != '@' && c != ':'
  1063. && tag_len < sizeof tag_buf / sizeof tag_buf[0])
  1064. {
  1065. tag_buf[tag_len++] = c;
  1066. c = scm_getc_unlocked (port);
  1067. }
  1068. if (tag_len == 0)
  1069. tag = SCM_BOOL_T;
  1070. else
  1071. {
  1072. tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
  1073. if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
  1074. scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
  1075. scm_list_1 (tag));
  1076. }
  1077. /* Read shape. */
  1078. if (c == '@' || c == ':')
  1079. {
  1080. shape = SCM_EOL;
  1081. do
  1082. {
  1083. ssize_t lbnd = 0, len = 0;
  1084. SCM s;
  1085. if (c == '@')
  1086. {
  1087. c = scm_getc_unlocked (port);
  1088. c = read_decimal_integer (port, c, &lbnd);
  1089. }
  1090. s = scm_from_ssize_t (lbnd);
  1091. if (c == ':')
  1092. {
  1093. c = scm_getc_unlocked (port);
  1094. c = read_decimal_integer (port, c, &len);
  1095. if (len < 0)
  1096. scm_i_input_error (NULL, port,
  1097. "array length must be non-negative",
  1098. SCM_EOL);
  1099. s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
  1100. }
  1101. shape = scm_cons (s, shape);
  1102. } while (c == '@' || c == ':');
  1103. shape = scm_reverse_x (shape, SCM_EOL);
  1104. }
  1105. /* Read nested lists of elements. */
  1106. if (c != '(')
  1107. scm_i_input_error (NULL, port,
  1108. "missing '(' in vector or array literal",
  1109. SCM_EOL);
  1110. elements = scm_read_sexp (c, port, opts);
  1111. if (scm_is_false (shape))
  1112. shape = scm_from_ssize_t (rank);
  1113. else if (scm_ilength (shape) != rank)
  1114. scm_i_input_error
  1115. (NULL, port,
  1116. "the number of shape specifications must match the array rank",
  1117. SCM_EOL);
  1118. /* Handle special print syntax of rank zero arrays; see
  1119. scm_i_print_array for a rationale. */
  1120. if (rank == 0)
  1121. {
  1122. if (!scm_is_pair (elements))
  1123. scm_i_input_error (NULL, port,
  1124. "too few elements in array literal, need 1",
  1125. SCM_EOL);
  1126. if (!scm_is_null (SCM_CDR (elements)))
  1127. scm_i_input_error (NULL, port,
  1128. "too many elements in array literal, want 1",
  1129. SCM_EOL);
  1130. elements = SCM_CAR (elements);
  1131. }
  1132. /* Construct array, annotate with source location, and return. */
  1133. array = scm_list_to_typed_array (tag, shape, elements);
  1134. return maybe_annotate_source (array, port, opts, line, column);
  1135. }
  1136. static SCM
  1137. scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
  1138. long line, int column)
  1139. {
  1140. return scm_read_array (chr, port, opts, line, column);
  1141. }
  1142. static SCM
  1143. scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
  1144. long line, int column)
  1145. {
  1146. chr = scm_getc_unlocked (port);
  1147. if (chr != 'u')
  1148. goto syntax;
  1149. chr = scm_getc_unlocked (port);
  1150. if (chr != '8')
  1151. goto syntax;
  1152. chr = scm_getc_unlocked (port);
  1153. if (chr != '(')
  1154. goto syntax;
  1155. return maybe_annotate_source
  1156. (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
  1157. port, opts, line, column);
  1158. syntax:
  1159. scm_i_input_error ("read_bytevector", port,
  1160. "invalid bytevector prefix",
  1161. SCM_MAKE_CHAR (chr));
  1162. return SCM_UNSPECIFIED;
  1163. }
  1164. static SCM
  1165. scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
  1166. long line, int column)
  1167. {
  1168. /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
  1169. terribly inefficient but who cares? */
  1170. SCM s_bits = SCM_EOL;
  1171. for (chr = scm_getc_unlocked (port);
  1172. (chr != EOF) && ((chr == '0') || (chr == '1'));
  1173. chr = scm_getc_unlocked (port))
  1174. {
  1175. s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
  1176. }
  1177. if (chr != EOF)
  1178. scm_ungetc_unlocked (chr, port);
  1179. return maybe_annotate_source
  1180. (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
  1181. port, opts, line, column);
  1182. }
  1183. static SCM
  1184. scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
  1185. {
  1186. int bang_seen = 0;
  1187. for (;;)
  1188. {
  1189. int c = scm_getc_unlocked (port);
  1190. if (c == EOF)
  1191. scm_i_input_error ("skip_block_comment", port,
  1192. "unterminated `#! ... !#' comment", SCM_EOL);
  1193. if (c == '!')
  1194. bang_seen = 1;
  1195. else if (c == '#' && bang_seen)
  1196. break;
  1197. else
  1198. bang_seen = 0;
  1199. }
  1200. return SCM_UNSPECIFIED;
  1201. }
  1202. static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
  1203. int value);
  1204. static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
  1205. int value);
  1206. static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
  1207. int value);
  1208. static SCM
  1209. scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
  1210. {
  1211. char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
  1212. int c;
  1213. int i = 0;
  1214. while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
  1215. {
  1216. c = scm_getc_unlocked (port);
  1217. if (c == EOF)
  1218. scm_i_input_error ("skip_block_comment", port,
  1219. "unterminated `#! ... !#' comment", SCM_EOL);
  1220. else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
  1221. name[i++] = c;
  1222. else if (CHAR_IS_DELIMITER (c))
  1223. {
  1224. scm_ungetc_unlocked (c, port);
  1225. name[i] = '\0';
  1226. if (0 == strcmp ("r6rs", name))
  1227. ; /* Silently ignore */
  1228. else if (0 == strcmp ("fold-case", name))
  1229. set_port_case_insensitive_p (port, opts, 1);
  1230. else if (0 == strcmp ("no-fold-case", name))
  1231. set_port_case_insensitive_p (port, opts, 0);
  1232. else if (0 == strcmp ("curly-infix", name))
  1233. set_port_curly_infix_p (port, opts, 1);
  1234. else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
  1235. {
  1236. set_port_curly_infix_p (port, opts, 1);
  1237. set_port_square_brackets_p (port, opts, 0);
  1238. }
  1239. else
  1240. break;
  1241. return SCM_UNSPECIFIED;
  1242. }
  1243. else
  1244. {
  1245. scm_ungetc_unlocked (c, port);
  1246. break;
  1247. }
  1248. }
  1249. while (i > 0)
  1250. scm_ungetc_unlocked (name[--i], port);
  1251. return scm_read_scsh_block_comment (chr, port);
  1252. }
  1253. static SCM
  1254. scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
  1255. {
  1256. /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
  1257. nested. So care must be taken. */
  1258. int nesting_level = 1;
  1259. int a = scm_getc_unlocked (port);
  1260. if (a == EOF)
  1261. scm_i_input_error ("scm_read_r6rs_block_comment", port,
  1262. "unterminated `#| ... |#' comment", SCM_EOL);
  1263. while (nesting_level > 0)
  1264. {
  1265. int b = scm_getc_unlocked (port);
  1266. if (b == EOF)
  1267. scm_i_input_error ("scm_read_r6rs_block_comment", port,
  1268. "unterminated `#| ... |#' comment", SCM_EOL);
  1269. if (a == '|' && b == '#')
  1270. {
  1271. nesting_level--;
  1272. b = EOF;
  1273. }
  1274. else if (a == '#' && b == '|')
  1275. {
  1276. nesting_level++;
  1277. b = EOF;
  1278. }
  1279. a = b;
  1280. }
  1281. return SCM_UNSPECIFIED;
  1282. }
  1283. static SCM
  1284. scm_read_commented_expression (scm_t_wchar chr, SCM port,
  1285. scm_t_read_opts *opts)
  1286. {
  1287. scm_t_wchar c;
  1288. c = flush_ws (port, opts, (char *) NULL);
  1289. if (EOF == c)
  1290. scm_i_input_error ("read_commented_expression", port,
  1291. "no expression after #; comment", SCM_EOL);
  1292. scm_ungetc_unlocked (c, port);
  1293. scm_read_expression (port, opts);
  1294. return SCM_UNSPECIFIED;
  1295. }
  1296. static SCM
  1297. scm_read_extended_symbol (scm_t_wchar chr, SCM port)
  1298. {
  1299. /* Guile's extended symbol read syntax looks like this:
  1300. #{This is all a symbol name}#
  1301. So here, CHR is expected to be `{'. */
  1302. int saw_brace = 0;
  1303. size_t len = 0;
  1304. SCM buf = scm_i_make_string (1024, NULL, 0);
  1305. buf = scm_i_string_start_writing (buf);
  1306. while ((chr = scm_getc_unlocked (port)) != EOF)
  1307. {
  1308. if (saw_brace)
  1309. {
  1310. if (chr == '#')
  1311. {
  1312. break;
  1313. }
  1314. else
  1315. {
  1316. saw_brace = 0;
  1317. scm_i_string_set_x (buf, len++, '}');
  1318. }
  1319. }
  1320. if (chr == '}')
  1321. saw_brace = 1;
  1322. else if (chr == '\\')
  1323. {
  1324. /* It used to be that print.c would print extended-read-syntax
  1325. symbols with backslashes before "non-standard" chars, but
  1326. this routine wouldn't do anything with those escapes.
  1327. Bummer. What we've done is to change print.c to output
  1328. R6RS hex escapes for those characters, relying on the fact
  1329. that the extended read syntax would never put a `\' before
  1330. an `x'. For now, we just ignore other instances of
  1331. backslash in the string. */
  1332. switch ((chr = scm_getc_unlocked (port)))
  1333. {
  1334. case EOF:
  1335. goto done;
  1336. case 'x':
  1337. {
  1338. scm_t_wchar c;
  1339. SCM_READ_HEX_ESCAPE (10, ';');
  1340. scm_i_string_set_x (buf, len++, c);
  1341. break;
  1342. str_eof:
  1343. chr = EOF;
  1344. goto done;
  1345. bad_escaped:
  1346. scm_i_string_stop_writing ();
  1347. scm_i_input_error ("scm_read_extended_symbol", port,
  1348. "illegal character in escape sequence: ~S",
  1349. scm_list_1 (SCM_MAKE_CHAR (c)));
  1350. break;
  1351. }
  1352. default:
  1353. scm_i_string_set_x (buf, len++, chr);
  1354. break;
  1355. }
  1356. }
  1357. else
  1358. scm_i_string_set_x (buf, len++, chr);
  1359. if (len >= scm_i_string_length (buf) - 2)
  1360. {
  1361. SCM addy;
  1362. scm_i_string_stop_writing ();
  1363. addy = scm_i_make_string (1024, NULL, 0);
  1364. buf = scm_string_append (scm_list_2 (buf, addy));
  1365. len = 0;
  1366. buf = scm_i_string_start_writing (buf);
  1367. }
  1368. }
  1369. done:
  1370. scm_i_string_stop_writing ();
  1371. if (chr == EOF)
  1372. scm_i_input_error ("scm_read_extended_symbol", port,
  1373. "end of file while reading symbol", SCM_EOL);
  1374. return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
  1375. }
  1376. /* Top-level token readers, i.e., dispatchers. */
  1377. static SCM
  1378. scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
  1379. {
  1380. SCM proc;
  1381. proc = scm_get_hash_procedure (chr);
  1382. if (scm_is_true (scm_procedure_p (proc)))
  1383. {
  1384. long line = SCM_LINUM (port);
  1385. int column = SCM_COL (port) - 2;
  1386. SCM got;
  1387. got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
  1388. if (opts->record_positions_p && SCM_NIMP (got)
  1389. && !scm_i_has_source_properties (got))
  1390. scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
  1391. return got;
  1392. }
  1393. return SCM_UNSPECIFIED;
  1394. }
  1395. /* The reader for the sharp `#' character. It basically dispatches reads
  1396. among the above token readers. */
  1397. static SCM
  1398. scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
  1399. long line, int column)
  1400. #define FUNC_NAME "scm_lreadr"
  1401. {
  1402. SCM result;
  1403. chr = scm_getc_unlocked (port);
  1404. result = scm_read_sharp_extension (chr, port, opts);
  1405. if (!scm_is_eq (result, SCM_UNSPECIFIED))
  1406. return result;
  1407. switch (chr)
  1408. {
  1409. case '\\':
  1410. return (scm_read_character (chr, port, opts));
  1411. case '(':
  1412. return (scm_read_vector (chr, port, opts, line, column));
  1413. case 's':
  1414. case 'u':
  1415. case 'f':
  1416. case 'c':
  1417. /* This one may return either a boolean or an SRFI-4 vector. */
  1418. return (scm_read_srfi4_vector (chr, port, opts, line, column));
  1419. case 'v':
  1420. return (scm_read_bytevector (chr, port, opts, line, column));
  1421. case '*':
  1422. return (scm_read_guile_bit_vector (chr, port, opts, line, column));
  1423. case 't':
  1424. case 'T':
  1425. case 'F':
  1426. return (scm_read_boolean (chr, port));
  1427. case ':':
  1428. return (scm_read_keyword (chr, port, opts));
  1429. case '0': case '1': case '2': case '3': case '4':
  1430. case '5': case '6': case '7': case '8': case '9':
  1431. case '@':
  1432. return (scm_read_array (chr, port, opts, line, column));
  1433. case 'i':
  1434. case 'e':
  1435. case 'b':
  1436. case 'B':
  1437. case 'o':
  1438. case 'O':
  1439. case 'd':
  1440. case 'D':
  1441. case 'x':
  1442. case 'X':
  1443. case 'I':
  1444. case 'E':
  1445. return (scm_read_number_and_radix (chr, port, opts));
  1446. case '{':
  1447. return (scm_read_extended_symbol (chr, port));
  1448. case '!':
  1449. return (scm_read_shebang (chr, port, opts));
  1450. case ';':
  1451. return (scm_read_commented_expression (chr, port, opts));
  1452. case '`':
  1453. case '\'':
  1454. case ',':
  1455. return (scm_read_syntax (chr, port, opts));
  1456. case 'n':
  1457. return (scm_read_nil (chr, port, opts));
  1458. default:
  1459. result = scm_read_sharp_extension (chr, port, opts);
  1460. if (scm_is_eq (result, SCM_UNSPECIFIED))
  1461. {
  1462. /* To remain compatible with 1.8 and earlier, the following
  1463. characters have lower precedence than `read-hash-extend'
  1464. characters. */
  1465. switch (chr)
  1466. {
  1467. case '|':
  1468. return scm_read_r6rs_block_comment (chr, port);
  1469. default:
  1470. scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
  1471. scm_list_1 (SCM_MAKE_CHAR (chr)));
  1472. }
  1473. }
  1474. else
  1475. return result;
  1476. }
  1477. return SCM_UNSPECIFIED;
  1478. }
  1479. #undef FUNC_NAME
  1480. static SCM
  1481. read_inner_expression (SCM port, scm_t_read_opts *opts)
  1482. #define FUNC_NAME "read_inner_expression"
  1483. {
  1484. while (1)
  1485. {
  1486. scm_t_wchar chr;
  1487. chr = scm_getc_unlocked (port);
  1488. switch (chr)
  1489. {
  1490. case SCM_WHITE_SPACES:
  1491. case SCM_LINE_INCREMENTORS:
  1492. break;
  1493. case ';':
  1494. (void) scm_read_semicolon_comment (chr, port);
  1495. break;
  1496. case '{':
  1497. if (opts->curly_infix_p)
  1498. {
  1499. if (opts->neoteric_p)
  1500. return scm_read_sexp (chr, port, opts);
  1501. else
  1502. {
  1503. SCM expr;
  1504. /* Enable neoteric expressions within curly braces */
  1505. opts->neoteric_p = 1;
  1506. expr = scm_read_sexp (chr, port, opts);
  1507. opts->neoteric_p = 0;
  1508. return expr;
  1509. }
  1510. }
  1511. else
  1512. return scm_read_mixed_case_symbol (chr, port, opts);
  1513. case '[':
  1514. if (opts->square_brackets_p)
  1515. return scm_read_sexp (chr, port, opts);
  1516. else if (opts->curly_infix_p)
  1517. {
  1518. /* The syntax of neoteric expressions requires that '[' be
  1519. a delimiter when curly-infix is enabled, so it cannot
  1520. be part of an unescaped symbol. We might as well do
  1521. something useful with it, so we adopt Kawa's convention:
  1522. [...] => ($bracket-list$ ...) */
  1523. long line = SCM_LINUM (port);
  1524. int column = SCM_COL (port) - 1;
  1525. return maybe_annotate_source
  1526. (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
  1527. port, opts, line, column);
  1528. }
  1529. else
  1530. return scm_read_mixed_case_symbol (chr, port, opts);
  1531. case '(':
  1532. return (scm_read_sexp (chr, port, opts));
  1533. case '"':
  1534. return (scm_read_string (chr, port, opts));
  1535. case '|':
  1536. if (opts->r7rs_symbols_p)
  1537. return scm_read_r7rs_symbol (chr, port, opts);
  1538. else
  1539. return scm_read_mixed_case_symbol (chr, port, opts);
  1540. case '\'':
  1541. case '`':
  1542. case ',':
  1543. return (scm_read_quote (chr, port, opts));
  1544. case '#':
  1545. {
  1546. long line = SCM_LINUM (port);
  1547. int column = SCM_COL (port) - 1;
  1548. SCM result = scm_read_sharp (chr, port, opts, line, column);
  1549. if (scm_is_eq (result, SCM_UNSPECIFIED))
  1550. /* We read a comment or some such. */
  1551. break;
  1552. else
  1553. return result;
  1554. }
  1555. case ')':
  1556. scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
  1557. break;
  1558. case '}':
  1559. if (opts->curly_infix_p)
  1560. scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
  1561. else
  1562. return scm_read_mixed_case_symbol (chr, port, opts);
  1563. case ']':
  1564. if (opts->square_brackets_p)
  1565. scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
  1566. /* otherwise fall through */
  1567. case EOF:
  1568. return SCM_EOF_VAL;
  1569. case ':':
  1570. if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
  1571. return scm_symbol_to_keyword (scm_read_expression (port, opts));
  1572. /* Fall through. */
  1573. default:
  1574. {
  1575. if (((chr >= '0') && (chr <= '9'))
  1576. || (strchr ("+-.", chr)))
  1577. return (scm_read_number (chr, port, opts));
  1578. else
  1579. return (scm_read_mixed_case_symbol (chr, port, opts));
  1580. }
  1581. }
  1582. }
  1583. }
  1584. #undef FUNC_NAME
  1585. static SCM
  1586. scm_read_expression (SCM port, scm_t_read_opts *opts)
  1587. #define FUNC_NAME "scm_read_expression"
  1588. {
  1589. if (!opts->neoteric_p)
  1590. return read_inner_expression (port, opts);
  1591. else
  1592. {
  1593. long line = 0;
  1594. int column = 0;
  1595. SCM expr;
  1596. if (opts->record_positions_p)
  1597. {
  1598. /* We need to get the position of the first non-whitespace
  1599. character in order to correctly annotate neoteric
  1600. expressions. For example, for the expression 'f(x)', the
  1601. first call to 'read_inner_expression' reads the 'f' (which
  1602. cannot be annotated), and then we later read the '(x)' and
  1603. use it to construct the new list (f x). */
  1604. int c = flush_ws (port, opts, (char *) NULL);
  1605. if (c == EOF)
  1606. return SCM_EOF_VAL;
  1607. scm_ungetc_unlocked (c, port);
  1608. line = SCM_LINUM (port);
  1609. column = SCM_COL (port);
  1610. }
  1611. expr = read_inner_expression (port, opts);
  1612. /* 'expr' is the first component of the neoteric expression. Now
  1613. we loop, and as long as the next character is '(', '[', or '{',
  1614. (without any intervening whitespace), we use it to construct a
  1615. new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
  1616. for (;;)
  1617. {
  1618. int chr = scm_getc_unlocked (port);
  1619. if (chr == '(')
  1620. /* e(...) => (e ...) */
  1621. expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
  1622. else if (chr == '[')
  1623. /* e[...] => ($bracket-apply$ e ...) */
  1624. expr = scm_cons (sym_bracket_apply,
  1625. scm_cons (expr,
  1626. scm_read_sexp (chr, port, opts)));
  1627. else if (chr == '{')
  1628. {
  1629. SCM arg = scm_read_sexp (chr, port, opts);
  1630. if (scm_is_null (arg))
  1631. expr = scm_list_1 (expr); /* e{} => (e) */
  1632. else
  1633. expr = scm_list_2 (expr, arg); /* e{...} => (e {...}) */
  1634. }
  1635. else
  1636. {
  1637. if (chr != EOF)
  1638. scm_ungetc_unlocked (chr, port);
  1639. break;
  1640. }
  1641. maybe_annotate_source (expr, port, opts, line, column);
  1642. }
  1643. return expr;
  1644. }
  1645. }
  1646. #undef FUNC_NAME
  1647. /* Actual reader. */
  1648. static void init_read_options (SCM port, scm_t_read_opts *opts);
  1649. SCM_DEFINE (scm_read, "read", 0, 1, 0,
  1650. (SCM port),
  1651. "Read an s-expression from the input port @var{port}, or from\n"
  1652. "the current input port if @var{port} is not specified.\n"
  1653. "Any whitespace before the next token is discarded.")
  1654. #define FUNC_NAME s_scm_read
  1655. {
  1656. scm_t_read_opts opts;
  1657. int c;
  1658. if (SCM_UNBNDP (port))
  1659. port = scm_current_input_port ();
  1660. SCM_VALIDATE_OPINPORT (1, port);
  1661. init_read_options (port, &opts);
  1662. c = flush_ws (port, &opts, (char *) NULL);
  1663. if (EOF == c)
  1664. return SCM_EOF_VAL;
  1665. scm_ungetc_unlocked (c, port);
  1666. return (scm_read_expression (port, &opts));
  1667. }
  1668. #undef FUNC_NAME
  1669. /* Manipulate the read-hash-procedures alist. This could be written in
  1670. Scheme, but maybe it will also be used by C code during initialisation. */
  1671. SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
  1672. (SCM chr, SCM proc),
  1673. "Install the procedure @var{proc} for reading expressions\n"
  1674. "starting with the character sequence @code{#} and @var{chr}.\n"
  1675. "@var{proc} will be called with two arguments: the character\n"
  1676. "@var{chr} and the port to read further data from. The object\n"
  1677. "returned will be the return value of @code{read}. \n"
  1678. "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
  1679. )
  1680. #define FUNC_NAME s_scm_read_hash_extend
  1681. {
  1682. SCM this;
  1683. SCM prev;
  1684. SCM_VALIDATE_CHAR (1, chr);
  1685. SCM_ASSERT (scm_is_false (proc)
  1686. || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
  1687. proc, SCM_ARG2, FUNC_NAME);
  1688. /* Check if chr is already in the alist. */
  1689. this = scm_i_read_hash_procedures_ref ();
  1690. prev = SCM_BOOL_F;
  1691. while (1)
  1692. {
  1693. if (scm_is_null (this))
  1694. {
  1695. /* not found, so add it to the beginning. */
  1696. if (scm_is_true (proc))
  1697. {
  1698. SCM new = scm_cons (scm_cons (chr, proc),
  1699. scm_i_read_hash_procedures_ref ());
  1700. scm_i_read_hash_procedures_set_x (new);
  1701. }
  1702. break;
  1703. }
  1704. if (scm_is_eq (chr, SCM_CAAR (this)))
  1705. {
  1706. /* already in the alist. */
  1707. if (scm_is_false (proc))
  1708. {
  1709. /* remove it. */
  1710. if (scm_is_false (prev))
  1711. {
  1712. SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
  1713. scm_i_read_hash_procedures_set_x (rest);
  1714. }
  1715. else
  1716. scm_set_cdr_x (prev, SCM_CDR (this));
  1717. }
  1718. else
  1719. {
  1720. /* replace it. */
  1721. scm_set_cdr_x (SCM_CAR (this), proc);
  1722. }
  1723. break;
  1724. }
  1725. prev = this;
  1726. this = SCM_CDR (this);
  1727. }
  1728. return SCM_UNSPECIFIED;
  1729. }
  1730. #undef FUNC_NAME
  1731. /* Recover the read-hash procedure corresponding to char c. */
  1732. static SCM
  1733. scm_get_hash_procedure (int c)
  1734. {
  1735. SCM rest = scm_i_read_hash_procedures_ref ();
  1736. while (1)
  1737. {
  1738. if (scm_is_null (rest))
  1739. return SCM_BOOL_F;
  1740. if (SCM_CHAR (SCM_CAAR (rest)) == c)
  1741. return SCM_CDAR (rest);
  1742. rest = SCM_CDR (rest);
  1743. }
  1744. }
  1745. static int
  1746. is_encoding_char (char c)
  1747. {
  1748. if (c >= 'a' && c <= 'z') return 1;
  1749. if (c >= 'A' && c <= 'Z') return 1;
  1750. if (c >= '0' && c <= '9') return 1;
  1751. return strchr ("_-.:/,+=()", c) != NULL;
  1752. }
  1753. /* Maximum size of an encoding name. This is a bit more than the
  1754. longest name listed at
  1755. <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
  1756. characters.) */
  1757. #define ENCODING_NAME_MAX_SIZE 20
  1758. /* Number of bytes at the beginning or end of a file that are scanned
  1759. for a "coding:" declaration. */
  1760. #define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
  1761. /* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
  1762. coding declaration. Returns either NULL or a string whose storage
  1763. has been allocated with `scm_gc_malloc'. */
  1764. char *
  1765. scm_i_scan_for_encoding (SCM port)
  1766. {
  1767. scm_t_port *pt;
  1768. char header[SCM_ENCODING_SEARCH_SIZE+1];
  1769. size_t bytes_read, encoding_length, i;
  1770. char *encoding = NULL;
  1771. char *pos, *encoding_start;
  1772. int in_comment;
  1773. pt = SCM_PTAB_ENTRY (port);
  1774. if (pt->rw_active == SCM_PORT_WRITE)
  1775. scm_flush_unlocked (port);
  1776. if (pt->rw_random)
  1777. pt->rw_active = SCM_PORT_READ;
  1778. if (pt->read_pos == pt->read_end)
  1779. {
  1780. /* We can use the read buffer, and thus avoid a seek. */
  1781. if (scm_fill_input_unlocked (port) == EOF)
  1782. return NULL;
  1783. bytes_read = pt->read_end - pt->read_pos;
  1784. if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
  1785. bytes_read = SCM_ENCODING_SEARCH_SIZE;
  1786. if (bytes_read <= 1)
  1787. /* An unbuffered port -- don't scan. */
  1788. return NULL;
  1789. memcpy (header, pt->read_pos, bytes_read);
  1790. header[bytes_read] = '\0';
  1791. }
  1792. else
  1793. {
  1794. /* Try to read some bytes and then seek back. Not all ports
  1795. support seeking back; and indeed some file ports (like
  1796. /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
  1797. check performed by SCM_FPORT_FDES---but fail to seek
  1798. backwards. Hence this block comes second. We prefer to use
  1799. the read buffer in-place. */
  1800. if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
  1801. return NULL;
  1802. bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
  1803. header[bytes_read] = '\0';
  1804. scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
  1805. }
  1806. /* search past "coding[:=]" */
  1807. pos = header;
  1808. while (1)
  1809. {
  1810. if ((pos = strstr(pos, "coding")) == NULL)
  1811. return NULL;
  1812. pos += strlen ("coding");
  1813. if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
  1814. (*pos == ':' || *pos == '='))
  1815. {
  1816. pos ++;
  1817. break;
  1818. }
  1819. }
  1820. /* skip spaces */
  1821. while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
  1822. (*pos == ' ' || *pos == '\t'))
  1823. pos ++;
  1824. if (pos - header >= SCM_ENCODING_SEARCH_SIZE - ENCODING_NAME_MAX_SIZE)
  1825. /* We found the "coding:" string, but there is probably not enough
  1826. room to store an encoding name in its entirety, so ignore it.
  1827. This makes sure we do not end up returning a truncated encoding
  1828. name. */
  1829. return NULL;
  1830. /* grab the next token */
  1831. encoding_start = pos;
  1832. i = 0;
  1833. while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
  1834. && encoding_start + i - header < bytes_read
  1835. && is_encoding_char (encoding_start[i]))
  1836. i++;
  1837. encoding_length = i;
  1838. if (encoding_length == 0)
  1839. return NULL;
  1840. encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
  1841. /* push backwards to make sure we were in a comment */
  1842. in_comment = 0;
  1843. pos = encoding_start;
  1844. while (pos >= header)
  1845. {
  1846. if (*pos == ';')
  1847. {
  1848. in_comment = 1;
  1849. break;
  1850. }
  1851. else if (*pos == '\n' || pos == header)
  1852. {
  1853. /* This wasn't in a semicolon comment. Check for a
  1854. hash-bang comment. */
  1855. char *beg = strstr (header, "#!");
  1856. char *end = strstr (header, "!#");
  1857. if (beg < encoding_start && encoding_start + encoding_length <= end)
  1858. in_comment = 1;
  1859. break;
  1860. }
  1861. else
  1862. {
  1863. pos --;
  1864. continue;
  1865. }
  1866. }
  1867. if (!in_comment)
  1868. /* This wasn't in a comment */
  1869. return NULL;
  1870. return encoding;
  1871. }
  1872. SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
  1873. (SCM port),
  1874. "Scans the port for an Emacs-like character coding declaration\n"
  1875. "near the top of the contents of a port with random-accessible contents.\n"
  1876. "The coding declaration is of the form\n"
  1877. "@code{coding: XXXXX} and must appear in a scheme comment.\n"
  1878. "\n"
  1879. "Returns a string containing the character encoding of the file\n"
  1880. "if a declaration was found, or @code{#f} otherwise.\n")
  1881. #define FUNC_NAME s_scm_file_encoding
  1882. {
  1883. char *enc;
  1884. SCM s_enc;
  1885. SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
  1886. enc = scm_i_scan_for_encoding (port);
  1887. if (enc == NULL)
  1888. return SCM_BOOL_F;
  1889. else
  1890. {
  1891. s_enc = scm_string_upcase (scm_from_locale_string (enc));
  1892. return s_enc;
  1893. }
  1894. return SCM_BOOL_F;
  1895. }
  1896. #undef FUNC_NAME
  1897. /* Per-port read options.
  1898. We store per-port read options in the 'port-read-options' port
  1899. property, which is stored in the internal port structure. The value
  1900. stored is a single integer that contains a two-bit field for each
  1901. read option.
  1902. If a bit field contains READ_OPTION_INHERIT (3), that indicates that
  1903. the applicable value should be inherited from the corresponding
  1904. global read option. Otherwise, the bit field contains the value of
  1905. the read option. For boolean read options that have been set
  1906. per-port, the possible values are 0 or 1. If the 'keyword_style'
  1907. read option has been set per-port, its possible values are those in
  1908. 'enum t_keyword_style'. */
  1909. /* Key to read options in port properties. */
  1910. SCM_SYMBOL (sym_port_read_options, "port-read-options");
  1911. /* Offsets of bit fields for each per-port override */
  1912. #define READ_OPTION_COPY_SOURCE_P 0
  1913. #define READ_OPTION_RECORD_POSITIONS_P 2
  1914. #define READ_OPTION_CASE_INSENSITIVE_P 4
  1915. #define READ_OPTION_KEYWORD_STYLE 6
  1916. #define READ_OPTION_R6RS_ESCAPES_P 8
  1917. #define READ_OPTION_SQUARE_BRACKETS_P 10
  1918. #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
  1919. #define READ_OPTION_CURLY_INFIX_P 14
  1920. #define READ_OPTION_R7RS_SYMBOLS_P 16
  1921. /* The total width in bits of the per-port overrides */
  1922. #define READ_OPTIONS_NUM_BITS 18
  1923. #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
  1924. #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
  1925. #define READ_OPTION_MASK 3
  1926. #define READ_OPTION_INHERIT 3
  1927. static void
  1928. set_port_read_option (SCM port, int option, int new_value)
  1929. {
  1930. SCM scm_read_options;
  1931. unsigned int read_options;
  1932. new_value &= READ_OPTION_MASK;
  1933. scm_dynwind_begin (0);
  1934. scm_dynwind_lock_port (port);
  1935. scm_read_options = scm_i_port_property (port, sym_port_read_options);
  1936. if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
  1937. read_options = scm_to_uint (scm_read_options);
  1938. else
  1939. read_options = READ_OPTIONS_INHERIT_ALL;
  1940. read_options &= ~(READ_OPTION_MASK << option);
  1941. read_options |= new_value << option;
  1942. scm_read_options = scm_from_uint (read_options);
  1943. scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
  1944. scm_dynwind_end ();
  1945. }
  1946. /* Set OPTS and PORT's case-insensitivity according to VALUE. */
  1947. static void
  1948. set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
  1949. {
  1950. value = !!value;
  1951. opts->case_insensitive_p = value;
  1952. set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
  1953. }
  1954. /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
  1955. static void
  1956. set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
  1957. {
  1958. value = !!value;
  1959. opts->square_brackets_p = value;
  1960. set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
  1961. }
  1962. /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
  1963. static void
  1964. set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
  1965. {
  1966. value = !!value;
  1967. opts->curly_infix_p = value;
  1968. set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
  1969. }
  1970. /* Initialize OPTS based on PORT's read options and the global read
  1971. options. */
  1972. static void
  1973. init_read_options (SCM port, scm_t_read_opts *opts)
  1974. {
  1975. SCM val, scm_read_options;
  1976. unsigned int read_options, x;
  1977. scm_read_options = scm_i_port_property (port, sym_port_read_options);
  1978. if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
  1979. read_options = scm_to_uint (scm_read_options);
  1980. else
  1981. read_options = READ_OPTIONS_INHERIT_ALL;
  1982. x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
  1983. if (x == READ_OPTION_INHERIT)
  1984. {
  1985. val = SCM_PACK (SCM_KEYWORD_STYLE);
  1986. if (scm_is_eq (val, scm_keyword_prefix))
  1987. x = KEYWORD_STYLE_PREFIX;
  1988. else if (scm_is_eq (val, scm_keyword_postfix))
  1989. x = KEYWORD_STYLE_POSTFIX;
  1990. else
  1991. x = KEYWORD_STYLE_HASH_PREFIX;
  1992. }
  1993. opts->keyword_style = x;
  1994. #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
  1995. do \
  1996. { \
  1997. x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
  1998. if (x == READ_OPTION_INHERIT) \
  1999. x = !!SCM_ ## NAME; \
  2000. opts->name = x; \
  2001. } \
  2002. while (0)
  2003. RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
  2004. RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
  2005. RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
  2006. RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
  2007. RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
  2008. RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
  2009. RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P, curly_infix_p);
  2010. RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P, r7rs_symbols_p);
  2011. #undef RESOLVE_BOOLEAN_OPTION
  2012. opts->neoteric_p = 0;
  2013. }
  2014. void
  2015. scm_init_read ()
  2016. {
  2017. SCM read_hash_procs;
  2018. read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
  2019. scm_i_read_hash_procedures =
  2020. SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
  2021. scm_init_opts (scm_read_options, scm_read_opts);
  2022. #include "libguile/read.x"
  2023. }
  2024. /*
  2025. Local Variables:
  2026. c-file-style: "gnu"
  2027. End:
  2028. */