strings.c 47 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722
  1. /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009 Free Software Foundation, Inc.
  2. *
  3. * This library is free software; you can redistribute it and/or
  4. * modify it under the terms of the GNU Lesser General Public License
  5. * as published by the Free Software Foundation; either version 3 of
  6. * the License, or (at your option) any later version.
  7. *
  8. * This library is distributed in the hope that it will be useful, but
  9. * WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. * Lesser General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU Lesser General Public
  14. * License along with this library; if not, write to the Free Software
  15. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16. * 02110-1301 USA
  17. */
  18. #ifdef HAVE_CONFIG_H
  19. # include <config.h>
  20. #endif
  21. #include <string.h>
  22. #include <stdio.h>
  23. #include <ctype.h>
  24. #include <unistr.h>
  25. #include <uniconv.h>
  26. #include "libguile/_scm.h"
  27. #include "libguile/chars.h"
  28. #include "libguile/root.h"
  29. #include "libguile/strings.h"
  30. #include "libguile/deprecation.h"
  31. #include "libguile/validate.h"
  32. #include "libguile/dynwind.h"
  33. /* {Strings}
  34. */
  35. /* Stringbufs
  36. *
  37. * XXX - keeping an accurate refcount during GC seems to be quite
  38. * tricky, so we just keep score of whether a stringbuf might be
  39. * shared, not whether it definitely is.
  40. *
  41. * The scheme I (mvo) tried to keep an accurate reference count would
  42. * recount all strings that point to a stringbuf during the mark-phase
  43. * of the GC. This was done since one cannot access the stringbuf of
  44. * a string when that string is freed (in order to decrease the
  45. * reference count). The memory of the stringbuf might have been
  46. * reused already for something completely different.
  47. *
  48. * This recounted worked for a small number of threads beating on
  49. * cow-strings, but it failed randomly with more than 10 threads, say.
  50. * I couldn't figure out what went wrong, so I used the conservative
  51. * approach implemented below.
  52. *
  53. * A stringbuf needs to know its length, but only so that it can be
  54. * reported when the stringbuf is freed.
  55. *
  56. * There are 3 storage strategies for stringbufs: inline, outline, and
  57. * wide.
  58. *
  59. * Inline strings are small 8-bit strings stored within the double
  60. * cell itself. Outline strings are larger 8-bit strings with GC
  61. * allocated storage. Wide strings are 32-bit strings with allocated
  62. * storage.
  63. *
  64. * There was little value in making wide string inlineable, since
  65. * there is only room for three inlined 32-bit characters. Thus wide
  66. * stringbufs are never inlined.
  67. */
  68. #define STRINGBUF_F_SHARED 0x100
  69. #define STRINGBUF_F_INLINE 0x200
  70. #define STRINGBUF_F_WIDE 0x400 /* If true, strings have UCS-4
  71. encoding. Otherwise, strings
  72. are Latin-1. */
  73. #define STRINGBUF_TAG scm_tc7_stringbuf
  74. #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
  75. #define STRINGBUF_INLINE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_INLINE)
  76. #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
  77. #define STRINGBUF_OUTLINE_CHARS(buf) ((char *)SCM_CELL_WORD_1(buf))
  78. #define STRINGBUF_OUTLINE_LENGTH(buf) (SCM_CELL_WORD_2(buf))
  79. #define STRINGBUF_INLINE_CHARS(buf) ((char *)SCM_CELL_OBJECT_LOC(buf,1))
  80. #define STRINGBUF_INLINE_LENGTH(buf) (((size_t)SCM_CELL_WORD_0(buf))>>16)
  81. #define STRINGBUF_CHARS(buf) (STRINGBUF_INLINE (buf) \
  82. ? STRINGBUF_INLINE_CHARS (buf) \
  83. : STRINGBUF_OUTLINE_CHARS (buf))
  84. #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *)SCM_CELL_WORD_1(buf))
  85. #define STRINGBUF_LENGTH(buf) (STRINGBUF_INLINE (buf) \
  86. ? STRINGBUF_INLINE_LENGTH (buf) \
  87. : STRINGBUF_OUTLINE_LENGTH (buf))
  88. #define STRINGBUF_MAX_INLINE_LEN (3*sizeof(scm_t_bits))
  89. #define SET_STRINGBUF_SHARED(buf) \
  90. (SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED))
  91. #if SCM_STRING_LENGTH_HISTOGRAM
  92. static size_t lenhist[1001];
  93. #endif
  94. /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
  95. characters. */
  96. static SCM
  97. make_stringbuf (size_t len)
  98. {
  99. /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
  100. scm_i_symbol_chars, all stringbufs are null-terminated. Once
  101. SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
  102. has been changed for scm_i_symbol_chars, this null-termination
  103. can be dropped.
  104. */
  105. #if SCM_STRING_LENGTH_HISTOGRAM
  106. if (len < 1000)
  107. lenhist[len]++;
  108. else
  109. lenhist[1000]++;
  110. #endif
  111. if (len <= STRINGBUF_MAX_INLINE_LEN-1)
  112. {
  113. return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_INLINE | (len << 16),
  114. 0, 0, 0);
  115. }
  116. else
  117. {
  118. char *mem = scm_gc_malloc (len+1, "string");
  119. mem[len] = '\0';
  120. return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) mem,
  121. (scm_t_bits) len, (scm_t_bits) 0);
  122. }
  123. }
  124. /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
  125. characters. */
  126. static SCM
  127. make_wide_stringbuf (size_t len)
  128. {
  129. scm_t_wchar *mem;
  130. #if SCM_STRING_LENGTH_HISTOGRAM
  131. if (len < 1000)
  132. lenhist[len]++;
  133. else
  134. lenhist[1000]++;
  135. #endif
  136. mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
  137. mem[len] = 0;
  138. return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_WIDE, (scm_t_bits) mem,
  139. (scm_t_bits) len, (scm_t_bits) 0);
  140. }
  141. /* Return a new stringbuf whose underlying storage consists of the LEN+1
  142. octets pointed to by STR (the last octet is zero). */
  143. SCM
  144. scm_i_take_stringbufn (char *str, size_t len)
  145. {
  146. scm_gc_register_collectable_memory (str, len + 1, "stringbuf");
  147. return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) str,
  148. (scm_t_bits) len, (scm_t_bits) 0);
  149. }
  150. SCM
  151. scm_i_stringbuf_mark (SCM buf)
  152. {
  153. return SCM_BOOL_F;
  154. }
  155. void
  156. scm_i_stringbuf_free (SCM buf)
  157. {
  158. if (!STRINGBUF_INLINE (buf))
  159. {
  160. if (!STRINGBUF_WIDE (buf))
  161. scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
  162. STRINGBUF_OUTLINE_LENGTH (buf) + 1, "string");
  163. else
  164. scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
  165. sizeof (scm_t_wchar) * (STRINGBUF_OUTLINE_LENGTH (buf)
  166. + 1), "string");
  167. }
  168. }
  169. /* Convert a stringbuf containing 8-bit Latin-1-encoded characters to
  170. one containing 32-bit UCS-4-encoded characters. */
  171. static void
  172. widen_stringbuf (SCM buf)
  173. {
  174. size_t i, len;
  175. scm_t_wchar *mem;
  176. if (STRINGBUF_WIDE (buf))
  177. return;
  178. if (STRINGBUF_INLINE (buf))
  179. {
  180. len = STRINGBUF_INLINE_LENGTH (buf);
  181. mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
  182. for (i = 0; i < len; i++)
  183. mem[i] =
  184. (scm_t_wchar) (unsigned char) STRINGBUF_INLINE_CHARS (buf)[i];
  185. mem[len] = 0;
  186. SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) ^ STRINGBUF_F_INLINE);
  187. SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
  188. SCM_SET_CELL_WORD_1 (buf, mem);
  189. SCM_SET_CELL_WORD_2 (buf, len);
  190. }
  191. else
  192. {
  193. len = STRINGBUF_OUTLINE_LENGTH (buf);
  194. mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
  195. for (i = 0; i < len; i++)
  196. mem[i] =
  197. (scm_t_wchar) (unsigned char) STRINGBUF_OUTLINE_CHARS (buf)[i];
  198. mem[len] = 0;
  199. scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf), len + 1, "string");
  200. SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
  201. SCM_SET_CELL_WORD_1 (buf, mem);
  202. SCM_SET_CELL_WORD_2 (buf, len);
  203. }
  204. }
  205. scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
  206. /* Copy-on-write strings.
  207. */
  208. #define STRING_TAG scm_tc7_string
  209. #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
  210. #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
  211. #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
  212. #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
  213. #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
  214. #define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
  215. /* Read-only strings.
  216. */
  217. #define RO_STRING_TAG (scm_tc7_string + 0x200)
  218. #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
  219. /* Mutation-sharing substrings
  220. */
  221. #define SH_STRING_TAG (scm_tc7_string + 0x100)
  222. #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
  223. /* START and LENGTH as for STRINGs. */
  224. #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
  225. /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
  226. characters. CHARSP, if not NULL, will be set to location of the
  227. char array. */
  228. SCM
  229. scm_i_make_string (size_t len, char **charsp)
  230. {
  231. SCM buf = make_stringbuf (len);
  232. SCM res;
  233. if (charsp)
  234. *charsp = STRINGBUF_CHARS (buf);
  235. res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
  236. (scm_t_bits)0, (scm_t_bits) len);
  237. return res;
  238. }
  239. /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
  240. characters. CHARSP, if not NULL, will be set to location of the
  241. character array. */
  242. SCM
  243. scm_i_make_wide_string (size_t len, scm_t_wchar **charsp)
  244. {
  245. SCM buf = make_wide_stringbuf (len);
  246. SCM res;
  247. if (charsp)
  248. *charsp = STRINGBUF_WIDE_CHARS (buf);
  249. res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
  250. (scm_t_bits) 0, (scm_t_bits) len);
  251. return res;
  252. }
  253. static void
  254. validate_substring_args (SCM str, size_t start, size_t end)
  255. {
  256. if (!IS_STRING (str))
  257. scm_wrong_type_arg_msg (NULL, 0, str, "string");
  258. if (start > STRING_LENGTH (str))
  259. scm_out_of_range (NULL, scm_from_size_t (start));
  260. if (end > STRING_LENGTH (str) || end < start)
  261. scm_out_of_range (NULL, scm_from_size_t (end));
  262. }
  263. static inline void
  264. get_str_buf_start (SCM *str, SCM *buf, size_t *start)
  265. {
  266. *start = STRING_START (*str);
  267. if (IS_SH_STRING (*str))
  268. {
  269. *str = SH_STRING_STRING (*str);
  270. *start += STRING_START (*str);
  271. }
  272. *buf = STRING_STRINGBUF (*str);
  273. }
  274. SCM
  275. scm_i_substring (SCM str, size_t start, size_t end)
  276. {
  277. SCM buf;
  278. size_t str_start;
  279. get_str_buf_start (&str, &buf, &str_start);
  280. scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
  281. SET_STRINGBUF_SHARED (buf);
  282. scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
  283. return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
  284. (scm_t_bits)str_start + start,
  285. (scm_t_bits) end - start);
  286. }
  287. SCM
  288. scm_i_substring_read_only (SCM str, size_t start, size_t end)
  289. {
  290. SCM buf;
  291. size_t str_start;
  292. get_str_buf_start (&str, &buf, &str_start);
  293. scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
  294. SET_STRINGBUF_SHARED (buf);
  295. scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
  296. return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
  297. (scm_t_bits)str_start + start,
  298. (scm_t_bits) end - start);
  299. }
  300. SCM
  301. scm_i_substring_copy (SCM str, size_t start, size_t end)
  302. {
  303. size_t len = end - start;
  304. SCM buf, my_buf;
  305. size_t str_start;
  306. get_str_buf_start (&str, &buf, &str_start);
  307. if (scm_i_is_narrow_string (str))
  308. {
  309. my_buf = make_stringbuf (len);
  310. memcpy (STRINGBUF_CHARS (my_buf),
  311. STRINGBUF_CHARS (buf) + str_start + start, len);
  312. }
  313. else
  314. {
  315. my_buf = make_wide_stringbuf (len);
  316. u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
  317. (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
  318. + start), len);
  319. /* Even though this string is wide, the substring may be narrow.
  320. Consider adding code to narrow the string. */
  321. }
  322. scm_remember_upto_here_1 (buf);
  323. return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
  324. (scm_t_bits) 0, (scm_t_bits) len);
  325. }
  326. SCM
  327. scm_i_substring_shared (SCM str, size_t start, size_t end)
  328. {
  329. if (start == 0 && end == STRING_LENGTH (str))
  330. return str;
  331. else
  332. {
  333. size_t len = end - start;
  334. if (IS_SH_STRING (str))
  335. {
  336. start += STRING_START (str);
  337. str = SH_STRING_STRING (str);
  338. }
  339. return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
  340. (scm_t_bits)start, (scm_t_bits) len);
  341. }
  342. }
  343. SCM
  344. scm_c_substring (SCM str, size_t start, size_t end)
  345. {
  346. validate_substring_args (str, start, end);
  347. return scm_i_substring (str, start, end);
  348. }
  349. SCM
  350. scm_c_substring_read_only (SCM str, size_t start, size_t end)
  351. {
  352. validate_substring_args (str, start, end);
  353. return scm_i_substring_read_only (str, start, end);
  354. }
  355. SCM
  356. scm_c_substring_copy (SCM str, size_t start, size_t end)
  357. {
  358. validate_substring_args (str, start, end);
  359. return scm_i_substring_copy (str, start, end);
  360. }
  361. SCM
  362. scm_c_substring_shared (SCM str, size_t start, size_t end)
  363. {
  364. validate_substring_args (str, start, end);
  365. return scm_i_substring_shared (str, start, end);
  366. }
  367. SCM
  368. scm_i_string_mark (SCM str)
  369. {
  370. if (IS_SH_STRING (str))
  371. return SH_STRING_STRING (str);
  372. else
  373. return STRING_STRINGBUF (str);
  374. }
  375. void
  376. scm_i_string_free (SCM str)
  377. {
  378. }
  379. /* Internal accessors
  380. */
  381. /* Returns the number of characters in STR. This may be different
  382. than the memory size of the string storage. */
  383. size_t
  384. scm_i_string_length (SCM str)
  385. {
  386. return STRING_LENGTH (str);
  387. }
  388. /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
  389. encoding. False if it is 'wide', having a 32-bit UCS-4
  390. encoding. */
  391. int
  392. scm_i_is_narrow_string (SCM str)
  393. {
  394. return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
  395. }
  396. /* Returns a pointer to the 8-bit Latin-1 encoded character array of
  397. STR. */
  398. const char *
  399. scm_i_string_chars (SCM str)
  400. {
  401. SCM buf;
  402. size_t start;
  403. get_str_buf_start (&str, &buf, &start);
  404. if (scm_i_is_narrow_string (str))
  405. return STRINGBUF_CHARS (buf) + start;
  406. else
  407. scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
  408. scm_list_1 (str));
  409. return NULL;
  410. }
  411. /* Returns a pointer to the 32-bit UCS-4 encoded character array of
  412. STR. */
  413. const scm_t_wchar *
  414. scm_i_string_wide_chars (SCM str)
  415. {
  416. SCM buf;
  417. size_t start;
  418. get_str_buf_start (&str, &buf, &start);
  419. if (!scm_i_is_narrow_string (str))
  420. return STRINGBUF_WIDE_CHARS (buf) + start;
  421. else
  422. scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
  423. scm_list_1 (str));
  424. }
  425. /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
  426. a new string buffer, so that it can be modified without modifying
  427. other strings. Also, lock the string mutex. Later, one must call
  428. scm_i_string_stop_writing to unlock the mutex. */
  429. SCM
  430. scm_i_string_start_writing (SCM orig_str)
  431. {
  432. SCM buf, str = orig_str;
  433. size_t start;
  434. get_str_buf_start (&str, &buf, &start);
  435. if (IS_RO_STRING (str))
  436. scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
  437. scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
  438. if (STRINGBUF_SHARED (buf))
  439. {
  440. /* Clone the stringbuf. */
  441. size_t len = STRING_LENGTH (str);
  442. SCM new_buf;
  443. scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
  444. if (scm_i_is_narrow_string (str))
  445. {
  446. new_buf = make_stringbuf (len);
  447. memcpy (STRINGBUF_CHARS (new_buf),
  448. STRINGBUF_CHARS (buf) + STRING_START (str), len);
  449. }
  450. else
  451. {
  452. new_buf = make_wide_stringbuf (len);
  453. u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
  454. (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
  455. + STRING_START (str)), len);
  456. }
  457. scm_i_thread_put_to_sleep ();
  458. SET_STRING_STRINGBUF (str, new_buf);
  459. start -= STRING_START (str);
  460. SET_STRING_START (str, 0);
  461. scm_i_thread_wake_up ();
  462. buf = new_buf;
  463. scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
  464. }
  465. return orig_str;
  466. }
  467. /* Return a pointer to the 8-bit Latin-1 chars of a string. */
  468. char *
  469. scm_i_string_writable_chars (SCM str)
  470. {
  471. SCM buf;
  472. size_t start;
  473. get_str_buf_start (&str, &buf, &start);
  474. if (scm_i_is_narrow_string (str))
  475. return STRINGBUF_CHARS (buf) + start;
  476. else
  477. scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
  478. scm_list_1 (str));
  479. return NULL;
  480. }
  481. /* Return a pointer to the UCS-4 codepoints of a string. */
  482. static scm_t_wchar *
  483. scm_i_string_writable_wide_chars (SCM str)
  484. {
  485. SCM buf;
  486. size_t start;
  487. get_str_buf_start (&str, &buf, &start);
  488. if (!scm_i_is_narrow_string (str))
  489. return STRINGBUF_WIDE_CHARS (buf) + start;
  490. else
  491. scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
  492. scm_list_1 (str));
  493. }
  494. /* Unlock the string mutex that was locked when
  495. scm_i_string_start_writing was called. */
  496. void
  497. scm_i_string_stop_writing (void)
  498. {
  499. scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
  500. }
  501. /* Return the Xth character of STR as a UCS-4 codepoint. */
  502. scm_t_wchar
  503. scm_i_string_ref (SCM str, size_t x)
  504. {
  505. if (scm_i_is_narrow_string (str))
  506. return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
  507. else
  508. return scm_i_string_wide_chars (str)[x];
  509. }
  510. /* Set the Pth character of STR to UCS-4 codepoint CHR. */
  511. void
  512. scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
  513. {
  514. if (chr > 0xFF && scm_i_is_narrow_string (str))
  515. widen_stringbuf (STRING_STRINGBUF (str));
  516. if (scm_i_is_narrow_string (str))
  517. {
  518. char *dst = scm_i_string_writable_chars (str);
  519. dst[p] = (char) (unsigned char) chr;
  520. }
  521. else
  522. {
  523. scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
  524. dst[p] = chr;
  525. }
  526. }
  527. /* Symbols.
  528. Basic symbol creation and accessing is done here, the rest is in
  529. symbols.[hc]. This has been done to keep stringbufs and the
  530. internals of strings and string-like objects confined to this file.
  531. */
  532. #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
  533. SCM
  534. scm_i_make_symbol (SCM name, scm_t_bits flags,
  535. unsigned long hash, SCM props)
  536. {
  537. SCM buf;
  538. size_t start = STRING_START (name);
  539. size_t length = STRING_LENGTH (name);
  540. if (IS_SH_STRING (name))
  541. {
  542. name = SH_STRING_STRING (name);
  543. start += STRING_START (name);
  544. }
  545. buf = SYMBOL_STRINGBUF (name);
  546. if (start == 0 && length == STRINGBUF_LENGTH (buf))
  547. {
  548. /* reuse buf. */
  549. scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
  550. SET_STRINGBUF_SHARED (buf);
  551. scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
  552. }
  553. else
  554. {
  555. /* make new buf. */
  556. if (scm_i_is_narrow_string (name))
  557. {
  558. SCM new_buf = make_stringbuf (length);
  559. memcpy (STRINGBUF_CHARS (new_buf),
  560. STRINGBUF_CHARS (buf) + start, length);
  561. buf = new_buf;
  562. }
  563. else
  564. {
  565. SCM new_buf = make_wide_stringbuf (length);
  566. u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
  567. (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
  568. length);
  569. buf = new_buf;
  570. }
  571. }
  572. return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
  573. (scm_t_bits) hash, SCM_UNPACK (props));
  574. }
  575. SCM
  576. scm_i_c_make_symbol (const char *name, size_t len,
  577. scm_t_bits flags, unsigned long hash, SCM props)
  578. {
  579. SCM buf = make_stringbuf (len);
  580. memcpy (STRINGBUF_CHARS (buf), name, len);
  581. return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
  582. (scm_t_bits) hash, SCM_UNPACK (props));
  583. }
  584. /* Return a new symbol that uses the LEN bytes pointed to by NAME as its
  585. underlying storage. */
  586. SCM
  587. scm_i_c_take_symbol (char *name, size_t len,
  588. scm_t_bits flags, unsigned long hash, SCM props)
  589. {
  590. SCM buf = scm_i_take_stringbufn (name, len);
  591. return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
  592. (scm_t_bits) hash, SCM_UNPACK (props));
  593. }
  594. /* Returns the number of characters in SYM. This may be different
  595. from the memory size of SYM. */
  596. size_t
  597. scm_i_symbol_length (SCM sym)
  598. {
  599. return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
  600. }
  601. size_t
  602. scm_c_symbol_length (SCM sym)
  603. #define FUNC_NAME "scm_c_symbol_length"
  604. {
  605. SCM_VALIDATE_SYMBOL (1, sym);
  606. return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
  607. }
  608. #undef FUNC_NAME
  609. /* True if the name of SYM is stored as a Latin-1 encoded string.
  610. False if it is stored as a 32-bit UCS-4-encoded string. */
  611. int
  612. scm_i_is_narrow_symbol (SCM sym)
  613. {
  614. SCM buf;
  615. buf = SYMBOL_STRINGBUF (sym);
  616. return !STRINGBUF_WIDE (buf);
  617. }
  618. /* Returns a pointer to the 8-bit Latin-1 encoded character array that
  619. contains the name of SYM. */
  620. const char *
  621. scm_i_symbol_chars (SCM sym)
  622. {
  623. SCM buf;
  624. buf = SYMBOL_STRINGBUF (sym);
  625. if (!STRINGBUF_WIDE (buf))
  626. return STRINGBUF_CHARS (buf);
  627. else
  628. scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
  629. scm_list_1 (sym));
  630. }
  631. /* Return a pointer to the 32-bit UCS-4-encoded character array of a
  632. symbol's name. */
  633. const scm_t_wchar *
  634. scm_i_symbol_wide_chars (SCM sym)
  635. {
  636. SCM buf;
  637. buf = SYMBOL_STRINGBUF (sym);
  638. if (STRINGBUF_WIDE (buf))
  639. return STRINGBUF_WIDE_CHARS (buf);
  640. else
  641. scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
  642. scm_list_1 (sym));
  643. }
  644. SCM
  645. scm_i_symbol_mark (SCM sym)
  646. {
  647. scm_gc_mark (SYMBOL_STRINGBUF (sym));
  648. return SCM_CELL_OBJECT_3 (sym);
  649. }
  650. void
  651. scm_i_symbol_free (SCM sym)
  652. {
  653. }
  654. SCM
  655. scm_i_symbol_substring (SCM sym, size_t start, size_t end)
  656. {
  657. SCM buf = SYMBOL_STRINGBUF (sym);
  658. scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
  659. SET_STRINGBUF_SHARED (buf);
  660. scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
  661. return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
  662. (scm_t_bits)start, (scm_t_bits) end - start);
  663. }
  664. /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
  665. scm_t_wchar
  666. scm_i_symbol_ref (SCM sym, size_t x)
  667. {
  668. if (scm_i_is_narrow_symbol (sym))
  669. return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
  670. else
  671. return scm_i_symbol_wide_chars (sym)[x];
  672. }
  673. /* Debugging
  674. */
  675. SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
  676. "Returns an association list containing debugging information\n"
  677. "for @var{str}. The association list has the following entries."
  678. "@table @code\n"
  679. "@item string\n"
  680. "The string itself.\n"
  681. "@item start\n"
  682. "The start index of the string into its stringbuf\n"
  683. "@item length\n"
  684. "The length of the string\n"
  685. "@item shared\n"
  686. "If this string is a substring, it returns its parent string.\n"
  687. "Otherwise, it returns @code{#f}\n"
  688. "@item read-only\n"
  689. "@code{#t} if the string is read-only\n"
  690. "@item stringbuf-chars\n"
  691. "A new string containing this string's stringbuf's characters\n"
  692. "@item stringbuf-length\n"
  693. "The number of characters in this stringbuf\n"
  694. "@item stringbuf-shared\n"
  695. "@code{#t} if this stringbuf is shared\n"
  696. "@item stringbuf-inline\n"
  697. "@code{#t} if this stringbuf's characters are stored in the\n"
  698. "cell itself, or @code{#f} if they were allocated in memory\n"
  699. "@item stringbuf-wide\n"
  700. "@code{#t} if this stringbuf's characters are stored in a\n"
  701. "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
  702. "buffer\n"
  703. "@end table")
  704. #define FUNC_NAME s_scm_sys_string_dump
  705. {
  706. SCM e1, e2, e3, e4, e5, e6, e7, e8, e9, e10;
  707. SCM buf;
  708. SCM_VALIDATE_STRING (1, str);
  709. /* String info */
  710. e1 = scm_cons (scm_from_locale_symbol ("string"),
  711. str);
  712. e2 = scm_cons (scm_from_locale_symbol ("start"),
  713. scm_from_size_t (STRING_START (str)));
  714. e3 = scm_cons (scm_from_locale_symbol ("length"),
  715. scm_from_size_t (STRING_LENGTH (str)));
  716. if (IS_SH_STRING (str))
  717. {
  718. e4 = scm_cons (scm_from_locale_symbol ("shared"),
  719. SH_STRING_STRING (str));
  720. buf = STRING_STRINGBUF (SH_STRING_STRING (str));
  721. }
  722. else
  723. {
  724. e4 = scm_cons (scm_from_locale_symbol ("shared"),
  725. SCM_BOOL_F);
  726. buf = STRING_STRINGBUF (str);
  727. }
  728. if (IS_RO_STRING (str))
  729. e5 = scm_cons (scm_from_locale_symbol ("read-only"),
  730. SCM_BOOL_T);
  731. else
  732. e5 = scm_cons (scm_from_locale_symbol ("read-only"),
  733. SCM_BOOL_F);
  734. /* Stringbuf info */
  735. if (!STRINGBUF_WIDE (buf))
  736. {
  737. size_t len = STRINGBUF_LENGTH (buf);
  738. char *cbuf;
  739. SCM sbc = scm_i_make_string (len, &cbuf);
  740. memcpy (cbuf, STRINGBUF_CHARS (buf), len);
  741. e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
  742. sbc);
  743. }
  744. else
  745. {
  746. size_t len = STRINGBUF_LENGTH (buf);
  747. scm_t_wchar *cbuf;
  748. SCM sbc = scm_i_make_wide_string (len, &cbuf);
  749. u32_cpy ((scm_t_uint32 *) cbuf,
  750. (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
  751. e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
  752. sbc);
  753. }
  754. e7 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
  755. scm_from_size_t (STRINGBUF_LENGTH (buf)));
  756. if (STRINGBUF_SHARED (buf))
  757. e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
  758. SCM_BOOL_T);
  759. else
  760. e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
  761. SCM_BOOL_F);
  762. if (STRINGBUF_INLINE (buf))
  763. e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
  764. SCM_BOOL_T);
  765. else
  766. e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
  767. SCM_BOOL_F);
  768. if (STRINGBUF_WIDE (buf))
  769. e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
  770. SCM_BOOL_T);
  771. else
  772. e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
  773. SCM_BOOL_F);
  774. return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, SCM_UNDEFINED);
  775. }
  776. #undef FUNC_NAME
  777. SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
  778. "Returns an association list containing debugging information\n"
  779. "for @var{sym}. The association list has the following entries."
  780. "@table @code\n"
  781. "@item symbol\n"
  782. "The symbol itself\n"
  783. "@item hash\n"
  784. "Its hash value\n"
  785. "@item interned\n"
  786. "@code{#t} if it is an interned symbol\n"
  787. "@item stringbuf-chars\n"
  788. "A new string containing this symbols's stringbuf's characters\n"
  789. "@item stringbuf-length\n"
  790. "The number of characters in this stringbuf\n"
  791. "@item stringbuf-shared\n"
  792. "@code{#t} if this stringbuf is shared\n"
  793. "@item stringbuf-inline\n"
  794. "@code{#t} if this stringbuf's characters are stored in the\n"
  795. "cell itself, or @code{#f} if they were allocated in memory\n"
  796. "@item stringbuf-wide\n"
  797. "@code{#t} if this stringbuf's characters are stored in a\n"
  798. "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
  799. "buffer\n"
  800. "@end table")
  801. #define FUNC_NAME s_scm_sys_symbol_dump
  802. {
  803. SCM e1, e2, e3, e4, e5, e6, e7, e8;
  804. SCM buf;
  805. SCM_VALIDATE_SYMBOL (1, sym);
  806. e1 = scm_cons (scm_from_locale_symbol ("symbol"),
  807. sym);
  808. e2 = scm_cons (scm_from_locale_symbol ("hash"),
  809. scm_from_ulong (scm_i_symbol_hash (sym)));
  810. e3 = scm_cons (scm_from_locale_symbol ("interned"),
  811. scm_symbol_interned_p (sym));
  812. buf = SYMBOL_STRINGBUF (sym);
  813. /* Stringbuf info */
  814. if (!STRINGBUF_WIDE (buf))
  815. {
  816. size_t len = STRINGBUF_LENGTH (buf);
  817. char *cbuf;
  818. SCM sbc = scm_i_make_string (len, &cbuf);
  819. memcpy (cbuf, STRINGBUF_CHARS (buf), len);
  820. e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
  821. sbc);
  822. }
  823. else
  824. {
  825. size_t len = STRINGBUF_LENGTH (buf);
  826. scm_t_wchar *cbuf;
  827. SCM sbc = scm_i_make_wide_string (len, &cbuf);
  828. u32_cpy ((scm_t_uint32 *) cbuf,
  829. (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
  830. e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
  831. sbc);
  832. }
  833. e5 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
  834. scm_from_size_t (STRINGBUF_LENGTH (buf)));
  835. if (STRINGBUF_SHARED (buf))
  836. e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
  837. SCM_BOOL_T);
  838. else
  839. e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
  840. SCM_BOOL_F);
  841. if (STRINGBUF_INLINE (buf))
  842. e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
  843. SCM_BOOL_T);
  844. else
  845. e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
  846. SCM_BOOL_F);
  847. if (STRINGBUF_WIDE (buf))
  848. e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
  849. SCM_BOOL_T);
  850. else
  851. e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
  852. SCM_BOOL_F);
  853. return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, SCM_UNDEFINED);
  854. }
  855. #undef FUNC_NAME
  856. #if SCM_STRING_LENGTH_HISTOGRAM
  857. SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
  858. #define FUNC_NAME s_scm_sys_stringbuf_hist
  859. {
  860. int i;
  861. for (i = 0; i < 1000; i++)
  862. if (lenhist[i])
  863. fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
  864. fprintf (stderr, ">999: %u\n", lenhist[1000]);
  865. return SCM_UNSPECIFIED;
  866. }
  867. #undef FUNC_NAME
  868. #endif
  869. SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
  870. (SCM obj),
  871. "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
  872. #define FUNC_NAME s_scm_string_p
  873. {
  874. return scm_from_bool (IS_STRING (obj));
  875. }
  876. #undef FUNC_NAME
  877. SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
  878. SCM_DEFINE (scm_string, "string", 0, 0, 1,
  879. (SCM chrs),
  880. "@deffnx {Scheme Procedure} list->string chrs\n"
  881. "Return a newly allocated string composed of the arguments,\n"
  882. "@var{chrs}.")
  883. #define FUNC_NAME s_scm_string
  884. {
  885. SCM result;
  886. SCM rest;
  887. size_t len;
  888. size_t p = 0;
  889. long i;
  890. /* Verify that this is a list of chars. */
  891. i = scm_ilength (chrs);
  892. SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
  893. len = (size_t) i;
  894. rest = chrs;
  895. while (len > 0 && scm_is_pair (rest))
  896. {
  897. SCM elt = SCM_CAR (rest);
  898. SCM_VALIDATE_CHAR (SCM_ARGn, elt);
  899. rest = SCM_CDR (rest);
  900. len--;
  901. scm_remember_upto_here_1 (elt);
  902. }
  903. /* Construct a string containing this list of chars. */
  904. len = (size_t) i;
  905. rest = chrs;
  906. result = scm_i_make_string (len, NULL);
  907. result = scm_i_string_start_writing (result);
  908. while (len > 0 && scm_is_pair (rest))
  909. {
  910. SCM elt = SCM_CAR (rest);
  911. scm_i_string_set_x (result, p, SCM_CHAR (elt));
  912. p++;
  913. rest = SCM_CDR (rest);
  914. len--;
  915. scm_remember_upto_here_1 (elt);
  916. }
  917. scm_i_string_stop_writing ();
  918. if (len > 0)
  919. scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
  920. if (!scm_is_null (rest))
  921. scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
  922. return result;
  923. }
  924. #undef FUNC_NAME
  925. SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
  926. (SCM k, SCM chr),
  927. "Return a newly allocated string of\n"
  928. "length @var{k}. If @var{chr} is given, then all elements of\n"
  929. "the string are initialized to @var{chr}, otherwise the contents\n"
  930. "of the @var{string} are unspecified.")
  931. #define FUNC_NAME s_scm_make_string
  932. {
  933. return scm_c_make_string (scm_to_size_t (k), chr);
  934. }
  935. #undef FUNC_NAME
  936. SCM
  937. scm_c_make_string (size_t len, SCM chr)
  938. #define FUNC_NAME NULL
  939. {
  940. size_t p;
  941. SCM res = scm_i_make_string (len, NULL);
  942. if (!SCM_UNBNDP (chr))
  943. {
  944. SCM_VALIDATE_CHAR (0, chr);
  945. res = scm_i_string_start_writing (res);
  946. for (p = 0; p < len; p++)
  947. scm_i_string_set_x (res, p, SCM_CHAR (chr));
  948. scm_i_string_stop_writing ();
  949. }
  950. return res;
  951. }
  952. #undef FUNC_NAME
  953. SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
  954. (SCM string),
  955. "Return the number of characters in @var{string}.")
  956. #define FUNC_NAME s_scm_string_length
  957. {
  958. SCM_VALIDATE_STRING (1, string);
  959. return scm_from_size_t (STRING_LENGTH (string));
  960. }
  961. #undef FUNC_NAME
  962. SCM_DEFINE (scm_string_width, "string-width", 1, 0, 0,
  963. (SCM string),
  964. "Return the bytes used to represent a character in @var{string}."
  965. "This will return 1 or 4.")
  966. #define FUNC_NAME s_scm_string_width
  967. {
  968. SCM_VALIDATE_STRING (1, string);
  969. if (!scm_i_is_narrow_string (string))
  970. return scm_from_int (4);
  971. return scm_from_int (1);
  972. }
  973. #undef FUNC_NAME
  974. size_t
  975. scm_c_string_length (SCM string)
  976. {
  977. if (!IS_STRING (string))
  978. scm_wrong_type_arg_msg (NULL, 0, string, "string");
  979. return STRING_LENGTH (string);
  980. }
  981. SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
  982. (SCM str, SCM k),
  983. "Return character @var{k} of @var{str} using zero-origin\n"
  984. "indexing. @var{k} must be a valid index of @var{str}.")
  985. #define FUNC_NAME s_scm_string_ref
  986. {
  987. size_t len;
  988. unsigned long idx;
  989. SCM_VALIDATE_STRING (1, str);
  990. len = scm_i_string_length (str);
  991. if (SCM_LIKELY (len > 0))
  992. idx = scm_to_unsigned_integer (k, 0, len - 1);
  993. else
  994. scm_out_of_range (NULL, k);
  995. if (scm_i_is_narrow_string (str))
  996. return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
  997. else
  998. return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
  999. }
  1000. #undef FUNC_NAME
  1001. SCM
  1002. scm_c_string_ref (SCM str, size_t p)
  1003. {
  1004. if (p >= scm_i_string_length (str))
  1005. scm_out_of_range (NULL, scm_from_size_t (p));
  1006. if (scm_i_is_narrow_string (str))
  1007. return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
  1008. else
  1009. return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
  1010. }
  1011. SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
  1012. (SCM str, SCM k, SCM chr),
  1013. "Store @var{chr} in element @var{k} of @var{str} and return\n"
  1014. "an unspecified value. @var{k} must be a valid index of\n"
  1015. "@var{str}.")
  1016. #define FUNC_NAME s_scm_string_set_x
  1017. {
  1018. size_t len;
  1019. unsigned long idx;
  1020. SCM_VALIDATE_STRING (1, str);
  1021. len = scm_i_string_length (str);
  1022. if (SCM_LIKELY (len > 0))
  1023. idx = scm_to_unsigned_integer (k, 0, len - 1);
  1024. else
  1025. scm_out_of_range (NULL, k);
  1026. SCM_VALIDATE_CHAR (3, chr);
  1027. str = scm_i_string_start_writing (str);
  1028. scm_i_string_set_x (str, idx, SCM_CHAR (chr));
  1029. scm_i_string_stop_writing ();
  1030. return SCM_UNSPECIFIED;
  1031. }
  1032. #undef FUNC_NAME
  1033. void
  1034. scm_c_string_set_x (SCM str, size_t p, SCM chr)
  1035. {
  1036. if (p >= scm_i_string_length (str))
  1037. scm_out_of_range (NULL, scm_from_size_t (p));
  1038. str = scm_i_string_start_writing (str);
  1039. scm_i_string_set_x (str, p, SCM_CHAR (chr));
  1040. scm_i_string_stop_writing ();
  1041. }
  1042. SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
  1043. (SCM str, SCM start, SCM end),
  1044. "Return a newly allocated string formed from the characters\n"
  1045. "of @var{str} beginning with index @var{start} (inclusive) and\n"
  1046. "ending with index @var{end} (exclusive).\n"
  1047. "@var{str} must be a string, @var{start} and @var{end} must be\n"
  1048. "exact integers satisfying:\n\n"
  1049. "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
  1050. #define FUNC_NAME s_scm_substring
  1051. {
  1052. size_t len, from, to;
  1053. SCM_VALIDATE_STRING (1, str);
  1054. len = scm_i_string_length (str);
  1055. from = scm_to_unsigned_integer (start, 0, len);
  1056. if (SCM_UNBNDP (end))
  1057. to = len;
  1058. else
  1059. to = scm_to_unsigned_integer (end, from, len);
  1060. return scm_i_substring (str, from, to);
  1061. }
  1062. #undef FUNC_NAME
  1063. SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
  1064. (SCM str, SCM start, SCM end),
  1065. "Return a newly allocated string formed from the characters\n"
  1066. "of @var{str} beginning with index @var{start} (inclusive) and\n"
  1067. "ending with index @var{end} (exclusive).\n"
  1068. "@var{str} must be a string, @var{start} and @var{end} must be\n"
  1069. "exact integers satisfying:\n"
  1070. "\n"
  1071. "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
  1072. "\n"
  1073. "The returned string is read-only.\n")
  1074. #define FUNC_NAME s_scm_substring_read_only
  1075. {
  1076. size_t len, from, to;
  1077. SCM_VALIDATE_STRING (1, str);
  1078. len = scm_i_string_length (str);
  1079. from = scm_to_unsigned_integer (start, 0, len);
  1080. if (SCM_UNBNDP (end))
  1081. to = len;
  1082. else
  1083. to = scm_to_unsigned_integer (end, from, len);
  1084. return scm_i_substring_read_only (str, from, to);
  1085. }
  1086. #undef FUNC_NAME
  1087. SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
  1088. (SCM str, SCM start, SCM end),
  1089. "Return a newly allocated string formed from the characters\n"
  1090. "of @var{str} beginning with index @var{start} (inclusive) and\n"
  1091. "ending with index @var{end} (exclusive).\n"
  1092. "@var{str} must be a string, @var{start} and @var{end} must be\n"
  1093. "exact integers satisfying:\n\n"
  1094. "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
  1095. #define FUNC_NAME s_scm_substring_copy
  1096. {
  1097. /* For the Scheme version, START is mandatory, but for the C
  1098. version, it is optional. See scm_string_copy in srfi-13.c for a
  1099. rationale.
  1100. */
  1101. size_t from, to;
  1102. SCM_VALIDATE_STRING (1, str);
  1103. scm_i_get_substring_spec (scm_i_string_length (str),
  1104. start, &from, end, &to);
  1105. return scm_i_substring_copy (str, from, to);
  1106. }
  1107. #undef FUNC_NAME
  1108. SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
  1109. (SCM str, SCM start, SCM end),
  1110. "Return string that indirectly refers to the characters\n"
  1111. "of @var{str} beginning with index @var{start} (inclusive) and\n"
  1112. "ending with index @var{end} (exclusive).\n"
  1113. "@var{str} must be a string, @var{start} and @var{end} must be\n"
  1114. "exact integers satisfying:\n\n"
  1115. "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
  1116. #define FUNC_NAME s_scm_substring_shared
  1117. {
  1118. size_t len, from, to;
  1119. SCM_VALIDATE_STRING (1, str);
  1120. len = scm_i_string_length (str);
  1121. from = scm_to_unsigned_integer (start, 0, len);
  1122. if (SCM_UNBNDP (end))
  1123. to = len;
  1124. else
  1125. to = scm_to_unsigned_integer (end, from, len);
  1126. return scm_i_substring_shared (str, from, to);
  1127. }
  1128. #undef FUNC_NAME
  1129. SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
  1130. (SCM args),
  1131. "Return a newly allocated string whose characters form the\n"
  1132. "concatenation of the given strings, @var{args}.")
  1133. #define FUNC_NAME s_scm_string_append
  1134. {
  1135. SCM res;
  1136. size_t len = 0;
  1137. int wide = 0;
  1138. SCM l, s;
  1139. size_t i;
  1140. union
  1141. {
  1142. char *narrow;
  1143. scm_t_wchar *wide;
  1144. } data;
  1145. SCM_VALIDATE_REST_ARGUMENT (args);
  1146. for (l = args; !scm_is_null (l); l = SCM_CDR (l))
  1147. {
  1148. s = SCM_CAR (l);
  1149. SCM_VALIDATE_STRING (SCM_ARGn, s);
  1150. len += scm_i_string_length (s);
  1151. if (!scm_i_is_narrow_string (s))
  1152. wide = 1;
  1153. }
  1154. data.narrow = NULL;
  1155. if (!wide)
  1156. res = scm_i_make_string (len, &data.narrow);
  1157. else
  1158. res = scm_i_make_wide_string (len, &data.wide);
  1159. for (l = args; !scm_is_null (l); l = SCM_CDR (l))
  1160. {
  1161. size_t len;
  1162. s = SCM_CAR (l);
  1163. SCM_VALIDATE_STRING (SCM_ARGn, s);
  1164. len = scm_i_string_length (s);
  1165. if (!wide)
  1166. {
  1167. memcpy (data.narrow, scm_i_string_chars (s), len);
  1168. data.narrow += len;
  1169. }
  1170. else
  1171. {
  1172. if (scm_i_is_narrow_string (s))
  1173. {
  1174. for (i = 0; i < scm_i_string_length (s); i++)
  1175. data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
  1176. }
  1177. else
  1178. u32_cpy ((scm_t_uint32 *) data.wide,
  1179. (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
  1180. data.wide += len;
  1181. }
  1182. scm_remember_upto_here_1 (s);
  1183. }
  1184. return res;
  1185. }
  1186. #undef FUNC_NAME
  1187. int
  1188. scm_is_string (SCM obj)
  1189. {
  1190. return IS_STRING (obj);
  1191. }
  1192. SCM
  1193. scm_from_locale_stringn (const char *str, size_t len)
  1194. {
  1195. SCM res;
  1196. char *dst;
  1197. if (len == (size_t) -1)
  1198. len = strlen (str);
  1199. if (len == 0)
  1200. return scm_nullstr;
  1201. res = scm_i_make_string (len, &dst);
  1202. memcpy (dst, str, len);
  1203. return res;
  1204. }
  1205. SCM
  1206. scm_from_locale_string (const char *str)
  1207. {
  1208. if (str == NULL)
  1209. return scm_nullstr;
  1210. return scm_from_locale_stringn (str, -1);
  1211. }
  1212. /* Create a new scheme string from the C string STR. The memory of
  1213. STR may be used directly as storage for the new string. */
  1214. SCM
  1215. scm_take_locale_stringn (char *str, size_t len)
  1216. {
  1217. SCM buf, res;
  1218. if (len == (size_t) -1)
  1219. len = strlen (str);
  1220. else
  1221. {
  1222. /* Ensure STR is null terminated. A realloc for 1 extra byte should
  1223. often be satisfied from the alignment padding after the block, with
  1224. no actual data movement. */
  1225. str = scm_realloc (str, len + 1);
  1226. str[len] = '\0';
  1227. }
  1228. buf = scm_i_take_stringbufn (str, len);
  1229. res = scm_double_cell (STRING_TAG,
  1230. SCM_UNPACK (buf), (scm_t_bits) 0, (scm_t_bits) len);
  1231. return res;
  1232. }
  1233. SCM
  1234. scm_take_locale_string (char *str)
  1235. {
  1236. return scm_take_locale_stringn (str, -1);
  1237. }
  1238. /* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
  1239. and \UXXXXXX. */
  1240. static void
  1241. unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
  1242. {
  1243. char *before, *after;
  1244. size_t i, j;
  1245. before = *bufp;
  1246. after = *bufp;
  1247. i = 0;
  1248. j = 0;
  1249. while (i < *lenp)
  1250. {
  1251. if ((i <= *lenp - 6)
  1252. && before[i] == '\\'
  1253. && before[i + 1] == 'u'
  1254. && before[i + 2] == '0' && before[i + 3] == '0')
  1255. {
  1256. /* Convert \u00NN to \xNN */
  1257. after[j] = '\\';
  1258. after[j + 1] = 'x';
  1259. after[j + 2] = tolower ((int) before[i + 4]);
  1260. after[j + 3] = tolower ((int) before[i + 5]);
  1261. i += 6;
  1262. j += 4;
  1263. }
  1264. else if ((i <= *lenp - 10)
  1265. && before[i] == '\\'
  1266. && before[i + 1] == 'U'
  1267. && before[i + 2] == '0' && before[i + 3] == '0')
  1268. {
  1269. /* Convert \U00NNNNNN to \UNNNNNN */
  1270. after[j] = '\\';
  1271. after[j + 1] = 'U';
  1272. after[j + 2] = tolower ((int) before[i + 4]);
  1273. after[j + 3] = tolower ((int) before[i + 5]);
  1274. after[j + 4] = tolower ((int) before[i + 6]);
  1275. after[j + 5] = tolower ((int) before[i + 7]);
  1276. after[j + 6] = tolower ((int) before[i + 8]);
  1277. after[j + 7] = tolower ((int) before[i + 9]);
  1278. i += 10;
  1279. j += 8;
  1280. }
  1281. else
  1282. {
  1283. after[j] = before[i];
  1284. i++;
  1285. j++;
  1286. }
  1287. }
  1288. *lenp = j;
  1289. after = scm_realloc (after, j);
  1290. }
  1291. char *
  1292. scm_to_locale_stringn (SCM str, size_t * lenp)
  1293. {
  1294. const char *enc;
  1295. /* In the future, enc will hold the port's encoding. */
  1296. enc = NULL;
  1297. return scm_to_stringn (str, lenp, enc,
  1298. SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
  1299. }
  1300. /* Low-level scheme to C string conversion function. */
  1301. char *
  1302. scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
  1303. scm_t_string_failed_conversion_handler handler)
  1304. {
  1305. static const char iso[11] = "ISO-8859-1";
  1306. char *buf;
  1307. size_t ilen, len, i;
  1308. if (!scm_is_string (str))
  1309. scm_wrong_type_arg_msg (NULL, 0, str, "string");
  1310. ilen = scm_i_string_length (str);
  1311. if (ilen == 0)
  1312. {
  1313. buf = scm_malloc (1);
  1314. buf[0] = '\0';
  1315. if (lenp)
  1316. *lenp = 0;
  1317. return buf;
  1318. }
  1319. if (lenp == NULL)
  1320. for (i = 0; i < ilen; i++)
  1321. if (scm_i_string_ref (str, i) == '\0')
  1322. scm_misc_error (NULL,
  1323. "string contains #\\nul character: ~S",
  1324. scm_list_1 (str));
  1325. if (scm_i_is_narrow_string (str))
  1326. {
  1327. if (lenp)
  1328. {
  1329. buf = scm_malloc (ilen);
  1330. memcpy (buf, scm_i_string_chars (str), ilen);
  1331. *lenp = ilen;
  1332. return buf;
  1333. }
  1334. else
  1335. {
  1336. buf = scm_malloc (ilen + 1);
  1337. memcpy (buf, scm_i_string_chars (str), ilen);
  1338. buf[ilen] = '\0';
  1339. return buf;
  1340. }
  1341. }
  1342. buf = NULL;
  1343. len = 0;
  1344. buf = u32_conv_to_encoding (iso,
  1345. (enum iconv_ilseq_handler) handler,
  1346. (scm_t_uint32 *) scm_i_string_wide_chars (str),
  1347. ilen, NULL, NULL, &len);
  1348. if (buf == NULL)
  1349. scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
  1350. scm_list_2 (scm_from_locale_string (iso), str));
  1351. if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
  1352. unistring_escapes_to_guile_escapes (&buf, &len);
  1353. if (lenp)
  1354. *lenp = len;
  1355. else
  1356. {
  1357. buf = scm_realloc (buf, len + 1);
  1358. buf[len] = '\0';
  1359. }
  1360. scm_remember_upto_here_1 (str);
  1361. return buf;
  1362. }
  1363. char *
  1364. scm_to_locale_string (SCM str)
  1365. {
  1366. return scm_to_locale_stringn (str, NULL);
  1367. }
  1368. size_t
  1369. scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
  1370. {
  1371. size_t len;
  1372. char *result = NULL;
  1373. if (!scm_is_string (str))
  1374. scm_wrong_type_arg_msg (NULL, 0, str, "string");
  1375. result = scm_to_locale_stringn (str, &len);
  1376. memcpy (buf, result, (len > max_len) ? max_len : len);
  1377. free (result);
  1378. scm_remember_upto_here_1 (str);
  1379. return len;
  1380. }
  1381. /* converts C scm_array of strings to SCM scm_list of strings. */
  1382. /* If argc < 0, a null terminated scm_array is assumed. */
  1383. SCM
  1384. scm_makfromstrs (int argc, char **argv)
  1385. {
  1386. int i = argc;
  1387. SCM lst = SCM_EOL;
  1388. if (0 > i)
  1389. for (i = 0; argv[i]; i++);
  1390. while (i--)
  1391. lst = scm_cons (scm_from_locale_string (argv[i]), lst);
  1392. return lst;
  1393. }
  1394. /* Return a newly allocated array of char pointers to each of the strings
  1395. in args, with a terminating NULL pointer. */
  1396. char **
  1397. scm_i_allocate_string_pointers (SCM list)
  1398. {
  1399. char **result;
  1400. int len = scm_ilength (list);
  1401. int i;
  1402. if (len < 0)
  1403. scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
  1404. scm_dynwind_begin (0);
  1405. result = (char **) scm_malloc ((len + 1) * sizeof (char *));
  1406. result[len] = NULL;
  1407. scm_dynwind_unwind_handler (free, result, 0);
  1408. /* The list might be have been modified in another thread, so
  1409. we check LIST before each access.
  1410. */
  1411. for (i = 0; i < len && scm_is_pair (list); i++)
  1412. {
  1413. result[i] = scm_to_locale_string (SCM_CAR (list));
  1414. list = SCM_CDR (list);
  1415. }
  1416. scm_dynwind_end ();
  1417. return result;
  1418. }
  1419. void
  1420. scm_i_free_string_pointers (char **pointers)
  1421. {
  1422. int i;
  1423. for (i = 0; pointers[i]; i++)
  1424. free (pointers[i]);
  1425. free (pointers);
  1426. }
  1427. void
  1428. scm_i_get_substring_spec (size_t len,
  1429. SCM start, size_t *cstart,
  1430. SCM end, size_t *cend)
  1431. {
  1432. if (SCM_UNBNDP (start))
  1433. *cstart = 0;
  1434. else
  1435. *cstart = scm_to_unsigned_integer (start, 0, len);
  1436. if (SCM_UNBNDP (end))
  1437. *cend = len;
  1438. else
  1439. *cend = scm_to_unsigned_integer (end, *cstart, len);
  1440. }
  1441. #if SCM_ENABLE_DEPRECATED
  1442. /* When these definitions are removed, it becomes reasonable to use
  1443. read-only strings for string literals. For that, change the reader
  1444. to create string literals with scm_c_substring_read_only instead of
  1445. with scm_c_substring_copy.
  1446. */
  1447. int
  1448. scm_i_deprecated_stringp (SCM str)
  1449. {
  1450. scm_c_issue_deprecation_warning
  1451. ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
  1452. return scm_is_string (str);
  1453. }
  1454. char *
  1455. scm_i_deprecated_string_chars (SCM str)
  1456. {
  1457. char *chars;
  1458. scm_c_issue_deprecation_warning
  1459. ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
  1460. /* We don't accept shared substrings here since they are not
  1461. null-terminated.
  1462. */
  1463. if (IS_SH_STRING (str))
  1464. scm_misc_error (NULL,
  1465. "SCM_STRING_CHARS does not work with shared substrings.",
  1466. SCM_EOL);
  1467. /* We explicitly test for read-only strings to produce a better
  1468. error message.
  1469. */
  1470. if (IS_RO_STRING (str))
  1471. scm_misc_error (NULL,
  1472. "SCM_STRING_CHARS does not work with read-only strings.",
  1473. SCM_EOL);
  1474. /* The following is still wrong, of course...
  1475. */
  1476. str = scm_i_string_start_writing (str);
  1477. chars = scm_i_string_writable_chars (str);
  1478. scm_i_string_stop_writing ();
  1479. return chars;
  1480. }
  1481. size_t
  1482. scm_i_deprecated_string_length (SCM str)
  1483. {
  1484. scm_c_issue_deprecation_warning
  1485. ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
  1486. return scm_c_string_length (str);
  1487. }
  1488. #endif
  1489. void
  1490. scm_init_strings ()
  1491. {
  1492. scm_nullstr = scm_i_make_string (0, NULL);
  1493. #include "libguile/strings.x"
  1494. }
  1495. /*
  1496. Local Variables:
  1497. c-file-style: "gnu"
  1498. End:
  1499. */