cordxtra.c 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622
  1. /*
  2. * Copyright (c) 1993-1994 by Xerox Corporation. All rights reserved.
  3. *
  4. * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
  5. * OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
  6. *
  7. * Permission is hereby granted to use or copy this program
  8. * for any purpose, provided the above notices are retained on all copies.
  9. * Permission to modify the code and to distribute modified code is granted,
  10. * provided the above notices are retained, and a notice that the code was
  11. * modified is included with the above copyright notice.
  12. *
  13. * Author: Hans-J. Boehm (boehm@parc.xerox.com)
  14. */
  15. /*
  16. * These are functions on cords that do not need to understand their
  17. * implementation. They serve also serve as example client code for
  18. * cord_basics.
  19. */
  20. /* Boehm, December 8, 1995 1:53 pm PST */
  21. # include <stdio.h>
  22. # include <string.h>
  23. # include <stdlib.h>
  24. # include <stdarg.h>
  25. # include "cord.h"
  26. # include "ec.h"
  27. # define I_HIDE_POINTERS /* So we get access to allocation lock. */
  28. /* We use this for lazy file reading, */
  29. /* so that we remain independent */
  30. /* of the threads primitives. */
  31. # include "gc.h"
  32. /* For now we assume that pointer reads and writes are atomic, */
  33. /* i.e. another thread always sees the state before or after */
  34. /* a write. This might be false on a Motorola M68K with */
  35. /* pointers that are not 32-bit aligned. But there probably */
  36. /* aren't too many threads packages running on those. */
  37. # define ATOMIC_WRITE(x,y) (x) = (y)
  38. # define ATOMIC_READ(x) (*(x))
  39. /* The standard says these are in stdio.h, but they aren't always: */
  40. # ifndef SEEK_SET
  41. # define SEEK_SET 0
  42. # endif
  43. # ifndef SEEK_END
  44. # define SEEK_END 2
  45. # endif
  46. # define BUFSZ 2048 /* Size of stack allocated buffers when */
  47. /* we want large buffers. */
  48. typedef void (* oom_fn)(void);
  49. # define OUT_OF_MEMORY { if (CORD_oom_fn != (oom_fn) 0) (*CORD_oom_fn)(); \
  50. ABORT("Out of memory\n"); }
  51. # define ABORT(msg) { fprintf(stderr, "%s\n", msg); abort(); }
  52. CORD CORD_cat_char(CORD x, char c)
  53. {
  54. register char * string;
  55. if (c == '\0') return(CORD_cat(x, CORD_nul(1)));
  56. string = GC_MALLOC_ATOMIC(2);
  57. if (string == 0) OUT_OF_MEMORY;
  58. string[0] = c;
  59. string[1] = '\0';
  60. return(CORD_cat_char_star(x, string, 1));
  61. }
  62. CORD CORD_catn(int nargs, ...)
  63. {
  64. register CORD result = CORD_EMPTY;
  65. va_list args;
  66. register int i;
  67. va_start(args, nargs);
  68. for (i = 0; i < nargs; i++) {
  69. register CORD next = va_arg(args, CORD);
  70. result = CORD_cat(result, next);
  71. }
  72. va_end(args);
  73. return(result);
  74. }
  75. typedef struct {
  76. size_t len;
  77. size_t count;
  78. char * buf;
  79. } CORD_fill_data;
  80. int CORD_fill_proc(char c, void * client_data)
  81. {
  82. register CORD_fill_data * d = (CORD_fill_data *)client_data;
  83. register size_t count = d -> count;
  84. (d -> buf)[count] = c;
  85. d -> count = ++count;
  86. if (count >= d -> len) {
  87. return(1);
  88. } else {
  89. return(0);
  90. }
  91. }
  92. int CORD_batched_fill_proc(const char * s, void * client_data)
  93. {
  94. register CORD_fill_data * d = (CORD_fill_data *)client_data;
  95. register size_t count = d -> count;
  96. register size_t max = d -> len;
  97. register char * buf = d -> buf;
  98. register const char * t = s;
  99. while((buf[count] = *t++) != '\0') {
  100. count++;
  101. if (count >= max) {
  102. d -> count = count;
  103. return(1);
  104. }
  105. }
  106. d -> count = count;
  107. return(0);
  108. }
  109. /* Fill buf with len characters starting at i. */
  110. /* Assumes len characters are available. */
  111. void CORD_fill_buf(CORD x, size_t i, size_t len, char * buf)
  112. {
  113. CORD_fill_data fd;
  114. fd.len = len;
  115. fd.buf = buf;
  116. fd.count = 0;
  117. (void)CORD_iter5(x, i, CORD_fill_proc, CORD_batched_fill_proc, &fd);
  118. }
  119. int CORD_cmp(CORD x, CORD y)
  120. {
  121. CORD_pos xpos;
  122. CORD_pos ypos;
  123. register size_t avail, yavail;
  124. if (y == CORD_EMPTY) return(x != CORD_EMPTY);
  125. if (x == CORD_EMPTY) return(-1);
  126. if (CORD_IS_STRING(y) && CORD_IS_STRING(x)) return(strcmp(x,y));
  127. CORD_set_pos(xpos, x, 0);
  128. CORD_set_pos(ypos, y, 0);
  129. for(;;) {
  130. if (!CORD_pos_valid(xpos)) {
  131. if (CORD_pos_valid(ypos)) {
  132. return(-1);
  133. } else {
  134. return(0);
  135. }
  136. }
  137. if (!CORD_pos_valid(ypos)) {
  138. return(1);
  139. }
  140. if ((avail = CORD_pos_chars_left(xpos)) <= 0
  141. || (yavail = CORD_pos_chars_left(ypos)) <= 0) {
  142. register char xcurrent = CORD_pos_fetch(xpos);
  143. register char ycurrent = CORD_pos_fetch(ypos);
  144. if (xcurrent != ycurrent) return(xcurrent - ycurrent);
  145. CORD_next(xpos);
  146. CORD_next(ypos);
  147. } else {
  148. /* process as many characters as we can */
  149. register int result;
  150. if (avail > yavail) avail = yavail;
  151. result = strncmp(CORD_pos_cur_char_addr(xpos),
  152. CORD_pos_cur_char_addr(ypos), avail);
  153. if (result != 0) return(result);
  154. CORD_pos_advance(xpos, avail);
  155. CORD_pos_advance(ypos, avail);
  156. }
  157. }
  158. }
  159. int CORD_ncmp(CORD x, size_t x_start, CORD y, size_t y_start, size_t len)
  160. {
  161. CORD_pos xpos;
  162. CORD_pos ypos;
  163. register size_t count;
  164. register long avail, yavail;
  165. CORD_set_pos(xpos, x, x_start);
  166. CORD_set_pos(ypos, y, y_start);
  167. for(count = 0; count < len;) {
  168. if (!CORD_pos_valid(xpos)) {
  169. if (CORD_pos_valid(ypos)) {
  170. return(-1);
  171. } else {
  172. return(0);
  173. }
  174. }
  175. if (!CORD_pos_valid(ypos)) {
  176. return(1);
  177. }
  178. if ((avail = CORD_pos_chars_left(xpos)) <= 0
  179. || (yavail = CORD_pos_chars_left(ypos)) <= 0) {
  180. register char xcurrent = CORD_pos_fetch(xpos);
  181. register char ycurrent = CORD_pos_fetch(ypos);
  182. if (xcurrent != ycurrent) return(xcurrent - ycurrent);
  183. CORD_next(xpos);
  184. CORD_next(ypos);
  185. count++;
  186. } else {
  187. /* process as many characters as we can */
  188. register int result;
  189. if (avail > yavail) avail = yavail;
  190. count += avail;
  191. if (count > len) avail -= (count - len);
  192. result = strncmp(CORD_pos_cur_char_addr(xpos),
  193. CORD_pos_cur_char_addr(ypos), (size_t)avail);
  194. if (result != 0) return(result);
  195. CORD_pos_advance(xpos, (size_t)avail);
  196. CORD_pos_advance(ypos, (size_t)avail);
  197. }
  198. }
  199. return(0);
  200. }
  201. char * CORD_to_char_star(CORD x)
  202. {
  203. register size_t len = CORD_len(x);
  204. char * result = GC_MALLOC_ATOMIC(len + 1);
  205. if (result == 0) OUT_OF_MEMORY;
  206. CORD_fill_buf(x, 0, len, result);
  207. result[len] = '\0';
  208. return(result);
  209. }
  210. CORD CORD_from_char_star(const char *s)
  211. {
  212. char * result;
  213. size_t len = strlen(s);
  214. if (0 == len) return(CORD_EMPTY);
  215. result = GC_MALLOC_ATOMIC(len + 1);
  216. if (result == 0) OUT_OF_MEMORY;
  217. memcpy(result, s, len+1);
  218. return(result);
  219. }
  220. const char * CORD_to_const_char_star(CORD x)
  221. {
  222. if (x == 0) return("");
  223. if (CORD_IS_STRING(x)) return((const char *)x);
  224. return(CORD_to_char_star(x));
  225. }
  226. char CORD_fetch(CORD x, size_t i)
  227. {
  228. CORD_pos xpos;
  229. CORD_set_pos(xpos, x, i);
  230. if (!CORD_pos_valid(xpos)) ABORT("bad index?");
  231. return(CORD_pos_fetch(xpos));
  232. }
  233. int CORD_put_proc(char c, void * client_data)
  234. {
  235. register FILE * f = (FILE *)client_data;
  236. return(putc(c, f) == EOF);
  237. }
  238. int CORD_batched_put_proc(const char * s, void * client_data)
  239. {
  240. register FILE * f = (FILE *)client_data;
  241. return(fputs(s, f) == EOF);
  242. }
  243. int CORD_put(CORD x, FILE * f)
  244. {
  245. if (CORD_iter5(x, 0, CORD_put_proc, CORD_batched_put_proc, f)) {
  246. return(EOF);
  247. } else {
  248. return(1);
  249. }
  250. }
  251. typedef struct {
  252. size_t pos; /* Current position in the cord */
  253. char target; /* Character we're looking for */
  254. } chr_data;
  255. int CORD_chr_proc(char c, void * client_data)
  256. {
  257. register chr_data * d = (chr_data *)client_data;
  258. if (c == d -> target) return(1);
  259. (d -> pos) ++;
  260. return(0);
  261. }
  262. int CORD_rchr_proc(char c, void * client_data)
  263. {
  264. register chr_data * d = (chr_data *)client_data;
  265. if (c == d -> target) return(1);
  266. (d -> pos) --;
  267. return(0);
  268. }
  269. int CORD_batched_chr_proc(const char *s, void * client_data)
  270. {
  271. register chr_data * d = (chr_data *)client_data;
  272. register char * occ = strchr(s, d -> target);
  273. if (occ == 0) {
  274. d -> pos += strlen(s);
  275. return(0);
  276. } else {
  277. d -> pos += occ - s;
  278. return(1);
  279. }
  280. }
  281. size_t CORD_chr(CORD x, size_t i, int c)
  282. {
  283. chr_data d;
  284. d.pos = i;
  285. d.target = c;
  286. if (CORD_iter5(x, i, CORD_chr_proc, CORD_batched_chr_proc, &d)) {
  287. return(d.pos);
  288. } else {
  289. return(CORD_NOT_FOUND);
  290. }
  291. }
  292. size_t CORD_rchr(CORD x, size_t i, int c)
  293. {
  294. chr_data d;
  295. d.pos = i;
  296. d.target = c;
  297. if (CORD_riter4(x, i, CORD_rchr_proc, &d)) {
  298. return(d.pos);
  299. } else {
  300. return(CORD_NOT_FOUND);
  301. }
  302. }
  303. /* Find the first occurrence of s in x at position start or later. */
  304. /* This uses an asymptotically poor algorithm, which should typically */
  305. /* perform acceptably. We compare the first few characters directly, */
  306. /* and call CORD_ncmp whenever there is a partial match. */
  307. /* This has the advantage that we allocate very little, or not at all. */
  308. /* It's very fast if there are few close misses. */
  309. size_t CORD_str(CORD x, size_t start, CORD s)
  310. {
  311. CORD_pos xpos;
  312. size_t xlen = CORD_len(x);
  313. size_t slen;
  314. register size_t start_len;
  315. const char * s_start;
  316. unsigned long s_buf = 0; /* The first few characters of s */
  317. unsigned long x_buf = 0; /* Start of candidate substring. */
  318. /* Initialized only to make compilers */
  319. /* happy. */
  320. unsigned long mask = 0;
  321. register size_t i;
  322. register size_t match_pos;
  323. if (s == CORD_EMPTY) return(start);
  324. if (CORD_IS_STRING(s)) {
  325. s_start = s;
  326. slen = strlen(s);
  327. } else {
  328. s_start = CORD_to_char_star(CORD_substr(s, 0, sizeof(unsigned long)));
  329. slen = CORD_len(s);
  330. }
  331. if (xlen < start || xlen - start < slen) return(CORD_NOT_FOUND);
  332. start_len = slen;
  333. if (start_len > sizeof(unsigned long)) start_len = sizeof(unsigned long);
  334. CORD_set_pos(xpos, x, start);
  335. for (i = 0; i < start_len; i++) {
  336. mask <<= 8;
  337. mask |= 0xff;
  338. s_buf <<= 8;
  339. s_buf |= (unsigned char)s_start[i];
  340. x_buf <<= 8;
  341. x_buf |= (unsigned char)CORD_pos_fetch(xpos);
  342. CORD_next(xpos);
  343. }
  344. for (match_pos = start; ; match_pos++) {
  345. if ((x_buf & mask) == s_buf) {
  346. if (slen == start_len ||
  347. CORD_ncmp(x, match_pos + start_len,
  348. s, start_len, slen - start_len) == 0) {
  349. return(match_pos);
  350. }
  351. }
  352. if ( match_pos == xlen - slen ) {
  353. return(CORD_NOT_FOUND);
  354. }
  355. x_buf <<= 8;
  356. x_buf |= (unsigned char)CORD_pos_fetch(xpos);
  357. CORD_next(xpos);
  358. }
  359. }
  360. void CORD_ec_flush_buf(CORD_ec x)
  361. {
  362. register size_t len = x[0].ec_bufptr - x[0].ec_buf;
  363. char * s;
  364. if (len == 0) return;
  365. s = GC_MALLOC_ATOMIC(len+1);
  366. memcpy(s, x[0].ec_buf, len);
  367. s[len] = '\0';
  368. x[0].ec_cord = CORD_cat_char_star(x[0].ec_cord, s, len);
  369. x[0].ec_bufptr = x[0].ec_buf;
  370. }
  371. void CORD_ec_append_cord(CORD_ec x, CORD s)
  372. {
  373. CORD_ec_flush_buf(x);
  374. x[0].ec_cord = CORD_cat(x[0].ec_cord, s);
  375. }
  376. /*ARGSUSED*/
  377. char CORD_nul_func(size_t i, void * client_data)
  378. {
  379. return((char)(unsigned long)client_data);
  380. }
  381. CORD CORD_chars(char c, size_t i)
  382. {
  383. return(CORD_from_fn(CORD_nul_func, (void *)(unsigned long)c, i));
  384. }
  385. CORD CORD_from_file_eager(FILE * f)
  386. {
  387. register int c;
  388. CORD_ec ecord;
  389. CORD_ec_init(ecord);
  390. for(;;) {
  391. c = getc(f);
  392. if (c == 0) {
  393. /* Append the right number of NULs */
  394. /* Note that any string of NULs is rpresented in 4 words, */
  395. /* independent of its length. */
  396. register size_t count = 1;
  397. CORD_ec_flush_buf(ecord);
  398. while ((c = getc(f)) == 0) count++;
  399. ecord[0].ec_cord = CORD_cat(ecord[0].ec_cord, CORD_nul(count));
  400. }
  401. if (c == EOF) break;
  402. CORD_ec_append(ecord, c);
  403. }
  404. (void) fclose(f);
  405. return(CORD_balance(CORD_ec_to_cord(ecord)));
  406. }
  407. /* The state maintained for a lazily read file consists primarily */
  408. /* of a large direct-mapped cache of previously read values. */
  409. /* We could rely more on stdio buffering. That would have 2 */
  410. /* disadvantages: */
  411. /* 1) Empirically, not all fseek implementations preserve the */
  412. /* buffer whenever they could. */
  413. /* 2) It would fail if 2 different sections of a long cord */
  414. /* were being read alternately. */
  415. /* We do use the stdio buffer for read ahead. */
  416. /* To guarantee thread safety in the presence of atomic pointer */
  417. /* writes, cache lines are always replaced, and never modified in */
  418. /* place. */
  419. # define LOG_CACHE_SZ 14
  420. # define CACHE_SZ (1 << LOG_CACHE_SZ)
  421. # define LOG_LINE_SZ 9
  422. # define LINE_SZ (1 << LOG_LINE_SZ)
  423. typedef struct {
  424. size_t tag;
  425. char data[LINE_SZ];
  426. /* data[i%LINE_SZ] = ith char in file if tag = i/LINE_SZ */
  427. } cache_line;
  428. typedef struct {
  429. FILE * lf_file;
  430. size_t lf_current; /* Current file pointer value */
  431. cache_line * volatile lf_cache[CACHE_SZ/LINE_SZ];
  432. } lf_state;
  433. # define MOD_CACHE_SZ(n) ((n) & (CACHE_SZ - 1))
  434. # define DIV_CACHE_SZ(n) ((n) >> LOG_CACHE_SZ)
  435. # define MOD_LINE_SZ(n) ((n) & (LINE_SZ - 1))
  436. # define DIV_LINE_SZ(n) ((n) >> LOG_LINE_SZ)
  437. # define LINE_START(n) ((n) & ~(LINE_SZ - 1))
  438. typedef struct {
  439. lf_state * state;
  440. size_t file_pos; /* Position of needed character. */
  441. cache_line * new_cache;
  442. } refill_data;
  443. /* Executed with allocation lock. */
  444. static char refill_cache(client_data)
  445. refill_data * client_data;
  446. {
  447. register lf_state * state = client_data -> state;
  448. register size_t file_pos = client_data -> file_pos;
  449. FILE *f = state -> lf_file;
  450. size_t line_start = LINE_START(file_pos);
  451. size_t line_no = DIV_LINE_SZ(MOD_CACHE_SZ(file_pos));
  452. cache_line * new_cache = client_data -> new_cache;
  453. if (line_start != state -> lf_current
  454. && fseek(f, line_start, SEEK_SET) != 0) {
  455. ABORT("fseek failed");
  456. }
  457. if (fread(new_cache -> data, sizeof(char), LINE_SZ, f)
  458. <= file_pos - line_start) {
  459. ABORT("fread failed");
  460. }
  461. new_cache -> tag = DIV_LINE_SZ(file_pos);
  462. /* Store barrier goes here. */
  463. ATOMIC_WRITE(state -> lf_cache[line_no], new_cache);
  464. state -> lf_current = line_start + LINE_SZ;
  465. return(new_cache->data[MOD_LINE_SZ(file_pos)]);
  466. }
  467. char CORD_lf_func(size_t i, void * client_data)
  468. {
  469. register lf_state * state = (lf_state *)client_data;
  470. register cache_line * volatile * cl_addr =
  471. &(state -> lf_cache[DIV_LINE_SZ(MOD_CACHE_SZ(i))]);
  472. register cache_line * cl = (cache_line *)ATOMIC_READ(cl_addr);
  473. if (cl == 0 || cl -> tag != DIV_LINE_SZ(i)) {
  474. /* Cache miss */
  475. refill_data rd;
  476. rd.state = state;
  477. rd.file_pos = i;
  478. rd.new_cache = GC_NEW_ATOMIC(cache_line);
  479. if (rd.new_cache == 0) OUT_OF_MEMORY;
  480. return((char)(GC_word)
  481. GC_call_with_alloc_lock((GC_fn_type) refill_cache, &rd));
  482. }
  483. return(cl -> data[MOD_LINE_SZ(i)]);
  484. }
  485. /*ARGSUSED*/
  486. void CORD_lf_close_proc(void * obj, void * client_data)
  487. {
  488. if (fclose(((lf_state *)obj) -> lf_file) != 0) {
  489. ABORT("CORD_lf_close_proc: fclose failed");
  490. }
  491. }
  492. CORD CORD_from_file_lazy_inner(FILE * f, size_t len)
  493. {
  494. register lf_state * state = GC_NEW(lf_state);
  495. register int i;
  496. if (state == 0) OUT_OF_MEMORY;
  497. if (len != 0) {
  498. /* Dummy read to force buffer allocation. */
  499. /* This greatly increases the probability */
  500. /* of avoiding deadlock if buffer allocation */
  501. /* is redirected to GC_malloc and the */
  502. /* world is multithreaded. */
  503. char buf[1];
  504. (void) fread(buf, 1, 1, f);
  505. rewind(f);
  506. }
  507. state -> lf_file = f;
  508. for (i = 0; i < CACHE_SZ/LINE_SZ; i++) {
  509. state -> lf_cache[i] = 0;
  510. }
  511. state -> lf_current = 0;
  512. GC_REGISTER_FINALIZER(state, CORD_lf_close_proc, 0, 0, 0);
  513. return(CORD_from_fn(CORD_lf_func, state, len));
  514. }
  515. CORD CORD_from_file_lazy(FILE * f)
  516. {
  517. register long len;
  518. if (fseek(f, 0l, SEEK_END) != 0) {
  519. ABORT("Bad fd argument - fseek failed");
  520. }
  521. if ((len = ftell(f)) < 0) {
  522. ABORT("Bad fd argument - ftell failed");
  523. }
  524. rewind(f);
  525. return(CORD_from_file_lazy_inner(f, (size_t)len));
  526. }
  527. # define LAZY_THRESHOLD (128*1024 + 1)
  528. CORD CORD_from_file(FILE * f)
  529. {
  530. register long len;
  531. if (fseek(f, 0l, SEEK_END) != 0) {
  532. ABORT("Bad fd argument - fseek failed");
  533. }
  534. if ((len = ftell(f)) < 0) {
  535. ABORT("Bad fd argument - ftell failed");
  536. }
  537. rewind(f);
  538. if (len < LAZY_THRESHOLD) {
  539. return(CORD_from_file_eager(f));
  540. } else {
  541. return(CORD_from_file_lazy_inner(f, (size_t)len));
  542. }
  543. }