123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622 |
- /*
- * Copyright (c) 1993-1994 by Xerox Corporation. All rights reserved.
- *
- * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
- * OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
- *
- * Permission is hereby granted to use or copy this program
- * for any purpose, provided the above notices are retained on all copies.
- * Permission to modify the code and to distribute modified code is granted,
- * provided the above notices are retained, and a notice that the code was
- * modified is included with the above copyright notice.
- *
- * Author: Hans-J. Boehm (boehm@parc.xerox.com)
- */
- /*
- * These are functions on cords that do not need to understand their
- * implementation. They serve also serve as example client code for
- * cord_basics.
- */
- /* Boehm, December 8, 1995 1:53 pm PST */
- # include <stdio.h>
- # include <string.h>
- # include <stdlib.h>
- # include <stdarg.h>
- # include "cord.h"
- # include "ec.h"
- # define I_HIDE_POINTERS /* So we get access to allocation lock. */
- /* We use this for lazy file reading, */
- /* so that we remain independent */
- /* of the threads primitives. */
- # include "gc.h"
- /* For now we assume that pointer reads and writes are atomic, */
- /* i.e. another thread always sees the state before or after */
- /* a write. This might be false on a Motorola M68K with */
- /* pointers that are not 32-bit aligned. But there probably */
- /* aren't too many threads packages running on those. */
- # define ATOMIC_WRITE(x,y) (x) = (y)
- # define ATOMIC_READ(x) (*(x))
- /* The standard says these are in stdio.h, but they aren't always: */
- # ifndef SEEK_SET
- # define SEEK_SET 0
- # endif
- # ifndef SEEK_END
- # define SEEK_END 2
- # endif
- # define BUFSZ 2048 /* Size of stack allocated buffers when */
- /* we want large buffers. */
- typedef void (* oom_fn)(void);
- # define OUT_OF_MEMORY { if (CORD_oom_fn != (oom_fn) 0) (*CORD_oom_fn)(); \
- ABORT("Out of memory\n"); }
- # define ABORT(msg) { fprintf(stderr, "%s\n", msg); abort(); }
- CORD CORD_cat_char(CORD x, char c)
- {
- register char * string;
-
- if (c == '\0') return(CORD_cat(x, CORD_nul(1)));
- string = GC_MALLOC_ATOMIC(2);
- if (string == 0) OUT_OF_MEMORY;
- string[0] = c;
- string[1] = '\0';
- return(CORD_cat_char_star(x, string, 1));
- }
- CORD CORD_catn(int nargs, ...)
- {
- register CORD result = CORD_EMPTY;
- va_list args;
- register int i;
- va_start(args, nargs);
- for (i = 0; i < nargs; i++) {
- register CORD next = va_arg(args, CORD);
- result = CORD_cat(result, next);
- }
- va_end(args);
- return(result);
- }
- typedef struct {
- size_t len;
- size_t count;
- char * buf;
- } CORD_fill_data;
- int CORD_fill_proc(char c, void * client_data)
- {
- register CORD_fill_data * d = (CORD_fill_data *)client_data;
- register size_t count = d -> count;
-
- (d -> buf)[count] = c;
- d -> count = ++count;
- if (count >= d -> len) {
- return(1);
- } else {
- return(0);
- }
- }
- int CORD_batched_fill_proc(const char * s, void * client_data)
- {
- register CORD_fill_data * d = (CORD_fill_data *)client_data;
- register size_t count = d -> count;
- register size_t max = d -> len;
- register char * buf = d -> buf;
- register const char * t = s;
-
- while((buf[count] = *t++) != '\0') {
- count++;
- if (count >= max) {
- d -> count = count;
- return(1);
- }
- }
- d -> count = count;
- return(0);
- }
- /* Fill buf with len characters starting at i. */
- /* Assumes len characters are available. */
- void CORD_fill_buf(CORD x, size_t i, size_t len, char * buf)
- {
- CORD_fill_data fd;
-
- fd.len = len;
- fd.buf = buf;
- fd.count = 0;
- (void)CORD_iter5(x, i, CORD_fill_proc, CORD_batched_fill_proc, &fd);
- }
- int CORD_cmp(CORD x, CORD y)
- {
- CORD_pos xpos;
- CORD_pos ypos;
- register size_t avail, yavail;
-
- if (y == CORD_EMPTY) return(x != CORD_EMPTY);
- if (x == CORD_EMPTY) return(-1);
- if (CORD_IS_STRING(y) && CORD_IS_STRING(x)) return(strcmp(x,y));
- CORD_set_pos(xpos, x, 0);
- CORD_set_pos(ypos, y, 0);
- for(;;) {
- if (!CORD_pos_valid(xpos)) {
- if (CORD_pos_valid(ypos)) {
- return(-1);
- } else {
- return(0);
- }
- }
- if (!CORD_pos_valid(ypos)) {
- return(1);
- }
- if ((avail = CORD_pos_chars_left(xpos)) <= 0
- || (yavail = CORD_pos_chars_left(ypos)) <= 0) {
- register char xcurrent = CORD_pos_fetch(xpos);
- register char ycurrent = CORD_pos_fetch(ypos);
- if (xcurrent != ycurrent) return(xcurrent - ycurrent);
- CORD_next(xpos);
- CORD_next(ypos);
- } else {
- /* process as many characters as we can */
- register int result;
-
- if (avail > yavail) avail = yavail;
- result = strncmp(CORD_pos_cur_char_addr(xpos),
- CORD_pos_cur_char_addr(ypos), avail);
- if (result != 0) return(result);
- CORD_pos_advance(xpos, avail);
- CORD_pos_advance(ypos, avail);
- }
- }
- }
- int CORD_ncmp(CORD x, size_t x_start, CORD y, size_t y_start, size_t len)
- {
- CORD_pos xpos;
- CORD_pos ypos;
- register size_t count;
- register long avail, yavail;
-
- CORD_set_pos(xpos, x, x_start);
- CORD_set_pos(ypos, y, y_start);
- for(count = 0; count < len;) {
- if (!CORD_pos_valid(xpos)) {
- if (CORD_pos_valid(ypos)) {
- return(-1);
- } else {
- return(0);
- }
- }
- if (!CORD_pos_valid(ypos)) {
- return(1);
- }
- if ((avail = CORD_pos_chars_left(xpos)) <= 0
- || (yavail = CORD_pos_chars_left(ypos)) <= 0) {
- register char xcurrent = CORD_pos_fetch(xpos);
- register char ycurrent = CORD_pos_fetch(ypos);
- if (xcurrent != ycurrent) return(xcurrent - ycurrent);
- CORD_next(xpos);
- CORD_next(ypos);
- count++;
- } else {
- /* process as many characters as we can */
- register int result;
-
- if (avail > yavail) avail = yavail;
- count += avail;
- if (count > len) avail -= (count - len);
- result = strncmp(CORD_pos_cur_char_addr(xpos),
- CORD_pos_cur_char_addr(ypos), (size_t)avail);
- if (result != 0) return(result);
- CORD_pos_advance(xpos, (size_t)avail);
- CORD_pos_advance(ypos, (size_t)avail);
- }
- }
- return(0);
- }
- char * CORD_to_char_star(CORD x)
- {
- register size_t len = CORD_len(x);
- char * result = GC_MALLOC_ATOMIC(len + 1);
-
- if (result == 0) OUT_OF_MEMORY;
- CORD_fill_buf(x, 0, len, result);
- result[len] = '\0';
- return(result);
- }
- CORD CORD_from_char_star(const char *s)
- {
- char * result;
- size_t len = strlen(s);
- if (0 == len) return(CORD_EMPTY);
- result = GC_MALLOC_ATOMIC(len + 1);
- if (result == 0) OUT_OF_MEMORY;
- memcpy(result, s, len+1);
- return(result);
- }
- const char * CORD_to_const_char_star(CORD x)
- {
- if (x == 0) return("");
- if (CORD_IS_STRING(x)) return((const char *)x);
- return(CORD_to_char_star(x));
- }
- char CORD_fetch(CORD x, size_t i)
- {
- CORD_pos xpos;
-
- CORD_set_pos(xpos, x, i);
- if (!CORD_pos_valid(xpos)) ABORT("bad index?");
- return(CORD_pos_fetch(xpos));
- }
- int CORD_put_proc(char c, void * client_data)
- {
- register FILE * f = (FILE *)client_data;
-
- return(putc(c, f) == EOF);
- }
- int CORD_batched_put_proc(const char * s, void * client_data)
- {
- register FILE * f = (FILE *)client_data;
-
- return(fputs(s, f) == EOF);
- }
-
- int CORD_put(CORD x, FILE * f)
- {
- if (CORD_iter5(x, 0, CORD_put_proc, CORD_batched_put_proc, f)) {
- return(EOF);
- } else {
- return(1);
- }
- }
- typedef struct {
- size_t pos; /* Current position in the cord */
- char target; /* Character we're looking for */
- } chr_data;
- int CORD_chr_proc(char c, void * client_data)
- {
- register chr_data * d = (chr_data *)client_data;
-
- if (c == d -> target) return(1);
- (d -> pos) ++;
- return(0);
- }
- int CORD_rchr_proc(char c, void * client_data)
- {
- register chr_data * d = (chr_data *)client_data;
-
- if (c == d -> target) return(1);
- (d -> pos) --;
- return(0);
- }
- int CORD_batched_chr_proc(const char *s, void * client_data)
- {
- register chr_data * d = (chr_data *)client_data;
- register char * occ = strchr(s, d -> target);
-
- if (occ == 0) {
- d -> pos += strlen(s);
- return(0);
- } else {
- d -> pos += occ - s;
- return(1);
- }
- }
- size_t CORD_chr(CORD x, size_t i, int c)
- {
- chr_data d;
-
- d.pos = i;
- d.target = c;
- if (CORD_iter5(x, i, CORD_chr_proc, CORD_batched_chr_proc, &d)) {
- return(d.pos);
- } else {
- return(CORD_NOT_FOUND);
- }
- }
- size_t CORD_rchr(CORD x, size_t i, int c)
- {
- chr_data d;
-
- d.pos = i;
- d.target = c;
- if (CORD_riter4(x, i, CORD_rchr_proc, &d)) {
- return(d.pos);
- } else {
- return(CORD_NOT_FOUND);
- }
- }
- /* Find the first occurrence of s in x at position start or later. */
- /* This uses an asymptotically poor algorithm, which should typically */
- /* perform acceptably. We compare the first few characters directly, */
- /* and call CORD_ncmp whenever there is a partial match. */
- /* This has the advantage that we allocate very little, or not at all. */
- /* It's very fast if there are few close misses. */
- size_t CORD_str(CORD x, size_t start, CORD s)
- {
- CORD_pos xpos;
- size_t xlen = CORD_len(x);
- size_t slen;
- register size_t start_len;
- const char * s_start;
- unsigned long s_buf = 0; /* The first few characters of s */
- unsigned long x_buf = 0; /* Start of candidate substring. */
- /* Initialized only to make compilers */
- /* happy. */
- unsigned long mask = 0;
- register size_t i;
- register size_t match_pos;
-
- if (s == CORD_EMPTY) return(start);
- if (CORD_IS_STRING(s)) {
- s_start = s;
- slen = strlen(s);
- } else {
- s_start = CORD_to_char_star(CORD_substr(s, 0, sizeof(unsigned long)));
- slen = CORD_len(s);
- }
- if (xlen < start || xlen - start < slen) return(CORD_NOT_FOUND);
- start_len = slen;
- if (start_len > sizeof(unsigned long)) start_len = sizeof(unsigned long);
- CORD_set_pos(xpos, x, start);
- for (i = 0; i < start_len; i++) {
- mask <<= 8;
- mask |= 0xff;
- s_buf <<= 8;
- s_buf |= (unsigned char)s_start[i];
- x_buf <<= 8;
- x_buf |= (unsigned char)CORD_pos_fetch(xpos);
- CORD_next(xpos);
- }
- for (match_pos = start; ; match_pos++) {
- if ((x_buf & mask) == s_buf) {
- if (slen == start_len ||
- CORD_ncmp(x, match_pos + start_len,
- s, start_len, slen - start_len) == 0) {
- return(match_pos);
- }
- }
- if ( match_pos == xlen - slen ) {
- return(CORD_NOT_FOUND);
- }
- x_buf <<= 8;
- x_buf |= (unsigned char)CORD_pos_fetch(xpos);
- CORD_next(xpos);
- }
- }
- void CORD_ec_flush_buf(CORD_ec x)
- {
- register size_t len = x[0].ec_bufptr - x[0].ec_buf;
- char * s;
- if (len == 0) return;
- s = GC_MALLOC_ATOMIC(len+1);
- memcpy(s, x[0].ec_buf, len);
- s[len] = '\0';
- x[0].ec_cord = CORD_cat_char_star(x[0].ec_cord, s, len);
- x[0].ec_bufptr = x[0].ec_buf;
- }
- void CORD_ec_append_cord(CORD_ec x, CORD s)
- {
- CORD_ec_flush_buf(x);
- x[0].ec_cord = CORD_cat(x[0].ec_cord, s);
- }
- /*ARGSUSED*/
- char CORD_nul_func(size_t i, void * client_data)
- {
- return((char)(unsigned long)client_data);
- }
- CORD CORD_chars(char c, size_t i)
- {
- return(CORD_from_fn(CORD_nul_func, (void *)(unsigned long)c, i));
- }
- CORD CORD_from_file_eager(FILE * f)
- {
- register int c;
- CORD_ec ecord;
-
- CORD_ec_init(ecord);
- for(;;) {
- c = getc(f);
- if (c == 0) {
- /* Append the right number of NULs */
- /* Note that any string of NULs is rpresented in 4 words, */
- /* independent of its length. */
- register size_t count = 1;
-
- CORD_ec_flush_buf(ecord);
- while ((c = getc(f)) == 0) count++;
- ecord[0].ec_cord = CORD_cat(ecord[0].ec_cord, CORD_nul(count));
- }
- if (c == EOF) break;
- CORD_ec_append(ecord, c);
- }
- (void) fclose(f);
- return(CORD_balance(CORD_ec_to_cord(ecord)));
- }
- /* The state maintained for a lazily read file consists primarily */
- /* of a large direct-mapped cache of previously read values. */
- /* We could rely more on stdio buffering. That would have 2 */
- /* disadvantages: */
- /* 1) Empirically, not all fseek implementations preserve the */
- /* buffer whenever they could. */
- /* 2) It would fail if 2 different sections of a long cord */
- /* were being read alternately. */
- /* We do use the stdio buffer for read ahead. */
- /* To guarantee thread safety in the presence of atomic pointer */
- /* writes, cache lines are always replaced, and never modified in */
- /* place. */
- # define LOG_CACHE_SZ 14
- # define CACHE_SZ (1 << LOG_CACHE_SZ)
- # define LOG_LINE_SZ 9
- # define LINE_SZ (1 << LOG_LINE_SZ)
- typedef struct {
- size_t tag;
- char data[LINE_SZ];
- /* data[i%LINE_SZ] = ith char in file if tag = i/LINE_SZ */
- } cache_line;
- typedef struct {
- FILE * lf_file;
- size_t lf_current; /* Current file pointer value */
- cache_line * volatile lf_cache[CACHE_SZ/LINE_SZ];
- } lf_state;
- # define MOD_CACHE_SZ(n) ((n) & (CACHE_SZ - 1))
- # define DIV_CACHE_SZ(n) ((n) >> LOG_CACHE_SZ)
- # define MOD_LINE_SZ(n) ((n) & (LINE_SZ - 1))
- # define DIV_LINE_SZ(n) ((n) >> LOG_LINE_SZ)
- # define LINE_START(n) ((n) & ~(LINE_SZ - 1))
- typedef struct {
- lf_state * state;
- size_t file_pos; /* Position of needed character. */
- cache_line * new_cache;
- } refill_data;
- /* Executed with allocation lock. */
- static char refill_cache(client_data)
- refill_data * client_data;
- {
- register lf_state * state = client_data -> state;
- register size_t file_pos = client_data -> file_pos;
- FILE *f = state -> lf_file;
- size_t line_start = LINE_START(file_pos);
- size_t line_no = DIV_LINE_SZ(MOD_CACHE_SZ(file_pos));
- cache_line * new_cache = client_data -> new_cache;
-
- if (line_start != state -> lf_current
- && fseek(f, line_start, SEEK_SET) != 0) {
- ABORT("fseek failed");
- }
- if (fread(new_cache -> data, sizeof(char), LINE_SZ, f)
- <= file_pos - line_start) {
- ABORT("fread failed");
- }
- new_cache -> tag = DIV_LINE_SZ(file_pos);
- /* Store barrier goes here. */
- ATOMIC_WRITE(state -> lf_cache[line_no], new_cache);
- state -> lf_current = line_start + LINE_SZ;
- return(new_cache->data[MOD_LINE_SZ(file_pos)]);
- }
- char CORD_lf_func(size_t i, void * client_data)
- {
- register lf_state * state = (lf_state *)client_data;
- register cache_line * volatile * cl_addr =
- &(state -> lf_cache[DIV_LINE_SZ(MOD_CACHE_SZ(i))]);
- register cache_line * cl = (cache_line *)ATOMIC_READ(cl_addr);
-
- if (cl == 0 || cl -> tag != DIV_LINE_SZ(i)) {
- /* Cache miss */
- refill_data rd;
-
- rd.state = state;
- rd.file_pos = i;
- rd.new_cache = GC_NEW_ATOMIC(cache_line);
- if (rd.new_cache == 0) OUT_OF_MEMORY;
- return((char)(GC_word)
- GC_call_with_alloc_lock((GC_fn_type) refill_cache, &rd));
- }
- return(cl -> data[MOD_LINE_SZ(i)]);
- }
- /*ARGSUSED*/
- void CORD_lf_close_proc(void * obj, void * client_data)
- {
- if (fclose(((lf_state *)obj) -> lf_file) != 0) {
- ABORT("CORD_lf_close_proc: fclose failed");
- }
- }
- CORD CORD_from_file_lazy_inner(FILE * f, size_t len)
- {
- register lf_state * state = GC_NEW(lf_state);
- register int i;
-
- if (state == 0) OUT_OF_MEMORY;
- if (len != 0) {
- /* Dummy read to force buffer allocation. */
- /* This greatly increases the probability */
- /* of avoiding deadlock if buffer allocation */
- /* is redirected to GC_malloc and the */
- /* world is multithreaded. */
- char buf[1];
- (void) fread(buf, 1, 1, f);
- rewind(f);
- }
- state -> lf_file = f;
- for (i = 0; i < CACHE_SZ/LINE_SZ; i++) {
- state -> lf_cache[i] = 0;
- }
- state -> lf_current = 0;
- GC_REGISTER_FINALIZER(state, CORD_lf_close_proc, 0, 0, 0);
- return(CORD_from_fn(CORD_lf_func, state, len));
- }
- CORD CORD_from_file_lazy(FILE * f)
- {
- register long len;
-
- if (fseek(f, 0l, SEEK_END) != 0) {
- ABORT("Bad fd argument - fseek failed");
- }
- if ((len = ftell(f)) < 0) {
- ABORT("Bad fd argument - ftell failed");
- }
- rewind(f);
- return(CORD_from_file_lazy_inner(f, (size_t)len));
- }
- # define LAZY_THRESHOLD (128*1024 + 1)
- CORD CORD_from_file(FILE * f)
- {
- register long len;
-
- if (fseek(f, 0l, SEEK_END) != 0) {
- ABORT("Bad fd argument - fseek failed");
- }
- if ((len = ftell(f)) < 0) {
- ABORT("Bad fd argument - ftell failed");
- }
- rewind(f);
- if (len < LAZY_THRESHOLD) {
- return(CORD_from_file_eager(f));
- } else {
- return(CORD_from_file_lazy_inner(f, (size_t)len));
- }
- }
|