12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716 |
- /* Definitions for the string type.
- This file is part of khipu.
- khipu is free software: you can redistribute it and/or modify
- it under the terms of the GNU Lesser General Public License as published by
- the Free Software Foundation; either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public License
- along with this program. If not, see <https://www.gnu.org/licenses/>. */
- #include <cstdio> // For the SEEK_* constants.
- #include <cstdlib>
- #include <climits>
- #include "str.hpp"
- #include "memory.hpp"
- #include "stream.hpp"
- #include "utils/chmask.hpp"
- #include "integer.hpp"
- #include "io.hpp"
- KP_DECLS_BEGIN
/* Lookup table indexed by a byte value; yields the number of bytes that
 * the code in this file advances by when it meets that byte:
 *   0x00 - 0xbf -> 1 (ASCII, and continuation bytes)
 *   0xc0 - 0xdf -> 2
 *   0xe0 - 0xef -> 3
 *   0xf0 - 0xf7 -> 4
 *   0xf8 - 0xff -> 0 (a loop that advances a pointer by the table value
 *                     makes no progress on these bytes). */
const uint8_t UTF8_SKIP[256] =
{
  // 0x00 - 0x7f
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // 0x80 - 0xbf
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // 0xc0 - 0xdf
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  // 0xe0 - 0xef
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  // 0xf0 - 0xf7, then 0xf8 - 0xff
  4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
};
- uint32_t utf8min (const char *s, uint32_t maxlen)
- {
- if (!maxlen)
- return (maxlen);
- const char *p = s;
- for (p += maxlen - 1; ((uint8_t)*p & 0xc0) == 0x80; --p) ;
- return (p + UTF8_SKIP[(uint8_t)*p] >
- s + maxlen ? (uint32_t)(p - s) : maxlen);
- }
- uint32_t ustrlen (const void *xs, uint32_t *lenp)
- {
- uint32_t i;
- const uint8_t *s = (const uint8_t *)xs, *p;
- for (i = 0, p = s; *p != 0; ++i)
- p += UTF8_SKIP[*p];
- *lenp = i;
- return ((uint32_t)(p - s));
- }
- uint32_t ustrnlen (const void *xs, uint32_t maxlen)
- {
- uint32_t i;
- const uint8_t *s = (const uint8_t *)xs, *end = s + maxlen;
- for (i = 0; s < end; ++i)
- s += UTF8_SKIP[*s];
- return (i);
- }
- uint32_t stridx (const string *sp, uint32_t idx)
- {
- uint32_t nbytes = sp->nbytes;
- const unsigned char *ptr, *datap = sp->data;
-
- if (sp->len == nbytes)
- // ASCII string - Fast path.
- return (idx);
- else if (idx > (sp->len * 3) / 4)
- { // Much closer to the end - Walk in reverse.
- ptr = datap + nbytes;
- idx = sp->len - idx;
-
- while (idx > 0)
- if ((*--ptr & 0xc0) != 0x80)
- --idx;
- }
- else
- { // Traverse string forward.
- ptr = datap;
- for (uint32_t i = 0; i < idx; ++i)
- ptr += UTF8_SKIP[*ptr];
- }
- return ((uint32_t)(ptr - datap));
- }
- uint32_t stroff (const string *sp, uint32_t off)
- {
- uint32_t nbytes = sp->nbytes;
- const unsigned char *datap = sp->data, *ptr = datap + off;
- if (sp->len == nbytes)
- // ASCII string.
- return (off);
- else if (off > (nbytes * 3) / 4)
- { // Much closer to the end - Walk in reverse.
- datap += nbytes;
- while (datap > ptr)
- if ((*--datap & 0xc0) != 0x80)
- --nbytes;
- }
- else
- // Traverse string forward.
- for (nbytes = 0; ptr > datap; ++nbytes)
- datap += UTF8_SKIP[*datap];
- return (nbytes);
- }
/* Decode the UTF-8 sequence of LEN bytes that starts at SRC and return
 * its code point. With LEN of 0 or 1 the first byte is returned as is.
 * The continuation bytes are not validated. */
uint32_t u8tou32 (const unsigned char *src, uint32_t len)
{
  uint32_t code = src[0];
  if (len <= 1)
    return (code);

  /* Each continuation byte contributes 6 bits. LIMIT grows by 5 bits per
   * byte so that, once the loop is done, LIMIT - 1 keeps exactly the
   * payload bits and drops the length marker of the lead byte. */
  int limit = 0x40;
  for (uint32_t k = 1; k < len; ++k)
    {
      limit <<= 5;
      code = (code << 6) + (src[k] - 0x80);
    }

  return (code & (uint32_t)(limit - 1));
}
/* Encode the code point CH as UTF-8 into DST and return the number of
 * bytes written (1 to 4). DST must have room for that many bytes.
 *
 * Values that need more than 21 bits cannot be encoded in 4 bytes; for
 * those nothing is written and 0 is returned. */
uint32_t u32tou8 (unsigned char *dst, uint32_t ch)
{
  if (ch < 0x80)
    {
      *dst = (unsigned char)ch;
      return (1);
    }

  int step;
  if ((ch & (~0u << 11)) == 0)
    step = 2;
  else if ((ch & (~0u << 16)) == 0)
    step = 3;
  else if ((ch & (~0u << 21)) == 0)
    step = 4;
  else
    // Out of range: bail out instead of writing in front of DST.
    return (0);

  uint32_t ret = (uint32_t)step;

  /* Lead byte marker: STEP high bits set (0xc0, 0xe0 or 0xf0). An
   * unsigned constant is shifted so the result does not depend on how
   * the compiler shifts negative values. */
  *dst = (unsigned char)(0xff00u >> step--);

  // Fill the continuation bytes from last to first, 6 bits at a time.
  do
    {
      dst[step] = (unsigned char)(0x80 | (ch & 0x3f));
      ch >>= 6;
    }
  while (--step > 0);

  // Whatever is left of CH belongs in the lead byte.
  *dst |= (unsigned char)ch;
  return (ret);
}
- result<object> string::make (interpreter *interp, const void *cstr)
- {
- uint32_t len, bytes = ustrlen (cstr, &len);
- object rv = KP_TRY (alloc_str (interp, bytes));
- string *ret = as_str (rv);
- memcpy (ret->data, cstr, bytes);
- ret->len = len;
- kp_return (interp->alval);
- }
- result<object> string::make (interpreter *interp, const void *buf, uint32_t len)
- {
- object rv = KP_TRY (alloc_str (interp, len));
- string *ret = as_str (rv);
- memcpy (ret->data, buf, len);
- ret->len = ustrnlen (buf, len);
- kp_return (interp->alval);
- }
- result<object> reverse_s (interpreter *interp, object obj)
- {
- const string *src = as_str (obj);
- object rv = KP_TRY (alloc_str (interp, src->nbytes));
- string *ret = as_str (rv);
- if (src->nbytes == src->len)
- for (uint32_t i = 0, j = src->nbytes - 1 ; ; --j)
- {
- ret->data[i] = src->data[j];
- if (!j)
- break;
- }
- else
- {
- const unsigned char *inp = src->data;
- unsigned char *outp = ret->data + ret->nbytes;
- while (inp < src->data + src->nbytes)
- {
- uint32_t nl = UTF8_SKIP[*inp];
- fscpy (outp -= nl, inp, nl);
- inp += nl;
- }
- }
- ret->len = src->len;
- kp_return (ret->as_obj ());
- }
- result<object> iter_s (interpreter *interp, object obj, object token, bool adv)
- {
- if (token == UNBOUND)
- kp_return (as_str(obj)->nbytes == 0 ? NIL : fixint (0));
- if (!fixint_p (token))
- return (interp->raise ("type-error", "token must be an int"));
- int ix = as_int (token);
- const unsigned char *dp = as_str(obj)->data + ix;
- if ((*dp & 0xc0) == 0x80)
- return (interp->raise ("arg-error", "invalid token"));
- else if (!adv)
- kp_return (charobj (u8tou32 (dp, UTF8_SKIP[*dp])));
- ix += UTF8_SKIP[*dp];
- kp_return ((uint32_t)ix >= as_str(obj)->nbytes ? NIL : fixint (ix));
- }
- result<object> last_s (interpreter *interp, object obj)
- {
- const string *sp = as_str (obj);
- if (!sp->len)
- return (interp->raise_oob (0, 0));
- auto up = sp->data + sp->nbytes;
- auto vp = up;
- do
- --up;
- while ((*up & 0xc0) == 0x80);
- kp_return (charobj (u8tou32 (up, vp - up)));
- }
- static inline object
- fixup_index (const string *sp, object obj)
- {
- return (fixint_p (obj) ? fixint (stridx (sp, as_int (obj))) : obj);
- }
- result<object> find_s (interpreter *interp, object obj, object key,
- object start, object end, object test)
- {
- const string *src = as_str (obj);
- start = fixup_index (src, start);
- end = fixup_index (src, end);
- local_varobj<bvector> bv;
- unsigned char buf[16];
- if (char_p (key))
- bv.local_init (buf, u32tou8 (buf, as_char (key)));
- else if (!str_p (key))
- return (interp->raise ("type-error", "key must be a string or character"));
- else
- bv.local_init (as_str(key)->data, as_str(key)->nbytes);
- object ret = KP_TRY (find_b (interp, obj, bv.as_obj (), start, end, test));
- if (ret != NIL)
- ret = fixint (stroff (src, as_int (ret)));
- kp_return (ret);
- }
- // Stream interface.
/* Bookkeeping for a stream that reads from, or writes to, a string
 * buffer. An instance is stored in the stream's cookie. */
struct sstream_data
{
  unsigned char *datap;   // Start of the buffer.
  uint32_t curpos,        // Byte offset of the next read or write.
           nmax,          // Size recorded for the buffer.
           nbytes;        // Number of bytes currently in use.
  bool owned_p;           // True if the buffer is ours (freed on close).
};
- static result<int64_t>
- str_read (interpreter *, stream& strm, void *dstp, uint64_t bytes)
- {
- sstream_data *dp = (sstream_data *)strm.cookie;
- unsigned char *ptr = dp->datap + dp->curpos;
- uint32_t rb = (uint32_t)(dp->nbytes - dp->curpos);
- rb = utf8min ((const char *)ptr, min ((uint64_t)rb, bytes));
- memcpy (dstp, ptr, rb);
- dp->curpos += rb;
- return ((int64_t)rb);
- }
- static result<int64_t>
- str_write (interpreter *, stream& strm, const void *src, uint64_t bytes)
- {
- sstream_data *dp = (sstream_data *)strm.cookie;
- if (dp->curpos + bytes >= dp->nmax)
- {
- uint32_t nsz = upsize (dp->curpos + bytes + 1);
- dp->datap = (unsigned char *)xrealloc (dp->datap, dp->nmax = nsz);
- }
- if (dp->curpos + bytes < dp->nbytes)
- { /* Writing to the middle of the string.
- * Be careful to preserve data consistency. */
- unsigned char *p1 = dp->datap + dp->curpos + bytes;
- if ((*p1 & 0xc0) == 0x80)
- { /* Performing the write would lead us to an incomplete
- * character. Move some bytes to make up for this. */
- unsigned char *p2;
- for (p2 = p1 + 1; (*p2 & 0xc0) == 0x80; ++p2) ;
- memmove (p1, p2, dp->nbytes - (p2 - dp->datap));
- dp->nbytes -= p2 - p1;
- }
- }
- memcpy (dp->datap + dp->curpos, src, bytes);
- if ((dp->curpos += bytes) > dp->nbytes)
- dp->nbytes = dp->curpos;
- return ((int64_t)bytes);
- }
- static result<bool>
- str_seek (interpreter *, stream& strm, spos& pos, int whence)
- {
- sstream_data *dp = (sstream_data *)strm.cookie;
- int64_t roff = pos.offset +
- (whence == SEEK_SET ? 0 : whence == SEEK_CUR ?
- dp->curpos : dp->nbytes);
- if (roff < 0)
- return (false);
- else if (roff > dp->nbytes)
- { // Seeking beyond the end of the string.
- if (!(strm.io_flags & STRM_WRITE) || roff > UINT32_MAX)
- return (false);
- else if (roff > dp->nmax)
- dp->datap = (unsigned char *)xrealloc (dp->datap,
- dp->nmax = upsize (roff + 1));
- memset (&dp->datap[dp->nbytes], 0, roff - dp->nbytes);
- }
- else
- // Make sure we don't end up in the middle of a character.
- if ((dp->datap[roff] & 0xc0) == 0x80)
- return (false);
- if ((dp->curpos = (uint32_t)roff) > dp->nbytes)
- dp->nbytes = dp->curpos;
- pos.offset = roff;
- return (true);
- }
- static bool
- str_close (interpreter *, stream& strm)
- {
- sstream_data *dp = (sstream_data *)strm.cookie;
- if (dp->owned_p)
- xfree (dp->datap);
- xfree (dp);
- strm.extra = UNBOUND;
- return (true);
- }
- static const stream::xops str_ops =
- {
- str_read,
- str_write,
- str_seek,
- str_close
- };
- result<stream*> strstream (interpreter *interp, object str, int mode)
- {
- if (!(mode & STRM_RDWR))
- return (nullptr);
- string *sp = as_str (str);
- sstream_data *dp = (sstream_data *)xmalloc (sizeof (*dp));
- mode |= STRM_UTF8;
- if (mode & STRM_WRITE)
- { // Make a copy of the string buffer.
- uint32_t sz = upsize (sp->nbytes + 1);
- dp->datap = (unsigned char *)xmalloc (sz);
- memcpy (dp->datap, sp->data, sp->nbytes);
- dp->datap[sp->nbytes] = '\0';
- dp->nmax = sz;
- dp->owned_p = true;
- }
- else
- { /* We can use the string buffer itself, but make sure to
- * save the string object in the stream 'extra' member. */
- dp->datap = sp->data;
- dp->nmax = sp->nbytes;
- dp->owned_p = false;
- }
- dp->nbytes = sp->nbytes;
- dp->curpos = 0;
- auto strm = stream::make (interp, mode, STRM_BUFSIZ, &str_ops, dp);
- if (strm.error_p ())
- {
- if (dp->owned_p)
- xfree (dp->datap);
- xfree (dp);
- return (exception ());
- }
- stream *ret = deref (strm);
- if (!(mode & STRM_WRITE))
- ret->extra = str;
- return (ret);
- }
- result<object> sstream_get (interpreter *interp, stream *strm)
- {
- if (strm->io_flags & STRM_CLOSED)
- return (interp->raise ("arg-error", "stream has been closed"));
- else if (!(strm->io_flags & STRM_WRITE))
- kp_return (strm->extra); // Cached string.
- bool rv = KP_TRY (strm->flush (interp));
- if (!rv)
- return (interp->raise ("io-error", "failed to flush stream"));
-
- // Make up the string from the accumulated bytes.
- sstream_data *dp = (sstream_data *)strm->cookie;
- return (string::make (interp, (const char *)dp->datap, dp->nbytes));
- }
- result<int64_t> write_s (interpreter *interp, stream *strm,
- object obj, io_info& info)
- {
- const string *sp = as_str (obj);
- if (info.flags & io_info::FLG_RAW)
- return (strm->write (interp, sp->data, sp->nbytes));
- // Bitmask of special characters.
- chmask mask ("\"\n\t\r\a\b\\\0", 8);
- int64_t ret = KP_TRY (strm->putb (interp, '"'));
- for (auto p = sp->data; p < sp->data + sp->nbytes; )
- {
- if (kp_likely (!mask.tst (*p)))
- {
- uint32_t len = UTF8_SKIP[*p];
- ret += KP_TRY (strm->write (interp, p, len));
- p += len;
- continue;
- }
- char buf[2] = { '\\' };
- switch (*p++)
- {
- case '"':
- buf[1] = '"';
- break;
- case '\n':
- buf[1] = 'n';
- break;
- case '\t':
- buf[1] = 't';
- break;
- case '\r':
- buf[1] = 'r';
- break;
- case '\a':
- buf[1] = 'a';
- break;
- case '\b':
- buf[1] = 'b';
- break;
- case '\\':
- buf[1] = '\\';
- break;
- case '\0':
- buf[1] = '0';
- break;
- }
- ret += KP_TRY (strm->write (interp, buf, 2));
- }
- ret += KP_TRY (strm->putb (interp, '"'));
- return (ret);
- }
- KP_EXPORT const char* chobj_repr (object);
- result<int64_t> write_c (interpreter *interp, stream *strm,
- object obj, io_info& info)
- {
- uint32_t ch = as_char (obj);
- if (info.flags & io_info::FLG_RAW)
- return (strm->putuc (interp, as_char (obj)));
- int ret = KP_TRY (strm->putb (interp, '\\'));
- const char *repr = chobj_repr (ch);
- if (repr != nullptr)
- { ret += KP_TRY (strm->write (interp, repr, strlen (repr))); }
- else if (ch <= 0x7f)
- { ret += KP_TRY (strm->putb (interp, (unsigned char)ch)); }
- else
- { // Beyond ASCII - Print it as uXXXX
- char buf[8];
- int len = sprintf (buf, "u%.*x", ch <= 0xffff ? 4 : 8, ch);
- ret += KP_TRY (strm->write (interp, buf, len));
- }
- return (ret);
- }
- result<int64_t> pack_c (interpreter *interp, stream *strm,
- object obj, pack_info&)
- {
- uint32_t ch = as_char (obj);
- return (strm->write (interp, &ch, sizeof (ch)));
- }
- result<object> unpack_c (interpreter *interp, stream *strm, pack_info& info, bool)
- {
- uint32_t ch;
- bool rv = KP_TRY (strm->sread (interp, &ch));
- if (rv)
- kp_return (charobj (ch));
- return (info.error ("invalid char read"));
- }
- struct fmt_info
- {
- io_info io;
- int arg_idx;
- int argc;
- int lidx;
- int spec;
- chmask spec_mask;
- chmask flg_mask;
- const char *last_pos;
- fmt_info (int nargs) : io (io_info::FLG_RAW), argc (nargs), lidx (1),
- spec_mask ("dxXofFgGeEaAscQ"), flg_mask ("#0- +'I")
- {
- }
- result<int> parse (interpreter *interp, const char *str, int nb)
- {
- this->arg_idx = this->spec = -1;
- while (this->flg_mask.tst (*str))
- switch (*str++)
- {
- case '#':
- this->io.flags |= io_info::FLG_ALT;
- break;
- case '0':
- this->io.flags |= io_info::FLG_ZERO;
- break;
- case '-':
- this->io.flags |= io_info::FLG_LJUST;
- break;
- case ' ':
- this->io.flags |= io_info::FLG_SPACE;
- break;
- case '+':
- this->io.flags |= io_info::FLG_SIGN;
- break;
- case '\'':
- this->io.flags |= io_info::FLG_I18N;
- break;
- default:
- break;
- }
- char *endp = 0;
- long val = strtol (str, &endp, 10);
- if (val == 0 && endp == str)
- goto end;
- this->io.width = (int)val;
- str = endp;
- if (*str == '@')
- {
- if (val >= this->argc)
- return (interp->raise_oob (val, this->argc));
- this->io.width = io_info::DFL_WIDTH;
- this->arg_idx = (int)val;
- val = strtol (++str, &endp, 10);
- if (val == 0 && endp == str && this->spec_mask.tst (*str))
- goto end;
- else if (val >= 0)
- this->io.width = (int)val;
- str = endp;
- }
- if (*str == '.')
- {
- val = strtol (++str, &endp, 10);
- if (val == 0 && endp == str)
- return (interp->raise ("arg-error", "invalid format string"));
- else if (val > 0)
- this->io.prec = (int)val;
- str = endp;
- }
- end:
- if (this->arg_idx < 0 && (this->arg_idx = this->lidx++) >= this->argc)
- return (interp->raise_oob (this->lidx, this->argc));
- switch (*str++)
- {
- case 'x': case 'X':
- this->io.radix = *str == 'x' ? 16 : -16;
- break;
- case 'o':
- this->io.radix = 8;
- break;
- case 'a': case 'A':
- this->io.radix = *str == 'a' ? 16 : -16;
- break;
- case 'd': case 'f': case 'F': case 'g': case 'G':
- case 'e': case 'E': case 'c': case 's': case 'Q':
- break;
- default:
- return (interp->raise ("arg-error", "invalid specifier"));
- }
- this->last_pos = str;
- return (0);
- }
- };
- result<object> p_fmt_str (interpreter *interp, object *argv, int argc)
- {
- if (!str_p (*argv))
- return (interp->raise ("type-error", "first argument must be a string"));
- const string *sp = as_str (*argv);
- fmt_info fi (argc);
- stream *strm = KP_TRY (strstream (interp, deref (alloc_str (interp, 0)),
- STRM_WRITE | STRM_NOLOCK));
- for (uint32_t i = 0; i < sp->nbytes; )
- {
- const char *csp = (const char *)&sp->data[i];
- if (*csp != '%')
- {
- KP_VTRY (strm->write (interp, csp, UTF8_SKIP[(uint8_t)*csp]));
- i += UTF8_SKIP[(uint8_t)*csp];
- continue;
- }
- else if (*++csp == '%')
- {
- KP_VTRY (strm->putb (interp, '%'));
- i += 2;
- continue;
- }
- KP_VTRY (fi.parse (interp, csp,
- sp->nbytes - (csp - (const char *)sp->data)),
- // TODO: Convert according to fi.spec.
- xwrite (interp, strm, argv[fi.arg_idx], fi.io));
- i += fi.last_pos - csp + 1;
- }
- return (sstream_get (interp, strm));
- }
- KP_DECLS_END
|