// SPDX-License-Identifier: GPL-2.0 or GPL-3.0
// Copyright © 2019 Ariadne Devos
/* sHT -- find lexeme boundaries */
#ifndef _sHT_LEX_H
#define _sHT_LEX_H

#include <stddef.h>
#include <stdint.h>

/** Detecting lexeme boundaries

  This module detects lexeme boundaries for variable-length
  strings of bytes belonging to a certain class, followed by
  a terminator byte.

  TODO: 'ignore' variant. */
/** Accumulates bytes

  A buffer is always interpreted in the context of a
  @var{struct sHT_lex_type} named @var{c}. It is disjoint from @var{c}.

  @var{bytes} is a flexible array member: storage for
  @code{c->max_known} bytes must follow the header. */
struct sHT_lex_buf
{
	/** The number of accumulated bytes and, therefore, the number
	  of meaningful bytes in @var{bytes} (R/W, not accessed concurrently,
	  less than @code{c->max_known}). */
	uint16_t offset;
	/** The accumulated bytes (R/W, not accessed concurrently).
	  Its capacity is @code{c->max_known} and its length is @var{offset}. */
	unsigned char bytes[];
};
/** Lexing parameters

  Describes what a lexeme looks like and which callbacks receive the result.
  The lexeme boundary and length detection may speculatively be incorrect.
  All fields are readable and read-only. */
struct sHT_lex_type
{
	/** Lex a string into its syntactical element
	  (at least, that's the common use case)

	  Index 0 is for syntax errors, index 1 for well-formed strings.
	  For the former, @var{n} is the index of the syntax error.
	  NOTE(review): for the latter, the original text also spoke of
	  `the index of the offending byte'; presumably the index of the
	  terminator was meant -- confirm against the implementation.

	  @var{to}: the first argument passed to @var{sHT_lex}
	  @var{n}: the length of the string to lex,
	    non-speculatively excluding the terminator
	    (not greater than @var{max_known}).
	  @var{ret}: the number of bytes @var{sHT_lex} parsed */
	size_t (* cb_value[2])(struct sHT_lex_buf *to, size_t n, size_t ret);
	/** The lexeme is longer than any known

	  @var{to}: the first argument passed to @var{sHT_lex}
	  @var{ret}: the number of bytes @var{sHT_lex} parsed */
	size_t (* cb_ignore)(struct sHT_lex_buf *to, size_t ret);
	/** The lexeme is longer than any known, but it has been parsed

	  Index 0 is for syntax errors, index 1 for well-formed strings.
	  The terminator or syntax error is included in @var{ret}.

	  @var{ret}: the number of bytes @var{sHT_lex_skip} parsed
	  @var{x}: the last argument passed to @var{sHT_lex_skip} */
	size_t (* cb_skip_done[2])(size_t ret, void *x);
	/** A byte class represented by a readable bitvector, not modified
	  concurrently (probably not at all), indexed by the byte to test
	  for its well-formedness. If the bit is set, the byte is within
	  the class, otherwise, it isn't. */
	const unsigned char *c_allow;
	/** The maximal length of any known lexeme, including the terminating
	  @var{c_stop} byte (positive) */
	uint_least16_t max_known;
	/** The terminator byte. Does not belong to @var{c_allow}. */
	unsigned char c_stop;
};
/* Lexeme lengths are bounded by an uint_least16_t (@code{max_known}) but are
   handed to the callbacks as size_t, so size_t must be able to hold them. */
_Static_assert(SIZE_MAX >= UINT_LEAST16_MAX, "size_t is too small!");
/** Find the lexeme boundary of a scattered string

  @var{to}: a buffer to accumulate bytes to
  @var{from}: a string to take bytes from, readable, not modified concurrently
  @var{n}: the length of @var{from} (positive)
  @var{c}: what does a lexeme look like, and what to do when?

  Bytes are accumulated into @var{to}.
  @var{from} is disjoint from @var{to} and @var{c}.
  Before the first call, @code{to->offset} must be set to zero.

  If not calling into @var{c}, return the number of parsed bytes, including
  the terminator, if any.
  Speculatively, the boundaries and syntax error detection may be incorrect. */
size_t
sHT_lex(struct sHT_lex_buf *to, const unsigned char from[], size_t n, const struct sHT_lex_type *c);
/** Skip some bytes of @var{from}

  @var{from}: a readable buffer, not modified concurrently, to ignore
  @var{n}: the length of @var{from} (positive)
  @var{c}: what does a lexeme look like, and what to do when?
  @var{x}: ignored, may be used by @var{c} callbacks

  This function does not modify anything, except for what its tail-callees
  do. @code{c->cb_skip_done} may be tail-called. Otherwise, return the
  number of parsed bytes.
  The syntax and terminator detection may speculatively be incorrect. */
size_t
sHT_lex_skip(const unsigned char from[], size_t n, const struct sHT_lex_type *c, void *x);

#endif /* _sHT_LEX_H */