123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138 |
- // SPDX-License-Identifier: GPL-2.0 or GPL-3.0
- // Copyright © 2019 Ariadne Devos
- /* sHT -- find lexeme boundaries */
- #include <sHT/bitvec.h>
- #include <sHT/index.h>
- #include <sHT/lex.h>
- #include <sHT/string.h>
- #include <sHT/taint.h>
- #include <sHT/test.h>
- size_t
- sHT_lex(struct sHT_lex_buf *to, const unsigned char from[], size_t n, const struct sHT_lex_type *c)
- {
- /* The current offset into @var{from}. (Set later). */
- size_t i;
- /* The old number of accumulated bytes,
- therefore, the index into @var{to->bytes} to start writing to. */
- size_t offset = to->offset;
- /* The following loop: iterate over the bytes of @var{from}, to
- validate their syntax and copy them -- and process a fragment when
- complete. However, not all bytes, as only @code{c->max_known} are
- allocated.
- @var{offset}: data from previous runs is remembered. */
- /* Underflow 1: `sHT_lex_buf` invariant (`offset` < `max_known`) */
- size_t todo = sHT_min_size(n, c->max_known - offset);
- /* Induct over byte locations, until a space character, a syntax error
- or the method is found to be too long to be known. */
- /* (1) `n` > 0 (`sHT_lex` precondition)
- (1) => (2) 0 < `n`
- (3) `offset` < `max_known` (`sHT_lex_buf` invariant)
- (3) => (4) 0 < `max_known` - `offset`
- (2, 4) => (5) 0 < min(`n`, `max_known` - `offset`)
- (5): QED `sHT_index_iterate` positivity (TODO: do ... while loop) */
- /* Invariant: byte `offset` to `offset` + `i` (exclusive) of
- @code{buf->bytes} are set. Base case: trivial. */
- sHT_index_iterate(i, todo) {
- /* If zero @var{n} were allowed, this would be out of bounds */
- /* (1) i < todo (@var{sHT_index_iterate})
- (2) todo <= n (@var{sHT_min_size})
- (1, 2) => i < n
- QED @var{from} length */
- uint8_t b = from[i];
- /* (1) i < todo (@var{sHT_index_iterate})
- (2) todo <= max_known - offset (@var{sHT_min_size})
- (1, 2) => (3) i < max_known - offset
- (3) => offset + i < max_known
- QED @var{to} capacity.
- QED induction step (is set). */
- to->bytes[offset + i] = b;
- if (sHT_bit_test(c->c_allow, b)) {
- /* Correct byte, but not a terminator.
- Continue the search. */
- continue;
- }
- /* Non-speculatively, @var{b} is not one of the allowed
- bytes. Either it is the terminator, or a syntax error.
- Which one? (0: syntax error, 1: terminator)*/
- int which = sHT_eq_bool(c->c_stop, b);
- /* Not used anymore; taint for analysis */
- sHT_taint(&to->offset);
- /* +1: also count the terminating byte
- (<tests/lex.c> found this bug)
- (1) i < todo (@var{sHT_index_iterate}),
- (2) todo <= n (@var{sHT_index_iterate})
- (1, 2) => (3) i < n.
- (3) => (4) i + 1 <= n
- QED bounds last argument
- (1) i < todo (@var{sHT_index_iterate}),
- (2) todo <= max_known - offset (@var{sHT_min_size})
- (1, 2) => (3) i <= max_known - offset
- (3) => (4) offset + i <= max_known
- QED length/index bounds */
- return c->cb_value[which](to, offset + i, i + 1);
- }
- /* Compare the number of running total of tested bytes with the
- maximal known lexeme length. If it the former begins to equal
- the latter, there is no point in copying anymore, but the
- syntax must still be validated. */
- /* Overflow:
- (1) i < todo (@var{sHT_index_iterate}),
- (2) todo <= max_known - offset (@var{sHT_min_size})
- (1, 2) => (3) i <= max_known - offset
- (3) => (4) offset + i <= max_known
- (5) `max_known` <= `SIZE_MAX` (typing)
- (3, 5) => `offset` + `i` <= `SIZE_MAX`
- QED no overflow */
- if (sHT_ge(offset + i, c->max_known)) {
- /* Not used anymore; taint for analysis */
- sHT_taint(&to->offset);
- sHT_taint(&to->bytes[0]);
- return c->cb_ignore(to, i);
- }
- /* More bytes must be read before the lexeme is complete.
- Proof of progress (i = n) (non-speculatively):
- (1) offset + i < max_known (@var{sHT_ge})
- (2) i = todo (@var{sHT_index_iterate})
- (1) => (4) i < max_known - offset
- (2, 4) => (5) todo < max_known - offset
- (5) => todo = n (@var{sHT_min_size})
- (2, 5) => i = n
- QED progress */
- /* Remember the number of copied bytes */
- /* Overflow/bounds:
- (1) i <= todo (@var{sHT_index_iterate}),
- (2) todo <= max_known - offset (@var{sHT_min_size})
- (1, 2) => (3) i <= max_known - offset
- (3) => (4) offset + i <= max_known
- QED bounds; continue overflow
- (5) max_known < UINT16_MAX (@var{uint16_t})
- (4, 5) => offset + i < UINT16_MAX
- QED overflow */
- to->offset += i;
- /* Bounds:
- (1) i <= todo (@var{sHT_index_iterate})
- (2) todo <= n (@var{sHT_min_size})
- (1, 2) => i <= n
- QED bounds */
- return i;
- }
|