|
@@ -0,0 +1,153 @@
|
|
|
+/* sHT -- find lexeme boundaries
|
|
|
+ Copyright (C) 2019 Ariadne Devos
|
|
|
+
|
|
|
+ This program is free software: you can redistribute it and/or modify
|
|
|
+ it under the terms of the GNU General Public License as published by
|
|
|
+ the Free Software Foundation, either version 3 of the License, or
|
|
|
+ (at your option) any later version.
|
|
|
+
|
|
|
+ This program is distributed in the hope that it will be useful,
|
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
+ GNU General Public License for more details.
|
|
|
+
|
|
|
+ You should have received a copy of the GNU General Public License
|
|
|
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
|
|
|
+
|
|
|
+#include <sHT/bitvec.h>
|
|
|
+#include <sHT/index.h>
|
|
|
+#include <sHT/lex.h>
|
|
|
+#include <sHT/string.h>
|
|
|
+#include <sHT/taint.h>
|
|
|
+#include <sHT/test.h>
|
|
|
+
|
|
|
+size_t
|
|
|
+sHT_lex(struct sHT_lex_buf *to, const unsigned char from[], size_t n, const struct sHT_lex_type *c)
|
|
|
+{
|
|
|
+ /* The current offset into @var{from}. (Set later). */
|
|
|
+ size_t i;
|
|
|
+ /* The old number of accumulated bytes,
|
|
|
+ therefore, the index into @var{to->bytes} to start writing to. */
|
|
|
+ size_t offset = to->offset;
|
|
|
+ /* The following loop: iterate over the bytes of @var{from}, to
|
|
|
+ validate their syntax and copy them -- and process a fragment when
|
|
|
+ complete. However, not all bytes, as only @code{c->max_known} are
|
|
|
+ allocated.
|
|
|
+
|
|
|
+ @var{offset}: data from previous runs is remembered. */
|
|
|
+ /* Underflow 1: @var{sHT_lex_buf} invariant.
|
|
|
+ Bounds 0 -> @var{sHT_parser} precondition.
|
|
|
+ Bounds 1 -> type widths in @var{c} and @var{to} */
|
|
|
+ size_t todo = sHT_min_size(n, c->max_known - offset);
|
|
|
+ /* Induct over byte locations, until a space character, a syntax error
|
|
|
+ or the method is found to be too long to be known. */
|
|
|
+ /* (1) todo <= @var{n} (@var{sHT_min_size}),
|
|
|
+ (2) n < SSIZE_MAX,
|
|
|
+ (1, 2) => todo < SSIZE_MAX.
|
|
|
+ QED @var{sHT_index_iterate} max bounds.
|
|
|
+
|
|
|
+ (1) @var{n} != 0 (precondition)
|
|
|
+ (2) offset < max_known
|
|
|
+ (2) => (3) 0 < max_known - offset
|
|
|
+ (1, 3): QED @var{sHT_index_iterate} positivity. */
|
|
|
+ /* Invariant: byte offset to offset + i (exclusive) of
|
|
|
+ @code{buf->bytes} are set. Base case: trivial. */
|
|
|
+ sHT_index_iterate(i, todo) {
|
|
|
+ /* If zero @var{n} were allowed, this would be out of bounds */
|
|
|
+ /* (1) i < todo (@var{sHT_index_iterate})
|
|
|
+ (2) todo <= n (@var{sHT_min_size})
|
|
|
+ (1, 2) => i < n
|
|
|
+ QED @var{from} length */
|
|
|
+ uint8_t b = from[i];
|
|
|
+ /* (1) i < todo (@var{sHT_index_iterate})
|
|
|
+ (2) todo <= max_known - offset (@var{sHT_min_size})
|
|
|
+ (1, 2) => (3) i < max_known - offset
|
|
|
+ (3) => offset + i < max_known
|
|
|
+ QED @var{to} capacity.
|
|
|
+
|
|
|
+ QED induction step (is set). */
|
|
|
+ to->bytes[offset + i] = b;
|
|
|
+ if (sHT_bit_test(c->c_allow, b)) {
|
|
|
+ /* Correct byte, but not a terminator.
|
|
|
+ Continue the search. */
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ /* Non-speculatively, @var{b} is not one of the allowed
|
|
|
+ bytes. Either it is the terminator, or a syntax error.
|
|
|
+ Which one? (0: syntax error, 1: terminator)*/
|
|
|
+ int which = sHT_eq_bool(c->c_stop, b);
|
|
|
+ /* Not used anymore; taint for analysis */
|
|
|
+ sHT_taint(&to->offset);
|
|
|
+ /* +1: also count the terminating byte
|
|
|
+ (<tests/lex.c> found this bug)
|
|
|
+
|
|
|
+ (1) i < todo (@var{sHT_index_iterate}),
|
|
|
+ (2) todo <= n (@var{sHT_index_iterate})
|
|
|
+ (1, 2) => (3) i < n.
|
|
|
+ (3) => (4) i + 1 <= n
|
|
|
+ QED bounds last argument
|
|
|
+
|
|
|
+ (1) i < todo (@var{sHT_index_iterate}),
|
|
|
+ (2) todo <= max_known - offset (@var{sHT_min_size})
|
|
|
+ (1, 2) => (3) i <= max_known - offset
|
|
|
+ (3) => (4) offset + i <= max_known
|
|
|
+
|
|
|
+ QED length/index bounds */
|
|
|
+ return c->cb_value[which](to, offset + i, i + 1);
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Compare the number of running total of tested bytes with the
|
|
|
+ maximal known lexeme length. If it the former begins to equal
|
|
|
+ the latter, there is no point in copying anymore, but the
|
|
|
+ syntax must still be validated. */
|
|
|
+ /* Overflow:
|
|
|
+
|
|
|
+ (1) i < todo (@var{sHT_index_iterate}),
|
|
|
+ (2) todo <= max_known - offset (@var{sHT_min_size})
|
|
|
+ (1, 2) => (3) i <= max_known - offset
|
|
|
+ (3) => (4) offset + i <= max_known
|
|
|
+ (5) max_known < SSIZE_MAX (@var{sHT_lex_type})
|
|
|
+ (4, 5) => offset + i < SSIZE_MAX
|
|
|
+
|
|
|
+ QED no overflow */
|
|
|
+ if (sHT_ge(offset + i, c->max_known)) {
|
|
|
+ /* Not used anymore; taint for analysis */
|
|
|
+ sHT_taint(&to->offset);
|
|
|
+ sHT_taint(&to->bytes[0]);
|
|
|
+ return c->cb_ignore(to, i);
|
|
|
+ }
|
|
|
+
|
|
|
+ /* More bytes must be read before the lexeme is complete.
|
|
|
+ Proof of progress (i = n) (non-speculatively):
|
|
|
+
|
|
|
+ (1) offset + i < max_known (@var{sHT_ge})
|
|
|
+ (2) i = todo (@var{sHT_index_iterate})
|
|
|
+ (1) => (4) i < max_known - offset
|
|
|
+ (2, 4) => (5) todo < max_known - offset
|
|
|
+ (5) => todo = n (@var{sHT_min_size})
|
|
|
+ (2, 5) => i = n
|
|
|
+
|
|
|
+ QED progress */
|
|
|
+ /* Remember the number of copied bytes */
|
|
|
+ /* Overflow/bounds:
|
|
|
+
|
|
|
+ (1) i <= todo (@var{sHT_index_iterate}),
|
|
|
+ (2) todo <= max_known - offset (@var{sHT_min_size})
|
|
|
+ (1, 2) => (3) i <= max_known - offset
|
|
|
+ (3) => (4) offset + i <= max_known
|
|
|
+ QED bounds; continue overflow
|
|
|
+
|
|
|
+ (5) max_known < UINT16_MAX (@var{uint16_t})
|
|
|
+ (4, 5) => offset + i < UINT16_MAX
|
|
|
+
|
|
|
+ QED overflow */
|
|
|
+ to->offset += i;
|
|
|
+ /* Bounds:
|
|
|
+
|
|
|
+ (1) i <= todo (@var{sHT_index_iterate})
|
|
|
+ (2) todo <= n (@var{sHT_min_size})
|
|
|
+ (1, 2) => i <= n
|
|
|
+
|
|
|
+ QED bounds */
|
|
|
+ return i;
|
|
|
+}
|