13 Commits 2d08a8fd0c ... 47536a244f

Author SHA1 Message Date
  Ariadne Devos 47536a244f Recognise some URI schemes 5 years ago
  Ariadne Devos 797c01cbf7 Merge branch 'odd-fixes' into webdav 5 years ago
  Ariadne Devos 2cee7d37fb Allow lexeme boundary detection without state 5 years ago
  Ariadne Devos 0407ba6493 Fix character sign in sHT_lex 5 years ago
  Ariadne Devos 2d05da98d4 Add missing <sHT/lex.h> 5 years ago
  Ariadne Devos 3e77bbb18d Merge branch 'odd-fixes' into webdav 5 years ago
  Ariadne Devos b6efb0a5e3 Make sHT_index_iterate invariant stronger 5 years ago
  Ariadne Devos 6fce1107df Introduce sHT_lex, for lexing 5 years ago
  Ariadne Devos 9ba570a5fe Avoid unhelpful compiler warning 5 years ago
  Ariadne Devos ab3d5aea1e Correct identifier typo in <sHT/taint.h> 5 years ago
  Ariadne Devos 50dc74600b Introduce equality operator returning integer 5 years ago
  Ariadne Devos c3fcca0077 Compile tainting out by default 5 years ago
  Ariadne Devos c8a5b58e45 Syntax fix <sHT/taint.h> 5 years ago
10 changed files with 397 additions and 14 deletions
  1. 9 8
      Makefile.am
  2. 153 0
      buffer/lex.c
  3. 49 0
      buffer/skip.c
  4. 28 0
      http/scheme.sh
  5. 3 0
      sHT/index.h
  6. 128 0
      sHT/lex.h
  7. 7 6
      sHT/taint.h
  8. 4 0
      sHT/test-arch.h
  9. 16 0
      sHT/test.h
  10. 0 0
      sHT/web/uri.h

+ 9 - 8
Makefile.am

@@ -13,8 +13,11 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Pattern (%) rules are a GNUism, unless POSIX allows them (could not
+# check: no Internet at the moment). So no -Wportability.
 AUTOMAKE_OPTIONS = subdir-objects foreign no-dist \
- -Wobsolete -Woverride -Wportability -Wunsupported -Werror
+ -Wobsolete -Woverride -Wunsupported -Werror
 
 AM_CPPFLAGS = -D_GNU_SOURCE
 
@@ -22,11 +25,14 @@ bin_PROGRAMS = shttpd
 shtsources = \
   bitops/zero-index/any.c \
   buffer/append.c \
+  buffer/lex.c \
   buffer/memeq.c \
   buffer/phash.c \
+  buffer/skip.c \
   fd/fd.c \
   fd/inet.c \
   generic/bug.c \
+  http/scheme-hash.c \
   http/method-hash.c \
   http/header-hash.c \
   task/accept.c \
@@ -41,13 +47,8 @@ shtsources = \
   worker/schedule.c \
   worker/vector.c
 
-phash_recipe = bash $(srcdir)/buffer/gen-phash.sh --gen $< > $@T && mv $@T $@
-
-http/method-hash.c: http/methods.sh buffer/gen-phash.sh
-	${phash_recipe}
-
-http/header-hash.c: http/headers.sh buffer/gen-phash.sh
-	${phash_recipe}
+%-hash.c: %.sh buffer/gen-phash.sh
+	bash $(srcdir)/buffer/gen-phash.sh --gen $< > $@T && mv $@T $@
 
 worker/objcache.c: worker/objcache-bless-table.c
 	touch -c $@

+ 153 - 0
buffer/lex.c

@@ -0,0 +1,153 @@
+/* sHT -- find lexeme boundaries
+   Copyright (C) 2019 Ariadne Devos
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <sHT/bitvec.h>
+#include <sHT/index.h>
+#include <sHT/lex.h>
+#include <sHT/string.h>
+#include <sHT/taint.h>
+#include <sHT/test.h>
+
+size_t
+sHT_lex(struct sHT_lex_buf *to, const unsigned char from[], size_t n, const struct sHT_lex_type *c)
+{
+	/* The current offset into @var{from}. (Set later). */
+	size_t i;
+	/* The old number of accumulated bytes,
+	   therefore, the index into @var{to->bytes} to start writing to. */
+	size_t offset = to->offset;
+	/* The following loop: iterate over the bytes of @var{from}, to
+	  validate their syntax and copy them -- and process a fragment when
+	  complete. However, not all bytes, as only @code{c->max_known} are
+	  allocated.
+
+	  @var{offset}: data from previous runs is remembered. */
+	/* Underflow 1: @var{sHT_lex_buf} invariant.
+	  Bounds 0 -> @var{sHT_parser} precondition.
+	  Bounds 1 -> type widths in @var{c} and @var{to} */
+	size_t todo = sHT_min_size(n, c->max_known - offset);
+	/* Induct over byte locations, until the terminator (@code{c->c_stop}),
+	  a syntax error or the lexeme is found to be too long to be known. */
+	/* (1) todo <= @var{n} (@var{sHT_min_size}),
+	  (2) n < SSIZE_MAX,
+	  (1, 2) => todo < SSIZE_MAX.
+	  QED @var{sHT_index_iterate} max bounds.
+
+	  (1) @var{n} != 0 (precondition)
+	  (2) offset < max_known
+	  (2) => (3) 0 < max_known - offset
+	  (1, 3): QED @var{sHT_index_iterate} positivity. */
+	/* Invariant: bytes offset to offset + i (exclusive) of
+	  @code{to->bytes} are set. Base case: trivial. */
+	sHT_index_iterate(i, todo) {
+		/* If zero @var{n} were allowed, this would be out of bounds */
+		/* (1) i < todo (@var{sHT_index_iterate})
+		  (2) todo <= n (@var{sHT_min_size})
+		  (1, 2) => i < n
+		  QED @var{from} length */
+		uint8_t b = from[i];
+		/* (1) i < todo (@var{sHT_index_iterate})
+		  (2) todo <= max_known - offset (@var{sHT_min_size})
+		  (1, 2) => (3) i < max_known - offset
+		  (3) => offset + i < max_known
+		  QED @var{to} capacity.
+
+		  QED induction step (is set). */
+		to->bytes[offset + i] = b;
+		if (sHT_bit_test(c->c_allow, b)) {
+			/* Correct byte, but not a terminator.
+			  Continue the search. */
+			continue;
+		}
+		/* Non-speculatively, @var{b} is not one of the allowed
+		  bytes. Either it is the terminator, or a syntax error.
+		  Which one? (0: syntax error, 1: terminator)*/
+		int which = sHT_eq_bool(c->c_stop, b);
+		/* Not used anymore; taint for analysis */
+		sHT_taint(&to->offset);
+		/* +1: also count the terminating byte
+		  (<tests/lex.c> found this bug)
+
+		  (1) i < todo (@var{sHT_index_iterate}),
+		  (2) todo <= n (@var{sHT_min_size})
+		  (1, 2) => (3) i < n.
+		  (3) => (4) i + 1 <= n
+		  QED bounds last argument
+
+		  (1) i < todo (@var{sHT_index_iterate}),
+		  (2) todo <= max_known - offset (@var{sHT_min_size})
+		  (1, 2) => (3) i <= max_known - offset
+		  (3) => (4) offset + i <= max_known
+
+		  QED length/index bounds */
+		return c->cb_value[which](to, to->bytes, offset + i, i + 1);
+	}
+
+	/* Compare the running total of tested bytes with the maximal
+	  known lexeme length. If the former has come to equal the
+	  latter, there is no point in copying anymore, but the
+	  syntax must still be validated. */
+	/* Overflow:
+
+	  (1) i <= todo (@var{sHT_index_iterate}),
+	  (2) todo <= max_known - offset (@var{sHT_min_size})
+	  (1, 2) => (3) i <= max_known - offset
+	  (3) => (4) offset + i <= max_known
+	  (5) max_known < SSIZE_MAX (@var{sHT_lex_type})
+	  (4, 5) => offset + i < SSIZE_MAX
+
+	  QED no overflow */
+	if (sHT_ge(offset + i, c->max_known)) {
+		/* Not used anymore; taint for analysis */
+		sHT_taint(&to->offset);
+		sHT_taint(&to->bytes[0]);
+		return c->cb_ignore(to, i);
+	}
+
+	/* More bytes must be read before the lexeme is complete.
+	  Proof of progress (i = n) (non-speculatively):
+
+	  (1) offset + i < max_known (@var{sHT_ge})
+	  (2) i = todo (@var{sHT_index_iterate})
+	  (1) => (3) i < max_known - offset
+	  (2, 3) => (4) todo < max_known - offset
+	  (4) => (5) todo = n (@var{sHT_min_size})
+	  (2, 5) => i = n
+
+	  QED progress */
+	/* Remember the number of copied bytes */
+ 	/* Overflow/bounds:
+
+	  (1) i <= todo (@var{sHT_index_iterate}),
+	  (2) todo <= max_known - offset (@var{sHT_min_size})
+	  (1, 2) => (3) i <= max_known - offset
+	  (3) => (4) offset + i <= max_known
+	  QED bounds; continue overflow
+
+	  (5) max_known < UINT16_MAX (@var{uint16_t})
+	  (4, 5) => offset + i < UINT16_MAX
+
+	  QED overflow */
+	to->offset += i;
+	/* Bounds:
+
+	  (1) i <= todo (@var{sHT_index_iterate})
+	  (2) todo <= n (@var{sHT_min_size})
+	  (1, 2) => i <= n
+
+	  QED bounds */
+	return i;
+}

+ 49 - 0
buffer/skip.c

@@ -0,0 +1,49 @@
+/* sHT -- find a byte in a string
+   Copyright (C) 2019 Ariadne Devos
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <limits.h>
+#include <stddef.h>
+#include <sHT/bitvec.h>
+#include <sHT/index.h>
+#include <sHT/lex.h>
+#include <sHT/test.h>
+
+size_t
+sHT_lex_skip(const unsigned char from[], size_t n, const struct sHT_lex_type *c, void *x)
+{
+	/* TODO: word-at-a-time */
+	/* This can be assigned to the return register */
+	size_t i;
+	/* TODO: variant which always performs one iteration
+	   (less branching, shorter code) */
+	sHT_index_iterate(i, n) {
+		/* bounds: @var{sHT_index_iterate} */
+		unsigned char b = from[i];
+		if (sHT_bit_test(c->c_allow, b))
+			continue;
+		/* A syntax error or the terminator. */
+		/* 1: terminator, 0: syntax error */
+		int which = sHT_eq_bool(c->c_stop, b);
+		/* In any case, pass the number of skipped/parsed bytes,
+		  not the index of the last. */
+		i++;
+		return c->cb_skip_done[which](i, x);
+	}
+	/* ‘The detection may speculatively be incorrect.’
+	  Any of @var{i} and @var{n} would do, but @var{i} produced
+	  smaller code on x86-64 SystemV (153 < 155). */
+	return i;
+}

+ 28 - 0
http/scheme.sh

@@ -0,0 +1,28 @@
+prepare s2_scheme
+pre '/* s^2 - hash URI schemes'
+pre '   Copyright (C) 2019 Ariadne Devos'
+pre ''
+pre '   This program is free software: you can redistribute it and/or modify'
+pre '   it under the terms of the GNU General Public License as published by'
+pre '   the Free Software Foundation, either version 3 of the License, or'
+pre '   (at your option) any later version.'
+pre ''
+pre '   This program is distributed in the hope that it will be useful,'
+pre '   but WITHOUT ANY WARRANTY; without even the implied warranty of'
+pre '   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the'
+pre '   GNU General Public License for more details.'
+pre ''
+pre '   You should have received a copy of the GNU General Public License'
+pre '   along with this program.  If not, see <http://www.gnu.org/licenses/>. */'
+pre ''
+pre '#include <sHT/web/uri.h>'
+
+declare scheme
+# For specifications, see <sHT/web/uri.h>
+# Only real URI schemes
+for scheme in ipfs http https; do
+	# upcase enum identifiers
+	entry "$scheme" "S2_SCHEME_${scheme^^}"
+done
+
+solve

+ 3 - 0
sHT/index.h

@@ -58,6 +58,9 @@
   A lower number of iterations may be done speculatively. Afterwards,
   speculatively do some extra iterations, with @code{i < n} or @code{i == 0}.
   Non-speculatively, after a normal loop exit, @var{i} equals @var{n}.
+  Speculatively, @var{i} will always be less than @var{n} within an iteration
+  and never be greater than @var{n}. If the end value is @var{j}, at least
+  @var{j} iterations have been done.
 
   @code{break} and @var{continue} keep their usual semantics.
 

+ 128 - 0
sHT/lex.h

@@ -0,0 +1,128 @@
+/* sHT -- find lexeme boundaries
+   Copyright (C) 2019 Ariadne Devos
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef _sHT_LEX_H
+#define _sHT_LEX_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/** Detecting lexeme boundaries
+
+  This module detects lexeme boundaries for variable-length
+  strings of bytes belonging to a certain class, followed by
+  a terminator byte.
+
+  TODO: 'ignore' variant. */
+
+/** Accumulates bytes
+
+  It is to be interpreted in the context of a @var{sHT_lex_type} named
+  @var{c}. It is disjoint from @var{c}. */
+struct sHT_lex_buf
+{
+	/** The number of accumulated bytes, therefore, the number
+	  of meaningful bytes in @var{bytes}. (R/W, not accessed concurrently)
+	  (less than @code{c->max_known}) */
+	uint16_t offset;
+	/** Some accumulated bytes (R/W, not accessed concurrently).
+	  Its capacity is @code{c->max_known} and its length @var{offset}. */
+	unsigned char bytes[];
+};
+
+/** Lexing parameters
+
+  The lexeme boundary and length detection may speculatively be incorrect.
+  All fields are readable and read-only. */
+struct sHT_lex_type
+{
+	/** Lex a string into its syntactical element
+
+	  At least, that's the common use case.
+	  The first index is for syntax errors, the second for
+	  well-formed strings. For the former, @var{n} is the index of the
+	  offending byte; for the latter, @var{n} is the index of the
+	  terminating @var{c_stop} byte.
+
+	  @var{to}: the first argument passed to @var{sHT_lex}
+	  @var{str}: the readable string to lex, not modified concurrently,
+	    non-speculatively including the terminating @var{c_stop} byte.
+	    Either @var{to->bytes} or the second argument of @var{sHT_lex}.
+	  @code{n}: the length of the string to lex,
+	    non-speculatively excluding the terminator
+	    (not greater than @var{max_known}).
+	  @var{ret}: the number of bytes @var{sHT_lex} parsed */
+	size_t (* cb_value[2])(struct sHT_lex_buf *to, unsigned char *str, size_t n, size_t ret);
+	/** The lexeme is longer than any known
+
+	  @var{to}: the first argument passed to @var{sHT_lex}
+	  @var{ret}: the number of bytes @var{sHT_lex} parsed */
+	size_t (* cb_ignore)(struct sHT_lex_buf *to, size_t ret);
+
+	/** The lexeme is longer than any known, but it has been parsed
+
+	  The first index is for syntax errors, the second for well-formed
+	  strings. The terminator or syntax error is included in @var{ret}.
+
+	  @var{ret}: the number of bytes @var{sHT_lex_skip} parsed
+	  @var{c}: the @var{x} argument passed to @var{sHT_lex_skip} */
+	size_t (* cb_skip_done[2])(size_t ret, void *c);
+
+	/** A byte class represented by a readable bitvector, not modified
+	  concurrently (probably not at all), indexed by the byte to test
+	  for its well-formedness. If set, the byte is within the set,
+	  otherwise, it isn't. */
+	const unsigned char *c_allow;
+	/** The maximal length of any known lexeme, including the terminating
+	  @var{c_stop} byte (positive, < 2**15; therefore, less than
+	  @var{SSIZE_MAX}, as @var{size_t} must be at least a @var{uint16_t}) */
+	uint_least16_t max_known;
+	/** The terminator byte. Does not belong to @var{c_allow}. */
+	unsigned char c_stop;
+};
+_Static_assert((size_t) -1 >= (uint_least16_t) -1, "size_t is too small!");
+
+/** Find the lexeme boundary of a scattered string
+
+  @var{to}: a buffer to accumulate bytes to
+  @var{from}: a string to take bytes from, readable, not modified concurrently
+  @var{n}: the length of @var{from} (positive, less than SSIZE_MAX)
+  @var{c}: what does a lexeme look like, and what to do when?
+
+  @var{from} is disjoint from @var{to} and @var{c}.
+  If not calling into @var{c}, return the number of parsed bytes, including
+  the terminator, if any. The first time, @code{to->offset} must be set to zero.
+
+  Speculatively, the boundaries and syntax error detection may be incorrect. */
+size_t
+sHT_lex(struct sHT_lex_buf *to, const unsigned char from[], size_t n, const struct sHT_lex_type *c);
+
+/** Skip some bytes of @var{from}
+
+  @var{from}: a readable buffer, not modified concurrently, to ignore
+  @var{n}: the length of @var{from} (positive, less than @var{SSIZE_MAX})
+  @var{c}: what does a lexeme look like, and what to do when?
+  @var{x}: ignored, may be used by @var{c} callbacks
+
+  This function does not modify anything, except for what it tail-callees do.
+  @code{c->cb_skip_done} may be tail-called. Otherwise, return the number of
+  parsed bytes. The syntax and terminator detection may speculatively be
+  incorrect. */
+size_t
+sHT_lex_skip(const unsigned char from[], size_t n, const struct sHT_lex_type *c, void *x);
+
+#endif
+

+ 7 - 6
sHT/taint.h

@@ -1,5 +1,5 @@
 /* s2 - mark memory as 'considered meaningless'
-   Copyright (C) 2018 Ariadne Devos
+   Copyright (C) 2018-2019 Ariadne Devos
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -29,8 +29,8 @@
   is implemented.
 
   The policy is chosing by setting @var{sHT_taint_policy} to:
-  - @var{sHT_taint_policy_clear}: set to zero (default)
-  - @var{sHT_taint_policy_nothing}: do nothing
+  - @var{sHT_taint_policy_clear}: set to zero
+  - @var{sHT_taint_policy_nothing}: do nothing (default)
   - @var{sHT_taint_policy_msan}: inform MemorySanitizer
   - @var{sHT_taint_policy_memcheck}: inform Valgrind's memcheck
 
@@ -43,11 +43,12 @@
 #define sHT_taint_policy_memcheck 3
 
 #ifndef sHT_taint_policy
-# define sHT_taint_policy sHT_taint_policy_clear
+# define sHT_taint_policy sHT_taint_policy_nothing
 #endif
 
 #define _sHT_taint_req(e) \
 	_Generic(*(e), \
+		char: (e), \
 		unsigned char: (e), \
 		signed char: (e), \
 		unsigned short: (e), \
@@ -62,12 +63,12 @@
 #if sHT_taint_policy == sHT_taint_policy_clear
 /* Reduce exploitation oppurtunities. */
 # define sHT_taint(e) \
-	do { *(_sHT_taint_req(e) = 0; } while (0)
+	do { *_sHT_taint_req(e) = 0; } while (0)
 
 #elif sHT_taint_policy == sHT_taint_policy_nothing
 /* For when s2 has been proved correct. */
 # define sHT_taint(e) \
-	do { (void) sHT_taint_req(e); } while (0)
+	do { (void) _sHT_taint_req(e); } while (0)
 
 #elif sHT_taint_policy == sHT_taint_policy_msan
 /* Supported by certain versions of gcc and clang. */

+ 4 - 0
sHT/test-arch.h

@@ -35,6 +35,10 @@
 	__asm__ goto ("cmp %1,%0;je %l[" #correct "]" : : "r,m" (a), "rmi,ri" (b) : "cc" : correct)
 # define _sHT_neq(a, b, correct) \
 	__asm__ goto ("cmp %1,%0;jne %l[" #correct "]" : : "r,m" (a), "rmi,ri" (b) : "cc" : correct)
+# define _sHT_eq_bool(a, b, c) \
+	__asm__ ("cmp %2,%1;sete %0" : "=r,r" (c) : "r,m" (a), "rmi,ri" (b) : )
+
+
 # define _sHT_gt(a, b, correct) \
 	__asm__ goto ("cmp %1,%0;ja %l[" #correct "]" : : "r,m" (a), "rmi,ri" (b) : "cc" : correct)
 

+ 16 - 0
sHT/test.h

@@ -82,6 +82,22 @@ correct:
 	return 1;
 }
 
+/** @var{a} == @var{b} ? 1 : 0
+
+  This differs from @var{sHT_eq} in that the
+  return value is an integer, and not a condition.
+  It may not be directly branched upon. */
+__attribute__((always_inline))
+static inline int
+sHT_eq_bool(uintmax_t a, uintmax_t b)
+{
+	if (sHT_constant_p(a == b))
+		return a == b;
+	_Bool ret;
+	_sHT_eq_bool(a, b, ret);
+	return ret;
+}
+
 /** @var{a} != @var{b}?
   The fall-through case should be the most likely. */
 static inline _Bool

+ 0 - 0
sHT/web/uri.h


Some files were not shown because too many files changed in this diff