8 Commits 47536a244f ... 6493906943

Author SHA1 Message Date
  Ariadne Devos 6493906943 Decide API for extracting components of a URI 5 years ago
  Ariadne Devos 4fc9cdbb9d Analysis of a URI 5 years ago
  Ariadne Devos 49c9f14c17 Add missing include in <sHT/string.h> 5 years ago
  Ariadne Devos 96530aab3c Remark if the port was written down 5 years ago
  Ariadne Devos 1c84decc00 Increase URI length limit for IRIs 5 years ago
  Ariadne Devos 5acad18be7 Slice arrays in ranges 5 years ago
  Ariadne Devos 3183e604da Make overruns and presence of components clear 5 years ago
  Ariadne Devos b3565ec23a Limit what URIs can be parsed 5 years ago
2 changed files with 186 additions and 3 deletions
  1. 24 3
      sHT/string.h
  2. 162 0
      sHT/web/uri.h

+ 24 - 3
sHT/string.h

@@ -20,13 +20,17 @@
 #include <sHT/test.h>
 #include <limits.h>
 #include <stddef.h>
+#include <stdint.h>
+#include <sys/types.h>
 
-/** String comparison
+/** String comparison and functions
 
-  In short: <string.h>, but with Spectre precautions.
+  In short: an extended <string.h>, but with Spectre precautions.
   -- and non-optimised, but profile first before fixing that.
   These functions interpret memory areas as a sequence of readable
-  bytes, test them and perhaps calculate something. */
+  bytes, test them and perhaps calculate something.
+
+  Some functions and structures are applicable to arrays in general. */
 
 /** Compute the minimum of two object sizes. */
 __attribute__((const))
@@ -107,4 +111,21 @@ sHT_streq(const char *buffer0, const char *buffer1, size_t correct, size_t other
 size_t
 sHT_append(char *to, const char *from, size_t length0, size_t length1, size_t i0, size_t i1);
 
+/** A slice (substring) of some string or array
+
+  The concept corresponds to Rust's 'slices', although implemented
+  somewhat differently. (Website: <https://rust-lang.org>.)
+
+  The structure contains no reference to the actual external string,
+  only an offset (@var{offset}) and the length of the fragment (@var{length}).
+  The latter may be zero, but the former must be a valid index -- unlike
+  C pointers, where one may point one past the end of an object.
+
+  Beware: each field is only guaranteed 16 bits, which is easily exceeded. */
+struct sHT_slice
+{
+	uint_least16_t offset;
+	uint_least16_t length;
+};
+
 #endif

+ 162 - 0
sHT/web/uri.h

@@ -22,6 +22,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include <sHT/string.h>
+
 /** Parsing URIs (including URLs)
 
   This module parses URIs, extracting the scheme, host and port and
@@ -97,4 +99,164 @@ enum s2_scheme
 size_t
 s2_scheme_lookup(const uint8_t string[], size_t length);
 
+/** The maximal number of segments in a URI s^2 analyses correctly
+
+  The constant was found by searching my filesystem.
+  > $ find /usr/lib | tr -c -d "/\n" | sort -u && echo 1234567890123456789 */
+#define S2_URI_DEPTH_LIMIT 14u
+/** The maximal length of a URI s^2 analyses correctly
+
+  How this was derived: let an IRI take at most two lines in a common terminal
+  (80 characters each line). Let each character be beyond the Basic
+  Multilingual Plane, therefore being encoded to four bytes. Percent-encode
+  each byte, expanding it to three bytes.
+
+  2 * 80 * 4 * 3 = 1920.
+
+  This is long enough for non-Latin IPFS URIs. */
+#define S2_URI_LENGTH_LIMIT 1920u
+/** The maximal port number in a URI
+
+  From (RFC 7605 XXX TODO, 5 What is a Port Number, p. 5):
+  ‘A port number is a 16-bit number ...’ */
+#define S2_URI_MAX_PORT 65535u
+
+/** Some boolean-valued information about a URI,
+  packed into a bit vector. */
+typedef uint_least16_t s2_uri_bits;
+
+/** The port number is out-of-bounds
+
+  I.e., greater than @var{S2_URI_MAX_PORT} */
+#define S2_URI_BIT_PORT_OUT_OF_BOUNDS ((s2_uri_bits) 1u)
+/** There are too many path segments to parse.
+
+  This is an implementation limit, set to @var{S2_URI_DEPTH_LIMIT}. */
+#define S2_URI_BIT_TOO_DEEP ((s2_uri_bits) 2u)
+/** The URI is too long to parse.
+
+  This is an implementation limit, set to @var{S2_URI_LENGTH_LIMIT}. */
+#define S2_URI_BIT_TOO_LONG ((s2_uri_bits) 4u)
+/** The URI has a 'userinfo' subcomponent.
+
+  From (RFC 3986, 3.2.1 User Information, p. 18):
+  ‘The userinfo subcomponent may consist of a user name and, optionally,
+  scheme-specific information about how to gain authorization to access
+  the resource.’
+
+  Not all schemes allow this, but that's out-of-scope for this module
+  -- a generic URI parser.
+
+  (Take note that HTTP has its own ways of authenticating, but this also
+  is out-of-scope here.) */
+#define S2_URI_BIT_HAS_USERINFO ((s2_uri_bits) 8u)
+/** The URI has a host subcomponent.
+
+  This might be a DNS name (HTTP) or a public key (IPFS).
+
+  See (RFC 3986, 3.2.2 Host, p. 18). */
+#define S2_URI_BIT_HAS_HOST ((s2_uri_bits) 16u)
+/** The URI has a 'query' component.
+
+  This is optional. From
+  (RFC 3986, A Collected ABNF for URI, p. 49):
+  ‘absolute-URI = scheme ":" hier-part [ "?" query ]’. */
+#define S2_URI_BIT_HAS_QUERY ((s2_uri_bits) 32u)
+/** The path of the URI does not begin with a slash.
+
+  From (RFC 3986, 3 Syntax Components, p. 16) (paraphrased):
+  ‘In urn:example:animal:ferret:nose, example:animal:ferret:nose is
+  the path.’ (Which did not begin with a slash.) */
+#define S2_URI_BIT_ROOTLESS ((s2_uri_bits) 64u)
+/** The URI has an empty path.
+
+  (Therefore, it is also rootless.)
+
+  From (RFC 3986, 3.3 Path, p. 22):
+  ‘... whereas the URI <foo://info.example.com?fred> has an empty path.’ */
+#define S2_URI_BIT_EMPTY ((s2_uri_bits) 128u)
+/** The URI has a port component
+
+  The parser does not supply default values. */
+#define S2_URI_BIT_HASPORT ((s2_uri_bits) 256u)
+
+/** A URI decomposed into its components
+
+  The query isn't decomposed, but everything that allows percent notation
+  is percent-decoded. Userinfo has not been decomposed into user:password.
+  Fragments are not allowed.
+
+  All fields may speculatively be incorrect, but remain within bounds.
+  The actual text is in an external buffer; the @var{sHT_slice} fields specify
+  offsets into that data. */
+struct s2_uri
+{
+	/** The scheme, or @var{S2_SCHEME_ABSOLUTE}.
+
+	  If unknown, @var{S2_SCHEME_UNKNOWN}. Stored as a 16-bit bit-field. */
+	enum s2_scheme scheme : 16;
+	/** The port number
+
+	  If it was out-of-bounds or unspecified, the value is meaningless.
+
+	  HTTP and HTTPS have a default port.
+	  From (RFC 3986 (URI: Generic Syntax), 3.2.3 Port, p. 22):
+	  ‘A scheme may define a default port. For example, the "http" scheme
+	  defines a default port of "80", corresponding to its reserved TCP
+	  port number.’
+
+	  However, the parser does not supply default values.
+	  And even though IPFS does not have ports, the parser
+	  considers ports to be valid. */
+	uint_least16_t port;
+	/** The number of segments in the path
+
+	  This can be zero, in which case @var{S2_URI_BIT_EMPTY} is set
+	  in @var{bits}.
+
+	  If the actual value exceeds @var{S2_URI_DEPTH_LIMIT},
+	  the observed counter is not correct anymore,
+	  and @var{S2_URI_BIT_TOO_DEEP} is set in @var{bits}. */
+	uint_least8_t depth;
+	/** See comments on other fields. */
+	s2_uri_bits bits;
+	/** A user name and, optionally, authorisation
+
+	  This is only present if @var{S2_URI_BIT_HAS_USERINFO} is set in
+	  @var{bits}.
+
+	  See (RFC 3986, 3.2.1 User Information, p. 18). */
+	struct sHT_slice userinfo;
+	/** The host component
+
+	  This may be a DNS name, an IP address, something else or absent.
+	  This is only present if @var{S2_URI_BIT_HAS_HOST} is set in
+	  @var{bits}.
+
+	  See (RFC 3986, 3.2.2 Host, p. 18). */
+	struct sHT_slice host;
+	/** Some non-hierarchical data
+
+	  This is only present if @var{S2_URI_BIT_HAS_QUERY} is set in
+	  @var{bits}.
+
+	  See (RFC 3986, 3.4 Query, p. 23). */
+	struct sHT_slice query;
+	/** All path segments, possibly zero or empty */
+	struct sHT_slice segments[S2_URI_DEPTH_LIMIT];
+};
+
+/** Analyse a URI into its constituents
+
+  @var{from}: a read-write buffer, not accessed concurrently
+  @var{n}: the length of @var{from}; positive and less than @var{INT16_MAX}
+  @var{parse}: the analysis of the URI to write, not accessed concurrently
+
+  @var{parse} is disjoint from @var{from}.
+
+  The old value of @var{parse} is irrelevant to the outcome,
+  even in side-channels. */
+void
+sHT_uri_analyse(unsigned char from[], uint_least16_t n, struct s2_uri *parse);
+
 #endif