|
@@ -22,6 +22,8 @@
|
|
|
#include <stddef.h>
|
|
|
#include <stdint.h>
|
|
|
|
|
|
+#include <sHT/string.h>
|
|
|
+
|
|
|
/** Parsing URIs (including URLs)
|
|
|
|
|
|
This module parses URIs, extracting the scheme, host and port and
|
|
@@ -97,4 +99,164 @@ enum s2_scheme
|
|
|
size_t
|
|
|
s2_scheme_lookup(const uint8_t string[], size_t length);
|
|
|
|
|
|
+/** The maximal number of segments in a URI s^2 analyses correctly
|
|
|
+
|
|
|
+ The constant was found by searching my filesystem.
|
|
|
+ > $ find /usr/lib | tr -c -d "/\n" | sort -u && echo 1234567890123456789 */
|
|
|
+#define S2_URI_DEPTH_LIMIT 14u
|
|
|
+/** The maximal length of a URI s^2 analyses correctly
|
|
|
+
|
|
|
+ How this was derived: let an IRI take at most two lines in a common terminal
|
|
|
+ (80 characters each line). Let each character be beyond the Basic
|
|
|
+ Multilingual Plane, therefore being encoded as four bytes. Percent-encode each
|
|
|
+ byte, expanding to three bytes.
|
|
|
+
|
|
|
+ 2 * 80 * 4 * 3 = 1920.
|
|
|
+
|
|
|
+ This is long enough for non-Latin IPFS URIs. */
|
|
|
+#define S2_URI_LENGTH_LIMIT 1920u
|
|
|
+/** The maximal port number in a URI
|
|
|
+
|
|
|
+ From (RFC 7605 XXX TODO, 5 What is a Port Number, p. 5):
|
|
|
+ ‘A port number is a 16-bit number ...’ */
|
|
|
+#define S2_URI_MAX_PORT 65535u
|
|
|
+
|
|
|
+/** Some boolean-valued information about a URI,
|
|
|
+ packed into a bit vector. */
|
|
|
+typedef uint_least16_t s2_uri_bits;
|
|
|
+
|
|
|
+/** The port number is out-of-bounds
|
|
|
+
|
|
|
+ I.e., greater than @var{S2_URI_MAX_PORT} */
|
|
|
+#define S2_URI_BIT_PORT_OUT_OF_BOUNDS ((s2_uri_bits) 1u)
|
|
|
+/** There are too many path segments to parse.
|
|
|
+
|
|
|
+ This is an implementation limit, set to @var{S2_URI_DEPTH_LIMIT}. */
|
|
|
+#define S2_URI_BIT_TOO_DEEP ((s2_uri_bits) 2u)
|
|
|
+/** The URI is too long to parse.
|
|
|
+
|
|
|
+ This is an implementation limit, set to @var{S2_URI_LENGTH_LIMIT}. */
|
|
|
+#define S2_URI_BIT_TOO_LONG ((s2_uri_bits) 4u)
|
|
|
+/** The URI has a 'userinfo' subcomponent.
|
|
|
+
|
|
|
+ From (RFC 3986, 3.2.1 User Information, p. 18):
|
|
|
+ ‘The userinfo subcomponent may consist of a user name and, optionally,
|
|
|
+ scheme-specific information about how to gain authorization to access
|
|
|
+ the resource.’
|
|
|
+
|
|
|
+ Not all schemes allow this, but that's out-of-scope for this module
|
|
|
+ -- a generic URI parser.
|
|
|
+
|
|
|
+ (Take note that HTTP has its own ways of authenticating, but this also
|
|
|
+ is out-of-scope here.) */
|
|
|
+#define S2_URI_BIT_HAS_USERINFO ((s2_uri_bits) 8u)
|
|
|
+/** The URI has a host subcomponent.
|
|
|
+
|
|
|
+ This might be a DNS name (HTTP) or a public key (IPFS).
|
|
|
+
|
|
|
+ See (RFC 3986, 3.2.2 Host, p. 18). */
|
|
|
+#define S2_URI_BIT_HAS_HOST ((s2_uri_bits) 16u)
|
|
|
+/** The URI has a 'query' component.
|
|
|
+
|
|
|
+ This is optional. From
|
|
|
+ (RFC 3986, A Collected ABNF for URI, p. 49):
|
|
|
+ ‘absolute-URI = scheme ":" hier-part [ "?" query ]’. */
|
|
|
+#define S2_URI_BIT_HAS_QUERY ((s2_uri_bits) 32u)
|
|
|
+/** The path of the URI does not begin with a slash.
|
|
|
+
|
|
|
+ From (RFC 3986, 3 Syntax Components, p. 16) (paraphrased):
|
|
|
+ ‘In urn:example:animal:ferret:nose, example:animal:ferret:nose is
|
|
|
+ the path.’ (Which did not begin with a slash.) */
|
|
|
+#define S2_URI_BIT_ROOTLESS ((s2_uri_bits) 64u)
|
|
|
+/** The URI has an empty path.
|
|
|
+
|
|
|
+ (Therefore, it is rootless also.)
|
|
|
+
|
|
|
+ From (RFC 3986, 3.3 Path, p. 22):
|
|
|
+ ‘... whereas the URI <foo://info.example.com?fred> has an empty path.’ */
|
|
|
+#define S2_URI_BIT_EMPTY ((s2_uri_bits) 128u)
|
|
|
+/** The URI has a port component
|
|
|
+
|
|
|
+ The parser does not supply default values. */
|
|
|
+#define S2_URI_BIT_HASPORT ((s2_uri_bits) 256u)
|
|
|
+
|
|
|
+/** A URI decomposed into its components
|
|
|
+
|
|
|
+ The query isn't decomposed, but everything that allows percent notation,
|
|
|
+ is percent-decoded. Userinfo has not been decomposed into user:password.
|
|
|
+ Fragments are not allowed.
|
|
|
+
|
|
|
+ All fields may speculatively be incorrect, but remain within bounds.
|
|
|
+ The actual text is in an external buffer; the @var{sHT_slice} fields specify
|
|
|
+ offsets into that data. */
|
|
|
+struct s2_uri
|
|
|
+{
|
|
|
+ /** The scheme, or @var{S2_SCHEME_ABSOLUTE}.
|
|
|
+
|
|
|
+ If unknown, @var{S2_SCHEME_UNKNOWN}. */
|
|
|
+ enum s2_scheme scheme : 16;
|
|
|
+ /** The port number
|
|
|
+
|
|
|
+ If it was out-of-bounds or unspecified, the value is meaningless.
|
|
|
+
|
|
|
+ HTTP and HTTPS have a default port.
|
|
|
+ From (RFC 3986 (URI: Generic Syntax), 3.2.3 Port, p. 22):
|
|
|
+ ‘A scheme may define a default port. For example, the "http" scheme
|
|
|
+ defines a default port of "80", corresponding to its reserved TCP
|
|
|
+ port number.’
|
|
|
+
|
|
|
+ However, the parser does not supply default values.
|
|
|
+ And even though IPFS does not have ports, the parser
|
|
|
+ considers ports to be valid. */
|
|
|
+ uint_least16_t port;
|
|
|
+ /** The number of segments in the path
|
|
|
+
|
|
|
+ This can be zero, in which case @var{S2_URI_BIT_EMPTY} is set
|
|
|
+ in @var{bits}.
|
|
|
+
|
|
|
+ If the actual value exceeds @var{S2_URI_DEPTH_LIMIT},
|
|
|
+ the observed counter is not correct anymore,
|
|
|
+ and @var{S2_URI_BIT_TOO_DEEP} is set in @var{bits}. */
|
|
|
+ uint_least8_t depth;
|
|
|
+ /** See comments on other fields. */
|
|
|
+ s2_uri_bits bits;
|
|
|
+ /** A user name and, optionally, authorisation
|
|
|
+
|
|
|
+ This is only present if @var{S2_URI_BIT_HAS_USERINFO} is set in
|
|
|
+ @var{bits}.
|
|
|
+
|
|
|
+ See (RFC 3986, 3.2.1 User Information, p. 18). */
|
|
|
+ struct sHT_slice userinfo;
|
|
|
+ /** The host component
|
|
|
+
|
|
|
+ This may be a DNS name, an IP address, something else or absent.
|
|
|
+ This is only present if @var{S2_URI_BIT_HAS_HOST} is set in
|
|
|
+ @var{bits}.
|
|
|
+
|
|
|
+ See (RFC 3986, 3.2.2 Host, p. 18). */
|
|
|
+ struct sHT_slice host;
|
|
|
+ /** Some non-hierarchical data
|
|
|
+
|
|
|
+ This is only present if @var{S2_URI_BIT_HAS_QUERY} is set in
|
|
|
+ @var{bits}.
|
|
|
+
|
|
|
+ See (RFC 3986, 3.4 Query, p. 23). */
|
|
|
+ struct sHT_slice query;
|
|
|
+ /** All path segments, possibly zero or empty */
|
|
|
+ struct sHT_slice segments[S2_URI_DEPTH_LIMIT];
|
|
|
+};
|
|
|
+
|
|
|
+/** Analyse a URI into its constituents
|
|
|
+
|
|
|
+ @var{from}: a read-write buffer, not accessed concurrently
|
|
|
+ @var{n}: the length of @var{from}, less than @var{INT16_MAX}, positive
|
|
|
+ @var{parse}: the analysis of the URI to write, not accessed concurrently
|
|
|
+
|
|
|
+ @var{parse} is disjoint from @var{from}.
|
|
|
+
|
|
|
+ The old value of @var{parse} is irrelevant to the outcome,
|
|
|
+ even in side-channels. */
|
|
|
+void
|
|
|
+sHT_uri_analyse(unsigned char from[], uint_least16_t n, struct s2_uri *parse);
|
|
|
+
|
|
|
#endif
|