// SPDX-License-Identifier: GPL-2.0 or GPL-3.0
// Copyright © 2019 Ariadne Devos
/* sHT -- find lexeme boundaries */
  4. #ifndef _sHT_LEX_H
  5. #define _sHT_LEX_H
#include <stddef.h>
#include <stdint.h>
/** Detecting lexeme boundaries

  This module detects lexeme boundaries for variable-length
  strings of bytes belonging to a certain class, followed by
  a terminator byte.

  TODO: 'ignore' variant. */
  13. /** Accumulates bytes
  14. It is to be interpreted in the context of a @var{sHT_lex_state} named
  15. @var{c}. It is disjoint from @var{c}. */
  16. struct sHT_lex_buf
  17. {
  18. /** The number of accumulated bytes, therefore, the number
  19. of meaningful bytes in @var{bytes}. (R/W, not accessed concurrently)
  20. (less than @code{c->max_known}) */
  21. uint16_t offset;
  22. /** Some accumulated bytes (R/W, not accessed concurrently).
  23. Its capacity is @code{c->max_known} and its length @var{offset}. */
  24. unsigned char bytes[];
  25. };
  26. /** Lexing parameters
  27. The lexeme boundary and length detection may speculatively be incorrect.
  28. All fields are readable and read-only. */
  29. struct sHT_lex_type
  30. {
  31. /** Lex a string into its syntactical element
  32. At least, that's the common use case.
  33. The first index is for syntax errors, the second for
  34. well-formed strings. For the former, @var{n} is the index of the
  35. syntax error; for the latter, @var{n} is the index of the offending
  36. byte.
  37. @var{to}: the first argument passed to @var{sHT_lex}
  38. @code{n}: the length of the string to lex,
  39. non-speculatively excluding the terminator
  40. (not greater than @var{max_known}).
  41. @var{ret}: the number of bytes @var{sHT_lex} parsed */
  42. size_t (* cb_value[2])(struct sHT_lex_buf *to, size_t n, size_t ret);
  43. /** The lexeme is longer than any known
  44. @var{to}: the first argument passed to @var{sHT_lex}
  45. @var{ret}: the number of bytes @var{sHT_lex} parsed */
  46. size_t (* cb_ignore)(struct sHT_lex_buf *to, size_t ret);
  47. /** The lexeme is longer than any known, but it has been parsed
  48. The first index is for syntax errors, the second for well-formed
  49. strings. The terminator or syntax error is included in @var{ret}.
  50. @var{ret}: the number of bytes @var{sHT_lex_skip} parsed
  51. @var{x}: the first argument passed to @var{sHT_lex_skip} */
  52. size_t (* cb_skip_done[2])(size_t ret, void *c);
  53. /** A byte class represented by a readable bitvector, not modified
  54. concurrently (probably not at all), indexed by the byte to test
  55. for its well-formedness. If set, the byte is within the set,
  56. otherwise, it isn't. */
  57. const unsigned char *c_allow;
  58. /** The maximal length of any known lexeme, including the terminating
  59. @var{c_stop} byte (positive) */
  60. uint_least16_t max_known;
  61. /** The terminator byte. Does not belong to @var{c_allow}. */
  62. unsigned char c_stop;
  63. };
  64. _Static_assert((size_t) -1 >= (uint_least16_t) -1, "size_t is too small!");
  65. /** Find the lexeme boundary of a scattered string
  66. @var{to}: a buffer to accumulate bytes to
  67. @var{from}: a string to take bytes from, readable, not modified concurrently
  68. @var{n}: the length of @var{from} (positive)
  69. @var{c}: how does a lexeme look like, and what to do when?
  70. Bytes are accumulated into @var{to}.
  71. @var{from} is disjoint from @var{to} and @var{c}.
  72. If not calling into @var{c}, return the number of parsed bytes, including
  73. the terminator, if any. The first time, @code{c->offset} must be set to zero.
  74. Speculatively, the boundaries and syntax error detection may be incorrect. */
  75. size_t
  76. sHT_lex(struct sHT_lex_buf *to, const unsigned char from[], size_t n, const struct sHT_lex_type *c);
  77. /** Skip some bytes of @var{from}
  78. @var{from}: a readable buffer, not modified concurrently, to ignore
  79. @var{n}: the length of @var{from} (positive)
  80. @var{c}: how does a lexeme look like, and what to do when?
  81. @var{x}: ignored, may be used by @var{c} callbacks
  82. This function does not modify anything, except for what it tail-callees do.
  83. @code{c->cb_skip_done} may be tail-called. Otherwise, return the number of
  84. parsed bytes. The syntax and terminator detection may speculatively be
  85. incorrect. */
  86. size_t
  87. sHT_lex_skip(const unsigned char from[], size_t n, const struct sHT_lex_type *c, void *x);
  88. #endif