lex.c 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
  1. // SPDX-License-Identifier: GPL-2.0 or GPL-3.0
  2. // Copyright © 2019 Ariadne Devos
  3. /* sHT -- find lexeme boundaries */
  4. #include <sHT/bitvec.h>
  5. #include <sHT/index.h>
  6. #include <sHT/lex.h>
  7. #include <sHT/string.h>
  8. #include <sHT/taint.h>
  9. #include <sHT/test.h>
  10. size_t
  11. sHT_lex(struct sHT_lex_buf *to, const unsigned char from[], size_t n, const struct sHT_lex_type *c)
  12. {
  13. /* The current offset into @var{from}. (Set later). */
  14. size_t i;
  15. /* The old number of accumulated bytes,
  16. therefore, the index into @var{to->bytes} to start writing to. */
  17. size_t offset = to->offset;
  18. /* The following loop: iterate over the bytes of @var{from}, to
  19. validate their syntax and copy them -- and process a fragment when
  20. complete. However, not all bytes, as only @code{c->max_known} are
  21. allocated.
  22. @var{offset}: data from previous runs is remembered. */
  23. /* Underflow 1: `sHT_lex_buf` invariant (`offset` < `max_known`) */
  24. size_t todo = sHT_min_size(n, c->max_known - offset);
  25. /* Induct over byte locations, until a space character, a syntax error
  26. or the method is found to be too long to be known. */
  27. /* (1) `n` > 0 (`sHT_lex` precondition)
  28. (1) => (2) 0 < `n`
  29. (3) `offset` < `max_known` (`sHT_lex_buf` invariant)
  30. (3) => (4) 0 < `max_known` - `offset`
  31. (2, 4) => (5) 0 < min(`n`, `max_known` - `offset`)
  32. (5): QED `sHT_index_iterate` positivity (TODO: do ... while loop) */
  33. /* Invariant: byte `offset` to `offset` + `i` (exclusive) of
  34. @code{buf->bytes} are set. Base case: trivial. */
  35. sHT_index_iterate(i, todo) {
  36. /* If zero @var{n} were allowed, this would be out of bounds */
  37. /* (1) i < todo (@var{sHT_index_iterate})
  38. (2) todo <= n (@var{sHT_min_size})
  39. (1, 2) => i < n
  40. QED @var{from} length */
  41. uint8_t b = from[i];
  42. /* (1) i < todo (@var{sHT_index_iterate})
  43. (2) todo <= max_known - offset (@var{sHT_min_size})
  44. (1, 2) => (3) i < max_known - offset
  45. (3) => offset + i < max_known
  46. QED @var{to} capacity.
  47. QED induction step (is set). */
  48. to->bytes[offset + i] = b;
  49. if (sHT_bit_test(c->c_allow, b)) {
  50. /* Correct byte, but not a terminator.
  51. Continue the search. */
  52. continue;
  53. }
  54. /* Non-speculatively, @var{b} is not one of the allowed
  55. bytes. Either it is the terminator, or a syntax error.
  56. Which one? (0: syntax error, 1: terminator)*/
  57. int which = sHT_eq_bool(c->c_stop, b);
  58. /* Not used anymore; taint for analysis */
  59. sHT_taint(&to->offset);
  60. /* +1: also count the terminating byte
  61. (<tests/lex.c> found this bug)
  62. (1) i < todo (@var{sHT_index_iterate}),
  63. (2) todo <= n (@var{sHT_index_iterate})
  64. (1, 2) => (3) i < n.
  65. (3) => (4) i + 1 <= n
  66. QED bounds last argument
  67. (1) i < todo (@var{sHT_index_iterate}),
  68. (2) todo <= max_known - offset (@var{sHT_min_size})
  69. (1, 2) => (3) i <= max_known - offset
  70. (3) => (4) offset + i <= max_known
  71. QED length/index bounds */
  72. return c->cb_value[which](to, offset + i, i + 1);
  73. }
  74. /* Compare the number of running total of tested bytes with the
  75. maximal known lexeme length. If it the former begins to equal
  76. the latter, there is no point in copying anymore, but the
  77. syntax must still be validated. */
  78. /* Overflow:
  79. (1) i < todo (@var{sHT_index_iterate}),
  80. (2) todo <= max_known - offset (@var{sHT_min_size})
  81. (1, 2) => (3) i <= max_known - offset
  82. (3) => (4) offset + i <= max_known
  83. (5) `max_known` <= `SIZE_MAX` (typing)
  84. (3, 5) => `offset` + `i` <= `SIZE_MAX`
  85. QED no overflow */
  86. if (sHT_ge(offset + i, c->max_known)) {
  87. /* Not used anymore; taint for analysis */
  88. sHT_taint(&to->offset);
  89. sHT_taint(&to->bytes[0]);
  90. return c->cb_ignore(to, i);
  91. }
  92. /* More bytes must be read before the lexeme is complete.
  93. Proof of progress (i = n) (non-speculatively):
  94. (1) offset + i < max_known (@var{sHT_ge})
  95. (2) i = todo (@var{sHT_index_iterate})
  96. (1) => (4) i < max_known - offset
  97. (2, 4) => (5) todo < max_known - offset
  98. (5) => todo = n (@var{sHT_min_size})
  99. (2, 5) => i = n
  100. QED progress */
  101. /* Remember the number of copied bytes */
  102. /* Overflow/bounds:
  103. (1) i <= todo (@var{sHT_index_iterate}),
  104. (2) todo <= max_known - offset (@var{sHT_min_size})
  105. (1, 2) => (3) i <= max_known - offset
  106. (3) => (4) offset + i <= max_known
  107. QED bounds; continue overflow
  108. (5) max_known < UINT16_MAX (@var{uint16_t})
  109. (4, 5) => offset + i < UINT16_MAX
  110. QED overflow */
  111. to->offset += i;
  112. /* Bounds:
  113. (1) i <= todo (@var{sHT_index_iterate})
  114. (2) todo <= n (@var{sHT_min_size})
  115. (1, 2) => i <= n
  116. QED bounds */
  117. return i;
  118. }