regexp.c 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
  1. /*
  2. * Part of Scheme 48 1.9. See file COPYING for notices and license.
  3. *
  4. * Authors: Richard Kelsey, Jonathan Rees, Mike Sperber, Marcus Crestani,
  5. * Roderic Morris
  6. */
  7. /*
  8. * Scheme 48/POSIX regex interface
  9. */
  10. #include <sys/types.h>
  11. #include <regex.h> /* POSIX.2 */
  12. #include <stdlib.h>
  13. #include <string.h>
  14. #include <unistd.h>
  15. #include "scheme48.h"
  16. extern void s48_init_posix_regex(void);
  17. static s48_ref_t posix_compile_regexp(s48_call_t call,
  18. s48_ref_t pattern,
  19. s48_ref_t extended_p,
  20. s48_ref_t ignore_case_p,
  21. s48_ref_t submatches_p,
  22. s48_ref_t newline_p),
  23. posix_regexp_match(s48_call_t call,
  24. s48_ref_t sch_regex,
  25. s48_ref_t string,
  26. s48_ref_t start,
  27. s48_ref_t submatches_p,
  28. s48_ref_t bol_p,
  29. s48_ref_t eol_p),
  30. posix_regexp_error_message(s48_call_t call,
  31. s48_ref_t pattern,
  32. s48_ref_t extended_p,
  33. s48_ref_t ignore_case_p,
  34. s48_ref_t submatches_p,
  35. s48_ref_t newline_p),
  36. posix_free_regexp(s48_call_t call, s48_ref_t sch_regex);
  37. /*
  38. * Record type imported from Scheme.
  39. */
  40. static s48_ref_t posix_regexp_match_type_binding;
  41. /*
  42. * Install all exported functions in Scheme48.
  43. */
  44. void
  45. s48_init_posix_regexp(void)
  46. {
  47. /* Export our stuff. */
  48. S48_EXPORT_FUNCTION(posix_compile_regexp);
  49. S48_EXPORT_FUNCTION(posix_regexp_match);
  50. S48_EXPORT_FUNCTION(posix_regexp_error_message);
  51. S48_EXPORT_FUNCTION(posix_free_regexp);
  52. /* Protect and import the regex-match type. */
  53. posix_regexp_match_type_binding =
  54. s48_get_imported_binding_2("posix-regexp-match-type");
  55. }
  56. /*
  57. * Interface to regcomp. We encode the flags, make the return value, and
  58. * then call regcomp() to fill it in.
  59. */
  60. static s48_ref_t
  61. posix_compile_regexp(s48_call_t call, s48_ref_t pattern,
  62. s48_ref_t extended_p, s48_ref_t ignore_case_p,
  63. s48_ref_t submatches_p, s48_ref_t newline_p)
  64. {
  65. s48_ref_t sch_regex;
  66. int status;
  67. int flags =
  68. (s48_extract_boolean_2(call, extended_p) ? REG_EXTENDED : 0) |
  69. (s48_extract_boolean_2(call, ignore_case_p) ? REG_ICASE : 0) |
  70. (s48_extract_boolean_2(call, submatches_p) ? 0 : REG_NOSUB) |
  71. (s48_extract_boolean_2(call, newline_p) ? REG_NEWLINE : 0);
  72. s48_check_byte_vector_2(call, pattern);
  73. sch_regex = s48_make_value_2(call, regex_t);
  74. status = regcomp(s48_unsafe_extract_value_pointer_2(call, sch_regex, regex_t),
  75. s48_extract_byte_vector_readonly_2(call, pattern),
  76. flags);
  77. if (status == 0)
  78. return sch_regex;
  79. else
  80. return s48_enter_long_2(call, status); /* not that it can do them much good */
  81. }
  82. /*
  83. * Interface to regexec.
  84. *
  85. * Returns #f if there is no match, #t if there is a match and submatches_p
  86. * is false, and a list of regex-match records otherwise.
  87. *
  88. * Most of this is making the buffer for the match structs and then translating
  89. * them into Scheme match records.
  90. */
  91. static s48_ref_t
  92. posix_regexp_match(s48_call_t call, s48_ref_t sch_regex, s48_ref_t string, s48_ref_t sch_start,
  93. s48_ref_t submatches_p,
  94. s48_ref_t bol_p, s48_ref_t eol_p)
  95. {
  96. int status;
  97. s48_ref_t result;
  98. int start = s48_extract_long_2(call, sch_start);
  99. int len = strlen(s48_extract_byte_vector_readonly_2(call, string));
  100. /* re_nsub doesn't include the full pattern */
  101. size_t nmatch = 1 + s48_extract_value_pointer_2(call, sch_regex, regex_t)->re_nsub;
  102. regmatch_t *pmatch,
  103. pmatch_buffer[32];
  104. int flags =
  105. (s48_extract_boolean_2(call, bol_p) ? 0 : REG_NOTBOL) |
  106. (s48_extract_boolean_2(call, eol_p) ? 0 : REG_NOTEOL);
  107. if ((start < 0) || (start > len))
  108. s48_assertion_violation_2(call,
  109. "posix_regexp_match", "start out of range", 3,
  110. sch_start,
  111. s48_enter_long_2(call, 0),
  112. s48_enter_long_2(call, len));
  113. if (nmatch <= 32)
  114. pmatch = pmatch_buffer;
  115. else {
  116. pmatch = (regmatch_t *) malloc(nmatch * sizeof(regmatch_t));
  117. if (pmatch == NULL)
  118. s48_out_of_memory_error_2(call); }
  119. status = regexec(s48_extract_value_pointer_2(call, sch_regex, regex_t),
  120. s48_extract_byte_vector_readonly_2(call, string) + start,
  121. nmatch, pmatch, flags);
  122. if (status == REG_NOMATCH)
  123. result = s48_false_2(call);
  124. else if (! s48_extract_boolean_2(call, submatches_p))
  125. result = s48_true_2(call);
  126. else {
  127. s48_ref_t matches = s48_null_2(call);
  128. s48_ref_t match;
  129. int i;
  130. for(i = nmatch - 1; i > -1; i--) {
  131. if (pmatch[i].rm_so == -1)
  132. match = s48_false_2(call);
  133. else {
  134. match = s48_make_record_2(call, posix_regexp_match_type_binding);
  135. s48_unsafe_record_set_2(call, match,
  136. 0,
  137. s48_enter_long_2(call, pmatch[i].rm_so + start));
  138. s48_unsafe_record_set_2(call, match,
  139. 1,
  140. s48_enter_long_2(call, pmatch[i].rm_eo + start));
  141. s48_unsafe_record_set_2(call, match, 2, s48_false_2(call)); } /* submatches */
  142. matches = s48_cons_2(call, match, matches); }
  143. result = matches; }
  144. if (nmatch > 32)
  145. free(pmatch);
  146. return result;
  147. }
  148. /*
  149. * Interface to regcomp.
  150. *
  151. * This takes the same arguments as `compile_regexp' but returns the error
  152. * message, if any, that `regcomp()' returns. For some reason `regerror()'
  153. * requires both the status code and the compiled pattern buffer returned
  154. * by `regcomp()'. `compile_regexp' only returned the status so we have to
  155. * redo the compilation.
  156. *
  157. */
  158. static s48_ref_t
  159. posix_regexp_error_message(s48_call_t call, s48_ref_t pattern,
  160. s48_ref_t extended_p, s48_ref_t ignore_case_p,
  161. s48_ref_t submatches_p, s48_ref_t newline_p)
  162. {
  163. regex_t compiled_regex;
  164. int status;
  165. int flags =
  166. (s48_extract_boolean_2(call, extended_p) ? REG_EXTENDED : 0) |
  167. (s48_extract_boolean_2(call, ignore_case_p) ? REG_ICASE : 0) |
  168. (s48_extract_boolean_2(call, submatches_p) ? 0 : REG_NOSUB) |
  169. (s48_extract_boolean_2(call, newline_p) ? REG_NEWLINE : 0);
  170. s48_check_byte_vector_2(call, pattern);
  171. status = regcomp(&compiled_regex, s48_extract_byte_vector_readonly_2(call, pattern), flags);
  172. if (status == 0)
  173. return s48_false_2(call);
  174. else {
  175. size_t buffer_size;
  176. s48_ref_t buffer;
  177. buffer_size = regerror(status, &compiled_regex, NULL, 0);
  178. /* For string lengths C counts the nul, Scheme doesn't. */
  179. buffer = s48_make_byte_vector_2(call, buffer_size);
  180. regerror(status,
  181. &compiled_regex,
  182. s48_extract_byte_vector_2(call, buffer),
  183. buffer_size);
  184. return buffer; }
  185. }
  186. /*
  187. * Stub for regfree().
  188. */
  189. static s48_ref_t
  190. posix_free_regexp(s48_call_t call, s48_ref_t sch_regex)
  191. {
  192. regfree(s48_extract_value_pointer_2(call, sch_regex, regex_t));
  193. return s48_unspecific_2(call);
  194. }