dclib-regex.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467
  1. /***************************************************************************
  2. * *
  3. * _____ ____ *
  4. * | __ \ / __ \ _ _ _____ *
  5. * | | \ \ / / \_\ | | | | _ \ *
  6. * | | \ \| | | | | | |_| | *
  7. * | | | || | | | | | ___/ *
  8. * | | / /| | __ | | | | _ \ *
  9. * | |__/ / \ \__/ / | |___| | |_| | *
  10. * |_____/ \____/ |_____|_|_____/ *
  11. * *
  12. * Wiimms source code library *
  13. * *
  14. ***************************************************************************
  15. * *
  16. * Copyright (c) 2012-2022 by Dirk Clemens <wiimm@wiimm.de> *
  17. * *
  18. ***************************************************************************
  19. * *
  20. * This library is free software; you can redistribute it and/or modify *
  21. * it under the terms of the GNU General Public License as published by *
  22. * the Free Software Foundation; either version 2 of the License, or *
  23. * (at your option) any later version. *
  24. * *
  25. * This library is distributed in the hope that it will be useful, *
  26. * but WITHOUT ANY WARRANTY; without even the implied warranty of *
  27. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
  28. * GNU General Public License for more details. *
  29. * *
  30. * See file gpl-2.0.txt or http://www.gnu.org/licenses/gpl-2.0.txt *
  31. * *
  32. ***************************************************************************/
  33. #define _GNU_SOURCE 1
  34. #include "dclib-regex.h"
  35. //
  36. ///////////////////////////////////////////////////////////////////////////////
  37. /////////////// some macro definitions ///////////////
  38. ///////////////////////////////////////////////////////////////////////////////
  // Backend-neutral aliases for the regex engine selected by DCLIB_USE_PCRE.
  //   REGEX_FREE    function that releases a compiled expression
  //   REOPT_ICASE   compile flag for case-insensitive matching
  //   MATCH_BEG(i)  start offset of sub match #i in a local array named `match`
  //   MATCH_END(i)  end offset of sub match #i in a local array named `match`
  // Arguments and expansions are parenthesized so that expressions such as
  // MATCH_BEG(n+1) index the intended slot.
  #undef REGEX_FREE
  #undef REOPT_ICASE
  #undef MATCH_BEG
  #undef MATCH_END

  #if DCLIB_USE_PCRE
    // this form indexes `match` as a flat vector: pair #i is [2*i] and [2*i+1]
    #define REGEX_FREE     pcre_free
    #define REOPT_ICASE    PCRE_CASELESS
    #define MATCH_BEG(i)   (match[2*(i)])
    #define MATCH_END(i)   (match[2*(i)+1])
  #else
    // this form reads the rm_so/rm_eo members of element #i of `match`
    #define REGEX_FREE     regfree
    #define REOPT_ICASE    REG_ICASE
    #define MATCH_BEG(i)   (match[(i)].rm_so)
    #define MATCH_END(i)   (match[(i)].rm_eo)
  #endif
  54. //
  55. ///////////////////////////////////////////////////////////////////////////////
  56. /////////////// struct Regex_t ///////////////
  57. ///////////////////////////////////////////////////////////////////////////////
  58. void InitializeRegex ( Regex_t *re )
  59. {
  60. DASSERT(re);
  61. memset(re,0,sizeof(*re));
  62. re->re_list = re->re_pool;
  63. re->re_size = sizeof(re->re_pool)/sizeof(*re->re_pool);
  64. }
  65. ///////////////////////////////////////////////////////////////////////////////
  66. void ResetRegex ( Regex_t *re )
  67. {
  68. DASSERT(re);
  69. RegexElem_t *e, *eend = re->re_list + re->re_used;
  70. for ( e = re->re_list; e < eend; e++ )
  71. {
  72. #if DCLIB_USE_PCRE
  73. if (e->regex)
  74. pcre_free(e->regex);
  75. #else
  76. regfree(&e->regex);
  77. #endif
  78. FreeString(e->pattern);
  79. FreeString(e->replace.ptr);
  80. FREE(e->repl);
  81. }
  82. if ( re->re_list != re->re_pool )
  83. FREE(re->re_list);
  84. InitializeRegex(re);
  85. }
  86. ///////////////////////////////////////////////////////////////////////////////
  87. static RegexElem_t * GetNextRegexElem ( Regex_t *re )
  88. {
  89. DASSERT(re);
  90. if ( re->re_used >= re->re_size )
  91. {
  92. re->re_size = 3*re->re_size/2 + 10;
  93. if ( re->re_list == re->re_pool )
  94. {
  95. re->re_list = MALLOC( sizeof(*re->re_list) * re->re_size );
  96. memcpy( re->re_list, re->re_pool, sizeof(*re->re_list) * re->re_used );
  97. }
  98. else
  99. re->re_list = REALLOC( re->re_list, sizeof(*re->re_list) * re->re_size );
  100. }
  101. RegexElem_t *e = re->re_list + re->re_used;
  102. memset(e,0,sizeof(*e));
  103. return e;
  104. }
  105. ///////////////////////////////////////////////////////////////////////////////
  106. static RegexReplace_t * GetNextRegexRepl
  107. ( RegexElem_t *elem, char *ptr, int len, int ref )
  108. {
  109. DASSERT(elem);
  110. if ( !len && ref < 0 )
  111. return 0;
  112. if ( elem->repl_used >= elem->repl_size )
  113. {
  114. elem->repl_size = 3*elem->repl_size/2 + 10;
  115. elem->repl = REALLOC( elem->repl, sizeof(*elem->repl) * elem->repl_size );
  116. }
  117. RegexReplace_t *repl = elem->repl + elem->repl_used++;
  118. repl->str.ptr = ptr;
  119. repl->str.len = len;
  120. repl->ref = ref;
  121. PRINT("REPL[%d]: |%.*s| %d\n",elem->repl_used-1,len,ptr,ref);
  122. return repl;
  123. }
  124. ///////////////////////////////////////////////////////////////////////////////
  125. enumError ScanRegex ( Regex_t *re, bool init_re, ccp regex )
  126. {
  127. DASSERT(re);
  128. if (init_re)
  129. InitializeRegex(re);
  130. else
  131. ResetRegex(re);
  132. if ( !regex || !*regex )
  133. return ERR_NOTHING_TO_DO;
  134. struct { FastBuf_t b; char space[500]; } temp;
  135. InitializeFastBuf(&temp,sizeof(temp));
  136. //--- scanning
  137. enumError err = ERR_OK;
  138. for(;;)
  139. {
  140. while ( isspace((int)*regex) || *regex == ';' )
  141. regex++;
  142. if (!*regex)
  143. break;
  144. noPRINT("RE: ANALYZE #%u/%u: %s\n",re->re_used, re->re_size,regex);
  145. char sep = *regex++;
  146. while ( *regex && *regex != sep )
  147. {
  148. char ch = *regex++;
  149. if ( ch == '\\' && *regex == sep )
  150. ch = *regex++;
  151. AppendCharFastBuf(&temp.b,ch);
  152. }
  153. if ( *regex != sep )
  154. {
  155. err = ERR_SYNTAX;
  156. goto exit;
  157. }
  158. ccp repl = ++regex;
  159. while ( *regex && *regex != sep )
  160. regex++;
  161. const uint repl_len = regex - repl;
  162. RegexElem_t *elem = GetNextRegexElem(re);
  163. if ( *regex == sep )
  164. {
  165. regex++;
  166. while ( *regex && *regex != ';' )
  167. switch (*regex++)
  168. {
  169. case 'g': elem->global = true; break;
  170. case 'i': elem->icase = true; elem->opt |= REOPT_ICASE; break;
  171. }
  172. }
  173. PRINT("RE: PATTERN #%u/%u: [%s] %s -> %.*s : %s\n",
  174. re->re_used, re->re_size,
  175. GetFastBufStatus(&temp.b),
  176. GetFastBufString(&temp.b),
  177. repl_len, repl, regex );
  178. // store data as strings
  179. elem->pattern = MEMDUP(temp.b.buf,temp.b.ptr-temp.b.buf);
  180. elem->replace.ptr = repl_len ? MEMDUP(repl,repl_len) : EmptyString;
  181. elem->replace.len = repl_len;
  182. re->re_used++;
  183. ClearFastBuf(&temp.b);
  184. }
  185. //--- finalize
  186. re->valid = true;
  187. uint e;
  188. RegexElem_t *elem;
  189. for ( e = 0, elem = re->re_list; e < re->re_used; e++, elem++ )
  190. {
  191. #if DCLIB_USE_PCRE
  192. ccp errptr;
  193. int erroffset;
  194. re->regex = pcre_compile(elem->pattern,opt,&errptr,&erroffset,0);
  195. if (!re->regex)
  196. {
  197. PRINT("PCRE error: %s: %s\n",errptr,pattern+erroffset);
  198. return ERR_SEMANTIC;
  199. err = ERR_SEMANTIC;
  200. goto exit;
  201. }
  202. #else
  203. int stat = regcomp(&elem->regex,elem->pattern,elem->opt|REG_EXTENDED);
  204. if (stat)
  205. {
  206. #if HAVE_PRINT0
  207. char error[100];
  208. regerror(stat,&re->regex,errormsizeof(error));
  209. PRINT("PCRE error: %s: %s\n",errptr,pattern+erroffset);
  210. #endif
  211. re->valid = false;
  212. err = ERR_SEMANTIC;
  213. }
  214. else
  215. elem->valid = true;
  216. #endif
  217. //--- analyse replace
  218. char *ptr = (char*)elem->replace.ptr;
  219. char *dest = ptr;
  220. char *start = dest;
  221. char *end = ptr + elem->replace.len;
  222. while ( ptr < end )
  223. {
  224. while ( ptr < end && *ptr != '\\' && *ptr != '$' )
  225. *dest++ = *ptr++;
  226. if ( ptr == end )
  227. {
  228. GetNextRegexRepl(elem,start,dest-start,-1);
  229. break;
  230. }
  231. if ( *ptr == ptr[1] )
  232. {
  233. ptr++;
  234. *dest++ = *ptr++;
  235. }
  236. else if ( ptr[1] >= '0' && ptr[1] <= '9' )
  237. {
  238. GetNextRegexRepl(elem,start,dest-start,ptr[1]-'0');
  239. start = dest;
  240. ptr += 2;
  241. }
  242. else if ( ptr[1] == '{' )
  243. {
  244. char *p = ptr+2;
  245. if ( *p >= '0' && *p <= '9' )
  246. {
  247. uint num = *p++ - '0';
  248. if ( *p >= '0' && *p <= '9' )
  249. num = 10*num + *p++ - '0';
  250. if ( p < end && *p == '}' )
  251. {
  252. GetNextRegexRepl(elem,start,dest-start,num);
  253. start = dest;
  254. ptr = p+1;
  255. continue;
  256. }
  257. }
  258. *dest++ = *ptr++;
  259. }
  260. else
  261. *dest++ = *ptr++;
  262. }
  263. if ( dest != EmptyString )
  264. *dest = 0;
  265. elem->replace.len = dest - elem->replace.ptr;
  266. }
  267. PRINT("RE: n=%d, valid=%d\n",re->re_used,re->valid);
  268. exit:
  269. ResetFastBuf(&temp.b);
  270. return err;
  271. }
  272. ///////////////////////////////////////////////////////////////////////////////
  273. int ReplaceRegex
  274. (
  275. Regex_t *re, // valid Regex_t
  276. FastBuf_t *res, // return buffer, cleared
  277. ccp src,
  278. int src_len // -1: use strlen()
  279. )
  280. {
  281. DASSERT(re);
  282. DASSERT(res);
  283. //--- sanity checks
  284. if (!res)
  285. return -ERR_INVALID_DATA;
  286. ClearFastBuf(res);
  287. res->buf[0] = 0;
  288. if ( !re || !re->valid )
  289. return -ERR_INVALID_DATA;
  290. if ( !src || !*src )
  291. return 0;
  292. if ( src_len < 0 )
  293. src_len = strlen(src);
  294. //--- prepare data
  295. enumError err = ERR_OK;
  296. uint count = 0;
  297. struct { FastBuf_t b; char space[500]; } temp;
  298. InitializeFastBuf(&temp,sizeof(temp));
  299. FastBuf_t *dest = res;
  300. #if DCLIB_USE_PCRE
  301. DASSERT(re->regex);
  302. enum { N_MATCH = 100 };
  303. int match[N_MATCH];
  304. #else
  305. enum { N_MATCH = 100 };
  306. regmatch_t match[N_MATCH];
  307. #endif
  308. //--- exec RE
  309. uint e;
  310. RegexElem_t *elem;
  311. for ( e = 0, elem = re->re_list; e < re->re_used; e++, elem++ )
  312. {
  313. uint pos = 0, copied = 0;
  314. while( pos < src_len )
  315. {
  316. memset(match,0,sizeof(match));
  317. #if DCLIB_USE_PCRE
  318. int stat = pcre_exec(re->regex,0,src,src_len,pos,0,match,N_MATCH);
  319. noPRINT("RE: stat=%2d : %2d %2d : %2d %2d : %2d %2d\n",
  320. stat, match[0],match[1], match[2],match[3], match[4],match[5] );
  321. if ( stat < 0 )
  322. {
  323. if ( stat == PCRE_ERROR_NOMATCH )
  324. break;
  325. err = -ERR_ERROR;
  326. goto exit;
  327. }
  328. const uint match_beg = match[0];
  329. const uint match_end = match[1];
  330. #else
  331. int stat = regexec(&elem->regex,src+pos,N_MATCH,match,0);
  332. PRINT("RE: pos=%2d, stat=%2d : %2d %2d : %2d %2d : %2d %2d\n",
  333. pos, stat, match[0].rm_so,match[0].rm_eo,
  334. match[1].rm_so,match[1].rm_eo, match[2].rm_so,match[2].rm_eo );
  335. if (stat)
  336. {
  337. if ( stat == REG_NOMATCH )
  338. break;
  339. err = -ERR_ERROR;
  340. goto exit;
  341. }
  342. const uint match_beg = match[0].rm_so + pos;
  343. const uint match_end = match[0].rm_eo + pos;
  344. #endif
  345. if ( match_beg > copied )
  346. AppendFastBuf(dest,src+copied,match_beg-copied);
  347. copied = match_end;
  348. RegexReplace_t *repl = elem->repl;
  349. RegexReplace_t *repl_end = repl + elem->repl_used;
  350. for ( ; repl < repl_end; repl++ )
  351. {
  352. AppendMemFastBuf(dest,repl->str);
  353. #if DCLIB_USE_PCRE
  354. #error
  355. #else
  356. if ( repl->ref >= 0 && repl->ref < N_MATCH )
  357. {
  358. const regmatch_t *m = match + repl->ref;
  359. if ( m->rm_so >= 0 )
  360. AppendFastBuf( dest,
  361. src + m->rm_so + pos,
  362. m->rm_eo - m->rm_so );
  363. }
  364. #endif
  365. }
  366. pos = match_end + ( match_beg == match_end );
  367. count++;
  368. if (!elem->global)
  369. break;
  370. }
  371. if ( src_len > copied )
  372. AppendFastBuf(dest,src+copied,src_len-copied);
  373. PRINT("DEST: %s\n",GetFastBufString(dest));
  374. src = GetFastBufString(dest);
  375. src_len = GetFastBufLen(dest);
  376. dest = dest == res ? &temp.b : res;
  377. ClearFastBuf(dest);
  378. }
  379. exit:;
  380. AssignFastBuf(res,src,src_len);
  381. return err < 0 ? err : count;
  382. }
  383. //
  384. ///////////////////////////////////////////////////////////////////////////////
  385. /////////////// END ///////////////
  386. ///////////////////////////////////////////////////////////////////////////////