regex_match.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380
  1. /**
  2. * @file regex_match.c
  3. * @author Ambroz Bizjak <ambrop7@gmail.com>
  4. *
  5. * @section LICENSE
  6. *
  7. * Redistribution and use in source and binary forms, with or without
  8. * modification, are permitted provided that the following conditions are met:
  9. * 1. Redistributions of source code must retain the above copyright
  10. * notice, this list of conditions and the following disclaimer.
  11. * 2. Redistributions in binary form must reproduce the above copyright
  12. * notice, this list of conditions and the following disclaimer in the
  13. * documentation and/or other materials provided with the distribution.
  14. * 3. Neither the name of the author nor the
  15. * names of its contributors may be used to endorse or promote products
  16. * derived from this software without specific prior written permission.
  17. *
  18. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  19. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  20. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  21. * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
  22. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  23. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  24. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  25. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  27. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28. *
  29. * @section DESCRIPTION
  30. *
  31. * Regular expression matching module.
  32. *
  33. * Synopsis:
  34. * regex_match(string input, string regex)
  35. *
  36. * Variables:
  37. * succeeded - "true" or "false", indicating whether input matched regex
  38. * matchN - for N=0,1,2,..., the matching data for the N-th subexpression
  39. * (match0 = whole match)
  40. *
  41. * Description:
  42. * Matches 'input' with the POSIX extended regular expression 'regex'.
  43. * 'regex' must be a string without null bytes, but 'input' can contain null bytes.
  44. * However, it's difficult, if not impossible, to actually match nulls with the regular
  45. * expression.
  46. * The input and regex strings are interpreted according to the POSIX regex functions
  47. * (regcomp(), regexec()); in particular, the current locale setting affects the
  48. * interpretation.
  49. *
  50. * Synopsis:
  51. * regex_replace(string input, list(string) regex, list(string) replace)
  52. *
  53. * Variables:
  54. * string (empty) - transformed input
  55. *
  56. * Description:
  57. * Replaces matching parts of a string. Replacement is performed by repeatedly matching
  58. * the remaining part of the string with all regular expressions. On each step, out of
  59. * all regular expressions that match the remainder of the string, the one whose match
  60. * starts at the least position wins, and the matching part is replaced with the
  61. * replacement string corresponding to this regular expression. The process continues
  62. * from the end of the just-replaced portion until no more regular expressions match.
  63. * If multiple regular expressions match at the least position, the one that appears
  64. * first in the 'regex' argument wins.
  65. */
  66. #include <stdlib.h>
  67. #include <string.h>
  68. #include <limits.h>
  69. #include <regex.h>
  70. #include <misc/string_begins_with.h>
  71. #include <misc/parse_number.h>
  72. #include <misc/expstring.h>
  73. #include <misc/debug.h>
  74. #include <misc/balloc.h>
  75. #include <ncd/NCDModule.h>
  76. #include <generated/blog_channel_ncd_regex_match.h>
  // Logging shorthand: forwards to NCDModuleInst_Backend_Log for module instance `i`,
  // always using BLOG_CURRENT_CHANNEL as the channel.
  77. #define ModuleLog(i, ...) NCDModuleInst_Backend_Log((i), BLOG_CURRENT_CHANNEL, __VA_ARGS__)
  // Capacity of instance.matches and the nmatch value passed to regexec() in func_new;
  // func_getvar only serves "matchN" for N below this value.
  78. #define MAX_MATCHES 64
  // Per-statement state of regex_match(); filled in by func_new and read by func_getvar.
  79. struct instance {
  80. NCDModuleInst *i; // module instance handle, stored by func_new and used for logging
  81. const char *input; // pointer returned by NCDVal_StringData for the input argument (stored, not copied)
  82. size_t input_len; // length of the input in bytes; func_new rejects values above INT_MAX
  83. int succeeded; // nonzero iff regexec() returned 0 in func_new
  84. int num_matches; // NOTE(review): not read or written by any code visible in this file
  85. regmatch_t matches[MAX_MATCHES]; // match offsets written by regexec(); entry N backs variable "matchN"
  86. };
  // Per-statement state of regex_replace(); filled in by replace_func_new.
  87. struct replace_instance {
  88. NCDModuleInst *i; // module instance handle, stored by replace_func_new
  89. char *output; // transformed string obtained from ExpString_Get; released with BFree in replace_func_die
  90. size_t output_len; // length of output in bytes, as reported by ExpString_Length
  91. };
  92. static void func_new (void *vo, NCDModuleInst *i, const struct NCDModuleInst_new_params *params)
  93. {
  94. struct instance *o = vo;
  95. o->i = i;
  96. // read arguments
  97. NCDValRef input_arg;
  98. NCDValRef regex_arg;
  99. if (!NCDVal_ListRead(params->args, 2, &input_arg, &regex_arg)) {
  100. ModuleLog(o->i, BLOG_ERROR, "wrong arity");
  101. goto fail0;
  102. }
  103. if (!NCDVal_IsString(input_arg) || !NCDVal_IsStringNoNulls(regex_arg)) {
  104. ModuleLog(o->i, BLOG_ERROR, "wrong type");
  105. goto fail0;
  106. }
  107. o->input = NCDVal_StringData(input_arg);
  108. o->input_len = NCDVal_StringLength(input_arg);
  109. // make sure we don't overflow regoff_t
  110. if (o->input_len > INT_MAX) {
  111. ModuleLog(o->i, BLOG_ERROR, "input string too long");
  112. goto fail0;
  113. }
  114. // null terminate regex
  115. NCDValNullTermString regex_nts;
  116. if (!NCDVal_StringNullTerminate(regex_arg, &regex_nts)) {
  117. ModuleLog(i, BLOG_ERROR, "NCDVal_StringNullTerminate failed");
  118. goto fail0;
  119. }
  120. // compile regex
  121. regex_t preg;
  122. int ret = regcomp(&preg, regex_nts.data, REG_EXTENDED);
  123. NCDValNullTermString_Free(&regex_nts);
  124. if (ret != 0) {
  125. ModuleLog(o->i, BLOG_ERROR, "regcomp failed (error=%d)", ret);
  126. goto fail0;
  127. }
  128. // execute match
  129. o->matches[0].rm_so = 0;
  130. o->matches[0].rm_eo = o->input_len;
  131. o->succeeded = (regexec(&preg, o->input, MAX_MATCHES, o->matches, REG_STARTEND) == 0);
  132. // free regex
  133. regfree(&preg);
  134. // signal up
  135. NCDModuleInst_Backend_Up(o->i);
  136. return;
  137. fail0:
  138. NCDModuleInst_Backend_SetError(i);
  139. NCDModuleInst_Backend_Dead(i);
  140. }
  141. static int func_getvar (void *vo, const char *name, NCDValMem *mem, NCDValRef *out)
  142. {
  143. struct instance *o = vo;
  144. if (!strcmp(name, "succeeded")) {
  145. const char *str = o->succeeded ? "true" : "false";
  146. *out = NCDVal_NewString(mem, str);
  147. if (NCDVal_IsInvalid(*out)) {
  148. ModuleLog(o->i, BLOG_ERROR, "NCDVal_NewString failed");
  149. }
  150. return 1;
  151. }
  152. size_t pos;
  153. uintmax_t n;
  154. if ((pos = string_begins_with(name, "match")) && parse_unsigned_integer(name + pos, &n)) {
  155. if (o->succeeded && n < MAX_MATCHES && o->matches[n].rm_so >= 0) {
  156. regmatch_t *m = &o->matches[n];
  157. ASSERT(m->rm_so <= o->input_len)
  158. ASSERT(m->rm_eo >= m->rm_so)
  159. ASSERT(m->rm_eo <= o->input_len)
  160. size_t len = m->rm_eo - m->rm_so;
  161. *out = NCDVal_NewStringBin(mem, (uint8_t *)o->input + m->rm_so, len);
  162. if (NCDVal_IsInvalid(*out)) {
  163. ModuleLog(o->i, BLOG_ERROR, "NCDVal_NewStringBin failed");
  164. }
  165. return 1;
  166. }
  167. }
  168. return 0;
  169. }
  170. static void replace_func_new (void *vo, NCDModuleInst *i, const struct NCDModuleInst_new_params *params)
  171. {
  172. struct replace_instance *o = vo;
  173. o->i = i;
  174. // read arguments
  175. NCDValRef input_arg;
  176. NCDValRef regex_arg;
  177. NCDValRef replace_arg;
  178. if (!NCDVal_ListRead(params->args, 3, &input_arg, &regex_arg, &replace_arg)) {
  179. ModuleLog(i, BLOG_ERROR, "wrong arity");
  180. goto fail1;
  181. }
  182. if (!NCDVal_IsString(input_arg) || !NCDVal_IsList(regex_arg) || !NCDVal_IsList(replace_arg)) {
  183. ModuleLog(i, BLOG_ERROR, "wrong type");
  184. goto fail1;
  185. }
  186. // check number of regex/replace
  187. if (NCDVal_ListCount(regex_arg) != NCDVal_ListCount(replace_arg)) {
  188. ModuleLog(i, BLOG_ERROR, "number of regex's is not the same as number of replacements");
  189. goto fail1;
  190. }
  191. size_t num_regex = NCDVal_ListCount(regex_arg);
  192. // allocate array for compiled regex's
  193. regex_t *regs = BAllocArray(num_regex, sizeof(regs[0]));
  194. if (!regs) {
  195. ModuleLog(i, BLOG_ERROR, "BAllocArray failed");
  196. goto fail1;
  197. }
  198. size_t num_done_regex = 0;
  199. // compile regex's, check arguments
  200. while (num_done_regex < num_regex) {
  201. NCDValRef regex = NCDVal_ListGet(regex_arg, num_done_regex);
  202. NCDValRef replace = NCDVal_ListGet(replace_arg, num_done_regex);
  203. if (!NCDVal_IsStringNoNulls(regex) || !NCDVal_IsString(replace)) {
  204. ModuleLog(i, BLOG_ERROR, "wrong regex/replace type for pair %zu", num_done_regex);
  205. goto fail2;
  206. }
  207. // null terminate regex
  208. NCDValNullTermString regex_nts;
  209. if (!NCDVal_StringNullTerminate(regex, &regex_nts)) {
  210. ModuleLog(i, BLOG_ERROR, "NCDVal_StringNullTerminate failed");
  211. goto fail2;
  212. }
  213. int res = regcomp(&regs[num_done_regex], regex_nts.data, REG_EXTENDED);
  214. NCDValNullTermString_Free(&regex_nts);
  215. if (res != 0) {
  216. ModuleLog(i, BLOG_ERROR, "regcomp failed for pair %zu (error=%d)", num_done_regex, res);
  217. goto fail2;
  218. }
  219. num_done_regex++;
  220. }
  221. // init output string
  222. ExpString out;
  223. if (!ExpString_Init(&out)) {
  224. ModuleLog(i, BLOG_ERROR, "ExpString_Init failed");
  225. goto fail2;
  226. }
  227. // input state
  228. const char *in = NCDVal_StringData(input_arg);
  229. size_t in_pos = 0;
  230. size_t in_len = NCDVal_StringLength(input_arg);
  231. // process input
  232. while (in_pos < in_len) {
  233. // find first match
  234. int have_match = 0;
  235. size_t match_regex;
  236. regmatch_t match = {0, 0}; // to remove warning
  237. for (size_t j = 0; j < num_regex; j++) {
  238. regmatch_t this_match;
  239. this_match.rm_so = 0;
  240. this_match.rm_eo = in_len - in_pos;
  241. if (regexec(&regs[j], in + in_pos, 1, &this_match, REG_STARTEND) == 0 && (!have_match || this_match.rm_so < match.rm_so)) {
  242. have_match = 1;
  243. match_regex = j;
  244. match = this_match;
  245. }
  246. }
  247. // if no match, append remaining data and finish
  248. if (!have_match) {
  249. if (!ExpString_AppendBinary(&out, (const uint8_t *)in + in_pos, in_len - in_pos)) {
  250. ModuleLog(i, BLOG_ERROR, "ExpString_AppendBinary failed");
  251. goto fail3;
  252. }
  253. break;
  254. }
  255. // append data before match
  256. if (!ExpString_AppendBinary(&out, (const uint8_t *)in + in_pos, match.rm_so)) {
  257. ModuleLog(i, BLOG_ERROR, "ExpString_AppendBinary failed");
  258. goto fail3;
  259. }
  260. // append replacement data
  261. NCDValRef replace = NCDVal_ListGet(replace_arg, match_regex);
  262. if (!ExpString_AppendBinary(&out, (const uint8_t *)NCDVal_StringData(replace), NCDVal_StringLength(replace))) {
  263. ModuleLog(i, BLOG_ERROR, "ExpString_AppendBinary failed");
  264. goto fail3;
  265. }
  266. in_pos += match.rm_eo;
  267. }
  268. // set output
  269. o->output = ExpString_Get(&out);
  270. o->output_len = ExpString_Length(&out);
  271. // free compiled regex's
  272. while (num_done_regex-- > 0) {
  273. regfree(&regs[num_done_regex]);
  274. }
  275. // free array
  276. BFree(regs);
  277. // signal up
  278. NCDModuleInst_Backend_Up(i);
  279. return;
  280. fail3:
  281. ExpString_Free(&out);
  282. fail2:
  283. while (num_done_regex-- > 0) {
  284. regfree(&regs[num_done_regex]);
  285. }
  286. BFree(regs);
  287. fail1:
  288. NCDModuleInst_Backend_SetError(i);
  289. NCDModuleInst_Backend_Dead(i);
  290. }
  291. static void replace_func_die (void *vo)
  292. {
  293. struct replace_instance *o = vo;
  294. // free output
  295. BFree(o->output);
  296. NCDModuleInst_Backend_Dead(o->i);
  297. }
  298. static int replace_func_getvar (void *vo, const char *name, NCDValMem *mem, NCDValRef *out)
  299. {
  300. struct replace_instance *o = vo;
  301. if (!strcmp(name, "")) {
  302. *out = NCDVal_NewStringBin(mem, (uint8_t *)o->output, o->output_len);
  303. if (NCDVal_IsInvalid(*out)) {
  304. ModuleLog(o->i, BLOG_ERROR, "NCDVal_NewStringBin failed");
  305. }
  306. return 1;
  307. }
  308. return 0;
  309. }
  // Table of the statement types implemented in this file, with their callbacks
  // and the size of the per-instance state each one needs.
  310. static struct NCDModule modules[] = {
  311. {
  312. .type = "regex_match",
  313. .func_new2 = func_new,
  314. .func_getvar = func_getvar,
  315. .alloc_size = sizeof(struct instance) // no func_die is registered for this type
  316. }, {
  317. .type = "regex_replace",
  318. .func_new2 = replace_func_new,
  319. .func_die = replace_func_die, // frees the output buffer held by the instance
  320. .func_getvar = replace_func_getvar,
  321. .alloc_size = sizeof(struct replace_instance)
  322. }, {
  323. .type = NULL // NOTE(review): presumably the end-of-table marker - confirm against NCDModule.h
  324. }
  325. };
  // Module group object with external linkage; its .modules member points at the
  // modules table defined in this file.
  326. const struct NCDModuleGroup ncdmodule_regex_match = {
  327. .modules = modules
  328. };