regex_match.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. /**
  2. * @file regex_match.c
  3. * @author Ambroz Bizjak <ambrop7@gmail.com>
  4. *
  5. * @section LICENSE
  6. *
  7. * Redistribution and use in source and binary forms, with or without
  8. * modification, are permitted provided that the following conditions are met:
  9. * 1. Redistributions of source code must retain the above copyright
  10. * notice, this list of conditions and the following disclaimer.
  11. * 2. Redistributions in binary form must reproduce the above copyright
  12. * notice, this list of conditions and the following disclaimer in the
  13. * documentation and/or other materials provided with the distribution.
  14. * 3. Neither the name of the author nor the
  15. * names of its contributors may be used to endorse or promote products
  16. * derived from this software without specific prior written permission.
  17. *
  18. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  19. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  20. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  21. * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
  22. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  23. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  24. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  25. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  27. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28. *
  29. * @section DESCRIPTION
  30. *
  31. * Regular expression matching module.
  32. *
  33. * Synopsis:
  34. * regex_match(string input, string regex)
  35. *
  36. * Variables:
  37. * succeeded - "true" or "false", indicating whether input matched regex
  38. * matchN - for N=0,1,2,..., the matching data for the N-th subexpression
  39. * (match0 = whole match)
  40. *
  41. * Description:
  42. * Matches 'input' with the POSIX extended regular expression 'regex'.
  43. * 'regex' must be a string without null bytes, but 'input' can contain null bytes.
  44. * However, it's difficult, if not impossible, to actually match nulls with the regular
  45. * expression.
  46. * The input and regex strings are interpreted according to the POSIX regex functions
  47. * (regcomp(), regexec()); in particular, the current locale setting affects the
  48. * interpretation.
  49. *
  50. * Synopsis:
  51. * regex_replace(string input, list(string) regex, list(string) replace)
  52. *
  53. * Variables:
  54. * string (empty) - transformed input
  55. *
  56. * Description:
  57. * Replaces matching parts of the input string. Replacement is performed one regular
  58. * expression after another: starting with the input string, for each given regular
  59. * expression, matching substrings of the current string are replaced with the
  60. * corresponding replacement string.
  61. */
  62. #include <stdlib.h>
  63. #include <string.h>
  64. #include <limits.h>
  65. #include <regex.h>
  66. #include <misc/string_begins_with.h>
  67. #include <misc/parse_number.h>
  68. #include <misc/expstring.h>
  69. #include <misc/debug.h>
  70. #include <ncd/NCDModule.h>
  71. #include <generated/blog_channel_ncd_regex_match.h>
  /*
   * Logging shorthand: forwards to NCDModuleInst_Backend_Log() for module
   * instance (i), passing BLOG_CURRENT_CHANNEL as the channel
   * (presumably supplied by the generated blog_channel header - confirm).
   */
  #define ModuleLog(i, ...) NCDModuleInst_Backend_Log((i), BLOG_CURRENT_CHANNEL, __VA_ARGS__)

  /*
   * Number of regmatch_t slots handed to regexec(); also the exclusive upper
   * bound for N in the matchN variables.
   */
  #define MAX_MATCHES 64
  74. struct instance {
  75. NCDModuleInst *i;
  76. char *input;
  77. size_t input_len;
  78. int succeeded;
  79. int num_matches;
  80. regmatch_t matches[MAX_MATCHES];
  81. };
  82. struct replace_instance {
  83. NCDModuleInst *i;
  84. char *output;
  85. size_t output_len;
  86. int output_free;
  87. };
  88. static int regex_replace (const char *input, size_t input_len, const char *regex, const char *replace, size_t replace_len, char **out_output, size_t *out_output_len, NCDModuleInst *i)
  89. {
  90. int res = 0;
  91. // make sure we don't overflow regoff_t
  92. if (input_len > INT_MAX) {
  93. ModuleLog(i, BLOG_ERROR, "string is too long");
  94. goto fail0;
  95. }
  96. // compile regex
  97. regex_t preg;
  98. int ret;
  99. if ((ret = regcomp(&preg, regex, REG_EXTENDED)) != 0) {
  100. ModuleLog(i, BLOG_ERROR, "regcomp failed (error=%d)", ret);
  101. goto fail0;
  102. }
  103. // init output string
  104. ExpString str;
  105. if (!ExpString_Init(&str)) {
  106. ModuleLog(i, BLOG_ERROR, "ExpString_Init failed");
  107. goto fail1;
  108. }
  109. while (1) {
  110. // execute match
  111. regmatch_t matches[MAX_MATCHES];
  112. matches[0].rm_so = 0;
  113. matches[0].rm_eo = input_len;
  114. if (regexec(&preg, input, MAX_MATCHES, matches, REG_STARTEND) != 0) {
  115. break;
  116. }
  117. ASSERT(matches[0].rm_so >= 0)
  118. ASSERT(matches[0].rm_so <= input_len)
  119. ASSERT(matches[0].rm_eo >= matches[0].rm_so)
  120. ASSERT(matches[0].rm_eo <= input_len)
  121. // append data before match
  122. if (!ExpString_AppendBinary(&str, input, matches[0].rm_so)) {
  123. ModuleLog(i, BLOG_ERROR, "ExpString_AppendBinary failed");
  124. goto fail2;
  125. }
  126. // append replace data
  127. if (!ExpString_AppendBinary(&str, replace, replace_len)) {
  128. ModuleLog(i, BLOG_ERROR, "ExpString_AppendBinary failed");
  129. goto fail2;
  130. }
  131. // go on matching the rest
  132. input += matches[0].rm_eo;
  133. input_len -= matches[0].rm_eo;
  134. }
  135. // append remaining data
  136. if (!ExpString_AppendBinary(&str, input, input_len)) {
  137. ModuleLog(i, BLOG_ERROR, "ExpString_AppendBinary failed");
  138. goto fail2;
  139. }
  140. // success
  141. *out_output = ExpString_Get(&str);
  142. *out_output_len = ExpString_Length(&str);
  143. res = 1;
  144. fail2:
  145. if (!res) {
  146. ExpString_Free(&str);
  147. }
  148. fail1:
  149. regfree(&preg);
  150. fail0:
  151. return res;
  152. }
  153. static void func_new (NCDModuleInst *i)
  154. {
  155. // allocate instance
  156. struct instance *o = malloc(sizeof(*o));
  157. if (!o) {
  158. ModuleLog(i, BLOG_ERROR, "failed to allocate instance");
  159. goto fail0;
  160. }
  161. NCDModuleInst_Backend_SetUser(i, o);
  162. // init arguments
  163. o->i = i;
  164. // read arguments
  165. NCDValue *input_arg;
  166. NCDValue *regex_arg;
  167. if (!NCDValue_ListRead(o->i->args, 2, &input_arg, &regex_arg)) {
  168. ModuleLog(o->i, BLOG_ERROR, "wrong arity");
  169. goto fail1;
  170. }
  171. if (NCDValue_Type(input_arg) != NCDVALUE_STRING || !NCDValue_IsStringNoNulls(regex_arg)) {
  172. ModuleLog(o->i, BLOG_ERROR, "wrong type");
  173. goto fail1;
  174. }
  175. o->input = NCDValue_StringValue(input_arg);
  176. o->input_len = NCDValue_StringLength(input_arg);
  177. char *regex = NCDValue_StringValue(regex_arg);
  178. // make sure we don't overflow regoff_t
  179. if (o->input_len > INT_MAX) {
  180. ModuleLog(o->i, BLOG_ERROR, "input string too long");
  181. goto fail1;
  182. }
  183. // compile regex
  184. regex_t preg;
  185. int ret;
  186. if ((ret = regcomp(&preg, regex, REG_EXTENDED)) != 0) {
  187. ModuleLog(o->i, BLOG_ERROR, "regcomp failed (error=%d)", ret);
  188. goto fail1;
  189. }
  190. // execute match
  191. o->matches[0].rm_so = 0;
  192. o->matches[0].rm_eo = o->input_len;
  193. o->succeeded = (regexec(&preg, o->input, MAX_MATCHES, o->matches, REG_STARTEND) == 0);
  194. // free regex
  195. regfree(&preg);
  196. // signal up
  197. NCDModuleInst_Backend_Up(o->i);
  198. return;
  199. fail1:
  200. free(o);
  201. fail0:
  202. NCDModuleInst_Backend_SetError(i);
  203. NCDModuleInst_Backend_Dead(i);
  204. }
  205. static void func_die (void *vo)
  206. {
  207. struct instance *o = vo;
  208. NCDModuleInst *i = o->i;
  209. // free instance
  210. free(o);
  211. NCDModuleInst_Backend_Dead(i);
  212. }
  213. static int func_getvar (void *vo, const char *name, NCDValue *out)
  214. {
  215. struct instance *o = vo;
  216. if (!strcmp(name, "succeeded")) {
  217. if (!NCDValue_InitString(out, (o->succeeded ? "true" : "false"))) {
  218. ModuleLog(o->i, BLOG_ERROR, "NCDValue_InitCopy failed");
  219. return 0;
  220. }
  221. return 1;
  222. }
  223. size_t pos;
  224. uintmax_t n;
  225. if ((pos = string_begins_with(name, "match")) && parse_unsigned_integer(name + pos, &n)) {
  226. if (o->succeeded && n < MAX_MATCHES && o->matches[n].rm_so >= 0) {
  227. regmatch_t *m = &o->matches[n];
  228. ASSERT(m->rm_so <= o->input_len)
  229. ASSERT(m->rm_eo >= m->rm_so)
  230. ASSERT(m->rm_eo <= o->input_len)
  231. size_t len = m->rm_eo - m->rm_so;
  232. if (!NCDValue_InitStringBin(out, o->input + m->rm_so, len)) {
  233. ModuleLog(o->i, BLOG_ERROR, "NCDValue_InitStringBin failed");
  234. return 0;
  235. }
  236. return 1;
  237. }
  238. }
  239. return 0;
  240. }
  241. static void replace_func_new (NCDModuleInst *i)
  242. {
  243. // allocate structure
  244. struct replace_instance *o = malloc(sizeof(*o));
  245. if (!o) {
  246. ModuleLog(i, BLOG_ERROR, "malloc failed");
  247. goto fail0;
  248. }
  249. o->i = i;
  250. NCDModuleInst_Backend_SetUser(i, o);
  251. // read arguments
  252. NCDValue *input_arg;
  253. NCDValue *regex_arg;
  254. NCDValue *replace_arg;
  255. if (!NCDValue_ListRead(i->args, 3, &input_arg, &regex_arg, &replace_arg)) {
  256. ModuleLog(i, BLOG_ERROR, "wrong arity");
  257. goto fail1;
  258. }
  259. if (!NCDValue_IsString(input_arg) || !NCDValue_IsList(regex_arg) || !NCDValue_IsList(replace_arg)) {
  260. ModuleLog(i, BLOG_ERROR, "wrong type");
  261. goto fail1;
  262. }
  263. // check number of regex/replace
  264. if (NCDValue_ListCount(regex_arg) != NCDValue_ListCount(replace_arg)) {
  265. ModuleLog(i, BLOG_ERROR, "number of regex's is not the same as number of replacements");
  266. goto fail1;
  267. }
  268. // start with input as current text
  269. char *current = NCDValue_StringValue(input_arg);
  270. size_t current_len = NCDValue_StringLength(input_arg);
  271. int current_free = 0;
  272. NCDValue *regex = NCDValue_ListFirst(regex_arg);
  273. NCDValue *replace = NCDValue_ListFirst(replace_arg);
  274. while (regex) {
  275. // check type of regex and replace
  276. if (!NCDValue_IsStringNoNulls(regex) || !NCDValue_IsString(replace)) {
  277. ModuleLog(i, BLOG_ERROR, "regex/replace element has wrong type");
  278. goto fail2;
  279. }
  280. // perform the replacing
  281. char *replaced;
  282. size_t replaced_len;
  283. if (!regex_replace(current, current_len, NCDValue_StringValue(regex), NCDValue_StringValue(replace), NCDValue_StringLength(replace), &replaced, &replaced_len, i)) {
  284. goto fail2;
  285. }
  286. // update current text
  287. if (current_free) {
  288. free(current);
  289. }
  290. current = replaced;
  291. current_len = replaced_len;
  292. current_free = 1;
  293. regex = NCDValue_ListNext(regex_arg, regex);
  294. replace = NCDValue_ListNext(replace_arg, replace);
  295. }
  296. // set output
  297. o->output = current;
  298. o->output_len = current_len;
  299. o->output_free = current_free;
  300. // signal up
  301. NCDModuleInst_Backend_Up(o->i);
  302. return;
  303. fail2:
  304. if (current_free) {
  305. free(current);
  306. }
  307. fail1:
  308. free(o);
  309. fail0:
  310. NCDModuleInst_Backend_SetError(i);
  311. NCDModuleInst_Backend_Dead(i);
  312. }
  313. static void replace_func_die (void *vo)
  314. {
  315. struct replace_instance *o = vo;
  316. NCDModuleInst *i = o->i;
  317. // free output
  318. if (o->output_free) {
  319. free(o->output);
  320. }
  321. // free instance
  322. free(o);
  323. NCDModuleInst_Backend_Dead(i);
  324. }
  325. static int replace_func_getvar (void *vo, const char *name, NCDValue *out)
  326. {
  327. struct replace_instance *o = vo;
  328. if (!strcmp(name, "")) {
  329. if (!NCDValue_InitStringBin(out, o->output, o->output_len)) {
  330. ModuleLog(o->i, BLOG_ERROR, "NCDValue_InitStringBin failed");
  331. return 0;
  332. }
  333. return 1;
  334. }
  335. return 0;
  336. }
  337. static const struct NCDModule modules[] = {
  338. {
  339. .type = "regex_match",
  340. .func_new = func_new,
  341. .func_die = func_die,
  342. .func_getvar = func_getvar
  343. }, {
  344. .type = "regex_replace",
  345. .func_new = replace_func_new,
  346. .func_die = replace_func_die,
  347. .func_getvar = replace_func_getvar
  348. }, {
  349. .type = NULL
  350. }
  351. };
  352. const struct NCDModuleGroup ncdmodule_regex_match = {
  353. .modules = modules
  354. };