regex2dfa.cc 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. #include <fst/fstlib.h>
  2. #include <fst/script/fstscript.h>
  3. #include "re2/re2.h"
  4. #include "re2/regexp.h"
  5. #include "re2/prog.h"
  6. namespace regex2dfa {
  7. static std::map< std::string, uint32_t > state_map;
  8. static uint32_t state_counter = 0;
  9. bool AttFstFromRegex(const std::string & regex, std::string * dfa) {
  10. // specify compile flags for re2
  11. re2::Regexp::ParseFlags re_flags;
  12. re_flags = re2::Regexp::MatchNL;
  13. re_flags = re_flags | re2::Regexp::OneLine;
  14. re_flags = re_flags | re2::Regexp::PerlClasses;
  15. re_flags = re_flags | re2::Regexp::PerlB;
  16. re_flags = re_flags | re2::Regexp::PerlX;
  17. re_flags = re_flags | re2::Regexp::Latin1;
  18. re2::RegexpStatus status;
  19. // compile regex to DFA
  20. re2::Regexp* re = NULL;
  21. re2::Prog* prog = NULL;
  22. try {
  23. RE2::Options opt;
  24. re2::Regexp* re = re2::Regexp::Parse( regex, re_flags, &status );
  25. if (re!=NULL) {
  26. re2::Prog* prog = re->CompileToProg( opt.max_mem() );
  27. if (prog!=NULL) {
  28. (*dfa) = prog->PrintEntireDFA( re2::Prog::kFullMatch );
  29. }
  30. }
  31. } catch (int e) {
  32. return false;
  33. }
  34. if ((*dfa)=="") {
  35. return false;
  36. }
  37. // cleanup
  38. if (prog!=NULL)
  39. delete prog;
  40. if (re!=NULL)
  41. re->Decref();
  42. return true;
  43. }
  44. std::vector<std::string> tokenize(const std::string & line,
  45. const char & delim) {
  46. std::vector<std::string> retval;
  47. std::istringstream iss(line);
  48. std::string fragment;
  49. while(std::getline(iss, fragment, delim)) {
  50. retval.push_back(fragment);
  51. }
  52. return retval;
  53. }
  54. bool StateExists(std::string state_label) {
  55. return (state_map.find(state_label) != state_map.end());
  56. }
  57. uint32_t AddState(std::string state_label) {
  58. state_map.insert(std::pair<std::string, uint32_t>(state_label, state_counter++));
  59. return state_counter;
  60. }
  61. uint32_t StateLookup(std::string state_label) {
  62. return state_map.at(state_label);
  63. }
  64. bool CreateFst(const std::string & str_dfa,
  65. fst::script::FstClass * input_fst) {
  66. fst::StdVectorFst fst;
  67. bool startStateIsntSet = true;
  68. std::string line;
  69. std::istringstream my_str_stream(str_dfa);
  70. while ( getline (my_str_stream,line) ) {
  71. if (line.empty()) {
  72. break;
  73. }
  74. std::vector<std::string> split_vec = tokenize(line, ' ');
  75. if (4 == split_vec.size()) {
  76. if(!StateExists(split_vec.at(0))) {
  77. fst.AddState();
  78. AddState(split_vec.at(0));
  79. }
  80. if(!StateExists(split_vec.at(1))) {
  81. fst.AddState();
  82. AddState(split_vec.at(1));
  83. }
  84. fst.AddArc(StateLookup(split_vec.at(0)),
  85. fst::StdArc(atoi(split_vec.at(2).c_str()),
  86. atoi(split_vec.at(3).c_str()),
  87. 0,
  88. StateLookup(split_vec.at(1))));
  89. } else if (1 == split_vec.size()) {
  90. if(!StateExists(split_vec.at(0))) {
  91. fst.AddState();
  92. }
  93. uint32_t final_state = StateLookup(split_vec.at(0));
  94. fst.SetFinal(final_state, 0);
  95. }
  96. }
  97. fst.SetStart(0);
  98. *input_fst = static_cast<fst::script::FstClass>(fst);
  99. return true;
  100. }
  101. bool FormatFst(const std::string & str_dfa,
  102. std::string * formatted_dfa) {
  103. std::string & retval = (*formatted_dfa);
  104. std::string line;
  105. std::istringstream my_str_stream(str_dfa);
  106. while ( getline (my_str_stream,line) ) {
  107. if (line.empty()) {
  108. break;
  109. }
  110. std::vector<std::string> split_vec = tokenize(line, '\t');
  111. if (4 == split_vec.size()) {
  112. retval += split_vec.at(0);
  113. retval += "\t" + split_vec.at(1);
  114. retval += "\t" + split_vec.at(2);
  115. retval += "\t" + split_vec.at(2);
  116. retval += "\n";
  117. } else if (2 == split_vec.size()) {
  118. retval += split_vec.at(0);
  119. retval += "\n";
  120. }
  121. }
  122. return true;
  123. }
  124. bool AttFstMinimize(const std::string & str_dfa,
  125. std::string * minimized_dfa) {
  126. fst::script::FstClass * fst = new fst::script::FstClass();
  127. CreateFst(str_dfa, fst);
  128. fst::script::MutableFstClass * mutable_fst
  129. = static_cast<fst::script::MutableFstClass*>(fst);
  130. fst::script::Minimize(mutable_fst);
  131. std::ostringstream ostrm;
  132. fst::script::PrintFst(*fst, ostrm, "", NULL, NULL, NULL, true, true);
  133. FormatFst(ostrm.str(), minimized_dfa);
  134. delete fst;
  135. return true;
  136. }
  137. bool Regex2Dfa(const std::string & regex,
  138. std::string * minimized_dfa) {
  139. state_counter = 0;
  140. state_map.clear();
  141. bool success = false;
  142. std::string dfa;
  143. bool compile_success = AttFstFromRegex(regex, &dfa);
  144. if (compile_success) {
  145. bool minimize_success = AttFstMinimize(dfa, minimized_dfa);
  146. success = true;
  147. }
  148. return success;
  149. }
  150. } // namespace regex2dfa
  151. extern "C" {
  152. int _regex2dfa(const char* input_regex, uint32_t input_regex_len, char **out, size_t *sz) {
  153. std::string input_regex_str(input_regex, input_regex_len);
  154. std::string dfa;
  155. bool success = regex2dfa::Regex2Dfa(input_regex_str, &dfa);
  156. if (!success) {
  157. return 1;
  158. }
  159. *sz = dfa.size();
  160. *out = (char*)malloc(*sz);
  161. memmove(*out, dfa.c_str(), *sz);
  162. return 0;
  163. }
  164. }