internal_generator.go 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328
  1. /*
  2. Copyright 2014 Zachary Klippenstein
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. /*
  14. * Copyright (c) 2023, Psiphon Inc.
  15. * All rights reserved.
  16. *
  17. * This program is free software: you can redistribute it and/or modify
  18. * it under the terms of the GNU General Public License as published by
  19. * the Free Software Foundation, either version 3 of the License, or
  20. * (at your option) any later version.
  21. *
  22. * This program is distributed in the hope that it will be useful,
  23. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  24. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  25. * GNU General Public License for more details.
  26. *
  27. * You should have received a copy of the GNU General Public License
  28. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  29. *
  30. */
  31. package regen
  32. import (
  33. "bytes"
  34. "fmt"
  35. "math"
  36. "regexp/syntax"
  37. )
  38. // generatorFactory is a function that creates a random string generator from a regular expression AST.
  39. type generatorFactory func(regexp *syntax.Regexp, args *GeneratorArgs) (*internalGenerator, error)
  40. // Must be initialized in init() to avoid "initialization loop" compile error.
  41. var generatorFactories map[syntax.Op]generatorFactory
  42. const noBound = -1
  43. func init() {
  44. generatorFactories = map[syntax.Op]generatorFactory{
  45. syntax.OpEmptyMatch: opEmptyMatch,
  46. syntax.OpLiteral: opLiteral,
  47. syntax.OpAnyCharNotNL: opAnyCharNotNl,
  48. syntax.OpAnyChar: opAnyChar,
  49. syntax.OpQuest: opQuest,
  50. syntax.OpStar: opStar,
  51. syntax.OpPlus: opPlus,
  52. syntax.OpRepeat: opRepeat,
  53. syntax.OpCharClass: opCharClass,
  54. syntax.OpConcat: opConcat,
  55. syntax.OpAlternate: opAlternate,
  56. syntax.OpCapture: opCapture,
  57. syntax.OpBeginLine: noop,
  58. syntax.OpEndLine: noop,
  59. syntax.OpBeginText: noop,
  60. syntax.OpEndText: noop,
  61. syntax.OpWordBoundary: noop,
  62. syntax.OpNoWordBoundary: noop,
  63. }
  64. }
  65. type internalGenerator struct {
  66. Name string
  67. GenerateFunc func() ([]byte, error)
  68. }
  69. func (gen *internalGenerator) Generate() (b []byte, err error) {
  70. defer func() {
  71. if r := recover(); r != nil {
  72. err = fmt.Errorf("panicked on bad input: Generate: %v", r)
  73. }
  74. }()
  75. return gen.GenerateFunc()
  76. }
  77. func (gen *internalGenerator) String() string {
  78. return gen.Name
  79. }
  80. // Create a new generator for each expression in regexps.
  81. func newGenerators(regexps []*syntax.Regexp, args *GeneratorArgs) ([]*internalGenerator, error) {
  82. generators := make([]*internalGenerator, len(regexps))
  83. var err error
  84. // create a generator for each alternate pattern
  85. for i, subR := range regexps {
  86. generators[i], err = newGenerator(subR, args)
  87. if err != nil {
  88. return nil, err
  89. }
  90. }
  91. return generators, nil
  92. }
  93. // Create a new generator for r.
  94. func newGenerator(regexp *syntax.Regexp, args *GeneratorArgs) (generator *internalGenerator, err error) {
  95. simplified := regexp.Simplify()
  96. factory, ok := generatorFactories[simplified.Op]
  97. if ok {
  98. return factory(simplified, args)
  99. }
  100. return nil, fmt.Errorf("invalid generator pattern: /%s/ as /%s/\n%s",
  101. regexp, simplified, inspectRegexpToString(simplified))
  102. }
  103. // Generator that does nothing.
  104. func noop(regexp *syntax.Regexp, args *GeneratorArgs) (*internalGenerator, error) {
  105. return &internalGenerator{regexp.String(), func() ([]byte, error) {
  106. return []byte{}, nil
  107. }}, nil
  108. }
  109. func opEmptyMatch(regexp *syntax.Regexp, args *GeneratorArgs) (*internalGenerator, error) {
  110. enforceOp(regexp, syntax.OpEmptyMatch)
  111. return &internalGenerator{regexp.String(), func() ([]byte, error) {
  112. return []byte{}, nil
  113. }}, nil
  114. }
  115. func opLiteral(regexp *syntax.Regexp, args *GeneratorArgs) (*internalGenerator, error) {
  116. enforceOp(regexp, syntax.OpLiteral)
  117. return &internalGenerator{regexp.String(), func() ([]byte, error) {
  118. if args.ByteMode {
  119. return runesToBytes(regexp.Rune...)
  120. } else {
  121. return runesToUTF8(regexp.Rune...), nil
  122. }
  123. }}, nil
  124. }
  125. func opAnyChar(regexp *syntax.Regexp, args *GeneratorArgs) (*internalGenerator, error) {
  126. enforceOp(regexp, syntax.OpAnyChar)
  127. return &internalGenerator{regexp.String(), func() ([]byte, error) {
  128. if args.ByteMode {
  129. return runesToBytes(rune(args.rng.Intn(math.MaxUint8 + 1)))
  130. } else {
  131. return runesToUTF8(rune(args.rng.Int31())), nil
  132. }
  133. }}, nil
  134. }
  135. func opAnyCharNotNl(regexp *syntax.Regexp, args *GeneratorArgs) (*internalGenerator, error) {
  136. enforceOp(regexp, syntax.OpAnyCharNotNL)
  137. var charClass *tCharClass
  138. if args.ByteMode {
  139. charClass = newCharClass(0, rune(math.MaxUint8))
  140. } else {
  141. charClass = newCharClass(1, rune(math.MaxInt32))
  142. }
  143. return createCharClassGenerator(regexp.String(), charClass, args)
  144. }
  145. func opQuest(regexp *syntax.Regexp, args *GeneratorArgs) (*internalGenerator, error) {
  146. enforceOp(regexp, syntax.OpQuest)
  147. return createRepeatingGenerator(regexp, args, 0, 1)
  148. }
  149. func opStar(regexp *syntax.Regexp, args *GeneratorArgs) (*internalGenerator, error) {
  150. enforceOp(regexp, syntax.OpStar)
  151. return createRepeatingGenerator(regexp, args, noBound, noBound)
  152. }
  153. func opPlus(regexp *syntax.Regexp, args *GeneratorArgs) (*internalGenerator, error) {
  154. enforceOp(regexp, syntax.OpPlus)
  155. return createRepeatingGenerator(regexp, args, 1, noBound)
  156. }
  157. func opRepeat(regexp *syntax.Regexp, args *GeneratorArgs) (*internalGenerator, error) {
  158. enforceOp(regexp, syntax.OpRepeat)
  159. return createRepeatingGenerator(regexp, args, regexp.Min, regexp.Max)
  160. }
  161. // Handles syntax.ClassNL because the parser uses that flag to generate character
  162. // classes that respect it.
  163. func opCharClass(regexp *syntax.Regexp, args *GeneratorArgs) (*internalGenerator, error) {
  164. enforceOp(regexp, syntax.OpCharClass)
  165. var charClass *tCharClass
  166. if args.ByteMode {
  167. charClass = parseByteClass(regexp.Rune)
  168. if charClass == nil {
  169. return nil, fmt.Errorf("invalid byte class: /%s/", regexp)
  170. }
  171. } else {
  172. charClass = parseCharClass(regexp.Rune)
  173. }
  174. return createCharClassGenerator(regexp.String(), charClass, args)
  175. }
  176. func opConcat(regexp *syntax.Regexp, genArgs *GeneratorArgs) (*internalGenerator, error) {
  177. enforceOp(regexp, syntax.OpConcat)
  178. generators, err := newGenerators(regexp.Sub, genArgs)
  179. if err != nil {
  180. return nil, generatorError(err, "error creating generators for concat pattern /%s/", regexp)
  181. }
  182. return &internalGenerator{regexp.String(), func() ([]byte, error) {
  183. var result bytes.Buffer
  184. for _, generator := range generators {
  185. gen, err := generator.Generate()
  186. if err != nil {
  187. return nil, err
  188. }
  189. result.Write(gen)
  190. }
  191. return result.Bytes(), nil
  192. }}, nil
  193. }
  194. func opAlternate(regexp *syntax.Regexp, genArgs *GeneratorArgs) (*internalGenerator, error) {
  195. enforceOp(regexp, syntax.OpAlternate)
  196. generators, err := newGenerators(regexp.Sub, genArgs)
  197. if err != nil {
  198. return nil, generatorError(err, "error creating generators for alternate pattern /%s/", regexp)
  199. }
  200. numGens := len(generators)
  201. return &internalGenerator{regexp.String(), func() ([]byte, error) {
  202. i := genArgs.rng.Intn(numGens)
  203. generator := generators[i]
  204. return generator.Generate()
  205. }}, nil
  206. }
  207. func opCapture(regexp *syntax.Regexp, args *GeneratorArgs) (*internalGenerator, error) {
  208. enforceOp(regexp, syntax.OpCapture)
  209. if err := enforceSingleSub(regexp); err != nil {
  210. return nil, err
  211. }
  212. groupRegexp := regexp.Sub[0]
  213. generator, err := newGenerator(groupRegexp, args)
  214. if err != nil {
  215. return nil, err
  216. }
  217. // Group indices are 0-based, but index 0 is the whole expression.
  218. index := regexp.Cap - 1
  219. return &internalGenerator{regexp.String(), func() ([]byte, error) {
  220. return args.CaptureGroupHandler(index, regexp.Name, groupRegexp, generator, args)
  221. }}, nil
  222. }
  223. func defaultCaptureGroupHandler(index int, name string, group *syntax.Regexp, generator Generator, args *GeneratorArgs) ([]byte, error) {
  224. return generator.Generate()
  225. }
  226. // Panic if r.Op != op.
  227. func enforceOp(r *syntax.Regexp, op syntax.Op) {
  228. if r.Op != op {
  229. panic(fmt.Sprintf("invalid Op: expected %s, was %s", opToString(op), opToString(r.Op)))
  230. }
  231. }
  232. // Return an error if r has 0 or more than 1 sub-expression.
  233. func enforceSingleSub(regexp *syntax.Regexp) error {
  234. if len(regexp.Sub) != 1 {
  235. return generatorError(nil,
  236. "%s expected 1 sub-expression, but got %d: %s", opToString(regexp.Op), len(regexp.Sub), regexp)
  237. }
  238. return nil
  239. }
  240. func createCharClassGenerator(name string, charClass *tCharClass, args *GeneratorArgs) (*internalGenerator, error) {
  241. return &internalGenerator{name, func() ([]byte, error) {
  242. i := args.rng.Int31n(charClass.TotalSize)
  243. r := charClass.GetRuneAt(i)
  244. if args.ByteMode {
  245. return runesToBytes(r)
  246. } else {
  247. return runesToUTF8(r), nil
  248. }
  249. }}, nil
  250. }
  251. // Returns a generator that will run the generator for r's sub-expression [min, max] times.
  252. func createRepeatingGenerator(regexp *syntax.Regexp, genArgs *GeneratorArgs, min, max int) (*internalGenerator, error) {
  253. if err := enforceSingleSub(regexp); err != nil {
  254. return nil, err
  255. }
  256. generator, err := newGenerator(regexp.Sub[0], genArgs)
  257. if err != nil {
  258. return nil, generatorError(err, "failed to create generator for subexpression: /%s/", regexp)
  259. }
  260. if min == noBound {
  261. min = int(genArgs.MinUnboundedRepeatCount)
  262. }
  263. if max == noBound {
  264. max = int(genArgs.MaxUnboundedRepeatCount)
  265. }
  266. return &internalGenerator{regexp.String(), func() ([]byte, error) {
  267. n := min + genArgs.rng.Intn(max-min+1)
  268. var result bytes.Buffer
  269. for i := 0; i < n; i++ {
  270. value, err := generator.Generate()
  271. if err != nil {
  272. return nil, err
  273. }
  274. result.Write(value)
  275. }
  276. return result.Bytes(), nil
  277. }}, nil
  278. }