char_class.go 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168
  1. /*
  2. Copyright 2014 Zachary Klippenstein
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. /*
  14. * Copyright (c) 2023, Psiphon Inc.
  15. * All rights reserved.
  16. *
  17. * This program is free software: you can redistribute it and/or modify
  18. * it under the terms of the GNU General Public License as published by
  19. * the Free Software Foundation, either version 3 of the License, or
  20. * (at your option) any later version.
  21. *
  22. * This program is distributed in the hope that it will be useful,
  23. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  24. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  25. * GNU General Public License for more details.
  26. *
  27. * You should have received a copy of the GNU General Public License
  28. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  29. *
  30. */
  31. package regen
  32. import (
  33. "fmt"
  34. "math"
  35. )
  36. // CharClass represents a regular expression character class as a list of ranges.
  37. // The runes contained in the class can be accessed by index.
  38. type tCharClass struct {
  39. Ranges []tCharClassRange
  40. TotalSize int32
  41. }
  42. // CharClassRange represents a single range of characters in a character class.
  43. type tCharClassRange struct {
  44. Start rune
  45. Size int32
  46. }
  47. // NewCharClass creates a character class with a single range.
  48. func newCharClass(start rune, end rune) *tCharClass {
  49. charRange := newCharClassRange(start, end)
  50. return &tCharClass{
  51. Ranges: []tCharClassRange{charRange},
  52. TotalSize: charRange.Size,
  53. }
  54. }
  55. /*
  56. ParseCharClass parses a character class as represented by syntax.Parse into a slice of CharClassRange structs.
  57. Char classes are encoded as pairs of runes representing ranges:
  58. [0-9] = 09, [a0] = aa00 (2 1-len ranges).
  59. e.g.
  60. "[a0-9]" -> "aa09" -> a, 0-9
  61. "[^a-z]" -> "…" -> 0-(a-1), (z+1)-(max rune)
  62. */
  63. func parseCharClass(runes []rune) *tCharClass {
  64. var totalSize int32
  65. numRanges := len(runes) / 2
  66. ranges := make([]tCharClassRange, numRanges)
  67. for i := 0; i < numRanges; i++ {
  68. start := runes[i*2]
  69. end := runes[i*2+1]
  70. // indicates a negative class
  71. if start == 0 {
  72. // doesn't make sense to generate null bytes, so all ranges must start at
  73. // no less than 1.
  74. start = 1
  75. }
  76. r := newCharClassRange(start, end)
  77. ranges[i] = r
  78. totalSize += r.Size
  79. }
  80. return &tCharClass{ranges, totalSize}
  81. }
  82. // parseByteClass parses character classes only for byte values (0-255).
  83. // Returns nil if runes does not contain any byte values.
  84. //
  85. // Note:
  86. // If an end range is greater than 255, it is truncated to 255.
  87. func parseByteClass(runes []rune) *tCharClass {
  88. var totalSize int32
  89. var ranges []tCharClassRange
  90. for i := 0; i < len(runes)-1; i += 2 {
  91. start := runes[i]
  92. end := runes[i+1]
  93. var r tCharClassRange
  94. if start <= math.MaxUint8 {
  95. if end > math.MaxUint8 {
  96. end = math.MaxUint8
  97. }
  98. r = newCharClassRange(start, end)
  99. ranges = append(ranges, r)
  100. totalSize += r.Size
  101. }
  102. }
  103. if len(ranges) == 0 {
  104. return nil
  105. }
  106. return &tCharClass{ranges, totalSize}
  107. }
  108. // GetRuneAt gets a rune from CharClass as a contiguous array of runes.
  109. func (class *tCharClass) GetRuneAt(i int32) rune {
  110. for _, r := range class.Ranges {
  111. if i < r.Size {
  112. return r.Start + rune(i)
  113. }
  114. i -= r.Size
  115. }
  116. panic("index out of bounds")
  117. }
  118. func (class *tCharClass) String() string {
  119. return fmt.Sprintf("%s", class.Ranges)
  120. }
  121. func newCharClassRange(start rune, end rune) tCharClassRange {
  122. size := end - start + 1
  123. if size < 1 {
  124. panic("char class range size must be at least 1")
  125. }
  126. return tCharClassRange{
  127. Start: start,
  128. Size: size,
  129. }
  130. }
  131. func (r tCharClassRange) String() string {
  132. if r.Size == 1 {
  133. return fmt.Sprintf("%s:1", runesToUTF8(r.Start))
  134. }
  135. return fmt.Sprintf("%s-%s:%d", runesToUTF8(r.Start), runesToUTF8(r.Start+rune(r.Size-1)), r.Size)
  136. }