ecmascript.php 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257
  1. <?php
  /**
   * Scanner for ECMAScript-family source (JavaScript and derivatives).
   *
   * This is a rename of the JavaScript scanner.
   * TODO Some of these things are JS specific and should be moved into
   * the new JS scanner.
   *
   * Besides ordinary tokenising, the scanner has to decide whether a '/' starts
   * a regular expression literal or is the division operator, and whether a '<'
   * starts an XML literal or is a comparison operator. Both decisions are made
   * by looking back at the previously recorded token (see is_operand()).
   */
  class LuminousECMAScriptScanner extends LuminousEmbeddedWebScript {

    /** Closing tag that ends a script block when embedded in HTML. */
    public $script_tags = '</script>';

    /**
     * Pattern matching one complete regular expression literal.
     *
     * Regular expressions in JavaScript are delimited by '/', BUT the slash
     * character may appear unescaped within character classes. We can handle
     * this fairly easily with a single regex because the classes do not nest.
     *
     * The literal is also allowed to be terminated by end-of-input so that an
     * unterminated regex still yields a match.
     *
     * The flag set covers the ECMAScript flags (d, g, i, m, s, u, v, y); 'o' and
     * 'x' are retained from the original flag set for backward compatibility.
     *
     * TODO:
     * I do not know if this is specific to Javascript or ECMAScript derivatives
     * as a whole, I also don't know if multi-line regexen are legal (i.e. when
     * the definition spans multiple lines)
     */
    protected $regex_regex = "%
      /
      (?:
        [^\\[\\\\/]+ # not slash, backslash, or [
        | \\\\. # escape char
        |
        (?: # char class [..]
          \\[
          (?:
            [^\\]\\\\]+ # not backslash or ]
            | \\\\. # escape
          )*
          (?: \\] | \$)
        ) # close char class
      )*
      (?: /[dgimosuvxy]* | \$) #delimiter or eof
    %sx";

    /**
     * State of an interrupted child scan: null when no child scan is pending,
     * otherwise array(child scanner key, position at which it stopped).
     */
    private $child_state = null;

    /**
     * Sets up the rule-to-tag map and the dirty-exit recovery patterns, calls
     * the parent constructor, and registers the keyword, function and type
     * identifier lists.
     *
     * @param string|null $src source string, passed straight to the parent
     *                         constructor
     */
    public function __construct($src = null) {
      // Internal rule names that are recorded under a different (or no) tag.
      $this->rule_tag_map = array(
        'COMMENT_SL' => 'COMMENT',
        'SSTRING' => 'STRING',
        'DSTRING' => 'STRING',
        'OPENER' => null,
        'CLOSER' => null,
      );

      // Patterns that consume the remainder of a token of the given type; each
      // one also accepts end-of-input as a terminator.
      $this->dirty_exit_recovery = array(
        'COMMENT_SL' => '/.*/',
        'COMMENT' => '%.*?(\*/|$)%s',
        'SSTRING' => "/(?:[^\\\\']+|\\\\.)*('|$)/",
        'DSTRING' => '/(?:[^\\\\"]+|\\\\.)*("|$)/',
        // FIXME: Anyone using a server-side interruption to build a regex is
        // frankly insane, but we are wrong in the case that they were in a
        // character class when the server language interrupted, and we may
        // exit the regex prematurely with this.
        // The flag set is kept in step with $regex_regex.
        'REGEX' => '%(?:[^\\\\/]+|\\\\.)*(?:/[dgimosuvxy]*|$)%',
      );

      parent::__construct($src);

      $this->add_identifier_mapping('KEYWORD', array(
        'break', 'case', 'catch', 'comment', 'continue', 'do', 'default',
        'delete', 'else', 'export', 'for', 'function', 'if', 'import', 'in',
        'instanceof', 'label', 'new', 'null', 'return', 'switch', 'throw',
        'try', 'typeof', 'var', 'void', 'while', 'with',
        'true', 'false', 'this',
      ));

      $this->add_identifier_mapping('FUNCTION', array(
        '$', 'alert', 'confirm',
        'clearTimeout', 'clearInterval',
        'encodeURI', 'encodeURIComponent', 'eval', 'isFinite', 'isNaN',
        'parseInt', 'parseFloat', 'prompt',
        'setTimeout', 'setInterval',
        'decodeURI', 'decodeURIComponent', 'jQuery',
      ));

      $this->add_identifier_mapping('TYPE', array(
        'Array', 'Boolean', 'Date', 'Error', 'EvalError', 'Infinity', 'Image',
        'Math', 'NaN', 'Number', 'Object', 'Option', 'RangeError',
        'ReferenceError', 'RegExp', 'String', 'SyntaxError', 'TypeError',
        'URIError',
        'document',
        'undefined', 'window',
      ));
    }

    /**
     * Decides whether the next token is expected to be an operand.
     *
     * Walks backwards through $this->tokens, skipping untyped tokens and
     * comments, and inspects the first significant token found.
     *
     * @return bool true if that token is an OPERATOR or OPENER, or if there is
     *              no significant preceding token at all; false otherwise
     */
    public function is_operand() {
      for ($i = count($this->tokens) - 1; $i >= 0; $i--) {
        $tok = $this->tokens[$i][0];
        if ($tok === null || $tok === 'COMMENT' || $tok === 'COMMENT_SL') {
          continue;
        }
        return ($tok === 'OPERATOR' || $tok === 'OPENER');
      }
      // Nothing significant precedes us, so an operand is expected.
      return true;
    }

    /**
     * Registers the token patterns and creates the child scanner used for XML
     * literals.
     */
    public function init() {
      if ($this->embedded_server) {
        $this->add_pattern('STOP_SERVER', $this->server_tags);
      }
      if ($this->embedded_html) {
        $this->add_pattern('STOP_SCRIPT', '%</script>%');
      }

      // Operator characters. '<' needs care when embedded: '<?', '<%' and '</'
      // may open a server block or close the script rather than be operators.
      $op_pattern = '[=!+*%\-&^|~:?\;,.>';
      if (!($this->embedded_server || $this->embedded_html)) {
        $op_pattern .= '<]+';
      } else {
        // build an alternation with a < followed by a lookahead
        $op_pattern .= ']|<(?![';
        // XXX this covers <? and <% but not very well
        if ($this->embedded_server) $op_pattern .= '?%';
        if ($this->embedded_html) $op_pattern .= '/';
        $op_pattern .= '])'; // closes lookahead
        $op_pattern = "(?:$op_pattern)+";
      }
      $op_pattern = "@$op_pattern@";

      $this->add_pattern('IDENT', '/[a-zA-Z_$][_$\w]*/');
      // NOTE: slash is a special case, and </ may be a script close
      $this->add_pattern('OPERATOR', $op_pattern);
      // we care about openers for figuring out where regular expressions are
      $this->add_pattern('OPENER', '/[\[\{\(]+/');
      $this->add_pattern('CLOSER', '/[\]\}\)]+/');

      $this->add_pattern('NUMERIC', LuminousTokenPresets::$NUM_HEX);
      $this->add_pattern('NUMERIC', LuminousTokenPresets::$NUM_REAL);
      $this->add_pattern('SSTRING', LuminousTokenPresets::$SINGLE_STR_SL);
      $this->add_pattern('DSTRING', LuminousTokenPresets::$DOUBLE_STR_SL);
      $this->add_pattern('COMMENT', LuminousTokenPresets::$C_COMMENT_ML);
      $this->add_pattern('COMMENT_SL', LuminousTokenPresets::$C_COMMENT_SL);
      // special case: resolved to REGEX or OPERATOR in main()
      $this->add_pattern('SLASH', '%/%');

      // Child scanner for XML literals; it works on the same source string and
      // inherits our server-embedding settings.
      $xml_scanner = new LuminousHTMLScanner($this->string());
      $xml_scanner->xml_literal = true;
      $xml_scanner->scripts = false;
      $xml_scanner->embedded_server = $this->embedded_server;
      if ($this->embedded_server) {
        $xml_scanner->server_tags = $this->server_tags;
      }
      $xml_scanner->init();
      $xml_scanner->pos($this->pos());
      $this->add_child_scanner('xml', $xml_scanner);
    }

    /**
     * Runs a registered child scanner from the current position, records its
     * tagged output as an 'XML' token and moves our position to where the
     * child stopped.
     *
     * If the child reports an interrupt, the child key and the stop position
     * are stored in $child_state so main() can re-enter the child later;
     * otherwise $child_state is cleared.
     *
     * (c+p from HTML scanner)
     *
     * @param string $lang key under which the child scanner was registered
     */
    public function scan_child($lang) {
      assert(isset($this->child_scanners[$lang]));
      $scanner = $this->child_scanners[$lang];
      $scanner->pos($this->pos());
      $scanner->main();
      $this->record($scanner->tagged(), 'XML', true);
      $this->pos($scanner->pos());
      if ($scanner->interrupt) {
        $this->child_state = array($lang, $this->pos());
      } else {
        $this->child_state = null;
      }
    }

    /**
     * Main scanning loop: tokenises from the current position until the end of
     * the source, or until a server or script break hands control back to the
     * caller.
     */
    public function main() {
      $this->start();
      $this->interrupt = false;

      while (!$this->eos()) {
        $index = $this->pos();
        $tok = null;
        $m = null;

        if (!$this->clean_exit) {
          // The previous run stopped mid-token: try to finish that token.
          try {
            $tok = $this->resume();
          } catch (Exception $e) {
            if (LUMINOUS_DEBUG) throw $e;
            // Give up on recovery and carry on scanning normally.
            $this->clean_exit = true;
            continue;
          }
        }
        elseif ($this->child_state !== null && $this->child_state[1] < $this->pos()) {
          // A child scan was interrupted earlier and we have since moved past
          // the point where it stopped: re-enter the child.
          $this->scan_child($this->child_state[0]);
          continue;
        }
        elseif (($rule = $this->next_match()) !== null) {
          $tok = $rule[0];
          if ($rule[1] > $index) {
            // Record the unmatched text before the token as untyped.
            $this->record(substr($this->string(), $index, $rule[1] - $index), null);
          }
        } else {
          // No further tokens: record the remainder as untyped and finish.
          $this->record(substr($this->string(), $index), null);
          $this->clean_exit = true;
          $this->interrupt = false;
          $this->terminate();
          break;
        }

        if ($tok === 'SLASH') {
          if ($this->is_operand()) {
            // An operand is expected here, so '/' opens a regex literal.
            // Rewind and consume the whole literal.
            $tok = 'REGEX';
            $this->unscan();
            assert($this->peek() === '/');
            $m = $this->scan($this->regex_regex);
            if ($m === null) {
              // Not expected to happen: the pattern accepts end-of-input.
              assert(0);
              $m = $this->rest();
              $this->terminate();
            }
          } else {
            $tok = 'OPERATOR';
          }
        }
        elseif ($tok === 'OPERATOR' && $this->match() === '<') {
          if ($this->is_operand()) {
            // A lone '<' where an operand is expected is treated as the start
            // of an XML literal and handed to the child scanner.
            $this->unscan();
            $this->scan_child('xml');
            continue;
          }
        }
        elseif ($tok === 'STOP_SERVER') {
          // Hand control back, flagging that we were interrupted.
          $this->interrupt = true;
          $this->unscan();
          break;
        }
        elseif ($tok === 'STOP_SCRIPT') {
          $this->unscan();
          break;
        }

        if ($m === null) {
          $m = $this->match();
        }

        if ($this->server_break($tok)) {
          break;
        }
        if ($tok === 'COMMENT_SL' && $this->script_break($tok)) {
          break;
        }

        // Every iteration that reaches this point must have consumed input.
        assert($this->pos() > $index);
        $this->record($m, $tok, false);
      }
    }
  }