| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257 |
- <?php
/**
 * Scanner for ECMAScript-family source. This class is a rename of the
 * original JavaScript scanner.
 *
 * Besides the usual identifier/operator/string/comment rules it handles two
 * context-sensitive constructs, both decided by is_operand():
 *   - a '/' in operand position starts a regular expression literal,
 *     otherwise it is an operator;
 *   - a lone '<' in operand position hands control to an XML child scanner.
 *
 * TODO: some of the behaviour here is JavaScript-specific and should be
 * moved into the new JS scanner.
 */
class LuminousECMAScriptScanner extends LuminousEmbeddedWebScript {

  // Closing script tag. It is not read anywhere in this class; presumably
  // the parent class consumes it (TODO confirm).
  public $script_tags = '</script>';

  // Regular expressions in JavaScript are delimited by '/', BUT the slash
  // character may appear unescaped within character classes. We can handle
  // this fairly easily with a single regex because the classes do not nest.
  // The pattern also accepts end-of-input in place of the closing ']' or the
  // closing delimiter, so an unterminated literal still matches.
  // The pattern uses the 'x' modifier, so the layout whitespace and the
  // '#' comments inside the string are not part of what is matched.
  // TODO:
  // I do not know if this is specific to Javascript or ECMAScript derivatives
  // as a whole, I also don't know if multi-line regexen are legal (i.e. when
  // the definition spans multiple lines)
  protected $regex_regex = "%
    /
    (?:
      [^\\[\\\\/]+ # not slash, backslash, or [
      | \\\\. # escape char
      |
      (?: # char class [..]
        \\[
        (?:
          [^\\]\\\\]+ # not slash or ]
          | \\\\. # escape
        )*
        (?: \\] | \$)
      ) # close char class
    )*
    (?: /[iogmx]* | \$) #delimiter or eof
  %sx";

  // logs a persistent token stream so that we can lookbehind to figure out
  // operators vs regexes.
  // NOTE(review): nothing in this class reads or writes this property;
  // is_operand() looks at $this->tokens (no trailing underscore) instead.
  private $tokens_ = array();

  // Set by scan_child(): array($lang, $pos) when the child scanner stopped
  // with its interrupt flag set, null otherwise.
  private $child_state = null;

  /**
   * Configures the rule/tag map and dirty-exit recovery patterns, invokes the
   * parent constructor, then registers the keyword, function and type
   * identifier lists.
   *
   * @param mixed $src forwarded unchanged to the parent constructor
   */
  public function __construct($src=null) {

    $this->rule_tag_map = array(
      'COMMENT_SL' => 'COMMENT',
      'SSTRING' => 'STRING',
      'DSTRING' => 'STRING',
      'OPENER' => null,
      'CLOSER' => null,
    );
    // Patterns keyed by token name. Each one consumes the remainder of a
    // token of that kind (up to its terminator or end of input).
    $this->dirty_exit_recovery = array(
      'COMMENT_SL' => '/.*/',
      'COMMENT' => '%.*?(\*/|$)%s',
      'SSTRING' => "/(?:[^\\\\']+|\\\\.)*('|$)/",
      'DSTRING' => '/(?:[^\\\\"]+|\\\\.)*("|$)/',
      // FIXME: Anyone using a server-side interruption to build a regex is
      // frankly insane, but we are wrong in the case that they were in a
      // character class when the server language interrupted, and we may
      // exit the regex prematurely with this
      'REGEX' => '%(?:[^\\\\/]+|\\\\.)*(?:/[iogmx]*|$)%',
    );

    parent::__construct($src);

    $this->add_identifier_mapping('KEYWORD', array('break', 'case', 'catch',
      'comment', 'continue', 'do', 'default', 'delete', 'else', 'export',
      'for', 'function', 'if', 'import', 'in', 'instanceof', 'label', 'new',
      'null', 'return', 'switch', 'throw', 'try', 'typeof', 'var', 'void',
      'while', 'with',
      'true', 'false', 'this'
    ));
    $this->add_identifier_mapping('FUNCTION', array('$', 'alert', 'confirm',
      'clearTimeout', 'clearInterval',
      'encodeURI', 'encodeURIComponent', 'eval', 'isFinite', 'isNaN',
      'parseInt', 'parseFloat', 'prompt',
      'setTimeout', 'setInterval',
      'decodeURI', 'decodeURIComponent', 'jQuery'));

    $this->add_identifier_mapping('TYPE', array('Array', 'Boolean', 'Date',
      'Error', 'EvalError', 'Infinity', 'Image', 'Math', 'NaN', 'Number',
      'Object', 'Option', 'RangeError', 'ReferenceError', 'RegExp', 'String',
      'SyntaxError', 'TypeError', 'URIError',

      'document',
      'undefined', 'window'));
  }

  /**
   * Reports whether the scanner is currently in "operand position".
   *
   * Walks $this->tokens backwards, skipping entries whose name is null,
   * 'COMMENT' or 'COMMENT_SL', and inspects the first remaining one.
   *
   * @return bool true if that token is an 'OPERATOR' or 'OPENER', or if no
   *   such token exists (nothing significant has been recorded yet);
   *   false otherwise.
   */
  public function is_operand() {
    for ($i = count($this->tokens) - 1; $i >= 0; $i--) {
      $tok = $this->tokens[$i][0];
      if ($tok === null || $tok === 'COMMENT' || $tok === 'COMMENT_SL') {
        continue;
      }
      return ($tok === 'OPERATOR' || $tok === 'OPENER');
    }
    return true;
  }

  /**
   * Registers the scanning patterns and attaches the XML child scanner.
   *
   * When the script is embedded (in a server language and/or in HTML) the
   * operator pattern is built so that '<' is not consumed where it may begin
   * a server tag or a closing script tag.
   */
  public function init() {

    if ($this->embedded_server) {
      $this->add_pattern('STOP_SERVER', $this->server_tags);
    }
    if ($this->embedded_html) {
      $this->add_pattern('STOP_SCRIPT', '%</script>%');
    }

    $op_pattern = '[=!+*%\-&^|~:?\;,.>';
    if (!($this->embedded_server || $this->embedded_html)) {
      // standalone script: '<' is just another operator character
      $op_pattern .= '<]+';
    } else {
      // build an alternation with a < followed by a lookahead
      $op_pattern .= ']|<(?![';
      // XXX this covers <? and <% but not very well
      if ($this->embedded_server) {
        $op_pattern .= '?%';
      }
      if ($this->embedded_html) {
        $op_pattern .= '/';
      }
      $op_pattern .= '])'; // closes lookahead
      $op_pattern = "(?:$op_pattern)+";
    }
    $op_pattern = "@$op_pattern@";

    $this->add_pattern('IDENT', '/[a-zA-Z_$][_$\w]*/');
    // NOTE: slash is a special case, and </ may be a script close
    $this->add_pattern('OPERATOR', $op_pattern);
    // we care about openers for figuring out where regular expressions are
    $this->add_pattern('OPENER', '/[\[\{\(]+/');
    $this->add_pattern('CLOSER', '/[\]\}\)]+/');

    $this->add_pattern('NUMERIC', LuminousTokenPresets::$NUM_HEX);
    $this->add_pattern('NUMERIC', LuminousTokenPresets::$NUM_REAL);
    $this->add_pattern('SSTRING', LuminousTokenPresets::$SINGLE_STR_SL);
    $this->add_pattern('DSTRING', LuminousTokenPresets::$DOUBLE_STR_SL);
    $this->add_pattern('COMMENT', LuminousTokenPresets::$C_COMMENT_ML);
    $this->add_pattern('COMMENT_SL', LuminousTokenPresets::$C_COMMENT_SL);
    // special case: main() decides whether a slash is a regex or an operator
    $this->add_pattern('SLASH', '%/%');

    // Child scanner for XML literals. It shares this scanner's source string
    // and start position and inherits the server-embedding settings.
    $xml_scanner = new LuminousHTMLScanner($this->string());
    $xml_scanner->xml_literal = true;
    $xml_scanner->scripts = false;
    $xml_scanner->embedded_server = $this->embedded_server;
    if ($this->embedded_server) {
      $xml_scanner->server_tags = $this->server_tags;
    }
    $xml_scanner->init();
    $xml_scanner->pos($this->pos());
    $this->add_child_scanner('xml', $xml_scanner);
  }

  /**
   * Runs the named child scanner from the current position, records its
   * output as a single 'XML' token and moves this scanner to where the child
   * stopped. (Copied and pasted from the HTML scanner.)
   *
   * If the child stopped with its interrupt flag set, the language and the
   * new position are stored in $child_state; otherwise $child_state is
   * cleared.
   *
   * @param string $lang key of a previously registered child scanner
   */
  public function scan_child($lang) {
    assert (isset($this->child_scanners[$lang]));
    $scanner = $this->child_scanners[$lang];
    $scanner->pos($this->pos());
    $scanner->main();
    $this->record($scanner->tagged(), 'XML', true);
    $this->pos($scanner->pos());
    if ($scanner->interrupt) {
      $this->child_state = array($lang, $this->pos());
    } else {
      $this->child_state = null;
    }
  }

  /**
   * Main scanning loop: consumes input until end of string, or until a
   * server tag / closing script tag hands control back to the caller.
   */
  public function main() {
    $this->start();
    $this->interrupt = false;
    while (!$this->eos()) {
      $index = $this->pos();
      $tok = null;
      $m = null;

      if (!$this->clean_exit) {
        // the previous run did not exit cleanly: try to resume it
        try {
          $tok = $this->resume();
        } catch(Exception $e) {
          if (LUMINOUS_DEBUG) {
            throw $e;
          }
          // outside debug mode, give up on resuming and scan normally
          $this->clean_exit = true;
          continue;
        }
      }
      elseif ($this->child_state !== null && $this->child_state[1] < $this->pos()) {
        // a child scanner was interrupted earlier and the position has moved
        // past where it stopped: hand control back to it
        $this->scan_child($this->child_state[0]);
        continue;
      }
      elseif (($rule = $this->next_match()) !== null) {
        $tok = $rule[0];
        if ($rule[1] > $index) {
          // text between the old position and the match start is untagged
          $this->record(substr($this->string(), $index, $rule[1] - $index), null);
        }
      } else {
        // nothing else matches: the rest of the input is untagged
        $this->record(substr($this->string(), $index), null);
        $this->clean_exit = true;
        $this->interrupt = false;
        $this->terminate();
        break;
      }

      if ($tok === 'SLASH') {
        if ($this->is_operand()) {
          // operand position: the slash opens a regex literal, so rewind
          // and re-scan it with the full regex pattern
          $tok = 'REGEX';
          $this->unscan();
          assert($this->peek() === '/');
          $m = $this->scan($this->regex_regex);
          if ($m === null) {
            // not expected to happen; fall back to consuming everything
            assert(0);
            $m = $this->rest();
            $this->terminate();
          }
        } else {
          $tok = 'OPERATOR';
        }
      }
      elseif ($tok === 'OPERATOR' && $this->match() === '<') {
        if ($this->is_operand()) {
          // a lone '<' in operand position starts an XML literal
          $this->unscan();
          $this->scan_child('xml');
          continue;
        }
      }
      elseif ($tok === 'STOP_SERVER') {
        $this->interrupt = true;
        $this->unscan();
        break;
      }
      elseif ($tok === 'STOP_SCRIPT') {
        $this->unscan();
        break;
      }

      if ($m === null) {
        $m = $this->match();
      }

      if ($this->server_break($tok)) {
        break;
      }

      if ($tok === 'COMMENT_SL' && $this->script_break($tok)) {
        break;
      }
      assert($this->pos() > $index);

      $this->record($m, $tok, false);
    }
  }
}
|