| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525 |
- <?php
- /*
- * Like ruby, I think it's impossible to fully tokenize Perl without
- * executing some of the code to disambiguate some symbols. As such, we're
- * going to settle for 'probably right' rather than 'definitely right'.
- *
- * TODO: I think this is mostly complete but it needs interpolation
- * highlighting in strings and heredoc, and a regex highlighting filter,
- * probably a stream filter
- */
- class LuminousPerlScanner extends LuminousSimpleScanner {
- // keeps track of heredocs we need to handle
- private $heredoc = null;
-
- // helper function:
- // consumes a string until the given delimiter (which may be balanced).
- // will handle nested balanced delimiters.
- // this is used as the general case for perl quote-operators like:
- // q/somestring/ q"somestring", q@somestring@, q[some[]string]
- // it can be called twice for s/someregex/somereplacement/
- // expects the initial opening delim to already have been consumed
- function consume_string($delimiter, $type) {
- $close = LuminousUtils::balance_delimiter($delimiter);
- $balanced = $close !== $delimiter;
- $patterns = array( '/(?<!\\\\)((?:\\\\\\\\)*)('
- . preg_quote($close, '/') . ')/');
-
- if ($balanced) {
- $patterns[] = '/(?<!\\\\)((?:\\\\\\\\)*)('
- . preg_quote($delimiter, '/') . ')/';
- }
- $stack = 1; // we're already inside the string
- $start = $this->pos();
- $close_delimiter_match = null;
- while($stack) {
- $next = $this->get_next($patterns);
- if ($next[0] === -1) {
- $this->terminate();
- $finish = $this->pos();
- break;
- }
- elseif($balanced && $next[1][2] === $delimiter) {
- $stack++;
- $finish = $next[0] + strlen($next[1][0]);
- }
- elseif($next[1][2] === $close) {
- $stack--;
- if (!$stack)
- $close_delimiter_match = $next[1][2];
- $finish = $next[0] + strlen($next[1][1]);
- }
- else assert(0);
- $this->pos($next[0] + strlen($next[1][0]));
- }
- $substr = substr($this->string(), $start, $finish-$start);
- // special case for qw, the string is not a 'STRING', it is actually
- // a whitespace separated list of strings. So we need to split it and
- // record them separately
- if ($type === 'SPLIT_STRING') {
- foreach(preg_split('/(\s+)/',
- $substr, -1, PREG_SPLIT_DELIM_CAPTURE) as $token) {
- if (preg_match('/^\s/', $token)) {
- $this->record($token, null);
- } else {
- $this->record($token, 'STRING');
- }
- }
- } else {
- $this->record($substr, $type);
- }
- if ($close_delimiter_match !== null) {
- $this->record($close_delimiter_match, 'DELIMITER');
- }
- }
- // Helper function: guesses whether or not a slash is a regex delimiter
- // by looking behind in the token stream.
- function is_delimiter() {
- for($i = count($this->tokens) - 1; $i >= 0; $i--) {
- $t = $this->tokens[$i];
- if ($t[0] === null || $t[0] === 'COMMENT') continue;
- elseif ($t[0] === 'OPENER' || $t[0] === 'OPERATOR') return true;
- elseif ($t[0] === 'IDENT') {
- switch($t[1]) {
- // named operators
- case 'lt':
- case 'gt':
- case 'le':
- case 'ge':
- case 'eq':
- case 'ne':
- case 'cmp':
- case 'and':
- case 'or':
- case 'xor':
- // other keywords/functions
- case 'if':
- case 'elsif':
- case 'while':
- case 'unless':
- case 'split':
- case 'print':
- return true;
- }
- }
- return false;
- }
- return true;
- }
- // override function for slashes, to disambiguate regexen from division
- // operators.
- function slash_override($matches) {
- $this->pos( $this->pos() + strlen($matches[0]) );
- // this can catch '//', which I THINK is an operator but I could be wrong.
- if (strlen($matches[0]) === 2 || !$this->is_delimiter()) {
- $this->record($matches[0], 'OPERATOR');
- } else {
- $this->record($matches[0], 'DELIMITER');
- $this->consume_string($matches[0], 'REGEX');
- if ($this->scan('/[cgimosx]+/')) {
- $this->record($this->match(), 'KEYWORD');
- }
- }
- }
- // override function for 'quote-like operators'
- // e.g. m"hello", m'hello', m/hello/, m(hello), m(he()l()o())
- function str_override($matches) {
- $this->pos( $this->pos() + strlen($matches[0]) );
- $this->record($matches[0], 'DELIMITER');
-
- $f = $matches[1];
-
- $type = 'STRING';
- if ($f === 'm' || $f === 'qr' || $f === 's' || $f === 'tr'
- || $f === 'y') $type = 'REGEX';
- elseif($f === 'qw') $type = 'SPLIT_STRING';
- $this->consume_string($matches[3], $type);
- if ($f === 's' || $f === 'tr' || $f === 'y') {
- // s/tr/y take two strings, e.g. s/something/somethingelse/, so we
- // have to consume the next delimiter (if it exists) and consume the
- // string, again.
-
- // if delims were balanced, there's a new delimiter right here, e.g.
- // s[something][somethingelse]
- $this->skip_whitespace();
- $balanced = LuminousUtils::balance_delimiter($matches[3]) !== $matches[3];
- if ($balanced) {
- $delim2 = $this->scan('/[^a-zA-Z0-9]/');
- if ($delim2 !== null) {
- $this->record($delim2, 'DELIMITER');
- $this->consume_string($delim2, 'STRING');
- }
- }
- // if they weren't balanced then the delimiter is the same, and has
- // already been consumed as the end-delim to the first pattern
- else {
- $this->consume_string($matches[3], 'STRING');
- }
- }
- if ($type === 'REGEX' && $this->scan('/[cgimosxpe]+/')) {
- $this->record($this->match(), 'KEYWORD');
- }
- }
- // this override handles the heredoc declaration, and makes a note of it
- // it adds a new token (a newline) which is overridden to invoke the real
- // heredoc handling. This is because in Perl, heredocs declarations need not
- // be the end of the line so we can't necessarily start heredocing straight
- // away.
- function heredoc_override($matches) {
- list($group, $op, $quote1, $delim, $quote2) = $matches;
- $this->record($op, 'OPERATOR');
- // Now, if $quote1 is '\', then $quote2 is empty. If quote2 is empty
- // but quote1 is not '\', this is not a heredoc.
- if ($quote1 === '\\' && $quote2 === '') {
- $this->record($quote1 . $delim, 'DELIMITER');
- } elseif($quote2 === '' && $quote1 !== '') {
- // this is the error case
- // shift to the end of the op and break
- $this->pos_shift(strlen($op));
- return;
- } else {
- $this->record($quote1 . $delim . $quote2, 'DELIMITER');
- }
- $this->pos_shift(strlen($group));
- // TODO. the quotes (matches[2] and matches[4]) are ignored for now, but
- // they mean something w.r.t interpolation.
- $this->heredoc = $delim;
- $this->add_pattern('HEREDOC_NL', "/\n/");
- $this->overrides['HEREDOC_NL'] = array($this, 'heredoc_real_override');
- }
- // this override handles the actual heredoc text
- function heredoc_real_override($matches) {
- $this->record($matches[0], null);
- $this->pos_shift(strlen($matches[0]));
- // don't need this anymore
- $this->remove_pattern('HEREDOC_NL');
- assert($this->heredoc !== null);
- $delim = preg_quote($this->heredoc);
- $substr = $this->scan_until('/^' . $delim . '\\b/m');
- if ($substr !== null) {
- $this->record($substr, 'HEREDOC');
- $delim_ = $this->scan('/' . $delim . '/');
- assert($delim !== null);
- $this->record($delim_, 'DELIMITER');
- } else {
- $this->record($this->rest(), 'HEREDOC');
- $this->terminate();
- }
- }
- // halts highlighting on __DATA__ and __END__
- function term_override($matches) {
- $this->record($matches[0], 'DELIMITER');
- $this->pos( $this->pos() + strlen($matches[0]) );
- $this->record($this->rest(), null);
- $this->terminate();
- }
- // pod cuts might be very long and trigger the backtrack limit, so
- // we do it the old fashioned way
- function pod_cut_override($matches) {
- $line = $this->scan('/^=.*/m');
- assert($line !== null);
- $term = '/^=cut$|\\z/m';
- $substr = $this->scan_until($term);
- assert($substr !== null);
- $end = $this->scan($term);
- assert($end !== null);
- $this->record($line . $substr . $end, 'DOCCOMMENT');
- }
-
-
- function init() {
-
- $this->add_pattern('COMMENT', '/#.*/');
-
- // pod/cut documentation
- $this->add_pattern('podcut', '/^=[a-zA-Z_]/m');
- $this->overrides['podcut'] = array($this, 'pod_cut_override');
- // variables
- $this->add_pattern('VARIABLE', '/[\\$%@][a-z_]\w*/i');
- // special variables http://www.kichwa.com/quik_ref/spec_variables.html
- $this->add_pattern('VARIABLE', '/\\$[\|%=\-~^\d&`\'+_\.\/\\\\,"#\\$\\?\\*O\\[\\];!@]/');
- // `backticks` (shell cmd)
- $this->add_pattern('CMD', '/`(?: [^`\\\\]++ | \\\\ . )*+ (?:`|$)/x');
- // straight strings
- $this->add_pattern('STRING', LuminousTokenPresets::$DOUBLE_STR);
- $this->add_pattern('STRING', LuminousTokenPresets::$SINGLE_STR);
- // terminators
- $this->add_pattern('TERM', '/__(?:DATA|END)__/');
- // heredoc (overriden)
- $this->add_pattern('HEREDOC', '/(<<)([\'"`\\\\]?)([a-zA-Z_]\w*)(\\2?)/');
- // operators, slash is a special case and is overridden
- $this->add_pattern('OPERATOR', '/[!%^&*\-=+;:|,\\.?<>~\\\\]+/');
- $this->add_pattern('SLASH', '%//?%');
- // we care about 'openers' for regex-vs-division disambiguatation
- $this->add_pattern('OPENER', '%[\[\{\(]+%x');
-
- $this->add_pattern('NUMERIC', LuminousTokenPresets::$NUM_HEX);
- $this->add_pattern('NUMERIC', LuminousTokenPresets::$NUM_REAL);
- // quote-like operators. we override these.
- // I got these out of the old luminous tree, I don't know how accurate
- // or complete they are.
- // According to psh, delimiters can be escaped?
- $this->add_pattern('DELIMETERS',
- '/(q[rqxw]?|m|s|tr|y)([\s]*)(\\\\?[^a-zA-Z0-9\s])/');
- $this->add_pattern('IDENT', '/[a-zA-Z_]\w*/');
-
-
- $this->overrides['DELIMETERS'] = array($this, 'str_override');
- $this->overrides['SLASH'] = array($this, 'slash_override');
- $this->overrides['HEREDOC'] = array($this, 'heredoc_override');
- $this->overrides['TERM'] = array($this, 'term_override');
-
- // map cmd to a 'function' and get rid of openers
- $this->rule_tag_map = array(
- 'CMD' => 'FUNCTION',
- 'OPENER' => null,
- );
- // this sort of borks with the strange regex delimiters
- $this->remove_filter('pcre');
- /************************************************************************/
- // data definition follows.
- // https://www.physiol.ox.ac.uk/Computing/Online_Documentation/Perl-5.8.6/index-functions-by-cat.html
- $this->add_identifier_mapping('KEYWORD', array( 'bless',
- 'caller', 'continue', 'dbmclose', 'dbmopen',
- 'defined',
- 'delete', 'die', 'do', 'dump', 'else', 'elsif',
- 'eval', 'exit', 'for', 'foreach', 'goto', 'import', 'if', 'last', 'local',
- 'my',
- 'next', 'no',
- 'our', 'package', 'prototype', 'redo', 'ref', 'reset',
- 'return', 'require', 'scalar', 'sub', 'tie', 'tied',
- 'undef',
- 'utie',
- 'unless', 'use', 'wantarray', 'while'));
- $this->add_identifier_mapping('OPERATOR', array('lt', 'gt', 'le',
- 'ge', 'eq', 'ne', 'cmp', 'and', 'or', 'xor'));
- $this->add_identifier_mapping('FUNCTION', array(
- 'chomp',
- 'chop',
- 'chr',
- 'crypt',
- 'hex',
- 'index',
- 'lc',
- 'lcfirst',
- 'length',
- 'oct',
- 'ord',
- 'pack',
- 'reverse',
- 'rindex',
- 'sprintf',
- 'substr',
- 'uc',
- 'ucfirst',
- 'pos',
- 'quotemeta',
- 'split',
- 'study',
- 'abs',
- 'atan2',
- 'cos',
- 'exp',
- 'hex',
- 'int',
- 'log',
- 'oct',
- 'rand',
- 'sin',
- 'sqrt',
- 'srand',
- 'pop',
- 'push',
- 'shift',
- 'splice',
- 'unshift',
- 'grep',
- 'join',
- 'map',
- 'reverse',
- 'sort',
- 'unpack',
- 'delete',
- 'each',
- 'exists',
- 'keys',
- 'values',
- 'binmode',
- 'close',
- 'closedir',
- 'dbmclose',
- 'dbmopen',
- 'die',
- 'eof',
- 'fileno',
- 'flock',
- 'format',
- 'getc',
- 'print',
- 'printf',
- 'read',
- 'readdir',
- 'readline',
- 'rewinddir',
- 'seek',
- 'seekdir',
- 'select',
- 'syscall',
- 'sysread',
- 'sysseek',
- 'syswrite',
- 'tell',
- 'telldir',
- 'truncate',
- 'warn',
- 'write',
- 'pack',
- 'read',
- 'syscall',
- 'sysread',
- 'sysseek',
- 'syswrite',
- 'unpack',
- 'vec',
- 'chdir',
- 'chmod',
- 'chown',
- 'chroot',
- 'fcntl',
- 'glob',
- 'ioctl',
- 'link',
- 'lstat',
- 'mkdir',
- 'open',
- 'opendir',
- 'readlink',
- 'rename',
- 'rmdir',
- 'stat',
- 'symlink',
- 'sysopen',
- 'umask',
- 'unlink',
- 'utime',
- 'alarm',
- 'exec',
- 'fork',
- 'getpgrp',
- 'getppid',
- 'getpriority',
- 'kill',
- 'pipe',
- 'qx/STRING/',
- 'readpipe',
- 'setpgrp',
- 'setpriority',
- 'sleep',
- 'system',
- 'times',
- 'wait',
- 'waitpid',
- 'accept',
- 'bind',
- 'connect',
- 'getpeername',
- 'getsockname',
- 'getsockopt',
- 'listen',
- 'recv',
- 'send',
- 'setsockopt',
- 'shutdown',
- 'socket',
- 'socketpair',
- 'msgctl',
- 'msgget',
- 'msgrcv',
- 'msgsnd',
- 'semctl',
- 'semget',
- 'semop',
- 'shmctl',
- 'shmget',
- 'shmread',
- 'shmwrite',
- 'endgrent',
- 'endhostent',
- 'endnetent',
- 'endpwent',
- 'getgrent',
- 'getgrgid',
- 'getgrnam',
- 'getlogin',
- 'getpwent',
- 'getpwnam',
- 'getpwuid',
- 'setgrent',
- 'setpwent',
- 'endprotoent',
- 'endservent',
- 'gethostbyaddr',
- 'gethostbyname',
- 'gethostent',
- 'getnetbyaddr',
- 'getnetbyname',
- 'getnetent',
- 'getprotobyname',
- 'getprotobynumber',
- 'getprotoent',
- 'getservbyname',
- 'getservbyport',
- 'getservent',
- 'sethostent',
- 'setnetent',
- 'setprotoent',
- 'setservent',
- 'gmtime',
- 'localtime',
- 'time',
- 'times'));
-
- }
- public static function guess_language($src, $info) {
- // check the shebang
- if (preg_match('/^#!.*\\bperl\\b/', $src)) return 1.0;
- $p = 0;
- if (preg_match('/\\$[a-zA-Z_]+/', $src)) $p += 0.02;
- if (preg_match('/@[a-zA-Z_]+/', $src)) $p += 0.02;
- if (preg_match('/%[a-zA-Z_]+/', $src)) $p += 0.02;
- if (preg_match('/\\bsub\s+\w+\s*\\{/', $src)) $p += 0.1;
- if (preg_match('/\\bmy\s+[$@%]/', $src)) $p += 0.05;
- // $x =~ s/
- if (preg_match('/\\$[a-zA-Z_]\w*\s+=~\s+s\W/', $src)) $p += 0.15;
- return $p;
- }
- }
|