perl.php 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525
  1. <?php
  2. /*
  3. * Like ruby, I think it's impossible to fully tokenize Perl without
  4. * executing some of the code to disambiguate some symbols. As such, we're
  5. * going to settle for 'probably right' rather than 'definitely right'.
  6. *
  7. * TODO: I think this is mostly complete but it needs interpolation
  8. * highlighting in strings and heredoc, and a regex highlighting filter,
  9. * probably a stream filter
  10. */
  11. class LuminousPerlScanner extends LuminousSimpleScanner {
  12. // keeps track of heredocs we need to handle
  13. private $heredoc = null;
  14. // helper function:
  15. // consumes a string until the given delimiter (which may be balanced).
  16. // will handle nested balanced delimiters.
  17. // this is used as the general case for perl quote-operators like:
  18. // q/somestring/ q"somestring", q@somestring@, q[some[]string]
  19. // it can be called twice for s/someregex/somereplacement/
  20. // expects the initial opening delim to already have been consumed
  21. function consume_string($delimiter, $type) {
  22. $close = LuminousUtils::balance_delimiter($delimiter);
  23. $balanced = $close !== $delimiter;
  24. $patterns = array( '/(?<!\\\\)((?:\\\\\\\\)*)('
  25. . preg_quote($close, '/') . ')/');
  26. if ($balanced) {
  27. $patterns[] = '/(?<!\\\\)((?:\\\\\\\\)*)('
  28. . preg_quote($delimiter, '/') . ')/';
  29. }
  30. $stack = 1; // we're already inside the string
  31. $start = $this->pos();
  32. $close_delimiter_match = null;
  33. while($stack) {
  34. $next = $this->get_next($patterns);
  35. if ($next[0] === -1) {
  36. $this->terminate();
  37. $finish = $this->pos();
  38. break;
  39. }
  40. elseif($balanced && $next[1][2] === $delimiter) {
  41. $stack++;
  42. $finish = $next[0] + strlen($next[1][0]);
  43. }
  44. elseif($next[1][2] === $close) {
  45. $stack--;
  46. if (!$stack)
  47. $close_delimiter_match = $next[1][2];
  48. $finish = $next[0] + strlen($next[1][1]);
  49. }
  50. else assert(0);
  51. $this->pos($next[0] + strlen($next[1][0]));
  52. }
  53. $substr = substr($this->string(), $start, $finish-$start);
  54. // special case for qw, the string is not a 'STRING', it is actually
  55. // a whitespace separated list of strings. So we need to split it and
  56. // record them separately
  57. if ($type === 'SPLIT_STRING') {
  58. foreach(preg_split('/(\s+)/',
  59. $substr, -1, PREG_SPLIT_DELIM_CAPTURE) as $token) {
  60. if (preg_match('/^\s/', $token)) {
  61. $this->record($token, null);
  62. } else {
  63. $this->record($token, 'STRING');
  64. }
  65. }
  66. } else {
  67. $this->record($substr, $type);
  68. }
  69. if ($close_delimiter_match !== null) {
  70. $this->record($close_delimiter_match, 'DELIMITER');
  71. }
  72. }
  73. // Helper function: guesses whether or not a slash is a regex delimiter
  74. // by looking behind in the token stream.
  75. function is_delimiter() {
  76. for($i = count($this->tokens) - 1; $i >= 0; $i--) {
  77. $t = $this->tokens[$i];
  78. if ($t[0] === null || $t[0] === 'COMMENT') continue;
  79. elseif ($t[0] === 'OPENER' || $t[0] === 'OPERATOR') return true;
  80. elseif ($t[0] === 'IDENT') {
  81. switch($t[1]) {
  82. // named operators
  83. case 'lt':
  84. case 'gt':
  85. case 'le':
  86. case 'ge':
  87. case 'eq':
  88. case 'ne':
  89. case 'cmp':
  90. case 'and':
  91. case 'or':
  92. case 'xor':
  93. // other keywords/functions
  94. case 'if':
  95. case 'elsif':
  96. case 'while':
  97. case 'unless':
  98. case 'split':
  99. case 'print':
  100. return true;
  101. }
  102. }
  103. return false;
  104. }
  105. return true;
  106. }
  107. // override function for slashes, to disambiguate regexen from division
  108. // operators.
  109. function slash_override($matches) {
  110. $this->pos( $this->pos() + strlen($matches[0]) );
  111. // this can catch '//', which I THINK is an operator but I could be wrong.
  112. if (strlen($matches[0]) === 2 || !$this->is_delimiter()) {
  113. $this->record($matches[0], 'OPERATOR');
  114. } else {
  115. $this->record($matches[0], 'DELIMITER');
  116. $this->consume_string($matches[0], 'REGEX');
  117. if ($this->scan('/[cgimosx]+/')) {
  118. $this->record($this->match(), 'KEYWORD');
  119. }
  120. }
  121. }
  122. // override function for 'quote-like operators'
  123. // e.g. m"hello", m'hello', m/hello/, m(hello), m(he()l()o())
  124. function str_override($matches) {
  125. $this->pos( $this->pos() + strlen($matches[0]) );
  126. $this->record($matches[0], 'DELIMITER');
  127. $f = $matches[1];
  128. $type = 'STRING';
  129. if ($f === 'm' || $f === 'qr' || $f === 's' || $f === 'tr'
  130. || $f === 'y') $type = 'REGEX';
  131. elseif($f === 'qw') $type = 'SPLIT_STRING';
  132. $this->consume_string($matches[3], $type);
  133. if ($f === 's' || $f === 'tr' || $f === 'y') {
  134. // s/tr/y take two strings, e.g. s/something/somethingelse/, so we
  135. // have to consume the next delimiter (if it exists) and consume the
  136. // string, again.
  137. // if delims were balanced, there's a new delimiter right here, e.g.
  138. // s[something][somethingelse]
  139. $this->skip_whitespace();
  140. $balanced = LuminousUtils::balance_delimiter($matches[3]) !== $matches[3];
  141. if ($balanced) {
  142. $delim2 = $this->scan('/[^a-zA-Z0-9]/');
  143. if ($delim2 !== null) {
  144. $this->record($delim2, 'DELIMITER');
  145. $this->consume_string($delim2, 'STRING');
  146. }
  147. }
  148. // if they weren't balanced then the delimiter is the same, and has
  149. // already been consumed as the end-delim to the first pattern
  150. else {
  151. $this->consume_string($matches[3], 'STRING');
  152. }
  153. }
  154. if ($type === 'REGEX' && $this->scan('/[cgimosxpe]+/')) {
  155. $this->record($this->match(), 'KEYWORD');
  156. }
  157. }
  158. // this override handles the heredoc declaration, and makes a note of it
  159. // it adds a new token (a newline) which is overridden to invoke the real
  160. // heredoc handling. This is because in Perl, heredocs declarations need not
  161. // be the end of the line so we can't necessarily start heredocing straight
  162. // away.
  163. function heredoc_override($matches) {
  164. list($group, $op, $quote1, $delim, $quote2) = $matches;
  165. $this->record($op, 'OPERATOR');
  166. // Now, if $quote1 is '\', then $quote2 is empty. If quote2 is empty
  167. // but quote1 is not '\', this is not a heredoc.
  168. if ($quote1 === '\\' && $quote2 === '') {
  169. $this->record($quote1 . $delim, 'DELIMITER');
  170. } elseif($quote2 === '' && $quote1 !== '') {
  171. // this is the error case
  172. // shift to the end of the op and break
  173. $this->pos_shift(strlen($op));
  174. return;
  175. } else {
  176. $this->record($quote1 . $delim . $quote2, 'DELIMITER');
  177. }
  178. $this->pos_shift(strlen($group));
  179. // TODO. the quotes (matches[2] and matches[4]) are ignored for now, but
  180. // they mean something w.r.t interpolation.
  181. $this->heredoc = $delim;
  182. $this->add_pattern('HEREDOC_NL', "/\n/");
  183. $this->overrides['HEREDOC_NL'] = array($this, 'heredoc_real_override');
  184. }
  185. // this override handles the actual heredoc text
  186. function heredoc_real_override($matches) {
  187. $this->record($matches[0], null);
  188. $this->pos_shift(strlen($matches[0]));
  189. // don't need this anymore
  190. $this->remove_pattern('HEREDOC_NL');
  191. assert($this->heredoc !== null);
  192. $delim = preg_quote($this->heredoc);
  193. $substr = $this->scan_until('/^' . $delim . '\\b/m');
  194. if ($substr !== null) {
  195. $this->record($substr, 'HEREDOC');
  196. $delim_ = $this->scan('/' . $delim . '/');
  197. assert($delim !== null);
  198. $this->record($delim_, 'DELIMITER');
  199. } else {
  200. $this->record($this->rest(), 'HEREDOC');
  201. $this->terminate();
  202. }
  203. }
  204. // halts highlighting on __DATA__ and __END__
  205. function term_override($matches) {
  206. $this->record($matches[0], 'DELIMITER');
  207. $this->pos( $this->pos() + strlen($matches[0]) );
  208. $this->record($this->rest(), null);
  209. $this->terminate();
  210. }
  211. // pod cuts might be very long and trigger the backtrack limit, so
  212. // we do it the old fashioned way
  213. function pod_cut_override($matches) {
  214. $line = $this->scan('/^=.*/m');
  215. assert($line !== null);
  216. $term = '/^=cut$|\\z/m';
  217. $substr = $this->scan_until($term);
  218. assert($substr !== null);
  219. $end = $this->scan($term);
  220. assert($end !== null);
  221. $this->record($line . $substr . $end, 'DOCCOMMENT');
  222. }
  223. function init() {
  224. $this->add_pattern('COMMENT', '/#.*/');
  225. // pod/cut documentation
  226. $this->add_pattern('podcut', '/^=[a-zA-Z_]/m');
  227. $this->overrides['podcut'] = array($this, 'pod_cut_override');
  228. // variables
  229. $this->add_pattern('VARIABLE', '/[\\$%@][a-z_]\w*/i');
  230. // special variables http://www.kichwa.com/quik_ref/spec_variables.html
  231. $this->add_pattern('VARIABLE', '/\\$[\|%=\-~^\d&`\'+_\.\/\\\\,"#\\$\\?\\*O\\[\\];!@]/');
  232. // `backticks` (shell cmd)
  233. $this->add_pattern('CMD', '/`(?: [^`\\\\]++ | \\\\ . )*+ (?:`|$)/x');
  234. // straight strings
  235. $this->add_pattern('STRING', LuminousTokenPresets::$DOUBLE_STR);
  236. $this->add_pattern('STRING', LuminousTokenPresets::$SINGLE_STR);
  237. // terminators
  238. $this->add_pattern('TERM', '/__(?:DATA|END)__/');
  239. // heredoc (overriden)
  240. $this->add_pattern('HEREDOC', '/(<<)([\'"`\\\\]?)([a-zA-Z_]\w*)(\\2?)/');
  241. // operators, slash is a special case and is overridden
  242. $this->add_pattern('OPERATOR', '/[!%^&*\-=+;:|,\\.?<>~\\\\]+/');
  243. $this->add_pattern('SLASH', '%//?%');
  244. // we care about 'openers' for regex-vs-division disambiguatation
  245. $this->add_pattern('OPENER', '%[\[\{\(]+%x');
  246. $this->add_pattern('NUMERIC', LuminousTokenPresets::$NUM_HEX);
  247. $this->add_pattern('NUMERIC', LuminousTokenPresets::$NUM_REAL);
  248. // quote-like operators. we override these.
  249. // I got these out of the old luminous tree, I don't know how accurate
  250. // or complete they are.
  251. // According to psh, delimiters can be escaped?
  252. $this->add_pattern('DELIMETERS',
  253. '/(q[rqxw]?|m|s|tr|y)([\s]*)(\\\\?[^a-zA-Z0-9\s])/');
  254. $this->add_pattern('IDENT', '/[a-zA-Z_]\w*/');
  255. $this->overrides['DELIMETERS'] = array($this, 'str_override');
  256. $this->overrides['SLASH'] = array($this, 'slash_override');
  257. $this->overrides['HEREDOC'] = array($this, 'heredoc_override');
  258. $this->overrides['TERM'] = array($this, 'term_override');
  259. // map cmd to a 'function' and get rid of openers
  260. $this->rule_tag_map = array(
  261. 'CMD' => 'FUNCTION',
  262. 'OPENER' => null,
  263. );
  264. // this sort of borks with the strange regex delimiters
  265. $this->remove_filter('pcre');
  266. /************************************************************************/
  267. // data definition follows.
  268. // https://www.physiol.ox.ac.uk/Computing/Online_Documentation/Perl-5.8.6/index-functions-by-cat.html
  269. $this->add_identifier_mapping('KEYWORD', array( 'bless',
  270. 'caller', 'continue', 'dbmclose', 'dbmopen',
  271. 'defined',
  272. 'delete', 'die', 'do', 'dump', 'else', 'elsif',
  273. 'eval', 'exit', 'for', 'foreach', 'goto', 'import', 'if', 'last', 'local',
  274. 'my',
  275. 'next', 'no',
  276. 'our', 'package', 'prototype', 'redo', 'ref', 'reset',
  277. 'return', 'require', 'scalar', 'sub', 'tie', 'tied',
  278. 'undef',
  279. 'utie',
  280. 'unless', 'use', 'wantarray', 'while'));
  281. $this->add_identifier_mapping('OPERATOR', array('lt', 'gt', 'le',
  282. 'ge', 'eq', 'ne', 'cmp', 'and', 'or', 'xor'));
  283. $this->add_identifier_mapping('FUNCTION', array(
  284. 'chomp',
  285. 'chop',
  286. 'chr',
  287. 'crypt',
  288. 'hex',
  289. 'index',
  290. 'lc',
  291. 'lcfirst',
  292. 'length',
  293. 'oct',
  294. 'ord',
  295. 'pack',
  296. 'reverse',
  297. 'rindex',
  298. 'sprintf',
  299. 'substr',
  300. 'uc',
  301. 'ucfirst',
  302. 'pos',
  303. 'quotemeta',
  304. 'split',
  305. 'study',
  306. 'abs',
  307. 'atan2',
  308. 'cos',
  309. 'exp',
  310. 'hex',
  311. 'int',
  312. 'log',
  313. 'oct',
  314. 'rand',
  315. 'sin',
  316. 'sqrt',
  317. 'srand',
  318. 'pop',
  319. 'push',
  320. 'shift',
  321. 'splice',
  322. 'unshift',
  323. 'grep',
  324. 'join',
  325. 'map',
  326. 'reverse',
  327. 'sort',
  328. 'unpack',
  329. 'delete',
  330. 'each',
  331. 'exists',
  332. 'keys',
  333. 'values',
  334. 'binmode',
  335. 'close',
  336. 'closedir',
  337. 'dbmclose',
  338. 'dbmopen',
  339. 'die',
  340. 'eof',
  341. 'fileno',
  342. 'flock',
  343. 'format',
  344. 'getc',
  345. 'print',
  346. 'printf',
  347. 'read',
  348. 'readdir',
  349. 'readline',
  350. 'rewinddir',
  351. 'seek',
  352. 'seekdir',
  353. 'select',
  354. 'syscall',
  355. 'sysread',
  356. 'sysseek',
  357. 'syswrite',
  358. 'tell',
  359. 'telldir',
  360. 'truncate',
  361. 'warn',
  362. 'write',
  363. 'pack',
  364. 'read',
  365. 'syscall',
  366. 'sysread',
  367. 'sysseek',
  368. 'syswrite',
  369. 'unpack',
  370. 'vec',
  371. 'chdir',
  372. 'chmod',
  373. 'chown',
  374. 'chroot',
  375. 'fcntl',
  376. 'glob',
  377. 'ioctl',
  378. 'link',
  379. 'lstat',
  380. 'mkdir',
  381. 'open',
  382. 'opendir',
  383. 'readlink',
  384. 'rename',
  385. 'rmdir',
  386. 'stat',
  387. 'symlink',
  388. 'sysopen',
  389. 'umask',
  390. 'unlink',
  391. 'utime',
  392. 'alarm',
  393. 'exec',
  394. 'fork',
  395. 'getpgrp',
  396. 'getppid',
  397. 'getpriority',
  398. 'kill',
  399. 'pipe',
  400. 'qx/STRING/',
  401. 'readpipe',
  402. 'setpgrp',
  403. 'setpriority',
  404. 'sleep',
  405. 'system',
  406. 'times',
  407. 'wait',
  408. 'waitpid',
  409. 'accept',
  410. 'bind',
  411. 'connect',
  412. 'getpeername',
  413. 'getsockname',
  414. 'getsockopt',
  415. 'listen',
  416. 'recv',
  417. 'send',
  418. 'setsockopt',
  419. 'shutdown',
  420. 'socket',
  421. 'socketpair',
  422. 'msgctl',
  423. 'msgget',
  424. 'msgrcv',
  425. 'msgsnd',
  426. 'semctl',
  427. 'semget',
  428. 'semop',
  429. 'shmctl',
  430. 'shmget',
  431. 'shmread',
  432. 'shmwrite',
  433. 'endgrent',
  434. 'endhostent',
  435. 'endnetent',
  436. 'endpwent',
  437. 'getgrent',
  438. 'getgrgid',
  439. 'getgrnam',
  440. 'getlogin',
  441. 'getpwent',
  442. 'getpwnam',
  443. 'getpwuid',
  444. 'setgrent',
  445. 'setpwent',
  446. 'endprotoent',
  447. 'endservent',
  448. 'gethostbyaddr',
  449. 'gethostbyname',
  450. 'gethostent',
  451. 'getnetbyaddr',
  452. 'getnetbyname',
  453. 'getnetent',
  454. 'getprotobyname',
  455. 'getprotobynumber',
  456. 'getprotoent',
  457. 'getservbyname',
  458. 'getservbyport',
  459. 'getservent',
  460. 'sethostent',
  461. 'setnetent',
  462. 'setprotoent',
  463. 'setservent',
  464. 'gmtime',
  465. 'localtime',
  466. 'time',
  467. 'times'));
  468. }
  469. public static function guess_language($src, $info) {
  470. // check the shebang
  471. if (preg_match('/^#!.*\\bperl\\b/', $src)) return 1.0;
  472. $p = 0;
  473. if (preg_match('/\\$[a-zA-Z_]+/', $src)) $p += 0.02;
  474. if (preg_match('/@[a-zA-Z_]+/', $src)) $p += 0.02;
  475. if (preg_match('/%[a-zA-Z_]+/', $src)) $p += 0.02;
  476. if (preg_match('/\\bsub\s+\w+\s*\\{/', $src)) $p += 0.1;
  477. if (preg_match('/\\bmy\s+[$@%]/', $src)) $p += 0.05;
  478. // $x =~ s/
  479. if (preg_match('/\\$[a-zA-Z_]\w*\s+=~\s+s\W/', $src)) $p += 0.15;
  480. return $p;
  481. }
  482. }