Lexer.php 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533
  1. <?php
  2. /**
  3. * PHP_LexerGenerator, a php 5 lexer generator.
  4. *
  5. * This lexer generator reads a file in a format similar to
  6. * re2c ({@link http://re2c.org}) and translates it into a PHP 5-based lexer
  7. *
  8. * PHP version 5
  9. *
  10. * LICENSE: This source file is subject to version 3.01 of the PHP license
  11. * that is available through the world-wide-web at the following URI:
  12. * http://www.php.net/license/3_01.txt. If you did not receive a copy of
  13. * the PHP License and are unable to obtain it through the web, please
  14. * send a note to license@php.net so we can mail you a copy immediately.
  15. *
  16. * @category php
  17. * @package PHP_LexerGenerator
  18. * @author Gregory Beaver <cellog@php.net>
  19. * @copyright 2006 Gregory Beaver
  20. * @license http://www.php.net/license/3_01.txt PHP License 3.01
  21. * @version CVS: $Id: Lexer.php 246683 2007-11-22 04:43:52Z instance $
  22. * @since File available since Release 0.1.0
  23. */
  24. require_once 'PHP/LexerGenerator/Parser.php';
  25. /**
  26. * Token scanner for plex files.
  27. *
  28. * This scanner detects comments beginning with "/*!lex2php" and
  29. * then returns their components (processing instructions, patterns, strings,
  30. * action code, and regexes)
  31. * @package PHP_LexerGenerator
  32. * @author Gregory Beaver <cellog@php.net>
  33. * @copyright 2006 Gregory Beaver
  34. * @license http://www.php.net/license/3_01.txt PHP License 3.01
  35. * @version @package_version@
  36. * @since Class available since Release 0.1.0
  37. */
  class PHP_LexerGenerator_Lexer
  {
      /**
       * Complete input, with "\r\n" line endings normalised to "\n"
       * @var string
       */
      private $data;
      /**
       * Byte offset into {@link $data} of the next unread character
       * @var int
       */
      private $N;
      /**
       * Name of the active scanning state.
       *
       * {@link advance()} dispatches to the private method whose name is
       * "lex" followed by this value, so every value assigned here must have
       * a matching lexXxx() method.
       * @var string
       */
      private $state;
      /**
       * Current line number in input
       * @var int
       */
      public $line;
      /**
       * Number of scanning errors detected
       * @var int
       */
      public $errors = 0;
      /**
       * integer identifier of the current token
       * @var int
       */
      public $token;
      /**
       * string content of current token
       * @var string
       */
      public $value;
      // Token identifiers, mirrored from the parser's constants of the same name.
      const CODE = PHP_LexerGenerator_Parser::CODE;
      const COMMENTEND = PHP_LexerGenerator_Parser::COMMENTEND;
      const COMMENTSTART = PHP_LexerGenerator_Parser::COMMENTSTART;
      const PATTERN = PHP_LexerGenerator_Parser::PATTERN;
      const PHPCODE = PHP_LexerGenerator_Parser::PHPCODE;
      const PI = PHP_LexerGenerator_Parser::PI;
      const QUOTE = PHP_LexerGenerator_Parser::QUOTE;
      const SINGLEQUOTE = PHP_LexerGenerator_Parser::SINGLEQUOTE;
      const SUBPATTERN = PHP_LexerGenerator_Parser::SUBPATTERN;

      /**
       * prepare scanning
       * @param string $data the input
       */
      public function __construct($data)
      {
          $this->data = str_replace("\r\n", "\n", $data);
          $this->N = 0;
          $this->line = 1;
          $this->state = 'Start';
          $this->errors = 0;
      }

      /**
       * Output an error message and count it in {@link $errors}
       * @param string $msg
       */
      private function error($msg)
      {
          echo 'Error on line ' . $this->line . ': ' . $msg;
          $this->errors++;
      }

      /**
       * Return the character at the current offset plus $offset without
       * consuming it.
       *
       * Reading past the end of the input yields an empty string instead of
       * an "uninitialized string offset" warning.
       * @param int $offset distance from the current position
       * @return string a single character, or '' when out of range
       */
      private function peek($offset = 0)
      {
          $index = $this->N + $offset;
          return isset($this->data[$index]) ? $this->data[$index] : '';
      }

      /**
       * Initial scanning state lexer
       * @return boolean
       */
      private function lexStart()
      {
          return $this->lexTopLevel('Declare');
      }

      /**
       * lexer for top-level scanning state after the initial declaration comment
       * @return boolean
       */
      private function lexStartNonDeclare()
      {
          return $this->lexTopLevel('Rule');
      }

      /**
       * Shared implementation of the two top-level states.
       *
       * Emits everything up to the next "/*!lex2php\n" marker as a PHPCODE
       * token; when positioned exactly on the marker, emits COMMENTSTART and
       * switches to $nextState.
       * @param string $nextState state to enter once the marker is consumed
       * @return boolean false only when the input is exhausted
       */
      private function lexTopLevel($nextState)
      {
          $marker = '/*!lex2php' . "\n";
          $length = strlen($this->data);
          if ($this->N >= $length) {
              return false;
          }
          $start = strpos($this->data, $marker, $this->N);
          if ($start === false) {
              // no further lexer comment: the remainder is plain PHP code
              $this->value = substr($this->data, $this->N);
              $this->N = $length;
              $this->token = self::PHPCODE;
              return true;
          }
          if ($start > $this->N) {
              $this->value = substr($this->data, $this->N, $start - $this->N);
              $this->N = $start;
              $this->token = self::PHPCODE;
              return true;
          }
          $this->value = $marker;
          $this->N += strlen($marker);
          $this->token = self::COMMENTSTART;
          $this->state = $nextState;
          return true;
      }

      /**
       * lexer for declaration comment state
       * @return boolean
       */
      private function lexDeclare()
      {
          while (true) {
              $this->skipWhitespaceEol();
              if ($this->peek() !== '/' || $this->peek(1) !== '/') {
                  break;
              }
              // Skip single-line comment; the newline itself is consumed (and
              // counted) by skipWhitespaceEol() on the next pass
              while ($this->N < strlen($this->data) && $this->data[$this->N] != "\n") {
                  ++$this->N;
              }
          }
          if ($this->peek() === '*' && $this->peek(1) === '/') {
              $this->state = 'StartNonDeclare';
              $this->value = '*/';
              $this->N += 2;
              $this->token = self::COMMENTEND;
              return true;
          }
          if (preg_match('/\G%([a-z]+)/', $this->data, $token, 0, $this->N)) {
              $this->value = $token[1];
              $this->N += strlen($token[1]) + 1; // + 1 for the leading "%"
              $this->state = 'DeclarePI';
              $this->token = self::PI;
              return true;
          }
          if (preg_match('/\G[a-zA-Z_][a-zA-Z0-9_]*/', $this->data, $token, 0, $this->N)) {
              $this->value = $token[0];
              $this->token = self::PATTERN;
              $this->N += strlen($token[0]);
              $this->state = 'DeclareEquals';
              return true;
          }
          // also reached at end of input, where none of the above can match
          $this->error('expecting declaration of sub-patterns');
          return false;
      }

      /**
       * lexer for processor instructions within declaration comment
       * @return boolean
       */
      private function lexDeclarePI()
      {
          return $this->lexInstructionArgument('Declare');
      }

      /**
       * lexer for processor instructions inside rule comments
       * @return boolean
       */
      private function lexDeclarePIRule()
      {
          return $this->lexInstructionArgument('Rule');
      }

      /**
       * Shared implementation of the two processing-instruction states.
       *
       * A newline ends the instruction and hands over to the lexer for
       * $lineState; "{" starts a code block; anything else up to the end of
       * the line is returned as a SUBPATTERN token.
       * @param string $lineState state to resume once the line has ended
       * @return boolean
       */
      private function lexInstructionArgument($lineState)
      {
          $this->skipWhitespace();
          $current = $this->peek();
          if ($current === "\n") {
              $this->N++;
              $this->state = $lineState;
              $this->line++;
              return $this->{'lex' . $lineState}();
          }
          if ($current === '{') {
              return $this->lexCode();
          }
          if (!preg_match("/\G[^\n]+/", $this->data, $token, 0, $this->N)) {
              $this->error('Unexpected end of file');
              return false;
          }
          $this->value = $token[0];
          $this->N += strlen($this->value);
          $this->token = self::SUBPATTERN;
          return true;
      }

      /**
       * lexer for the state representing scanning between a pattern and the "=" sign
       * @return boolean
       */
      private function lexDeclareEquals()
      {
          $this->skipWhitespace();
          if ($this->N >= strlen($this->data)) {
              $this->error('unexpected end of input, expecting "=" for sub-pattern declaration');
              // stop here: there is no character left to compare against "="
              return false;
          }
          if ($this->data[$this->N] != '=') {
              $this->error('expecting "=" for sub-pattern declaration');
              return false;
          }
          $this->N++;
          $this->state = 'DeclareRightside';
          $this->skipWhitespace();
          if ($this->N >= strlen($this->data)) {
              $this->error('unexpected end of file, expecting right side of sub-pattern declaration');
              return false;
          }
          return $this->lexDeclareRightside();
      }

      /**
       * lexer for the right side of a pattern, detects quotes or regexes
       *
       * NOTE(review): a quote that follows blanks is lexed as a regex
       * delimiter (SUBPATTERN) rather than as a quoted literal, because the
       * quote checks run before the blanks are skipped. This looks
       * unintended but is kept as-is so existing inputs produce the same
       * tokens; confirm the intent before changing it.
       * @return boolean
       */
      private function lexDeclareRightside()
      {
          $first = $this->peek();
          if ($first === "\n") {
              return $this->endDeclarationLine();
          }
          if ($first === '"') {
              return $this->lexQuote();
          }
          if ($first === '\'') {
              return $this->lexQuote('\'');
          }
          $this->skipWhitespace();
          if ($this->N >= strlen($this->data)) {
              $this->error('unexpected end of file, expecting right side of sub-pattern declaration');
              return false;
          }
          // match a pattern: the first character is the regex delimiter
          $test = $this->data[$this->N];
          if ($test === "\n") {
              // only trailing blanks were left on the line
              return $this->endDeclarationLine();
          }
          $token = $this->findClosingDelimiter($test);
          if ($token === false) {
              $this->error('Unterminated regex pattern (started with "' . $test . '"');
              return false;
          }
          if (substr_count($this->data, "\n", $this->N, $token - $this->N)) {
              $this->error('Regex pattern extends over multiple lines');
              return false;
          }
          $this->value = substr($this->data, $this->N + 1, $token - $this->N - 1);
          // unescape the regex marker
          // we will re-escape when creating the final regex
          $this->value = str_replace('\\' . $test, $test, $this->value);
          $this->N = $token + 1;
          $this->token = self::SUBPATTERN;
          return true;
      }

      /**
       * Consume the newline that ends a sub-pattern declaration and continue
       * with the next declaration.
       *
       * The state must be "Declare" (not "lexDeclare") because
       * {@link advance()} prepends "lex" itself.
       * @return boolean
       */
      private function endDeclarationLine()
      {
          $this->state = 'Declare';
          $this->N++;
          $this->line++;
          return $this->lexDeclare();
      }

      /**
       * Find the delimiter that closes the token opened at the current offset.
       *
       * An occurrence preceded by a single backslash counts as escaped and is
       * skipped; one preceded by two backslashes is accepted.
       * @param string $delimiter closing character to look for
       * @return int|false offset of the closing delimiter, false if none
       */
      private function findClosingDelimiter($delimiter)
      {
          $pos = $this->N;
          do {
              $pos = strpos($this->data, $delimiter, $pos + 1);
          } while ($pos !== false
              && $this->data[$pos - 1] === '\\'
              && ($pos < 2 || $this->data[$pos - 2] !== '\\'));
          return $pos;
      }

      /**
       * lexer for quoted literals
       * @param string $quote the quote character that opened the literal
       * @return boolean
       */
      private function lexQuote($quote = '"')
      {
          $token = $this->findClosingDelimiter($quote);
          if ($token === false) {
              $this->error('unterminated quote');
              return false;
          }
          if (substr_count($this->data, "\n", $this->N, $token - $this->N)) {
              $this->error('quote extends over multiple lines');
              return false;
          }
          $this->value = substr($this->data, $this->N + 1, $token - $this->N - 1);
          $this->value = str_replace('\\' . $quote, $quote, $this->value);
          $this->value = str_replace('\\\\', '\\', $this->value);
          $this->N = $token + 1;
          if ($quote == '\'') {
              $this->token = self::SINGLEQUOTE;
          } else {
              $this->token = self::QUOTE;
          }
          return true;
      }

      /**
       * lexer for rules
       * @return boolean
       */
      private function lexRule()
      {
          $length = strlen($this->data);
          // skip blanks, newlines and single-line comments before the token
          while ($this->N < $length) {
              $current = $this->data[$this->N];
              if ($current === '/' && $this->peek(1) === '/') {
                  $newline = strpos($this->data, "\n", $this->N);
                  if ($newline === false) {
                      // the comment runs to the end of the input
                      $this->N = $length;
                      break;
                  }
                  $this->N = $newline + 1;
                  $this->line++;
              } elseif ($current === "\n") {
                  $this->line++;
                  $this->N++;
              } elseif ($current === ' ' || $current === "\t") {
                  $this->N++;
              } else {
                  break;
              }
          }
          if ($this->N >= $length) {
              $this->error('unexpected end of input, expecting rule declaration');
              return false;
          }
          if ($this->peek() === '*' && $this->peek(1) === '/') {
              $this->state = 'StartNonDeclare';
              $this->value = '*/';
              $this->N += 2;
              $this->token = self::COMMENTEND;
              return true;
          }
          if ($this->data[$this->N] == '\'') {
              return $this->lexQuote('\'');
          }
          if (preg_match('/\G%([a-zA-Z_]+)/', $this->data, $token, 0, $this->N)) {
              $this->value = $token[1];
              $this->N += strlen($token[1]) + 1; // + 1 for the leading "%"
              $this->state = 'DeclarePIRule';
              $this->token = self::PI;
              return true;
          }
          if ($this->data[$this->N] == '{') {
              return $this->lexCode();
          }
          if ($this->data[$this->N] == '"') {
              return $this->lexQuote();
          }
          if (preg_match('/\G[a-zA-Z_][a-zA-Z0-9_]*/', $this->data, $token, 0, $this->N)) {
              $this->value = $token[0];
              $this->N += strlen($token[0]);
              $this->token = self::SUBPATTERN;
              return true;
          }
          $this->error('expecting token rule (quotes or sub-patterns)');
          return false;
      }

      /**
       * lexer for php code blocks
       *
       * Starts on the opening "{" and scans to its matching "}", ignoring
       * braces inside "//" comments and quoted strings. The token value is
       * the text between the two braces.
       * @return boolean
       */
      private function lexCode()
      {
          $length = strlen($this->data);
          $cp = $this->N + 1;
          for ($level = 1; $cp < $length && ($level > 1 || $this->data[$cp] != '}'); $cp++) {
              $char = $this->data[$cp];
              if ($char === '{') {
                  $level++;
              } elseif ($char === '}') {
                  $level--;
              } elseif ($char === '/' && $cp + 1 < $length && $this->data[$cp + 1] === '/') {
                  /* Skip C++ style comments */
                  $cp += 2;
                  $z = strpos($this->data, "\n", $cp);
                  if ($z === false) {
                      $cp = $length;
                      break;
                  }
                  $cp = $z;
              } elseif ($char === "'" || $char === '"') {
                  /* String and character literals */
                  $prevc = 0;
                  for ($cp++; $cp < $length && ($this->data[$cp] != $char || $prevc === '\\'); $cp++) {
                      // a backslash escapes exactly one following character
                      $prevc = ($prevc === '\\') ? 0 : $this->data[$cp];
                  }
              }
          }
          if ($cp >= $length) {
              // error() already increments the error counter
              $this->error("PHP code starting on this line is not terminated before the end of the file.");
              return false;
          }
          $this->value = substr($this->data, $this->N + 1, $cp - $this->N - 1);
          $this->token = self::CODE;
          $this->N = $cp + 1;
          return true;
      }

      /**
       * Skip whitespace characters (spaces and tabs only)
       */
      private function skipWhitespace()
      {
          $this->N += (int) strspn($this->data, " \t", $this->N);
      }

      /**
       * Skip whitespace and EOL characters, counting the lines passed
       */
      private function skipWhitespaceEol()
      {
          $run = (int) strspn($this->data, " \t\n", $this->N);
          if ($run > 0) {
              $this->line += substr_count($this->data, "\n", $this->N, $run);
              $this->N += $run;
          }
      }

      /**
       * Primary scanner
       *
       * In addition to lexing, this properly increments the line number of lexing.
       * This calls the proper sub-lexer based on the current scanning state.
       * On success {@link $token} and {@link $value} describe the new token.
       * @param mixed $parser not used by this method
       * @return boolean true if a token was produced, false at end of input
       *                 or after a scanning error
       */
      public function advance($parser)
      {
          if ($this->N >= strlen($this->data)) {
              return false;
          }
          if ($this->{'lex' . $this->state}()) {
              $this->line += substr_count($this->value, "\n");
              return true;
          }
          return false;
      }
  }
  516. ?>