tokenize.c 23 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037
  1. /*
  2. * This is a really stupid C tokenizer. It doesn't do any include
  3. * files or anything complex at all. That's the preprocessor.
  4. *
  5. * Copyright (C) 2003 Transmeta Corp.
  6. * 2003 Linus Torvalds
  7. *
  8. * Permission is hereby granted, free of charge, to any person obtaining a copy
  9. * of this software and associated documentation files (the "Software"), to deal
  10. * in the Software without restriction, including without limitation the rights
  11. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12. * copies of the Software, and to permit persons to whom the Software is
  13. * furnished to do so, subject to the following conditions:
  14. *
  15. * The above copyright notice and this permission notice shall be included in
  16. * all copies or substantial portions of the Software.
  17. *
  18. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  24. * THE SOFTWARE.
  25. */
  26. #include <stdio.h>
  27. #include <stdlib.h>
  28. #include <stdarg.h>
  29. #include <stddef.h>
  30. #include <string.h>
  31. #include <ctype.h>
  32. #include <unistd.h>
  33. #include <stdint.h>
  34. #include "lib.h"
  35. #include "allocate.h"
  36. #include "token.h"
  37. #include "symbol.h"
  /* Value the character readers in this file return at end of input. */
  #define EOF (-1)

  /* Number of input_streams[] slots handed out so far by init_stream(). */
  int input_stream_nr = 0;

  /* Growable array of stream descriptors; resized inside init_stream(). */
  struct stream *input_streams;

  /* Capacity of input_streams[], counted in elements. */
  static int input_streams_allocated;

  /* Tab width used by nextchar_slow() when advancing the column over '\t'. */
  unsigned int tabstop = 8;

  /* Size of the read buffer; tokenize() keeps one of these on its stack. */
  #define BUFSIZE (8192)

  /*
   * Scanner state for one input while it is being tokenized.
   */
  typedef struct {
      int fd;                     /* descriptor to read() from; negative means no refills */
      int offset;                 /* index of the next unread byte in buffer[] */
      int size;                   /* number of valid bytes in buffer[] */
      int pos;                    /* current column */
      int line;                   /* current line number */
      int nr;                     /* stream index stored into token positions */
      int newline;                /* flag copied into the position of the next token */
      int whitespace;             /* flag copied into the position of the next token */
      struct token **tokenlist;   /* where add_token() links the next finished token */
      struct token *token;        /* token currently being built, or NULL */
      unsigned char *buffer;      /* raw input bytes */
  } stream_t;
  52. const char *stream_name(int stream)
  53. {
  54. if (stream < 0 || stream > input_stream_nr)
  55. return "<bad stream>";
  56. return input_streams[stream].name;
  57. }
  58. int stream_prev(int stream)
  59. {
  60. if (stream < 0 || stream > input_stream_nr)
  61. return -1;
  62. stream = input_streams[stream].pos.stream;
  63. if (stream > input_stream_nr)
  64. return -1;
  65. return stream;
  66. }
  67. static struct position stream_pos(stream_t *stream)
  68. {
  69. struct position pos;
  70. pos.type = 0;
  71. pos.stream = stream->nr;
  72. pos.newline = stream->newline;
  73. pos.whitespace = stream->whitespace;
  74. pos.pos = stream->pos;
  75. pos.line = stream->line;
  76. pos.noexpand = 0;
  77. return pos;
  78. }
  79. const char *show_special(int val)
  80. {
  81. static char buffer[4];
  82. buffer[0] = val;
  83. buffer[1] = 0;
  84. if (val >= SPECIAL_BASE)
  85. strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
  86. return buffer;
  87. }
  88. const char *show_ident(const struct ident *ident)
  89. {
  90. static char buff[4][256];
  91. static int n;
  92. char *buffer;
  93. if (!ident)
  94. return "<noident>";
  95. buffer = buff[3 & ++n];
  96. sprintf(buffer, "%.*s", ident->len, ident->name);
  97. return buffer;
  98. }
  /*
   * Append a source-style rendering of character `c` at `ptr` and return
   * the position just past what was written (no NUL is stored for the
   * printable, newline and tab cases).
   *
   *  - printable characters are copied, preceded by a backslash when
   *    they equal `escape` or are a backslash themselves;
   *  - newline and tab become \n and \t;
   *  - everything else becomes a backslash followed by octal digits,
   *    padded to three digits when `next` (the character that will
   *    follow in the output) is itself a digit.
   */
  static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
  {
      if (!isprint(c)) {
          *ptr++ = '\\';
          if (c == '\n') {
              *ptr++ = 'n';
              return ptr;
          }
          if (c == '\t') {
              *ptr++ = 't';
              return ptr;
          }
          /* A following digit would otherwise read as part of the escape. */
          if (isdigit(next))
              return ptr + sprintf(ptr, "%03o", c);
          return ptr + sprintf(ptr, "%o", c);
      }

      if (c == '\\' || c == escape)
          *ptr++ = '\\';
      *ptr++ = c;
      return ptr;
  }
  120. const char *show_string(const struct string *string)
  121. {
  122. static char buffer[4 * MAX_STRING + 3];
  123. char *ptr;
  124. int i;
  125. if (!string || !string->length)
  126. return "<bad_string>";
  127. ptr = buffer;
  128. *ptr++ = '"';
  129. for (i = 0; i < string->length-1; i++) {
  130. const char *p = string->data + i;
  131. ptr = charstr(ptr, p[0], '"', p[1]);
  132. }
  133. *ptr++ = '"';
  134. *ptr = '\0';
  135. return buffer;
  136. }
  137. static const char *show_char(const char *s, size_t len, char prefix, char delim)
  138. {
  139. static char buffer[MAX_STRING + 4];
  140. char *p = buffer;
  141. if (prefix)
  142. *p++ = prefix;
  143. *p++ = delim;
  144. memcpy(p, s, len);
  145. p += len;
  146. *p++ = delim;
  147. *p++ = '\0';
  148. return buffer;
  149. }
  150. static const char *quote_char(const char *s, size_t len, char prefix, char delim)
  151. {
  152. static char buffer[2*MAX_STRING + 6];
  153. size_t i;
  154. char *p = buffer;
  155. if (prefix)
  156. *p++ = prefix;
  157. if (delim == '"')
  158. *p++ = '\\';
  159. *p++ = delim;
  160. for (i = 0; i < len; i++) {
  161. if (s[i] == '"' || s[i] == '\\')
  162. *p++ = '\\';
  163. *p++ = s[i];
  164. }
  165. if (delim == '"')
  166. *p++ = '\\';
  167. *p++ = delim;
  168. *p++ = '\0';
  169. return buffer;
  170. }
  171. const char *show_token(const struct token *token)
  172. {
  173. static char buffer[256];
  174. if (!token)
  175. return "<no token>";
  176. switch (token_type(token)) {
  177. case TOKEN_ERROR:
  178. return "syntax error";
  179. case TOKEN_EOF:
  180. return "end-of-input";
  181. case TOKEN_IDENT:
  182. return show_ident(token->ident);
  183. case TOKEN_NUMBER:
  184. return token->number;
  185. case TOKEN_SPECIAL:
  186. return show_special(token->special);
  187. case TOKEN_CHAR:
  188. return show_char(token->string->data,
  189. token->string->length - 1, 0, '\'');
  190. case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
  191. return show_char(token->embedded,
  192. token_type(token) - TOKEN_CHAR, 0, '\'');
  193. case TOKEN_WIDE_CHAR:
  194. return show_char(token->string->data,
  195. token->string->length - 1, 'L', '\'');
  196. case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
  197. return show_char(token->embedded,
  198. token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
  199. case TOKEN_STRING:
  200. return show_char(token->string->data,
  201. token->string->length - 1, 0, '"');
  202. case TOKEN_WIDE_STRING:
  203. return show_char(token->string->data,
  204. token->string->length - 1, 'L', '"');
  205. case TOKEN_STREAMBEGIN:
  206. sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
  207. return buffer;
  208. case TOKEN_STREAMEND:
  209. sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
  210. return buffer;
  211. case TOKEN_UNTAINT:
  212. sprintf(buffer, "<untaint>");
  213. return buffer;
  214. case TOKEN_ARG_COUNT:
  215. sprintf(buffer, "<argcnt>");
  216. return buffer;
  217. default:
  218. sprintf(buffer, "unhandled token type '%d' ", token_type(token));
  219. return buffer;
  220. }
  221. }
  222. const char *quote_token(const struct token *token)
  223. {
  224. static char buffer[256];
  225. switch (token_type(token)) {
  226. case TOKEN_ERROR:
  227. return "syntax error";
  228. case TOKEN_IDENT:
  229. return show_ident(token->ident);
  230. case TOKEN_NUMBER:
  231. return token->number;
  232. case TOKEN_SPECIAL:
  233. return show_special(token->special);
  234. case TOKEN_CHAR:
  235. return quote_char(token->string->data,
  236. token->string->length - 1, 0, '\'');
  237. case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
  238. return quote_char(token->embedded,
  239. token_type(token) - TOKEN_CHAR, 0, '\'');
  240. case TOKEN_WIDE_CHAR:
  241. return quote_char(token->string->data,
  242. token->string->length - 1, 'L', '\'');
  243. case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
  244. return quote_char(token->embedded,
  245. token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
  246. case TOKEN_STRING:
  247. return quote_char(token->string->data,
  248. token->string->length - 1, 0, '"');
  249. case TOKEN_WIDE_STRING:
  250. return quote_char(token->string->data,
  251. token->string->length - 1, 'L', '"');
  252. default:
  253. sprintf(buffer, "unhandled token type '%d' ", token_type(token));
  254. return buffer;
  255. }
  256. }
  /* The stream-name hash table has 2^HASHED_INPUT_BITS buckets. */
  #define HASHED_INPUT_BITS (6)
  #define HASHED_INPUT (1 << HASHED_INPUT_BITS)
  /* Multiplier applied to the accumulated hash before the top bits are taken. */
  #define HASH_PRIME 0x9e370001UL

  /* One stream index per bucket; every bucket starts out as -1. */
  static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };

  /*
   * Hash the NUL-terminated `name` and return a pointer to its bucket in
   * input_stream_hashes[].  init_stream() reads the old bucket value and
   * stores the new stream index through this pointer.
   */
  int *hash_stream(const char *name)
  {
      const unsigned char *p = (const unsigned char *) name;
      uint32_t h = 0;

      for (; *p != 0; p++)
          h = (h + (*p << 4) + (*p >> 4)) * 11;

      h *= HASH_PRIME;
      /* Keep only the top HASHED_INPUT_BITS bits of the 32-bit value. */
      return &input_stream_hashes[h >> (32 - HASHED_INPUT_BITS)];
  }
  271. int init_stream(const struct position *pos, const char *name, int fd, const char **next_path)
  272. {
  273. int stream = input_stream_nr, *hash;
  274. struct stream *current;
  275. if (stream >= input_streams_allocated) {
  276. int newalloc = stream * 4 / 3 + 10;
  277. input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
  278. if (!input_streams)
  279. die("Unable to allocate more streams space");
  280. input_streams_allocated = newalloc;
  281. }
  282. current = input_streams + stream;
  283. memset(current, 0, sizeof(*current));
  284. current->name = name;
  285. current->fd = fd;
  286. current->next_path = next_path;
  287. current->path = NULL;
  288. current->constant = CONSTANT_FILE_MAYBE;
  289. if (pos)
  290. current->pos = *pos;
  291. else
  292. current->pos.stream = -1;
  293. input_stream_nr = stream+1;
  294. hash = hash_stream(name);
  295. current->next_stream = *hash;
  296. *hash = stream;
  297. return stream;
  298. }
  299. static struct token * alloc_token(stream_t *stream)
  300. {
  301. struct token *token = __alloc_token(0);
  302. token->pos = stream_pos(stream);
  303. return token;
  304. }
  305. /*
  306. * Argh... That was surprisingly messy - handling '\r' complicates the
  307. * things a _lot_.
  308. */
  309. static int nextchar_slow(stream_t *stream)
  310. {
  311. int offset = stream->offset;
  312. int size = stream->size;
  313. int c;
  314. int spliced = 0, had_cr, had_backslash;
  315. restart:
  316. had_cr = had_backslash = 0;
  317. repeat:
  318. if (offset >= size) {
  319. if (stream->fd < 0)
  320. goto got_eof;
  321. size = read(stream->fd, stream->buffer, BUFSIZE);
  322. if (size <= 0)
  323. goto got_eof;
  324. stream->size = size;
  325. stream->offset = offset = 0;
  326. }
  327. c = stream->buffer[offset++];
  328. if (had_cr)
  329. goto check_lf;
  330. if (c == '\r') {
  331. had_cr = 1;
  332. goto repeat;
  333. }
  334. norm:
  335. if (!had_backslash) {
  336. switch (c) {
  337. case '\t':
  338. stream->pos += tabstop - stream->pos % tabstop;
  339. break;
  340. case '\n':
  341. stream->line++;
  342. stream->pos = 0;
  343. stream->newline = 1;
  344. break;
  345. case '\\':
  346. had_backslash = 1;
  347. stream->pos++;
  348. goto repeat;
  349. default:
  350. stream->pos++;
  351. }
  352. } else {
  353. if (c == '\n') {
  354. stream->line++;
  355. stream->pos = 0;
  356. spliced = 1;
  357. goto restart;
  358. }
  359. offset--;
  360. c = '\\';
  361. }
  362. out:
  363. stream->offset = offset;
  364. return c;
  365. check_lf:
  366. if (c != '\n')
  367. offset--;
  368. c = '\n';
  369. goto norm;
  370. got_eof:
  371. if (had_backslash) {
  372. c = '\\';
  373. goto out;
  374. }
  375. if (stream->pos & Wnewline_eof)
  376. warning(stream_pos(stream), "no newline at end of file");
  377. else if (spliced)
  378. warning(stream_pos(stream), "backslash-newline at end of file");
  379. return EOF;
  380. }
  381. /*
  382. * We want that as light as possible while covering all normal cases.
  383. * Slow path (including the logics with line-splicing and EOF sanity
  384. * checks) is in nextchar_slow().
  385. */
  386. static inline int nextchar(stream_t *stream)
  387. {
  388. int offset = stream->offset;
  389. if (offset < stream->size) {
  390. int c = stream->buffer[offset++];
  391. static const char special[256] = {
  392. ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
  393. };
  394. if (!special[c]) {
  395. stream->offset = offset;
  396. stream->pos++;
  397. return c;
  398. }
  399. }
  400. return nextchar_slow(stream);
  401. }
  402. struct token eof_token_entry;
  403. static struct token *mark_eof(stream_t *stream)
  404. {
  405. struct token *end;
  406. end = alloc_token(stream);
  407. eof_token_entry.pos = end->pos;
  408. token_type(end) = TOKEN_STREAMEND;
  409. end->pos.newline = 1;
  410. eof_token_entry.next = &eof_token_entry;
  411. eof_token_entry.pos.newline = 1;
  412. end->next = &eof_token_entry;
  413. *stream->tokenlist = end;
  414. stream->tokenlist = NULL;
  415. return end;
  416. }
  417. static void add_token(stream_t *stream)
  418. {
  419. struct token *token = stream->token;
  420. stream->token = NULL;
  421. token->next = NULL;
  422. *stream->tokenlist = token;
  423. stream->tokenlist = &token->next;
  424. }
  425. static void drop_token(stream_t *stream)
  426. {
  427. stream->newline |= stream->token->pos.newline;
  428. stream->whitespace |= stream->token->pos.whitespace;
  429. stream->token = NULL;
  430. }
  /* Character-class bits stored in cclass[]. */
  enum {
      Letter      = 1 << 0,
      Digit       = 1 << 1,
      Hex         = 1 << 2,
      Exp         = 1 << 3,
      Dot         = 1 << 4,
      ValidSecond = 1 << 5,
      Quote       = 1 << 6,
  };

  /*
   * Class bits for every character, indexed by (character + 1) so that
   * EOF (-1) maps to slot 0, which carries no bits.
   */
  static const char cclass[257] = {
      /* decimal digits also count as hex digits */
      ['0' + 1 ... '9' + 1] = Digit | Hex,

      /* a-f / A-F are hex digits; e/E and p/P are additionally marked Exp */
      ['a' + 1 ... 'd' + 1] = Letter | Hex,
      ['A' + 1 ... 'D' + 1] = Letter | Hex,
      ['e' + 1] = Letter | Hex | Exp,     /* e<exp> */
      ['E' + 1] = Letter | Hex | Exp,     /* E<exp> */
      ['f' + 1] = Letter | Hex,
      ['F' + 1] = Letter | Hex,
      ['g' + 1 ... 'o' + 1] = Letter,
      ['G' + 1 ... 'O' + 1] = Letter,
      ['p' + 1] = Letter | Exp,           /* p<exp> */
      ['P' + 1] = Letter | Exp,           /* P<exp> */
      ['q' + 1 ... 'z' + 1] = Letter,
      ['Q' + 1 ... 'Z' + 1] = Letter,
      ['_' + 1] = Letter,

      /* characters get_one_special() accepts as second half of a pair */
      ['.' + 1] = Dot | ValidSecond,
      ['=' + 1] = ValidSecond,
      ['+' + 1] = ValidSecond,
      ['-' + 1] = ValidSecond,
      ['>' + 1] = ValidSecond,
      ['<' + 1] = ValidSecond,
      ['&' + 1] = ValidSecond,
      ['|' + 1] = ValidSecond,
      ['#' + 1] = ValidSecond,

      /* quote characters */
      ['\'' + 1] = Quote,
      ['"' + 1] = Quote,
  };
  467. /*
  468. * pp-number:
  469. * digit
  470. * . digit
  471. * pp-number digit
  472. * pp-number identifier-nodigit
  473. * pp-number e sign
  474. * pp-number E sign
  475. * pp-number p sign
  476. * pp-number P sign
  477. * pp-number .
  478. */
  479. static int get_one_number(int c, int next, stream_t *stream)
  480. {
  481. struct token *token;
  482. static char buffer[4095];
  483. char *p = buffer, *buffer_end = buffer + sizeof (buffer);
  484. *p++ = c;
  485. for (;;) {
  486. long class = cclass[next + 1];
  487. if (!(class & (Dot | Digit | Letter)))
  488. break;
  489. if (p != buffer_end)
  490. *p++ = next;
  491. next = nextchar(stream);
  492. if (class & Exp) {
  493. if (next == '-' || next == '+') {
  494. if (p != buffer_end)
  495. *p++ = next;
  496. next = nextchar(stream);
  497. }
  498. }
  499. }
  500. if (p == buffer_end) {
  501. sparse_error(stream_pos(stream), "number token exceeds %td characters",
  502. buffer_end - buffer);
  503. // Pretend we saw just "1".
  504. buffer[0] = '1';
  505. p = buffer + 1;
  506. }
  507. *p++ = 0;
  508. token = stream->token;
  509. token_type(token) = TOKEN_NUMBER;
  510. token->number = xmemdup(buffer, p - buffer);
  511. add_token(stream);
  512. return next;
  513. }
  514. static int eat_string(int next, stream_t *stream, enum token_type type)
  515. {
  516. static char buffer[MAX_STRING];
  517. struct string *string;
  518. struct token *token = stream->token;
  519. int len = 0;
  520. int escape;
  521. int want_hex = 0;
  522. char delim = type < TOKEN_STRING ? '\'' : '"';
  523. for (escape = 0; escape || next != delim; next = nextchar(stream)) {
  524. if (len < MAX_STRING)
  525. buffer[len] = next;
  526. len++;
  527. if (next == '\n') {
  528. warning(stream_pos(stream),
  529. "missing terminating %c character", delim);
  530. /* assume delimiter is lost */
  531. break;
  532. }
  533. if (next == EOF) {
  534. warning(stream_pos(stream),
  535. "End of file in middle of string");
  536. return next;
  537. }
  538. if (!escape) {
  539. if (want_hex && !(cclass[next + 1] & Hex))
  540. warning(stream_pos(stream),
  541. "\\x used with no following hex digits");
  542. want_hex = 0;
  543. escape = next == '\\';
  544. } else {
  545. escape = 0;
  546. want_hex = next == 'x';
  547. }
  548. }
  549. if (want_hex)
  550. warning(stream_pos(stream),
  551. "\\x used with no following hex digits");
  552. if (len > MAX_STRING) {
  553. warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
  554. len = MAX_STRING;
  555. }
  556. if (delim == '\'' && len && len <= 4) {
  557. token_type(token) = type + len;
  558. memset(buffer + len, '\0', 4 - len);
  559. memcpy(token->embedded, buffer, 4);
  560. } else {
  561. token_type(token) = type;
  562. string = __alloc_string(len+1);
  563. memcpy(string->data, buffer, len);
  564. string->data[len] = '\0';
  565. string->length = len+1;
  566. token->string = string;
  567. }
  568. /* Pass it on.. */
  569. token = stream->token;
  570. add_token(stream);
  571. return nextchar(stream);
  572. }
  573. static int drop_stream_eoln(stream_t *stream)
  574. {
  575. drop_token(stream);
  576. for (;;) {
  577. switch (nextchar(stream)) {
  578. case EOF:
  579. return EOF;
  580. case '\n':
  581. return nextchar(stream);
  582. }
  583. }
  584. }
  585. static int drop_stream_comment(stream_t *stream)
  586. {
  587. int newline;
  588. int next;
  589. drop_token(stream);
  590. newline = stream->newline;
  591. next = nextchar(stream);
  592. for (;;) {
  593. int curr = next;
  594. if (curr == EOF) {
  595. warning(stream_pos(stream), "End of file in the middle of a comment");
  596. return curr;
  597. }
  598. next = nextchar(stream);
  599. if (curr == '*' && next == '/')
  600. break;
  601. }
  602. stream->newline = newline;
  603. return nextchar(stream);
  604. }
  605. unsigned char combinations[][4] = COMBINATION_STRINGS;
  606. #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
  607. /* hash function for two-character punctuators - all give unique values */
  608. #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
  609. /*
  610. * note that we won't get false positives - special_hash(0,0) is 0 and
  611. * entry 0 is filled (by +=), so all the missing ones are OK.
  612. */
  613. static unsigned char hash_results[32][2] = {
  614. #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
  615. RES('+', '='), /* 00 */
  616. RES('/', '='), /* 01 */
  617. RES('^', '='), /* 05 */
  618. RES('&', '&'), /* 07 */
  619. RES('#', '#'), /* 08 */
  620. RES('<', '<'), /* 0a */
  621. RES('<', '='), /* 0c */
  622. RES('!', '='), /* 0e */
  623. RES('%', '='), /* 0f */
  624. RES('-', '-'), /* 10 */
  625. RES('-', '='), /* 11 */
  626. RES('-', '>'), /* 13 */
  627. RES('=', '='), /* 15 */
  628. RES('&', '='), /* 17 */
  629. RES('*', '='), /* 18 */
  630. RES('.', '.'), /* 1a */
  631. RES('+', '+'), /* 1b */
  632. RES('|', '='), /* 1c */
  633. RES('>', '='), /* 1d */
  634. RES('|', '|'), /* 1e */
  635. RES('>', '>') /* 1f */
  636. #undef RES
  637. };
  638. static int code[32] = {
  639. #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
  640. CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
  641. CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
  642. CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
  643. CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
  644. CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
  645. CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
  646. CODE('<', '=', SPECIAL_LTE), /* 0c */
  647. CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
  648. CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
  649. CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
  650. CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
  651. CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
  652. CODE('=', '=', SPECIAL_EQUAL), /* 15 */
  653. CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
  654. CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
  655. CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
  656. CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
  657. CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
  658. CODE('>', '=', SPECIAL_GTE), /* 1d */
  659. CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
  660. CODE('>', '>', SPECIAL_RIGHTSHIFT) /* 1f */
  661. #undef CODE
  662. };
  663. static int get_one_special(int c, stream_t *stream)
  664. {
  665. struct token *token;
  666. int next, value, i;
  667. next = nextchar(stream);
  668. /*
  669. * Check for numbers, strings, character constants, and comments
  670. */
  671. switch (c) {
  672. case '.':
  673. if (next >= '0' && next <= '9')
  674. return get_one_number(c, next, stream);
  675. break;
  676. case '"':
  677. return eat_string(next, stream, TOKEN_STRING);
  678. case '\'':
  679. return eat_string(next, stream, TOKEN_CHAR);
  680. case '/':
  681. if (next == '/')
  682. return drop_stream_eoln(stream);
  683. if (next == '*')
  684. return drop_stream_comment(stream);
  685. }
  686. /*
  687. * Check for combinations
  688. */
  689. value = c;
  690. if (cclass[next + 1] & ValidSecond) {
  691. i = special_hash(c, next);
  692. if (hash_results[i][0] == c && hash_results[i][1] == next) {
  693. value = code[i];
  694. next = nextchar(stream);
  695. if (value >= SPECIAL_LEFTSHIFT &&
  696. next == "==."[value - SPECIAL_LEFTSHIFT]) {
  697. value += 3;
  698. next = nextchar(stream);
  699. }
  700. }
  701. }
  702. /* Pass it on.. */
  703. token = stream->token;
  704. token_type(token) = TOKEN_SPECIAL;
  705. token->special = value;
  706. add_token(stream);
  707. return next;
  708. }
  /* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
  #define IDENT_HASH_BITS (13)
  #define IDENT_HASH_SIZE (1 << IDENT_HASH_BITS)
  #define IDENT_HASH_MASK (IDENT_HASH_SIZE - 1)

  /*
   * Incremental identifier hash: start from the first character, fold
   * in each further character, then reduce to a bucket index.
   */
  #define ident_hash_init(c) (c)
  #define ident_hash_add(oldhash, c) ((oldhash) * 11 + (c))
  #define ident_hash_end(hash) \
      ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

  /* Bucket heads; identifiers in a bucket are chained through ->next. */
  static struct ident *hash_table[IDENT_HASH_SIZE];

  /* Counters maintained by create_hashed_ident() and insert_hash(). */
  static int ident_hit;
  static int ident_miss;
  static int idents;
  717. void show_identifier_stats(void)
  718. {
  719. int i;
  720. int distribution[100];
  721. fprintf(stderr, "identifiers: %d hits, %d misses\n",
  722. ident_hit, ident_miss);
  723. for (i = 0; i < 100; i++)
  724. distribution[i] = 0;
  725. for (i = 0; i < IDENT_HASH_SIZE; i++) {
  726. struct ident * ident = hash_table[i];
  727. int count = 0;
  728. while (ident) {
  729. count++;
  730. ident = ident->next;
  731. }
  732. if (count > 99)
  733. count = 99;
  734. distribution[count]++;
  735. }
  736. for (i = 0; i < 100; i++) {
  737. if (distribution[i])
  738. fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
  739. }
  740. }
  741. static struct ident *alloc_ident(const char *name, int len)
  742. {
  743. struct ident *ident = __alloc_ident(len);
  744. ident->symbols = NULL;
  745. ident->len = len;
  746. ident->tainted = 0;
  747. memcpy(ident->name, name, len);
  748. return ident;
  749. }
  750. static struct ident * insert_hash(struct ident *ident, unsigned long hash)
  751. {
  752. ident->next = hash_table[hash];
  753. hash_table[hash] = ident;
  754. ident_miss++;
  755. return ident;
  756. }
  757. static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
  758. {
  759. struct ident *ident;
  760. struct ident **p;
  761. p = &hash_table[hash];
  762. while ((ident = *p) != NULL) {
  763. if (ident->len == (unsigned char) len) {
  764. if (strncmp(name, ident->name, len) != 0)
  765. goto next;
  766. ident_hit++;
  767. return ident;
  768. }
  769. next:
  770. //misses++;
  771. p = &ident->next;
  772. }
  773. ident = alloc_ident(name, len);
  774. *p = ident;
  775. ident->next = NULL;
  776. ident_miss++;
  777. idents++;
  778. return ident;
  779. }
  /*
   * Compute the hash_table[] bucket index for the `len` bytes at `name`,
   * using the same ident_hash_init/add/end steps that
   * get_one_identifier() applies character by character.
   *
   * A `len` of zero or less is hashed as if the name consisted of a
   * single zero seed, without reading from `name`.  (A countdown loop
   * starting from such a length would not stop at the end of the name
   * and would read far past it.)
   */
  static unsigned long hash_name(const char *name, int len)
  {
      const unsigned char *p = (const unsigned char *)name;
      unsigned long hash;
      int i;

      if (len <= 0)
          return ident_hash_end(ident_hash_init(0UL));

      hash = ident_hash_init(p[0]);
      for (i = 1; i < len; i++) {
          unsigned int ch = p[i];

          hash = ident_hash_add(hash, ch);
      }
      return ident_hash_end(hash);
  }
  791. struct ident *hash_ident(struct ident *ident)
  792. {
  793. return insert_hash(ident, hash_name(ident->name, ident->len));
  794. }
  /*
   * Return the identifier for the NUL-terminated `name`, creating and
   * hashing it if it is not in the table yet.
   */
  struct ident *built_in_ident(const char *name)
  {
      const int len = strlen(name);
      const unsigned long bucket = hash_name(name, len);

      return create_hashed_ident(name, len, bucket);
  }
  800. struct token *built_in_token(int stream, struct ident *ident)
  801. {
  802. struct token *token;
  803. token = __alloc_token(0);
  804. token->pos.stream = stream;
  805. token_type(token) = TOKEN_IDENT;
  806. token->ident = ident;
  807. return token;
  808. }
  809. static int get_one_identifier(int c, stream_t *stream)
  810. {
  811. struct token *token;
  812. struct ident *ident;
  813. unsigned long hash;
  814. char buf[256];
  815. int len = 1;
  816. int next;
  817. hash = ident_hash_init(c);
  818. buf[0] = c;
  819. for (;;) {
  820. next = nextchar(stream);
  821. if (!(cclass[next + 1] & (Letter | Digit)))
  822. break;
  823. if (len >= sizeof(buf))
  824. break;
  825. hash = ident_hash_add(hash, next);
  826. buf[len] = next;
  827. len++;
  828. };
  829. if (cclass[next + 1] & Quote) {
  830. if (len == 1 && buf[0] == 'L') {
  831. if (next == '\'')
  832. return eat_string(nextchar(stream), stream,
  833. TOKEN_WIDE_CHAR);
  834. else
  835. return eat_string(nextchar(stream), stream,
  836. TOKEN_WIDE_STRING);
  837. }
  838. }
  839. hash = ident_hash_end(hash);
  840. ident = create_hashed_ident(buf, len, hash);
  841. /* Pass it on.. */
  842. token = stream->token;
  843. token_type(token) = TOKEN_IDENT;
  844. token->ident = ident;
  845. add_token(stream);
  846. return next;
  847. }
  848. static int get_one_token(int c, stream_t *stream)
  849. {
  850. long class = cclass[c + 1];
  851. if (class & Digit)
  852. return get_one_number(c, nextchar(stream), stream);
  853. if (class & Letter)
  854. return get_one_identifier(c, stream);
  855. return get_one_special(c, stream);
  856. }
  857. static struct token *setup_stream(stream_t *stream, int idx, int fd,
  858. unsigned char *buf, unsigned int buf_size)
  859. {
  860. struct token *begin;
  861. stream->nr = idx;
  862. stream->line = 1;
  863. stream->newline = 1;
  864. stream->whitespace = 0;
  865. stream->pos = 0;
  866. stream->token = NULL;
  867. stream->fd = fd;
  868. stream->offset = 0;
  869. stream->size = buf_size;
  870. stream->buffer = buf;
  871. begin = alloc_token(stream);
  872. token_type(begin) = TOKEN_STREAMBEGIN;
  873. stream->tokenlist = &begin->next;
  874. return begin;
  875. }
  876. static struct token *tokenize_stream(stream_t *stream)
  877. {
  878. int c = nextchar(stream);
  879. while (c != EOF) {
  880. if (!isspace(c)) {
  881. struct token *token = alloc_token(stream);
  882. stream->token = token;
  883. stream->newline = 0;
  884. stream->whitespace = 0;
  885. c = get_one_token(c, stream);
  886. continue;
  887. }
  888. stream->whitespace = 1;
  889. c = nextchar(stream);
  890. }
  891. return mark_eof(stream);
  892. }
  893. struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
  894. {
  895. stream_t stream;
  896. struct token *begin;
  897. begin = setup_stream(&stream, 0, -1, buffer, size);
  898. *endtoken = tokenize_stream(&stream);
  899. return begin;
  900. }
  901. struct token * tokenize(const struct position *pos, const char *name, int fd, struct token *endtoken, const char **next_path)
  902. {
  903. struct token *begin, *end;
  904. stream_t stream;
  905. unsigned char buffer[BUFSIZE];
  906. int idx;
  907. idx = init_stream(pos, name, fd, next_path);
  908. if (idx < 0) {
  909. // info(endtoken->pos, "File %s is const", name);
  910. return endtoken;
  911. }
  912. begin = setup_stream(&stream, idx, fd, buffer, 0);
  913. end = tokenize_stream(&stream);
  914. if (endtoken)
  915. end->next = endtoken;
  916. return begin;
  917. }