parser.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561
  1. #include <stdio.h>
  2. #include "parser_internal.h"
  3. enum {
  4. _NONE = 0,
  5. #define X(a, ...) _##a,
  6. PARSER_STATE_LIST
  7. #undef X
  8. };
  9. static char* state_names[] = {
  10. [_NONE] = "NONE",
  11. #define X(a, ...) [_##a] = #a,
  12. PARSER_STATE_LIST
  13. #undef X
  14. };
  15. #define push_state(st) push_state_(ctx, _##st)
  16. static void push_state_(cp_ctx_t* ctx, int newID) {
  17. VEC_INC(&ctx->state_stack);
  18. stack_state_t* s = &VEC_TAIL(&ctx->state_stack);
  19. memset(s, 0, sizeof(*s));
  20. ctx->state = s;
  21. s->id = newID;
  22. // s->base = newID;
  23. }
  24. #define pop_state(...) pop_state_(ctx __VA_OPT__(,) __VA_ARGS__)
  25. static void pop_state_(cp_ctx_t* ctx) {
  26. VEC_FREE(&ctx->state->type_specs);
  27. VEC_FREE(&ctx->state->type_names);
  28. VEC_FREE(&ctx->state->idents);
  29. VEC_LEN(&ctx->state_stack)--;
  30. ctx->state = &VEC_TAIL(&ctx->state_stack);
  31. }
  32. int is_typespec_(cp_ctx_t* ctx, lexer_token_t* t);
  33. #define is_typespec(...) is_typespec_(ctx, __VA_ARGS__)
  34. void cp_ctx_init(cp_ctx_t* ctx) {
  35. #define X(a, b, ...) ctx->a = strint_(ctx->str_table, b);
  36. C_PARSER_STRING_CACHE_LIST
  37. #undef X
  38. }
  39. void ast_symbol_table_init(cp_ctx_t* ctx, ast_symbol_table_t* st) {
  40. HT_init(&st->seu, 64);
  41. HT_init(&st->fbt, 64);
  42. st->next = NULL;
  43. }
  44. ast_symbol_t* ast_symbol_table_add_fbt_symbol(ast_symbol_table_t* st, char* name) {
  45. ast_symbol_t* sym = calloc(1, sizeof(*sym));
  46. sym->name = name;
  47. // TODO: check for redef
  48. HT_set(&st->fbt, name, sym);
  49. return sym;
  50. }
  51. void ast_symbol_table_add_builtins(cp_ctx_t* ctx, ast_symbol_table_t* st) {
  52. #define X(a, ...) ast_symbol_table_add_fbt_symbol(st, ctx->_##a);
  53. BUILTIN_TYPE_LIST(X)
  54. #undef X
  55. }
  56. void ast_tu_init(cp_ctx_t* ctx, ast_tu_t* tu) {
  57. // HT_init(&tu->types, 1000);
  58. // fill the symbol table with builtin types
  59. tu->globals = calloc(1, sizeof(*tu->globals));
  60. ast_symbol_table_init(ctx, tu->globals);
  61. ast_symbol_table_add_builtins(ctx, tu->globals);
  62. }
  63. #define dexit(...) \
  64. do {\
  65. printf("%s:%d \n", __FILE__, __LINE__); \
  66. exit(1); \
  67. } while(0);
  68. static lexer_token_t* peek_token(cp_ctx_t* ctx, long* tn) {
  69. for(; *tn < VEC_LEN(&ctx->tokens->tokens); (*tn)++) {
  70. lexer_token_t* n = VEC_ITEM(&ctx->tokens->tokens, *tn);
  71. if(n->type == LEXER_TOK_SPACE || n->type == LEXER_TOK_COMMENT) {
  72. continue;
  73. }
  74. return n;
  75. }
  76. return NULL;
  77. }
  78. int parser_probe(cp_ctx_t* ctx, long* tn) {
  79. // with one annoying and stupid exception, we can determine what sort of code is next based on a few simple heuristics
  80. int ident_cnt = 0;
  81. int paren_depth = 0;
  82. int typename_cnt = 0;
  83. int star_cnt = 0;
  84. int typespec_cnt = 0;
  85. for(long i = *tn; i < VEC_LEN(&ctx->tokens->tokens); i++) {
  86. lexer_token_t* n = VEC_ITEM(&ctx->tokens->tokens, i);
  87. if(n->type == LEXER_TOK_SPACE || n->type == LEXER_TOK_COMMENT) {
  88. continue;
  89. }
  90. if(n->text == ctx->_typedef) return 't';
  91. // these could be var dec, function proto, or function def
  92. if(n->text == ctx->_long) { typespec_cnt++; continue; }
  93. if(n->text == ctx->_short) { typespec_cnt++; continue; }
  94. if(n->text == ctx->_unsigned) { typespec_cnt++; continue; }
  95. if(n->text == ctx->_signed) { typespec_cnt++; continue; }
  96. if(n->text == ctx->_struct) { typespec_cnt++; continue; }
  97. if(n->text == ctx->_union) { typespec_cnt++; continue; }
  98. if(n->text == ctx->_enum) { typespec_cnt++; continue; }
  99. #define X(a, ...) if(n->text == ctx->_##a) { typename_cnt++; continue; }
  100. BUILTIN_TYPE_LIST
  101. #undef X
  102. if(n->text == ctx->_star) { star_cnt++; continue; }
  103. // these are definitive signs of a variable declaration or function dec/def
  104. if(n->text == ctx->_inline) return 'f';
  105. if(n->text == ctx->_register) return 'v';
  106. if(n->text == ctx->_const) return 'v';
  107. if(n->text == ctx->_extern) return 'v';
  108. if(n->text == ctx->_volatile) return 'v';
  109. if(n->text == ctx->_static) return 'v';
  110. if(n->text == ctx->_auto) return 'v';
  111. if(n->text == ctx->_eq) return 'i'; // initialization or assignment
  112. // TODO: other assignment operators
  113. if(n->text == ctx->_lparen) return 'f'; // function call or declaration, or fucking void-cast
  114. if(n->text == ctx->_comma) return 'f'; // multiple declarations
  115. // brackets
  116. }
  117. }
  118. ast_var_def_t* parse_declaration(cp_ctx_t* ctx, long* tn) {
  119. int num_typespecs = eat_typespecs(ctx, tn);
  120. printf("got %d typespecs\n", num_typespecs);
  121. ast_type_t* type = cp_ctx_process_type(ctx);
  122. for(; *tn < VEC_LEN(&ctx->tokens->tokens); (*tn)++) {
  123. lexer_token_t* n = VEC_ITEM(&ctx->tokens->tokens, *tn);
  124. // could be a wild semicolon
  125. // could be a typedef
  126. // could be a bare var declaration
  127. if(n->text == ctx->_semi) {
  128. // TODO: check state for cached tokens
  129. ast_var_def_t* vd = calloc(1, sizeof(*vd));
  130. vd->type = type;
  131. if(VEC_LEN(&ctx->state->idents) == 0) {
  132. printf("missing identifier in variable declaration.");
  133. }
  134. else if(VEC_LEN(&ctx->state->idents) > 1) {
  135. printf("too many identifiers in variable declaration.");
  136. }
  137. else {
  138. ast_var_def_t* vd = calloc(1, sizeof(*vd));
  139. vd->type = type;
  140. vd->name = VEC_ITEM(&ctx->state->idents, 0);
  141. printf("got var def %s\n", vd->name->text);
  142. }
  143. cp_ctx_destroy_state(ctx->state);
  144. break;
  145. }
  146. // could be an initialization
  147. if(n->text == ctx->_eq) {
  148. // cp_ctx_process_type(ctx);
  149. cp_ctx_destroy_state(ctx->state);
  150. // TODO parse initialization
  151. break;
  152. }
  153. // could be a function
  154. if(n->text == ctx->_lparen) { // no K&R bullshit here. get fucked.
  155. cp_ctx_process_type(ctx);
  156. // TODO parse initialization
  157. break;
  158. }
  159. if(n->type == LEXER_TOK_IDENT) {
  160. VEC_PUSH(&ctx->state->idents, n);
  161. // TODO: check for commas
  162. // check function declaration
  163. }
  164. }
  165. return NULL;
  166. }
  167. ast_struct_def_t* parse_structdef(cp_ctx_t* ctx, long* tn) {
  168. ast_struct_def_t* stdef = calloc(1, sizeof(*stdef));
  169. for(; *tn < VEC_LEN(&ctx->tokens->tokens); (*tn)++) {
  170. lexer_token_t* n = VEC_ITEM(&ctx->tokens->tokens, *tn);
  171. if(n->type == LEXER_TOK_SPACE || n->type == LEXER_TOK_COMMENT) {
  172. continue;
  173. }
  174. if(n->text == ctx->_rbrace) {
  175. printf("finished struct definition\n");
  176. break;
  177. }
  178. VEC_PUSH(&stdef->members, parse_declaration(ctx, tn));
  179. }
  180. return stdef;
  181. }
  182. int eat_typespecs(cp_ctx_t* ctx, long* tn) {
  183. int found = 0;
  184. ast_symbol_t* sym;
  185. for(; *tn < VEC_LEN(&ctx->tokens->tokens); (*tn)++) {
  186. lexer_token_t* n = VEC_ITEM(&ctx->tokens->tokens, *tn);
  187. if(n->type == LEXER_TOK_SPACE || n->type == LEXER_TOK_COMMENT) {
  188. continue;
  189. }
  190. if(n->text == ctx->_star) {
  191. ctx->state->stars++;
  192. continue;
  193. }
  194. // could be a struct, union, or enum
  195. if(n->text == ctx->_struct) {
  196. printf("got struct\n");
  197. long tn2 = *tn + 1;
  198. lexer_token_t* n1 = peek_token(ctx, &tn2);
  199. if(!n1) { printf("unexpected EOF\n"); break; }
  200. lexer_token_t* name = NULL;
  201. if(n1->type == LEXER_TOK_IDENT) {
  202. // TODO validate name
  203. name = n1;
  204. tn2++;
  205. n1 = peek_token(ctx, &tn2);
  206. printf("got struct name %s\n", name->text);
  207. }
  208. // a struct keyword must be followed by an identifier, an lbrace, or both
  209. if(n1->text == ctx->_lbrace) { // inline struct definition
  210. // parse struct definition
  211. printf("got struct brace\n");
  212. push_state(NONE);
  213. ast_struct_def_t* stdef = parse_structdef(ctx, &tn2);
  214. VEC_PUSH(&ctx->state->type_names, ((ast_typename_t){.type = 's', .name = name, .stdef = stdef}));
  215. pop_state();
  216. *tn = tn2;
  217. }
  218. else {
  219. if(name) {
  220. printf("struct keyword with only name\n");
  221. // just the name
  222. }
  223. else {
  224. printf("dangling struct keyword\n");
  225. }
  226. }
  227. continue;
  228. }
  229. if(n->text == ctx->_union) {
  230. // parse_uniondef(ctx, tn);
  231. continue;
  232. }
  233. if(n->text == ctx->_enum) {
  234. // parse_enumdef(ctx, tn);
  235. continue;
  236. }
  237. if(n->type == LEXER_TOK_IDENT) {
  238. if(is_typespec(n)) {
  239. if(n->text == ctx->_typedef) ctx->state->has_typedef = 1;
  240. printf("typespec\n");
  241. VEC_PUSH(&ctx->state->type_specs, n);
  242. }
  243. else if(sym = lookup_typename(ctx->tu, n)) {
  244. printf("typename\n");
  245. VEC_PUSH(&ctx->state->type_names, ((ast_typename_t){.type = 'b', .name = n, .stdef = NULL}));
  246. }
  247. else {
  248. // printf("ident\n");
  249. // VEC_PUSH(&ctx->state->idents, n);
  250. (*tn)--;
  251. break;
  252. }
  253. }
  254. found++;
  255. }
  256. return found;
  257. }
  258. int parse_statement(cp_ctx_t* ctx, long* tn) {
  259. for(; *tn < VEC_LEN(&ctx->tokens->tokens); (*tn)++) {
  260. lexer_token_t* n = VEC_ITEM(&ctx->tokens->tokens, *tn);
  261. }
  262. }
  263. int pasrse_tu_root(cp_ctx_t* ctx, long* tn) {
  264. ast_symbol_t* sym;
  265. for(; *tn < VEC_LEN(&ctx->tokens->tokens); (*tn)++) {
  266. lexer_token_t* n = VEC_ITEM(&ctx->tokens->tokens, *tn);
  267. // could be a declaration
  268. ast_var_def_t* vd = parse_declaration(ctx, tn);
  269. if(vd) {
  270. continue;
  271. }
  272. // could be a function definition
  273. if(n->text == ctx->_semi) {
  274. // TODO: check state for cached tokens
  275. cp_ctx_process_type(ctx);
  276. cp_ctx_destroy_state(ctx->state);
  277. break;
  278. }
  279. // could be an initialization
  280. if(n->text == ctx->_eq) {
  281. // cp_ctx_process_typedec(ctx);
  282. cp_ctx_destroy_state(ctx->state);
  283. // TODO parse initialization9
  284. break;
  285. }
  286. // could be a function pointer declaration
  287. if(n->text == ctx->_lparen) {
  288. // cp_ctx_process_typedec(ctx);
  289. break;
  290. }
  291. // function or variable declaration
  292. if(n->type == LEXER_TOK_IDENT) {
  293. VEC_PUSH(&ctx->state->idents, n);
  294. // TODO: check for commas
  295. // check function declaration
  296. // no K&R bullshit here. get fucked.
  297. }
  298. }
  299. return 0;
  300. }
  301. void c_parser_tu(cpp_tu_t* cpp_tu, ast_tu_t* tu) {
  302. memset(tu, 0, sizeof(*tu));
  303. tu->cpp = cpp_tu;
  304. #define in_state(x) (_##x == ctx->state->id)
  305. cp_ctx_t* ctx = calloc(1, sizeof(*ctx));
  306. ctx->str_table = tu->cpp->str_table;
  307. ctx->tu = tu;
  308. cp_ctx_init(ctx);
  309. ctx->tokens = tu->cpp->root_ctx->out;
  310. ast_tu_init(ctx, tu);
  311. ast_symbol_t* sym;
  312. #define X(a, b, ...) char* a = ctx->a;
  313. C_PARSER_STRING_CACHE_LIST
  314. #undef X
  315. push_state(NONE);
  316. long tn = 0;
  317. while(tn < VEC_LEN(&ctx->tokens->tokens)) {
  318. pasrse_tu_root(ctx, &tn);
  319. }
  320. /*
  321. int redo = 0; // for debugging
  322. VEC_EACH(&tu->cpp->root_ctx->out->tokens, ni, n) {
  323. RESTART_REAL:
  324. sym = 1; // just in case; segfault if used
  325. if(n->type == LEXER_TOK_SPACE || n->type == LEXER_TOK_COMMENT) {
  326. continue;
  327. }
  328. printf("%c%ld : %s : parser loop [%s] %s:%d:%d\n",
  329. redo ? ' ' : '>',
  330. VEC_LEN(&ctx->state_stack), state_names[ctx->state->id],
  331. n->type == LEXER_TOK_SPACE ? " " : n->text,
  332. n->file->name, n->start_line, n->start_col
  333. );
  334. redo = 0;
  335. }
  336. */
  337. return;
  338. }
  339. ast_symbol_t* lookup_typename(ast_tu_t* tu, lexer_token_t* t) {
  340. ast_symbol_t* sym = NULL;
  341. //
  342. // HT_EACH(&tu->globals->fbt, name, ast_symbol_t*, s) {
  343. // printf(" type: %s\n", name);
  344. // }
  345. //
  346. HT_get(&tu->globals->fbt, t->text, &sym);
  347. return sym;
  348. }
  349. int is_typespec_(cp_ctx_t* ctx, lexer_token_t* t) {
  350. #define X(a, ...) if(ctx->_##a == t->text) return 1;
  351. PARSER_TYPESPEC_LIST(X)
  352. #undef X
  353. return 0;
  354. }