// gcsx_tokenize.cpp
  1. /* GCSx
  2. ** TOKENIZE.CPP
  3. **
  4. ** Script tokenization (to feed to compiler)
  5. */
  6. /*****************************************************************************
  7. ** Copyright (C) 2003-2006 Janson
  8. **
  9. ** This program is free software; you can redistribute it and/or modify
  10. ** it under the terms of the GNU General Public License as published by
  11. ** the Free Software Foundation; either version 2 of the License, or
  12. ** (at your option) any later version.
  13. **
  14. ** This program is distributed in the hope that it will be useful,
  15. ** but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. ** GNU General Public License for more details.
  18. **
  19. ** You should have received a copy of the GNU General Public License
  20. ** along with this program; if not, write to the Free Software
  21. ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
  22. *****************************************************************************/
  23. #include "all.h"
  24. void Tokenizer::deallocRange(list<Tokenizer::Token>::iterator start, list<Tokenizer::Token>::iterator end) { start_func
  25. for (; start != end; ++start) {
  26. delete (*start).text;
  27. (*start).text = NULL;
  28. }
  29. }
// Maps strings to tokens- derived from tokenStrings.
// Shared by all Tokenizer instances; lazily allocated by initTokenLookups()
// and released by destroyGlobals().
map<string, int>* Tokenizer::tokenLookup = NULL;
  32. Tokenizer::Tokenizer(const list<string>* src) : cached(), bookmarks() { start_func
  33. initTokenLookups();
  34. source = src;
  35. row = source->begin();
  36. rowNum = 0;
  37. col = 0;
  38. atNewLine = 1;
  39. nextCloseBrace = 0;
  40. if (row != source->end()) rowLen = (*row).size();
  41. cacheRecord = 0;
  42. bookmarkNew = 0;
  43. cacheReplay = cached.end();
  44. nextBookmarkName = 1000;
  45. errorCount = 0;
  46. warningCount = 0;
  47. silent = 0;
  48. errRow = 0;
  49. errCol = 0;
  50. errBuffer = NULL;
  51. }
  52. Tokenizer::~Tokenizer() { start_func
  53. deallocRange(cached.begin(), cached.end());
  54. delete[] errBuffer;
  55. }
  56. int Tokenizer::getBookmarkName() { start_func
  57. return ++nextBookmarkName;
  58. }
  59. void Tokenizer::initTokenLookups() { start_func
  60. if (tokenLookup == NULL) {
  61. tokenLookup = new map<string, int>;
  62. int pos = 0;
  63. while (tokenStrings[pos].text) {
  64. string token = tokenStrings[pos].text;
  65. tokenLookup->operator[](token) = tokenStrings[pos].type;
  66. ++pos;
  67. }
  68. }
  69. }
  70. void Tokenizer::destroyGlobals() { start_func
  71. delete tokenLookup;
  72. tokenLookup = NULL;
  73. }
  74. int Tokenizer::atEOF() { start_func
  75. if (row == source->end()) return 1;
  76. return 0;
  77. }
  78. char Tokenizer::getCharacter() { start_func
  79. tokenizerAssert(row != source->end());
  80. if (col < rowLen) return (*row)[col];
  81. return '\0';
  82. }
  83. void Tokenizer::moveNext() { start_func
  84. tokenizerAssert(row != source->end());
  85. if (++col > rowLen) nextLine();
  86. }
  87. void Tokenizer::nextLine() { start_func
  88. tokenizerAssert(row != source->end());
  89. col = 0;
  90. ++row;
  91. ++rowNum;
  92. if (row != source->end()) rowLen = (*row).size();
  93. }
  94. string Tokenizer::grabUntil(const char* boundaries) throw_int { start_func
  95. tokenizerAssert(row != source->end());
  96. if (col >= rowLen) throw 1;
  97. string::size_type pos = (*row).find_first_of(boundaries, col);
  98. if (pos >= string::npos) throw 1;
  99. int prev = col;
  100. col = pos;
  101. return (*row).substr(prev, col - prev);
  102. }
  103. string Tokenizer::grabWhile(const char* charset) { start_func
  104. tokenizerAssert(row != source->end());
  105. string::size_type pos = (*row).find_first_not_of(charset, col);
  106. if (pos >= string::npos) pos = rowLen;
  107. int prev = col;
  108. col = pos;
  109. return (*row).substr(prev, col - prev);
  110. }
  111. string Tokenizer::grabRestOfLine() { start_func
  112. tokenizerAssert(row != source->end());
  113. if (col >= rowLen) return blankString;
  114. int prev = col;
  115. col = rowLen;
  116. return (*row).substr(prev, col - prev);
  117. }
// Produces the next token from the source, or replays one from the bookmark
// cache if a replay is in progress.
// @param type   Receives the TOKEN_* type.
// @param token  Receives the token text (lowercased for identifiers/keywords,
//               escape-processed for strings).
// @return 1 on success; 0 at end of file (type set to TOKEN_NONE).
// Side effects: errRow/errCol point at the returned token for error
// reporting; the token is appended to the cache when recording is active.
int Tokenizer::nextToken(int& type, string& token) { start_func
    // Replaying from the cache (set up by peekToken or bookmarkReturn)?
    if (cacheReplay != cached.end()) {
        errRow = (*cacheReplay).rowN;
        errCol = (*cacheReplay).colN;
        type = (*cacheReplay).type;
        token = *((*cacheReplay).text);
        ++cacheReplay;
        // If at end of cache and not recording, clear
        if ((!cacheRecord) && (cacheReplay == cached.end())) {
            deallocRange(cached.begin(), cached.end());
            cached.clear();
            // Replay pointer is already at end (list::clear leaves end() valid)
            tokenizerAssert(cacheReplay == cached.end());
        }
        return 1;
    }
    int debug = debugLevel() & DEBUG_TOKENIZE;
    // A } seen mid-statement was split into ENDLINE + }; deliver the
    // deferred } now (see the TOKEN_CLOSE_BRACE handling below).
    if (nextCloseBrace) {
        type = nextCloseBrace;
        token = "}";
        nextCloseBrace = 0;
    }
    else {
        do {
            // EOF?
            if (atEOF()) {
                if (debug) debugWrite(DEBUG_TOKENIZE, "Token: END OF FILE");
                token = blankString;
                type = TOKEN_NONE;
                return 0;
            }
            // Clear any whitespace
            grabWhile(WHITE_SPACE);
            errRow = rowNum;
            errCol = col;
            // Peek at next character to determine what sort of token to parse
            char tokenType = getCharacter();
            switch (tokenType) {
                case '\0':
                    // End of line
                    moveNext();
                    token = blankString;
                    type = TOKEN_ENDLINE;
                    // Scan forward to see if a { coming up (skipping blank
                    // space across any number of rows, without consuming)
                    {
                        int sCol = col;
                        list<string>::const_iterator sRow = row;
                        string::size_type pos;
                        for (;;) {
                            if (sRow == source->end()) break;
                            pos = (*sRow).find_first_not_of(WHITE_SPACE, sCol);
                            if (pos >= string::npos) {
                                sCol = 0;
                                ++sRow;
                                continue;
                            }
                            if ((*sRow)[pos] == '{') {
                                // { is coming up- force discard of endline token
                                // (atNewLine makes the do-while loop skip it)
                                atNewLine = 1;
                            }
                            break;
                        }
                    }
                    break;
                case '\'':
                    // Type string, e.g. 'name' (single-quoted, lowercased)
                    token = blankString;
                    moveNext();
                    try {
                        token += grabUntil("\'");
                        moveNext();
                    }
                    catch (int) {
                        outputError("No type-string terminator found on same line (missing ')");
                        // RESOLUTION: treat remainder of line as type string
                        token += grabRestOfLine();
                    }
                    toLower(token);
                    type = TOKEN_STRINGTYPE;
                    break;
                case '"':
                    // String literal with \n \r \t \\ \" escapes
                    token = blankString;
                    moveNext();
                    try {
                        for (;;) {
                            token += grabUntil("\"\\");
                            if (getCharacter() == '\\') {
                                // Escape sequences
                                moveNext();
                                switch (tokenType = getCharacter()) {
                                    case 'n':
                                        token += "\n";
                                        break;
                                    case 'r':
                                        token += "\r";
                                        break;
                                    case 't':
                                        token += "\t";
                                        break;
                                    default:
                                        outputWarning("Unrecognized escape sequence '\\%c' (to include a backslash in a string, use \\\\)", tokenType);
                                        // RESOLUTION: insert backslash and character verbatim
                                        token += "\\";
                                        // (fall through)
                                    case '\\':
                                    case '"':
                                        token += string(1, tokenType);
                                        break;
                                }
                                moveNext();
                            }
                            else {
                                // Closing quote- consume it and finish
                                moveNext();
                                break;
                            }
                        }
                    }
                    catch (int) {
                        outputError("No string terminator found on same line (missing \")");
                        // RESOLUTION: treat remainder of line as string
                        token += grabRestOfLine();
                    }
                    type = TOKEN_STRING;
                    break;
                case '#':
                    // Configuration? (only recognized at the start of a line)
                    if (atNewLine) {
                        // NOTE(review): the '#' itself is never consumed and is
                        // not in the grab charset, so this appears to return an
                        // empty token with col still on the '#' — verify against
                        // the compiler's TOKEN_CONFIG handling.
                        token = grabWhile("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789");
                        toLower(token);
                        type = TOKEN_CONFIG;
                        break;
                    }
                    // (otherwise, fall through to normal tokenization)
                default:
                    if (((tokenType >= 'a') && (tokenType <= 'z')) ||
                        ((tokenType >= 'A') && (tokenType <= 'Z')) ||
                        (tokenType == '_')) {
                        // Identifier / keyword / etc
                        token = grabWhile("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789");
                        toLower(token);
                        type = TOKEN_IDENTIFIER;
                        // Lookup token in map; otherwise an identifier
                        map<string, int>::iterator found = tokenLookup->find(token);
                        if (found != tokenLookup->end()) type = (*found).second;
                        // obj_ is always reserved, not identifier
                        if (type == TOKEN_IDENTIFIER)
                            if (token.substr(0, 4) == string("obj_"))
                                type = TOKEN_RESERVED;
                    }
                    else if ((tokenType >= '0') && (tokenType <= '9')) {
                        // Number
                        token = grabWhile("0123456789");
                        // Special case- 0x hexadecimal
                        if ((token.size() == 1) && (tokenType == '0') && (tolower(getCharacter()) == 'x')) {
                            token += "x";
                            moveNext();
                            string add = grabWhile("0123456789abcdefABCDEF");
                            token += add;
                            toLower(token);
                            if (add.size() == 0) {
                                outputError("Invalid hexadecimal constant '%s'", token.c_str());
                                // RESOLUTION: add a zero and continue
                                token += "0";
                            }
                            type = TOKEN_HEX;
                        }
                        // One decimal allowed
                        else if (getCharacter() == '.') {
                            token += ".";
                            moveNext();
                            string add = grabWhile("0123456789");
                            token += add;
                            if (add.size() == 0) {
                                outputWarning("Invalid decimal constant '%s' (digits must appear before and after decimal point)", token.c_str());
                                // RESOLUTION: add a zero and continue
                                token += "0";
                            }
                            type = TOKEN_DECIMAL;
                        }
                        else {
                            type = TOKEN_INTEGER;
                        }
                        // Check for invalid character sequence afterward
                        string mess = grabWhile("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789");
                        if (mess.size() != 0) {
                            outputError("Unrecognized character sequence '%s' at end of numeric constant (identifiers may not begin with a number)", mess.c_str());
                            // RESOLUTION: continue compiling, having discarded invalid characters
                        }
                    }
                    else {
                        // Operator / symbol / comment / etc- greedy longest match
                        map<string, int>::iterator found;
                        token = string(1, tokenType);
                        do {
                            // Add to token until it no longer matches a symbol we want
                            moveNext();
                            token += string(1, tokenType = getCharacter());
                            found = tokenLookup->find(token);
                        } while ((tokenType != '\0') && (found != tokenLookup->end()));
                        // The last character we added was not actually discarded, remove it
                        token = token.substr(0, token.size() - 1);
                        // Determine token type
                        found = tokenLookup->find(token);
                        if (found == tokenLookup->end()) type = TOKEN_UNKNOWN;
                        else type = (*found).second;
                        if (type == TOKEN_COMMENT_LINE) {
                            // Line comment: discard rest of row, emit ENDLINE
                            nextLine();
                            token = blankString;
                            type = TOKEN_ENDLINE;
                        }
                        else if (type == TOKEN_COMMENT_BLOCK) {
                            // Block comment: scan for */ across rows
                            for (;;) {
                                if (atEOF()) {
                                    outputWarning("No end-of-comment marker found (missing */ symbol)");
                                    // RESOLUTION: treat as end-of-file
                                    if (debug) debugWrite(DEBUG_TOKENIZE, "Token: END OF FILE");
                                    token = blankString;
                                    type = TOKEN_NONE;
                                    return 0;
                                }
                                tokenType = getCharacter();
                                moveNext();
                                if ((tokenType == '*') && (getCharacter() == '/')) {
                                    moveNext();
                                    break;
                                }
                            }
                        }
                    }
                    break;
            }
            // Loop if we get an endline right after a new line (skip blank lines/empty cmds)
            // Loop for comments also
        } while (((atNewLine) && (type == TOKEN_ENDLINE)) || (type == TOKEN_COMMENT_BLOCK));
    }
    // Turn into a new line if before a }
    // (the } itself is stashed in nextCloseBrace for the next call)
    if ((type == TOKEN_CLOSE_BRACE) && (!atNewLine)) {
        nextCloseBrace = TOKEN_CLOSE_BRACE;
        type = TOKEN_ENDLINE; // Will set atNewLine below
    }
    // At a new line for next time?
    // Hide newlines after a { or }
    if ((type == TOKEN_ENDLINE) || (type == TOKEN_CLOSE_BRACE) || (type == TOKEN_OPEN_BRACE)) atNewLine = 1;
    else atNewLine = 0;
    // Debug?
    if (debug) {
        if (type & TOKEN_KEYWORD) debugWrite(DEBUG_TOKENIZE, "Token: KEYWORD - %s", token.c_str());
        else if (type & TOKEN_OPERATOR) debugWrite(DEBUG_TOKENIZE, "Token: OPERATOR - %s", token.c_str());
        else debugWrite(DEBUG_TOKENIZE, "Token: %s - %s", debugText[type], token.c_str());
    }
    // Record for later replay if a bookmark is active
    if (cacheRecord) {
        Token recorded;
        recorded.type = type;
        recorded.text = new string(token);
        recorded.rowN = errRow;
        recorded.colN = errCol;
        cached.push_back(recorded);
        // Bookmarks stored while the cache was empty/at-end pointed at end();
        // retarget them at the token just recorded.
        if (bookmarkNew) {
            for (map<int, list<Token>::iterator>::iterator pos = bookmarks.begin(); pos != bookmarks.end(); ++pos) {
                if ((*pos).second == cached.end()) --(*pos).second;
            }
            bookmarkNew = 0;
        }
        // Replay pointer is already at end
        tokenizerAssert(cacheReplay == cached.end());
    }
    return 1;
}
  387. void Tokenizer::skipToken() { start_func
  388. if (cacheReplay != cached.end()) {
  389. ++cacheReplay;
  390. // If at end of cache and not recording, clear
  391. if ((!cacheRecord) && (cacheReplay == cached.end())) {
  392. deallocRange(cached.begin(), cached.end());
  393. cached.clear();
  394. // Replay pointer is already at end
  395. tokenizerAssert(cacheReplay == cached.end());
  396. }
  397. }
  398. else {
  399. int type;
  400. string token;
  401. nextToken(type, token);
  402. }
  403. }
// Returns the next token without consuming it: a following nextToken() or
// skipToken() will see the same token again.
// @param type   Receives the TOKEN_* type.
// @param token  Receives the token text.
// @return 1 on success; 0 at end of file.
int Tokenizer::peekToken(int& type, string& token) { start_func
    // Already replaying- report the token under the replay pointer without
    // advancing it.
    if (cacheReplay != cached.end()) {
        errRow = (*cacheReplay).rowN;
        errCol = (*cacheReplay).colN;
        type = (*cacheReplay).type;
        token = *((*cacheReplay).text);
        return 1;
    }
    if (nextToken(type, token)) {
        // Don't readd to cache if already recording
        // (nextToken just appended it itself in that case)
        if (!cacheRecord) {
            Token peeked;
            peeked.type = type;
            peeked.text = new string(token);
            peeked.rowN = errRow;
            peeked.colN = errCol;
            cached.push_back(peeked);
        }
        // Replay pointer is at end- move to next-to-last so the peeked token
        // is replayed on the next call
        tokenizerAssert(cacheReplay == cached.end());
        --cacheReplay;
        return 1;
    }
    return 0;
}
  429. void Tokenizer::bookmarkStore(int name) { start_func
  430. bookmarks[name] = cacheReplay;
  431. if (cacheReplay == cached.end()) bookmarkNew = 1;
  432. cacheRecord = 1;
  433. }
  434. void Tokenizer::bookmarkReturn(int name) { start_func
  435. tokenizerAssert(bookmarks.find(name) != bookmarks.end());
  436. // Return to start of cache
  437. cacheReplay = (*(bookmarks.find(name))).second;
  438. }
  439. void Tokenizer::bookmarkCancel(int name) { start_func
  440. if (bookmarks.find(name) != bookmarks.end()) {
  441. bookmarks.erase(name);
  442. if (bookmarks.empty()) {
  443. cacheRecord = 0;
  444. // Clear anything in cache prior to current replay position
  445. deallocRange(cached.begin(), cacheReplay);
  446. cached.erase(cached.begin(), cacheReplay);
  447. tokenizerAssert(cacheReplay == cached.begin());
  448. }
  449. }
  450. }
  451. #define ERROR_BUFFER_SIZE 1024
  452. void Tokenizer::outputError(const char* text, ...) { start_func
  453. va_list arglist;
  454. va_start(arglist, text);
  455. if (!silent) {
  456. if (!errBuffer) errBuffer = new char[ERROR_BUFFER_SIZE];
  457. vsnprintf(errBuffer, ERROR_BUFFER_SIZE, text, arglist);
  458. errBuffer[ERROR_BUFFER_SIZE - 1] = 0;
  459. // @TODO: Better output (debug window during gameplay; error window during editor)
  460. debugWrite("ERROR row %d col %d: %s", errRow + 1, errCol + 1, errBuffer);
  461. }
  462. ++errorCount;
  463. va_end(arglist);
  464. }
  465. void Tokenizer::outputWarning(const char* text, ...) { start_func
  466. va_list arglist;
  467. va_start(arglist, text);
  468. if (!silent) {
  469. if (!errBuffer) errBuffer = new char[ERROR_BUFFER_SIZE];
  470. vsnprintf(errBuffer, ERROR_BUFFER_SIZE, text, arglist);
  471. errBuffer[ERROR_BUFFER_SIZE - 1] = 0;
  472. // @TODO: Better output (debug window during gameplay; error window during editor)
  473. debugWrite("WARNING row %d col %d: %s", errRow + 1, errCol + 1, errBuffer);
  474. }
  475. ++warningCount;
  476. va_end(arglist);
  477. }
// Enables (nonzero) or disables (0) suppression of error/warning output.
// Counters still accumulate while silenced; see outputError/outputWarning.
void Tokenizer::silentErrors(int newSilent) { start_func
    silent = newSilent;
}
// Clears the error/warning counters and re-enables output.
void Tokenizer::resetErrors() { start_func
    errorCount = 0;
    warningCount = 0;
    silent = 0;
}