diff --git a/btparser/btparser.vcxproj b/btparser/btparser.vcxproj index f8d4fef..e72d0c2 100644 --- a/btparser/btparser.vcxproj +++ b/btparser/btparser.vcxproj @@ -20,6 +20,7 @@ + @@ -28,6 +29,7 @@ + diff --git a/btparser/btparser.vcxproj.filters b/btparser/btparser.vcxproj.filters index b220af6..19271c0 100644 --- a/btparser/btparser.vcxproj.filters +++ b/btparser/btparser.vcxproj.filters @@ -24,6 +24,9 @@ Source Files + + Source Files + @@ -47,5 +50,8 @@ Header Files + + Header Files + \ No newline at end of file diff --git a/btparser/lexer.cpp b/btparser/lexer.cpp new file mode 100644 index 0000000..a0ba1ce --- /dev/null +++ b/btparser/lexer.cpp @@ -0,0 +1,499 @@ +#include "lexer.h" +#include "stringutils.h" +#include +#include "filehelper.h" + +#define MAKE_OP_TRIPLE(ch1, ch2, ch3) (ch3 << 16 | ch2 << 8 | ch1) +#define MAKE_OP_DOUBLE(ch1, ch2) (ch2 << 8 | ch1) +#define MAKE_OP_SINGLE(ch1) (ch1) +#define DEFAULT_STRING_BUFFER 65536 + +static void clearReserve(std::string & str, size_t reserve = DEFAULT_STRING_BUFFER) +{ + str.clear(); + str.reserve(reserve); +} + +static void appendCh(std::string & str, char ch) +{ + str.resize(str.size() + 1); + str[str.size() - 1] = ch; +} + +static const char* convertNumber(const char* str, uint64_t & result, int radix) +{ + errno = 0; + char* end; + result = strtoull(str, &end, radix); + if (!result && end == str) + return "not a number"; + if (result == ULLONG_MAX && errno) + return "does not fit"; + if (*end) + return "str not completely consumed"; + return nullptr; +} + +Lexer::Lexer() +{ + setupTokenMaps(); +} + +bool Lexer::ReadInputFile(const std::string & filename) +{ + resetLexerState(); + return FileHelper::ReadAllData(filename, mInput); +} + +bool Lexer::DoLexing(std::vector & tokens, std::string & error) +{ + while (true) + { + auto token = getToken(); + mState.Token = token; + if (token == tok_eof) + break; + if (token == tok_error) + { + error = StringUtils::sprintf("line %d, col %d: %s", mState.CurLine + 1, mState.LineIndex, mError.c_str()); + return false; + } + tokens.push_back(mState); + } + return true; +} + +bool Lexer::Test(const std::function & lexEnum, bool output) +{ + size_t line = 0; + if (output) + lexEnum("1: "); + Token tok; + std::string toks; + clearReserve(toks); + char newlineText[128] = ""; + do + { + tok = getToken(); + if (!output) + continue; + toks.clear(); + while (line < mState.CurLine) + { + line++; + sprintf_s(newlineText, "\n%d: ", line + 1); + toks.append(newlineText); + } + toks.append(tokString(tok)); + appendCh(toks, ' '); + lexEnum(toks); + } while (tok != tok_eof && tok != tok_error); + if (tok != tok_error && tok != tok_eof) + tok = reportError("lexer did not finish at the end of the file"); + for (const auto& warning : mWarnings) + if (output) + lexEnum("\nwarning: " + warning); + return tok != tok_error; +} + +Lexer::Token Lexer::getToken() +{ + //skip whitespace + while (isspace(mLastChar)) + { + if (mLastChar == '\n') + signalNewLine(); + nextChar(); + } + + //skip \\[\r\n] + if (mLastChar == '\\' && (peekChar() == '\r' || peekChar() == '\n')) + { + nextChar(); + return getToken(); + } + + //character literal + if (mLastChar == '\'') + { + std::string charLit; + while (true) + { + nextChar(); + if (mLastChar == EOF) //end of file + return reportError("unexpected end of file in character literal (1)"); + if (mLastChar == '\r' || mLastChar == '\n') + return reportError("unexpected newline in character literal (1)"); + if (mLastChar == '\'') //end of character literal + { + if (charLit.length() != 1) + return reportError(StringUtils::sprintf("invalid character literal '%s'", charLit.c_str())); + mState.CharLit = charLit[0]; + nextChar(); + return tok_charlit; + } + if (mLastChar == '\\') //escape sequence + { + nextChar(); + if (mLastChar == EOF) + return reportError("unexpected end of file in character literal (2)"); + if (mLastChar == '\r' || mLastChar == '\n') + return reportError("unexpected newline in character literal (2)"); + if (mLastChar == '\'' || mLastChar == '\"' || mLastChar == '?' || mLastChar == '\\') + mLastChar = mLastChar; + else if (mLastChar == 'a') + mLastChar = '\a'; + else if (mLastChar == 'b') + mLastChar = '\b'; + else if (mLastChar == 'f') + mLastChar = '\f'; + else if (mLastChar == 'n') + mLastChar = '\n'; + else if (mLastChar == 'r') + mLastChar = '\r'; + else if (mLastChar == 't') + mLastChar = '\t'; + else if (mLastChar == 'v') + mLastChar = '\v'; + else if (mLastChar == '0') + mLastChar = '\0'; + else if (mLastChar == 'x') //\xHH + { + auto ch1 = nextChar(); + auto ch2 = nextChar(); + if (isxdigit(ch1) && isxdigit(ch2)) + { + char byteStr[3] = ""; + byteStr[0] = ch1; + byteStr[1] = ch2; + uint64_t hexData; + auto error = convertNumber(byteStr, hexData, 16); + if (error) + return reportError(StringUtils::sprintf("convertNumber failed (%s) for hex sequence \"\\x%c%c\" in character literal", error, ch1, ch2)); + mLastChar = hexData & 0xFF; + } + else + return reportError(StringUtils::sprintf("invalid hex sequence \"\\x%c%c\" in character literal", ch1, ch2)); + } + else + return reportError(StringUtils::sprintf("invalid escape sequence \"\\%c\" in character literal", mLastChar)); + } + charLit += mLastChar; + } + } + + //string literal + if (mLastChar == '\"') + { + mState.StringLit.clear(); + while (true) + { + nextChar(); + if (mLastChar == EOF) //end of file + return reportError("unexpected end of file in string literal (1)"); + if (mLastChar == '\r' || mLastChar == '\n') + return reportError("unexpected newline in string literal (1)"); + if (mLastChar == '\"') //end of string literal + { + nextChar(); + return tok_stringlit; + } + if (mLastChar == '\\') //escape sequence + { + nextChar(); + if (mLastChar == EOF) + return reportError("unexpected end of file in string literal (2)"); + if (mLastChar == '\r' || mLastChar == '\n') + return reportError("unexpected newline in string literal (2)"); + if (mLastChar == '\'' || mLastChar == '\"' || mLastChar == '?' || mLastChar == '\\') + mLastChar = mLastChar; + else if (mLastChar == 'a') + mLastChar = '\a'; + else if (mLastChar == 'b') + mLastChar = '\b'; + else if (mLastChar == 'f') + mLastChar = '\f'; + else if (mLastChar == 'n') + mLastChar = '\n'; + else if (mLastChar == 'r') + mLastChar = '\r'; + else if (mLastChar == 't') + mLastChar = '\t'; + else if (mLastChar == 'v') + mLastChar = '\v'; + else if (mLastChar == '0') + mLastChar = '\0'; + else if (mLastChar == 'x') //\xHH + { + auto ch1 = nextChar(); + auto ch2 = nextChar(); + if (isxdigit(ch1) && isxdigit(ch2)) + { + char byteStr[3] = ""; + byteStr[0] = ch1; + byteStr[1] = ch2; + uint64_t hexData; + auto error = convertNumber(byteStr, hexData, 16); + if (error) + return reportError(StringUtils::sprintf("convertNumber failed (%s) for hex sequence \"\\x%c%c\" in string literal", error, ch1, ch2)); + mLastChar = hexData & 0xFF; + } + else + return reportError(StringUtils::sprintf("invalid hex sequence \"\\x%c%c\" in string literal", ch1, ch2)); + } + else + return reportError(StringUtils::sprintf("invalid escape sequence \"\\%c\" in string literal", mLastChar)); + } + appendCh(mState.StringLit, mLastChar); + } + } + + //identifier/keyword + if (isalpha(mLastChar) || mLastChar == '_') //[a-zA-Z_] + { + mState.IdentifierStr = mLastChar; + nextChar(); + while (isalnum(mLastChar) || mLastChar == '_') //[0-9a-zA-Z_] + { + appendCh(mState.IdentifierStr, mLastChar); + nextChar(); + } + + //keywords + auto found = mKeywordMap.find(mState.IdentifierStr); + if (found != mKeywordMap.end()) + return found->second; + + return tok_identifier; + } + + //hex numbers + if (mLastChar == '0' && peekChar() == 'x') //0x + { + nextChar(); //consume the 'x' + mNumStr.clear(); + + while (isxdigit(nextChar())) //[0-9a-fA-F]* + appendCh(mNumStr, mLastChar); + + if (!mNumStr.length()) //check for error condition + return reportError("no hex digits after \"0x\" prefix"); + + auto error = convertNumber(mNumStr.c_str(), mState.NumberVal, 16); + if (error) + return reportError(StringUtils::sprintf("convertNumber failed (%s) on hexadecimal number", error)); + mIsHexNumberVal = true; + return tok_number; + } + if (isdigit(mLastChar)) //[0-9] + { + mNumStr = mLastChar; + + while (isdigit(nextChar())) //[0-9]* + mNumStr += mLastChar; + + auto error = convertNumber(mNumStr.c_str(), mState.NumberVal, 10); + if (error) + return reportError(StringUtils::sprintf("convertNumber failed (%s) on decimal number", error)); + mIsHexNumberVal = false; + return tok_number; + } + + //comments + if (mLastChar == '/' && peekChar() == '/') //line comment + { + do + { + if (mLastChar == '\n') + signalNewLine(); + nextChar(); + } while (!(mLastChar == EOF || mLastChar == '\n')); + + return getToken(); //interpret the next line + } + if (mLastChar == '/' && peekChar() == '*') //block comment + { + do + { + if (mLastChar == '\n') + signalNewLine(); + nextChar(); + } while (!(mLastChar == EOF || mLastChar == '*' && peekChar() == '/')); + + if (mLastChar == EOF) //unexpected end of file + { + mState.LineIndex++; + return reportError("unexpected end of file in block comment"); + } + + nextChar(); + nextChar(); + return getToken(); //get the next non-comment token + } + + //operators + auto opFound = mOpTripleMap.find(MAKE_OP_TRIPLE(mLastChar, peekChar(), peekChar(1))); + if (opFound != mOpTripleMap.end()) + { + nextChar(); + nextChar(); + nextChar(); + return opFound->second; + } + opFound = mOpDoubleMap.find(MAKE_OP_DOUBLE(mLastChar, peekChar())); + if (opFound != mOpDoubleMap.end()) + { + nextChar(); + nextChar(); + return opFound->second; + } + opFound = mOpSingleMap.find(MAKE_OP_SINGLE(mLastChar)); + if (opFound != mOpSingleMap.end()) + { + nextChar(); + return opFound->second; + } + + //end of file + if (mLastChar == EOF) + return tok_eof; + + //unknown character + return reportError(StringUtils::sprintf("unexpected character \'%c\'", mLastChar)); +} + +Lexer::Token Lexer::reportError(const std::string & error) +{ + mError = error; + return tok_error; +} + +int Lexer::nextChar() +{ + return mLastChar = readChar(); +} + +void Lexer::reportWarning(const std::string & warning) +{ + mWarnings.push_back(warning); +} + +void Lexer::resetLexerState() +{ + mInput.clear(); + mInput.reserve(1024 * 1024); + mIndex = 0; + mError.clear(); + mWarnings.clear(); + clearReserve(mState.IdentifierStr); + mState.NumberVal = 0; + mIsHexNumberVal = false; + clearReserve(mState.StringLit); + clearReserve(mNumStr, 16); + mState.CharLit = '\0'; + mLastChar = ' '; + mState.CurLine = 0; + mState.LineIndex = 0; +} + +void Lexer::setupTokenMaps() +{ + //setup keyword map +#define DEF_KEYWORD(keyword) mKeywordMap[#keyword] = tok_##keyword; +#include "keywords.h" +#undef DEF_KEYWORD + + //setup token maps +#define DEF_OP_TRIPLE(enumval, ch1, ch2, ch3) mOpTripleMap[MAKE_OP_TRIPLE(ch1, ch2, ch3)] = tok_##enumval; +#define DEF_OP_DOUBLE(enumval, ch1, ch2) mOpDoubleMap[MAKE_OP_DOUBLE(ch1, ch2)] = tok_##enumval; +#define DEF_OP_SINGLE(enumval, ch1) mOpSingleMap[MAKE_OP_SINGLE(ch1)] = tok_##enumval; +#include "operators.h" +#undef DEF_OP_TRIPLE +#undef DEF_OP_DOUBLE +#undef DEF_OP_SINGLE + + //setup reverse token maps +#define DEF_KEYWORD(keyword) mReverseTokenMap[tok_##keyword] = #keyword; +#include "keywords.h" +#undef DEF_KEYWORD + +#define DEF_OP_TRIPLE(enumval, ch1, ch2, ch3) mReverseTokenMap[tok_##enumval] = std::string({ch1, ch2, ch3}); +#define DEF_OP_DOUBLE(enumval, ch1, ch2) mReverseTokenMap[tok_##enumval] = std::string({ch1, ch2}); +#define DEF_OP_SINGLE(enumval, ch1) mReverseTokenMap[tok_##enumval] = std::string({ch1}); +#include "operators.h" +#undef DEF_OP_TRIPLE +#undef DEF_OP_DOUBLE +#undef DEF_OP_SINGLE +} + +std::string Lexer::tokString(Token tok) +{ + switch (tok) + { + case tok_eof: return "tok_eof"; + case tok_error: return StringUtils::sprintf("error(line %d, col %d, \"%s\")", mState.CurLine + 1, mState.LineIndex, mError.c_str()); + case tok_identifier: return mState.IdentifierStr; + case tok_number: return StringUtils::sprintf(mIsHexNumberVal ? "0x%llX" : "%llu", mState.NumberVal); + case tok_stringlit: return StringUtils::sprintf("\"%s\"", StringUtils::Escape(mState.StringLit).c_str()); + case tok_charlit: + { + std::string s; + s = mState.CharLit; + return StringUtils::sprintf("'%s'", StringUtils::Escape(s).c_str()); + } + default: + { + auto found = mReverseTokenMap.find(Token(tok)); + if (found != mReverseTokenMap.end()) + return found->second; + return ""; + } + } +} + +int Lexer::peekChar(size_t distance) +{ + if (mIndex + distance >= mInput.size()) + return EOF; + auto ch = mInput[mIndex + distance]; + if (ch == '\0') + { + reportWarning(StringUtils::sprintf("\\0 character in file data")); + return peekChar(distance + 1); + } + return ch; +} + +int Lexer::readChar() +{ + if (mIndex == mInput.size()) + return EOF; + auto ch = mInput[mIndex++]; + mState.LineIndex++; + if (ch == '\0') + { + reportWarning(StringUtils::sprintf("\\0 character in file data")); + return readChar(); + } + return ch; +} + +bool Lexer::checkString(const std::string & expected) +{ + for (size_t i = 0; i < expected.size(); i++) + { + auto ch = peekChar(i); + if (ch == EOF) + return false; + if (ch != uint8_t(expected[i])) + return false; + } + mIndex += expected.size(); + return true; +} + +void Lexer::signalNewLine() +{ + mState.CurLine++; + mState.LineIndex = 0; +} diff --git a/btparser/lexer.h b/btparser/lexer.h new file mode 100644 index 0000000..3e55583 --- /dev/null +++ b/btparser/lexer.h @@ -0,0 +1,81 @@ +#pragma once + +#include +#include +#include +#include + +class Lexer +{ +public: + enum Token + { + //status tokens + tok_eof = -10000, + tok_error, + + //keywords +#define DEF_KEYWORD(keyword) tok_##keyword, +#include "keywords.h" +#undef DEF_KEYWORD + + //others + tok_identifier, //[a-zA-Z_][a-zA-Z0-9_] + tok_number, //(0x[0-9a-fA-F]+)|([0-9]+) + tok_stringlit, //"([^\\"]|\\([\\"'?abfnrtv0]|x[0-9a-fA-f]{2}))*" + tok_charlit, //'([^\\]|\\([\\"'?abfnrtv0]|x[0-9a-fA-f]{2}))' + + //operators +#define DEF_OP_TRIPLE(enumval, ch1, ch2, ch3) tok_##enumval, +#define DEF_OP_DOUBLE(enumval, ch1, ch2) tok_##enumval, +#define DEF_OP_SINGLE(enumval, ch1) tok_##enumval, +#include "operators.h" +#undef DEF_OP_TRIPLE +#undef DEF_OP_DOUBLE +#undef DEF_OP_SINGLE + }; + + struct TokenState + { + Token Token; + std::string IdentifierStr; //tok_identifier + uint64_t NumberVal = 0; //tok_number + std::string StringLit; //tok_stringlit + char CharLit = '\0'; //tok_charlit + + size_t CurLine = 0; + size_t LineIndex = 0; + }; + + explicit Lexer(); + bool ReadInputFile(const std::string & filename); + bool DoLexing(std::vector & tokens, std::string & error); + bool Test(const std::function & lexEnum, bool output = true); + +private: + TokenState mState; + std::vector mWarnings; + std::string mError; + std::vector mInput; + size_t mIndex = 0; + bool mIsHexNumberVal = false; + std::string mNumStr; + int mLastChar = ' '; + std::unordered_map mKeywordMap; + std::unordered_map mReverseTokenMap; + std::unordered_map mOpTripleMap; + std::unordered_map mOpDoubleMap; + std::unordered_map mOpSingleMap; + + void resetLexerState(); + void setupTokenMaps(); + Token reportError(const std::string & error); + void reportWarning(const std::string & warning); + std::string tokString(Token tok); + int peekChar(size_t distance = 0); + int readChar(); + bool checkString(const std::string & expected); + int nextChar(); + void signalNewLine(); + Token getToken(); +}; \ No newline at end of file diff --git a/btparser/main.cpp b/btparser/main.cpp index c4f71e7..365e73c 100644 --- a/btparser/main.cpp +++ b/btparser/main.cpp @@ -1,567 +1,27 @@ #include #include -#include -#include -#include -#include -#include -#include "filehelper.h" -#include "stringutils.h" #include "testfiles.h" +#include "lexer.h" +#include "filehelper.h" -#define MAKE_OP_TRIPLE(ch1, ch2, ch3) (ch3 << 16 | ch2 << 8 | ch1) -#define MAKE_OP_DOUBLE(ch1, ch2) (ch2 << 8 | ch1) -#define MAKE_OP_SINGLE(ch1) (ch1) - -#define DEFAULT_STRING_BUFFER 65536 - -using namespace std; - -struct Lexer -{ - explicit Lexer() - { - SetupTokenMaps(); - } - - enum Token - { - //status tokens - tok_eof = -10000, - tok_error, - - //keywords -#define DEF_KEYWORD(keyword) tok_##keyword, -#include "keywords.h" -#undef DEF_KEYWORD - - //others - tok_identifier, //[a-zA-Z_][a-zA-Z0-9_] - tok_number, //(0x[0-9a-fA-F]+)|([0-9]+) - tok_stringlit, //"([^\\"]|\\([\\"'?abfnrtv0]|x[0-9a-fA-f]{2}))*" - tok_charlit, //'([^\\]|\\([\\"'?abfnrtv0]|x[0-9a-fA-f]{2}))' - - //operators -#define DEF_OP_TRIPLE(enumval, ch1, ch2, ch3) tok_##enumval, -#define DEF_OP_DOUBLE(enumval, ch1, ch2) tok_##enumval, -#define DEF_OP_SINGLE(enumval, ch1) tok_##enumval, -#include "operators.h" -#undef DEF_OP_TRIPLE -#undef DEF_OP_DOUBLE -#undef DEF_OP_SINGLE - }; - - vector Input; - size_t Index = 0; - string Error; - vector Warnings; - - //lexer state - string IdentifierStr; - uint64_t NumberVal = 0; - bool IsHexNumberVal = false; - string StringLit; - string NumStr; - char CharLit = '\0'; - int LastChar = ' '; - size_t CurLine = 0; - size_t LineIndex = 0; - - static void clearReserve(string & str, size_t reserve = DEFAULT_STRING_BUFFER) - { - str.clear(); - str.reserve(reserve); - } - - static void appendCh(string & str, char ch) - { - str.resize(str.size() + 1); - str[str.size() - 1] = ch; - } - - void ResetLexerState() - { - Input.clear(); - Input.reserve(1024 * 1024); - Index = 0; - Error.clear(); - Warnings.clear(); - clearReserve(IdentifierStr); - NumberVal = 0; - IsHexNumberVal = false; - clearReserve(StringLit); - clearReserve(NumStr, 16); - CharLit = '\0'; - LastChar = ' '; - CurLine = 0; - LineIndex = 0; - } - - unordered_map KeywordMap; - unordered_map ReverseTokenMap; - unordered_map OpTripleMap; - unordered_map OpDoubleMap; - unordered_map OpSingleMap; - - void SetupTokenMaps() - { - //setup keyword map -#define DEF_KEYWORD(keyword) KeywordMap[#keyword] = tok_##keyword; -#include "keywords.h" -#undef DEF_KEYWORD - - //setup token maps -#define DEF_OP_TRIPLE(enumval, ch1, ch2, ch3) OpTripleMap[MAKE_OP_TRIPLE(ch1, ch2, ch3)] = tok_##enumval; -#define DEF_OP_DOUBLE(enumval, ch1, ch2) OpDoubleMap[MAKE_OP_DOUBLE(ch1, ch2)] = tok_##enumval; -#define DEF_OP_SINGLE(enumval, ch1) OpSingleMap[MAKE_OP_SINGLE(ch1)] = tok_##enumval; -#include "operators.h" -#undef DEF_OP_TRIPLE -#undef DEF_OP_DOUBLE -#undef DEF_OP_SINGLE - - //setup reverse token maps -#define DEF_KEYWORD(keyword) ReverseTokenMap[tok_##keyword] = #keyword; -#include "keywords.h" -#undef DEF_KEYWORD - -#define DEF_OP_TRIPLE(enumval, ch1, ch2, ch3) ReverseTokenMap[tok_##enumval] = string({ch1, ch2, ch3}); -#define DEF_OP_DOUBLE(enumval, ch1, ch2) ReverseTokenMap[tok_##enumval] = string({ch1, ch2}); -#define DEF_OP_SINGLE(enumval, ch1) ReverseTokenMap[tok_##enumval] = string({ch1}); -#include "operators.h" -#undef DEF_OP_TRIPLE -#undef DEF_OP_DOUBLE -#undef DEF_OP_SINGLE - } - - Token ReportError(const String & error) - { - Error = error; - return tok_error; - } - - void ReportWarning(const String & warning) - { - Warnings.push_back(warning); - } - - String TokString(int tok) - { - switch (Token(tok)) - { - case tok_eof: return "tok_eof"; - case tok_error: return StringUtils::sprintf("error(line %d, col %d, \"%s\")", CurLine + 1, LineIndex, Error.c_str()); - case tok_identifier: return IdentifierStr; - case tok_number: return StringUtils::sprintf(IsHexNumberVal ? "0x%llX" : "%llu", NumberVal); - case tok_stringlit: return StringUtils::sprintf("\"%s\"", StringUtils::Escape(StringLit).c_str()); - case tok_charlit: - { - String s; - s = CharLit; - return StringUtils::sprintf("'%s'", StringUtils::Escape(s).c_str()); - } - default: - { - auto found = ReverseTokenMap.find(Token(tok)); - if (found != ReverseTokenMap.end()) - return found->second; - return ""; - } - } - } - - int PeekChar(size_t distance = 0) - { - if (Index + distance >= Input.size()) - return EOF; - auto ch = Input[Index + distance]; - if (ch == '\0') - { - ReportWarning(StringUtils::sprintf("\\0 character in file data")); - return PeekChar(distance + 1); - } - return ch; - } - - int ReadChar() - { - if (Index == Input.size()) - return EOF; - auto ch = Input[Index++]; - LineIndex++; - if (ch == '\0') - { - ReportWarning(StringUtils::sprintf("\\0 character in file data")); - return ReadChar(); - } - return ch; - } - - bool CheckString(const string & expected) - { - for (size_t i = 0; i < expected.size(); i++) - { - auto ch = PeekChar(i); - if (ch == EOF) - return false; - if (ch != uint8_t(expected[i])) - return false; - } - Index += expected.size(); - return true; - } - - int NextChar() - { - return LastChar = ReadChar(); - } - - void SignalNextLine() - { - CurLine++; - LineIndex = 0; - } - - static const char* ConvertNumber(const char* str, uint64_t & result, int radix) - { - errno = 0; - char* end; - result = strtoull(str, &end, radix); - if (!result && end == str) - return "not a number"; - if (result == ULLONG_MAX && errno) - return "does not fit"; - if (*end) - return "str not completely consumed"; - return nullptr; - } - - int GetToken() - { - //skip whitespace - while (isspace(LastChar)) - { - if (LastChar == '\n') - SignalNextLine(); - NextChar(); - } - - //skip \\[\r\n] - if (LastChar == '\\' && (PeekChar() == '\r' || PeekChar() == '\n')) - { - NextChar(); - return GetToken(); - } - - //character literal - if (LastChar == '\'') - { - string charLit; - while (true) - { - NextChar(); - if (LastChar == EOF) //end of file - return ReportError("unexpected end of file in character literal (1)"); - if (LastChar == '\r' || LastChar == '\n') - return ReportError("unexpected newline in character literal (1)"); - if (LastChar == '\'') //end of character literal - { - if (charLit.length() != 1) - return ReportError(StringUtils::sprintf("invalid character literal '%s'", charLit.c_str())); - CharLit = charLit[0]; - NextChar(); - return tok_charlit; - } - if (LastChar == '\\') //escape sequence - { - NextChar(); - if (LastChar == EOF) - return ReportError("unexpected end of file in character literal (2)"); - if (LastChar == '\r' || LastChar == '\n') - return ReportError("unexpected newline in character literal (2)"); - if (LastChar == '\'' || LastChar == '\"' || LastChar == '?' || LastChar == '\\') - LastChar = LastChar; - else if (LastChar == 'a') - LastChar = '\a'; - else if (LastChar == 'b') - LastChar = '\b'; - else if (LastChar == 'f') - LastChar = '\f'; - else if (LastChar == 'n') - LastChar = '\n'; - else if (LastChar == 'r') - LastChar = '\r'; - else if (LastChar == 't') - LastChar = '\t'; - else if (LastChar == 'v') - LastChar = '\v'; - else if (LastChar == '0') - LastChar = '\0'; - else if (LastChar == 'x') //\xHH - { - auto ch1 = NextChar(); - auto ch2 = NextChar(); - if (isxdigit(ch1) && isxdigit(ch2)) - { - char byteStr[3] = ""; - byteStr[0] = ch1; - byteStr[1] = ch2; - uint64_t hexData; - auto error = ConvertNumber(byteStr, hexData, 16); - if (error) - return ReportError(StringUtils::sprintf("ConvertNumber failed (%s) for hex sequence \"\\x%c%c\" in character literal", error, ch1, ch2)); - LastChar = hexData & 0xFF; - } - else - return ReportError(StringUtils::sprintf("invalid hex sequence \"\\x%c%c\" in character literal", ch1, ch2)); - } - else - return ReportError(StringUtils::sprintf("invalid escape sequence \"\\%c\" in character literal", LastChar)); - } - charLit += LastChar; - } - } - - //string literal - if (LastChar == '\"') - { - StringLit.clear(); - while (true) - { - NextChar(); - if (LastChar == EOF) //end of file - return ReportError("unexpected end of file in string literal (1)"); - if (LastChar == '\r' || LastChar == '\n') - return ReportError("unexpected newline in string literal (1)"); - if (LastChar == '\"') //end of string literal - { - NextChar(); - return tok_stringlit; - } - if (LastChar == '\\') //escape sequence - { - NextChar(); - if (LastChar == EOF) - return ReportError("unexpected end of file in string literal (2)"); - if (LastChar == '\r' || LastChar == '\n') - return ReportError("unexpected newline in string literal (2)"); - if (LastChar == '\'' || LastChar == '\"' || LastChar == '?' || LastChar == '\\') - LastChar = LastChar; - else if (LastChar == 'a') - LastChar = '\a'; - else if (LastChar == 'b') - LastChar = '\b'; - else if (LastChar == 'f') - LastChar = '\f'; - else if (LastChar == 'n') - LastChar = '\n'; - else if (LastChar == 'r') - LastChar = '\r'; - else if (LastChar == 't') - LastChar = '\t'; - else if (LastChar == 'v') - LastChar = '\v'; - else if (LastChar == '0') - LastChar = '\0'; - else if (LastChar == 'x') //\xHH - { - auto ch1 = NextChar(); - auto ch2 = NextChar(); - if (isxdigit(ch1) && isxdigit(ch2)) - { - char byteStr[3] = ""; - byteStr[0] = ch1; - byteStr[1] = ch2; - uint64_t hexData; - auto error = ConvertNumber(byteStr, hexData, 16); - if (error) - return ReportError(StringUtils::sprintf("ConvertNumber failed (%s) for hex sequence \"\\x%c%c\" in string literal", error, ch1, ch2)); - LastChar = hexData & 0xFF; - } - else - return ReportError(StringUtils::sprintf("invalid hex sequence \"\\x%c%c\" in string literal", ch1, ch2)); - } - else - return ReportError(StringUtils::sprintf("invalid escape sequence \"\\%c\" in string literal", LastChar)); - } - appendCh(StringLit, LastChar); - } - } - - //identifier/keyword - if (isalpha(LastChar) || LastChar == '_') //[a-zA-Z_] - { - IdentifierStr = LastChar; - NextChar(); - while (isalnum(LastChar) || LastChar == '_') //[0-9a-zA-Z_] - { - appendCh(IdentifierStr, LastChar); - NextChar(); - } - - //keywords - auto found = KeywordMap.find(IdentifierStr); - if (found != KeywordMap.end()) - return found->second; - - return tok_identifier; - } - - //hex numbers - if (LastChar == '0' && PeekChar() == 'x') //0x - { - NextChar(); //consume the 'x' - NumStr.clear(); - - while (isxdigit(NextChar())) //[0-9a-fA-F]* - appendCh(NumStr, LastChar); - - if (!NumStr.length()) //check for error condition - return ReportError("no hex digits after \"0x\" prefix"); - - auto error = ConvertNumber(NumStr.c_str(), NumberVal, 16); - if (error) - return ReportError(StringUtils::sprintf("ConvertNumber failed (%s) on hexadecimal number", error)); - IsHexNumberVal = true; - return tok_number; - } - if (isdigit(LastChar)) //[0-9] - { - NumStr = LastChar; - - while (isdigit(NextChar())) //[0-9]* - NumStr += LastChar; - - auto error = ConvertNumber(NumStr.c_str(), NumberVal, 10); - if (error) - return ReportError(StringUtils::sprintf("ConvertNumber failed (%s) on decimal number", error)); - IsHexNumberVal = false; - return tok_number; - } - - //comments - if (LastChar == '/' && PeekChar() == '/') //line comment - { - do - { - if (LastChar == '\n') - SignalNextLine(); - NextChar(); - } while (!(LastChar == EOF || LastChar == '\n')); - - return GetToken(); //interpret the next line - } - if (LastChar == '/' && PeekChar() == '*') //block comment - { - do - { - if (LastChar == '\n') - SignalNextLine(); - NextChar(); - } while (!(LastChar == EOF || LastChar == '*' && PeekChar() == '/')); - - if (LastChar == EOF) //unexpected end of file - { - LineIndex++; - return ReportError("unexpected end of file in block comment"); - } - - NextChar(); - NextChar(); - return GetToken(); //get the next non-comment token - } - - //operators - auto opFound = OpTripleMap.find(MAKE_OP_TRIPLE(LastChar, PeekChar(), PeekChar(1))); - if (opFound != OpTripleMap.end()) - { - NextChar(); - NextChar(); - NextChar(); - return opFound->second; - } - opFound = OpDoubleMap.find(MAKE_OP_DOUBLE(LastChar, PeekChar())); - if (opFound != OpDoubleMap.end()) - { - NextChar(); - NextChar(); - return opFound->second; - } - opFound = OpSingleMap.find(MAKE_OP_SINGLE(LastChar)); - if (opFound != OpSingleMap.end()) - { - NextChar(); - return opFound->second; - } - - //end of file - if (LastChar == EOF) - return tok_eof; - - //unknown character - return ReportError(StringUtils::sprintf("unexpected character \'%c\'", LastChar)); - } - - bool ReadInputFile(const string & filename) - { - ResetLexerState(); - return FileHelper::ReadAllData(filename, Input); - } - - bool TestLex(const function & lexEnum, bool output = true) - { - size_t line = 0; - if (output) - lexEnum("1: "); - int tok; - string toks; - clearReserve(toks); - char newlineText[128] = ""; - do - { - tok = GetToken(); - if (!output) - continue; - toks.clear(); - while (line < CurLine) - { - line++; - sprintf_s(newlineText, "\n%d: ", line + 1); - toks.append(newlineText); - } - toks.append(TokString(tok)); - appendCh(toks, ' '); - lexEnum(toks); - } while (tok != tok_eof && tok != tok_error); - if (tok != tok_error && tok != tok_eof) - tok = ReportError("lexer did not finish at the end of the file"); - for (const auto & warning : Warnings) - if (output) - lexEnum("\nwarning: " + warning); - return tok != tok_error; - } -}; - -bool TestLexer(Lexer & lexer, const string & filename) +bool TestLexer(Lexer & lexer, const std::string & filename) { if (!lexer.ReadInputFile("tests\\" + filename)) { printf("failed to read \"%s\"\n", filename.c_str()); return false; } - string actual; - Lexer::clearReserve(actual); - auto success = lexer.TestLex([&](const string & line) + std::string actual; + actual.reserve(65536); + auto success = lexer.Test([&](const std::string & line) { actual.append(line); }); - string expected; - if (FileHelper::ReadAllText("tests\\exp_lex\\" + filename, expected)) + std::string expected; + if (FileHelper::ReadAllText("tests\\exp_lex\\" + filename, expected) && expected == actual) { - if (expected == actual) - { - printf("lexer test for \"%s\" success!\n", filename.c_str()); - return true; - } + printf("lexer test for \"%s\" success!\n", filename.c_str()); + return true; } if (success) return true; @@ -571,14 +31,14 @@ bool TestLexer(Lexer & lexer, const string & filename) return false; } -bool DebugLexer(Lexer & lexer, const string & filename, bool output) +bool DebugLexer(Lexer & lexer, const std::string & filename, bool output) { if (!lexer.ReadInputFile("tests\\" + filename)) { printf("failed to read \"%s\"\n", filename.c_str()); return false; } - auto success = lexer.TestLex([](const string & line) + auto success = lexer.Test([](const std::string & line) { printf("%s", line.c_str()); }, output); @@ -587,16 +47,16 @@ bool DebugLexer(Lexer & lexer, const string & filename, bool output) return success; } -void GenerateExpected(Lexer & lexer, const string & filename) +void GenerateExpected(Lexer & lexer, const std::string & filename) { if (!lexer.ReadInputFile("tests\\" + filename)) { printf("failed to read \"%s\"\n", filename.c_str()); return; } - string actual; - Lexer::clearReserve(actual); - lexer.TestLex([&](const string & line) + std::string actual; + actual.reserve(65536); + lexer.Test([&](const std::string & line) { actual.append(line); });