mirror of https://github.com/x64dbg/btparser
553 lines
17 KiB
C++
553 lines
17 KiB
C++
#include "lexer.h"
|
|
#include <cctype>
|
|
#include <windows.h>
|
|
#include "helpers.h"
|
|
|
|
//Pack up to three operator characters into one integer key, first character in the lowest byte.
//Every parameter is parenthesized so that expression arguments (e.g. "a | b") keep their
//meaning when substituted next to the shift and or operators.
#define MAKE_OP_TRIPLE(ch1, ch2, ch3) ((ch3) << 16 | (ch2) << 8 | (ch1))
#define MAKE_OP_DOUBLE(ch1, ch2) ((ch2) << 8 | (ch1))
#define MAKE_OP_SINGLE(ch1) (ch1)

//Default capacity requested for reusable string buffers.
#define DEFAULT_STRING_BUFFER 65536
|
|
|
|
//Empties str and then requests room for at least `reserve` characters,
//so the buffer can be refilled without repeated reallocations.
static void clearReserve(std::string & str, size_t reserve = DEFAULT_STRING_BUFFER)
{
    str.resize(0); //drop the contents
    str.reserve(reserve); //make sure the requested capacity is available
}
|
|
|
|
//Converts the complete string str to an unsigned 64-bit value using the given radix.
//  str    - NUL-terminated text; it has to start with a digit or letter.
//  result - receives the converted value (0 when nothing could be converted).
//  radix  - numeric base handed to strtoull.
//Returns nullptr on success, otherwise a static string describing the failure.
static const char* convertNumber(const char* str, uint64_t & result, int radix)
{
    //strtoull skips leading whitespace and accepts a sign (a '-' negates the value
    //modulo 2^64), so "-1" would silently become the largest 64-bit number.
    //Require the text to start with something that can be a digit in some radix.
    if(!isalnum(static_cast<unsigned char>(*str)))
    {
        result = 0;
        return "not a number";
    }

    errno = 0;
    char* end = nullptr;
    result = strtoull(str, &end, radix);
    if(end == str) //no character was a valid digit for this radix
        return "not a number";
    if(result == ULLONG_MAX && errno == ERANGE) //value is larger than 64 bits
        return "does not fit";
    if(*end) //trailing characters after the number
        return "str not completely consumed";
    return nullptr;
}
|
|
|
|
//Constructs a lexer; the keyword and operator lookup tables are filled once here.
Lexer::Lexer()
{
    this->setupTokenMaps();
}
|
|
|
|
bool Lexer::ReadInputFile(const std::string & filename)
|
|
{
|
|
resetLexerState();
|
|
return FileHelper::ReadAllData(filename, mInput);
|
|
}
|
|
|
|
void Lexer::SetInputData(const std::string & data)
|
|
{
|
|
resetLexerState();
|
|
for(auto & ch : data)
|
|
mInput.push_back(ch);
|
|
}
|
|
|
|
bool Lexer::DoLexing(std::vector<TokenState> & tokens, std::string & error)
|
|
{
|
|
while(true)
|
|
{
|
|
size_t lineIndex = -1;
|
|
auto token = getToken(lineIndex);
|
|
if (lineIndex == -1)
|
|
__debugbreak();
|
|
mState.Token = token;
|
|
if(token == tok_error)
|
|
{
|
|
error = StringUtils::sprintf("line %d, col %d: %s", mState.CurLine + 1, mState.LineIndex, mError.c_str());
|
|
return false;
|
|
}
|
|
tokens.push_back(mState);
|
|
// Restore the line index from when we started parsing the token
|
|
tokens.back().LineIndex = lineIndex;
|
|
mState.Clear();
|
|
if(token == tok_eof)
|
|
break;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool Lexer::Test(const std::function<void(const std::string & line)> & lexEnum, bool output)
|
|
{
|
|
size_t line = 0;
|
|
if(output)
|
|
lexEnum("1: ");
|
|
Token tok;
|
|
std::string toks;
|
|
clearReserve(toks);
|
|
char newlineText[128] = "";
|
|
do
|
|
{
|
|
size_t lineIndex = -1;
|
|
tok = getToken(lineIndex);
|
|
if(!output)
|
|
continue;
|
|
toks.clear();
|
|
while(line < mState.CurLine)
|
|
{
|
|
line++;
|
|
sprintf_s(newlineText, "\n%zu: ", line + 1);
|
|
toks.append(newlineText);
|
|
}
|
|
toks.append(TokString(tok));
|
|
toks.push_back(' ');
|
|
lexEnum(toks);
|
|
}
|
|
while(tok != tok_eof && tok != tok_error);
|
|
for(const auto & warning : mWarnings)
|
|
if(output)
|
|
lexEnum("\nwarning: " + warning);
|
|
return tok != tok_error;
|
|
}
|
|
|
|
//Reads the next token from the input.
//  tokenLineIndex - receives mState.LineIndex - 1 as it was on the first character of the
//                   token (0 for tok_eof).
//Whitespace, backslash-newline continuations and comments in front of the token are skipped.
//Token payloads are stored in mState (IdentifierStr, NumberVal, StringLit, CharLit).
//Returns the token kind, or tok_error with the message stored in mError.
Lexer::Token Lexer::getToken(size_t & tokenLineIndex)
{
    //Decodes the escape sequence that follows a backslash (mLastChar is the backslash on entry).
    //On success mLastChar holds the decoded character and true is returned, otherwise the
    //error is reported and false is returned. `where` names the literal kind for the messages.
    auto readEscape = [this](const char* where) -> bool
    {
        nextChar();
        if(mLastChar == EOF)
        {
            reportError(StringUtils::sprintf("unexpected end of file in %s (2)", where));
            return false;
        }
        if(mLastChar == '\r' || mLastChar == '\n')
        {
            reportError(StringUtils::sprintf("unexpected newline in %s (2)", where));
            return false;
        }
        switch(mLastChar)
        {
        case '\'':
        case '\"':
        case '?':
        case '\\':
            break; //these characters stand for themselves
        case 'a':
            mLastChar = '\a';
            break;
        case 'b':
            mLastChar = '\b';
            break;
        case 'f':
            mLastChar = '\f';
            break;
        case 'n':
            mLastChar = '\n';
            break;
        case 'r':
            mLastChar = '\r';
            break;
        case 't':
            mLastChar = '\t';
            break;
        case 'v':
            mLastChar = '\v';
            break;
        case '0':
            mLastChar = '\0';
            break;
        case 'x': //\xHH
        {
            auto ch1 = nextChar();
            auto ch2 = nextChar();
            if(!isxdigit(ch1) || !isxdigit(ch2))
            {
                reportError(StringUtils::sprintf("invalid hex sequence \"\\x%c%c\" in %s", ch1, ch2, where));
                return false;
            }
            char byteStr[3] = "";
            byteStr[0] = ch1;
            byteStr[1] = ch2;
            uint64_t hexData;
            auto error = convertNumber(byteStr, hexData, 16);
            if(error)
            {
                reportError(StringUtils::sprintf("convertNumber failed (%s) for hex sequence \"\\x%c%c\" in %s", error, ch1, ch2, where));
                return false;
            }
            mLastChar = hexData & 0xFF;
            break;
        }
        default:
            reportError(StringUtils::sprintf("invalid escape sequence \"\\%c\" in %s", mLastChar, where));
            return false;
        }
        return true;
    };

    //Skip everything that is not part of a token. This is a loop (not recursion) so that
    //any number of consecutive comments or continuations uses constant stack space.
    while(true)
    {
        //skip whitespace
        while(isspace(mLastChar))
        {
            if(mLastChar == '\n')
                signalNewLine();
            nextChar();
        }

        //skip \\[\r\n]
        if(mLastChar == '\\' && (peekChar() == '\r' || peekChar() == '\n'))
        {
            nextChar();
            continue;
        }

        if(mLastChar == '/' && peekChar() == '/') //line comment
        {
            //stop on the newline; the whitespace skipper above accounts for it
            do
            {
                nextChar();
            }
            while(!(mLastChar == EOF || mLastChar == '\n'));
            continue;
        }

        if(mLastChar == '/' && peekChar() == '*') //block comment
        {
            //step onto the '*' of the opener first, otherwise "/*/" would be taken
            //for a complete comment (the opener's '*' matching as the closer's)
            nextChar();
            do
            {
                if(mLastChar == '\n')
                    signalNewLine();
                nextChar();
            }
            while(!(mLastChar == EOF || (mLastChar == '*' && peekChar() == '/')));

            if(mLastChar == EOF) //unexpected end of file
            {
                mState.LineIndex++;
                tokenLineIndex = mState.LineIndex - 1; //never leave the caller's value unset
                return reportError("unexpected end of file in block comment");
            }

            nextChar(); //consume the '*'
            nextChar(); //consume the '/'
            continue;
        }

        break; //mLastChar is the first character of a token (or EOF)
    }

    //character literal
    if(mLastChar == '\'')
    {
        tokenLineIndex = mState.LineIndex - 1;
        std::string charLit;
        while(true)
        {
            nextChar();
            if(mLastChar == EOF) //end of file
                return reportError("unexpected end of file in character literal (1)");
            if(mLastChar == '\r' || mLastChar == '\n')
                return reportError("unexpected newline in character literal (1)");
            if(mLastChar == '\'') //end of character literal
            {
                if(charLit.length() != 1)
                    return reportError(StringUtils::sprintf("invalid character literal '%s'", charLit.c_str()));
                mState.CharLit = charLit[0];
                nextChar();
                return tok_charlit;
            }
            if(mLastChar == '\\' && !readEscape("character literal")) //escape sequence
                return tok_error;
            charLit += mLastChar;
        }
    }

    //string literal
    if(mLastChar == '\"')
    {
        tokenLineIndex = mState.LineIndex - 1;
        mState.StringLit.clear();
        while(true)
        {
            nextChar();
            if(mLastChar == EOF) //end of file
                return reportError("unexpected end of file in string literal (1)");
            if(mLastChar == '\r' || mLastChar == '\n')
                return reportError("unexpected newline in string literal (1)");
            if(mLastChar == '\"') //end of string literal
            {
                nextChar();
                return tok_stringlit;
            }
            if(mLastChar == '\\' && !readEscape("string literal")) //escape sequence
                return tok_error;
            mState.StringLit.push_back(mLastChar);
        }
    }

    //identifier/keyword
    if(isalpha(mLastChar) || mLastChar == '_') //[a-zA-Z_]
    {
        tokenLineIndex = mState.LineIndex - 1;
        mState.IdentifierStr = mLastChar;
        nextChar();
        while(isalnum(mLastChar) || mLastChar == '_') //[0-9a-zA-Z_]
        {
            mState.IdentifierStr.push_back(mLastChar);
            nextChar();
        }

        //keywords
        auto found = mKeywordMap.find(mState.IdentifierStr);
        if(found != mKeywordMap.end())
            return found->second;

        return tok_identifier;
    }

    //hex numbers
    if(mLastChar == '0' && peekChar() == 'x') //0x
    {
        tokenLineIndex = mState.LineIndex - 1;
        nextChar(); //consume the 'x'
        mNumStr.clear();

        while(isxdigit(nextChar())) //[0-9a-fA-F]*
            mNumStr.push_back(mLastChar);

        if(!mNumStr.length()) //check for error condition
            return reportError("no hex digits after \"0x\" prefix");

        auto error = convertNumber(mNumStr.c_str(), mState.NumberVal, 16);
        if(error)
            return reportError(StringUtils::sprintf("convertNumber failed (%s) on hexadecimal number", error));
        mIsHexNumberVal = true;
        return tok_number;
    }

    //decimal numbers
    if(isdigit(mLastChar)) //[0-9]
    {
        tokenLineIndex = mState.LineIndex - 1;
        mNumStr = mLastChar;

        while(isdigit(nextChar())) //[0-9]*
            mNumStr += mLastChar;

        auto error = convertNumber(mNumStr.c_str(), mState.NumberVal, 10);
        if(error)
            return reportError(StringUtils::sprintf("convertNumber failed (%s) on decimal number", error));
        mIsHexNumberVal = false;
        return tok_number;
    }

    //end of file (checked before the operators so the EOF sentinel in mLastChar
    //is never fed into the operator key macros)
    if(mLastChar == EOF)
    {
        tokenLineIndex = 0;
        return tok_eof;
    }

    tokenLineIndex = mState.LineIndex - 1;

    //operators, longest match first
    auto opFound = mOpTripleMap.find(MAKE_OP_TRIPLE(mLastChar, peekChar(), peekChar(1)));
    if(opFound != mOpTripleMap.end())
    {
        nextChar();
        nextChar();
        nextChar();
        return opFound->second;
    }
    opFound = mOpDoubleMap.find(MAKE_OP_DOUBLE(mLastChar, peekChar()));
    if(opFound != mOpDoubleMap.end())
    {
        nextChar();
        nextChar();
        return opFound->second;
    }
    opFound = mOpSingleMap.find(MAKE_OP_SINGLE(mLastChar));
    if(opFound != mOpSingleMap.end())
    {
        nextChar();
        return opFound->second;
    }

    //unknown character
    return reportError(StringUtils::sprintf("unexpected character \'%c\'", mLastChar));
}
|
|
|
|
//Remembers error as the current error message and returns tok_error,
//so that callers can write "return reportError(...)".
Lexer::Token Lexer::reportError(const std::string & error)
{
    mError.assign(error);
    return tok_error;
}
|
|
|
|
int Lexer::nextChar()
|
|
{
|
|
return mLastChar = readChar();
|
|
}
|
|
|
|
void Lexer::reportWarning(const std::string & warning)
|
|
{
|
|
mWarnings.push_back(warning);
|
|
}
|
|
|
|
void Lexer::resetLexerState()
|
|
{
|
|
mInput.clear();
|
|
mInput.reserve(1024 * 1024);
|
|
mIndex = 0;
|
|
mError.clear();
|
|
mWarnings.clear();
|
|
clearReserve(mState.IdentifierStr);
|
|
mIsHexNumberVal = false;
|
|
clearReserve(mState.StringLit);
|
|
clearReserve(mNumStr, 16);
|
|
mLastChar = ' ';
|
|
mState.Clear();
|
|
}
|
|
|
|
//Fills the lookup tables from the X-macro lists in keywords.h and operators.h:
//keyword text -> token, packed operator characters -> token, and token -> text.
//Each list is expanded once; every entry registers both the forward and the reverse mapping.
void Lexer::setupTokenMaps()
{
    //keywords
#define DEF_KEYWORD(keyword) \
    mKeywordMap[#keyword] = tok_##keyword; \
    mReverseTokenMap[tok_##keyword] = #keyword;
#include "keywords.h"
#undef DEF_KEYWORD

    //operators of three, two and one character(s)
#define DEF_OP_TRIPLE(enumval, ch1, ch2, ch3) \
    mOpTripleMap[MAKE_OP_TRIPLE(ch1, ch2, ch3)] = tok_##enumval; \
    mReverseTokenMap[tok_##enumval] = std::string({ch1, ch2, ch3});
#define DEF_OP_DOUBLE(enumval, ch1, ch2) \
    mOpDoubleMap[MAKE_OP_DOUBLE(ch1, ch2)] = tok_##enumval; \
    mReverseTokenMap[tok_##enumval] = std::string({ch1, ch2});
#define DEF_OP_SINGLE(enumval, ch1) \
    mOpSingleMap[MAKE_OP_SINGLE(ch1)] = tok_##enumval; \
    mReverseTokenMap[tok_##enumval] = std::string({ch1});
#include "operators.h"
#undef DEF_OP_TRIPLE
#undef DEF_OP_DOUBLE
#undef DEF_OP_SINGLE
}
|
|
|
|
std::string Lexer::TokString(const TokenState & ts)
|
|
{
|
|
switch(ts.Token)
|
|
{
|
|
case tok_eof:
|
|
return "tok_eof";
|
|
case tok_error:
|
|
return StringUtils::sprintf("error(line %d, col %d, \"%s\")", ts.CurLine + 1, ts.LineIndex, mError.c_str());
|
|
case tok_identifier:
|
|
return ts.IdentifierStr;
|
|
case tok_number:
|
|
return StringUtils::sprintf(mIsHexNumberVal ? "0x%llX" : "%llu", ts.NumberVal);
|
|
case tok_stringlit:
|
|
return StringUtils::sprintf("\"%s\"", StringUtils::Escape(ts.StringLit).c_str());
|
|
case tok_charlit:
|
|
{
|
|
std::string s;
|
|
s = ts.CharLit;
|
|
return StringUtils::sprintf("'%s'", StringUtils::Escape(s).c_str());
|
|
}
|
|
default:
|
|
{
|
|
auto found = mReverseTokenMap.find(ts.Token);
|
|
if(found != mReverseTokenMap.end())
|
|
return found->second;
|
|
return "<UNKNOWN TOKEN>";
|
|
}
|
|
}
|
|
}
|
|
|
|
std::string Lexer::TokString(Token tok)
|
|
{
|
|
switch(tok)
|
|
{
|
|
case tok_eof:
|
|
return "tok_eof";
|
|
case tok_error:
|
|
return StringUtils::sprintf("error(line %d, col %d, \"%s\")", mState.CurLine + 1, mState.LineIndex, mError.c_str());
|
|
case tok_identifier:
|
|
return mState.IdentifierStr;
|
|
case tok_number:
|
|
return StringUtils::sprintf(mIsHexNumberVal ? "0x%llX" : "%llu", mState.NumberVal);
|
|
case tok_stringlit:
|
|
return StringUtils::sprintf("\"%s\"", StringUtils::Escape(mState.StringLit).c_str());
|
|
case tok_charlit:
|
|
{
|
|
std::string s;
|
|
s = mState.CharLit;
|
|
return StringUtils::sprintf("'%s'", StringUtils::Escape(s).c_str());
|
|
}
|
|
default:
|
|
{
|
|
auto found = mReverseTokenMap.find(Token(tok));
|
|
if(found != mReverseTokenMap.end())
|
|
return found->second;
|
|
return "<UNKNOWN TOKEN>";
|
|
}
|
|
}
|
|
}
|
|
|
|
int Lexer::peekChar(size_t distance)
|
|
{
|
|
if(mIndex + distance >= mInput.size())
|
|
return EOF;
|
|
auto ch = mInput[mIndex + distance];
|
|
if(ch == '\0')
|
|
{
|
|
reportWarning(StringUtils::sprintf("\\0 character in file data"));
|
|
return peekChar(distance + 1);
|
|
}
|
|
return ch;
|
|
}
|
|
|
|
int Lexer::readChar()
|
|
{
|
|
if(mIndex == mInput.size())
|
|
return EOF;
|
|
auto ch = mInput[mIndex++];
|
|
mState.LineIndex++;
|
|
if(ch == '\0')
|
|
{
|
|
reportWarning(StringUtils::sprintf("\\0 character in file data"));
|
|
return readChar();
|
|
}
|
|
return ch;
|
|
}
|
|
|
|
bool Lexer::checkString(const std::string & expected)
|
|
{
|
|
for(size_t i = 0; i < expected.size(); i++)
|
|
{
|
|
auto ch = peekChar(i);
|
|
if(ch == EOF)
|
|
return false;
|
|
if(ch != uint8_t(expected[i]))
|
|
return false;
|
|
}
|
|
mIndex += expected.size();
|
|
return true;
|
|
}
|
|
|
|
void Lexer::signalNewLine()
|
|
{
|
|
mState.CurLine++;
|
|
mState.LineIndex = 0;
|
|
}
|