2016-06-05 06:00:36 +08:00
|
|
|
#include <windows.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <string>
|
|
|
|
#include <stdint.h>
|
|
|
|
#include <unordered_map>
|
2016-06-05 07:01:28 +08:00
|
|
|
#include <functional>
|
2016-06-05 06:00:36 +08:00
|
|
|
#include "filehelper.h"
|
|
|
|
#include "stringutils.h"
|
2016-06-05 07:01:28 +08:00
|
|
|
#include "testfiles.h"
|
2016-06-05 06:00:36 +08:00
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
2016-06-05 07:01:28 +08:00
|
|
|
//Hand-written lexer that splits the text stored in Input into tokens.
//The keyword tokens and both keyword lookup tables are generated from
//"keywords.h" through the DEF_KEYWORD X-macro.
struct Lexer
{
    explicit Lexer()
    {
        SetupKeywordMap();
    }

    //Token kinds returned by GetToken(). The named tokens start at -10000 and
    //count upwards; characters that are not part of any other token are returned
    //as their plain byte value instead.
    //NOTE(review): the named tokens stay clear of the byte values only while
    //keywords.h defines fewer than ~10000 keywords - confirm.
    enum Token
    {
        //status tokens
        tok_eof = -10000,
        tok_error,

        //keywords
#define DEF_KEYWORD(keyword) tok_##keyword,
#include "keywords.h"
#undef DEF_KEYWORD

        //others
        tok_identifier, //[a-zA-Z_][a-zA-Z0-9_]*
        tok_number, //(0x[0-9a-fA-F]+)|([0-9]+)
        tok_stringlit //"([^\\"]|\\[\\"abfnrtv0])*" (raw line breaks are accepted as-is)
    };

    string Input; //text being lexed
    string ConsumedInput; //every character handed out by ReadChar() so far
    size_t Index = 0; //position in Input of the next unread character
    string Error; //message passed to the last ReportError() call

    //lexer state
    string IdentifierStr; //spelling of the last identifier or keyword
    uint64_t NumberVal = 0; //value of the last tok_number
    string StringLit; //unescaped contents of the last tok_stringlit

    int LastChar = ' '; //lookahead: already read from Input, not yet turned into a token

    //Puts every member except the keyword tables back to its initial value.
    void ResetLexerState()
    {
        Input.clear();
        ConsumedInput.clear();
        Index = 0;
        Error.clear();
        IdentifierStr.clear();
        NumberVal = 0;
        StringLit.clear();
        LastChar = ' ';
    }

    unordered_map<string, Token> KeywordMap; //keyword spelling -> token
    unordered_map<Token, string> ReverseKeywordMap; //token -> "tok_<keyword>" (used by TokString)

    //Fills KeywordMap and ReverseKeywordMap with one entry per DEF_KEYWORD in "keywords.h".
    void SetupKeywordMap()
    {
#define DEF_KEYWORD(keyword) KeywordMap[#keyword] = tok_##keyword;
#include "keywords.h"
#undef DEF_KEYWORD
#define DEF_KEYWORD(keyword) ReverseKeywordMap[tok_##keyword] = "tok_" #keyword;
#include "keywords.h"
#undef DEF_KEYWORD
    }

    //Stores the message in Error and returns tok_error, so callers can write
    //"return ReportError(...)".
    Token ReportError(const String & error)
    {
        Error = error;
        return tok_error;
    }

    //Returns a printable description of tok. Tokens that carry a payload
    //(error, identifier, number, string literal) are described using the
    //current lexer state, so call this right after the GetToken() that
    //produced tok.
    String TokString(int tok)
    {
        switch (Token(tok))
        {
        case tok_eof:
            return "tok_eof";
        case tok_error:
            return StringUtils::sprintf("tok_error \"%s\"", Error.c_str());
        case tok_identifier:
            return StringUtils::sprintf("tok_identifier \"%s\"", IdentifierStr.c_str());
        case tok_number:
            return StringUtils::sprintf("tok_number %llu (0x%llX)", NumberVal, NumberVal);
        case tok_stringlit:
            return StringUtils::sprintf("tok_stringlit \"%s\"", StringUtils::Escape(StringLit).c_str());
        default:
        {
            auto found = ReverseKeywordMap.find(Token(tok));
            if (found != ReverseKeywordMap.end())
                return found->second;
            //plain character token: only 1..255 fits in a single char, anything
            //above that would silently wrap around to a different character
            if (tok > 0 && tok < 256)
            {
                String s;
                s = char(tok);
                return s;
            }
            return "<INVALID TOKEN>";
        }
        }
    }

    //Returns the unread character `distance` positions after Index without
    //consuming it, or EOF when that position is past the end of Input.
    int PeekChar(int distance = 0)
    {
        if (Index + distance >= Input.length())
            return EOF;
        //do not sign-extend: keeps the result comparable with ReadChar() and
        //prevents a 0xFF byte from being mistaken for EOF
        return uint8_t(Input[Index + distance]);
    }

    //Consumes and returns the next character (0..255), or EOF at the end of
    //Input. Every consumed character is also appended to ConsumedInput.
    int ReadChar()
    {
        if (Index >= Input.length())
            return EOF;
        ConsumedInput += Input[Index];
        return uint8_t(Input[Index++]); //do not sign-extend to support UTF-8
    }

    //Returns true and skips over `expected` when the unread input starts with
    //it; otherwise returns false and leaves Index untouched.
    //NOTE(review): the skipped characters are not appended to ConsumedInput and
    //LastChar is not updated - confirm that this is intended.
    bool CheckString(const string & expected)
    {
        for (size_t i = 0; i < expected.size(); i++)
        {
            auto ch = PeekChar(int(i));
            if (ch == EOF)
                return false;
            if (ch != uint8_t(expected[i]))
                return false;
        }
        Index += expected.size();
        return true;
    }

    //Reads the next character into LastChar and returns it.
    int NextChar()
    {
        return LastChar = ReadChar();
    }

    //Lexes and returns the next token: one of the Token values, or the plain
    //byte value of a character that does not start any other token. The payload
    //of the token is left in IdentifierStr / NumberVal / StringLit / Error.
    int GetToken()
    {
        //loop instead of recursing after a line comment, so that long runs of
        //comment lines cannot exhaust the stack
        while (true)
        {
            //skip whitespace
            while (isspace(LastChar))
                NextChar();

            //string literal
            if (LastChar == '\"')
            {
                StringLit.clear();
                while (true)
                {
                    NextChar();
                    if (LastChar == EOF) //end of file
                        return ReportError("unexpected end of file in string literal");
                    if (LastChar == '\"') //end of string literal
                    {
                        NextChar();
                        return tok_stringlit;
                    }
                    if (LastChar == '\\') //escape sequence
                    {
                        NextChar();
                        if (LastChar == EOF)
                            return ReportError("unexpected end of file in string literal");
                        //replace the escape character with the character it stands for
                        switch (LastChar)
                        {
                        case '\\':
                        case '\"':
                            break; //stands for itself
                        case 'a':
                            LastChar = '\a';
                            break;
                        case 'b':
                            LastChar = '\b';
                            break;
                        case 'f':
                            LastChar = '\f';
                            break;
                        case 'n':
                            LastChar = '\n';
                            break;
                        case 'r':
                            LastChar = '\r';
                            break;
                        case 't':
                            LastChar = '\t';
                            break;
                        case 'v':
                            LastChar = '\v';
                            break;
                        case '0':
                            LastChar = '\0';
                            break;
                        default:
                            return ReportError(StringUtils::sprintf("invalid escape sequence \"\\%c\" in string literal", LastChar));
                        }
                    }
                    StringLit += char(LastChar);
                }
            }

            //identifier/keyword
            if (isalpha(LastChar) || LastChar == '_') //[a-zA-Z_]
            {
                IdentifierStr = char(LastChar);
                NextChar();
                while (isalnum(LastChar) || LastChar == '_') //[0-9a-zA-Z_]
                {
                    IdentifierStr += char(LastChar);
                    NextChar();
                }

                //keywords
                auto found = KeywordMap.find(IdentifierStr);
                if (found != KeywordMap.end())
                    return found->second;

                return tok_identifier;
            }

            //hex numbers
            if (LastChar == '0' && PeekChar() == 'x') //0x
            {
                string NumStr;
                ReadChar(); //consume the 'x'

                while (isxdigit(NextChar())) //[0-9a-fA-F]*
                    NumStr += char(LastChar);

                if (NumStr.empty()) //check for error condition
                    return ReportError("no hex digits after \"0x\" prefix");

                if (sscanf_s(NumStr.c_str(), "%llX", &NumberVal) != 1)
                    return ReportError("sscanf_s failed on hexadecimal number");
                return tok_number;
            }

            //decimal numbers
            if (isdigit(LastChar)) //[0-9]
            {
                string NumStr;
                NumStr = char(LastChar);

                while (isdigit(NextChar())) //[0-9]*
                    NumStr += char(LastChar);

                if (sscanf_s(NumStr.c_str(), "%llu", &NumberVal) != 1)
                    return ReportError("sscanf_s failed on decimal number");
                return tok_number;
            }

            //comments
            if (LastChar == '/' && PeekChar() == '/') //line comment
            {
                do
                {
                    NextChar();
                } while (LastChar != EOF && LastChar != '\n');

                if (LastChar == '\n')
                    continue; //interpret the next line
            }
            else if (LastChar == '/' && PeekChar() == '*') //block comment
            {
                //TODO: implement this (until then the '/' is returned as a plain character token)
            }

            //end of file
            if (LastChar == EOF)
                return tok_eof;

            //unknown character
            auto ThisChar = LastChar;
            NextChar();
            return ThisChar;
        }
    }

    //Resets the lexer and loads the contents of `filename` into Input.
    //Returns the result of FileHelper::ReadAllText.
    bool ReadInputFile(const string & filename)
    {
        ResetLexerState();
        return FileHelper::ReadAllText(filename, Input);
    }

    //Lexes Input until tok_eof or tok_error and passes the TokString() of every
    //token (including the final one) to lexEnum.
    //Returns true when the input ended with tok_eof, false on tok_error.
    bool TestLex(function<void(const string & line)> lexEnum)
    {
        int tok;
        do
        {
            tok = GetToken();
            lexEnum(TokString(tok));
        } while (tok != tok_eof && tok != tok_error);
        return tok != tok_error; //the loop only stops on tok_eof or tok_error
    }
};
|
2016-06-05 06:00:36 +08:00
|
|
|
|
2016-06-05 07:01:28 +08:00
|
|
|
bool TestLexer(const string & filename)
|
2016-06-05 06:00:36 +08:00
|
|
|
{
|
2016-06-05 07:01:28 +08:00
|
|
|
Lexer lexer;
|
|
|
|
if (!lexer.ReadInputFile("tests\\" + filename))
|
|
|
|
{
|
|
|
|
printf("failed to read \"%s\"\n", filename.c_str());
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
string actual;
|
2016-06-05 08:41:40 +08:00
|
|
|
if(!lexer.TestLex([&](const string & line)
|
2016-06-05 07:01:28 +08:00
|
|
|
{
|
|
|
|
actual += line + "\n";
|
2016-06-05 08:41:40 +08:00
|
|
|
}))
|
|
|
|
{
|
|
|
|
printf("lex error in \"%s\": %s\n", filename.c_str(), lexer.Error.c_str());
|
|
|
|
return false;
|
|
|
|
}
|
2016-06-05 07:01:28 +08:00
|
|
|
actual = StringUtils::Trim(actual);
|
2016-06-05 08:41:40 +08:00
|
|
|
string expected;
|
|
|
|
if (!FileHelper::ReadAllText("tests\\expected\\" + filename + ".lextest", expected)) //don't fail tests that we didn't specify yet
|
|
|
|
return true;
|
|
|
|
StringUtils::ReplaceAll(expected, "\r\n", "\n");
|
|
|
|
expected = StringUtils::Trim(expected);
|
2016-06-05 07:01:28 +08:00
|
|
|
if (expected == actual)
|
|
|
|
{
|
|
|
|
printf("lexer test for \"%s\" success!\n", filename.c_str());
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
printf("lexer test for \"%s\" failed\n", filename.c_str());
|
|
|
|
FileHelper::WriteAllText("expected.out", expected);
|
|
|
|
FileHelper::WriteAllText("actual.out", actual);
|
|
|
|
return false;
|
2016-06-05 06:00:36 +08:00
|
|
|
}
|
|
|
|
|
2016-06-05 07:01:28 +08:00
|
|
|
void RunLexerTests()
|
2016-06-05 06:00:36 +08:00
|
|
|
{
|
2016-06-05 07:01:28 +08:00
|
|
|
for (auto file : testFiles)
|
|
|
|
TestLexer(file);
|
2016-06-05 06:00:36 +08:00
|
|
|
}
|
|
|
|
|
2016-06-05 07:01:28 +08:00
|
|
|
bool DebugLexer(const string & filename)
|
2016-06-05 06:00:36 +08:00
|
|
|
{
|
2016-06-05 07:01:28 +08:00
|
|
|
printf("Debugging \"%s\"\n", filename.c_str());
|
|
|
|
Lexer lexer;
|
|
|
|
if (!lexer.ReadInputFile("tests\\" + filename))
|
2016-06-05 06:00:36 +08:00
|
|
|
{
|
2016-06-05 07:01:28 +08:00
|
|
|
printf("failed to read \"%s\"\n", filename.c_str());
|
|
|
|
return false;
|
2016-06-05 06:00:36 +08:00
|
|
|
}
|
2016-06-05 07:01:28 +08:00
|
|
|
lexer.TestLex([](const string & line)
|
|
|
|
{
|
|
|
|
puts(line.c_str());
|
|
|
|
});
|
|
|
|
puts("");
|
|
|
|
return true;
|
2016-06-05 06:00:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int main()
|
|
|
|
{
|
2016-06-05 08:41:40 +08:00
|
|
|
DebugLexer(testFiles[19]);
|
2016-06-05 07:01:28 +08:00
|
|
|
RunLexerTests();
|
2016-06-05 06:00:36 +08:00
|
|
|
system("pause");
|
|
|
|
return 0;
|
|
|
|
}
|