lots of improvements to the lexer

Commit 527c35b671 (parent 306b819db3)
mrexodia, 2016-06-05 02:41:40 +02:00
6 changed files with 211 additions and 64 deletions

.gitignore
@@ -3,3 +3,6 @@
 *.suo
 Release/
 Debug/
+My Amplifier XE Results */
+actual.out
+expected.out


@@ -19,6 +19,7 @@
   <ClInclude Include="dynamicmem.h" />
   <ClInclude Include="filehelper.h" />
   <ClInclude Include="handle.h" />
+  <ClInclude Include="keywords.h" />
   <ClInclude Include="stringutils.h" />
   <ClInclude Include="testfiles.h" />
 </ItemGroup>


@@ -41,5 +41,8 @@
   <ClInclude Include="stringutils.h">
     <Filter>Header Files</Filter>
   </ClInclude>
+  <ClInclude Include="keywords.h">
+    <Filter>Header Files</Filter>
+  </ClInclude>
 </ItemGroup>
 </Project>

cparser/keywords.h (new file, 75 lines)
#ifndef DEF_KEYWORD
// Don't create errors if someone uses this by mistake
#define DEF_KEYWORD(x)
#endif
DEF_KEYWORD(if)
DEF_KEYWORD(do)
DEF_KEYWORD(for)
DEF_KEYWORD(else)
DEF_KEYWORD(while)
DEF_KEYWORD(case)
DEF_KEYWORD(switch)
DEF_KEYWORD(default)
DEF_KEYWORD(break)
DEF_KEYWORD(return)
DEF_KEYWORD(continue)
DEF_KEYWORD(enum)
DEF_KEYWORD(struct)
DEF_KEYWORD(typedef)
DEF_KEYWORD(sizeof)
DEF_KEYWORD(void)
DEF_KEYWORD(unsigned)
DEF_KEYWORD(local)
DEF_KEYWORD(bool)
DEF_KEYWORD(true)
DEF_KEYWORD(false)
DEF_KEYWORD(char)
DEF_KEYWORD(uchar)
DEF_KEYWORD(wchar_t)
DEF_KEYWORD(byte)
DEF_KEYWORD(ubyte)
DEF_KEYWORD(short)
DEF_KEYWORD(ushort)
DEF_KEYWORD(int)
DEF_KEYWORD(uint)
DEF_KEYWORD(long)
DEF_KEYWORD(ulong)
DEF_KEYWORD(int8)
DEF_KEYWORD(uint8)
DEF_KEYWORD(int16)
DEF_KEYWORD(uint16)
DEF_KEYWORD(int32)
DEF_KEYWORD(uint32)
DEF_KEYWORD(int64)
DEF_KEYWORD(uint64)
DEF_KEYWORD(BOOL)
DEF_KEYWORD(CHAR)
DEF_KEYWORD(BYTE)
DEF_KEYWORD(WORD)
DEF_KEYWORD(DWORD)
DEF_KEYWORD(QWORD)
DEF_KEYWORD(double)
DEF_KEYWORD(string)
DEF_KEYWORD(time_t)
DEF_KEYWORD(quad)
DEF_KEYWORD(DOSDATE)
DEF_KEYWORD(DOSTIME)
DEF_KEYWORD(FILETIME)
DEF_KEYWORD(OLETIME)
DEF_KEYWORD(UQUAD)
DEF_KEYWORD(LONGLONG)
DEF_KEYWORD(ULONG_PTR)
DEF_KEYWORD(VQUAD)
DEF_KEYWORD(UINT32)


@@ -17,16 +17,6 @@ struct Lexer
         SetupKeywordMap();
     }
-    string Input;
-    string ConsumedInput;
-    size_t Index = 0;
-    string Error;
-    string IdentifierStr = "";
-    uint64_t NumberVal = 0;
-    int LastChar = ' ';
     enum Token
     {
         //status tokens
@@ -34,48 +24,51 @@ struct Lexer
         tok_error,
         //keywords
-        tok_typedef, //"typedef"
-        tok_struct, //"struct"
-        tok_char, //"char"
-        tok_unsigned, //"unsigned"
-        tok_int, //"int"
-        tok_sizeof, //"sizeof"
-        tok_BYTE, //"BYTE"
-        tok_WORD, //"WORD"
-        tok_DWORD, //"DWORD"
-        tok_ushort, //"ushort"
-        tok_uint, //"uint"
-        tok_byte, //"byte"
-        tok_double, //"double"
-        tok_string, //"string"
-        tok_return, //"return"
-        tok_enum, //"enum"
+#define DEF_KEYWORD(keyword) tok_##keyword,
+#include "keywords.h"
+#undef DEF_KEYWORD
         //others
         tok_identifier, //[a-zA-Z_][a-zA-Z0-9_]
-        tok_number //(0x[0-9a-fA-F]+)|([0-9]+)
+        tok_number, //(0x[0-9a-fA-F]+)|([0-9]+)
+        tok_stringlit //"([^\\"\r\n]|\\[\\"abfnrtv])*"
     };
+    string Input;
+    string ConsumedInput;
+    size_t Index = 0;
+    string Error;
+    //lexer state
+    string IdentifierStr;
+    uint64_t NumberVal = 0;
+    string StringLit;
+    int LastChar = ' ';
+    void ResetLexerState()
+    {
+        Input.clear();
+        ConsumedInput.clear();
+        Index = 0;
+        Error.clear();
+        IdentifierStr.clear();
+        NumberVal = 0;
+        StringLit.clear();
+        LastChar = ' ';
+    }
     unordered_map<string, Token> KeywordMap;
+    unordered_map<Token, string> ReverseKeywordMap;
     void SetupKeywordMap()
     {
-        KeywordMap["typedef"] = tok_typedef;
-        KeywordMap["struct"] = tok_struct;
-        KeywordMap["char"] = tok_char;
-        KeywordMap["unsigned"] = tok_unsigned;
-        KeywordMap["int"] = tok_int;
-        KeywordMap["sizeof"] = tok_sizeof;
-        KeywordMap["BYTE"] = tok_BYTE;
-        KeywordMap["WORD"] = tok_WORD;
-        KeywordMap["DWORD"] = tok_DWORD;
-        KeywordMap["byte"] = tok_byte;
-        KeywordMap["ushort"] = tok_ushort;
-        KeywordMap["uint"] = tok_uint;
-        KeywordMap["double"] = tok_double;
-        KeywordMap["string"] = tok_string;
-        KeywordMap["return"] = tok_return;
-        KeywordMap["enum"] = tok_enum;
+#define DEF_KEYWORD(keyword) KeywordMap[#keyword] = tok_##keyword;
+#include "keywords.h"
+#undef DEF_KEYWORD
+#define DEF_KEYWORD(keyword) ReverseKeywordMap[tok_##keyword] = "tok_" #keyword;
+#include "keywords.h"
+#undef DEF_KEYWORD
     }
     Token ReportError(const String & error)
@@ -92,12 +85,12 @@ struct Lexer
         case tok_error: return StringUtils::sprintf("tok_error \"%s\"", Error.c_str());
         case tok_identifier: return StringUtils::sprintf("tok_identifier \"%s\"", IdentifierStr.c_str());
         case tok_number: return StringUtils::sprintf("tok_number %llu (0x%llX)", NumberVal, NumberVal);
+        case tok_stringlit: return StringUtils::sprintf("tok_stringlit \"%s\"", StringUtils::Escape(StringLit).c_str());
         default:
-            for (const auto & itr : KeywordMap)
-            {
-                if (tok == itr.second)
-                    return "tok_" + itr.first;
-            }
+        {
+            auto found = ReverseKeywordMap.find(Token(tok));
+            if (found != ReverseKeywordMap.end())
+                return found->second;
             if (tok > 0 && tok < 265)
             {
                 String s;
@@ -106,6 +99,7 @@ struct Lexer
             }
             return "<INVALID TOKEN>";
         }
+        }
     }
     int PeekChar(int distance = 0)
@@ -123,21 +117,84 @@ struct Lexer
         return uint8_t(Input[Index++]); //do not sign-extend to support UTF-8
     }
+    bool CheckString(const string & expected)
+    {
+        for (size_t i = 0; i < expected.size(); i++)
+        {
+            auto ch = PeekChar(i);
+            if (ch == EOF)
+                return false;
+            if (ch != uint8_t(expected[i]))
+                return false;
+        }
+        Index += expected.size();
+        return true;
+    }
+    int NextChar()
+    {
+        return LastChar = ReadChar();
+    }
     int GetToken()
     {
         //skip whitespace
         while (isspace(LastChar))
-            LastChar = ReadChar();
+            NextChar();
+        //string literal
+        if (LastChar == '\"')
+        {
+            StringLit.clear();
+            while (true)
+            {
+                NextChar();
+                if (LastChar == EOF) //end of file
+                    return ReportError("unexpected end of file in string literal");
+                if (LastChar == '\"') //end of string literal
+                {
+                    NextChar();
+                    return tok_stringlit;
+                }
+                if (LastChar == '\\') //escape sequence
+                {
+                    NextChar();
+                    if (LastChar == EOF)
+                        return ReportError("unexpected end of file in string literal");
+                    if (LastChar == '\\' || LastChar == '\"')
+                        LastChar = LastChar;
+                    else if (LastChar == 'a')
+                        LastChar = '\a';
+                    else if (LastChar == 'b')
+                        LastChar = '\b';
+                    else if (LastChar == 'f')
+                        LastChar = '\f';
+                    else if (LastChar == 'n')
+                        LastChar = '\n';
+                    else if (LastChar == 'r')
+                        LastChar = '\r';
+                    else if (LastChar == 't')
+                        LastChar = '\t';
+                    else if (LastChar == 'v')
+                        LastChar = '\v';
+                    else if (LastChar == '0')
+                        LastChar = '\0';
+                    else
+                        return ReportError(StringUtils::sprintf("invalid escape sequence \"\\%c\" in string literal", LastChar));
+                }
+                StringLit += LastChar;
+            }
+        }
         //identifier/keyword
         if (isalpha(LastChar) || LastChar == '_') //[a-zA-Z_]
         {
             IdentifierStr = LastChar;
-            LastChar = ReadChar();
+            NextChar();
             while (isalnum(LastChar) || LastChar == '_') //[0-9a-zA-Z_]
             {
                 IdentifierStr += LastChar;
-                LastChar = ReadChar();
+                NextChar();
             }
             //keywords
@@ -154,7 +211,7 @@ struct Lexer
             string NumStr;
             ReadChar(); //consume the 'x'
-            while (isxdigit(LastChar = ReadChar())) //[0-9a-fA-F]*
+            while (isxdigit(NextChar())) //[0-9a-fA-F]*
                 NumStr += LastChar;
             if (!NumStr.length()) //check for error condition
@@ -169,7 +226,7 @@ struct Lexer
             string NumStr;
             NumStr = LastChar;
-            while (isdigit(LastChar = ReadChar())) //[0-9]*
+            while (isdigit(NextChar())) //[0-9]*
                 NumStr += LastChar;
             if (sscanf_s(NumStr.c_str(), "%llu", &NumberVal) != 1)
@@ -182,7 +239,7 @@ struct Lexer
         {
             do
             {
-                LastChar = ReadChar();
+                NextChar();
             } while (LastChar != EOF && LastChar != '\n');
             if (LastChar == '\n')
@@ -199,16 +256,17 @@ struct Lexer
         //unknown character
         auto ThisChar = LastChar;
-        LastChar = ReadChar();
+        NextChar();
         return ThisChar;
     }
     bool ReadInputFile(const string & filename)
     {
+        ResetLexerState();
         return FileHelper::ReadAllText(filename, Input);
     }
-    void TestLex(function<void(const string & line)> lexEnum)
+    bool TestLex(function<void(const string & line)> lexEnum)
     {
         int tok;
         do
@@ -216,6 +274,9 @@ struct Lexer
             tok = GetToken();
             lexEnum(TokString(tok));
         } while (tok != tok_eof && tok != tok_error);
+        if (tok != tok_error && tok != tok_eof)
+            tok = ReportError("lexer did not finish at the end of the file");
+        return tok != tok_error;
     }
 };
@@ -227,17 +288,21 @@ bool TestLexer(const string & filename)
         printf("failed to read \"%s\"\n", filename.c_str());
         return false;
     }
+    string actual;
+    if(!lexer.TestLex([&](const string & line)
+    {
+        actual += line + "\n";
+    }))
+    {
+        printf("lex error in \"%s\": %s\n", filename.c_str(), lexer.Error.c_str());
+        return false;
+    }
+    actual = StringUtils::Trim(actual);
     string expected;
-    if (!FileHelper::ReadAllText(filename + ".lextest", expected)) //don't fail tests that we didn't specify yet
+    if (!FileHelper::ReadAllText("tests\\expected\\" + filename + ".lextest", expected)) //don't fail tests that we didn't specify yet
         return true;
     StringUtils::ReplaceAll(expected, "\r\n", "\n");
     expected = StringUtils::Trim(expected);
-    string actual;
-    lexer.TestLex([&](const string & line)
-    {
-        actual += line + "\n";
-    });
-    actual = StringUtils::Trim(actual);
     if (expected == actual)
     {
         printf("lexer test for \"%s\" success!\n", filename.c_str());
@@ -274,7 +339,7 @@ bool DebugLexer(const string & filename)
 int main()
 {
-    DebugLexer(testFiles[1]);
+    DebugLexer(testFiles[19]);
     RunLexerTests();
     system("pause");
     return 0;