support for more keywords

mrexodia 2016-06-05 01:01:28 +02:00
parent 7c18c0238f
commit 306b819db3
97 changed files with 390 additions and 155 deletions

View File

@@ -16,7 +16,11 @@
     <ClCompile Include="stringutils.cpp" />
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="dynamicmem.h" />
     <ClInclude Include="filehelper.h" />
+    <ClInclude Include="handle.h" />
+    <ClInclude Include="stringutils.h" />
+    <ClInclude Include="testfiles.h" />
   </ItemGroup>
   <PropertyGroup Label="Globals">
     <ProjectGuid>{B0411C78-2F06-49E0-8DE9-5C52A466F5DE}</ProjectGuid>

View File

@@ -29,5 +29,17 @@
     <ClInclude Include="filehelper.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="testfiles.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="dynamicmem.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="handle.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="stringutils.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
 </Project>

View File

@@ -3,11 +3,20 @@
 #include <string>
 #include <stdint.h>
 #include <unordered_map>
+#include <functional>
 #include "filehelper.h"
 #include "stringutils.h"
+#include "testfiles.h"
 
 using namespace std;
 
+struct Lexer
+{
+    explicit Lexer()
+    {
+        SetupKeywordMap();
+    }
+
 string Input;
 string ConsumedInput;
 size_t Index = 0;
@@ -31,17 +40,25 @@ enum Token
     tok_unsigned, //"unsigned"
     tok_int, //"int"
     tok_sizeof, //"sizeof"
+    tok_BYTE, //"BYTE"
     tok_WORD, //"WORD"
     tok_DWORD, //"DWORD"
+    tok_ushort, //"ushort"
+    tok_uint, //"uint"
+    tok_byte, //"byte"
+    tok_double, //"double"
+    tok_string, //"string"
+    tok_return, //"return"
+    tok_enum, //"enum"
 
     //others
-    tok_identifier, //[a-zA-Z][a-zA-Z0-9]
+    tok_identifier, //[a-zA-Z_][a-zA-Z0-9_]
     tok_number //(0x[0-9a-fA-F]+)|([0-9]+)
 };
 
 unordered_map<string, Token> KeywordMap;
 
-void setup()
+void SetupKeywordMap()
 {
     KeywordMap["typedef"] = tok_typedef;
     KeywordMap["struct"] = tok_struct;
@@ -49,8 +66,16 @@ void setup()
     KeywordMap["unsigned"] = tok_unsigned;
     KeywordMap["int"] = tok_int;
     KeywordMap["sizeof"] = tok_sizeof;
+    KeywordMap["BYTE"] = tok_BYTE;
     KeywordMap["WORD"] = tok_WORD;
     KeywordMap["DWORD"] = tok_DWORD;
+    KeywordMap["byte"] = tok_byte;
+    KeywordMap["ushort"] = tok_ushort;
+    KeywordMap["uint"] = tok_uint;
+    KeywordMap["double"] = tok_double;
+    KeywordMap["string"] = tok_string;
+    KeywordMap["return"] = tok_return;
+    KeywordMap["enum"] = tok_enum;
 }
 
 Token ReportError(const String & error)
@@ -59,7 +84,7 @@ Token ReportError(const String & error)
     return tok_error;
 }
 
-String tokString(int tok)
+String TokString(int tok)
 {
     switch (Token(tok))
     {
@@ -83,7 +108,14 @@ String tokString(int tok)
     }
 }
 
-int readChar()
+int PeekChar(int distance = 0)
+{
+    if (Index + distance >= Input.length())
+        return EOF;
+    return Input[Index + distance];
+}
+
+int ReadChar()
 {
     if (Index == Input.length())
         return EOF;
@@ -91,18 +123,22 @@ int readChar()
     return uint8_t(Input[Index++]); //do not sign-extend to support UTF-8
 }
 
-int getToken()
+int GetToken()
 {
     //skip whitespace
     while (isspace(LastChar))
-        LastChar = readChar();
+        LastChar = ReadChar();
 
     //identifier/keyword
-    if (isalpha(LastChar)) //[a-zA-Z]
+    if (isalpha(LastChar) || LastChar == '_') //[a-zA-Z_]
     {
         IdentifierStr = LastChar;
-        while (isalnum(LastChar = readChar())) //[0-9a-zA-Z]
+        LastChar = ReadChar();
+        while (isalnum(LastChar) || LastChar == '_') //[0-9a-zA-Z_]
+        {
             IdentifierStr += LastChar;
+            LastChar = ReadChar();
+        }
 
         //keywords
         auto found = KeywordMap.find(IdentifierStr);
@@ -112,18 +148,13 @@ int getToken()
         return tok_identifier;
     }
 
-    //(hex) numbers
-    if (isdigit(LastChar)) //[0-9]
+    //hex numbers
+    if (LastChar == '0' && PeekChar() == 'x') //0x
     {
         string NumStr;
-        NumStr = LastChar;
-        LastChar = readChar(); //this might not be a digit
-        //hexadecimal numbers
-        if (NumStr[0] == '0' && LastChar == 'x') //0x
-        {
-            NumStr = "";
-            while (isxdigit(LastChar = readChar())) //[0-9a-fA-F]*
-                NumStr += LastChar;
+        ReadChar(); //consume the 'x'
+        while (isxdigit(LastChar = ReadChar())) //[0-9a-fA-F]*
+            NumStr += LastChar;
 
         if (!NumStr.length()) //check for error condition
@@ -133,13 +164,13 @@ int getToken()
             return ReportError("sscanf_s failed on hexadecimal number");
         return tok_number;
     }
 
-    //decimal numbers
-    while (isdigit(LastChar)) //[0-9]*
+    if (isdigit(LastChar)) //[0-9]
     {
+        string NumStr;
+        NumStr = LastChar;
+        while (isdigit(LastChar = ReadChar())) //[0-9]*
             NumStr += LastChar;
-        LastChar = readChar();
-    }
 
         if (sscanf_s(NumStr.c_str(), "%llu", &NumberVal) != 1)
             return ReportError("sscanf_s failed on decimal number");
@@ -147,23 +178,19 @@ int getToken()
     }
 
     //comments
-    if (LastChar == '/')
-    {
-        LastChar = readChar();
-        //line comment
-        if (LastChar == '/')
+    if (LastChar == '/' && PeekChar() == '/') //line comment
     {
         do
         {
-            LastChar = readChar();
+            LastChar = ReadChar();
         } while (LastChar != EOF && LastChar != '\n');
 
         if (LastChar == '\n')
-            return getToken(); //interpret the next line
+            return GetToken(); //interpret the next line
     }
-    else
-        return ReportError("invalid comment");
+    else if (LastChar == '/' && PeekChar() == '*') //block comment
+    {
+        //TODO: implement this
     }
 
     //end of file
@@ -172,39 +199,83 @@ int getToken()
     //unknown character
     auto ThisChar = LastChar;
-    LastChar = readChar();
+    LastChar = ReadChar();
     return ThisChar;
 }
 
-bool ReadInputFile(const char* filename)
+bool ReadInputFile(const string & filename)
 {
     return FileHelper::ReadAllText(filename, Input);
 }
 
-void testLex()
+void TestLex(function<void(const string & line)> lexEnum)
 {
     int tok;
     do
     {
-        tok = getToken();
-        puts(tokString(tok).c_str());
+        tok = GetToken();
+        lexEnum(TokString(tok));
     } while (tok != tok_eof && tok != tok_error);
 }
+};
 
-void test()
+bool TestLexer(const string & filename)
 {
-    if (!ReadInputFile("test.bt"))
+    Lexer lexer;
+    if (!lexer.ReadInputFile("tests\\" + filename))
     {
-        puts("failed to read input file");
-        return;
+        printf("failed to read \"%s\"\n", filename.c_str());
+        return false;
     }
-    setup();
-    testLex();
+    string expected;
+    if (!FileHelper::ReadAllText(filename + ".lextest", expected)) //don't fail tests that we didn't specify yet
+        return true;
+    StringUtils::ReplaceAll(expected, "\r\n", "\n");
+    expected = StringUtils::Trim(expected);
+    string actual;
+    lexer.TestLex([&](const string & line)
+    {
+        actual += line + "\n";
+    });
+    actual = StringUtils::Trim(actual);
+    if (expected == actual)
+    {
+        printf("lexer test for \"%s\" success!\n", filename.c_str());
+        return true;
+    }
+    printf("lexer test for \"%s\" failed\n", filename.c_str());
+    FileHelper::WriteAllText("expected.out", expected);
+    FileHelper::WriteAllText("actual.out", actual);
+    return false;
+}
+
+void RunLexerTests()
+{
+    for (auto file : testFiles)
+        TestLexer(file);
+}
+
+bool DebugLexer(const string & filename)
+{
+    printf("Debugging \"%s\"\n", filename.c_str());
+    Lexer lexer;
+    if (!lexer.ReadInputFile("tests\\" + filename))
+    {
+        printf("failed to read \"%s\"\n", filename.c_str());
+        return false;
+    }
+    lexer.TestLex([](const string & line)
+    {
+        puts(line.c_str());
+    });
+    puts("");
+    return true;
+}
 
 int main()
 {
-    test();
+    DebugLexer(testFiles[1]);
+    RunLexerTests();
     system("pause");
     return 0;
 }
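
Editor's note: taken together, the lexer changes above come down to two small techniques: a hash map from keyword spellings to token ids, consulted after scanning a maximal identifier (which now may contain underscores), and a PeekChar lookahead so that "0x", "//" and "/*" can be recognized before any input is consumed. Below is a minimal standalone sketch of just the identifier/keyword path; the names MiniToken and ScanIdentifier are illustrative and do not appear in the commit.

#include <cctype>
#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_map>

using namespace std;

//illustrative token ids; the real commit uses the Token enum above
enum MiniToken { mini_identifier, mini_keyword };

//scan a maximal [a-zA-Z_][a-zA-Z0-9_]* run starting at i, then consult the keyword map
MiniToken ScanIdentifier(const string & input, size_t & i, string & ident)
{
    static const unordered_map<string, MiniToken> keywords =
    {
        { "struct", mini_keyword }, { "enum", mini_keyword }, { "return", mini_keyword },
    };
    ident = input[i++];
    while (i < input.length() && (isalnum(uint8_t(input[i])) || input[i] == '_'))
        ident += input[i++];
    return keywords.count(ident) ? mini_keyword : mini_identifier;
}

int main()
{
    string input = "struct _my_var2 return";
    size_t i = 0;
    while (i < input.length())
    {
        if (isalpha(uint8_t(input[i])) || input[i] == '_')
        {
            string ident;
            auto tok = ScanIdentifier(input, i, ident);
            printf("%s \"%s\"\n", tok == mini_keyword ? "keyword" : "identifier", ident.c_str());
        }
        else
            i++; //skip whitespace (and everything else) in this sketch
    }
    return 0;
}

Running this prints keyword "struct", identifier "_my_var2", keyword "return", mirroring how GetToken falls back to tok_identifier when KeywordMap.find misses.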

cparser/testfiles.h (new file, +95)
View File

@@ -0,0 +1,95 @@
static const char* testFiles[] =
{
"test.bt",
"CDATemplate.bt",
"NetflowVersion5.bt",
"SHXTemplate.bt",
"WinhexPosTemplate.bt",
"Mifare1kTemplate.bt",
"PALTemplate.bt",
"GocleverTemplate.bt",
"OGGTemplate.bt",
"STLTemplate.bt",
"SinclairMicrodriveImage.bt",
"RDBTemplate.bt",
"DBFTemplate.bt",
"Mifare4kTemplate.bt",
"GPTTemplate.bt",
"SSPTemplate.bt",
"SHPTemplate.bt",
"SRecTemplate.bt",
"FLVTemplate.bt",
"LUKSTemplate.bt",
"PCXTemplate.bt",
"UTMPTemplate.bt",
"ElTorito.bt",
"DMPTemplate.bt",
"OscarItemTemplate.bt",
"EOTTemplate.bt",
"ISOTemplate.bt",
"CLASSTemplate2.bt",
"EVSBTemplate.bt",
"BMPTemplate.bt",
"TGATemplate.bt",
"TOCTemplate.bt",
"CABTemplate.bt",
"RIFFTemplate.bt",
"AndroidManifestTemplate.bt",
"InspectorWithMP4DateTime.bt",
"FAT16Template.bt",
"PNGTemplate.bt",
"ICOTemplate.bt",
"RegistryPolicyFileTemplate.bt",
"VHDTemplate.bt",
"ISOBMFTemplate.bt",
"PCAPTemplate.bt",
"AVITemplate.bt",
"ZIPTemplate.bt",
"CRXTemplate.bt",
"MIDITemplate.bt",
"GZipTemplate.bt",
"GIFTemplate.bt",
"InspectorDates.bt",
"WAVTemplate.bt",
"RegistryHive.bt",
"EMFTemplate.bt",
"ROMFS.bt",
"OrCad3.20a_SCH.bt",
"MP4Template.bt",
"CLASSTemplate.bt",
"WMFTemplate.bt",
"LNKTemplate.bt",
"OrCAD3.20a_LIB.bt",
"PSFTemplate.bt",
"RARTemplate.bt",
"PYCTemplate.bt",
"EXETemplate.bt",
"PNG12Template.bt",
"TacxTemplate.bt",
"MFTRecord.bt",
"MP3Template.bt",
"MBRTemplate.bt",
"WAVTemplateAdv.bt",
"PDFTemplate.bt",
"EXETemplate2.bt",
"RESTemplate.bt",
"ZIPTemplateAdv.bt",
"SF2Template.bt",
"MOBITemplate.bt",
"MBRTemplateFAT.bt",
"exFATTemplate.bt",
"ELFTemplate.new.bt",
"ELFTemplate.bt",
"MachOTemplate.bt",
"PETemplate.bt",
"EDIDTemplate.bt",
"GeoTIFTemplate.bt",
"CLASSTemplate3.bt",
"CAPTemplate.bt",
"TIFTemplate.bt",
"TTFTemplate.bt",
"JPGTemplate.bt",
"DEXTemplate.bt",
"DEXTemplate.new.bt",
"SWFTemplate.bt",
};

View File

@@ -0,0 +1,53 @@
tok_struct
tok_identifier "DBZ"
{
tok_struct
tok_identifier "HEADER"
{
tok_char
tok_identifier "magic"
[
tok_number 4 (0x4)
]
;
tok_unsigned
tok_int
tok_identifier "size"
;
tok_unsigned
tok_int
tok_identifier "dataStart"
;
tok_unsigned
tok_int
tok_identifier "numEntries"
;
}
tok_identifier "header"
;
tok_char
tok_identifier "empty"
[
tok_identifier "header"
.
tok_identifier "size"
-
tok_sizeof
(
tok_identifier "HEADER"
)
]
;
tok_unsigned
tok_int
tok_identifier "entryOffsets"
[
tok_identifier "header"
.
tok_identifier "numEntries"
]
;
}
tok_identifier "dbz"
;
tok_eof
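
Editor's note: the token dump above is the new .lextest golden file, presumably for test.bt (the first entry in testFiles); TestLexer compares it, trimmed and newline-normalized, against the lexer's actual output. For reference, here is a C-style 010 Editor template that would lex to exactly this token stream. This is a hypothetical reconstruction, since the .bt input itself is not shown in this diff.

struct DBZ
{
    struct HEADER
    {
        char magic[4];
        unsigned int size;
        unsigned int dataStart;
        unsigned int numEntries;
    } header;
    char empty[header.size - sizeof(HEADER)];
    unsigned int entryOffsets[header.numEntries];
} dbz;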