Implement a hacky preprocessor

This commit is contained in:
Duncan Ogilvie 2023-02-05 02:16:27 +01:00
parent d5034cf6d6
commit fe6952a1e8
10 changed files with 512 additions and 22 deletions

1
.gitignore vendored
View File

@ -6,3 +6,4 @@ Debug/
My Amplifier XE Results */
actual.out
expected.out
.vs/

View File

@ -22,6 +22,7 @@
<ClCompile Include="lexer.cpp" />
<ClCompile Include="main.cpp" />
<ClCompile Include="parser.cpp" />
<ClCompile Include="preprocessor.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="ast.h" />
@ -29,6 +30,7 @@
<ClInclude Include="keywords.h" />
<ClInclude Include="lexer.h" />
<ClInclude Include="operators.h" />
<ClInclude Include="preprocessor.h" />
<ClInclude Include="testfiles.h" />
</ItemGroup>
<ItemGroup>
@ -37,31 +39,32 @@
<PropertyGroup Label="Globals">
<ProjectGuid>{B0411C78-2F06-49E0-8DE9-5C52A466F5DE}</ProjectGuid>
<RootNamespace>btparser</RootNamespace>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<PlatformToolset>v143</PlatformToolset>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<PlatformToolset>v143</PlatformToolset>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<PlatformToolset>v143</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<PlatformToolset>v143</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>

View File

@ -24,6 +24,9 @@
<ClCompile Include="parser.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="preprocessor.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="testfiles.h">
@ -44,6 +47,9 @@
<ClInclude Include="helpers.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="preprocessor.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="parser.h">

View File

@ -14,12 +14,6 @@ static void clearReserve(std::string & str, size_t reserve = DEFAULT_STRING_BUFF
str.reserve(reserve);
}
static void appendCh(std::string & str, char ch)
{
str.resize(str.size() + 1);
str[str.size() - 1] = ch;
}
static const char* convertNumber(const char* str, uint64_t & result, int radix)
{
errno = 0;
@ -64,6 +58,7 @@ bool Lexer::DoLexing(std::vector<TokenState> & tokens, std::string & error)
return false;
}
tokens.push_back(mState);
mState.Clear();
if(token == tok_eof)
break;
}
@ -88,11 +83,11 @@ bool Lexer::Test(const std::function<void(const std::string & line)> & lexEnum,
while(line < mState.CurLine)
{
line++;
sprintf_s(newlineText, "\n%d: ", line + 1);
sprintf_s(newlineText, "\n%zu: ", line + 1);
toks.append(newlineText);
}
toks.append(TokString(tok));
appendCh(toks, ' ');
toks.push_back(' ');
lexEnum(toks);
}
while(tok != tok_eof && tok != tok_error);
@ -250,7 +245,7 @@ Lexer::Token Lexer::getToken()
else
return reportError(StringUtils::sprintf("invalid escape sequence \"\\%c\" in string literal", mLastChar));
}
appendCh(mState.StringLit, mLastChar);
mState.StringLit.push_back(mLastChar);
}
}
@ -261,7 +256,7 @@ Lexer::Token Lexer::getToken()
nextChar();
while(isalnum(mLastChar) || mLastChar == '_') //[0-9a-zA-Z_]
{
appendCh(mState.IdentifierStr, mLastChar);
mState.IdentifierStr.push_back(mLastChar);
nextChar();
}
@ -279,8 +274,8 @@ Lexer::Token Lexer::getToken()
nextChar(); //consume the 'x'
mNumStr.clear();
while(isxdigit(nextChar())) //[0-9a-fA-F]*
appendCh(mNumStr, mLastChar);
while (isxdigit(nextChar())) //[0-9a-fA-F]*
mNumStr.push_back(mLastChar);
if(!mNumStr.length()) //check for error condition
return reportError("no hex digits after \"0x\" prefix");
@ -394,14 +389,11 @@ void Lexer::resetLexerState()
mError.clear();
mWarnings.clear();
clearReserve(mState.IdentifierStr);
mState.NumberVal = 0;
mIsHexNumberVal = false;
clearReserve(mState.StringLit);
clearReserve(mNumStr, 16);
mState.CharLit = '\0';
mLastChar = ' ';
mState.CurLine = 0;
mState.LineIndex = 0;
mState.Clear();
}
void Lexer::setupTokenMaps()

View File

@ -4,6 +4,7 @@
#include <vector>
#include <unordered_map>
#include <functional>
#include <string>
class Lexer
{
@ -50,6 +51,14 @@ public:
{
return Token >= tok_signed && Token <= tok_UINT32;
}
// Resets the per-token payload (character, number, string and identifier
// values) to its empty state. No other member is touched.
void Clear()
{
    CharLit = '\0';
    NumberVal = 0;
    StringLit.clear();
    IdentifierStr.clear();
}
};
explicit Lexer();

View File

@ -4,6 +4,7 @@
#include "lexer.h"
#include "parser.h"
#include "helpers.h"
#include "preprocessor.h"
bool TestLexer(Lexer & lexer, const std::string & filename)
{
@ -87,9 +88,27 @@ void DebugLexerTests(bool output = true)
bool DebugParser(const std::string & filename)
{
std::string data;
if (!FileHelper::ReadAllText("tests\\" + filename, data))
{
printf("Failed to read: %s\n", filename.c_str());
return false;
}
std::string pperror;
std::unordered_map<std::string, std::string> definitions;
definitions["WIN32"] = "";
definitions["_MSC_VER"] = "1337";
auto ppData = preprocess(data, pperror, definitions);
if (!pperror.empty())
{
printf("Preprocess error: %s\n", pperror.c_str());
return false;
}
Parser parser;
std::string error;
if(!parser.ParseFile("tests\\" + filename, error))
if(!parser.ParseString(ppData, error))
{
printf("ParseFile failed: %s\n", error.c_str());
return false;

View File

@ -14,7 +14,17 @@ bool Parser::ParseFile(const string & filename, string & error)
error = "failed to read input file";
return false;
}
if(!mLexer.DoLexing(mTokens, error))
if (!mLexer.DoLexing(mTokens, error))
return false;
CurToken = mTokens[0];
mBinaryTemplate = ParseBinaryTemplate();
return !!mBinaryTemplate;
}
bool Parser::ParseString(const std::string& source, std::string& error)
{
mLexer.SetInputData(source);
if (!mLexer.DoLexing(mTokens, error))
return false;
CurToken = mTokens[0];
mBinaryTemplate = ParseBinaryTemplate();

View File

@ -16,6 +16,7 @@ public:
explicit Parser();
bool ParseFile(const std::string & filename, std::string & error);
bool ParseString(const std::string& source, std::string& error);
private:
Lexer mLexer;

443
btparser/preprocessor.cpp Normal file
View File

@ -0,0 +1,443 @@
#include "preprocessor.h"
#include <vector>
#include <stdexcept>
#include <unordered_map>
// One logical line of preprocessor input together with its source position.
struct Line
{
    size_t number = 0;      // line number assigned by the producer (split_lines starts counting at 1)
    bool comment = false;   // only read by str(); never set by code visible in this file
    std::string text;       // the line's contents
    std::string eolcomment; // appended after text by str(); never set by code visible in this file

    // Renders the line as "line <number>[ (comment)]: <text><eolcomment>" for diagnostics.
    std::string str() const
    {
        std::string rendered = "line " + std::to_string(number);
        if (comment)
            rendered.append(" (comment)");
        rendered.append(": ");
        rendered.append(text);
        rendered.append(eolcomment);
        return rendered;
    }

    // Writes str() followed by a newline to stdout.
    void print() const
    {
        const std::string rendered = str();
        puts(rendered.c_str());
    }
};
// Minimal forward-only cursor over the text of a single Line, used to pick
// apart preprocessor directives. All failures are reported by throwing
// Tokenizer::exception.
struct Tokenizer
{
    // Thrown by Tokenizer::error(); what() is "<message> === <line.str()>".
    struct exception : public std::runtime_error
    {
        exception(const Line& line, const std::string& message = std::string())
            : std::runtime_error(message + " === " + line.str())
        {
        }
    };

    const Line& line;    // borrowed, not owned: the Line must outlive the Tokenizer
    size_t position = 0; // index of the next unread character in line.text

    Tokenizer(const Line& line)
        : line(line) { }

    // Returns the next character without consuming it, or EOF at the end of the line.
    int peek() const
    {
        if (position >= line.text.length())
            return EOF;
        // Go through unsigned char: where plain char is signed, a 0xFF byte would
        // otherwise widen to -1 and be indistinguishable from EOF.
        return static_cast<unsigned char>(line.text[position]);
    }

    // Returns the next character and advances past it; throws at the end of the line.
    char consume()
    {
        if (position >= line.text.length())
            error("cannot consume, end of line reached");
        return line.text[position++];
    }

    // Skips spaces and tabs. With required == true, throws when nothing was skipped.
    void skip_spaces(bool required = false)
    {
        auto oldPosition = position;
        while (true)
        {
            auto ch = peek();
            if (ch == ' ' || ch == '\t')
                consume();
            else
                break;
        }
        if (required && oldPosition == position)
            error("whitespace was expected, none found");
    }

    // Throws Tokenizer::exception with the message prefixed by "<line>:<column> " (column is 1-based).
    [[noreturn]] void error(const std::string& message)
    {
        throw exception(line, std::to_string(line.number) + ":" + std::to_string(position + 1) + " " + message);
    }

    // Consumes and returns an identifier matching [A-Za-z_][A-Za-z0-9_]*; throws when none is present.
    std::string identifier()
    {
        std::string name;
        while (true)
        {
            auto ch = peek();
            if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_')
            {
                name.push_back(consume());
            }
            else if (!name.empty() && (ch >= '0' && ch <= '9'))
            {
                // digits are only valid after the first character
                name.push_back(consume());
            }
            else
            {
                break;
            }
        }
        if (name.empty())
            error("expected identifier");
        return name;
    }

    // Consumes and returns everything up to the end of the line (possibly empty).
    std::string remainder()
    {
        std::string result;
        while (peek() != EOF)
            result.push_back(consume());
        return result;
    }

    // Consumes and returns everything before the next occurrence of `expected`.
    // The delimiter itself is left unconsumed; throws when the line ends first.
    std::string until(char expected)
    {
        // peek() yields values in the unsigned char range, so compare in that range too
        const int wanted = static_cast<unsigned char>(expected);
        std::string result;
        while (true)
        {
            auto ch = peek();
            if (ch == EOF)
                error("unexpected end of file");
            if (ch == wanted)
                break;
            result.push_back(consume());
        }
        return result;
    }
};
// Returns `input` with every /* ... */ comment removed.
//
// Both delimiters are dropped together with the comment body. Newlines inside
// a comment are kept so the line numbering of the remaining text is unchanged.
// An unterminated comment swallows the rest of the input (except newlines).
//
// Limitation: string/char literals and // comments are not recognised, so a
// "/*" appearing inside one of those is still treated as a comment start.
std::string remove_block_comments(const std::string& input)
{
    std::string result;
    result.reserve(input.length());
    bool inComment = false;
    for (size_t i = 0; i < input.length(); i++)
    {
        const bool hasNext = i + 1 < input.length();
        if (inComment)
        {
            if (input[i] == '*' && hasNext && input[i + 1] == '/')
            {
                inComment = false;
                i++; // swallow the '/' as well, otherwise "*/" leaks into the output
            }
            else if (input[i] == '\n')
            {
                result += '\n';
            }
        }
        else if (input[i] == '/' && hasNext && input[i + 1] == '*')
        {
            inComment = true;
            i++; // swallow the '*' so that "/*/" is not mistaken for a complete comment
        }
        else
        {
            result += input[i];
        }
    }
    return result;
}
// Returns `input` with every // comment removed (from the "//" up to, but not
// including, the end of its line). Carriage returns are dropped; newlines and
// all other text are preserved.
//
// A "//" is only treated as a comment start in ordinary code: inside a string
// literal, a character literal or a /* ... */ comment it is copied verbatim
// (e.g. "http://host" stays intact). An unterminated literal ends at the end
// of its line so a stray quote cannot affect following lines.
std::string remove_line_comments(const std::string& input)
{
    enum class State { Code, StringLiteral, CharLiteral, BlockComment, LineComment };

    // Drop '\r' first so the one-character look-ahead below sees exactly the
    // text the output is built from.
    std::string text;
    text.reserve(input.length());
    for (const char ch : input)
    {
        if (ch != '\r')
            text.push_back(ch);
    }

    std::string result;
    result.reserve(text.length());
    State state = State::Code;
    for (size_t i = 0; i < text.length(); i++)
    {
        const char ch = text[i];
        const bool hasNext = i + 1 < text.length();
        const char next = hasNext ? text[i + 1] : '\0';
        switch (state)
        {
        case State::Code:
            if (ch == '/' && hasNext && next == '/')
            {
                state = State::LineComment;
                i++; // skip the second '/'
                break;
            }
            result.push_back(ch);
            if (ch == '/' && hasNext && next == '*')
            {
                result.push_back(next);
                i++;
                state = State::BlockComment;
            }
            else if (ch == '"')
            {
                state = State::StringLiteral;
            }
            else if (ch == '\'')
            {
                state = State::CharLiteral;
            }
            break;

        case State::StringLiteral:
        case State::CharLiteral:
            result.push_back(ch);
            if (ch == '\\' && hasNext)
            {
                // escaped character: copy it without interpreting it as a terminator
                result.push_back(next);
                i++;
            }
            else if (ch == '\n' || ch == (state == State::StringLiteral ? '"' : '\''))
            {
                state = State::Code;
            }
            break;

        case State::BlockComment:
            result.push_back(ch);
            if (ch == '*' && hasNext && next == '/')
            {
                result.push_back(next);
                i++;
                state = State::Code;
            }
            break;

        case State::LineComment:
            if (ch == '\n')
            {
                result.push_back(ch);
                state = State::Code;
            }
            break;
        }
    }
    return result;
}
// TODO: support comments
std::vector<Line> split_lines(const std::string& input)
{
auto input_uncommented = remove_block_comments(input);
std::vector<Line> lines;
Line line;
size_t lineNumber = 1;
line.number = lineNumber;
for (auto ch : input)
{
if (ch == '\r')
continue;
if (ch == '\n')
{
lineNumber++;
if (!line.text.empty() && line.text.back() == '\\')
{
// continuation
line.text.back() = '\n';
}
else
{
lines.push_back(line);
line.number = lineNumber;
line.text.clear();
}
}
else
{
line.text.push_back(ch);
}
}
if (!line.text.empty())
{
lines.push_back(line);
line.text.clear();
}
for (auto& line : lines)
{
line.text = remove_line_comments(line.text);
}
return lines;
}
//Taken from: https://stackoverflow.com/a/24315631
// Replaces, in place, every non-overlapping occurrence of `from` in `s` with
// `to`, scanning left to right. Text inserted by a replacement is never
// rescanned. An empty `from` leaves `s` unchanged.
void ReplaceAll(std::string& s, const std::string& from, const std::string& to)
{
    // find("") matches at every position, so an empty pattern would never terminate
    if (from.empty())
        return;

    size_t start_pos = 0;
    while ((start_pos = s.find(from, start_pos)) != std::string::npos)
    {
        s.replace(start_pos, from.length(), to);
        start_pos += to.length(); // Handles case where 'from' is a substring of 'to'
    }
}
std::string preprocess(const std::string& input, std::string& error, const std::unordered_map<std::string, std::string>& definitions)
{
auto lines = split_lines(input);
std::vector<Line> final;
struct Scope
{
size_t lineIndex = 0;
std::string condition;
bool value = false;
};
std::vector<Scope> stack;
auto state = definitions;
auto emitting = [&stack]()
{
for (const auto& s : stack)
if (!s.value)
return false;
return true;
};
for (size_t i = 0; i < lines.size(); i++)
{
const auto& line = lines[i];
Tokenizer t(lines[i]);
t.skip_spaces();
if (t.peek() == '#')
{
t.consume();
t.skip_spaces();
auto directive = t.identifier();
line.print();
if (directive == "ifndef")
{
t.skip_spaces(true);
auto identifier = t.identifier();
printf("#ifndef(%s)\n", identifier.c_str());
stack.push_back({ i, "!defined(" + identifier + ")", state.count(identifier) == 0 });
printf("emitting: %d\n", emitting());
}
else if (directive == "ifdef")
{
t.skip_spaces(true);
auto identifier = t.identifier();
printf("#ifdef(%s)\n", identifier.c_str());
stack.push_back({ i, identifier, state.count(identifier) != 0 });
printf("emitting: %d\n", emitting());
}
else if (directive == "else")
{
if (stack.empty())
throw std::runtime_error("no matching #if for #else");
if (!stack.back().value)
{
stack.back().value = true;
}
printf("#else (%s)\n", stack.back().condition.c_str());
printf("emitting: %d\n", emitting());
}
else if (directive == "endif")
{
if (stack.empty())
throw std::runtime_error("no matching #if for #endif");
printf("#endif (%s)\n", stack.back().condition.c_str());
stack.pop_back();
printf("emitting: %d\n", emitting());
}
else if (directive == "define")
{
t.skip_spaces(true);
auto identifier = t.identifier();
if (t.peek() == '(')
{
t.consume();
t.skip_spaces();
std::vector<std::string> parameters;
while (true)
{
auto ch = t.peek();
if (ch == ')')
break;
if (ch == EOF)
throw std::runtime_error("expected ')', got EOF instead");
auto argument = t.identifier();
parameters.push_back(argument);
t.skip_spaces();
ch = t.peek();
if (ch == ')')
break;
else if (ch == ',')
{
t.consume();
t.skip_spaces();
}
else
throw std::runtime_error("expect ',' or ')' got something else (too lazy sry)");
}
t.consume();
t.skip_spaces();
auto token = t.remainder();
std::string pretty;
for (size_t i = 0; i < parameters.size(); i++)
{
if (i > 0)
pretty += ", ";
pretty += parameters[i];
}
printf("#define %s('%s' = '%s')\n", identifier.c_str(), pretty.c_str(), token.c_str());
}
else
{
t.skip_spaces();
auto token = t.remainder();
if (token.empty())
{
printf("#define(%s)\n", identifier.c_str());
}
else
{
printf("#define('%s' = '%s')\n", identifier.c_str(), token.c_str());
}
if (emitting())
{
state[identifier] = token;
}
}
}
else if (directive == "include")
{
t.skip_spaces();
auto type = t.peek();
if (type == '\"')
{
t.consume();
auto file = t.until('\"');
printf("#include \"%s\"\n", file.c_str());
}
else if (type == '<')
{
t.consume();
auto file = t.until('>');
printf("#include <%s>\n", file.c_str());
}
else
{
throw std::runtime_error("invalid syntax for #include");
}
}
else
{
printf("directive: '%s'\n", directive.c_str());
throw std::runtime_error("unknown directive '" + directive + "'");
}
}
else if (emitting())
{
final.push_back(line);
}
}
std::string result;
for (const auto& line : final)
{
result += line.text;
result += '\n';
}
// TODO: strip out comments
// TODO: somehow prevent replacing inside strings IsInsideString(position)
// TODO: recursively replace
// HACK: not proper
for (const auto& itr : state)
{
ReplaceAll(result, itr.first, itr.second);
}
return result;
}

6
btparser/preprocessor.h Normal file
View File

@ -0,0 +1,6 @@
#pragma once
#include <string>
#include <unordered_map>
// Preprocesses `input` and returns the resulting text.
//   input        source text to run through the preprocessor
//   error        output string; the caller in main.cpp treats a non-empty
//                value after the call as failure
//   definitions  initial macro table (name -> replacement text) in effect
//                before the first line of `input` is processed
std::string preprocess(const std::string& input, std::string& error, const std::unordered_map<std::string, std::string>& definitions);