From fe6952a1e8839bb2d97f8f349d97a0a369269d3c Mon Sep 17 00:00:00 2001
From: Duncan Ogilvie <mr.exodia.tpodt@gmail.com>
Date: Sun, 5 Feb 2023 02:16:27 +0100
Subject: [PATCH] Implement a hacky preprocessor

---
 .gitignore                        |   1 +
 btparser/btparser.vcxproj         |  11 +-
 btparser/btparser.vcxproj.filters |   6 +
 btparser/lexer.cpp                |  24 +-
 btparser/lexer.h                  |   9 +
 btparser/main.cpp                 |  21 +-
 btparser/parser.cpp               |  12 +-
 btparser/parser.h                 |   1 +
 btparser/preprocessor.cpp         | 443 ++++++++++++++++++++++++++++++
 btparser/preprocessor.h           |   6 +
 10 files changed, 512 insertions(+), 22 deletions(-)
 create mode 100644 btparser/preprocessor.cpp
 create mode 100644 btparser/preprocessor.h
diff --git a/.gitignore b/.gitignore
index ec8fcc7..7275071 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ Debug/
 My Amplifier XE Results */
 actual.out
 expected.out
+.vs/
diff --git a/btparser/btparser.vcxproj b/btparser/btparser.vcxproj
index 4811b58..4d9a011 100644
--- a/btparser/btparser.vcxproj
+++ b/btparser/btparser.vcxproj
@@ -22,6 +22,7 @@
     <ClCompile Include="lexer.cpp" />
     <ClCompile Include="main.cpp" />
     <ClCompile Include="parser.cpp" />
+    <ClCompile Include="preprocessor.cpp" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="ast.h" />
@@ -29,6 +30,7 @@
     <ClInclude Include="keywords.h" />
     <ClInclude Include="lexer.h" />
     <ClInclude Include="operators.h" />
+    <ClInclude Include="preprocessor.h" />
     <ClInclude Include="testfiles.h" />
   </ItemGroup>
   <ItemGroup>
@@ -37,31 +39,32 @@
   <PropertyGroup Label="Globals">
     <ProjectGuid>{B0411C78-2F06-49E0-8DE9-5C52A466F5DE}</ProjectGuid>
     <RootNamespace>btparser</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
     <CharacterSet>MultiByte</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
     <CharacterSet>MultiByte</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>MultiByte</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>MultiByte</CharacterSet>
   </PropertyGroup>
diff --git a/btparser/btparser.vcxproj.filters b/btparser/btparser.vcxproj.filters
index a7b5f10..b902403 100644
--- a/btparser/btparser.vcxproj.filters
+++ b/btparser/btparser.vcxproj.filters
@@ -24,6 +24,9 @@
     <ClCompile Include="parser.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="preprocessor.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="testfiles.h">
@@ -44,6 +47,9 @@
     <ClInclude Include="helpers.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="preprocessor.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <None Include="parser.h">
diff --git a/btparser/lexer.cpp b/btparser/lexer.cpp
index 360420e..2f06d74 100644
--- a/btparser/lexer.cpp
+++ b/btparser/lexer.cpp
@@ -14,12 +14,6 @@ static void clearReserve(std::string & str, size_t reserve = DEFAULT_STRING_BUFF
     str.reserve(reserve);
 }
 
-static void appendCh(std::string & str, char ch)
-{
-    str.resize(str.size() + 1);
-    str[str.size() - 1] = ch;
-}
-
 static const char* convertNumber(const char* str, uint64_t & result, int radix)
 {
     errno = 0;
@@ -64,6 +58,7 @@ bool Lexer::DoLexing(std::vector<TokenState> & tokens, std::string & error)
             return false;
         }
         tokens.push_back(mState);
+        mState.Clear();
         if(token == tok_eof)
             break;
     }
@@ -88,11 +83,11 @@ bool Lexer::Test(const std::function<void(const std::string & line)> & lexEnum,
         while(line < mState.CurLine)
         {
             line++;
-            sprintf_s(newlineText, "\n%d: ", line + 1);
+            sprintf_s(newlineText, "\n%zu: ", line + 1);
             toks.append(newlineText);
         }
         toks.append(TokString(tok));
-        appendCh(toks, ' ');
+        toks.push_back(' ');
         lexEnum(toks);
     }
     while(tok != tok_eof && tok != tok_error);
@@ -250,7 +245,7 @@ Lexer::Token Lexer::getToken()
                 else
                     return reportError(StringUtils::sprintf("invalid escape sequence \"\\%c\" in string literal", mLastChar));
             }
-            appendCh(mState.StringLit, mLastChar);
+            mState.StringLit.push_back(mLastChar);
         }
     }
 
@@ -261,7 +256,7 @@ Lexer::Token Lexer::getToken()
         nextChar();
         while(isalnum(mLastChar) || mLastChar == '_') //[0-9a-zA-Z_]
         {
-            appendCh(mState.IdentifierStr, mLastChar);
+            mState.IdentifierStr.push_back(mLastChar);
             nextChar();
         }
 
@@ -279,8 +274,8 @@ Lexer::Token Lexer::getToken()
         nextChar(); //consume the 'x'
         mNumStr.clear();
 
-        while(isxdigit(nextChar())) //[0-9a-fA-F]*
-            appendCh(mNumStr, mLastChar);
+        while (isxdigit(nextChar())) //[0-9a-fA-F]*
+            mNumStr.push_back(mLastChar);
 
         if(!mNumStr.length()) //check for error condition
             return reportError("no hex digits after \"0x\" prefix");
@@ -394,14 +389,11 @@ void Lexer::resetLexerState()
     mError.clear();
     mWarnings.clear();
     clearReserve(mState.IdentifierStr);
-    mState.NumberVal = 0;
     mIsHexNumberVal = false;
     clearReserve(mState.StringLit);
     clearReserve(mNumStr, 16);
-    mState.CharLit = '\0';
     mLastChar = ' ';
-    mState.CurLine = 0;
-    mState.LineIndex = 0;
+    mState.Clear();
 }
 
 void Lexer::setupTokenMaps()
diff --git a/btparser/lexer.h b/btparser/lexer.h
index 975622c..6ef4098 100644
--- a/btparser/lexer.h
+++ b/btparser/lexer.h
@@ -4,6 +4,7 @@
 #include <vector>
 #include <unordered_map>
 #include <functional>
+#include <string>
 
 class Lexer
 {
@@ -50,6 +51,14 @@ public:
         {
             return Token >= tok_signed && Token <= tok_UINT32;
         }
+
+        void Clear() // resets the token payload (identifier, number, string and char literal) only
+        {
+            NumberVal = 0;
+            CharLit = '\0';
+            IdentifierStr.clear();
+            StringLit.clear();
+        }
     };
 
     explicit Lexer();
diff --git a/btparser/main.cpp b/btparser/main.cpp
index 9891b81..091fc9e 100644
--- a/btparser/main.cpp
+++ b/btparser/main.cpp
@@ -4,6 +4,7 @@
 #include "lexer.h"
 #include "parser.h"
 #include "helpers.h"
+#include "preprocessor.h"
 
 bool TestLexer(Lexer & lexer, const std::string & filename)
 {
@@ -87,9 +88,27 @@ void DebugLexerTests(bool output = true)
 
 bool DebugParser(const std::string & filename)
 {
+    std::string data;
+    if (!FileHelper::ReadAllText("tests\\" + filename, data))
+    {
+        printf("Failed to read: %s\n", filename.c_str());
+        return false;
+    }
+
+    std::string pperror;
+    std::unordered_map<std::string, std::string> definitions;
+    definitions["WIN32"] = "";
+    definitions["_MSC_VER"] = "1337";
+    auto ppData = preprocess(data, pperror, definitions);
+    if (!pperror.empty())
+    {
+        printf("Preprocess error: %s\n", pperror.c_str());
+        return false;
+    }
+
     Parser parser;
     std::string error;
-    if(!parser.ParseFile("tests\\" + filename, error))
+    if(!parser.ParseString(ppData, error))
     {
         printf("ParseFile failed: %s\n", error.c_str());
         return false;
diff --git a/btparser/parser.cpp b/btparser/parser.cpp
index 1068006..08b786a 100644
--- a/btparser/parser.cpp
+++ b/btparser/parser.cpp
@@ -14,7 +14,17 @@ bool Parser::ParseFile(const string & filename, string & error)
         error = "failed to read input file";
         return false;
     }
-    if(!mLexer.DoLexing(mTokens, error))
+    if (!mLexer.DoLexing(mTokens, error))
+        return false;
+    CurToken = mTokens[0];
+    mBinaryTemplate = ParseBinaryTemplate();
+    return !!mBinaryTemplate;
+}
+
+bool Parser::ParseString(const std::string& source, std::string& error)
+{
+    mLexer.SetInputData(source);
+    if (!mLexer.DoLexing(mTokens, error))
         return false;
     CurToken = mTokens[0];
     mBinaryTemplate = ParseBinaryTemplate();
diff --git a/btparser/parser.h b/btparser/parser.h
index 288084c..8f07883 100644
--- a/btparser/parser.h
+++ b/btparser/parser.h
@@ -16,6 +16,7 @@ public:
 
     explicit Parser();
     bool ParseFile(const std::string & filename, std::string & error);
+    bool ParseString(const std::string& source, std::string& error);
 
 private:
     Lexer mLexer;
diff --git a/btparser/preprocessor.cpp b/btparser/preprocessor.cpp
new file mode 100644
index 0000000..bbfc99d
--- /dev/null
+++ b/btparser/preprocessor.cpp
@@ -0,0 +1,443 @@
+#include "preprocessor.h"
+
+#include <vector>
+#include <stdexcept>
+#include <unordered_map>
+
+// One logical line of preprocessor input together with the pieces used to describe it.
+struct Line
+{
+	size_t number = 0;      // source line number (split_lines starts counting at 1)
+	bool comment = false;   // adds a " (comment)" marker to str()
+	std::string text;       // contents of the line
+	std::string eolcomment; // appended directly after text by str()
+
+	// Formats the line as "line <number>[ (comment)]: <text><eolcomment>".
+	std::string str() const
+	{
+		std::string formatted = "line " + std::to_string(number);
+		if (comment)
+			formatted.append(" (comment)");
+		formatted.append(": ").append(text).append(eolcomment);
+		return formatted;
+	}
+
+	// Writes str() followed by a newline to stdout.
+	void print() const
+	{
+		const std::string formatted = str();
+		puts(formatted.c_str());
+	}
+};
+
+// Minimal character scanner over the text of one Line. Only a reference to the Line is stored,
+// so the Line must outlive the Tokenizer.
+// Failures are reported by throwing Tokenizer::exception.
+struct Tokenizer
+{
+	// Thrown by error(); what() reads "<message> === <line.str()>".
+	struct exception : public std::runtime_error
+	{
+		exception(const Line& line, const std::string& message = std::string())
+			: std::runtime_error(message + " === " + line.str()) { }
+	};
+
+	const Line& line;    // the line being scanned
+	size_t position = 0; // index into line.text of the next unread character
+
+	Tokenizer(const Line& line)
+		: line(line) { }
+
+	// Returns the next character without consuming it, or EOF once the text is exhausted.
+	int peek() const
+	{
+		if (position >= line.text.length())
+			return EOF;
+		// Convert through unsigned char: where plain char is signed, a 0xFF byte would otherwise
+		// be returned as -1 and be mistaken for EOF by every caller.
+		return static_cast<unsigned char>(line.text[position]);
+	}
+
+	// Returns the next character and advances past it; throws when the text is exhausted.
+	char consume()
+	{
+		if (position >= line.text.length())
+			error("cannot consume");
+		return line.text[position++];
+	}
+
+	// Skips spaces and tabs; when required is set, throws if there was nothing to skip.
+	void skip_spaces(bool required = false)
+	{
+		const auto oldPosition = position;
+		while (true)
+		{
+			const auto ch = peek();
+			if (ch != ' ' && ch != '\t')
+				break;
+			consume();
+		}
+		if (required && oldPosition == position)
+			error("whitespace was expected, none found");
+	}
+
+	// Throws Tokenizer::exception; the message is prefixed with "<line number>:<1-based column> ".
+	[[noreturn]] void error(const std::string& message)
+	{
+		throw exception(line, std::to_string(line.number) + ":" + std::to_string(position + 1) + " " + message);
+	}
+
+	// Reads [A-Za-z_][A-Za-z0-9_]* and returns it; throws if no identifier starts here.
+	std::string identifier()
+	{
+		std::string name;
+		while (true)
+		{
+			const auto ch = peek();
+			const bool isAlpha = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_';
+			const bool isDigit = ch >= '0' && ch <= '9';
+			// digits are accepted everywhere except as the first character
+			if (!isAlpha && !(isDigit && !name.empty()))
+				break;
+			name.push_back(consume());
+		}
+		if (name.empty())
+			error("expected identifier");
+		return name;
+	}
+
+	// Returns everything from the current position to the end of the text (possibly empty).
+	std::string remainder()
+	{
+		std::string result;
+		while (peek() != EOF)
+			result.push_back(consume());
+		return result;
+	}
+
+	// Reads up to, but not including, the next occurrence of expected, which is left unread.
+	// Throws if the text ends before expected is found.
+	std::string until(char expected)
+	{
+		std::string result;
+		while (true)
+		{
+			const auto ch = peek();
+			if (ch == EOF)
+				error("unexpected end of file");
+			if (ch == static_cast<unsigned char>(expected)) // same value range as peek()
+				break;
+			result.push_back(consume());
+		}
+		return result;
+	}
+};
+
+// Strips every /* ... */ comment, delimiters included (string literals are not recognized).
+std::string remove_block_comments(const std::string& input)
+{
+	std::string result;
+	bool inComment = false;
+	for (size_t i = 0; i < input.length(); i++)
+	{
+		// The delimiter to look for depends on the state: "/*" opens, "*/" closes. A delimiter is
+		// consumed as a whole (extra i++), so "*/" never leaks out and "/*/" is not a full comment.
+		const char first = inComment ? '*' : '/';
+		const char second = inComment ? '/' : '*';
+		if (i + 1 < input.length() && input[i] == first && input[i + 1] == second)
+		{
+			inComment = !inComment;
+			i++;
+		}
+		else if (!inComment)
+			result += input[i];
+	}
+	return result;
+}
+
+// Removes "//" comments from every line of input, drops '\r' characters and keeps '\n' line breaks.
+// String literals are not recognized, so a "//" inside a string also truncates the line.
+std::string remove_line_comments(const std::string& input)
+{
+	std::string result;
+	std::string line;
+
+	// Appends the pending line, cut at its first "//", to the result.
+	auto flushLine = [&result, &line]()
+	{
+		auto commentIdx = line.find("//");
+		if (commentIdx != std::string::npos)
+		{
+			line.resize(commentIdx);
+		}
+		result += line;
+		line.clear();
+	};
+
+	for (auto ch : input)
+	{
+		if (ch == '\r')
+		{
+			continue;
+		}
+
+		if (ch == '\n')
+		{
+			flushLine();
+			result += '\n';
+		}
+		else
+		{
+			line.push_back(ch);
+		}
+	}
+
+	flushLine(); // trailing text without a final newline (no-op when nothing is pending)
+	return result;
+}
+
+// Splits input into logical lines: block comments removed, '\'-continued lines joined, "//" comments cut.
+std::vector<Line> split_lines(const std::string& input)
+{
+	const auto input_uncommented = remove_block_comments(input);
+	std::vector<Line> lines;
+	Line line;
+	// NOTE: numbers are counted after block-comment removal, which also drops newlines inside those comments.
+	size_t lineNumber = 1;
+	line.number = lineNumber;
+	for (auto ch : input_uncommented)
+	{
+		if (ch == '\r')
+			continue;
+
+		if (ch == '\n')
+		{
+			lineNumber++;
+			if (!line.text.empty() && line.text.back() == '\\')
+			{
+				// continuation: keep the break inside the text and stay on the same logical line
+				line.text.back() = '\n';
+			}
+			else
+			{
+				lines.push_back(line);
+				line.number = lineNumber;
+				line.text.clear();
+			}
+		}
+		else
+		{
+			line.text.push_back(ch);
+		}
+	}
+
+	// trailing text that was not terminated by a newline
+	if (!line.text.empty())
+	{
+		lines.push_back(line);
+	}
+
+	for (auto& entry : lines)
+	{
+		entry.text = remove_line_comments(entry.text);
+	}
+
+	return lines;
+}
+
+//Taken from: https://stackoverflow.com/a/24315631 -- replaces every occurrence of from in s with to, in place.
+void ReplaceAll(std::string& s, const std::string& from, const std::string& to)
+{
+	if (from.empty()) // an empty needle matches at every position and the scan would never terminate
+		return;
+	for (size_t pos = s.find(from); pos != std::string::npos; pos = s.find(from, pos + to.length()))
+	{
+		s.replace(pos, from.length(), to); // the search resumes after the inserted text, so to may contain from
+	}
+}
+
+// Minimal preprocessing pass: evaluates #ifdef/#ifndef/#else/#endif, records object-like #define
+// (function-like #define and #include are parsed and logged only) and returns the surviving lines
+// with every defined name textually replaced. definitions seeds the macro table. On failure the
+// message is stored in error and an empty string is returned; on success error is left empty.
+std::string preprocess(const std::string& input, std::string& error, const std::unordered_map<std::string, std::string>& definitions)
+{
+	error.clear();
+	const auto lines = split_lines(input);
+	std::vector<Line> final;
+	struct Scope
+	{
+		size_t lineIndex = 0;
+		std::string condition;
+		bool value = false;
+	};
+	std::vector<Scope> stack;
+	auto state = definitions;
+	// a line is emitted only while every enclosing conditional is active
+	auto emitting = [&stack]()
+	{
+		for (const auto& s : stack)
+			if (!s.value)
+				return false;
+		return true;
+	};
+	// Tokenizer errors and the checks below throw; report them through the error out-parameter.
+	try
+	{
+		for (size_t i = 0; i < lines.size(); i++)
+		{
+			const auto& line = lines[i];
+			Tokenizer t(line);
+			t.skip_spaces();
+
+			if (t.peek() != '#')
+			{
+				if (emitting()) // ordinary text survives only inside active regions
+					final.push_back(line);
+				continue;
+			}
+
+			t.consume();
+			t.skip_spaces();
+			auto directive = t.identifier();
+			line.print();
+
+			if (directive == "ifndef")
+			{
+				t.skip_spaces(true);
+				auto identifier = t.identifier();
+				printf("#ifndef(%s)\n", identifier.c_str());
+				stack.push_back({ i, "!defined(" + identifier + ")", state.count(identifier) == 0 });
+				printf("emitting: %d\n", emitting());
+			}
+			else if (directive == "ifdef")
+			{
+				t.skip_spaces(true);
+				auto identifier = t.identifier();
+				printf("#ifdef(%s)\n", identifier.c_str());
+				stack.push_back({ i, identifier, state.count(identifier) != 0 });
+				printf("emitting: %d\n", emitting());
+			}
+			else if (directive == "else")
+			{
+				if (stack.empty())
+					throw std::runtime_error("no matching #if for #else");
+				// #else selects the opposite branch, so an active block must turn inactive too
+				stack.back().value = !stack.back().value;
+				printf("#else (%s)\n", stack.back().condition.c_str());
+				printf("emitting: %d\n", emitting());
+			}
+			else if (directive == "endif")
+			{
+				if (stack.empty())
+					throw std::runtime_error("no matching #if for #endif");
+				printf("#endif (%s)\n", stack.back().condition.c_str());
+				stack.pop_back();
+				printf("emitting: %d\n", emitting());
+			}
+			else if (directive == "define")
+			{
+				t.skip_spaces(true);
+				auto identifier = t.identifier();
+				if (t.peek() == '(')
+				{
+					// function-like macro: parsed and logged, but not stored or expanded
+					t.consume();
+					t.skip_spaces();
+					std::vector<std::string> parameters;
+					while (true)
+					{
+						auto ch = t.peek();
+						if (ch == ')')
+							break;
+						if (ch == EOF)
+							throw std::runtime_error("expected ')', got EOF instead");
+						auto argument = t.identifier();
+						parameters.push_back(argument);
+						t.skip_spaces();
+						ch = t.peek();
+						if (ch == ')')
+							break;
+						else if (ch == ',')
+						{
+							t.consume();
+							t.skip_spaces();
+						}
+						else
+							throw std::runtime_error("expect ',' or ')' got something else (too lazy sry)");
+					}
+					t.consume();
+					t.skip_spaces();
+					auto token = t.remainder();
+					std::string pretty;
+					for (size_t p = 0; p < parameters.size(); p++)
+					{
+						if (p > 0)
+							pretty += ", ";
+						pretty += parameters[p];
+					}
+
+					printf("#define %s('%s' = '%s')\n", identifier.c_str(), pretty.c_str(), token.c_str());
+				}
+				else
+				{
+					t.skip_spaces();
+					auto token = t.remainder();
+					if (token.empty())
+						printf("#define(%s)\n", identifier.c_str());
+					else
+						printf("#define('%s' = '%s')\n", identifier.c_str(), token.c_str());
+					// definitions inside an inactive region must not take effect
+					if (emitting())
+						state[identifier] = token;
+				}
+			}
+			else if (directive == "include")
+			{
+				t.skip_spaces();
+				auto type = t.peek();
+				if (type == '\"')
+				{
+					t.consume();
+					auto file = t.until('\"');
+					printf("#include \"%s\"\n", file.c_str());
+				}
+				else if (type == '<')
+				{
+					t.consume();
+					auto file = t.until('>');
+					printf("#include <%s>\n", file.c_str());
+				}
+				else
+					throw std::runtime_error("invalid syntax for #include");
+			}
+			else
+			{
+				printf("directive: '%s'\n", directive.c_str());
+				throw std::runtime_error("unknown directive '" + directive + "'");
+			}
+		}
+	}
+	catch (const std::exception& x)
+	{
+		error = x.what();
+		return std::string();
+	}
+
+	std::string result;
+	for (const auto& line : final)
+	{
+		result += line.text;
+		result += '\n';
+	}
+
+	// TODO: strip out comments
+	// TODO: somehow prevent replacing inside strings IsInsideString(position)
+	// TODO: recursively replace
+	// HACK: not proper
+	for (const auto& itr : state)
+		if (!itr.first.empty()) // an empty name would match at every position
+			ReplaceAll(result, itr.first, itr.second);
+
+	return result;
+}
diff --git a/btparser/preprocessor.h b/btparser/preprocessor.h
new file mode 100644
index 0000000..0d17f94
--- /dev/null
+++ b/btparser/preprocessor.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include <string>
+#include <unordered_map>
+// Preprocesses input (definitions seeds the macro table) and returns the resulting text. NOTE(review): confirm how failures are reported through error.
+std::string preprocess(const std::string& input, std::string& error, const std::unordered_map<std::string, std::string>& definitions);
\ No newline at end of file