x64dbg/src/dbg/stringutils.cpp

#include "stringutils.h"
#include <windows.h>
#include <cstdint>

static inline bool convertLongLongNumber(const char* str, unsigned long long & result, int radix)
{
    errno = 0;
    char* end;
    result = strtoull(str, &end, radix);
    if(!result && end == str)
        return false;
    if(result == ULLONG_MAX && errno)
        return false;
    if(*end)
        return false;
    return true;
}

static inline bool convertNumber(const char* str, size_t & result, int radix)
{
    unsigned long long llr;
    if(!convertLongLongNumber(str, llr, radix))
        return false;
    result = size_t(llr);
    return true;
}

void StringUtils::Split(const String & s, char delim, std::vector<String> & elems)
{
    elems.clear();
    String item;
    item.reserve(s.length());
    for(size_t i = 0; i < s.length(); i++)
    {
        if(s[i] == delim)
        {
            if(!item.empty())
                elems.push_back(item);
            item.clear();
        }
        else
            item.push_back(s[i]);
    }
    if(!item.empty())
        elems.push_back(std::move(item));
}

StringList StringUtils::Split(const String & s, char delim)
{
    std::vector<String> elems;
    Split(s, delim, elems);
    return elems;
}

//https://github.com/lefticus/presentations/blob/master/PracticalPerformancePractices.md#smaller-code-is-faster-code-11
String StringUtils::Escape(unsigned char ch, bool escapeSafe)
{
    char buf[8] = "";
    switch(ch)
    {
    case '\0':
        return "\\0";
    case '\t':
        return escapeSafe ? "\\t" : "\t";
    case '\f':
        return "\\f";
    case '\v':
        return "\\v";
    case '\n':
        return escapeSafe ? "\\n" : "\n";
    case '\r':
        return escapeSafe ? "\\r" : "\r";
    case '\\':
        return escapeSafe ? "\\\\" : "\\";
    case '\"':
        return escapeSafe ? "\\\"" : "\"";
    case '\a':
        return "\\a";
    case '\b':
        return "\\b";
    default:
        if(!isprint(ch)) //unknown unprintable character
            sprintf_s(buf, "\\x%02X", ch);
        else
            *buf = ch;
        return buf;
    }
}

static int IsValidUTF8Char(const char* data, int size)
{
    if(*(unsigned char*)data >= 0xF8) //5 or 6 bytes
        return 0;
    else if(*(unsigned char*)data >= 0xF0) //4 bytes
    {
        if(size < 4)
            return 0;
        for(int i = 1; i <= 3; i++)
        {
            if((*(unsigned char*)(data + i) & 0xC0) != 0x80)
                return 0;
        }
        return 4;
    }
    else if(*(unsigned char*)data >= 0xE0) //3 bytes
    {
        if(size < 3)
            return 0;
        for(int i = 1; i <= 2; i++)
        {
            if((*(unsigned char*)(data + i) & 0xC0) != 0x80)
                return 0;
        }
        return 3;
    }
    else if(*(unsigned char*)data >= 0xC0) //2 bytes
    {
        if(size < 2)
            return 0;
        if((*(unsigned char*)(data + 1) & 0xC0) != 0x80)
            return 0;
        return 2;
    }
    else if(*(unsigned char*)data >= 0x80) // BAD
        return 0;
    else
        return 1;
}

String StringUtils::Escape(const String & s, bool escapeSafe)
{
    std::string escaped;
    escaped.reserve(s.length() + s.length() / 2);
    for(size_t i = 0; i < s.length(); i++)
    {
        char buf[8];
        memset(buf, 0, sizeof(buf));
        unsigned char ch = (unsigned char)s[i];
        switch(ch)
        {
        case '\0':
            memcpy(buf, "\\0", 2);
            break;
        case '\t':
            if(escapeSafe)
                memcpy(buf, "\\t", 2);
            else
                memcpy(buf, &ch, 1);
            break;
        case '\f':
            memcpy(buf, "\\f", 2);
            break;
        case '\v':
            memcpy(buf, "\\v", 2);
            break;
        case '\n':
            if(escapeSafe)
                memcpy(buf, "\\n", 2);
            else
                memcpy(buf, &ch, 1);
            break;
        case '\r':
            if(escapeSafe)
                memcpy(buf, "\\r", 2);
            else
                memcpy(buf, &ch, 1);
            break;
        case '\\':
            if(escapeSafe)
                memcpy(buf, "\\\\", 2);
            else
                memcpy(buf, &ch, 1);
            break;
        case '\"':
            if(escapeSafe)
                memcpy(buf, "\\\"", 2);
            else
                memcpy(buf, &ch, 1);
            break;
        default:
            int UTF8CharSize;
            if(ch >= 0x80 && (UTF8CharSize = IsValidUTF8Char(s.c_str() + i, int(s.length() - i))) != 0) //UTF-8 Character is emitted directly
            {
                memcpy(buf, s.c_str() + i, UTF8CharSize);
                i += UTF8CharSize - 1;
            }
            else if(!isprint(ch)) //unknown unprintable character
                sprintf_s(buf, "\\x%02X", ch);
            else
                *buf = ch;
        }
        escaped.append(buf);
    }
    return escaped;
}

bool StringUtils::Unescape(const String & s, String & result, bool quoted)
{
    int mLastChar = EOF;
    size_t i = 0;
    auto nextChar = [&]()
    {
        if(i == s.length())
            return mLastChar = EOF;
        return mLastChar = s[i++];
    };
    if(quoted)
    {
        nextChar();
        if(mLastChar != '\"') //start of quoted string literal
            return false; //invalid string literal
    }
    result.reserve(s.length());
    while(true)
    {
        nextChar();
        if(mLastChar == EOF) //end of file
        {
            if(!quoted)
                break;
            return false; //unexpected end of file in string literal (1)
        }
        if(mLastChar == '\r' || mLastChar == '\n')
            return false; //unexpected newline in string literal (1)
        if(quoted && mLastChar == '\"') //end of quoted string literal
            break;
        if(mLastChar == '\\') //escape sequence
        {
            nextChar();
            if(mLastChar == EOF)
                return false; //unexpected end of file in string literal (2)
            if(mLastChar == '\r' || mLastChar == '\n')
                return false; //unexpected newline in string literal (2)
            if(mLastChar == '\'' || mLastChar == '\"' || mLastChar == '?' || mLastChar == '\\')
                mLastChar = mLastChar;
            else if(mLastChar == 'a')
                mLastChar = '\a';
            else if(mLastChar == 'b')
                mLastChar = '\b';
            else if(mLastChar == 'f')
                mLastChar = '\f';
            else if(mLastChar == 'n')
                mLastChar = '\n';
            else if(mLastChar == 'r')
                mLastChar = '\r';
            else if(mLastChar == 't')
                mLastChar = '\t';
            else if(mLastChar == 'v')
                mLastChar = '\v';
            else if(mLastChar == '0')
                mLastChar = '\0';
            else if(mLastChar == 'x') //\xHH
            {
                auto ch1 = nextChar();
                auto ch2 = nextChar();
                if(isxdigit(ch1) && isxdigit(ch2))
                {
                    char byteStr[3] = "";
                    byteStr[0] = ch1;
                    byteStr[1] = ch2;
                    uint64_t hexData;
                    auto error = convertLongLongNumber(byteStr, hexData, 16);
                    if(error)
                        return false; //convertNumber failed (%s) for hex sequence \"\\x%c%c\" in string literal
                    mLastChar = hexData & 0xFF;
                }
                else
                    return false; //invalid hex sequence \"\\x%c%c\" in string literal
            }
            else
                return false; //invalid escape sequence \"\\%c\" in string literal
        }
        result.push_back(mLastChar);
    }
    return true;
}

//Trim functions taken from: https://stackoverflow.com/questions/216823/whats-the-best-way-to-trim-stdstring/16743707#16743707
const String StringUtils::WHITESPACE = " \n\r\t";

String StringUtils::Trim(const String & s, const String & delim)
{
    return TrimRight(TrimLeft(s, delim), delim);
}

String StringUtils::TrimLeft(const String & s, const String & delim)
{
    size_t startpos = s.find_first_not_of(delim);
    return (startpos == String::npos) ? "" : s.substr(startpos);
}

String StringUtils::TrimRight(const String & s, const String & delim)
{
    size_t endpos = s.find_last_not_of(delim);
    return (endpos == String::npos) ? "" : s.substr(0, endpos + 1);
}

String StringUtils::PadLeft(const String & s, size_t minLength, char ch)
{
    if(s.length() >= minLength)
        return s;
    String pad;
    pad.resize(minLength - s.length());
    for(size_t i = 0; i < pad.length(); i++)
        pad[i] = ch;
    return pad + s;
}

//Conversion functions taken from: http://www.nubaria.com/en/blog/?p=289
String StringUtils::Utf16ToUtf8(const WString & wstr)
{
    return Utf16ToUtf8(wstr.c_str());
}

String StringUtils::Utf16ToUtf8(const wchar_t* wstr)
{
    String convertedString;
    if(!wstr || !*wstr)
        return convertedString;
    auto requiredSize = WideCharToMultiByte(CP_UTF8, 0, wstr, -1, nullptr, 0, nullptr, nullptr);
    if(requiredSize > 0)
    {
        convertedString.resize(requiredSize - 1);
        if(!WideCharToMultiByte(CP_UTF8, 0, wstr, -1, (char*)convertedString.c_str(), requiredSize, nullptr, nullptr))
            convertedString.clear();
    }
    return convertedString;
}

WString StringUtils::Utf8ToUtf16(const String & str)
{
    return Utf8ToUtf16(str.c_str());
}

WString StringUtils::Utf8ToUtf16(const char* str)
{
    WString convertedString;
    if(!str || !*str)
        return convertedString;
    int requiredSize = MultiByteToWideChar(CP_UTF8, 0, str, -1, nullptr, 0);
    if(requiredSize > 0)
    {
        convertedString.resize(requiredSize - 1);
        if(!MultiByteToWideChar(CP_UTF8, 0, str, -1, (wchar_t*)convertedString.c_str(), requiredSize))
            convertedString.clear();
    }
    return convertedString;
}

String StringUtils::LocalCpToUtf8(const String & str)
{
    return LocalCpToUtf8(str.c_str());
}

String StringUtils::LocalCpToUtf8(const char* str)
{
    return Utf16ToUtf8(LocalCpToUtf16(str).c_str());
}

WString StringUtils::LocalCpToUtf16(const String & str)
{
    return LocalCpToUtf16(str.c_str());
}

WString StringUtils::LocalCpToUtf16(const char* str)
{
    WString convertedString;
    if(!str || !*str)
        return convertedString;
    int requiredSize = MultiByteToWideChar(CP_ACP, 0, str, -1, nullptr, 0);
    if(requiredSize > 0)
    {
        convertedString.resize(requiredSize - 1);
        if(!MultiByteToWideChar(CP_ACP, 0, str, -1, (wchar_t*)convertedString.c_str(), requiredSize))
            convertedString.clear();
    }
    return convertedString;
}

String StringUtils::Utf16ToLocalCp(const WString & str)
{
    String convertedString;
    if(str.size() == 0)
        return convertedString;
    int requiredSize = WideCharToMultiByte(CP_ACP, 0, str.c_str(), -1, nullptr, 0, nullptr, nullptr);
    if(requiredSize > 0)
    {
        convertedString.resize(requiredSize - 1);
        if(!WideCharToMultiByte(CP_ACP, 0, str.c_str(), -1, (char*)convertedString.c_str(), requiredSize, nullptr, nullptr))
            convertedString.clear();
    }
    return convertedString;
}

//Taken from: https://stackoverflow.com/a/24315631
void StringUtils::ReplaceAll(String & s, const String & from, const String & to)
{
    size_t start_pos = 0;
    while((start_pos = s.find(from, start_pos)) != std::string::npos)
    {
        s.replace(start_pos, from.length(), to);
        start_pos += to.length(); // Handles case where 'to' is a substring of 'from'
    }
}

void StringUtils::ReplaceAll(WString & s, const WString & from, const WString & to)
{
    size_t start_pos = 0;
    while((start_pos = s.find(from, start_pos)) != std::string::npos)
    {
        s.replace(start_pos, from.length(), to);
        start_pos += to.length(); // Handles case where 'to' is a substring of 'from'
    }
}

String StringUtils::vsprintf(_In_z_ _Printf_format_string_ const char* format, va_list args)
{
    char sbuffer[64] = "";
    if(_vsnprintf_s(sbuffer, _TRUNCATE, format, args) != -1)
        return sbuffer;

    std::vector<char> buffer(256, '\0');
    while(true)
    {
        int res = _vsnprintf_s(buffer.data(), buffer.size(), _TRUNCATE, format, args);
        if(res == -1)
        {
            buffer.resize(buffer.size() * 2);
            continue;
        }
        else
            break;
    }
    return String(buffer.data());
}

String StringUtils::sprintf(_In_z_ _Printf_format_string_ const char* format, ...)
{
    va_list args;
    va_start(args, format);
    auto result = vsprintf(format, args);
    va_end(args);
    return result;
}

WString StringUtils::vsprintf(_In_z_ _Printf_format_string_ const wchar_t* format, va_list args)
{
    wchar_t sbuffer[64] = L"";
    if(_vsnwprintf_s(sbuffer, _TRUNCATE, format, args) != -1)
        return sbuffer;

    std::vector<wchar_t> buffer(256, L'\0');
    while(true)
    {
        int res = _vsnwprintf_s(buffer.data(), buffer.size(), _TRUNCATE, format, args);
        if(res == -1)
        {
            buffer.resize(buffer.size() * 2);
            continue;
        }
        else
            break;
    }
    return WString(buffer.data());
}

WString StringUtils::sprintf(_In_z_ _Printf_format_string_ const wchar_t* format, ...)
{
    va_list args;
    va_start(args, format);
    auto result = vsprintf(format, args);
    va_end(args);
    return result;
}

String StringUtils::ToLower(const String & s)
{
    auto result = s;
    for(size_t i = 0; i < result.size(); i++)
        result[i] = tolower(result[i]);
    return result;
}

bool StringUtils::StartsWith(const String & str, const String & prefix)
{
    return str.size() >= prefix.size() && 0 == str.compare(0, prefix.size(), prefix);
}

bool StringUtils::EndsWith(const String & str, const String & suffix)
{
    return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
}

static int hex2int(char ch)
{
    if(ch >= '0' && ch <= '9')
        return ch - '0';
    if(ch >= 'A' && ch <= 'F')
        return ch - 'A' + 10;
    if(ch >= 'a' && ch <= 'f')
        return ch - 'a' + 10;
    return -1;
}

bool StringUtils::FromHex(const String & text, std::vector<unsigned char> & data, bool reverse)
{
    auto size = text.size();
    if(size % 2)
        return false;
    data.resize(size / 2);
    for(size_t i = 0, j = 0; i < size; i += 2, j++)
    {
        auto high = hex2int(text[i]);
        auto low = hex2int(text[i + 1]);
        if(high == -1 || low == -1)
            return false;
        data[reverse ? data.size() - j - 1 : j] = (high << 4) | low;
    }
    return true;
}

String StringUtils::ToHex(unsigned long long value)
{
    char buf[32];
    sprintf_s(buf, "%llX", value);
    return buf;
}

#define HEXLOOKUP "0123456789ABCDEF"

String StringUtils::ToHex(const unsigned char* buffer, size_t size, bool reverse)
{
    String result;
    result.resize(size * 2);
    for(size_t i = 0, j = 0; i < size; i++, j += 2)
    {
        auto ch = buffer[reverse ? size - i - 1 : i];
        result[j] = HEXLOOKUP[(ch >> 4) & 0xF];
        result[j + 1] = HEXLOOKUP[ch & 0xF];
    }
    return result;
}

String StringUtils::ToCompressedHex(const unsigned char* buffer, size_t size)
{
    if(!size)
        return "";
    String result;
    result.reserve(size * 2);
    for(size_t i = 0; i < size;)
    {
        size_t repeat = 0;
        auto lastCh = buffer[i];
        result.push_back(HEXLOOKUP[(lastCh >> 4) & 0xF]);
        result.push_back(HEXLOOKUP[lastCh & 0xF]);
        for(; i < size && buffer[i] == lastCh; i++)
            repeat++;
        if(repeat == 2)
        {
            result.push_back(HEXLOOKUP[(lastCh >> 4) & 0xF]);
            result.push_back(HEXLOOKUP[lastCh & 0xF]);
        }
        else if(repeat > 2)
#ifdef _WIN64
            result.append(StringUtils::sprintf("{%llX}", repeat));
#else //x86
            result.append(StringUtils::sprintf("{%X}", repeat));
#endif //_WIN64
    }
    return result;
}

bool StringUtils::FromCompressedHex(const String & text, std::vector<unsigned char> & data)
{
    auto size = text.size();
    if(size < 2)
        return false;
    data.clear();
    data.reserve(size); //TODO: better initial estimate
    String repeatStr;
    for(size_t i = 0; i < size;)
    {
        if(isspace(text[i])) //skip whitespace
        {
            i++;
            continue;
        }
        auto high = hex2int(text[i++]); //eat high nibble
        if(i >= size) //not enough data
            return false;
        auto low = hex2int(text[i++]); //eat low nibble
        if(high == -1 || low == -1) //invalid character
            return false;
        auto lastCh = (high << 4) | low;
        data.push_back(lastCh);

        if(i >= size) //end of buffer
            break;

        if(text[i] == '{')
        {
            repeatStr.clear();
            i++; //eat '{'
            while(text[i] != '}')
            {
                repeatStr.push_back(text[i++]); //eat character
                if(i >= size) //unexpected end of buffer (missing '}')
                    return false;
            }
            i++; //eat '}'

            size_t repeat = 0;
            if(!convertNumber(repeatStr.c_str(), repeat, 16) || !repeat) //conversion failed or repeat zero times
                return false;
            for(size_t j = 1; j < repeat; j++)
                data.push_back(lastCh);
        }
    }
    return true;
}

int StringUtils::hackicmp(const char* s1, const char* s2)
{
    unsigned char c1, c2;
    while((c1 = *s1++) == (c2 = *s2++))
        if(c1 == '\0')
            return 0;
    s1--, s2--;
    while((c1 = tolower(*s1++)) == (c2 = tolower(*s2++)))
        if(c1 == '\0')
            return 0;
    return c1 - c2;
}