libscstring/string_utils.cpp

/*
 *  string_utils.cpp
 *
 *  Created by Bob Polis on 14-11-2014.
 *  Copyright 2014 Thalictrum. All rights reserved.
 *
 */

#include "string_utils.hpp"
#include <libscerror.hpp>
#include <sys/stat.h>
#include <limits.h>
#include <stdlib.h>
#include <uninorm.h>
#include <cerrno>
#include <fstream>
#include <algorithm>
using namespace std;

// Splits str at every occurrence of the literal separator sep.
//
// Returns the pieces between separators, in order. Empty pieces are kept, so
// "a,,b" split on "," yields {"a", "", "b"} and an empty str yields {""}.
// An empty sep cannot be searched for meaningfully; in that case the whole
// input is returned as the single component.
vector<string> sc::split(const string& str, const string& sep)
{
    vector<string> components;
    if (sep.empty()) {
        // find("") matches at every position without consuming input, which
        // would make the loop below spin forever while growing the vector
        components.push_back(str);
        return components;
    }
    string::size_type start = 0;
    string::size_type pos = str.find(sep);
    while (pos != string::npos) { // found separator => add substring to vector
        components.push_back(str.substr(start, pos - start));
        start = pos + sep.length(); // next search starts just after found separator
        pos = str.find(sep, start);
    }
    // at end of string => add last component
    components.push_back(str.substr(start));
    return components;
}

// Splits str at every match of the regular expression sep and returns the
// text found between the matches, in order.
vector<string> sc::split(const string& str, const regex& sep) {
    // submatch index -1 makes the iterator yield the parts that did NOT match
    const sregex_token_iterator first {str.begin(), str.end(), sep, -1};
    const sregex_token_iterator last {};
    return vector<string>(first, last);
}

// Concatenates all components into one string, inserting the text given in
// `join` between each pair of neighbours (not before the first element and
// not after the last). An empty vector yields an empty string.
string sc::join(const vector<string>& components, const string& join)
{
    string joined;
    bool needs_separator = false;
    for (const string& piece : components) {
        if (needs_separator) {
            joined += join;
        }
        joined += piece;
        needs_separator = true; // every element after the first gets a separator
    }
    return joined;
}

// Returns a copy of str with all leading and trailing characters that occur
// in del removed. Characters from del in the middle of the string are kept.
// If str is empty or consists solely of characters from del, the result is
// empty; if del is empty, str is returned unchanged.
string sc::trim(const string& str, const string& del)
{
    // Locate the surviving range directly instead of erasing one character
    // at a time, which was quadratic for long runs of leading delimiters.
    const string::size_type first = str.find_first_not_of(del);
    if (first == string::npos) {
        return ""; // nothing but delimiters (or nothing at all)
    }
    const string::size_type last = str.find_last_not_of(del);
    return str.substr(first, last - first + 1);
}

// Reports whether path names an existing file system entry, using stat().
//
// Returns false when stat() fails with ENOENT (no such entry) or ENOTDIR
// (a component used as a directory is not one, e.g. "/some/file/child").
// Any other stat() failure (for instance EACCES) is still reported as true,
// as it was before, because the entry may exist but be inaccessible.
bool sc::file_exists(const string& path)
{
    struct stat st;
    if (::stat(path.c_str(), &st) == 0) {
        return true;
    }
    return errno != ENOENT && errno != ENOTDIR;
}

// Reads the complete file at path and returns its bytes as a string.
//
// The stream is opened in binary mode so that the size obtained from
// tellg() matches the number of bytes read. The stream throws
// ios_base::failure when the file cannot be opened or a seek/read fails.
string sc::file_get_contents(const string& path)
{
    ifstream file {path, ios::in | ios::binary};
    file.exceptions(ios::failbit | ios::badbit); // throws right away if open failed
    file.seekg(0, ios::end);
    const streamoff file_len = file.tellg();
    file.seekg(0);
    // read straight into the result instead of going through a temporary buffer
    string contents(static_cast<string::size_type>(file_len), '\0');
    if (file_len > 0) {
        file.read(&contents[0], file_len);
    }
    return contents;
}

// Parses the "key = value" lines of the ini-style file at path into a map.
//
// - Lines starting with '[' (section headers) are skipped, so keys from all
//   sections end up in one flat map; a later duplicate key overwrites an
//   earlier one.
// - Lines without '=' are ignored.
// - The key is everything before the FIRST '=', trimmed with trim()'s default
//   delimiters; the value is everything after it, trimmed of spaces and
//   double quotes. Values may therefore contain '=' themselves.
// A file that cannot be opened yields an empty map.
map<string, string> sc::parse_ini_file(const string& path)
{
    map<string, string> result;
    string line;
    ifstream file {path};
    file.exceptions(/*ios::failbit |*/ ios::badbit); // it seems that getline() will set failbit when confronted with eof immediately
    while (getline(file, line)) {
        if (!line.empty() && line[0] == '[') continue;
        // split at the first '=' only; splitting at every '=' used to cut
        // off values such as "url = http://host/?a=b" after the second '='
        const string::size_type eq = line.find('=');
        if (eq == string::npos) continue;
        string key {trim(line.substr(0, eq))};
        string value {trim(line.substr(eq + 1), " \"")};
        result[key] = value;
    }
    return result;
}

// Replaces, in place, every occurrence of `what` in `target` by `replacement`.
//
// The search resumes right after each inserted replacement, so text that is
// part of a replacement is never matched again (replacing "a" by "aa"
// terminates). An empty `what` leaves target untouched.
void sc::replace_all(const string& what, const string& replacement, string& target) {
    if (what.empty()) {
        return; // an empty pattern matches everywhere and would never terminate
    }
    string::size_type pos = target.find(what);
    while (pos != string::npos) {
        target.replace(pos, what.length(), replacement);
        pos = target.find(what, pos + replacement.length());
    }
}

// Returns a copy of `target` in which every occurrence of `what` has been
// replaced by `replacement`. Occurrences are found left to right and do not
// overlap. An empty `what` yields an unmodified copy of target.
string sc::str_replace(const string& what, const string& replacement, const string& target)
{
    if (what.empty()) {
        return target; // an empty pattern never advances the search position
    }
    string result;
    string::size_type from {0};
    string::size_type pos {0};
    while ((pos = target.find(what, from)) != string::npos) {
        result.append(target, from, pos - from); // text before the match
        result += replacement;
        from = pos + what.length();
    }
    result.append(target, from, string::npos); // remainder after the last match
    return result;
}

// Wide-string variant: returns a copy of `target` in which every occurrence
// of `what` has been replaced by `replacement`.
//
// The search resumes right after each inserted replacement, so replacement
// text is never matched again. An empty `what` yields an unmodified copy.
wstring sc::replace_all(const wstring& what,
                        const wstring& replacement,
                        const wstring& target)
{
    wstring term {target};
    if (what.empty()) {
        return term; // an empty pattern matches everywhere and would never terminate
    }
    wstring::size_type pos = term.find(what);
    while (pos != wstring::npos) {
        term.replace(pos, what.length(), replacement);
        pos = term.find(what, pos + replacement.length());
    }
    return term;
}

// Creates the directory `path` together with any missing parent directories.
//
// The path is split on '/', and each successively longer prefix (always with
// a trailing '/') that file_exists() reports as missing is created with
// mkdir() using `mode`. The mkdir() result is passed to throw_if_min1().
void sc::create_dir(const string &path, int mode)
{
    string partial;
    for (const string& component : sc::split(path, "/")) {
        partial += component;
        partial += '/';
        if (sc::file_exists(partial)) {
            continue; // this level is already there
        }
        throw_if_min1(::mkdir(partial.c_str(), mode));
    }
}

// Returns the directory part of path.
//
// The path is first canonicalized with realpath(); if that fails (e.g. the
// path does not exist) the path is used as given. The result is everything
// before the last '/'. When the only slash is the leading one ("/" or
// "/name") the result is "/". A path without any slash is returned unchanged.
string sc::dirname(const string& path) {
    string result {path};
    vector<char> buf(PATH_MAX);
    if (::realpath(path.c_str(), buf.data())) {
        result = buf.data();
    }
    const auto pos = result.rfind('/');
    if (pos == string::npos) {
        return result;
    }
    if (pos == 0) {
        return "/"; // parent of a root-level entry is the root, not ""
    }
    return result.substr(0, pos);
}

// Returns the last component of path, i.e. everything after the final '/'
// (or the whole path when it contains no '/').
//
// When remove_extension is true, the part of that component starting at its
// last '.' is dropped. Only dots inside the last component count, so a dot
// in a directory name ("/a.b/file") no longer affects the result.
string sc::basename(const string& path, bool remove_extension) {
    const auto slash = path.rfind('/');
    string result {slash == string::npos ? path : path.substr(slash + 1)};
    if (remove_extension) {
        const auto dot = result.rfind('.');
        if (dot != string::npos) {
            result.erase(dot);
        }
    }
    return result;
}

// Returns a copy of path in which the first '~' is replaced by the value of
// the HOME environment variable. If path contains no '~', or HOME is not
// set, the path is returned unchanged.
string sc::replace_tilde(const string& path) {
    string result {path};
    const auto pos = result.find('~');
    if (pos != string::npos) {
        const char* home = ::getenv("HOME"); // may be null when HOME is unset
        if (home) {
            // second argument is a character COUNT: replace just the tilde
            result.replace(pos, 1, home);
        }
    }
    return result;
}

// Returns the extension of the last component of path, including the leading
// '.', or an empty string when that component contains no '.'.
// A dot that only occurs in a directory part ("/a.b/file") is not an
// extension of the file and yields "".
string sc::filename_extension(const string& path) {
    const auto dot = path.rfind('.');
    if (dot == string::npos) {
        return "";
    }
    const auto slash = path.rfind('/');
    if (slash != string::npos && slash > dot) {
        return ""; // last dot belongs to a directory name
    }
    return path.substr(dot);
}

// Resolves the location of the tool called `name`.
//
// - If name contains a '/', it is treated as a path and resolved with
//   real_path().
// - Otherwise each ':'-separated entry of the PATH environment variable is
//   tried in order, and the first "<entry>/<name>" for which file_exists()
//   returns true is returned.
// Returns an empty string when nothing is found or PATH is not set.
string sc::tool_path(const string& name) {
    if (name.find('/') != string::npos) { // name has (at least one) slash
        return sc::real_path(name);
    }
    // no slash in name => command from PATH?
    const char* envpath = ::getenv("PATH"); // null when PATH is unset
    if (!envpath) {
        return "";
    }
    for (const string& elem : sc::split(envpath, ":")) {
        string path {elem + '/' + name};
        if (sc::file_exists(path)) {
            return path;
        }
    }
    return "";
}

// Shortens str so that it fits in maxlen positions, marking the cut with an
// ellipsis (U+2026).
//
// - A string of at most maxlen bytes is returned unchanged.
// - Otherwise the first maxlen - 1 bytes are kept and the ellipsis is
//   appended; the cut is moved back to the start of a UTF-8 sequence so no
//   multi-byte character is split in half.
// - maxlen == 0 yields an empty string.
// The third parameter is currently unused.
string sc::truncate(const string& str, unsigned int maxlen, int /*how*/) {
    if (maxlen >= str.length()) return str; // fits as is
    if (maxlen == 0) return "";             // maxlen - 1 would wrap around
    string::size_type cut = maxlen - 1;
    // 10xxxxxx is a UTF-8 continuation byte: cutting there would leave a
    // truncated sequence behind, so back up to the sequence's lead byte
    while (cut > 0 && (static_cast<unsigned char>(str[cut]) & 0xC0) == 0x80) {
        --cut;
    }
    return str.substr(0, cut) + "\u2026"; // add ellipsis
}

// Returns a copy of str in which every char has been converted with the
// locale-aware tolower(c, loc). The conversion works char by char.
string sc::lowercase(const string& str, const locale& loc) {
    string lowered(str.size(), '\0');
    transform(str.begin(), str.end(), lowered.begin(), [&loc](const char c) {
        return tolower(c, loc);
    });
    return lowered;
}

// Returns a copy of str in which every char has been converted with the
// locale-aware toupper(c, loc). The conversion works char by char.
string sc::uppercase(const string& str, const locale& loc) {
    string raised(str.size(), '\0');
    transform(str.begin(), str.end(), raised.begin(), [&loc](const char c) {
        return toupper(c, loc);
    });
    return raised;
}

// Checks whether str is a well-formed UTF-8 byte sequence (RFC 3629).
//
// Accepted sequences, by lead byte:
//   00..7F                          single byte
//   C2..DF  80..BF                  two bytes
//   E0      A0..BF  80..BF          three bytes (no overlong forms)
//   E1..EC  80..BF  80..BF
//   ED      80..9F  80..BF          (excludes surrogates U+D800..U+DFFF)
//   EE..EF  80..BF  80..BF
//   F0      90..BF  80..BF  80..BF  four bytes (no overlong forms)
//   F1..F3  80..BF  80..BF  80..BF
//   F4      80..8F  80..BF  80..BF  (nothing above U+10FFFF)
// Everything else is rejected: stray continuation bytes, truncated
// sequences, overlong encodings (lead bytes C0/C1 included) and lead bytes
// F5..FF. An empty string is valid.
bool sc::is_valid_utf8(const string& str) {
    const string::size_type len = str.length();
    string::size_type i = 0;
    while (i < len) {
        const unsigned char lead = static_cast<unsigned char>(str[i]);
        if (lead <= 0x7F) { // 0bbbbbbb
            ++i;
            continue;
        }

        string::size_type trailing = 0; // number of continuation bytes expected
        unsigned char lo = 0x80;        // allowed range of the FIRST continuation byte
        unsigned char hi = 0xBF;
        if (lead >= 0xC2 && lead <= 0xDF) { // 110bbbbb (C0/C1 would be overlong)
            trailing = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) { // 1110bbbb
            trailing = 2;
            if (lead == 0xE0) lo = 0xA0;      // below that: overlong
            else if (lead == 0xED) hi = 0x9F; // above that: U+D800 to U+DFFF
        } else if (lead >= 0xF0 && lead <= 0xF4) { // 11110bbb
            trailing = 3;
            if (lead == 0xF0) lo = 0x90;      // below that: overlong
            else if (lead == 0xF4) hi = 0x8F; // above that: beyond U+10FFFF
        } else {
            return false; // stray continuation byte or invalid lead byte
        }

        if (len - i <= trailing) {
            return false; // sequence is cut off by the end of the string
        }
        for (string::size_type k = 1; k <= trailing; ++k) {
            const unsigned char b = static_cast<unsigned char>(str[i + k]);
            if (b < lo || b > hi) {
                return false;
            }
            // only the first continuation byte has a restricted range
            lo = 0x80;
            hi = 0xBF;
        }
        i += trailing + 1;
    }
    return true;
}

// Returns the canonical absolute form of path as produced by realpath(), or
// an empty string when realpath() fails (for example because the path does
// not exist).
string sc::real_path(const string& path) {
    vector<char> resolved(PATH_MAX);
    const char* canonical = ::realpath(path.c_str(), resolved.data());
    return canonical ? string(canonical) : string();
}

// Returns an ASCII-only approximation of the UTF-8 string text.
//
// The text is decomposed with libunistring's u8_normalize() in NFD form,
// which separates base letters from their combining marks; afterwards every
// byte above 127 is dropped. Note that this removes ALL non-ASCII bytes, so
// characters without an ASCII base letter disappear entirely.
// If normalization fails, the same byte filter is applied to the original
// text. An empty input yields an empty string.
string sc::remove_accents(const string& text) {
    if (text.empty()) {
        return "";
    }

    // Passing no result buffer lets the library allocate one of the right
    // size with malloc(); a fixed "2 * size" buffer could be too small, in
    // which case the result was returned in a fresh allocation that used to
    // be ignored and leaked.
    size_t length {0};
    uint8_t* decomposed = u8_normalize(UNINORM_NFD,
                                       reinterpret_cast<const uint8_t*>(text.data()),
                                       text.size(),
                                       nullptr,
                                       &length);

    // releases the library's buffer on every exit path (free(nullptr) is a no-op)
    struct releaser {
        uint8_t* ptr;
        ~releaser() { ::free(ptr); }
    } guard {decomposed};

    const uint8_t* bytes = decomposed;
    if (!bytes) { // normalization failed => filter the input as it is
        bytes = reinterpret_cast<const uint8_t*>(text.data());
        length = text.size();
    }

    // hack: now remove all bytes with a value higher than 127
    string result;
    result.reserve(length);
    for (size_t i = 0; i < length; ++i) {
        if (bytes[i] <= 127) {
            result += static_cast<char>(bytes[i]);
        }
    }
    return result;
}
first commit 2020-03-16 15:16:35 +01:00			`/*`
			`* string_utils.cpp`
			`*`
			`* Created by Bob Polis on 14-11-2014.`
			`* Copyright 2014 Thalictrum. All rights reserved.`
			`*`
			`*/`

			`#include "string_utils.hpp"`
			`#include <libscerror.hpp>`
			`#include <sys/stat.h>`
			`#include <limits.h>`
Added from_string; added remove_accents This adds a dependency on libunistring. 2021-07-02 11:30:13 +02:00			`#include <uninorm.h>`
first commit 2020-03-16 15:16:35 +01:00			`#include <cerrno>`
			`#include <fstream>`
			`#include <algorithm>`
			`using namespace std;`

			`vector<string> sc::split(const string& str, const string& sep)`
			`{`
added real_path function; bumped version to 1.1 2020-08-23 17:21:19 +02:00			`vector<string> components;`
			`string::size_type start = 0;`
			`string::size_type pos = str.find(sep);`
			`while (pos != string::npos) { // found separator => add substring to vector`
			`components.push_back(str.substr(start, pos - start));`
			`start = pos + sep.length(); // next search starts just after found separator`
			`pos = str.find(sep, start);`
			`}`
			`// at end of string => add last component`
			`components.push_back(str.substr(start, str.length() - start));`
			`return components;`
first commit 2020-03-16 15:16:35 +01:00			`}`

			`vector<string> sc::split(const string& str, const regex& sep) {`
added real_path function; bumped version to 1.1 2020-08-23 17:21:19 +02:00			`vector<string> components;`
			`sregex_token_iterator end {};`
			`for (sregex_token_iterator p {str.begin(), str.end(), sep, -1}; p != end; ++p) {`
			`components.push_back(*p);`
			`}`
			`return components;`
first commit 2020-03-16 15:16:35 +01:00			`}`

			`string sc::join(const vector<string>& components, const string& join)`
			`{`
added real_path function; bumped version to 1.1 2020-08-23 17:21:19 +02:00			`string result;`
			`for (vector<string>::const_iterator i = components.cbegin(); i != components.cend(); ++i) {`
			`if (i != components.cbegin()) {`
			`result += join;`
			`}`
			`result += *i;`
			`}`
			`return result;`
first commit 2020-03-16 15:16:35 +01:00			`}`

			`string sc::trim(const string& str, const string& del)`
			`{`
			`string result { str };`
			`string::size_type pos {0};`

			`// erase leading chars that occur in del`
			`while (result.size() && (pos = result.find_first_of(del)) == 0) {`
			`result.erase(pos, 1);`
			`}`

			`// erase trailing chars that occur in del`
			`while (result.size() && (pos = result.find_last_of(del)) == result.size() - 1) {`
			`result.erase(pos);`
			`}`
			`return result;`
			`}`

			`bool sc::file_exists(const string& path)`
			`{`
			`struct stat st;`
			`return !(::stat(path.c_str(), &st) == -1 && errno == ENOENT);`
			`}`

			`string sc::file_get_contents(const string& path)`
			`{`
			`ifstream file {path};`
			`file.exceptions(ios::failbit \| ios::badbit);`
added real_path function; bumped version to 1.1 2020-08-23 17:21:19 +02:00			`file.seekg(0, ios::end);`
			`ios::pos_type file_len {file.tellg()};`
first commit 2020-03-16 15:16:35 +01:00			`file.seekg(0);`
Simplified char buffer construction 2021-12-01 16:12:32 +01:00			`vector<char> buf(file_len);`
first commit 2020-03-16 15:16:35 +01:00			`file.read(buf.data(), file_len);`
			`return {buf.data(), static_cast<string::size_type>(file_len)};`
			`}`

			`map<string, string> sc::parse_ini_file(const string& path)`
			`{`
			`map<string, string> result;`
			`string line;`
			`ifstream file {path};`
added real_path function; bumped version to 1.1 2020-08-23 17:21:19 +02:00			`file.exceptions(/ios::failbit \|/ ios::badbit); // it seems that getline() will set failbit when confronted with eof immediately`
first commit 2020-03-16 15:16:35 +01:00			`while (getline(file, line)) {`
			`if (line[0] == '[') continue;`
			`vector<string> parts {split(line, "=")};`
			`if (parts.size() > 1) {`
			`string key {trim(parts[0])};`
			`string value {trim(parts[1], " \"")};`
			`result[key] = value;`
			`}`
			`}`
			`return result;`
			`}`

Removed std prefix 2021-07-02 11:38:57 +02:00			`void sc::replace_all(const string& what, const string& replacement, string& target) {`
			`string::size_type pos = string::npos;`
			`string::size_type from = 0;`
Added desctructive replace_all for std::string 2021-07-02 11:37:54 +02:00			`do {`
			`pos = target.find(what, from);`
Removed std prefix 2021-07-02 11:38:57 +02:00			`if (pos != string::npos) {`
Added desctructive replace_all for std::string 2021-07-02 11:37:54 +02:00			`target.replace(pos, what.length(), replacement);`
			`from = pos + replacement.length();`
			`}`
Removed std prefix 2021-07-02 11:38:57 +02:00			`} while (pos != string::npos);`
Added desctructive replace_all for std::string 2021-07-02 11:37:54 +02:00			`}`

first commit 2020-03-16 15:16:35 +01:00			`string sc::str_replace(const string& what, const string& replacement, const string& target)`
			`{`
			`string result;`
			`string::size_type pos {0};`
			`string::size_type from {0};`
			`while ((pos = target.find(what, from)) != string::npos) {`
			`result += target.substr(from, pos - from);`
			`result += replacement;`
			`from = pos + what.length();`
added real_path function; bumped version to 1.1 2020-08-23 17:21:19 +02:00			`if (from >= target.length()) break;`
first commit 2020-03-16 15:16:35 +01:00			`}`
			`result += target.substr(from);`
			`return result;`
			`}`

Removed std prefix 2021-07-02 11:38:57 +02:00			`wstring sc::replace_all(const wstring& what,`
			`const wstring& replacement,`
			`const wstring& target)`
first commit 2020-03-16 15:16:35 +01:00			`{`
added real_path function; bumped version to 1.1 2020-08-23 17:21:19 +02:00			`wstring term {target};`
			`wstring::size_type pos = wstring::npos;`
			`wstring::size_type from = 0;`
			`do {`
			`pos = term.find(what, from);`
			`if (pos != wstring::npos) {`
			`term.replace(pos, what.length(), replacement);`
			`from = pos + replacement.length();`
			`}`
			`} while (pos != wstring::npos);`
			`return term;`
first commit 2020-03-16 15:16:35 +01:00			`}`

Removed std prefix 2021-07-02 11:38:57 +02:00			`void sc::create_dir(const string &path, int mode)`
first commit 2020-03-16 15:16:35 +01:00			`{`
			`string dir;`
			`vector<string> path_components {sc::split(path, "/")};`
			`for (string comp : path_components) {`
			`dir += comp + '/';`
			`if (!sc::file_exists(dir)) {`
			`throw_if_min1(::mkdir(dir.c_str(), mode));`
			`}`
			`}`
			`}`

			`string sc::dirname(const string& path) {`
			`string result {path};`
			`vector<char> buf;`
			`buf.resize(PATH_MAX);`
			`if (::realpath(path.c_str(), buf.data())) {`
			`result = buf.data();`
			`}`
			`auto pos = result.rfind("/");`
			`if (pos != string::npos) {`
			`return result.substr(0, pos);`
			`}`
			`return result;`
			`}`

			`string sc::basename(const string& path, bool remove_extension) {`
			`string result {path};`
added real_path function; bumped version to 1.1 2020-08-23 17:21:19 +02:00			`if (remove_extension) {`
			`auto dot = path.rfind(".");`
			`if (dot != string::npos) {`
			`result = result.substr(0, dot);`
			`}`
			`}`
first commit 2020-03-16 15:16:35 +01:00			`auto pos = result.rfind("/");`
			`if (pos != string::npos) {`
			`return result.substr(pos + 1);`
			`}`
			`return result;`
			`}`

			`string sc::replace_tilde(const string& path) {`
			`string result {path};`
			`auto pos = result.find("~");`
			`if (pos != string::npos) {`
			`string home {::getenv("HOME")};`
			`result.replace(pos, pos + 1, home);`
			`}`
			`return result;`
			`}`

			`string sc::filename_extension(const string& path) {`
added real_path function; bumped version to 1.1 2020-08-23 17:21:19 +02:00			`auto pos = path.rfind(".");`
			`if (pos != string::npos) {`
			`return path.substr(pos);`
			`}`
			`return "";`
first commit 2020-03-16 15:16:35 +01:00			`}`

			`string sc::tool_path(const string& name) {`
			`if (name.find('/') != string::npos) { // name has (at least one) slash`
added real_path function; bumped version to 1.1 2020-08-23 17:21:19 +02:00			`return sc::real_path(name);`
first commit 2020-03-16 15:16:35 +01:00			`} else { // no slash in name => command from PATH?`
			`string envpath {::getenv("PATH")};`
			`vector<string> paths {sc::split(envpath, ":")};`
			`for (const string& elem : paths) {`
			`string path {elem + '/' + name};`
			`if (sc::file_exists(path)) {`
			`return path;`
			`}`
			`}`
			`}`
			`return "";`
			`}`

			`string sc::truncate(const string& str, unsigned int maxlen, int /how/) {`
			`if (maxlen > str.length()) return str;`
			`return str.substr(0, maxlen - 1) + "\u2026"; // add ellipsis`
			`}`

			`string sc::lowercase(const string& str, const locale& loc) {`
			`string result;`
			`for (const char c : str) {`
			`result += tolower(c, loc);`
			`}`
			`return result;`
			`}`

			`string sc::uppercase(const string& str, const locale& loc) {`
			`string result;`
			`for (const char c : str) {`
			`result += toupper(c, loc);`
			`}`
			`return result;`
			`}`

			`bool sc::is_valid_utf8(const string& str) {`
added real_path function; bumped version to 1.1 2020-08-23 17:21:19 +02:00			`// From: http://www.zedwood.com/article/cpp-is-valid-utf8-string-function`
first commit 2020-03-16 15:16:35 +01:00			`int c,i,ix,n,j;`
			`for (i=0, ix=str.length(); i < ix; i++)`
			`{`
			`c = (unsigned char) str[i];`
			`//if (c==0x09 \|\| c==0x0a \|\| c==0x0d \|\| (0x20 <= c && c <= 0x7e) ) n = 0; // is_printable_ascii`
			`if (0x00 <= c && c <= 0x7f) n=0; // 0bbbbbbb`
			`else if ((c & 0xE0) == 0xC0) n=1; // 110bbbbb`
			`else if ( c==0xed && i<(ix-1) && ((unsigned char)str[i+1] & 0xa0)==0xa0) return false; //U+d800 to U+dfff`
			`else if ((c & 0xF0) == 0xE0) n=2; // 1110bbbb`
			`else if ((c & 0xF8) == 0xF0) n=3; // 11110bbb`
			`//else if (($c & 0xFC) == 0xF8) n=4; // 111110bb //byte 5, unnecessary in 4 byte UTF-8`
			`//else if (($c & 0xFE) == 0xFC) n=5; // 1111110b //byte 6, unnecessary in 4 byte UTF-8`
			`else return false;`
			`for (j=0; j<n && i<ix; j++) { // n bytes matching 10bbbbbb follow ?`
			`if ((++i == ix) \|\| (( (unsigned char)str[i] & 0xC0) != 0x80))`
			`return false;`
			`}`
			`}`
			`return true;`
			`}`
added real_path function; bumped version to 1.1 2020-08-23 17:21:19 +02:00
			`string sc::real_path(const string& path) {`
			`vector<char> buf;`
			`buf.resize(PATH_MAX);`
			`if (::realpath(path.c_str(), buf.data())) {`
			`return string(buf.data());`
			`}`
			`return "";`
			`}`
Added from_string; added remove_accents This adds a dependency on libunistring. 2021-07-02 11:30:13 +02:00
			`string sc::remove_accents(const string& text) {`
			`vector<char> buf;`
			`buf.resize(text.size() * 2);`
			`size_t bufsize {buf.size()};`
			`u8_normalize(UNINORM_NFD,`
			`reinterpret_cast<const uint8_t*>(text.data()),`
			`text.size(),`
			`reinterpret_cast<uint8_t*>(buf.data()),`
			`&bufsize);`

			`// hack: now remove all bytes with a value higher than 127`
			`auto it = remove_if(buf.begin(), buf.end(), [](uint8_t c) {`
			`return c > 127;`
			`});`
			`buf.erase(it, buf.end());`
			`return {buf.data()};`
			`}`