// libscstring/string_utils.cpp

/*
 *  string_utils.cpp
 *
 *  Created by Bob Polis on 14-11-2014.
 *  Copyright 2014 Thalictrum. All rights reserved.
 *
 */

#include "string_utils.hpp"
#include <libscerror.hpp>
#include <limits.h>
#include <sys/stat.h>
#include <uninorm.h>
#include <algorithm>
#include <cerrno>
#include <cstdlib>
#include <fstream>
using namespace std;

// Splits `str` into the pieces that lie between occurrences of `sep`.
//
// The result always contains at least one element: a string without any
// separator yields a vector holding just that string, and adjacent separators
// yield empty components.
//
// An empty `sep` never advances the search position, so it is treated as
// "no separator present" and the whole string is returned as one component.
vector<string> sc::split(const string& str, const string& sep)
{
    if (sep.empty()) {
        return {str};
    }

    vector<string> components;
    string::size_type start = 0;
    string::size_type pos = str.find(sep);
    while (pos != string::npos) { // found separator => add substring to vector
        components.push_back(str.substr(start, pos - start));
        start = pos + sep.length(); // next search starts just after found separator
        pos = str.find(sep, start);
    }
    // at end of string => add last component
    components.push_back(str.substr(start));
    return components;
}

// Splits `str` on every match of the regular expression `sep`.
// Submatch index -1 makes the token iterator yield the text between matches
// instead of the matches themselves.
vector<string> sc::split(const string& str, const regex& sep) {
    const sregex_token_iterator first {str.begin(), str.end(), sep, -1};
    const sregex_token_iterator last {};
    return vector<string>(first, last);
}

// Concatenates all `components`, inserting `join` between each pair of
// neighbouring elements. An empty vector yields an empty string.
string sc::join(const vector<string>& components, const string& join)
{
    string joined;
    bool first = true;
    for (const string& piece : components) {
        if (!first) {
            joined += join; // separator goes before every element except the first
        }
        joined += piece;
        first = false;
    }
    return joined;
}

// Returns a copy of `str` with every leading and trailing character that
// occurs in `del` removed. Characters from `del` inside the string are kept.
// If `str` consists only of characters from `del` (or is empty), the result is
// empty; if `del` is empty, `str` is returned unchanged.
string sc::trim(const string& str, const string& del)
{
    // Locate the kept range in one pass instead of erasing one char at a time.
    const string::size_type first = str.find_first_not_of(del);
    if (first == string::npos) {
        return string {}; // nothing but trimmable characters
    }
    const string::size_type last = str.find_last_not_of(del);
    return str.substr(first, last - first + 1);
}

// Reports whether something exists at `path`.
//
// Returns false when stat() fails with ENOENT (no such entry) or ENOTDIR (a
// component of the path prefix is not a directory, so the entry cannot exist).
// Any other stat() failure (e.g. EACCES) is still reported as "exists", because
// in that case the entry's absence has not been established.
bool sc::file_exists(const string& path)
{
    struct stat st;
    if (::stat(path.c_str(), &st) == 0) {
        return true;
    }
    return errno != ENOENT && errno != ENOTDIR;
}

// Reads the complete file at `path` and returns its contents as a string.
//
// The stream is switched to throw on failbit/badbit, so a file that cannot be
// opened, sized or read results in an ios_base::failure exception.
string sc::file_get_contents(const string& path)
{
    ifstream file {path};
    file.exceptions(ios::failbit | ios::badbit);

    // determine the file size by seeking to the end, then rewind
    file.seekg(0, ios::end);
    const ios::pos_type file_len {file.tellg()};
    file.seekg(0);

    // read straight into the result string: no intermediate buffer to copy from
    string contents(static_cast<string::size_type>(file_len), '\0');
    if (!contents.empty()) {
        file.read(&contents[0], file_len);
    }
    return contents;
}

// Parses a simple INI-style file into a flat key => value map.
//
// - Lines starting with '[' (section headers) are skipped; sections are not
//   tracked, so equal keys in different sections overwrite each other.
// - Every other line is split at its FIRST '='; anything after it, including
//   further '=' characters, belongs to the value.
// - The key is trimmed with trim()'s default delimiters; the value is stripped
//   of surrounding spaces and double quotes.
// - Lines without '=' are ignored.
//
// Only badbit raises an exception; a file that cannot be opened simply yields
// an empty map.
map<string, string> sc::parse_ini_file(const string& path)
{
    map<string, string> result;
    string line;
    ifstream file {path};
    file.exceptions(/*ios::failbit |*/ ios::badbit); // it seems that getline() will set failbit when confronted with eof immediately
    while (getline(file, line)) {
        if (line[0] == '[') continue;
        const string::size_type eq = line.find('=');
        if (eq == string::npos) continue;
        const string key {trim(line.substr(0, eq))};
        const string value {trim(line.substr(eq + 1), " \"")};
        result[key] = value;
    }
    return result;
}

// Replaces, in place, every occurrence of `what` in `target` by `replacement`.
//
// The search resumes just after each inserted replacement, so text introduced
// by a replacement is never matched again. An empty `what` matches everywhere
// without consuming input, so it is treated as "nothing to replace" and
// `target` is left untouched.
void sc::replace_all(const string& what, const string& replacement, string& target) {
    if (what.empty()) {
        return;
    }
    string::size_type pos = target.find(what);
    while (pos != string::npos) {
        target.replace(pos, what.length(), replacement);
        pos = target.find(what, pos + replacement.length());
    }
}

// Returns a copy of `target` in which every occurrence of `what` is replaced
// by `replacement`. `target` itself is not modified.
//
// An empty `what` matches everywhere without consuming input, so it is treated
// as "nothing to replace" and an unmodified copy of `target` is returned.
string sc::str_replace(const string& what, const string& replacement, const string& target)
{
    if (what.empty()) {
        return target;
    }
    string result;
    string::size_type pos {0};
    string::size_type from {0};
    while ((pos = target.find(what, from)) != string::npos) {
        result.append(target, from, pos - from); // text before the match
        result += replacement;
        from = pos + what.length();              // continue just after the match
    }
    result.append(target, from, string::npos);   // remainder after the last match
    return result;
}

// Wide-string variant: returns a copy of `target` in which every occurrence of
// `what` is replaced by `replacement`.
//
// The search resumes just after each inserted replacement, so text introduced
// by a replacement is never matched again. An empty `what` matches everywhere
// without consuming input, so it is treated as "nothing to replace" and an
// unmodified copy of `target` is returned.
wstring sc::replace_all(const wstring& what,
                        const wstring& replacement,
                        const wstring& target)
{
    wstring term {target};
    if (what.empty()) {
        return term;
    }
    wstring::size_type pos = term.find(what);
    while (pos != wstring::npos) {
        term.replace(pos, what.length(), replacement);
        pos = term.find(what, pos + replacement.length());
    }
    return term;
}

// Creates the directory `path` together with any missing parent directories.
//
// The path is walked component by component; each prefix that does not exist
// yet (according to sc::file_exists) is created with ::mkdir using `mode`.
// The result of ::mkdir is passed to throw_if_min1.
void sc::create_dir(const string &path, int mode)
{
    string prefix;
    for (const string& component : sc::split(path, "/")) {
        // grow the prefix by one component; a leading "/" yields an empty
        // first component, which turns the prefix into the root directory
        prefix.append(component).push_back('/');
        if (sc::file_exists(prefix)) {
            continue;
        }
        throw_if_min1(::mkdir(prefix.c_str(), mode));
    }
}

// Returns the directory part of `path`.
//
// The path is first resolved with realpath(); when that fails (for instance
// because the path does not exist) the path is used as given. The result is
// everything before the last '/', without a trailing slash. For an entry
// directly below the root, and for the root itself, the result is "/" rather
// than an empty string. A path without any '/' is returned unchanged.
string sc::dirname(const string& path) {
    string result {path};
    vector<char> buf(PATH_MAX);
    if (::realpath(path.c_str(), buf.data())) {
        result = buf.data();
    }
    const auto pos = result.rfind('/');
    if (pos == string::npos) {
        return result;
    }
    if (pos == 0) {
        return "/"; // parent of a root-level entry is the root itself
    }
    return result.substr(0, pos);
}

// Returns the last component of `path`, i.e. everything after the final '/'
// (or the whole path when it contains no '/').
//
// With `remove_extension` set, the part of that component starting at its last
// '.' is dropped as well. Only dots inside the last component count, so a dot
// in a directory name never affects the result.
string sc::basename(const string& path, bool remove_extension) {
    const auto slash = path.rfind('/');
    string name {slash == string::npos ? path : path.substr(slash + 1)};
    if (remove_extension) {
        const auto dot = name.rfind('.');
        if (dot != string::npos) {
            name.erase(dot);
        }
    }
    return name;
}

// Returns a copy of `path` in which the first '~' is replaced by the value of
// the HOME environment variable.
//
// Exactly one character (the tilde) is replaced, wherever it occurs. When the
// path contains no '~', or HOME is not set, the path is returned unchanged.
string sc::replace_tilde(const string& path) {
    string result {path};
    const auto pos = result.find('~');
    if (pos != string::npos) {
        const char* home = ::getenv("HOME"); // may be null when HOME is unset
        if (home) {
            result.replace(pos, 1, home);
        }
    }
    return result;
}

// Returns the extension of the last component of `path`, including the
// leading dot (e.g. ".txt"), or an empty string when that component has none.
// A dot that belongs to a directory name is not treated as an extension.
string sc::filename_extension(const string& path) {
    const auto dot = path.rfind('.');
    if (dot == string::npos) {
        return "";
    }
    const auto slash = path.rfind('/');
    if (slash != string::npos && slash > dot) {
        return ""; // the last dot sits in a directory component
    }
    return path.substr(dot);
}

// Resolves the location of the tool called `name`.
//
// - When `name` contains a '/', it is treated as a path and resolved with
//   sc::real_path (which yields "" when the path cannot be resolved).
// - Otherwise every directory listed in the PATH environment variable is
//   tried in order, and the first "<dir>/<name>" for which sc::file_exists
//   reports true is returned. An empty PATH element denotes the current
//   directory (POSIX convention), not the root directory.
//
// Returns "" when nothing is found or when PATH is not set.
string sc::tool_path(const string& name) {
    if (name.find('/') != string::npos) { // name has (at least one) slash
        return sc::real_path(name);
    }

    // no slash in name => command from PATH?
    const char* envpath = ::getenv("PATH"); // may be null when PATH is unset
    if (!envpath) {
        return "";
    }
    for (const string& elem : sc::split(envpath, ":")) {
        const string dir {elem.empty() ? "." : elem};
        const string path {dir + '/' + name};
        if (sc::file_exists(path)) {
            return path;
        }
    }
    return "";
}

// Shortens `str` to at most `maxlen` positions, marking the cut with an
// ellipsis (U+2026).
//
// - A string of at most `maxlen` bytes is returned unchanged.
// - Otherwise the first `maxlen - 1` bytes are kept and the ellipsis is
//   appended. The cut is moved back to the start of a UTF-8 sequence when it
//   would otherwise fall inside one, so the result never ends in a partial
//   character.
// - `maxlen == 0` yields an empty string.
//
// The third parameter is currently unused.
string sc::truncate(const string& str, unsigned int maxlen, int /*how*/) {
    if (maxlen >= str.length()) return str;
    if (maxlen == 0) return ""; // avoids the unsigned underflow of maxlen - 1

    string::size_type cut = maxlen - 1;
    // step back over UTF-8 continuation bytes (10xxxxxx) to a character boundary
    while (cut > 0 && (static_cast<unsigned char>(str[cut]) & 0xC0) == 0x80) {
        --cut;
    }
    return str.substr(0, cut) + "\u2026"; // add ellipsis
}

// Returns a copy of `str` in which every char has been converted with
// tolower() for the locale `loc`. The conversion works char by char.
string sc::lowercase(const string& str, const locale& loc) {
    string lowered(str);
    transform(lowered.begin(), lowered.end(), lowered.begin(),
              [&loc](char c) { return tolower(c, loc); });
    return lowered;
}

// Returns a copy of `str` in which every char has been converted with
// toupper() for the locale `loc`. The conversion works char by char.
string sc::uppercase(const string& str, const locale& loc) {
    string raised(str);
    transform(raised.begin(), raised.end(), raised.begin(),
              [&loc](char c) { return toupper(c, loc); });
    return raised;
}

// Checks whether `str` is well-formed UTF-8 as defined by RFC 3629 / the
// Unicode "well-formed UTF-8 byte sequences" table.
//
// Rejected are:
// - stray continuation bytes and truncated sequences,
// - overlong encodings (lead bytes C0/C1, E0 followed by 80..9F, F0 followed
//   by 80..8F),
// - UTF-16 surrogates U+D800..U+DFFF (ED followed by A0..BF),
// - code points above U+10FFFF (F4 followed by 90..BF, lead bytes F5..FF).
//
// An empty string is valid. Embedded NUL bytes are valid (U+0000).
bool sc::is_valid_utf8(const string& str) {
    const string::size_type len = str.length();
    string::size_type i = 0;
    while (i < len) {
        const unsigned char lead = static_cast<unsigned char>(str[i]);
        string::size_type trailing = 0; // number of continuation bytes expected
        // allowed range for the FIRST continuation byte; narrowed for the lead
        // bytes that would otherwise permit an ill-formed code point
        unsigned char lo = 0x80;
        unsigned char hi = 0xBF;

        if (lead <= 0x7F) {                         // 0bbbbbbb
            trailing = 0;
        } else if (lead >= 0xC2 && lead <= 0xDF) {  // 110bbbbb (C0/C1 are overlong)
            trailing = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) {  // 1110bbbb
            trailing = 2;
            if (lead == 0xE0) lo = 0xA0;            // below would be overlong
            else if (lead == 0xED) hi = 0x9F;       // above would be U+D800..U+DFFF
        } else if (lead >= 0xF0 && lead <= 0xF4) {  // 11110bbb, up to U+10FFFF
            trailing = 3;
            if (lead == 0xF0) lo = 0x90;            // below would be overlong
            else if (lead == 0xF4) hi = 0x8F;       // above would exceed U+10FFFF
        } else {
            return false;                           // continuation byte or invalid lead
        }

        if (len - i - 1 < trailing) {
            return false;                           // sequence is cut off
        }
        for (string::size_type k = 1; k <= trailing; ++k) {
            const unsigned char b = static_cast<unsigned char>(str[i + k]);
            if (b < lo || b > hi) {
                return false;
            }
            // only the first continuation byte has a restricted range
            lo = 0x80;
            hi = 0xBF;
        }
        i += trailing + 1;
    }
    return true;
}

// Resolves `path` with realpath() and returns the resulting absolute path.
// Returns an empty string when realpath() fails (e.g. the path does not exist).
string sc::real_path(const string& path) {
    char resolved[PATH_MAX];
    const char* const outcome = ::realpath(path.c_str(), resolved);
    return outcome ? string(resolved) : string();
}

string sc::remove_accents(const string& text) {
    vector<char> buf;
    buf.resize(text.size() * 2);
    size_t bufsize {buf.size()};
    u8_normalize(UNINORM_NFD,
                 reinterpret_cast<const uint8_t*>(text.data()),
                 text.size(),
                 reinterpret_cast<uint8_t*>(buf.data()),
                 &bufsize);

    // hack: now remove all bytes with a value higher than 127
    auto it = remove_if(buf.begin(), buf.end(), [](uint8_t c) {
        return c > 127;
    });
    buf.erase(it, buf.end());
    return {buf.data()};
}