Source code for autoflow.utils.data

# -*- encoding: utf-8 -*-
from typing import Union

import numpy as np
import pandas as pd
from datefinder import find_dates
from scipy.sparse import issparse
from sklearn.utils.multiclass import type_of_target


def convert_to_num(Ybin):
    """
    Convert binary indicator (one-hot) targets to a numeric label vector.

    A 1-D input is returned as an array without further processing.  For any
    other input every row is dotted with ``[0, 1, ..., n_columns - 1]``, so a
    one-hot row yields the index of its hot column.

    :param Ybin: array-like target values, typically classification targets;
        either 1-D labels or a 2-D indicator matrix
    :return: ``numpy.ndarray`` of numeric target values
    """
    result = np.array(Ybin)
    # Inspect the converted array rather than the raw argument so that plain
    # (nested) lists, which have no ``shape`` attribute, are supported too.
    if result.ndim != 1:
        result = np.dot(result, np.arange(result.shape[1]))
    return result


def convert_to_bin(Ycont, nval, verbose=True):
    """
    Convert a numeric label vector to a binary indicator (one-hot) matrix.

    :param Ycont: iterable of numeric labels (typically classification target
        values); each label is truncated to ``int`` and used as a column index
    :param nval: number of columns of the indicator matrix
    :param verbose: currently unused; kept for backward compatibility
    :return: list of ``len(Ycont)`` rows, each a list of ``nval`` zeros with a
        single ``1`` at the label's position
    :raises IndexError: if a label does not fit into ``nval`` columns
    """
    rows = []
    # Iterate over the values directly (not ``Ycont[i]``) so that containers
    # with a non-positional index, e.g. a re-indexed pandas Series, work too.
    for label in Ycont:
        row = [0] * nval
        # Built-in ``int``: the ``np.int`` alias was removed in NumPy 1.24.
        row[int(label)] = 1
        rows.append(row)
    return rows


def predict_RAM_usage(X, categorical):
    """
    Return the estimated RAM usage, in bytes, of a dataset after one-hot
    encoding.

    Every column flagged as categorical is counted once per distinct finite
    value it holds; every other column is counted once.  The estimate is
    ``columns * rows * itemsize`` of ``X``.

    :param X: 2-D array indexed as ``X[:, i]``, exposing ``shape`` and ``dtype``
    :param categorical: iterable of truthy/falsy flags, one per column of ``X``
    :return: estimated number of bytes
    """

    def _encoded_width(col_idx, is_categorical):
        # A non-categorical column stays a single column.
        if not is_categorical:
            return 1
        distinct = np.unique(X[:, col_idx])
        # Non-finite entries (NaN/inf) do not contribute a column.
        return np.sum(np.isfinite(distinct))

    n_columns = sum(
        _encoded_width(idx, flag) for idx, flag in enumerate(categorical)
    )
    return n_columns * X.shape[0] * X.dtype.itemsize


def softmax(df):
    """
    Turn raw decision scores into class probabilities.

    * 1-D input: scores are clipped to ``[-20, 20]`` and passed through the
      logistic function; probabilities above ``0.999999`` are snapped to 1 and
      those below ``0.0000001`` to 0.  The result has two columns,
      ``(1 - p, p)``.
    * Otherwise: a row-wise softmax, computed after subtracting each row's
      maximum for numerical stability.

    The input is never modified.

    :param df: array-like of scores, 1-D or 2-D (rows are samples)
    :return: ``numpy.ndarray`` of probabilities with one row per sample
    """
    scores = np.asarray(df)
    if scores.ndim == 1:
        # ``np.clip`` returns a new array, so the caller's data is left intact
        # (in-place clipping would silently alter the caller's scores).
        clipped = np.clip(scores, -20, 20)
        ppositive = 1 / (1 + np.exp(-clipped))
        ppositive[ppositive > 0.999999] = 1
        ppositive[ppositive < 0.0000001] = 0
        return np.transpose(np.array((1 - ppositive, ppositive)))

    # Compute the Softmax like it is described here:
    # http://www.iro.umontreal.ca/~bengioy/dlbook/numerical.html
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)


def densify(X):
    """
    Return a dense ``numpy.ndarray`` for sparse input; pass anything else
    through unchanged.

    :param X: scipy sparse container, dense data, or ``None``
    :return: ``X.toarray()`` when ``X`` is sparse, otherwise ``X`` itself
        (including ``None``)
    """
    if X is None:
        return X
    if issparse(X):
        # ``toarray`` yields an ndarray directly for both sparse matrices and
        # sparse arrays; ``todense().getA()`` detours through ``np.matrix`` and
        # breaks for sparse arrays, whose ``todense`` already returns ndarray.
        return X.toarray()
    return X


def is_target_need_label_encode(target_col):
    """
    Tell whether a target column has to be label-encoded.

    Encoding is needed when ``is_cat(target_col, True)`` holds and the sorted
    unique values are not exactly ``0, 1, ..., n_unique - 1``.

    :param target_col: target values
    :return: ``True`` if the column needs label encoding, else ``False``
    """
    if not is_cat(target_col, True):
        return False
    labels = np.unique(target_col)
    expected = np.arange(len(labels), dtype='int32')
    return not np.all(labels == expected)


def is_cat(s: Union[pd.Series, np.ndarray], consider_ordinal_as_cat):
    """
    Decide whether a column should be treated as categorical.

    An object-dtype column containing any element that is not a ``float`` or
    ``int`` is categorical.  An object-dtype column made only of numbers is
    cast to ``float32`` and handled like a numeric column.  Numeric columns
    are categorical only if ``consider_ordinal_as_cat`` is truthy and
    ``type_of_target`` (evaluated without missing values) reports an accepted
    type.

    :param s: column values; wrapped in a ``pd.Series`` when needed
    :param consider_ordinal_as_cat: falsy to never treat numeric columns as
        categorical; truthy to accept ``"multiclass"`` targets; ``2`` or
        ``"binary"`` to accept ``"binary"`` targets as well
    :return: ``True`` if the column is categorical, else ``False``
    """
    series = s if isinstance(s, pd.Series) else pd.Series(s)

    if series.dtype == object:
        # One non-numeric element is enough to call the column categorical.
        if any(not isinstance(item, (float, int)) for item in series):
            return True
        series = series.astype('float32')

    if not consider_ordinal_as_cat:
        return False

    accepted = ["multiclass"]
    if consider_ordinal_as_cat in (2, "binary"):
        accepted.append("binary")
    return type_of_target(series.dropna()) in accepted


def finite_array(array):
    """
    Replace infinite entries in place (there should not be any!).

    ``+inf`` is overwritten with the largest finite value of the array and
    ``-inf`` with the smallest one.  NaN entries are left untouched.

    :param array: numeric array; it is modified in place
    :return: the same ``array`` object
    """
    flat = np.ravel(array)
    finite_values = flat[np.isfinite(flat)]
    # Both bounds are taken before anything is overwritten.
    upper = np.nanmax(finite_values)
    lower = np.nanmin(finite_values)
    array[array == float('inf')] = upper
    array[array == float('-inf')] = lower
    return array


def is_highR_nan(s: pd.Series, threshold):
    """
    Tell whether the share of missing values in ``s`` is strictly above
    ``threshold``.

    :param s: column to inspect
    :param threshold: ratio to compare the missing-value share against
    :return: truthy if ``missing / s.size > threshold``
    """
    n_missing = np.count_nonzero(pd.isna(s))
    missing_ratio = n_missing / s.size
    return missing_ratio > threshold


def is_highC_cat(s: pd.Series, threshold):
    """
    Tell whether the share of distinct values in ``s`` (compared as strings)
    is strictly above ``threshold``.

    :param s: column to inspect
    :param threshold: ratio to compare the distinct-value share against
    :return: truthy if ``n_distinct / s.size > threshold``
    """
    as_text = s.astype("str")
    n_distinct = np.unique(as_text).size
    return (n_distinct / s.size) > threshold


def is_nan(s: pd.Series):
    """
    Tell whether ``s`` contains at least one missing value.

    :param s: column to inspect
    :return: truthy if any element is flagged by ``pd.isna``
    """
    missing_mask = pd.isna(s)
    return np.any(missing_mask)


def to_array(X):
    """
    Unwrap a pandas container into its underlying values.

    :param X: ``pd.DataFrame``, ``pd.Series``, or any other object
        (including ``None``)
    :return: ``X.values`` for pandas containers; otherwise ``X`` unchanged
    """
    is_pandas = isinstance(X, (pd.DataFrame, pd.Series))
    return X.values if is_pandas else X


def is_text(s, cat_been_checked=False):
    """
    Heuristically decide whether a column holds free text.

    A column counts as text when it is categorical (per ``is_cat`` with
    ``consider_ordinal_as_cat=False``), its non-missing values, compared as
    strings, are highly distinct (``is_highC_cat`` with threshold ``0.8``),
    and every value consists of at least two space-separated tokens.

    :param s: column values; wrapped in a ``pd.Series`` when needed
    :param cat_been_checked: set to ``True`` to skip the ``is_cat`` check when
        the caller has already performed it
    :return: ``True`` if the column looks like text, else ``False``
    """
    if not isinstance(s, pd.Series):
        s = pd.Series(s)
    if not cat_been_checked and not is_cat(s, consider_ordinal_as_cat=False):
        return False
    s = s.dropna().astype(str)
    # Nothing left to inspect: an all-missing column is not text, and the
    # cardinality ratio below would otherwise divide by zero.
    if s.empty:
        return False
    if not is_highC_cat(s, 0.8):
        return False
    token_counts = s.str.split(" ").apply(len)
    # Normalise to a plain ``bool`` so every path returns the same type.
    return bool(np.all(token_counts >= 2))


def is_date(s, cat_been_checked=False):
    """
    Heuristically decide whether a column holds dates.

    A column counts as a date column when it is categorical (per ``is_cat``
    with ``consider_ordinal_as_cat=False``) and ``find_dates(..., strict=True)``
    yields at least one match for every non-missing value, taken as a string.

    :param s: column values; wrapped in a ``pd.Series`` when needed
    :param cat_been_checked: set to ``True`` to skip the ``is_cat`` check when
        the caller has already performed it
    :return: ``True`` if every non-missing value contains a date, else
        ``False`` (also when no non-missing value exists)
    """
    if not isinstance(s, pd.Series):
        s = pd.Series(s)
    if not cat_been_checked and not is_cat(s, consider_ordinal_as_cat=False):
        return False
    s = s.dropna().astype(str)
    # ``all()`` over nothing is vacuously True; an all-missing column must not
    # be reported as a date column.
    if s.empty:
        return False
    return all(bool(list(find_dates(elem, strict=True))) for elem in s)


if __name__ == '__main__':
    # Ad-hoc smoke checks for the text/date heuristics; results are printed
    # for manual inspection.
    print(is_text([
        "hello world",
        # The comma after the next item was missing, which silently glued it
        # to the following literal and left only two samples in the list.
        "good morning",
        "it is a good day"
    ]))
    # Mixed strings and a number.
    print(is_text([
        "hello world",
        "good morning",
        0
    ]))
    # One sample is a single token.
    print(is_text([
        "hello world",
        "good morning",
        "omg"
    ]))
    # Identical samples (low cardinality).
    print(is_text([
        "hello world",
        "hello world",
        "hello world",
    ]))
    print(is_date([
        '2018',
        '2016',
        '658.2.3'
    ]))
    print(is_date([
        '456',
        '456',
        '256'
    ]))