# -*- encoding: utf-8 -*-
from typing import Union
import numpy as np
import pandas as pd
from datefinder import find_dates
from scipy.sparse import issparse
from sklearn.utils.multiclass import type_of_target
[docs]def convert_to_num(Ybin):
"""
Convert binary targets to numeric vector
typically classification target values
:param Ybin:
:return:
"""
result = np.array(Ybin)
if len(Ybin.shape) != 1:
result = np.dot(Ybin, range(Ybin.shape[1]))
return result
[docs]def convert_to_bin(Ycont, nval, verbose=True):
# Convert numeric vector to binary (typically classification target values)
if verbose:
pass
Ybin = [[0] * nval for x in range(len(Ycont))]
for i in range(len(Ybin)):
line = Ybin[i]
line[np.int(Ycont[i])] = 1
Ybin[i] = line
return Ybin
[docs]def predict_RAM_usage(X, categorical):
# Return estimated RAM usage of dataset after OneHotEncoding in bytes.
estimated_columns = 0
for i, cat in enumerate(categorical):
if cat:
unique_values = np.unique(X[:, i])
num_unique_values = np.sum(np.isfinite(unique_values))
estimated_columns += num_unique_values
else:
estimated_columns += 1
estimated_ram = estimated_columns * X.shape[0] * X.dtype.itemsize
return estimated_ram
[docs]def softmax(df):
if len(df.shape) == 1:
df[df > 20] = 20
df[df < -20] = -20
ppositive = 1 / (1 + np.exp(-df))
ppositive[ppositive > 0.999999] = 1
ppositive[ppositive < 0.0000001] = 0
return np.transpose(np.array((1 - ppositive, ppositive)))
else:
# Compute the Softmax like it is described here:
# http://www.iro.umontreal.ca/~bengioy/dlbook/numerical.html
tmp = df - np.max(df, axis=1).reshape((-1, 1))
tmp = np.exp(tmp)
return tmp / np.sum(tmp, axis=1).reshape((-1, 1))
[docs]def densify(X):
if X is None:
return X
if issparse(X):
return X.todense().getA()
else:
return X
[docs]def is_target_need_label_encode(target_col):
if is_cat(target_col, True):
unk = np.unique(target_col)
wanted = np.arange(len(unk), dtype='int32')
if not np.all(unk == wanted):
return True
return False
[docs]def is_cat(s: Union[pd.Series, np.ndarray], consider_ordinal_as_cat):
if not isinstance(s, pd.Series):
s = pd.Series(s)
if s.dtype == object:
for elem in s:
if isinstance(elem, (float, int)):
continue
else:
return True
s = s.astype('float32')
if consider_ordinal_as_cat:
valid_types = ["multiclass"]
if consider_ordinal_as_cat in (2, "binary"):
valid_types += ["binary"]
s = s.dropna()
tp = type_of_target(s)
if tp in valid_types:
return True
return False
[docs]def finite_array(array):
"""
Replace NaN and Inf (there should not be any!)
:param array:
:return:
"""
a = np.ravel(array)
maxi = np.nanmax(a[np.isfinite(a)])
mini = np.nanmin(a[np.isfinite(a)])
array[array == float('inf')] = maxi
array[array == float('-inf')] = mini
return array
[docs]def is_highR_nan(s: pd.Series, threshold):
return (np.count_nonzero(pd.isna(s)) / s.size) > threshold
[docs]def is_highC_cat(s: pd.Series, threshold):
return (np.unique(s.astype("str")).size / s.size) > threshold
[docs]def is_nan(s: pd.Series):
return np.any(pd.isna(s))
[docs]def to_array(X):
if X is None:
return X
if isinstance(X, (pd.DataFrame, pd.Series)):
return X.values
return X
[docs]def is_text(s, cat_been_checked=False):
if not isinstance(s, pd.Series):
s = pd.Series(s)
if not cat_been_checked:
if not is_cat(s, consider_ordinal_as_cat=False):
return False
s = s.dropna()
s = s.astype(str)
if is_highC_cat(s, 0.8):
s = s.str.split(" ")
s = s.apply(len)
return np.all(s >= 2)
return False
[docs]def is_date(s, cat_been_checked=False):
if not isinstance(s, pd.Series):
s = pd.Series(s)
if not cat_been_checked:
if not is_cat(s, consider_ordinal_as_cat=False):
return False
s = s.dropna()
s = s.astype(str)
return all(bool(list(find_dates(elem, strict=True))) for elem in s)
if __name__ == '__main__':
print(is_text([
"hello world",
"good morning"
"it is a good day"
]))
print(is_text([
"hello world",
"good morning",
0
]))
print(is_text([
"hello world",
"good morning",
"omg"
]))
print(is_text([
"hello world",
"hello world",
"hello world",
]))
print(is_date([
'2018',
'2016',
'658.2.3'
]))
print(is_date([
'456',
'456',
'256'
]))