Source code for mlinsights.mlmodel.categories_to_integers
import numpy
import pandas
from sklearn.base import BaseEstimator, TransformerMixin
[docs]
class CategoriesToIntegers(BaseEstimator, TransformerMixin):
"""
Does something similar to what
`DictVectorizer
<http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html>`_
does but in a transformer. The method *fit* retains all categories,
the method *transform* transforms categories into integers.
Categories are sorted by columns. If the method *transform* tries to convert
a categories which was not seen by method *fit*, it can raise an exception
or ignore it and replace it by zero.
:param columns: specify a columns selection
:param remove: modalities to remove
:param skip_errors: skip when a new categories appear (no 1)
:param single: use a single column per category, do not multiply them for each value
The logging function displays a message when a new dense and big matrix
is created when it should be sparse. A sparse matrix should be allocated instead.
.. exref::
:title: DictVectorizer or CategoriesToIntegers
:tag: sklearn
Example which transforms text into integers:
.. runpython::
:showcode:
import pandas
from mlinsights.mlmodel import CategoriesToIntegers
df = pandas.DataFrame([{"cat": "a"}, {"cat": "b"}])
trans = CategoriesToIntegers()
trans.fit(df)
newdf = trans.transform(df)
print(newdf)
"""
def __init__(self, columns=None, remove=None, skip_errors=False, single=False):
BaseEstimator.__init__(self)
TransformerMixin.__init__(self)
self.columns = (
columns if isinstance(columns, list) or columns is None else [columns]
)
self.skip_errors = skip_errors
self.remove = remove
self.single = single
def __str__(self):
"""
usual
"""
return self.__repr__()
[docs]
def fit(self, X, y=None, **fit_params):
"""
Makes the list of all categories in input *X*.
*X* must be a dataframe.
:param X: iterable
Training data
:param y: iterable, default=None
Training targets.
:param fit_params: additional fit params
:return: self
"""
if not isinstance(X, pandas.DataFrame):
raise TypeError(f"this transformer only accept Dataframes, not {type(X)}")
columns = (
self.columns
if self.columns
else [c for c, d in zip(X.columns, X.dtypes) if d in (object, str)]
)
self._fit_columns = columns
max_cat = max(len(X) // 2 + 1, 10000)
self._categories = {}
for c in columns:
distinct = set(X[c].dropna())
nb = len(distinct)
if nb >= max_cat:
raise ValueError(
f"Too many categories ({nb}) for one column '{c}' max_cat={max_cat}"
)
self._categories[c] = {c: i for i, c in enumerate(list(sorted(distinct)))}
self._schema = self._build_schema()
return self
def _build_schema(self):
"""
Concatenates all the categories
given the information stored in *_categories*.
@return list of columns, beginning of each
"""
schema = []
position = {}
new_vector = {}
last = 0
for c, v in self._categories.items():
sch = [(_[1], f"{c}={_[1]}") for _ in sorted((n, d) for d, n in v.items())]
if self.remove:
sch = [d for d in sch if d[1] not in self.remove]
position[c] = last
new_vector[c] = {d[0]: i for i, d in enumerate(sch)}
last += len(sch)
schema.extend(_[1] for _ in sch)
return schema, position, new_vector
[docs]
def transform(self, X, y=None):
"""
Transforms categories in numerical features based on the list
of categories found by method *fit*.
*X* must be a dataframe. The function does not preserve
the order of the columns.
:param X: iterable
Training data
:param y: iterable, default=None
Training targets.
:return: DataFrame, *X* with categories.
"""
if not isinstance(X, pandas.DataFrame):
raise TypeError(f"X is not a dataframe: {type(X)}")
if self.single:
b = not self.skip_errors
def transform(v, vec):
"transform a vector"
if v in vec:
return vec[v]
if v is None:
return numpy.nan
if isinstance(v, float) and numpy.isnan(v):
return numpy.nan
if not self.skip_errors:
lv = list(sorted(vec))
if len(lv) > 20:
lv = lv[:20]
lv.append("...")
m = "\n".join(map(str, lv))
raise ValueError(
f"Unable to find category value {v} "
f"type(v)={type(v)} among\n{m}"
)
return numpy.nan
sch, pos, new_vector = self._schema
X = X.copy()
for c in self._fit_columns:
X[c] = X[c].apply(lambda v, cv=c: transform(v, new_vector[cv]))
return X
else:
dfcat = X[self._fit_columns]
dfnum = X[[c for c in X.columns if c not in self._fit_columns]]
sch, pos, new_vector = self._schema
vec = new_vector
# new_size = X.shape[0] * len(sch)
res = numpy.zeros((X.shape[0], len(sch)))
res.fill(numpy.nan)
b = not self.skip_errors
for i, row in enumerate(dfcat.to_dict("records")):
for k, v in row.items():
if v is None or (isinstance(v, float) and numpy.isnan(v)):
# missing values
continue
if v not in vec[k]:
if b:
lv = list(sorted(vec[k]))
if len(lv) > 20:
lv = lv[:20]
lv.append("...")
m = "\n".join(map(str, lv))
raise ValueError(
f"Unable to find category value {k!r}: {v!r} "
f"type(v)={type(v)} among\n{m}"
)
p = pos[k]
else:
p = pos[k] + vec[k][v]
res[i, p] = 1.0
if dfnum.shape[1] > 0:
newdf = pandas.DataFrame(res, columns=sch, index=dfcat.index)
allnum = pandas.concat([dfnum, newdf], axis=1)
else:
allnum = pandas.DataFrame(res, columns=sch, index=dfcat.index)
return allnum
[docs]
def fit_transform(self, X, y=None, **fit_params):
"""
Fits and transforms categories in numerical features based on the list
of categories found by method *fit*.
*X* must be a dataframe. The function does not preserve
the order of the columns.
:param X: iterable
Training data
:param y: iterable, default=None
Training targets.
:param fit_params: additional fitting parameters
:return: Dataframe, *X* with categories.
"""
return self.fit(X, y=y, **fit_params).transform(X, y)