Feature-Engine RareLabelEncoder: ValueError: could not convert string to float: 'Rare'
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import RareLabelEncoder
from feature_engine.encoding import MeanEncoder
from feature_engine.encoding import OrdinalEncoder
from feature_engine.transformation import BoxCoxTransformer, PowerTransformer
# Column groups used by the preprocessing step below.
high_card_cols = ['brand', 'model', 'location']
cat_cols = ['fuel_type', 'transmission', 'is_first_owner']
# NOTE: a column transformer applies each transformer to its own columns
# independently (in parallel), not one after another. 'brand' and 'model' are
# therefore emitted twice: as rare-grouped strings by RareLabelEncoder and as
# mean-encoded numbers by MeanEncoder.
processor = make_column_transformer(
(RareLabelEncoder(n_categories = 9), ['brand', 'model']), # group rare categorical observations
(MeanEncoder(), high_card_cols), # encode high-cardinality categorical variables with the target mean
(OrdinalEncoder(), cat_cols), # encode low-cardinality variables
(PowerTransformer(), ['milage_kmpl']), # transform the continuous variable with a power (exponential) transformation
(BoxCoxTransformer(), ['kilometers_driven', 'engine', 'power']), # transform continuous variables with Box-Cox
remainder = "passthrough"
)
I am currently working on a regression task. I have two categorical columns that have high cardinality and rare categories. I created a pipeline that includes a RareLabelEncoder followed by a MeanEncoder and other encoders.
When I try to fit a simple linear regression model, I get the following error:
ValueError: could not convert string to float: 'Rare'
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
# Chain preprocessing, feature scaling and the linear model into one estimator,
# then fit it on the training data with `price` as the target.
lr_pipe = make_pipeline(processor, StandardScaler(), LinearRegression())
lr_pipe.fit(X_train, y_train.price)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-91-1c31eaf7c59a> in <module>
8 )
9
---> 10 lr_pipe.fit(X_train, y_train.price)
~\anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
339 """
340 fit_params_steps = self._check_fit_params(**fit_params)
--> 341 Xt = self._fit(X, y, **fit_params_steps)
342 with _print_elapsed_time('Pipeline',
343 self._log_message(len(self.steps) - 1)):
~\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
301 cloned_transformer = clone(transformer)
302 # Fit or load from cache the current transformer
--> 303 X, fitted_transformer = fit_transform_one_cached(
304 cloned_transformer, X, y, None,
305 message_clsname='Pipeline',
~\anaconda3\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~\anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
700 else:
701 # fit method of arity 2 (supervised transformation)
--> 702 return self.fit(X, y, **fit_params).transform(X)
703
704
~\anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in fit(self, X, y, sample_weight)
728 # Reset internal state before fitting
729 self._reset()
--> 730 return self.partial_fit(X, y, sample_weight)
731
732 def partial_fit(self, X, y=None, sample_weight=None):
~\anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in partial_fit(self, X, y, sample_weight)
764 """
765 first_call = not hasattr(self, "n_samples_seen_")
--> 766 X = self._validate_data(X, accept_sparse=('csr', 'csc'),
767 estimator=self, dtype=FLOAT_DTYPES,
768 force_all_finite='allow-nan', reset=first_call)
~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
419 out = X
420 elif isinstance(y, str) and y == 'no_validation':
--> 421 X = check_array(X, **check_params)
422 out = X
423 else:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
671 array = array.astype(dtype, casting="unsafe", copy=False)
672 else:
--> 673 array = np.asarray(array, order=order, dtype=dtype)
674 except ComplexWarning as complex_warning:
675 raise ValueError("Complex data not supported\n"
~\anaconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
ValueError: could not convert string to float: 'Rare'
How can I overcome this issue?
Solution 1:[1]
The beauty of Feature-engine transformers is that you can select the variables directly in the transformer, so there is no need to use sklearn's column transformer at all. You can place all Feature-engine transformers directly within a Pipeline.
# Every Feature-engine transformer selects its own columns through `variables`,
# so the steps can be chained sequentially in one pipeline: the rare-label
# grouping runs first and the mean encoding then sees the grouped categories.
lr_pipe = make_pipeline(
    RareLabelEncoder(n_categories=9, variables=['brand', 'model']),
    MeanEncoder(variables=high_card_cols),
    OrdinalEncoder(variables=cat_cols),
    # ... add the remaining Feature-engine transformers here, each with its
    # own `variables` argument ...
    StandardScaler(),
    LinearRegression(),
)
lr_pipe.fit(X_train, y_train.price)
Solution 2:[2]
Update:
I managed to solve the problem in the following way:
I added the RareLabelEncoder to the pipeline rather than to the column transformer. This solved the issue for me.
# Group rare labels first, as its own pipeline step, then hand the data to the
# column transformer, the scaler and the regressor.
lr_pipe = make_pipeline(
    RareLabelEncoder(0.002, variables=['brand', 'model']),
    nontree_processor,
    StandardScaler(),
    LinearRegression(),
)
Solution 3:[3]
`ColumnTransformer` applies its transformers in parallel, so the `brand` column actually shows up twice coming out of the `processor`: once with rare labels grouped but not otherwise encoded (which throws the error), and then again mean-encoded (but without the rare-label grouping, so the rare categories each get their own value). You can use pipelines to get around that:
cat_cols = ['fuel_type', 'transmission', 'is_first_owner']

# Sequential sub-pipeline for the two rare-label columns: group the rare
# categories first, then mean-encode the grouped result.
brandmodel_pipe = make_pipeline(
    RareLabelEncoder(n_categories=9),
    MeanEncoder(),
)

# Each entry pairs a transformer with the columns it handles; columns not
# listed are passed through unchanged.
processor = make_column_transformer(
    (brandmodel_pipe, ['brand', 'model']),
    (MeanEncoder(), ['location']),
    (OrdinalEncoder(), cat_cols),
    (PowerTransformer(), ['milage_kmpl']),
    (BoxCoxTransformer(), ['kilometers_driven', 'engine', 'power']),
    remainder="passthrough",
)
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source |
---|---|
Solution 1 | Sole Galli |
Solution 2 | Joe |
Solution 3 | Ben Reiniger |