Feature-Engine RareLabelEncoder: ValueError: could not convert string to float: 'Rare'

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import RareLabelEncoder
from feature_engine.encoding import MeanEncoder
from feature_engine.encoding import OrdinalEncoder
from feature_engine.transformation import BoxCoxTransformer, PowerTransformer

high_card_cols = ['brand', 'model', 'location']
cat_cols = ['fuel_type', 'transmission', 'is_first_owner']

# NOTE(review): each tuple below is a *parallel* branch of the ColumnTransformer,
# so 'brand'/'model' appear in two branches -- this is the source of the error
# discussed in this question (see Solution 3).
processor = make_column_transformer(
    (RareLabelEncoder(n_categories = 9), ['brand', 'model']), # to group rare categorical observations
    (MeanEncoder(), high_card_cols), # To encode categorical observations with target mean
    (OrdinalEncoder(), cat_cols), # to encode low-cardinality variables
    (PowerTransformer(), ['milage_kmpl']), # transform continuous variables using Exponential transformation
    (BoxCoxTransformer(), ['kilometers_driven', 'engine', 'power']),# transform continuous variables using BoxCox
    remainder = "passthrough"
)

I am currently working on a regression task. I have 2 categorical columns that have high cardinality and rare observations. I created a pipeline that includes RareLabelEncoder followed by MeanEncoder and other encoders.

When I try to fit a simple linear regression model, I get the following error:

ValueError: could not convert string to float: 'Rare'

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

# Chain the column transformer with scaling and a linear model, then fit on
# the training data (y is the 'price' column of the target frame).
lr_pipe = make_pipeline(
    (processor),
    (StandardScaler()), 
    (LinearRegression())
)

lr_pipe.fit(X_train, y_train.price)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-91-1c31eaf7c59a> in <module>
      8 )
      9 
---> 10 lr_pipe.fit(X_train, y_train.price)

~\anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
    339         """
    340         fit_params_steps = self._check_fit_params(**fit_params)
--> 341         Xt = self._fit(X, y, **fit_params_steps)
    342         with _print_elapsed_time('Pipeline',
    343                                  self._log_message(len(self.steps) - 1)):

~\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
    301                 cloned_transformer = clone(transformer)
    302             # Fit or load from cache the current transformer
--> 303             X, fitted_transformer = fit_transform_one_cached(
    304                 cloned_transformer, X, y, None,
    305                 message_clsname='Pipeline',

~\anaconda3\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
    350 
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353 
    354     def call_and_shelve(self, *args, **kwargs):

~\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    752     with _print_elapsed_time(message_clsname, message):
    753         if hasattr(transformer, 'fit_transform'):
--> 754             res = transformer.fit_transform(X, y, **fit_params)
    755         else:
    756             res = transformer.fit(X, y, **fit_params).transform(X)

~\anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
    700         else:
    701             # fit method of arity 2 (supervised transformation)
--> 702             return self.fit(X, y, **fit_params).transform(X)
    703 
    704 

~\anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in fit(self, X, y, sample_weight)
    728         # Reset internal state before fitting
    729         self._reset()
--> 730         return self.partial_fit(X, y, sample_weight)
    731 
    732     def partial_fit(self, X, y=None, sample_weight=None):

~\anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in partial_fit(self, X, y, sample_weight)
    764         """
    765         first_call = not hasattr(self, "n_samples_seen_")
--> 766         X = self._validate_data(X, accept_sparse=('csr', 'csc'),
    767                                 estimator=self, dtype=FLOAT_DTYPES,
    768                                 force_all_finite='allow-nan', reset=first_call)

~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
    419             out = X
    420         elif isinstance(y, str) and y == 'no_validation':
--> 421             X = check_array(X, **check_params)
    422             out = X
    423         else:

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64 
     65             # extra_args > 0

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
    671                     array = array.astype(dtype, casting="unsafe", copy=False)
    672                 else:
--> 673                     array = np.asarray(array, order=order, dtype=dtype)
    674             except ComplexWarning as complex_warning:
    675                 raise ValueError("Complex data not supported\n"

~\anaconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
     81 
     82     """
---> 83     return array(a, dtype, copy=False, order=order)
     84 
     85 

ValueError: could not convert string to float: 'Rare'

How can I overcome this issue?



Solution 1:[1]

The beauty of Feature-engine transformers is that you can select the variables directly at the transformer, so there is no need to use sklearn's column transformer at all. You can place all Feature-engine transformers directly within a Pipeline.

# Feature-engine transformers accept a `variables` argument, so each one can
# select its own columns and they can all be chained directly in a Pipeline --
# no ColumnTransformer needed. (The original snippet had unbalanced
# parentheses and an `etc...` placeholder; this version is syntactically valid.)
lr_pipe = make_pipeline(
    RareLabelEncoder(n_categories=9, variables=['brand', 'model']),
    MeanEncoder(variables=high_card_cols),
    OrdinalEncoder(variables=cat_cols),
    # ... add the remaining transformers here ...
    StandardScaler(),
    LinearRegression(),
)

lr_pipe.fit(X_train, y_train.price)

Solution 2:[2]

Update:

I managed to solve the problem the following way:

I added RareLabelEncoder to the pipeline rather than to the ColumnTransformer, which solved the issue for me.

# Run RareLabelEncoder on the full frame *before* the ColumnTransformer, so the
# grouped 'Rare' labels are subsequently encoded instead of passing through as
# raw strings. (0.002 is the `tol` frequency threshold for rare categories.)
lr_pipe = make_pipeline(
    (RareLabelEncoder(0.002, variables = ['brand', 'model'])),
    (nontree_processor),
    (StandardScaler()), 
    (LinearRegression())
)

Solution 3:[3]

ColumnTransformer applies its transformers in parallel, so the brand column actually shows up twice coming out of the processor: once with rare labels grouped, but not otherwise encoded (throwing the error), and then again mean-encoded (but with rare groups getting different values). You can use pipelines to get around that:

cat_cols = ['fuel_type', 'transmission', 'is_first_owner']

# Nest RareLabelEncoder -> MeanEncoder in a single pipeline so the same columns
# are transformed *sequentially*. ColumnTransformer branches run in parallel,
# so listing 'brand'/'model' in two separate branches would emit them twice --
# once merely rare-grouped (still strings), hence the original ValueError.
brandmodel_pipe = make_pipeline(
    RareLabelEncoder(n_categories=9),
    MeanEncoder(),
)

processor = make_column_transformer(
    (brandmodel_pipe, ['brand', 'model']),
    (MeanEncoder(), ['location']),  # 'location' keeps its own mean-encoding branch
    (OrdinalEncoder(), cat_cols),
    (PowerTransformer(), ['milage_kmpl']),
    (BoxCoxTransformer(), ['kilometers_driven', 'engine', 'power']),
    remainder = "passthrough"
)

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 Sole Galli
Solution 2 Joe
Solution 3 Ben Reiniger