How can I implement SMOTE inside a ColumnTransformer?
I'm trying to implement SMOTENC inside a ColumnTransformer. However, I'm getting an error. The code and the error are provided below.
# Create a boolean mask selecting the object-dtype (categorical) columns,
# then collect the names of those columns in a list.
categorical_feature_mask = X_train.dtypes == object
categorical_columns = X_train.columns[categorical_feature_mask].tolist()
print(categorical_columns)
from imblearn.over_sampling import SMOTENC
# Split the columns into numeric and non-numeric groups and build one
# preprocessing pipeline for each group.
num_features= X_train.select_dtypes(include=[np.number]).columns
cat_features = X_train.select_dtypes(exclude=[np.number]).columns
# Non-numeric columns: impute with the most frequent value, then apply LabelBinarizer.
# NOTE(review): LabelBinarizer is meant for target labels; presumably an encoder
# for feature columns (e.g. OneHotEncoder) is what is wanted here -- confirm.
cat_transformer = Pipeline(steps=[('imp_c', SimpleImputer(strategy='most_frequent')),
('label_bina', LabelBinarizer())])
# Numeric columns: impute with the median, then apply StandardScaler.
scale_transformer=Pipeline(steps=[('imp_m',SimpleImputer(strategy='median')),
('std',StandardScaler())])
smote=SMOTENC(categorical_features=categorical_columns,random_state=99)
# NOTE: the 'smote' entry below is a 2-tuple, while the 'num' and 'cat' entries
# are 3-tuples (name, transformer, columns). ColumnTransformer unpacks every
# entry into three values, which is where the reported
# "not enough values to unpack (expected 3, got 2)" error is raised.
col_transform = ColumnTransformer(transformers=[
('num', scale_transformer, num_features),
('cat', cat_transformer, cat_features),
('smote', smote )],remainder='passthrough')
# Fit a DecisionTreeClassifier on the preprocessed data (model evaluation is not shown here).
dt=DecisionTreeClassifier(random_state=99)
pl_dt=Pipeline(steps=[('transform',col_transform),('dt',dt)])
pl_dt.fit(X_train,np.ravel(y_train))
While running this I get the error: not enough values to unpack (expected 3, got 2). More precisely:
ValueError Traceback (most recent call last)
<ipython-input-34-a874d44f98ee> in <module>
2 dt=DecisionTreeClassifier(random_state=99)
3 pl_dt=Pipeline(steps=[('transform',col_transform),('dt',dt)])
----> 4 pl_dt.fit(X_train,np.ravel(y_train))
5
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
328 """
329 fit_params_steps = self._check_fit_params(**fit_params)
--> 330 Xt = self._fit(X, y, **fit_params_steps)
331 with _print_elapsed_time('Pipeline',
332 self._log_message(len(self.steps) - 1)):
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
294 message_clsname='Pipeline',
295 message=self._log_message(step_idx),
--> 296 **fit_params_steps[name])
297 # Replace the transformer of the step with the fitted
298 # transformer. This is necessary when loading the transformer
~/anaconda3/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
353
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
356
357 def call_and_shelve(self, *args, **kwargs):
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
738 with _print_elapsed_time(message_clsname, message):
739 if hasattr(transformer, 'fit_transform'):
--> 740 res = transformer.fit_transform(X, y, **fit_params)
741 else:
742 res = transformer.fit(X, y, **fit_params).transform(X)
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
525 # set n_features_in_ attribute
526 self._check_n_features(X, reset=True)
--> 527 self._validate_transformers()
528 self._validate_column_callables(X)
529 self._validate_remainder(X)
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _validate_transformers(self)
274 return
275
--> 276 names, transformers, _ = zip(*self.transformers)
277
278 # validate names
ValueError: not enough values to unpack (expected 3, got 2)
How can I solve the above error?
Solution 1:[1]
`ColumnTransformer` is used to apply transformations to a subset of the columns of the dataset. Since you want to apply `SMOTENC` to the full dataset, just put it outside the `ColumnTransformer`. Also, since `SMOTENC` does not have a `fit_transform` method, we cannot use it with a scikit-learn pipeline. We need to use an imblearn pipeline:
# imblearn's Pipeline is used instead of scikit-learn's because SMOTENC has no
# fit_transform method and therefore cannot be a step of a scikit-learn Pipeline.
from imblearn.pipeline import Pipeline
# (presumably the remaining definitions from the question are elided here)
...
# NOTE(review): categorical_columns is a list of column names taken from X_train;
# confirm that SMOTENC can still resolve them on the output of col_transform.
smote = SMOTENC(categorical_features=categorical_columns, random_state=99)
# The ColumnTransformer now holds only the per-column preprocessing;
# SMOTENC is no longer listed among its transformers.
col_transform = ColumnTransformer(transformers=[
('num', scale_transformer, num_features),
('cat', cat_transformer, cat_features)],
remainder='passthrough')
dt = DecisionTreeClassifier(random_state=99)
# SMOTENC runs as its own pipeline step, between preprocessing and the classifier.
pl_dt = Pipeline(steps=[('transform',col_transform), ('smote',smote), ('dt',dt)])
pl_dt.fit(X_train,np.ravel(y_train))
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source |
---|---|
Solution 1 |