目标: 使用sklearn基于整数和对象类型特征预测结果。
我使用了来自Kaggle的以下数据集: 足球数据集
这是我的笔记本: Kaggle笔记本
库
- scikit-learn == 0.22.1
我创建了一个几乎可以工作的管道:
import pandas as pdimport numpy as npfrom sklearn.model_selection import train_test_splitfrom sklearn.compose import ColumnTransformerfrom sklearn.pipeline import Pipelinefrom sklearn.impute import SimpleImputerfrom sklearn.preprocessing import OneHotEncoder, StandardScalerfrom sklearn.ensemble import RandomForestClassifier# 读取数据df = total_df.copy()# 删除缺少目标的行df.dropna(axis=0, subset=['result'], inplace=True)# 从预测变量中分离目标y = df.result X = df.drop(['result'], axis=1)# 从训练数据中分离验证集X_train_full, X_test_full, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)integer_features = list(X.columns[X.dtypes == 'int64'])#continuous_features = list(X.columns[X.dtypes == 'float64'])categorical_features = list(X.columns[X.dtypes == 'object'])# 只保留选定的列my_cols = categorical_features + integer_featuresX_train = X_train_full[my_cols].copy()X_test = X_test_full[my_cols].copy()integer_transformer = Pipeline(steps = [ ('imputer', SimpleImputer(strategy = 'most_frequent')), ('scaler', StandardScaler())])categorical_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore'))])preprocessor = ColumnTransformer( transformers=[ ('ints', integer_transformer, integer_features), ('cat', categorical_transformer, categorical_features)])base = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', RandomForestClassifier())])# 对训练数据进行预处理,拟合模型 base.fit(X_train, y_train)
我收到了一个错误:
ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed
这是完整的回溯信息:
---------------------------------------------------------------------------KeyError Traceback (most recent call last)/opt/conda/lib/python3.7/site-packages/sklearn/utils/__init__.py in _determine_key_type(key, accept_slice) 255 try:--> 256 return dtype_to_str[type(key)] 257 except KeyError:KeyError: <class 'sqlalchemy.sql.elements.quoted_name'>During handling of the above exception, another exception occurred:ValueError Traceback (most recent call last)<ipython-input-13-702987dff390> in <module> 47 48 # Preprocessing of training data, fit model---> 49 base.fit(X_train, y_train) 50 51 base.predict(X_test)/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params) 348 This estimator 349 """--> 350 Xt, fit_params = self._fit(X, y, **fit_params) 351 with _print_elapsed_time('Pipeline', 352 self._log_message(len(self.steps) - 1)):/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params) 313 message_clsname='Pipeline', 314 message=self._log_message(step_idx),--> 315 **fit_params_steps[name]) 316 # Replace the transformer of the step with the fitted 317 # transformer. This is necessary when loading the transformer/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs) 353 354 def __call__(self, *args, **kwargs):--> 355 return self.func(*args, **kwargs) 356 357 def call_and_shelve(self, *args, **kwargs):/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params) 726 with _print_elapsed_time(message_clsname, message): 727 if hasattr(transformer, 'fit_transform'):--> 728 res = transformer.fit_transform(X, y, **fit_params) 729 else: 730 res = transformer.fit(X, y, **fit_params).transform(X)/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y) 514 self._validate_transformers() 515 self._validate_column_callables(X)--> 516 self._validate_remainder(X) 517 518 result = self._fit_transform(X, y, _fit_transform_one)/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _validate_remainder(self, X) 316 if (hasattr(X, 'columns') and 317 any(_determine_key_type(cols) == 'str'--> 318 for cols in self._columns)): 319 self._df_columns = X.columns 320 /opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in <genexpr>(.0) 316 if (hasattr(X, 'columns') and 317 any(_determine_key_type(cols) == 'str'--> 318 for cols in self._columns)): 319 self._df_columns = X.columns 320 /opt/conda/lib/python3.7/site-packages/sklearn/utils/__init__.py in _determine_key_type(key, accept_slice) 275 if isinstance(key, (list, tuple)): 276 unique_key = set(key)--> 277 key_type = {_determine_key_type(elt) for elt in unique_key} 278 if not key_type: 279 return None/opt/conda/lib/python3.7/site-packages/sklearn/utils/__init__.py in <setcomp>(.0) 275 if isinstance(key, (list, tuple)): 276 unique_key = set(key)--> 277 key_type = {_determine_key_type(elt) for elt in unique_key} 278 if not key_type: 279 return None/opt/conda/lib/python3.7/site-packages/sklearn/utils/__init__.py in _determine_key_type(key, accept_slice) 256 return dtype_to_str[type(key)] 257 except KeyError:--> 258 raise ValueError(err_msg) 259 if isinstance(key, slice): 260 if not accept_slice:ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed
任何帮助将不胜感激!
编辑: 错误说明“只允许标量、列表或全部整数或全部字符串的切片,或布尔掩码”。 integer_features
和 categorical_features
是包含仅列名称字符串的列表。
回答:
您对integer_features和categorical_features使用了列表,而转换器需要索引类型。
categorical_features = X.select_dtypes(include="object").columnsinteger_features = X.select_dtypes(exclude="object").columns
更改这些,将解决您的错误。 🙂