I have just started learning machine learning and am trying to understand the sequential feature selector concept with sklearn. I am using Anaconda and a Jupyter notebook for this proof of concept. I have imported the
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
package. By default, mlxtend is not part of Anaconda, so I installed it with the pip install mlxtend command.
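To confirm that the Jupyter kernel actually picks up the pip-installed package, a quick check like the following can be run (illustrative only; the printed version depends on the environment):

# Sanity check: confirm mlxtend is importable from the notebook kernel
# (the exact version string will vary by installation).
import mlxtend
print(mlxtend.__version__)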
I used sklearn's Boston house-price dataset for this proof of concept and ran the code below. When fitting sfs I ran into an error.
How can I fix this error?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.metrics import roc_curve, roc_auc_score
# Imports needed for this snippet to run on its own:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
%matplotlib inline

# Load the Boston housing data into a DataFrame
data = load_boston()
print(data.keys())
X = pd.DataFrame(data.data)
X.columns = data.feature_names
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Forward sequential feature selection down to 7 features
sfs1 = sfs(RandomForestRegressor(n_jobs=1),
           k_features=7,
           forward=True,
           floating=False,
           verbose=3,
           scoring='roc_auc',
           cv=3)
sfs1 = sfs1.fit(X_train, y_train)
Error
ValueError                                Traceback (most recent call last)
<ipython-input-77-96b29660189d> in <module>
      1 #sfs1.fit(X_train,y_train)
      2 X_train.shape
----> 3 sfs2=sfs1.fit(X_train,y_train)

C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\feature_selection\sequential_feature_selector.py in fit(self, X, y, custom_feature_names, **fit_params)
    371                 X=X_,
    372                 y=y,
--> 373                 **fit_params
    374             )
    375         else:

C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\feature_selection\sequential_feature_selector.py in _inclusion(self, orig_set, subset, X, y, ignore_feature, **fit_params)
    528                 tuple(subset | {feature}),
    529                 **fit_params)
--> 530             for feature in remaining
    531             if feature != ignore_feature)
    532 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
    915             # remaining jobs.
    916             self._iterating = False
--> 917             if self.dispatch_one_batch(iterator):
    918                 self._iterating = self._original_iterator is not None
    919 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
    757                 return False
    758             else:
--> 759                 self._dispatch(tasks)
    760                 return True
    761 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
    714         with self._lock:
    715             job_idx = len(self._jobs)
--> 716             job = self._backend.apply_async(batch, callback=cb)
    717             # A job can complete so quickly than its callback is
    718             # called before we get here, causing self._jobs to

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
    180     def apply_async(self, func, callback=None):
    181         """Schedule a func to be run"""
--> 182         result = ImmediateResult(func)
    183         if callback:
    184             callback(result)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
    547         # Don't delay the application, to avoid keeping the input
    548         # arguments in memory
--> 549         self.results = batch()
    550 
    551     def get(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\feature_selection\sequential_feature_selector.py in _calc_score(selector, X, y, indices, **fit_params)
     32                                  n_jobs=1,
     33                                  pre_dispatch=selector.pre_dispatch,
---> 34                                  fit_params=fit_params)
     35     else:
     36         selector.est_.fit(X[:, indices], y, **fit_params)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
    400                                 fit_params=fit_params,
    401                                 pre_dispatch=pre_dispatch,
--> 402                                 error_score=error_score)
    403     return cv_results['test_score']
    404 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
    238             return_times=True, return_estimator=return_estimator,
    239             error_score=error_score)
--> 240         for train, test in cv.split(X, y, groups))
    241 
    242     zipped_scores = list(zip(*scores))

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
    915             # remaining jobs.
    916             self._iterating = False
--> 917             if self.dispatch_one_batch(iterator):
    918                 self._iterating = self._original_iterator is not None
    919 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
    757                 return False
    758             else:
--> 759                 self._dispatch(tasks)
    760                 return True
    761 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
    714         with self._lock:
    715             job_idx = len(self._jobs)
--> 716             job = self._backend.apply_async(batch, callback=cb)
    717             # A job can complete so quickly than its callback is
    718             # called before we get here, causing self._jobs to

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
    180     def apply_async(self, func, callback=None):
    181         """Schedule a func to be run"""
--> 182         result = ImmediateResult(func)
    183         if callback:
    184             callback(result)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
    547         # Don't delay the application, to avoid keeping the input
    548         # arguments in memory
--> 549         self.results = batch()
    550 
    551     def get(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
    566         fit_time = time.time() - start_time
    567         # _score will return dict if is_multimetric is True
--> 568         test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
    569         score_time = time.time() - start_time - fit_time
    570         if return_train_score:

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
    603     """
    604     if is_multimetric:
--> 605         return _multimetric_score(estimator, X_test, y_test, scorer)
    606     else:
    607         if y_test is None:

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
    633             score = scorer(estimator, X_test)
    634         else:
--> 635             score = scorer(estimator, X_test, y_test)
    636 
    637         if hasattr(score, 'item'):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\scorer.py in __call__(self, clf, X, y, sample_weight)
    174         y_type = type_of_target(y)
    175         if y_type not in ("binary", "multilabel-indicator"):
--> 176             raise ValueError("{0} format is not supported".format(y_type))
    177 
    178         if is_regressor(clf):

ValueError: continuous format is not supported
Answer:
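The traceback ends inside sklearn's roc_auc scorer: type_of_target(y) reports the Boston house-price target as "continuous", and roc_auc only accepts binary (or multilabel-indicator) targets, which is why the fit raises ValueError: continuous format is not supported. Below is a minimal sketch of one likely fix, assuming the goal really is regression; the choice of scoring string ('r2' here, 'neg_mean_squared_error' would also work) is an assumption and not taken from the original post, and it reuses X_train and y_train as defined above.

# Sketch: same selector, but with a regression scorer instead of 'roc_auc'
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestRegressor

sfs1 = SFS(RandomForestRegressor(n_jobs=1),
           k_features=7,
           forward=True,
           floating=False,
           verbose=3,
           scoring='r2',   # regression metric; 'roc_auc' needs a binary target
           cv=3)
sfs1 = sfs1.fit(X_train, y_train)
print(sfs1.k_feature_names_)  # names of the 7 selected features

If the intent had instead been a classification task, the alternative would be to use a classifier (e.g. RandomForestClassifier) with a binary/categorical target, at which point scoring='roc_auc' is valid.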