在对逻辑回归、线性SVM和K最近邻分类器进行网格搜索(分别搜索正则化强度的倒数C和最近邻数量n_neighbors)时,网格搜索给出的最佳参数,在使用相同的训练数据集手动验证时并不是真正最佳的。代码如下:
# Notebook-style script: explore the OpenML "credit-g" dataset, compare three
# classifiers under cross-validation, tune them with GridSearchCV, and then
# re-score the same hyperparameter values by hand for comparison.
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Convert to a DataFrame.
df = fetch_openml('credit-g', as_frame=True).frame
# Bare expressions: these only display something when run as notebook cells.
df.head(5)
df.dtypes

# ---------------------------------------------------------------------------
# Univariate distributions of three features and of the target.
# ---------------------------------------------------------------------------
fig = plt.figure(figsize=(12, 12))
st = fig.suptitle("univariate distributions and target distribution", fontsize=20)

# Columns needed for this plot.
nfeatures = df[['duration', 'credit_amount', 'age']]
target = df['class']

# A 4x4 grid split into four 2x2 quadrants, one histogram per quadrant.
grid = plt.GridSpec(4, 4, hspace=0.4, wspace=0.4)
p1 = fig.add_subplot(grid[:2, :2])
p2 = fig.add_subplot(grid[:2, 2:])
p3 = fig.add_subplot(grid[2:, :2])
p4 = fig.add_subplot(grid[2:, 2:])

p1.hist(nfeatures['duration'])
p2.hist(nfeatures['credit_amount'])
p3.hist(nfeatures['age'])
p4.hist(target)

p1.set_xlabel('duration')
p2.set_xlabel('credit_amount')
p3.set_xlabel('age')
p4.set_xlabel('class')

# Leave room between the figure title and the subplots.
st.set_y(0.95)
fig.subplots_adjust(top=0.92)

# ---------------------------------------------------------------------------
# Train/test split and a first baseline on manually one-hot-encoded data.
# ---------------------------------------------------------------------------
columns = [column for column in df.columns if column != 'class']
X = df[columns]
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=11
)

# Basic preprocessing on the training set.
# Explicit alternative to the dtype-based selection below:
# numeric_columns = ['duration', 'credit_amount', 'installment_commitment',
#                    'residence_since', 'age', 'existing_credits',
#                    'num_dependents']
numeric_columns = df.select_dtypes(include=['float64']).columns
categorical_columns = [column for column in columns if column not in numeric_columns]

temp = X_train[categorical_columns]
X_train_ohe = pd.concat([pd.get_dummies(temp), X_train[numeric_columns]], axis=1)

lr = LogisticRegression(max_iter=1000)
cr = cross_val_score(lr, X_train_ohe, y_train)
print(cr)

# ---------------------------------------------------------------------------
# Pipelines: categorical encoding only.
# ---------------------------------------------------------------------------
# Data preparation for the categorical columns.
t1 = [('cat', OneHotEncoder(), categorical_columns)]
col_transform = ColumnTransformer(transformers=t1)

# The models to compare.
models = {
    'lr_model': LogisticRegression(max_iter=1000),
    'lsvm_model': LinearSVC(max_iter=2500),
    'knn_model': KNeighborsClassifier(),
}

for name, model in models.items():
    # Data preparation and modeling pipeline.
    pipeline = Pipeline(steps=[('prep', col_transform), ('m', model)])
    # Cross-validate with the defaults (no explicit cv or scoring argument).
    # Optional explicit configuration:
    # cv = KFold(n_splits=10, shuffle=True, random_state=1)
    score = cross_val_score(pipeline, X_train, y_train)
    print(name, score.mean())

# ---------------------------------------------------------------------------
# Pipelines: categorical encoding plus scaling of the numeric columns.
# ---------------------------------------------------------------------------
t2 = [
    ('cat', OneHotEncoder(), categorical_columns),
    ('num', StandardScaler(), numeric_columns),
]
col_transform = ColumnTransformer(transformers=t2)

# Same comparison with the new column transformer.
for name, model in models.items():
    pipeline = Pipeline(steps=[('prep', col_transform), ('m', model)])
    score = cross_val_score(pipeline, X_train, y_train)
    print(name, score.mean())

# ---------------------------------------------------------------------------
# Grid search, scored with F1 on the "bad" class.
# ---------------------------------------------------------------------------
f1_scorer = make_scorer(f1_score, pos_label="bad")

# Further options that could be added to the grids:
# 'prep__num__with_mean': [True, False],
# 'prep__num__with_std': [True, False],
param_grid = {
    'm__C': [0.1, 1.0, 0.01],
}
param_grid_knn = {
    'm__n_neighbors': [5, 10, 15],
}

for name, model in models.items():
    pipeline = Pipeline(steps=[('prep', col_transform), ('m', model)])
    # KNN has no C parameter, so it gets its own grid.
    if name == 'knn_model':
        grid_clf = GridSearchCV(pipeline, param_grid_knn, cv=5, scoring=f1_scorer)
    else:
        grid_clf = GridSearchCV(pipeline, param_grid, cv=5, scoring=f1_scorer)
    grid_clf.fit(X_train, y_train)
    print(name, grid_clf.best_params_)
    # NOTE: best_estimator_.score() is the pipeline's own default score, not
    # f1_scorer, so this number is not on the same scale as the search metric.
    print(name, grid_clf.best_estimator_.score(X_test, y_test))

# ---------------------------------------------------------------------------
# Manual check of the same hyperparameter values.
# NOTE: each pipeline below is fitted on X_train and then scored on X_train
# again, so these are training-set F1 scores, not cross-validated ones.
# ---------------------------------------------------------------------------
lr_array = []
lr_c = [0.01, 0.1, 1]
for c in lr_c:
    pipeline = Pipeline(steps=[('prep', col_transform), ('m', LogisticRegression(max_iter=1000, C=c))])
    pipeline.fit(X_train, y_train)
    y_hat = pipeline.predict(X_train)
    lr_array.append(f1_score(y_train, y_hat, pos_label="bad"))

lsvm_array = []
lsvm_c = [0.01, 0.1, 1]
for c in lsvm_c:
    pipeline = Pipeline(steps=[('prep', col_transform), ('m', LinearSVC(dual=True, max_iter=2500, C=c))])
    pipeline.fit(X_train, y_train)
    y_hat = pipeline.predict(X_train)
    lsvm_array.append(f1_score(y_train, y_hat, pos_label="bad"))

knn_array = []
knn_n = [5, 10, 15]
for n in knn_n:
    pipeline = Pipeline(steps=[('prep', col_transform), ('m', KNeighborsClassifier(n_neighbors=n))])
    pipeline.fit(X_train, y_train)
    y_hat = pipeline.predict(X_train)
    knn_array.append(f1_score(y_train, y_hat, pos_label="bad"))

# One scatter plot per model: hyperparameter value against training-set F1.
fig = plt.figure(figsize=(12, 12))
grid = plt.GridSpec(3, 1, hspace=0.4, wspace=0.4)
p1 = fig.add_subplot(grid[0, :])
p2 = fig.add_subplot(grid[1, :])
p3 = fig.add_subplot(grid[2, :])

p1.scatter(lr_c, lr_array)
p2.scatter(lsvm_c, lsvm_array)
p3.scatter(knn_n, knn_array)
当使用不同的评分指标并在测试集而不是训练集上进行评估时,趋势会发生变化,但网格搜索和手动验证的最佳参数似乎从未相同。导致这种情况的原因是什么?例如,如果你运行上面的代码,网格搜索会告诉你n_neighbors的最佳值是10,但最后的图表显示5的效果更好。是否比较没有正确实现?你可以在以下链接查看带有输出的运行情况 https://github.com/binodmathews93/AppliedMachineLearningCourse/blob/master/Applied_Machine_Learning_Homework_2.ipynb
回答:
超参数调优是在验证(开发)集上进行的,而不是在训练集上进行的。
网格搜索交叉验证使用K折策略来构建仅用于验证而非训练的验证集。
你手动在同一数据集上进行训练和验证,这是一种错误的方法。
# Excerpt of the manual validation loop body: the model is fitted on X_train
# and then scored on predictions for that same X_train.
pipeline = Pipeline(steps=[('prep', col_transform), ('m', LogisticRegression(max_iter=1000, C=c))])
pipeline.fit(X_train, y_train)  # <- this is where the problem is
y_hat = pipeline.predict(X_train)
lr_array.append(f1_score(y_train, y_hat, pos_label="bad"))
这只会导致超参数选择提高训练集上的表现,这不是你想要的(你想要的是一组在测试集上有良好表现的超参数 – 即具有良好泛化能力的超参数)。
这就是你在手动测试时得到较低的K值(即KNN中的n_neighbors)的原因 – 较低的K意味着较弱的“正则化”,因此在训练集上看起来是最佳的,但这其实是不正确的选择。
如果你想手动验证结果,你需要自己构建验证集(并且在训练过程中不使用它),或者你需要手动调用K折交叉验证程序。