I'm trying to create a class for calibrating a classifier. I've read a few resources on probability calibration and am somewhat confused about which dataset should be used to calibrate the classifier. I wrote a class that further splits the training set into a training split and a validation split. The classifier is first fitted on the training split and predicts uncalibrated probabilities on the validation split.
I then create a cal_model instance of CalibratedClassifierCV, fit it on the validation split, and predict calibrated probabilities on the validation split again.
Could someone look over the code below and help me fix it?
class calibrate_model:
    """
    A class that splits the training data into train and validation splits
    and then performs probability calibration.

    model      = classification model
    Xtrain     = independent feature set
    ytrain     = target variable set
    cv         = cross-validation splitter
    cal_method = 'sigmoid' or 'isotonic'
    """
    def __init__(self, model, Xtrain, ytrain, cv, cal_method):
        self.model = model
        self.Xtrain = Xtrain
        self.ytrain = ytrain
        self.cv = cv
        self.cal_method = cal_method

    def calibrate_probability(self):
        import matplotlib.pyplot as plt
        from sklearn.model_selection import train_test_split
        from sklearn.calibration import CalibratedClassifierCV, calibration_curve

        # hold out 20% of the training data as a validation split
        train_X, val_X, train_y, val_y = train_test_split(
            self.Xtrain, self.ytrain, test_size=0.2, random_state=42)

        # uncalibrated model: fit on the cross-validation folds of the training split
        # (only the fit from the last fold is kept)
        for train_index, test_index in self.cv.split(train_X, train_y):
            X_train_kfold, X_val_kfold = train_X[train_index], train_X[test_index]
            y_train_kfold, y_val_kfold = train_y[train_index], train_y[test_index]
            self.model.fit(X_train_kfold, y_train_kfold)

        # uncalibrated probabilities on the validation split
        uc_probs = self.model.predict_proba(val_X)[:, 1]
        uc_fop, uc_mpv = calibration_curve(val_y, uc_probs, n_bins=10, strategy='quantile')

        # calibrated model, fitted on the validation split
        self.cal_model = CalibratedClassifierCV(self.model, method=self.cal_method, cv=self.cv)
        self.cal_model.fit(val_X, val_y)

        # calibrated probabilities on the validation split
        c_probs = self.cal_model.predict_proba(val_X)[:, 1]
        c_fop, c_mpv = calibration_curve(val_y, c_probs, n_bins=10, strategy='quantile')

        # reliability diagram for both models
        plt.plot([0, 1], [0, 1], linestyle='--')
        plt.plot(uc_mpv, uc_fop, marker='.', label='Uncalibrated')
        plt.plot(c_mpv, c_fop, marker='.', label='Calibrated')
        plt.title(type(self.model).__name__ + ' ' + self.cal_method)
        plt.ylabel('Fraction of positives (fop)')
        plt.xlabel('Mean predicted value (mpv)')
        plt.legend()
        plt.tight_layout()
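For context, this is roughly how I call it (the synthetic data, the LogisticRegression base model, and the StratifiedKFold splitter below are just placeholders for my actual setup):

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# placeholder data; X and y are numpy arrays, which matches the indexing used in the class
X, y = make_classification(n_samples=2000, n_features=10, random_state=42)

cal = calibrate_model(
    model=LogisticRegression(max_iter=1000),
    Xtrain=X,
    ytrain=y,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    cal_method='sigmoid',
)
cal.calibrate_probability()
plt.show()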
Answer:
The calibration_curve code is correct. Here I'm comparing logistic regression calibration with XGBoost calibration. The data frames hold the predict_proba[:, 1] values, i.e. the predicted probability of the positive event. See (https://github.com/dnishimoto/python-deep-learning/blob/master/Credit%20Loan%20Risk%20.ipynb)
import matplotlib.pyplot as plt
import pandas as pd
import xgboost as xgb
from sklearn.calibration import calibration_curve

# pipeline, X_train, X_test, y_train and y_test come from the linked notebook

# logistic regression probabilities
y_pred_prob_lr = pipeline['lr'].predict_proba(X_test)
y_preds_proba_lr_df = pd.DataFrame(y_pred_prob_lr[:, 1], columns=["pred_default_proba"])

# XGBoost probabilities
xg_cl = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10, seed=123)
xg_cl.fit(X_train, y_train)
y_pred_xg = xg_cl.predict(X_test)
y_pred_proba_xg = xg_cl.predict_proba(X_test)
y_preds_proba_xg_df = pd.DataFrame(y_pred_proba_xg[:, 1], columns=['prob_default'])

# calibration curves for both models
frac_of_pos, mean_pred_val = calibration_curve(
    y_test, y_preds_proba_xg_df, n_bins=10, strategy='quantile')
frac_of_pos_lr, mean_pred_val_lr = calibration_curve(
    y_test, y_preds_proba_lr_df, n_bins=10, strategy='quantile')

plt.plot([0, 1], [0, 1], 'k:', label='Perfectly calibrated')
plt.plot(mean_pred_val, frac_of_pos, 's-', label='XGBoost')
plt.plot(mean_pred_val_lr, frac_of_pos_lr, 's-', label='Logistic regression')
plt.xlabel('Mean predicted probability')
plt.ylabel('Fraction of positives')
plt.legend()
plt.title('Calibration curve')
plt.show()
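On the original question of which dataset to calibrate on: if the model is already fitted on the training split, one common pattern is to fit CalibratedClassifierCV with cv='prefit' on the held-out validation split only, and then judge calibration on data that was used for neither step. A minimal sketch (the LogisticRegression base model and the synthetic data below are placeholders, not your pipeline):

import matplotlib.pyplot as plt
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# placeholder data split into train / validation / test
X, y = make_classification(n_samples=5000, n_features=10, random_state=42)
X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state=42)

base = LogisticRegression(max_iter=1000).fit(X_train, y_train)            # fit on the training split
calibrated = CalibratedClassifierCV(base, method='sigmoid', cv='prefit')  # reuse the fitted model
calibrated.fit(X_val, y_val)                                              # calibrate on the validation split

# evaluate calibration on a third split used neither for fitting nor for calibration
fop, mpv = calibration_curve(y_test, calibrated.predict_proba(X_test)[:, 1], n_bins=10)
plt.plot([0, 1], [0, 1], 'k:', label='Perfectly calibrated')
plt.plot(mpv, fop, 's-', label='Calibrated LogisticRegression')
plt.xlabel('Mean predicted probability')
plt.ylabel('Fraction of positives')
plt.legend()
plt.show()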