我有一组数据,每个样本都有不同的权重。在我的应用中,这些权重在模型估计和比较不同模型时非常重要。
我使用 `sklearn` 来估计模型并比较不同的超参数选择。但这个单元测试显示,`GridSearchCV` 在计算分数时并未应用 `sample_weight`。
有没有办法让 `sklearn` 使用 `sample_weight` 来评分模型?
单元测试:
from __future__ import division

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import GridSearchCV, RepeatedKFold

# Seed shared by the forests and the CV splitter. Defined at module level
# because grid_cv() reads it directly.
RANDOM_STATE = 1337


def grid_cv(X_in, y_in, w_in, cv, max_features_grid, use_weighting):
    """Cross-validate a random forest by hand over several ``max_features``.

    For every candidate in ``max_features_grid`` a forest is fitted on each
    training split (always with the training sample weights) and scored with
    log-loss on the matching test split.

    Args:
        X_in: Feature matrix, indexed as ``X_in[rows, :]``.
        y_in: Target array, indexed by row.
        w_in: Per-sample weights, indexed by row.
        cv: Splitter exposing ``split(X=..., y=...)`` that yields
            ``(train_indices, test_indices)`` pairs.
        max_features_grid: Iterable of ``max_features`` values to try.
        use_weighting: If true, each fold's log-loss is computed with the
            test-split weights and scaled by that split's share of
            ``w_in.sum()``, and the fold scores are summed. Otherwise the
            unweighted fold log-losses are averaged.

    Returns:
        ``(best_score, best_param)``: the lowest aggregated loss and the
        ``max_features`` value that produced it.

    Raises:
        ValueError: If no score was produced (for example, an empty grid).
    """
    fold_scores = {}

    for k in max_features_grid:
        clf = RandomForestClassifier(
            n_estimators=256,
            criterion="entropy",
            warm_start=False,
            n_jobs=-1,
            random_state=RANDOM_STATE,
            max_features=k,
        )
        for train_ndx, test_ndx in cv.split(X=X_in, y=y_in):
            # Use the y_in argument here; the original indexed the
            # module-level `y`, which only worked because the caller
            # happened to pass that same array.
            y_test = y_in[test_ndx]

            clf.fit(
                X=X_in[train_ndx, :],
                y=y_in[train_ndx],
                sample_weight=w_in[train_ndx],
            )
            y_hat = clf.predict_proba(X=X_in[test_ndx, :])

            if use_weighting:
                w_test = w_in[test_ndx]
                # Scale the fold's weighted loss by the fraction of the total
                # weight that falls into this test split.
                fold_share = w_test.sum() / w_in.sum()
                score = fold_share * log_loss(
                    y_true=y_test, y_pred=y_hat, sample_weight=w_test
                )
            else:
                score = log_loss(y_true=y_test, y_pred=y_hat)

            fold_scores.setdefault(k, []).append(score)

    # Weighted fold scores already carry their share of the total weight, so
    # they are summed; unweighted fold scores are averaged.
    aggregate = sum if use_weighting else np.mean
    scores = {k: aggregate(v) for k, v in fold_scores.items()}

    best_param = min(scores, key=scores.get)
    best_score = scores[best_param]
    return best_score, best_param


if __name__ == "__main__":
    X, y = load_iris(return_X_y=True)
    sample_weight = np.array([1 + 100 * (i % 25) for i in range(len(X))])
    # Uniform-weight alternative for a sanity check:
    # sample_weight = np.array([1 for _ in range(len(X))])

    inner_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)

    rfc = RandomForestClassifier(
        n_estimators=256,
        criterion="entropy",
        warm_start=False,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )
    search_params = {"max_features": [1, 2, 3, 4]}
    fit_params = {"sample_weight": sample_weight}

    # NOTE(review): newer scikit-learn releases appear to replace
    # `needs_proba` / `needs_threshold` with `response_method` -- confirm
    # against the installed version.
    my_scorer = make_scorer(
        log_loss,
        greater_is_better=False,
        needs_proba=True,
        needs_threshold=False,
    )

    # In this usage, the results are the same for `iid=True` and `iid=False`.
    # NOTE(review): `iid` appears to have been removed from GridSearchCV in
    # later scikit-learn releases; drop it there -- confirm against the
    # installed version.
    grid_clf = GridSearchCV(
        estimator=rfc,
        scoring=my_scorer,
        cv=inner_cv,
        param_grid=search_params,
        refit=True,
        return_train_score=False,
        iid=False,
    )
    grid_clf.fit(X, y, **fit_params)
    # The scorer negates log-loss (greater_is_better=False), so flip the sign
    # back for reporting.
    print("This is the best out-of-sample score using GridSearchCV: %.6f." % -grid_clf.best_score_)

    msg = """This is the best out-of-sample score %s weighting using grid_cv: %.6f."""

    score_with_weights, param_with_weights = grid_cv(
        X_in=X,
        y_in=y,
        w_in=sample_weight,
        cv=inner_cv,
        max_features_grid=search_params.get("max_features"),
        use_weighting=True,
    )
    print(msg % ("WITH", score_with_weights))

    score_without_weights, param_without_weights = grid_cv(
        X_in=X,
        y_in=y,
        w_in=sample_weight,
        cv=inner_cv,
        max_features_grid=search_params.get("max_features"),
        use_weighting=False,
    )
    print(msg % ("WITHOUT", score_without_weights))
这会产生以下输出:
This is the best out-of-sample score using GridSearchCV: 0.135692.
This is the best out-of-sample score WITH weighting using grid_cv: 0.099367.
This is the best out-of-sample score WITHOUT weighting using grid_cv: 0.135692.
解释:由于手动计算的未加权损失与 `GridSearchCV` 的评分结果相同,我们可以确定样本权重并未被使用。
回答:
sklearn 版本 >= 1.4 和 1.4 夜间发布版
从 sklearn 1.4 版本开始(预计发布日期在 2023 年 10 月左右),以及从 2023 年 9 月起可用的夜间发布版(您可以按照此处的指南进行安装),您可以使用新的元数据路由机制。
您可以简单地决定哪些对象应该或不应该接收元数据(例如 `sample_weight`),如下面的脚本所示:
...(代码保持不变)