我正在尝试基于从多个网站收集的产品评论进行情感分析。我按照下面的文章一步步操作,在到达模型系数可视化这一步之前一切顺利。
当我运行我的程序时,我得到了以下错误:
ValueError: 系数数量6021与特征名称数量6290不匹配。
关于如何确保我的数据集中系数数量与特征数量匹配,有什么建议吗?
以下是我的代码:
# Sentiment-analysis script: trains a logistic-regression classifier on review
# text twice (bag-of-words counts, then a TF-IDF pipeline), plots ROC and
# precision-recall curves, and finally tries to visualize the coefficients.
# NOTE(review): `reviews`, `np` and `mglearn` are used below but are not
# defined/imported in this snippet — presumably set up earlier; confirm.
y = reviews['Review Type']
X = reviews['Review Comment']

# Split the data into training and test sets.
from sklearn.model_selection import train_test_split
text_train, text_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Extract bag-of-words features from the training and test text;
# the transformed variables are named X_train / X_test afterwards.
from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): ENGLISH_STOP_WORDS is imported but never used in this snippet.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# The vocabulary is learned from the training text only, with default settings.
vect = CountVectorizer().fit(text_train)
X_train = vect.transform(text_train)
print(repr(X_train))
X_test = vect.transform(text_test)
print(repr(X_test))
# These names describe the CountVectorizer vocabulary built above.
feature_names = vect.get_feature_names()
print(len(feature_names))

# Run a logistic-regression model to predict whether a review is positive
# or negative, tuning C with a 5-fold grid search scored by ROC AUC.
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(max_iter=10000, class_weight='balanced', random_state=0)
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(logreg, param_grid, scoring= 'roc_auc', cv=5)
# presumably fit() returns the fitted search object itself, so `logreg_train`
# and `grid` would refer to the same estimator — verify.
logreg_train = grid.fit(X_train, y_train)
pred_logreg = logreg_train.predict(X_test)
confusion = confusion_matrix(y_test, pred_logreg)
print(confusion)
# Accuracy = sum of the two diagonal cells divided by the sum of all cells.
print("分类准确率为:", (confusion[0][0] + confusion[1][1]) / np.sum(confusion))

# ROC curve for the count-based model.
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import seaborn as sns; sns.set();
fpr, tpr, thresholds = roc_curve(y_test, grid.decision_function(X_test))
# Find the threshold closest to zero:
close_zero = np.argmin(np.abs(thresholds))
plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10, label= '阈值零(默认)', fillstyle= 'none', c='k', mew=2)
# Red reference line drawn from the y-values [0, 1].
plt.plot([0,1], linestyle='-', lw=2, color='r', label='随机', alpha=0.8)
# NOTE(review): legend() is called before the ROC curve is plotted below —
# check whether the curve's label is meant to appear in the legend.
plt.legend(loc=4)
plt.plot(fpr, tpr, label='ROC曲线')
plt.xlabel('假阳性率')
plt.ylabel('真阳性率(召回率)')
plt.title('roc_curve');
from sklearn.metrics import auc
print('AUC得分为:', auc(fpr, tpr));

# Precision-recall curve for the count-based model.
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(\
    y_test, logreg_train.decision_function(X_test))
close_zero = np.argmin(np.abs(thresholds))
plt.plot(precision[close_zero], recall[close_zero], 'o', markersize=10, label="阈值零", fillstyle="none", c="k", mew=2)
plt.plot(precision, recall, label="精确度召回曲线")
plt.xlabel("精确度")
plt.ylabel("召回率")
plt.title("精确度召回曲线")
plt.legend(loc="best");

# Second model: TF-IDF features (English stop words removed, no normalization)
# and logistic regression combined in one pipeline, fitted on the raw text.
from sklearn.feature_extraction.text import TfidfVectorizer
logreg = LogisticRegression(max_iter=10000, class_weight="balanced", random_state=0)
pipe = make_pipeline(TfidfVectorizer(norm=None, stop_words='english'), logreg)
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}
# `grid` and `logreg_train` are rebound here; the count-based search is discarded.
grid = GridSearchCV(pipe, param_grid, scoring="roc_auc", cv=5)
logreg_train = grid.fit(text_train, y_train)
fpr, tpr, thresholds = roc_curve(y_test, grid.decision_function(text_test))
pred_logreg = logreg_train.predict(text_test)
confusion = confusion_matrix(y_test, pred_logreg)
print(confusion)
print("分类准确率为:", (confusion[0][0] + confusion[1][1]) / np.sum(confusion))
print("测试AUC得分为:", auc(fpr, tpr));

# NOTE(review): `coef_` is taken from the pipeline's logistic regression, which
# was trained on TfidfVectorizer(stop_words='english') features, whereas
# `feature_names` still holds the vocabulary of the default CountVectorizer
# `vect` fitted earlier. The two vocabularies can differ in length, which
# matches the reported "number of coefficients vs. feature names" ValueError.
mglearn.tools.visualize_coefficients(grid.best_estimator_.named_steps["logisticregression"].coef_,feature_names, n_top_features=25)
回答:
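您的feature_names取自以默认参数(stop_words=None)拟合的CountVectorizer,而最后一段代码中实际使用的模型是基于设置了stop_words='english'的TfidfVectorizer训练的。两者的词汇表大小不同(6290个特征名称对6021个系数),因此系数数量与特征名称数量不匹配。请改为从管道中已拟合的TfidfVectorizer获取特征名称(注意:在scikit-learn 1.2及以上版本中,get_feature_names()已被移除,应使用get_feature_names_out()):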
feature_names = grid.best_estimator_.named_steps["tfidfvectorizer"].get_feature_names()