我有以下代码,这个神经网络运行得很好。我知道需要用混淆矩阵(confusion matrix)来计算假阳性率和假阴性率,但我不是编程专家,不确定该如何操作。有人能帮我吗?
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense

# Train and evaluate a small binary classifier on 'dataset_ori.csv'.
# The first seven columns are the input features; the eighth is the target.

# Read the CSV file and convert it to an array for processing.
df = pd.read_csv('dataset_ori.csv')
dataset = df.values

# Split the dataset into input features and the column to predict.
X = dataset[:, 0:7]
Y = dataset[:, 7]

# Rescale every input feature into the [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset before the split,
# so validation/test statistics influence the scaling -- confirm this is
# acceptable for the intended evaluation.
min_max_scaler = preprocessing.MinMaxScaler()
X_scale = min_max_scaler.fit_transform(X)

# Hold out 30% of the samples; the remaining 70% is the training set.
X_train, X_val_and_test, Y_train, Y_val_and_test = train_test_split(
    X_scale, Y, test_size=0.3
)

# Split the held-out part evenly into a validation set and a test set.
X_val, X_test, Y_val, Y_test = train_test_split(
    X_val_and_test, Y_val_and_test, test_size=0.5
)

# Sequential network: 7 inputs -> 7 -> 32 -> 5 -> 1 sigmoid output unit.
model = Sequential([
    Dense(7, activation='relu', input_shape=(7,)),
    Dense(32, activation='relu'),
    Dense(5, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the network, monitoring the validation set after each epoch.
hist = model.fit(
    X_train, Y_train,
    batch_size=32, epochs=100,
    validation_data=(X_val, Y_val),
)

# Report the classifier's accuracy on the test set.
scores = model.evaluate(X_test, Y_test)
print("Accuracy: %.2f%%" % (scores[1]*100))
以下是根据下面的回答整理的代码。其中 response 和 model 被编辑器标红,提示“未解析的引用”(unresolved reference)。
from keras import models
from keras.layers import Activation, Dense, Dropout
from keras.models import Sequential
from keras.utils import to_categorical
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Train a binary classifier on 'dataset_ori.csv' and print its accuracy and
# classification report on a held-out test split.
#
# NOTE: the snippet as originally pasted referenced two undefined names,
# `response` and `model`. They are replaced below by the target column `Y`
# and by the network object `classifier`.

# Read the CSV file and convert it to an array for processing.
df = pd.read_csv('dataset_ori.csv')
dataset = df.values

# Split the dataset into input features and the column to predict.
X = dataset[:, 0:7]
Y = dataset[:, 7]

# Split into training and test sets. Pass X (not the whole `dataset`, which
# still contains the label column) together with the labels Y.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Standardize the features. The scaler is fitted on the training split only
# and re-used for the test split. X_train/X_test are already NumPy arrays,
# so there is no `.values` attribute to call.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the ANN.
classifier = Sequential()

# Input layer and first hidden layer.
classifier.add(Dense(units=10, kernel_initializer='uniform', activation='relu', input_dim=7))
classifier.add(Dropout(0.5))

# Second hidden layer.
classifier.add(Dense(units=10, kernel_initializer='uniform', activation='relu'))
classifier.add(Dropout(0.5))

# Output layer: a single sigmoid unit giving the probability of class 1.
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

# A single sigmoid output needs binary cross-entropy;
# sparse_categorical_crossentropy expects one output unit per class.
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit the ANN once, on the scaled training set. (Training first on unscaled
# data and then again on scaled data made training and prediction inputs
# inconsistent.)
classifier.fit(X_train, y_train, batch_size=10, epochs=20)

# Summary of the neural network.
classifier.summary()

# Predict probabilities on the test set and threshold them at 0.5.
# `predict_classes` is not available in current Keras releases, and the
# (n, 1) output must be flattened so the comparison with the (n,) label
# vector is element-wise instead of broadcasting to (n, n).
y_probability = classifier.predict(X_test)
y_prediction = (y_probability > 0.5).astype(int).ravel()
y_true = y_test.astype(int)

print("\n\naccuracy", np.sum(y_prediction == y_true) / float(len(y_true)))

# Let's see how well the model performs.
print(classification_report(y_true, y_prediction))
回答:
您输入到混淆矩阵的必须是整数数组,而不是独热编码。
# Predict the test-set results. For a single sigmoid output unit,
# `model.predict` returns probabilities with shape (n_samples, 1).
y_pred = model.predict(X_test)

# Threshold at 0.5 to obtain 0/1 labels and flatten to a 1-D integer array.
y_pred = (y_pred > 0.5).astype(int).ravel()

# y_test is already a 1-D vector of labels here, so it is passed directly.
# (argmax(axis=1) is only appropriate when labels and predictions are
# one-hot / per-class arrays with one column per class.)
matrix = metrics.confusion_matrix(y_test.astype(int), y_pred)
模型的原始输出是如下所示的概率值;设置 0.5 的概率阈值后,即可将其转换为二值(0/1)标签。
输出(y_pred):
[0.87812372 0.77490434 0.30319547 0.84999743]
sklearn.metrics.accuracy_score(y_true, y_pred)方法定义y_pred为:
y_pred:一维数组类型,或标签指示数组/稀疏矩阵。分类器返回的预测标签。
这意味着y_pred必须是1或0的数组(预测标签)。它们不应该是概率。
您的错误的根本原因是理论问题,而不是计算问题:您试图把分类度量(准确率)用于输出连续数值的回归式(即数值预测)模型(此处是以 sigmoid 输出概率的神经网络),而没有先把输出转换为 0/1 标签,这是没有意义的。
就像大多数性能指标一样,准确率是比较苹果和苹果(即0/1的真实标签与0/1的预测);因此,当您要求函数比较二进制真实标签(苹果)与连续预测(橙子)时,您会得到预期的错误,其中消息明确告诉您从计算角度看问题是什么:
Classification metrics can't handle a mix of binary and continuous target
尽管该消息没有直接告诉您,您试图计算的度量对于您的问题是无效的(实际上我们也不应该期望它做到这一步),但 scikit-learn 至少直接、明确地警告您正在做一件错误的事情,这是件好事;在其他框架中情况未必如此——例如,Keras 在非常相似的情况下不会给出任何警告,结果只是有人抱怨在回归设置中“准确率”偏低……
from keras import models
from keras.layers import Activation, Dense, Dropout
from keras.models import Sequential
from keras.utils import to_categorical
import matplotlib.pyplot as plt  # used below as `plt` for the heatmap figure
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sn  # used below as `sn` for the heatmap
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# `sklearn.cross_validation` no longer exists; these names live in
# `sklearn.model_selection`.
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

# Train a binary classifier on 'dataset_ori.csv', then report accuracy, the
# confusion matrix (with false-positive / false-negative rates) and the
# classification report on a held-out test split.

# Read the CSV file and convert it to an array for processing.
df = pd.read_csv('dataset_ori.csv')
dataset = df.values

# Split the dataset into input features and the column to predict.
X = dataset[:, 0:7]
Y = dataset[:, 7]

# Split into training and test sets. Pass X (not the whole `dataset`, which
# still contains the label column) and Y (the name `response` was never
# defined).
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Standardize the features. The scaler is fitted on the training split only
# and re-used for the test split. X_train/X_test are already NumPy arrays,
# so there is no `.values` attribute to call.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the ANN.
classifier = Sequential()

# Input layer and first hidden layer. The Dropout layers are added to
# `classifier`; no object named `model` exists in this script.
classifier.add(Dense(units=10, kernel_initializer='uniform', activation='relu', input_dim=7))
classifier.add(Dropout(0.5))

# Second hidden layer.
classifier.add(Dense(units=10, kernel_initializer='uniform', activation='relu'))
classifier.add(Dropout(0.5))

# Output layer: a single sigmoid unit giving the probability of class 1.
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

# A single sigmoid output needs binary cross-entropy;
# sparse_categorical_crossentropy expects one output unit per class.
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit the ANN once, on the scaled training set.
classifier.fit(X_train, y_train, batch_size=10, epochs=20)

# Summary of the neural network.
classifier.summary()

# Predict probabilities on the test set and threshold them at 0.5.
# `predict_classes` is not available in current Keras releases, and the
# (n, 1) output must be flattened so comparisons with the (n,) label vector
# are element-wise instead of broadcasting to (n, n).
y_probability = classifier.predict(X_test)
y_prediction = (y_probability > 0.5).astype(int).ravel()
y_true = y_test.astype(int)

print("\n\naccuracy", np.sum(y_prediction == y_true) / float(len(y_true)))

# Extra: confusion matrix. Rows are true labels, columns are predictions.
# `labels=[0, 1]` guarantees a 2x2 matrix even if one class is missing from
# the test split, so the unpacking and the DataFrame labels below are safe.
cm = confusion_matrix(y_true, y_prediction, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

# False-positive rate = FP / (FP + TN); false-negative rate = FN / (FN + TP).
# Guard against a class that is absent from the test split.
false_positive_rate = fp / (fp + tn) if (fp + tn) else float('nan')
false_negative_rate = fn / (fn + tp) if (fn + tp) else float('nan')
print("False positive rate: %0.4f" % false_positive_rate)
print("False negative rate: %0.4f" % false_negative_rate)

# Confusion-matrix heatmap.
df_cm = pd.DataFrame(cm, index=(0, 1), columns=(0, 1))
plt.figure(figsize=(10, 7))
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, fmt='g')
print("测试数据准确率: %0.4f" % accuracy_score(y_true, y_prediction))

# Let's see how well the model performs.
print(classification_report(y_true, y_prediction))

# Display the heatmap last so the blocking window does not delay the reports.
plt.show()