"""Load the claims CSV, split train/test, tokenize the Procedure text and
pad it to fixed-length integer sequences for a 1-D CNN classifier."""

# --- Imports ---------------------------------------------------------------
import numpy as np   # BUG FIX: was never imported, but np.* is used below
import pandas as pd  # BUG FIX: was never imported, but pd.read_csv is used

import keras
import keras.backend as K
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import (Activation, BatchNormalization, Conv1D, Dense,
                          Dropout, Embedding, Flatten, Input, MaxPooling1D)
from keras.preprocessing.text import Tokenizer           # text -> int sequences
from keras.preprocessing.sequence import pad_sequences   # zero-pad to equal length
from keras.utils import to_categorical                   # one-hot encode labels
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

MAX_SEQUENCE_LENGTH = 300   # every padded sequence has exactly this length
MAX_NB_WORDS = 20000        # vocabulary cap for the tokenizer

# --- Load data and split ---------------------------------------------------
raw_data = pd.read_csv("/home/riaz.k/Desktop/TRAIN.csv")
raw_data.head()

# 70/30 split; rows are shuffled by default, so train/test order differs
# from raw_data order.
train, test = train_test_split(raw_data, test_size=0.3)
train.head()
test.head()

# --- Tokenize and pad ------------------------------------------------------
# Fit the vocabulary on the TRAIN split only, then map both splits.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train.Procedure)
train_sequences = tokenizer.texts_to_sequences(train.Procedure)
test_sequences = tokenizer.texts_to_sequences(test.Procedure)

# word_index maps each seen word to its integer index.
# BUG FIX: the original line had stray prose ("containing words and their
# index") fused into the code, which is a SyntaxError.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# BUG FIX: the original had stray "train" / "test" tokens fused after these
# assignments, another SyntaxError in the collapsed one-line form.
train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(train_data.shape)
print(test_data.shape)
print(word_index)

# Raw string class labels; encoded to integers in the next section.
train_labels = train['dxcode']
test_labels = test['dxcode']

# LabelEncoder maps string class labels to integer codes 0..n_classes-1.
le = LabelEncoder()
# --- Encode labels ---------------------------------------------------------
# Defensive local imports: used throughout this section.
import numpy as np
import pandas as pd

# BUG FIX: the original called le.fit(train_labels) and then le.fit(test_labels).
# The second fit() discards the first, so le.transform(train_labels) raised
# "y contains previously unseen labels" for any class present only in the
# training split. Fit ONCE on the union of both splits instead.
le.fit(list(train_labels) + list(test_labels))
train_labels = le.transform(train_labels)
test_labels = le.transform(test_labels)
print(le.classes_)
print(np.unique(train_labels, return_counts=True))
print(np.unique(test_labels, return_counts=True))
# BUG FIX: inverse_transform expects an array-like, not a bare scalar.
print(le.inverse_transform([1]))

# One-hot encode the integer labels for categorical_crossentropy.
labels_train = to_categorical(np.asarray(train_labels))
labels_test = to_categorical(np.asarray(test_labels))
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', labels_train.shape)
print('Shape of label tensor:', labels_test.shape)

EMBEDDING_DIM = 100  # size of the learned word-embedding vectors
print(MAX_SEQUENCE_LENGTH)
print('Training model.')

# --- Build the 1-D CNN -----------------------------------------------------
# Embedding -> two Conv/Pool/Dropout/BatchNorm stages -> Dense softmax head.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM,
                    input_length=MAX_SEQUENCE_LENGTH))
model.add(Dropout(0.2))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(128, activation='relu'))
# IMPROVEMENT: output width follows the classes the encoder actually saw
# (was hard-coded to 23, which silently breaks if the label set changes).
model.add(Dense(len(le.classes_), activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model.fit(train_data, labels_train,
          batch_size=32, epochs=10,
          validation_data=(test_data, labels_test))
model.evaluate(test_data, labels_test)

pred = model.predict(test_data)
for layer in model.layers:
    print(layer)

import keras.backend as K
# Backend function returning the embedding-layer output for given inputs.
emd = K.function(inputs=[model.layers[0].input],
                 outputs=[model.layers[0].output])

# Stack padded train rows on top of padded test rows.
rbind = np.concatenate((train_data, test_data), axis=0)
print(rbind.shape)

# --- Submission file -------------------------------------------------------
# BUG FIX: Sequential.predict_classes() is deprecated and removed in modern
# Keras; take the argmax over the softmax probabilities instead.
test_results = np.argmax(model.predict(rbind), axis=1)
test_labels = le.inverse_transform(test_results)
# NOTE(review): rbind stacks the *shuffled* train rows followed by test rows,
# so its row order does NOT match raw_data['Claimno'] order — verify the
# alignment (e.g. predict on sequences built from raw_data directly) before
# trusting this CSV.
submissions_CNN = pd.DataFrame({'id': raw_data['Claimno'],
                                "label": test_labels})
submissions_CNN.to_csv("/home/riaz.k/Desktop/submissions.csv", index=False)
文本文档可以被标记为多个标签,那么我如何在这个数据集上进行多标签分类?我阅读了很多sklearn的文档,但似乎找不到进行多标签分类的正确方法。提前感谢任何帮助。
回答:
您是否在这行代码上遇到错误:
train_labels = le.transform(train_labels)
如果是的,那么这是因为在它上面的那行代码中,您做了以下操作:
le.fit(test_labels)
这样做的结果是它会忘记之前的数据(之前对fit()
的调用,位于它上面的那行),只记住test_labels
中的数据。因此,当出现一个新标签(在训练集中存在但在测试集中不存在)时,它会抛出这个错误。
您需要替换以下几行代码:
le.fit(train_labels)
le.fit(test_labels)
为以下代码:
# 我使用.tolist()是因为我观察到您的
# train_labels, test_labels 是 pandas Series 对象
le.fit(train_labels.tolist() + test_labels.tolist())