Multi-label text classification. I have a CSV file with a text column and a label column; the text is plain text and the labels are alphanumeric codes.

    import pandas as pd   # needed for pd.read_csv / pd.DataFrame below
    import numpy as np    # needed for np.unique / np.concatenate below
    import keras
    import keras.backend as K
    from keras.optimizers import Adam
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.layers.core import Activation
    from keras.preprocessing.text import Tokenizer          # for tokenizing text
    from keras.preprocessing.sequence import pad_sequences  # for padding sentences with zeros, to make the sentence lengths equal
    from keras.utils import to_categorical                  # for one-hot encoding of the labels
    from keras.layers import Dense, Input, Flatten, Dropout, BatchNormalization
    from keras.layers import Conv1D, MaxPooling1D, Embedding
    from keras.models import Sequential
    from sklearn.model_selection import train_test_split

    MAX_SEQUENCE_LENGTH = 300
    MAX_NB_WORDS = 20000

    # Reading the data
    raw_data = pd.read_csv("/home/riaz.k/Desktop/TRAIN.csv")
    raw_data.head()

    # create training and testing vars
    train, test = train_test_split(raw_data, test_size=0.3)
    train.head()
    test.head()

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(train.Procedure)
    train_sequences = tokenizer.texts_to_sequences(train.Procedure)
    test_sequences = tokenizer.texts_to_sequences(test.Procedure)
    word_index = tokenizer.word_index      # dict containing words and their index
    # print(tokenizer.word_index)
    print('Found %s unique tokens.' % len(word_index))

    train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print(train_data.shape)
    print(test_data.shape)
    print(word_index)

    train_labels = train['dxcode']
    test_labels = test['dxcode']

    from sklearn import preprocessing
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()   # converts the character array to a numeric array; assigns a level to each unique label
    le.fit(train_labels)
    le.fit(test_labels)
    train_labels = le.transform(train_labels)
    test_labels = le.transform(test_labels)
    print(le.classes_)
    print(np.unique(train_labels, return_counts=True))
    print(np.unique(test_labels, return_counts=True))
    le.inverse_transform(1)

    labels_train = to_categorical(np.asanyarray(train_labels))
    labels_test = to_categorical(np.asarray(test_labels))
    print('Shape of data tensor:', train_data.shape)
    print('Shape of label tensor:', labels_train.shape)
    print('Shape of label tensor:', labels_test.shape)

    EMBEDDING_DIM = 100
    print(MAX_SEQUENCE_LENGTH)
    print('Training model.')

    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS,
                        EMBEDDING_DIM,
                        input_length=MAX_SEQUENCE_LENGTH))
    model.add(Dropout(0.2))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(23, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])

    model.fit(train_data, labels_train,
              batch_size=32,
              epochs=10,
              validation_data=(test_data, labels_test))
    model.evaluate(test_data, labels_test)

    pred = model.predict(test_data)
    pred

    # print(model.layers)
    for layer in model.layers:
        print(layer)

    import keras.backend as K
    emd = K.function(inputs=[model.layers[0].input],
                     outputs=[model.layers[0].output])

    rbind = np.concatenate((train_data, test_data), axis=0)
    print(rbind.shape)

    ### Submissions file
    test_results = model.predict_classes(rbind)
    # print(test_results)
    test_labels = le.inverse_transform(test_results)
    # test_labels = [le.inverse_transform(i) for i in test_results]
    submissions_CNN = pd.DataFrame({'id': raw_data['Claimno'], "label": test_labels})
    submissions_CNN.to_csv("/home/riaz.k/Desktop/submissions.csv", index=False)

A text document can be tagged with more than one label, so how can I do multi-label classification on this dataset? I have read a lot of the sklearn documentation, but I can't seem to find the right way to do multi-label classification. Thanks in advance for any help.


Answer:

Are you getting the error on this line:

    train_labels = le.transform(train_labels)

If yes, then it is because on the line just above it you did:

    le.fit(test_labels)

What this does is make the encoder forget the previous data (the earlier call to fit() on the line above) and remember only the data in test_labels. So when a new label comes along (one that is present in the training set but not in the test set), it throws this error.
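
This is easy to reproduce in isolation. A minimal sketch (the label values below are made up purely for illustration):

    from sklearn.preprocessing import LabelEncoder

    le = LabelEncoder()
    le.fit(["A10", "B20", "C30"])   # first fit: classes_ holds 'A10', 'B20', 'C30'
    print(le.classes_)

    le.fit(["B20", "C30"])          # second fit: the first call is forgotten
    print(le.classes_)              # only 'B20' and 'C30' remain

    # 'A10' was seen only by the first fit, so it is now an unseen label:
    le.transform(["A10"])           # raises ValueError: y contains previously unseen labels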

You need to replace these lines:

    le.fit(train_labels)
    le.fit(test_labels)

with this:

    # I am using .tolist() because I observed that your
    # train_labels, test_labels are pandas Series objects
    le.fit(train_labels.tolist() + test_labels.tolist())
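
Separately, on the multi-label part of the question: LabelEncoder together with to_categorical assigns exactly one class per sample, so it cannot express documents that carry several labels at once. Below is a minimal sketch, assuming each dxcode cell holds one or more labels separated by ';' (the delimiter and the shortened model body are assumptions; the Conv1D stack from the question could be kept unchanged). It binarizes the targets with MultiLabelBinarizer and trains a sigmoid output with binary_crossentropy:

    from sklearn.preprocessing import MultiLabelBinarizer
    from keras.models import Sequential
    from keras.layers import Dense, Flatten, Embedding

    # assumption: each 'dxcode' cell holds one or more labels separated by ';'
    train_label_sets = train['dxcode'].str.split(';')
    test_label_sets = test['dxcode'].str.split(';')

    mlb = MultiLabelBinarizer()
    mlb.fit(train_label_sets.tolist() + test_label_sets.tolist())  # fit on all labels, as above
    labels_train = mlb.transform(train_label_sets)   # shape (n_samples, n_labels), 0/1 per label
    labels_test = mlb.transform(test_label_sets)

    # same idea as the question's model with a shortened body; only the head changes:
    # one sigmoid unit per label and binary_crossentropy instead of softmax
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(len(mlb.classes_), activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    model.fit(train_data, labels_train,
              batch_size=32, epochs=10,
              validation_data=(test_data, labels_test))

    # predictions are independent per-label probabilities; threshold them
    # (0.5 here) and map back to label sets with inverse_transform
    pred = model.predict(test_data)
    pred_label_sets = mlb.inverse_transform((pred > 0.5).astype(int))

The point of the change is that softmax with categorical_crossentropy forces the probabilities across labels to sum to 1, which is why it can only pick a single label per document, whereas sigmoid units score each label independently.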
