I'm trying to train a model for text classification. The model takes a list of at most 300 integers embedded from parsed articles. The model trains without problems, but the accuracy never improves.
The target consists of 41 categories, which are encoded as integers from 0 to 41 and then normalized.
The table looks like this:
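Roughly, a purely hypothetical sketch of the dataframe structure (placeholder rows; only the column names are taken from the code further down):

```python
import pandas as pd

# Hypothetical rows; the real data lives in df.pickle.
df = pd.DataFrame({
    'Content_Parsed': ['first parsed article text ...', 'second parsed article text ...'],
    'Category': ['category_a', 'category_b'],  # 41 distinct categories in total
    'Category_Code': [0, 1],                   # integer-encoded label (later normalized)
})
print(df)
```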
Also, I'm not sure what my model should look like, since I've been following two different examples, shown below.
I tried modifying my model based on those two examples, but the accuracy doesn't change and even drops every epoch.
Should I add more layers to my model, or have I done something silly without realizing it?
Note: If the 'df.pickle' download link is broken, use this link.
```python
from sklearn.model_selection import train_test_split
from urllib.request import urlopen
from os.path import exists
from os import mkdir
import tensorflow as tf
import pandas as pd
import pickle

# Define dataframe path
df_path = 'df.pickle'

# Check if local dataframe exists
if not exists(df_path):
    # Download binary from dropbox
    content = urlopen('https://ucd92a22d5e0d4d29b8edb608305.dl.dropboxusercontent.com/cd/0/get/Askx_25n3JI-jmnZsWXmMmRgd4O2EH1w9l0U6zCMq7xdSXs_IN_i2zuUviseqa9N7-WrReFbGhQi8CeseV5cNsFTO8dzRmSdxjr-MWEDQNpPaZ8Ik29E_58YAjY57qTc4CA/file#').read()

    # Write to file
    with open(df_path, 'wb') as file:
        file.write(content)

    # Load the dataframe from bytes
    df = pickle.loads(content)
# If the file exists (aka. downloaded)
else:
    # Load the dataframe from file
    df = pickle.load(open(df_path, 'rb'))

# Normalize the category
df['Category_Code'] = df['Category_Code'].apply(lambda x: x / 41)

train_df, test_df = [pd.DataFrame() for _ in range(2)]
x_train, x_test, y_train, y_test = train_test_split(df['Content_Parsed'], df['Category_Code'], test_size=0.15, random_state=8)
train_df['Content_Parsed'], train_df['Category_Code'] = x_train, y_train
test_df['Content_Parsed'], test_df['Category_Code'] = x_test, y_test

# Variable containing the number of words we want to keep in our vocabulary
NUM_WORDS = 10000

# Input/Token length
SEQ_LEN = 300

# Create tokenizer for our data
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=NUM_WORDS, oov_token='<UNK>')
tokenizer.fit_on_texts(train_df['Content_Parsed'])

# Convert text data to numerical indexes
train_seqs = tokenizer.texts_to_sequences(train_df['Content_Parsed'])
test_seqs = tokenizer.texts_to_sequences(test_df['Content_Parsed'])

# Pad data up to SEQ_LEN (note that we truncate if there are more than SEQ_LEN tokens)
train_seqs = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, maxlen=SEQ_LEN, padding="post")
test_seqs = tf.keras.preprocessing.sequence.pad_sequences(test_seqs, maxlen=SEQ_LEN, padding="post")

# Create Models folder if not exists
if not exists('Models'):
    mkdir('Models')

# Define local model path
model_path = 'Models/model.pickle'

# Check if model exists/pre-trained
if not exists(model_path):
    # Define word embedding size
    EMBEDDING_SIZE = 16

    # Create new model
    '''
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_SIZE),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(EMBEDDING_SIZE)),
        # tf.keras.layers.Dense(EMBEDDING_SIZE, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    '''
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_SIZE),
        # tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(EMBEDDING_SIZE)),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(EMBEDDING_SIZE, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    # Stop training when a monitored quantity has stopped improving.
    es = tf.keras.callbacks.EarlyStopping(monitor='val_acc', mode='max', patience=1)

    # Define batch size (Can be tuned to improve model accuracy)
    BATCH_SIZE = 16

    # Define number of cycles to train
    EPOCHS = 20

    # Using GPU (If this errors, you don't have a GPU; use CPU instead)
    with tf.device('/GPU:0'):
        # Train/Fit the model
        history = model.fit(
            train_seqs,
            train_df['Category_Code'].values,
            batch_size=BATCH_SIZE,
            epochs=EPOCHS,
            validation_split=0.2,
            validation_steps=30,
            callbacks=[es]
        )

        # Evaluate the model
        model.evaluate(test_seqs, test_df['Category_Code'].values)

    # Save the model into a file
    with open(model_path, 'wb') as file:
        file.write(pickle.dumps(model))
else:
    # Load the model
    model = pickle.load(open(model_path, 'rb'))

# Check the model
model.summary()
```
Answer:

After two days of tweaking and studying more examples, I found this website, which explains multi-class classification very well.

The details of the changes I made are as follows:
- Since I'm building a model for multiple categories, the model should use `categorical_crossentropy` as its loss function when compiling, not `binary_crossentropy` (a short sketch of this setup follows the list).
- The model should produce an output with the same length as the total number of classes being classified, which in my case is 41 (one-hot encoding).
- The activation function of the last layer should be `"softmax"`, since we pick the label with the highest confidence (the one closest to `1.0`).
- You need to adjust your layers according to the number of classes you're classifying. See here for how to improve your model.
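To make the list above concrete, here is a minimal sketch of the output layer, loss, and label encoding for 41 classes; the vocabulary and embedding sizes are placeholders, not the exact values of my final model:

```python
import tensorflow as tf

NUM_CLASSES = 41       # total number of categories
MAX_NUM_WORDS = 50000  # vocabulary size (placeholder)
EMBEDDING_SIZE = 256   # embedding size (placeholder)

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(MAX_NUM_WORDS, EMBEDDING_SIZE),
    tf.keras.layers.GlobalAveragePooling1D(),
    # One output unit per class; softmax turns them into a probability distribution
    tf.keras.layers.Dense(NUM_CLASSES, activation='softmax'),
])

# categorical_crossentropy expects one-hot labels, e.g. Y = pd.get_dummies(df['Category']).values;
# sparse_categorical_crossentropy would accept the raw integer codes instead.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
```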
My final code looks like this:
```python
from sklearn.model_selection import train_test_split
from urllib.request import urlopen
from functools import reduce
from os.path import exists
from os import listdir
from sys import exit
import tensorflow as tf
import pandas as pd
import pickle
import re

# Specify dataframe path
df_path = 'df.pickle'

# Check if the file exists
if not exists(df_path):
    # Specify url of the dataframe binary
    url = 'https://www.dropbox.com/s/76hibe24hmpz3bk/df.pickle?dl=1'

    # Read the byte content from url
    content = urlopen(url).read()

    # Write to a file to save time (the downloaded bytes are already a pickle, so write them as-is)
    with open(df_path, 'wb') as file:
        file.write(content)

    # Unpickle the dataframe
    df = pickle.loads(content)
else:
    # Load the pickled dataframe
    df = pickle.load(open(df_path, 'rb'))

# Useful variables
MAX_NUM_WORDS = 50000   # Vocabulary size for our tokenizer
MAX_SEQ_LENGTH = 600    # Maximum length of tokens (for padding later)
EMBEDDING_SIZE = 256    # Embedding size (Tweak to improve accuracy)
OUTPUT_LENGTH = len(df['Category'].unique())  # Number of classes to be classified

# Create our tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS, lower=True)

# Fit our tokenizer with words/tokens
tokenizer.fit_on_texts(df['Content_Parsed'].values)

# Get our token vocabulary
word_index = tokenizer.word_index
print('Found {} unique tokens'.format(len(word_index)))

# Parse our text into sequences of numbers using our tokenizer
X = tokenizer.texts_to_sequences(df['Content_Parsed'].values)

# Pad the sequences up to MAX_SEQ_LENGTH
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=MAX_SEQ_LENGTH)
print('Shape of feature tensor: {}'.format(X.shape))

# Convert our labels into dummy variables (More info on the link provided above)
Y = pd.get_dummies(df['Category']).values
print('Shape of label tensor: {}'.format(Y.shape))

# Split our features and labels into test and train datasets
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

# Creating our model
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(MAX_NUM_WORDS, EMBEDDING_SIZE, input_length=MAX_SEQ_LENGTH))
model.add(tf.keras.layers.SpatialDropout1D(0.2))
# The number 64 could be changed based on your model performance
model.add(tf.keras.layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2))
# Our output layer with length equal to OUTPUT_LENGTH
model.add(tf.keras.layers.Dense(OUTPUT_LENGTH, activation='softmax'))

# Compile our model with "categorical_crossentropy" loss function
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model variables
EPOCHS = 100      # Number of cycles to run (The early stopping may stop the training process accordingly)
BATCH_SIZE = 64   # Batch size (Tweaking this may improve model performance a bit)
checkpoint_path = 'model_checkpoints'  # Checkpoint path of our model

# Use GPU if available
with tf.device('/GPU:0'):
    # Fit/Train our model
    history = model.fit(
        x_train,
        y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_split=0.1,
        callbacks=[
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001),
            tf.keras.callbacks.ModelCheckpoint(
                checkpoint_path,
                monitor='val_acc',
                save_best_only=True,
                save_weights_only=False
            )
        ],
        verbose=1
    )
```
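After training, the softmax output can be turned back into a category label with argmax. A small usage sketch, reusing `tokenizer`, `model`, `df`, and `MAX_SEQ_LENGTH` from the script above (the input text here is just a placeholder):

```python
import numpy as np

# Tokenize and pad a new article the same way as the training data
new_seqs = tokenizer.texts_to_sequences(['some unseen article text ...'])
new_seqs = tf.keras.preprocessing.sequence.pad_sequences(new_seqs, maxlen=MAX_SEQ_LENGTH)

# model.predict returns one row of OUTPUT_LENGTH softmax probabilities per sample;
# argmax picks the class index with the highest confidence.
probs = model.predict(new_seqs)
predicted_index = int(np.argmax(probs, axis=1)[0])

# Map the index back to a category name using the same column order
# that pd.get_dummies(df['Category']) produced for the labels.
category_names = pd.get_dummies(df['Category']).columns
print(category_names[predicted_index])
```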