LSTM implementation / overfitting

I have run into problems while implementing an LSTM, and I am not sure whether my implementation is wrong or whether this is just overfitting. I am using the LSTM for essay scoring, grading texts on a scale from 0 to 10 (or some other score range). Among other training data, I used the ASAP Kaggle competition dataset.

However, the main goal is to achieve good performance on a private dataset of roughly 500 samples, which cover both the training and the validation set. In earlier experiments I managed to get the model to learn, but after tweaking a few things the model no longer fits at all: it simply stops improving. I also re-implemented the code in a cleaner, more object-oriented style, but I still cannot reproduce the earlier results.

I did eventually get the model to fit my data, only with severe overfitting. I am not sure whether this is some kind of implementation problem or just overfitting, but I cannot get the model to work properly. On the first essay set of the ASAP data, the best I can reach with the LSTM is a kappa of 0.35, while, for some strange reason, a single fully connected layer reaches a kappa of 0.75. I suspect an implementation problem, but I am not certain.
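For reference, quadratic weighted kappa (the ASAP competition metric) is available in scikit-learn by passing weights="quadratic" to cohen_kappa_score; note that the training code below calls cohen_kappa_score without a weights argument, i.e. plain unweighted kappa. A minimal sketch with made-up scores:

from sklearn.metrics import cohen_kappa_score

# hypothetical gold and predicted scores on the 0-60 scale used in train.py
y_true = [12, 30, 45, 60, 24, 18]
y_pred = [12, 36, 42, 54, 24, 24]

qwk = cohen_kappa_score(y_true, y_pred, weights="quadratic")
print(f"quadratic weighted kappa: {qwk:.3f}")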

Here is my old code:

train.py

import gensim
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import cohen_kappa_score
from torch import nn
import torch.utils.data as data_utils
from torch.optim import Adam

from dataset import AESDataset
from network import Network
from optimizer import Ranger
from qwk import quadratic_weighted_kappa, kappa

batch_size = 32
device = "cuda:0"
torch.manual_seed(1000)

# Load data from csv
file_name = "data/data_new.csv"
data = pd.read_csv(file_name)
arr = data.to_numpy()
text = arr[:, :2]
text = [str(line[0]) + str(line[1]) for line in text]
text = [gensim.utils.simple_preprocess(line) for line in text]
score = arr[:, 2]
score = [sco * 6 for sco in score]
score = np.asarray(score, dtype=int)

train_dataset = AESDataset(text_arr=text[:400], scores=score[:400])
test_dataset = AESDataset(text_arr=text[400:], scores=score[400:])

score = torch.tensor(score).view(-1, 1).long().to(device)

train_loader = data_utils.DataLoader(train_dataset, shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = data_utils.DataLoader(test_dataset, shuffle=True, batch_size=batch_size, drop_last=True)

out_class = 61
epochs = 1000

model = Network(out_class).to(device)
model.load_state_dict(torch.load("model/best_model"))

y_onehot = torch.FloatTensor(batch_size, out_class).to(device)

optimizer = Adam(model.parameters())
criti = torch.nn.CrossEntropyLoss()
# model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

step = 0

for i in range(epochs):
    # Testing
    if i % 1 == 0:
        total_loss = 0
        total_kappa = 0
        total_batches = 0
        model.eval()
        for (text, score) in test_loader:
            out = model(text)
            out_score = torch.argmax(out, 1)
            y_onehot.zero_()
            y_onehot.scatter_(1, score, 1)
            kappa_l = cohen_kappa_score(score.view(batch_size).tolist(), out_score.view(batch_size).tolist())
            score = score.view(-1)
            loss = criti(out, score.view(-1))
            total_loss += loss
            total_kappa += kappa_l
            total_batches += 1
        print(f"Epoch {i} Testing kappa {total_kappa/total_batches} loss {total_loss/total_batches}")
        with open(f"model/epoch_{i}", "wb") as f:
            torch.save(model.state_dict(), f)
        model.train()

    # Training
    for (text, score) in train_loader:
        optimizer.zero_grad()
        step += 1
        out = model(text)
        out_score = torch.argmax(out, 1)
        y_onehot.zero_()
        y_onehot.scatter_(1, score, 1)
        kappa_l = cohen_kappa_score(score.view(batch_size).tolist(), out_score.view(batch_size).tolist())
        loss = criti(out, score.view(-1))
        print(f"Epoch {i} step {step} kappa {kappa_l} loss {loss}")
        loss.backward()
        optimizer.step()
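One side detail in the evaluation loop above: total_loss += loss accumulates tensors that are still attached to the autograd graph, which wastes memory across the test set. A minimal sketch of the usual pattern, reusing the names from the listing (this is not a fix for the kappa problem itself):

model.eval()
with torch.no_grad():                  # disable gradient tracking during evaluation
    for (text, score) in test_loader:
        out = model(text)
        loss = criti(out, score.view(-1))
        total_loss += loss.item()      # .item() yields a plain float, detached from the graph
        total_batches += 1
model.train()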

dataset.py

import gensim
import torch
import numpy as np


class AESDataset(torch.utils.data.Dataset):
    def __init__(self, text_arr, scores):
        self.data = text_arr
        self.scores = scores
        # NOTE: the original line assigned the bare string ("w2vec_model_all");
        # a load call along these lines was presumably intended
        self.w2v_model = gensim.models.KeyedVectors.load("w2vec_model_all")
        self.max_len = 500

    def __getitem__(self, item):
        vector = []
        essay = self.data[item]
        pad_vec = [1 for i in range(300)]
        # pre-pad short essays so every sample has max_len time steps
        for i in range(self.max_len - len(essay)):
            vector.append(pad_vec)
        for word in essay:
            try:
                word_vec = self.w2v_model[word]
            except KeyError:
                # word not in the word2vec vocabulary: fall back to the padding vector
                word_vec = pad_vec
            vector.append(word_vec)
        vector = np.stack(vector)
        tensor = torch.tensor(vector[:self.max_len]).float().to("cuda")
        score = self.scores[item]
        score = torch.tensor(score).long().to("cuda").view(1)
        return tensor, score

    def __len__(self):
        return len(self.scores)
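A side note on the padding above: the dataset pre-pads with a vector of ones, and the LSTM still runs over those padding steps. If that turns out to matter, PyTorch can skip padding entirely with pack_padded_sequence; a rough sketch under assumed dimensions (post-padding with zeros, true lengths tracked per essay):

import torch
from torch.nn.utils.rnn import pack_padded_sequence

# hypothetical batch of embedded essays, post-padded with zeros: (batch, max_len, 300)
batch = torch.zeros(2, 500, 300)
lengths = torch.tensor([120, 87])  # true word count of each essay

lstm = torch.nn.LSTM(300, 500, 1, batch_first=True)
packed = pack_padded_sequence(batch, lengths, batch_first=True, enforce_sorted=False)
out, (h_n, c_n) = lstm(packed)
print(h_n[-1].shape)  # (2, 500): the state at each essay's true last word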

network.py

import torch.nn as nn
import torch
import torch.nn.functional as F


class Network(nn.Module):
    def __init__(self, output_size):
        super(Network, self).__init__()
        self.lstm = nn.LSTM(300, 500, 1, batch_first=True)
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(500, output_size)

    def forward(self, x):
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # last time step; with pre-padding this is the essay's last word
        x = self.dropout(x)
        x = self.linear(x)
        return x

My new code: https://github.com/Clement-Hui/EssayGrading


Answer:

I think the problem is in the training code: since you are using an LSTM, you should reset the hidden and cell state after every epoch and detach them from the computation graph after each batch, otherwise you backpropagate through the entire training history.
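As an aside, the detaching can also be written with detach(), which current PyTorch documentation favours over .data; a minimal sketch of the pattern (full listings follow below):

h = model.init_hidden(batch_size)             # fresh zero states at the start of the epoch
for text, score in train_loader:
    h = tuple(state.detach() for state in h)  # keep the values, cut the autograd graph
    out, h = model(text, h)
    # ... loss, backward and optimizer step as in train.py below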

network.py

import torch.nn as nn
import torch
import torch.nn.functional as F


class Network(nn.Module):
    def __init__(self, output_size):
        super(Network, self).__init__()
        self.lstm = nn.LSTM(300, 500, 1, batch_first=True)
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(500, output_size)

    def forward(self, x, hidden):
        x, hidden = self.lstm(x, hidden)
        # keep only the last time step so the output stays (batch, 500);
        # flattening all time steps with .contiguous().view(-1, 500) would
        # break the CrossEntropyLoss shapes in train.py
        x = x[:, -1, :]
        x = self.dropout(x)
        x = self.linear(x)
        return x, hidden

    def init_hidden(self, batch_size):
        # fresh zero-initialised hidden and cell states, matching the
        # dtype of the model parameters
        weights = next(self.parameters()).data
        hidden = (weights.new(1, batch_size, 500).zero_().cuda(),
                  weights.new(1, batch_size, 500).zero_().cuda())
        return hidden
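A quick sanity check of the revised module's shapes (dimensions taken from the code above, with the 61 output classes from train.py):

model = Network(output_size=61).cuda()
h = model.init_hidden(batch_size=32)
x = torch.randn(32, 500, 300).cuda()  # (batch, seq_len, embedding_dim)
out, h = model(x, h)
print(out.shape)                      # torch.Size([32, 61]), one logit vector per essay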

train.py

# your code for initialising the model, the data and everything else

for i in range(epochs):
    # Testing
    if i % 1 == 0:
        total_loss = 0
        total_kappa = 0
        total_batches = 0
        model.eval()
        val_h = model.init_hidden(batch_size)  # initialise the hidden state
        for (text, score) in test_loader:
            # create new variables for the hidden state, otherwise
            # we would backprop through the entire training history
            val_h = tuple([each.data for each in val_h])
            out, val_h = model(text, val_h)
            out_score = torch.argmax(out, 1)
            y_onehot.zero_()
            y_onehot.scatter_(1, score, 1)
            kappa_l = cohen_kappa_score(score.view(batch_size).tolist(), out_score.view(batch_size).tolist())
            score = score.view(-1)
            loss = criti(out, score.view(-1))
            total_loss += loss
            total_kappa += kappa_l
            total_batches += 1
        print(f"Epoch {i} Testing kappa {total_kappa/total_batches} loss {total_loss/total_batches}")
        with open(f"model/epoch_{i}", "wb") as f:
            torch.save(model.state_dict(), f)
    model.train()

    # Training
    h = model.init_hidden(batch_size)  # initialise the hidden state
    for (text, score) in train_loader:
        optimizer.zero_grad()
        step += 1
        # create new variables for the hidden state, otherwise
        # we would backprop through the entire training history
        h = tuple([each.data for each in h])
        out, h = model(text, h)
        out_score = torch.argmax(out, 1)
        y_onehot.zero_()
        y_onehot.scatter_(1, score, 1)
        kappa_l = cohen_kappa_score(score.view(batch_size).tolist(), out_score.view(batch_size).tolist())
        loss = criti(out, score.view(-1))
        print(f"Epoch {i} step {step} kappa {kappa_l} loss {loss}")
        loss.backward()
        optimizer.step()
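If the model still fails to improve after these changes, gradient clipping is a common companion fix for LSTM training, since it guards against exploding gradients. A hedged sketch of the two extra lines, with max_norm=5.0 as an assumed, typical value:

loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)  # rescale gradients whose norm exceeds 5.0
optimizer.step()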

Please let me know whether the changes mentioned above work.
