Training loss does not decrease for VGG16 implemented in PyTorch

I want to try some simple examples in PyTorch, but the training loss does not decrease during training.

Here is some information:

  • The model is VGG16, consisting of 13 convolutional layers and 3 fully connected layers.
  • The data is the CIFAR100 dataset from torchvision.
  • I chose cross-entropy as the loss function.

The code is as follows:

# encoding: utf-8
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision
import numpy as np


class VGG16(torch.nn.Module):
    def __init__(self, n_classes):
        super(VGG16, self).__init__()
        # construct model
        self.conv1_1 = nn.Conv2d(3, 64, 3, padding=1)
        self.conv1_2 = nn.Conv2d(64, 64, 3, padding=1)
        self.conv2_1 = nn.Conv2d(64, 128, 3, padding=1)
        self.conv2_2 = nn.Conv2d(128, 128, 3, padding=1)
        self.conv3_1 = nn.Conv2d(128, 256, 3, padding=1)
        self.conv3_2 = nn.Conv2d(256, 256, 3, padding=1)
        self.conv3_3 = nn.Conv2d(256, 256, 3, padding=1)
        self.conv4_1 = nn.Conv2d(256, 512, 3, padding=1)
        self.conv4_2 = nn.Conv2d(512, 512, 3, padding=1)
        self.conv4_3 = nn.Conv2d(512, 512, 3, padding=1)
        self.conv5_1 = nn.Conv2d(512, 512, 3, padding=1)
        self.conv5_2 = nn.Conv2d(512, 512, 3, padding=1)
        self.conv5_3 = nn.Conv2d(512, 512, 3, padding=1)
        self.fc6 = nn.Linear(512, 512)
        self.fc7 = nn.Linear(512, 512)
        self.fc8 = nn.Linear(512, n_classes)

    def forward(self, x):
        x = F.relu(self.conv1_1(x))
        x = F.relu(self.conv1_2(x))
        x = F.max_pool2d(x, (2, 2))
        x = F.relu(self.conv2_1(x))
        x = F.relu(self.conv2_2(x))
        x = F.max_pool2d(x, (2, 2))
        x = F.relu(self.conv3_1(x))
        x = F.relu(self.conv3_2(x))
        x = F.relu(self.conv3_3(x))
        x = F.max_pool2d(x, (2, 2))
        x = F.relu(self.conv4_1(x))
        x = F.relu(self.conv4_2(x))
        x = F.relu(self.conv4_3(x))
        x = F.max_pool2d(x, (2, 2))
        x = F.relu(self.conv5_1(x))
        x = F.relu(self.conv5_2(x))
        x = F.relu(self.conv5_3(x))
        x = F.max_pool2d(x, (2, 2))
        # flatten: 32x32 input shrinks to 1x1 after five 2x2 poolings, so 512 features
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc6(x))
        x = F.relu(self.fc7(x))
        x = self.fc8(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]
        num_features = 1
        for s in size:
            num_features *= s
        return num_features


if __name__ == '__main__':
    BATCH_SIZE = 128
    LOG_INTERVAL = 5

    # data
    transform = transforms.Compose([
        transforms.ToTensor()
    ])
    trainset = torchvision.datasets.CIFAR100(
        root='./data',
        train=True,
        download=True,
        transform=transform
    )
    testset = torchvision.datasets.CIFAR100(
        root='./data',
        train=False,
        download=True,
        transform=transform
    )
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
    testloader = torch.utils.data.DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)

    # model
    vgg16 = VGG16(100)
    vgg16.cuda()

    # optimizer
    optimizer = optim.SGD(vgg16.parameters(), lr=0.01)

    # loss
    criterion = nn.CrossEntropyLoss()

    def test():
        # evaluate accuracy on the test set
        print('———— Test Start ————')
        correct = 0
        total = 0
        with torch.no_grad():
            for test_x, test_y in testloader:
                images, labels = test_x.cuda(), test_y.cuda()
                output = vgg16(images)
                _, predicted = torch.max(output.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = 100 * correct / total
        print('Accuracy of the network is: %.4f %%' % accuracy)
        print('———— Test Finish ————')

    print('———— Train Start —————')
    for epoch in range(20):
        running_loss = 0.
        for step, (batch_x, batch_y) in enumerate(trainloader):
            batch_x, batch_y = batch_x.cuda(), batch_y.cuda()
            optimizer.zero_grad()
            output = vgg16(batch_x)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if step % LOG_INTERVAL == 0:
                print('[%d, %4d] loss: %.4f' % (epoch, step, running_loss / LOG_INTERVAL))
                running_loss = 0.
        # evaluate after each epoch
        test()
    print('———— Train Finish —————')

The loss stays at around 4.6060 and never decreases. I tried different learning rates, but it did not help.
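For context (this check is not from the original post): 4.6060 is almost exactly ln(100), which is the cross-entropy you get when the network spreads its prediction uniformly over all 100 classes. A loss stuck at that value means the model is still guessing at chance level, as a quick sanity check confirms:

import math
import torch
import torch.nn as nn

# Sanity check: with all-zero (uniform) logits over 100 classes,
# cross-entropy equals ln(100) regardless of the labels.
logits = torch.zeros(128, 100)            # uniform scores for a batch of 128
targets = torch.randint(0, 100, (128,))   # arbitrary labels
print(nn.CrossEntropyLoss()(logits, targets).item())  # ≈ 4.6052
print(math.log(100))                                  # 4.6052...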


Answer:

I noticed that you are not using batch normalization between your convolutional layers. I added batch-normalization layers and it seems to work. Here is the modified code:

class VGG16(torch.nn.Module):
    def __init__(self, n_classes):
        super(VGG16, self).__init__()
        # construct model: each conv layer is now followed by a BatchNorm2d layer
        self.conv1_1 = nn.Conv2d(3, 64, 3, padding=1)
        self.conv11_bn = nn.BatchNorm2d(64)
        self.conv1_2 = nn.Conv2d(64, 64, 3, padding=1)
        self.conv12_bn = nn.BatchNorm2d(64)
        self.conv2_1 = nn.Conv2d(64, 128, 3, padding=1)
        self.conv21_bn = nn.BatchNorm2d(128)
        self.conv2_2 = nn.Conv2d(128, 128, 3, padding=1)
        self.conv22_bn = nn.BatchNorm2d(128)
        self.conv3_1 = nn.Conv2d(128, 256, 3, padding=1)
        self.conv31_bn = nn.BatchNorm2d(256)
        self.conv3_2 = nn.Conv2d(256, 256, 3, padding=1)
        self.conv32_bn = nn.BatchNorm2d(256)
        self.conv3_3 = nn.Conv2d(256, 256, 3, padding=1)
        self.conv33_bn = nn.BatchNorm2d(256)
        self.conv4_1 = nn.Conv2d(256, 512, 3, padding=1)
        self.conv41_bn = nn.BatchNorm2d(512)
        self.conv4_2 = nn.Conv2d(512, 512, 3, padding=1)
        self.conv42_bn = nn.BatchNorm2d(512)
        self.conv4_3 = nn.Conv2d(512, 512, 3, padding=1)
        self.conv43_bn = nn.BatchNorm2d(512)
        self.conv5_1 = nn.Conv2d(512, 512, 3, padding=1)
        self.conv51_bn = nn.BatchNorm2d(512)
        self.conv5_2 = nn.Conv2d(512, 512, 3, padding=1)
        self.conv52_bn = nn.BatchNorm2d(512)
        self.conv5_3 = nn.Conv2d(512, 512, 3, padding=1)
        self.conv53_bn = nn.BatchNorm2d(512)
        self.fc6 = nn.Linear(512, 512)
        self.fc7 = nn.Linear(512, 512)
        self.fc8 = nn.Linear(512, n_classes)

    def forward(self, x):
        # conv -> batch norm -> ReLU for every convolution
        x = F.relu(self.conv11_bn(self.conv1_1(x)))
        x = F.relu(self.conv12_bn(self.conv1_2(x)))
        x = F.max_pool2d(x, (2, 2))
        x = F.relu(self.conv21_bn(self.conv2_1(x)))
        x = F.relu(self.conv22_bn(self.conv2_2(x)))
        x = F.max_pool2d(x, (2, 2))
        x = F.relu(self.conv31_bn(self.conv3_1(x)))
        x = F.relu(self.conv32_bn(self.conv3_2(x)))
        x = F.relu(self.conv33_bn(self.conv3_3(x)))
        x = F.max_pool2d(x, (2, 2))
        x = F.relu(self.conv41_bn(self.conv4_1(x)))
        x = F.relu(self.conv42_bn(self.conv4_2(x)))
        x = F.relu(self.conv43_bn(self.conv4_3(x)))
        x = F.max_pool2d(x, (2, 2))
        x = F.relu(self.conv51_bn(self.conv5_1(x)))
        x = F.relu(self.conv52_bn(self.conv5_2(x)))
        x = F.relu(self.conv53_bn(self.conv5_3(x)))
        x = F.max_pool2d(x, (2, 2))
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc6(x))
        x = F.relu(self.fc7(x))
        x = self.fc8(x)
        return x
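One caveat worth adding (not part of the original answer): once batch-norm layers are present, the model behaves differently in training and evaluation mode, because BatchNorm uses per-batch statistics in train mode and running averages in eval mode. The test() call in the training script above should therefore be wrapped in explicit mode switches:

# Sketch, assuming the training script from the question:
vgg16.eval()   # switch BatchNorm to running statistics before evaluating
test()
vgg16.train()  # back to training mode for the next epoch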

However, a more elegant version of the same model can be found here.
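As a rough illustration of what such a version can look like (a sketch in the style of torchvision's VGG builder, not necessarily the linked code itself), the repetitive layer definitions can be generated from a configuration list:

import torch.nn as nn

# A minimal sketch: build the VGG16-BN feature extractor from a config list
# instead of naming every layer by hand. 'M' marks a max-pooling step.
cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M',
       512, 512, 512, 'M', 512, 512, 512, 'M']

def make_layers(cfg):
    layers, in_channels = [], 3
    for v in cfg:
        if v == 'M':
            layers.append(nn.MaxPool2d(2, 2))
        else:
            layers += [nn.Conv2d(in_channels, v, 3, padding=1),
                       nn.BatchNorm2d(v),
                       nn.ReLU(inplace=True)]
            in_channels = v
    return nn.Sequential(*layers)

features = make_layers(cfg)  # followed by the fc6/fc7/fc8 classifier as before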
