我在尝试追踪整个数据集的误差,并在PyTorch中计算整个数据集的误差。我在cifar10 PyTorch 0.3.1中编写了以下(可复现的示例和完全包含的代码):
import torchfrom torch.autograd import Variableimport torch.optim as optimimport torchvisionimport torchvision.transforms as transformsfrom math import inffrom pdb import set_trace as stdef error_criterion(outputs,labels): max_vals, max_indices = torch.max(outputs,1) train_error = (max_indices != labels).sum().data[0]/max_indices.size()[0] return train_errordef evalaute_mdl_data_set(loss,error,net,dataloader,enable_cuda,iterations=inf): ''' 评估模型在特定数据集下的误差,使用特定的损失和误差标准。 ''' running_loss,running_error = 0,0 for i,data in enumerate(dataloader): if i >= iterations: break inputs, labels = extract_data(enable_cuda,data,wrap_in_variable=True) outputs = net(inputs) running_loss += loss(outputs,labels).data[0] running_error += error(outputs,labels) return running_loss/(i+1),running_error/(i+1)def extract_data(enable_cuda,data,wrap_in_variable=False): inputs, labels = data if enable_cuda: inputs, labels = inputs.cuda(), labels.cuda() #TODO 可能的加速方法? if wrap_in_variable: inputs, labels = Variable(inputs), Variable(labels) return inputs, labelsdef train_and_track_stats(enable_cuda, nb_epochs, trainloader,testloader, net,optimizer,criterion,error_criterion, iterations=inf): ''' 在训练前添加统计信息 ''' train_loss_epoch, train_error_epoch = evalaute_mdl_data_set(criterion, error_criterion, net, trainloader, enable_cuda, iterations) test_loss_epoch, test_error_epoch = evalaute_mdl_data_set(criterion, error_criterion, net, testloader, enable_cuda, iterations) print(f'[-1, -1], (训练损失: {train_loss_epoch}, 训练误差: {train_error_epoch}) , (测试损失: {test_loss_epoch}, 测试误差: {test_error_epoch})') ## ''' 开始训练 ''' print('即将开始训练') for epoch in range(nb_epochs): # 多次遍历数据集 running_train_loss,running_train_error = 0.0,0.0 for i,data_train in enumerate(trainloader): ''' 清零参数梯度 ''' optimizer.zero_grad() ''' 训练步骤 = 前向传播 + 反向传播 + 优化 ''' inputs, labels = extract_data(enable_cuda,data_train,wrap_in_variable=True) outputs = net(inputs) loss = criterion(outputs, labels) loss.backward() optimizer.step() running_train_loss += loss.data[0] running_train_error += error_criterion(outputs,labels) ''' 每轮结束时:收集统计信息''' train_loss_epoch, train_error_epoch = running_train_loss/(i+1), running_train_error/(i+1) #train_loss_epoch, train_error_epoch = evalaute_mdl_data_set(criterion,error_criterion,net,trainloader,enable_cuda,iterations) test_loss_epoch, test_error_epoch = evalaute_mdl_data_set(criterion,error_criterion,net,testloader,enable_cuda,iterations) print(f'[{epoch}, {i+1}], (训练损失: {train_loss_epoch}, 训练误差: {train_error_epoch}) , (测试损失: {test_loss_epoch}, 测试误差: {test_error_epoch})') return train_loss_epoch, train_error_epoch, test_loss_epoch, test_error_epochclass Flatten(torch.nn.Module): def forward(self, input): return input.view(input.size(0), -1)def main(): enable_cuda = True print('运行主函数') num_workers = 0 ''' 获取数据集 ''' batch_size_test = 10000 batch_size_train = 10000 data_path = './data' transform = [transforms.ToTensor(),transforms.Normalize( (0.5, 0.5, 0.5), (0.5, 0.5, 0.5) )] transform = transforms.Compose(transform) trainset = torchvision.datasets.CIFAR10(root=data_path, train=True,download=False, transform=transform) trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size_train,shuffle=True, num_workers=num_workers) testset = torchvision.datasets.CIFAR10(root=data_path, train=False,download=False, transform=transform) testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size_test,shuffle=False, num_workers=num_workers) ''' 获取模型 ''' net = torch.nn.Sequential( torch.nn.Conv2d(3,13,5), #(输入通道, 输出通道, 核大小), Flatten(), torch.nn.Linear(28*28*13, 13), torch.nn.Linear(13, 10) ) net.cuda() ''' 训练 ''' nb_epochs = 10 lr = 0.1 err_criterion = error_criterion criterion = torch.nn.CrossEntropyLoss() optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.0) train_and_track_stats(enable_cuda, nb_epochs, trainloader,testloader, net,optimizer,criterion,err_criterion, iterations=inf) ''' 完成 ''' print('完成')if __name__ == '__main__': main()
当我运行它时,我得到以下错误:
python my_cifar10.pyrunning main[-1, -1], (训练损失: 2.3172860145568848, 训练误差: 0.0054) , (测试损失: 2.317185878753662, 测试误差: 0.0038)即将开始训练[0, 5], (训练损失: 2.22599835395813, 训练误差: 0.015160000000000002) , (测试损失: 2.0623881816864014, 测试误差: 0.0066)[1, 5], (训练损失: 2.014406657218933, 训练误差: 0.00896) , (测试损失: 1.9619578123092651, 测试误差: 0.0195)[2, 5], (训练损失: 1.9428715705871582, 训练误差: 0.01402) , (测试损失: 1.918603539466858, 测试误差: 0.0047)[3, 5], (训练损失: 1.9434458494186402, 训练误差: 0.01192) , (测试损失: 1.9194672107696533, 测试误差: 0.0125)[4, 5], (训练损失: 1.8804980754852294, 训练误差: 0.00794) , (测试损失: 1.8549214601516724, 测试误差: 0.004)[5, 5], (训练损失: 1.8573726177215577, 训练误差: 0.010159999999999999) , (测试损失: 1.8625996112823486, 测试误差: 0.0158)[6, 5], (训练损失: 1.8454653739929199, 训练误差: 0.01524) , (测试损失: 1.8155865669250488, 测试误差: 0.0122)[7, 5], (训练损失: 1.8140610456466675, 训练误差: 0.01066) , (测试损失: 1.808283805847168, 测试误差: 0.0101)[8, 5], (训练损失: 1.8036894083023072, 训练误差: 0.00832) , (测试损失: 1.799634575843811, 测试误差: 0.007)[9, 5], (训练损失: 1.8023016452789307, 训练误差: 0.0077399999999999995) , (测试损失: 1.8030155897140503, 测试误差: 0.0114)完成
显然,这一定是错误的,因为测试误差几乎为零,而模型非常小且简单(1个卷积层,2个全连接层)。
代码看起来如此简单,我无法找出问题所在。我已经尝试了很多方法,调整了很多东西,花了几天时间。有什么新的建议可以尝试吗?
回答:
如果你的批次大小过大,使用你的代码,值
(max_indices == labels).sum()(max_indices != labels).sum()
加起来不会等于批次大小。这是因为你使用了torch.ByteTensor
,在求和时会对大于255的值进行溢出处理。
使用
(max_indices != labels).int().sum()
通过在求和前将Tensor转换为int
类型,可以解决这个问题。