I am trying to train a two-layer neural network to do simple linear interpolation of a discrete function. I have tried many different learning rates as well as different activation functions, but it seems nothing is being learned!
I have spent the past 6 hours trying to debug the following code, and it seems bug-free. What is going on?
from torch.utils.data import Dataset
import os
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import random

LOW_X = 255
MID_X = 40000
HIGH_X = 200000
LOW_Y = torch.Tensor([0, 0, 1])
MID_Y = torch.Tensor([0.2, 0.5, 0.3])
HIGH_Y = torch.Tensor([1, 0, 0])
BATCH_SIZE = 4

def x_to_tensor(x):
    if x <= MID_X:
        return LOW_Y + (x - LOW_X) * (MID_Y - LOW_Y) / (MID_X - LOW_X)
    if x <= HIGH_X:
        return MID_Y + (x - MID_X) * (HIGH_Y - MID_Y) / (HIGH_X - MID_X)
    return HIGH_Y

class XYDataset(Dataset):
    LENGTH = 10000

    def __len__(self):
        return self.LENGTH

    def __getitem__(self, idx):
        x = random.randint(LOW_X, HIGH_X)
        y = x_to_tensor(x)
        return x, y

class Interpolate(nn.Module):
    def __init__(self, num_outputs, hidden_size=10):
        super(Interpolate, self).__init__()
        self.hidden_size = hidden_size
        self.x_to_hidden = nn.Linear(1, hidden_size)
        self.hidden_to_out = nn.Linear(hidden_size, num_outputs)
        self.activation = nn.Tanh()  # I also tried Sigmoid and ReLU activations
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x):
        out = self.x_to_hidden(x)
        out = self.activation(out)
        out = self.hidden_to_out(out)
        out = self.softmax(out)
        return out

dataset = XYDataset()
trainloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE,
                                          shuffle=True, num_workers=4)
criterion = nn.MSELoss()

def train_net(net, epochs=10, lr=5.137871216190041e-05, l2_regularization=2.181622809797563e-12):
    optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=l2_regularization)
    net.train(True)
    running_loss = 0.0
    for epoch in range(epochs):
        for i, data in enumerate(trainloader):
            inputs, targets = data
            inputs, targets = torch.FloatTensor(inputs.float()).view(-1, 1), torch.FloatTensor(targets.float())
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if (len(trainloader) * epoch + i) % 200 == 199:
                running_loss = running_loss / (200 * BATCH_SIZE)
                print('[%d,%5d] loss: %.6f' % (epoch + 1, i + 1, running_loss))
                running_loss = 0.0

for i in range(-11, 3):
    net = Interpolate(num_outputs=3)
    train_net(net, lr=10**i, epochs=1)
    print('For learning rate {} the network output on low x values is {}'.format(i, net(torch.Tensor([255]).view(-1, 1))))
Answer:
Although your problem is quite simple, it is poorly scaled: x ranges from 255 to 200,000. This poor scaling leads to numerical instability and overall makes the training process needlessly unstable.
To overcome this technical issue, you simply need to scale your input to the [-1, 1] (or [0, 1]) range.
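For instance, a minimal sketch of such a fix, reusing the constants from your code (the scale_x helper is a name introduced here for illustration, not part of your code):

def scale_x(x):
    # hypothetical helper: linearly rescale raw x from [LOW_X, HIGH_X] to [0, 1]
    return (x - LOW_X) / (HIGH_X - LOW_X)

class XYDataset(Dataset):
    LENGTH = 10000

    def __len__(self):
        return self.LENGTH

    def __getitem__(self, idx):
        x = random.randint(LOW_X, HIGH_X)
        y = x_to_tensor(x)
        return scale_x(x), y  # the network now sees inputs in [0, 1]

The rest of the training loop can stay exactly as it is; only the magnitude of the inputs changes.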
Note that this kind of scaling is ubiquitous in deep learning: images are scaled to the [-1, 1] range (see, for example, torchvision.transforms.Normalize).
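For illustration, a typical torchvision preprocessing pipeline looks like the following (the mean/std values of 0.5 are a common convention, not something taken from your code):

import torchvision.transforms as transforms

transform = transforms.Compose([
    transforms.ToTensor(),                      # uint8 pixels [0, 255] -> floats [0.0, 1.0]
    transforms.Normalize(mean=(0.5, 0.5, 0.5),  # per-channel (x - mean) / std
                         std=(0.5, 0.5, 0.5)),  # maps [0, 1] to [-1, 1]
])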
To better understand the importance of input scaling, you can look at the mathematical analysis done in this paper.