As a PyTorch newcomer coming from TensorFlow, I'm not sure how to implement early stopping. My research found that PyTorch does not have a native way to do this. I also found torchsample, but for some reason I can't install it in my conda environment. Without it, is there a simple way to apply early stopping? Here is my current setup:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, r2_score


class RegressionDataset(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data

    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]

    def __len__(self):
        return len(self.X_data)


train_dataset = RegressionDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())
val_dataset = RegressionDataset(torch.from_numpy(X_val).float(), torch.from_numpy(y_val).float())
test_dataset = RegressionDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float())

# Model params
EPOCHS = 100
BATCH_SIZE = 1000
LEARNING_RATE = 0.001
NUM_FEATURES = np.shape(X_test)[1]

# Initialize DataLoaders
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE)


# Define neural network architecture
class MultipleRegression(nn.Module):
    def __init__(self, num_features):
        super(MultipleRegression, self).__init__()
        # Define architecture
        self.layer_1 = nn.Linear(num_features, 16)
        self.layer_2 = nn.Linear(16, 32)
        self.layer_3 = nn.Linear(32, 25)
        self.layer_4 = nn.Linear(25, 20)
        self.layer_5 = nn.Linear(20, 16)
        self.layer_out = nn.Linear(16, 1)
        self.relu = nn.ReLU()  # ReLU applied to all hidden layers

        # Initialize weights and biases
        nn.init.xavier_uniform_(self.layer_1.weight)
        nn.init.zeros_(self.layer_1.bias)
        nn.init.xavier_uniform_(self.layer_2.weight)
        nn.init.zeros_(self.layer_2.bias)
        nn.init.xavier_uniform_(self.layer_3.weight)
        nn.init.zeros_(self.layer_3.bias)
        nn.init.xavier_uniform_(self.layer_4.weight)
        nn.init.zeros_(self.layer_4.bias)
        nn.init.xavier_uniform_(self.layer_5.weight)
        nn.init.zeros_(self.layer_5.bias)
        nn.init.xavier_uniform_(self.layer_out.weight)
        nn.init.zeros_(self.layer_out.bias)

    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.relu(self.layer_2(x))
        x = self.relu(self.layer_3(x))
        x = self.relu(self.layer_4(x))
        x = self.relu(self.layer_5(x))
        x = self.layer_out(x)
        return x

    def predict(self, test_inputs):
        x = self.relu(self.layer_1(test_inputs))
        x = self.relu(self.layer_2(x))
        x = self.relu(self.layer_3(x))
        x = self.relu(self.layer_4(x))
        x = self.relu(self.layer_5(x))
        x = self.layer_out(x)
        return x


# Check for GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

model = MultipleRegression(NUM_FEATURES)
model.to(device)
print(model)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Dictionary to store loss per epoch for training and validation
loss_stats = {
    "train": [],
    "val": []
}

# Begin training
print("Begin Training")
for e in tqdm(range(1, EPOCHS + 1)):
    # Training
    train_epoch_loss = 0
    model.train()
    for X_train_batch, y_train_batch in train_loader:
        X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
        optimizer.zero_grad()
        y_train_pred = model(X_train_batch)
        train_loss = criterion(y_train_pred, y_train_batch.unsqueeze(1))
        train_loss.backward()
        optimizer.step()
        train_epoch_loss += train_loss.item()

    # Validation
    with torch.no_grad():
        val_epoch_loss = 0
        model.eval()
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            y_val_pred = model(X_val_batch)
            val_loss = criterion(y_val_pred, y_val_batch.unsqueeze(1))
            val_epoch_loss += val_loss.item()

    loss_stats["train"].append(train_epoch_loss / len(train_loader))
    loss_stats["val"].append(val_epoch_loss / len(val_loader))
    print(f"Epoch {e}: Train loss: {train_epoch_loss/len(train_loader):.5f} Val loss: {val_epoch_loss/len(val_loader):.5f}")

# Visualize loss
train_val_loss_df = pd.DataFrame.from_dict(loss_stats).reset_index().melt(id_vars=["index"]).rename(columns={"index": "epochs"})
plt.figure()
sns.lineplot(data=train_val_loss_df, x="epochs", y="value", hue="variable").set_title("Train-Val Loss/Epoch")

# Test model
y_pred_list = []
with torch.no_grad():
    model.eval()
    for X_batch, _ in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_pred_list.append(y_test_pred.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]
y_pred_list = [item for sublist in y_pred_list for item in sublist]
y_pred_list = np.array(y_pred_list)

mse = mean_squared_error(y_test, y_pred_list)
r_square = r2_score(y_test, y_pred_list)
print("Mean Squared Error :", mse)
print("R^2 :", r_square)
Answer:
A basic way to implement early stopping is to keep track of the best (lowest) validation loss obtained so far.
You can initialize a variable such as best_loss = float('inf') before the loop over epochs
(or you could track something else, such as the best loss per epoch).
Then, after each validation pass, do the following:
if val_loss < best_loss:
    best_loss = val_loss
    # At this point, also save a snapshot of the current model
    torch.save(model, 'my_model_best_loss.pth')
Then, if best_loss has not improved significantly after a certain number of training steps, or by the end of the epoch, or if val_loss keeps getting worse, break out of the loop and terminate training there.
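Putting that together with your existing loop, a minimal sketch might look like the following (the patience value, variable names, and checkpoint filename are just illustrative choices, not anything PyTorch prescribes; the training and validation passes are your code from the question):

best_val_loss = float("inf")   # lowest average validation loss seen so far
patience = 10                  # epochs to wait for an improvement before stopping
epochs_without_improvement = 0

for e in range(1, EPOCHS + 1):
    # ... run your training pass and validation pass exactly as above ...
    avg_val_loss = val_epoch_loss / len(val_loader)

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_without_improvement = 0
        # Snapshot the best model seen so far
        torch.save(model.state_dict(), "my_model_best_loss.pth")
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print(f"Stopping early at epoch {e}: no improvement for {patience} epochs.")
            break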
For implementing algorithms like early stopping (and your training loop in general), you may find it easier to try PyTorch Lightning (no affiliation, but it is much easier than rolling everything by hand).
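For reference, Lightning ships an EarlyStopping callback that wraps this pattern. A rough usage sketch, assuming your LightningModule logs a metric named "val_loss" via self.log in its validation_step (the patience of 5 is an arbitrary choice):

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping

# Stop training once the logged "val_loss" has not improved for 5 consecutive validation checks
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=5)
trainer = Trainer(max_epochs=EPOCHS, callbacks=[early_stop])
# trainer.fit(lightning_module, train_loader, val_loader)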