Numpy Backprop Cost is Not Decreasing

I'm writing a Python script that lets the user define the number of hidden layers in a fully connected neural network, along with the number of nodes in each layer.

The problem is that when I try it with a larger dataset, the error comes out as nan. I'm not sure why, but I also get this Python warning when running it on Google Colab:

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:64: RuntimeWarning: overflow encountered in exp
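The only np.exp calls in my script are inside nonlin (shown in full below), and with tanh on every layer the warning is easy to reproduce in isolation with a large, made-up input:

import numpy as np

x = np.array([1000.0])  # hypothetical large pre-activation

# manual tanh, as in nonlin below: np.exp(1000) overflows to inf,
# and inf/inf comes out as nan
t = (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))
print(t)           # [nan] (with overflow / invalid warnings)
print(np.tanh(x))  # [1.] -- np.tanh saturates without overflowing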

Here is the output for the small dataset, where the error does not occur…

Network Architecture:
----------------------------------------------------------------------------
Input Layer Number of Weights: 60
Hidden Layer 1 Number of Weights: 400
Output Layer Number of Weights: 20
----------------------------------------------------------------------------
Total Number of Weights:  480

Epoch: 1 ERROR: 8.148725708134741e-05
Epoch: 2 ERROR: 8.148670920765655e-05
Epoch: 3 ERROR: 8.14861613419593e-05
Epoch: 4 ERROR: 8.14856134840336e-05
Epoch: 5 ERROR: 8.148506563421254e-05
Epoch: 6 ERROR: 8.148451779205201e-05
Epoch: 7 ERROR: 8.148396995799612e-05
Epoch: 8 ERROR: 8.148342213176729e-05
Epoch: 9 ERROR: 8.148287431336554e-05
Epoch: 10 ERROR: 8.14823265030129e-05
Epoch: 11 ERROR: 8.148177870037632e-05
Epoch: 12 ERROR: 8.148123090584436e-05
Epoch: 13 ERROR: 8.148068311908396e-05
Epoch: 14 ERROR: 8.148013534031717e-05
Epoch: 15 ERROR: 8.147958756948848e-05

Done.
Final Accuracy: 99.99185204124305%

Prediction:
array([0.])

And here is the output when using sklearn's Boston dataset:

Network Architecture:
----------------------------------------------------------------------------
Input Layer Number of Weights: 260
Hidden Layer 1 Number of Weights: 400
Output Layer Number of Weights: 20
----------------------------------------------------------------------------
Total Number of Weights:  680

Epoch: 1 ERROR: nan
Epoch: 2 ERROR: nan
Epoch: 3 ERROR: nan
Epoch: 4 ERROR: nan
Epoch: 5 ERROR: nan
Epoch: 6 ERROR: nan
Epoch: 7 ERROR: nan
Epoch: 8 ERROR: nan
Epoch: 9 ERROR: nan
Epoch: 10 ERROR: nan
Epoch: 11 ERROR: nan
Epoch: 12 ERROR: nan
Epoch: 13 ERROR: nan
Epoch: 14 ERROR: nan
Epoch: 15 ERROR: nan

Done.
Final Accuracy: nan%

Prediction:
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:64: RuntimeWarning: overflow encountered in exp
array([nan])

Any help would be greatly appreciated! The full script follows…

# Python 3
import numpy as np

# silence divide/invalid warnings (overflow warnings are still printed)
np.seterr(divide='ignore', invalid='ignore')

class Model:
  def __init__(self, x, y, number_of_hidden_layers=2, number_of_hidden_nodes=30, quiet=False):
    self.x = x
    self.y = y
    self.number_of_hidden_layers = number_of_hidden_layers
    self.number_of_hidden_nodes = number_of_hidden_nodes
    self.input_layer_activation_function = "tanh"
    self.hidden_layer_activation_function = "tanh"
    self.output_layer_activation_function = "tanh"
    # making a random, reproducible seed
    np.random.seed(1)
    input_shape = self.x[0].shape[0]
    output_shape = self.y[0].shape[0]
    number_of_hidden_nodes = self.number_of_hidden_nodes
    number_of_hidden_layers = self.number_of_hidden_layers
    # init the full lists of hidden layers plus 2 for input and output
    # weights
    self.W = [None] * (number_of_hidden_layers + 2)
    # activations
    self.A = [None] * (number_of_hidden_layers + 2)
    # deltas
    self.D = [None] * (number_of_hidden_layers + 2)
    # weights initialized in [-1, +1)
    input_layer_weights = 2 * np.random.random((input_shape, number_of_hidden_nodes)) - 1
    self.W[0] = input_layer_weights
    # middle
    for i in range(number_of_hidden_layers):
      i += 1
      hidden_layer_weights = 2 * np.random.random((number_of_hidden_nodes, number_of_hidden_nodes)) - 1
      self.W[i] = hidden_layer_weights
    # output
    output_layer_weights = 2 * np.random.random((number_of_hidden_nodes, output_shape)) - 1
    self.W[len(self.W) - 1] = output_layer_weights
    if quiet == False:
      # show the architecture:
      print("Network Architecture:")
      print("----------------------------------------------------------------------------")
      total = 0
      for count, i in enumerate(self.W):
        total += (i.shape[0] * i.shape[1])
        if count == 0:
          print("Input Layer Number of Weights: " + str(i.shape[0] * i.shape[1]))
        elif count == (len(self.W) - 1):
          print("Output Layer Number of Weights: " + str(i.shape[0] * i.shape[1]))
        else:
          print("Hidden Layer " + str(count) + " Number of Weights: " + str(i.shape[0] * i.shape[1]))
      print("----------------------------------------------------------------------------")
      print("Total Number of Weights: ", total)
      print()

  # activation functions and their derivatives
  def nonlin(self, x, deriv, function):
    if function == "tanh":
      # manual tanh: np.exp overflows for large |x|
      t = (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))
      if deriv == True:
        dt = 1 - t ** 2
        return dt
      return t
    elif function == "sigmoid":
      if deriv == True:
        return x * (1 - x)
      return 1 / (1 + np.exp(-x))
    elif function == "leaky_relu":
      if deriv == True:
        dx = np.ones_like(x)
        dx[x < 0] = 0.01
        return dx
      return np.where(x > 0, x, x * 0.01)

  def predict(self, x):
    # forward pass
    input_layer_activation = self.nonlin(np.dot(x, self.W[0]), False, self.input_layer_activation_function)
    self.A[0] = input_layer_activation
    for i in range(self.number_of_hidden_layers):
      i += 1
      hidden_layer_activation = self.nonlin(np.dot(self.A[i - 1], self.W[i]), False, self.hidden_layer_activation_function)
    output_layer_activation = self.nonlin(np.dot(hidden_layer_activation, self.W[len(self.W) - 1]), False, self.output_layer_activation_function)
    print()
    print("Prediction:")
    return output_layer_activation

  # training
  def train(self, loss_function, epochs, alpha=0.001):
    for j in range(epochs):
      # forward pass
      input_layer_activation = self.nonlin(np.dot(self.x, self.W[0]), False, self.input_layer_activation_function)
      self.A[0] = input_layer_activation
      for i in range(self.number_of_hidden_layers):
        i += 1
        hidden_layer_activation = self.nonlin(np.dot(self.A[i - 1], self.W[i]), False, self.hidden_layer_activation_function)
        self.A[i] = hidden_layer_activation
      output_layer_activation = self.nonlin(np.dot(hidden_layer_activation, self.W[len(self.W) - 1]), False, self.output_layer_activation_function)
      self.A[len(self.A) - 1] = output_layer_activation
      # choose error in compile
      # so output_layer_activation is the prediction!!!
      if loss_function == "mse":
        error = (self.y - output_layer_activation) ** 2
      if loss_function == "mae":
        error = np.abs(self.y - output_layer_activation)
      if loss_function == "cce":
        output_layer_activation = np.clip(output_layer_activation, 1e-12, 1. - 1e-12)
        total_number = output_layer_activation.shape[0]
        error = -np.sum(self.y * np.log(output_layer_activation + 1e-9)) / total_number
      else:
        # note: this else pairs only with the "cce" check above, so for
        # "mse" and "mae" the error is overwritten with the plain difference
        error = self.y - output_layer_activation
      # print every n steps
      divis = epochs // 10
      if (j % divis) == 0:
        print('Epoch: ' + str(j + 1) + ' ERROR: ' + str(np.mean(np.abs(error))))
      # backwards pass
      output_delta = error * self.nonlin(output_layer_activation, True, self.output_layer_activation_function)
      self.D[0] = output_delta
      # setting working vars
      working_delta = output_delta
      past_layer_weights = self.W[len(self.W) - 1]
      for i in range(self.number_of_hidden_layers):
        working_index = i + 1
        hidden_layer_activation_error = working_delta.dot(past_layer_weights.T)
        hidden_layer_activation_delta = hidden_layer_activation_error * self.nonlin(self.A[len(self.A) - working_index - 1], True, self.hidden_layer_activation_function)
        self.D[working_index] = hidden_layer_activation_delta
        working_delta = hidden_layer_activation_delta
        past_layer_weights = self.W[len(self.W) - (working_index + 1)]
      input_layer_activation_error = self.D[working_index].dot(self.W[working_index].T)
      input_layer_activation_delta = input_layer_activation_error * self.nonlin(input_layer_activation, True, self.input_layer_activation_function)
      self.D[working_index + 1] = input_layer_activation_delta
      # update weights
      internal_alpha = alpha
      self.W[len(self.W) - 1] += input_layer_activation.T.dot(self.D[0]) * internal_alpha
      for i, z in enumerate(range(self.number_of_hidden_layers, 0, -1)):
        i += 1
        self.W[z] += self.A[i].T.dot(self.D[i]) * internal_alpha
      self.W[0] += self.x.T.dot(self.D[len(self.D) - 1]) * internal_alpha
    # ending print out
    print()
    print("Done.")
    print("Final Accuracy: " + str(np.abs((np.mean(np.abs(error))) - 1) * 100) + "%")

# inputs
x = np.array([[0,0,0], [1,1,1], [1,1,1], [0,0,0]])
# output
y = np.array([[0],[1],[1],[0]])

from sklearn.datasets import load_boston

boston = load_boston()
x = boston["data"]
y = boston["target"]
y = y.reshape((x.shape[0], 1))

model = Model(x, y, number_of_hidden_layers=1, number_of_hidden_nodes=20)
model.train("mse", 15, alpha=.001)
model.predict(x[0])

Answer:

This looks like a regression model in which every layer uses the tanh activation. Since tanh's output range is [-1, +1], you should use a relu-like activation for the final layer: the target values in sklearn's Boston dataset fall in the range [0, 50], which tanh can never reach.
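A minimal sketch of that change, reusing the Model class and the leaky_relu activation already defined in the question's script (the StandardScaler step is my own assumption, added to keep the hidden-layer pre-activations small enough that the manual exp-based tanh does not overflow):

from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler

boston = load_boston()
# scaling keeps np.dot(x, W) small, avoiding overflow in the manual tanh
x = StandardScaler().fit_transform(boston["data"])
y = boston["target"].reshape(-1, 1)

model = Model(x, y, number_of_hidden_layers=1, number_of_hidden_nodes=20)
# the output layer is no longer capped at [-1, +1]
model.output_layer_activation_function = "leaky_relu"
model.train("mse", 15, alpha=.001)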
