Inspired by Andrej Karpathy's blog, I wanted to build my own version of a recurrent neural network that predicts the next word instead of the next character. Because the number of distinct words in a text is very large, I used word2vec to represent each word as a vector (where similar words lie closer together in the vector space). The network should now learn to predict the next vector from the pattern of preceding vectors.
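A minimal sketch of that representation step (the hyperparameters and the query word 'money' are purely illustrative, not tuned choices):

from gensim.models import Word2Vec
from nltk.corpus import brown

# Train word2vec on the Brown corpus: each vocabulary word becomes a dense
# 30-dimensional vector, and semantically similar words end up nearby.
model = Word2Vec(brown.sents(), size=30, window=10, min_count=10, iter=15)
print model.wv.most_similar('money', topn=3)  # nearest neighbors in vector space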
One thing to note: Karpathy used a classifier, whereas I am trying a regression approach (with a squared-error cost).
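In other words, instead of a softmax over the vocabulary trained with cross-entropy, the network outputs a word vector directly and is trained against the target word's vector. A symbolic sketch contrasting the two objectives (the variable names here are illustrative, not taken from the script below):

import theano.tensor as T
import lasagne

# Classifier (Karpathy-style): a predicted distribution over the vocabulary,
# trained against integer word ids with cross-entropy.
probs = T.fmatrix('probs')        # shape (batch, vocab_size), rows sum to 1
word_ids = T.ivector('word_ids')  # one target word id per example
cost_classifier = T.mean(lasagne.objectives.categorical_crossentropy(probs, word_ids))

# Regression (this post): a predicted word2vec vector, trained against the
# target word's vector with squared error.
pred_vec = T.fmatrix('pred_vec')      # shape (batch, wordSize)
target_vec = T.fmatrix('target_vec')
cost_regression = T.sum(lasagne.objectives.squared_error(pred_vec, target_vec))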
My problem is that no matter how much I train it, the network predicts the output [0, 0, 0, ..., 0]. So I suspect there is something wrong with my training or prediction code (the average error does drop slightly during training, so at least some learning must be happening).
My full code is below in case anyone wants to run it (it uses the Brown corpus, so nltk must be installed for it to work).
This is my "beginner" project in Lasagne, so any advice is appreciated if I am doing something silly. Thanks in advance :)
from gensim.models import Word2Vec
import gensim
import sys
from datetime import timedelta
import matplotlib.pyplot as plt
from nltk.corpus import brown
import theano.tensor as T
import theano
import time
import numpy as np
from lasagne import layers
import lasagne
from lasagne.updates import nesterov_momentum
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE

def modelExcept(input, model, size):
    try:
        out = model[input]
        return out
    except Exception:
        out = np.zeros((size))
        print 'exception ' + str(input)
        return out

def plot_TSNE(model, nr_words=None):
    tsne = TSNE(n_components=2)
    if nr_words == None:
        X_tsne = tsne.fit_transform(model[model.wv.vocab][:])
    else:
        X_tsne = tsne.fit_transform(model[model.wv.vocab][0:nr_words])
    X_names = [key for key in model.wv.vocab]
    plt.figure()
    ax = plt.subplot(111)
    for i in range(X_tsne.shape[0]):
        plt.text(X_tsne[i, 0], X_tsne[i, 1], str(X_names[i]),
                 #color=plt.cm.Set1(y[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})
    plt.xticks([]), plt.yticks([])
    plt.draw()
    #plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
    #plt.show()

def getBatch(words_as_vecs, wordSize, totalwords, windowSize, BATCHSIZE):
    BatchIndexes = np.random.randint(0, totalwords-windowSize, size=BATCHSIZE)
    input = np.empty((BATCHSIZE, windowSize, wordSize), dtype=np.float32)
    target = np.empty((BATCHSIZE, wordSize), dtype=np.float32)
    for i in range(BATCHSIZE):
        k = BatchIndexes[i]
        input[i, :, :] = words_as_vecs[k:k+windowSize, :]
        target[i, :] = words_as_vecs[k+windowSize, :]
    return input, target

wordSize = 30
windowSize = 5
BATCHSIZE = 128
LEARNING_RATE = .1
Nr_EPOCHS = 100
NR_Predictions = 15

model_raw = Word2Vec(brown.sents(), workers=4, window=10, iter=15, size=wordSize, min_count=10)
#plot_TSNE(model_raw, None)
model = model_raw.wv  #trim model after training to save RAM
del model_raw

words_filtered = filter(lambda x: x in model.vocab, brown.words())  #filter away words that are not in vocabulary
words_as_vecs = np.asarray([modelExcept(word, model, wordSize) for word in words_filtered], dtype=np.float32)  #create all vector representations beforehand to save time!!
scaler = MinMaxScaler(feature_range=(0, 1))
words_as_vecs = scaler.fit_transform(words_as_vecs)

print 'creating neural net...'
Num_units_per_layer = 512
GRAD_CLIP = 100
l_in = lasagne.layers.InputLayer(shape=(None, None, wordSize))
l_LSTM1 = lasagne.layers.LSTMLayer(l_in, Num_units_per_layer, grad_clipping=GRAD_CLIP,
                                   nonlinearity=lasagne.nonlinearities.rectify)
l_drop1 = lasagne.layers.DropoutLayer(l_LSTM1, p=0.5)
l_LSTM2 = lasagne.layers.LSTMLayer(l_drop1, Num_units_per_layer, grad_clipping=GRAD_CLIP,
                                   nonlinearity=lasagne.nonlinearities.rectify, only_return_final=True)
l_drop2 = lasagne.layers.DropoutLayer(l_LSTM2, p=0.5)
l_shp = lasagne.layers.ReshapeLayer(l_drop2, (-1, Num_units_per_layer))
l_out = lasagne.layers.DenseLayer(l_shp, num_units=wordSize, W=lasagne.init.Normal(),
                                  nonlinearity=lasagne.nonlinearities.rectify)

target_vals = T.imatrix('target values')
net_out = lasagne.layers.get_output(l_out)
net_out_predict = lasagne.layers.get_output(l_out, deterministic=True)

#use squared error because the problem is now a regression problem
cost = T.sum(lasagne.objectives.squared_error(net_out, target_vals))
all_params = lasagne.layers.get_all_params(l_out, trainable=True)
updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)
net_train = theano.function([l_in.input_var, target_vals], cost, updates=updates, allow_input_downcast=True)
compute_cost = theano.function([l_in.input_var, target_vals], cost, allow_input_downcast=True)
net_predict = theano.function([l_in.input_var], net_out_predict, allow_input_downcast=True)

print 'creating testphrase...'
testphrase_vectors = np.empty((1, 5, wordSize), dtype=np.float32)
testphrase_vectors[0, :, :] = words_as_vecs[1:6, :]
testphrase_words = words_filtered[0:6]
#testphrase_words = brown.words()[0:6]

print 'training...'
avg_cost = 0
totalwords = len(words_filtered)
#totalwords = len(brown.words())
print_freq = totalwords/BATCHSIZE  #print example every epoch
nrItterations = Nr_EPOCHS*totalwords/BATCHSIZE
for i in range(nrItterations):
    inTrain, target = getBatch(words_as_vecs, wordSize, totalwords, windowSize, BATCHSIZE)
    avg_cost += net_train(inTrain, target)

    #generate text sample
    if (i % print_freq == 0) and (i != 0):
        print 'prediction of train'
        print 'average cost is {0}'.format(avg_cost/(BATCHSIZE*print_freq))
        avg_cost = 0

        generated_example = ' '.join(testphrase_words)
        testphrase_vectors_copy = testphrase_vectors
        for k in range(NR_Predictions):
            prediction = np.asarray(net_predict(testphrase_vectors_copy))
            prediction_unscaled = scaler.inverse_transform(prediction.reshape(1, -1)).reshape(-1)
            current_word = model.most_similar(positive=[prediction_unscaled], topn=1)

            generated_example = ' '.join((generated_example, current_word[0][0]))

            #insert new word in testphrase (and delete first)
            testphrase_vectors_copy[0, 0:-1, :] = testphrase_vectors_copy[0, 1:, :]
            testphrase_vectors_copy[0, -1, :] = model[current_word[0][0]]
            #print testphrase_vectors_copy

        print 'example nr. {}'.format(i/print_freq + 1)
        print generated_example
        print '\n \n'
Answer:
I finally found the error.
The problem is this line:
target_vals = T.imatrix('target values')
It should instead be:
target_vals = T.fmatrix('target values')
because my targets are floating-point values, not integers.
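This also explains why the symptom was all-zero predictions rather than a type error: with allow_input_downcast=True, Theano silently casts the float targets down to int32 to fit the imatrix, truncating everything the MinMaxScaler mapped into [0, 1] to 0. A minimal sketch of that behavior (the variable names are illustrative):

import numpy as np
import theano
import theano.tensor as T

# With an integer matrix and allow_input_downcast=True, float inputs are
# silently truncated toward zero instead of raising an error.
t_int = T.imatrix('t_int')
t_float = T.fmatrix('t_float')
f_int = theano.function([t_int], t_int, allow_input_downcast=True)
f_float = theano.function([t_float], t_float, allow_input_downcast=True)

targets = np.array([[0.12, 0.87, 0.45]], dtype=np.float32)
print f_int(targets)    # [[0 0 0]]  -- every scaled target collapses to 0
print f_float(targets)  # [[ 0.12  0.87  0.45]]  -- values preserved

With almost every target truncated to an exact zero, the squared-error cost pushes the network toward a constant all-zero output, while the cost still decreases slightly, which matches what I observed during training.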