我在将数据输入到嵌入层时看到了两种填充方式。
例如:
考虑两个句子:
word1 = “我是一个爱狗的人。”
word2 = “Krishni和Pradeepa都喜欢猫。”
word1_int = [1,2,3,4,5,6]
word2_int = [7,8,9,10,11,12,13]
将两个句子都填充到长度 8
填充方法1(在开头添加0)
word1_int = [0,0,1,2,3,4,5,6]
word2_int = [0,7,8,9,10,11,12,13]
填充方法2(在结尾添加0)
word1_int = [1,2,3,4,5,6,0,0]
word2_int = [7,8,9,10,11,12,13,0]
我正在使用20个新闻组数据集进行在线分类,目前我使用的是第一种方法来填充我的文本。
问题:在我的实现中,使用第一种方法相较于第二种方法有什么优势吗?
提前感谢您!
我的代码如下所示:
import os
from collections import Counter
from string import punctuation

import matplotlib as mplt
mplt.use('agg')  # Must be before importing matplotlib.pyplot or pylab!
import matplotlib.pyplot as plt
import nltk
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import LabelBinarizer

nltk.download('stopwords')

# Translation table that deletes every ASCII punctuation character in one pass.
_STRIP_PUNCTUATION = str.maketrans('', '', punctuation)


def _clean_post(post):
    """Return *post* as a single space-separated string of cleaned tokens.

    Punctuation and newlines are removed, purely alphabetic tokens are
    lower-cased, and empty or purely numeric tokens are dropped.
    """
    text = post.translate(_STRIP_PUNCTUATION)
    # NOTE(review): newlines are deleted rather than replaced by a space, so
    # the last word of one line is fused with the first word of the next.
    # Kept as-is because changing it alters the vocabulary and the maximum
    # message length that the default seq_length is based on.
    text = text.replace('\n', '')
    # Only purely alphabetic tokens are lower-cased; mixed tokens keep their case.
    tokens = [tok.lower() if tok.isalpha() else tok for tok in text.split(" ")]
    # NOTE: stop-word filtering (nltk stopwords) was tried and left disabled.
    return ' '.join(tok for tok in tokens if tok and not tok.isdigit())


def _left_pad(sequences, seq_length):
    """Return an int array of shape (len(sequences), seq_length), zero-padded on the left.

    Sequences longer than *seq_length* keep their first *seq_length* entries.
    """
    features = np.zeros([len(sequences), seq_length], dtype=int)
    for i, row in enumerate(sequences):
        if not row:
            # features[i, -0:] would address the whole row and fail to broadcast.
            continue
        trimmed = row[:seq_length]
        features[i, -len(trimmed):] = trimmed
    return features


def pre_process(seq_length=6577):
    """Load 20 Newsgroups and turn it into padded integer sequences.

    Args:
        seq_length: Width of the padded feature matrix. The default, 6577, is
            the maximum message length observed with this tokenisation.

    Returns:
        A tuple ``(features, labels, n_words)``: the left-padded integer
        matrix, the one-hot encoded targets, and the vocabulary size plus one
        (index 0 is reserved for padding).
    """
    newsgroups_data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
    print(len(newsgroups_data.data))

    words = []
    temp_post_text = []
    for post in newsgroups_data.data:
        cleaned = _clean_post(post)
        # An empty post yields [''], so the empty string becomes a vocabulary
        # entry and every message maps to at least one id.
        words += cleaned.split(" ")
        temp_post_text.append(cleaned)

    # Most frequent word gets id 1; id 0 is left free for padding.
    dictionary = Counter(words)
    sorted_split_words = sorted(dictionary, key=dictionary.get, reverse=True)
    vocab_to_int = {c: i for i, c in enumerate(sorted_split_words, 1)}

    message_ints = [
        [vocab_to_int[word] for word in message.split(" ")]
        for message in temp_post_text
    ]
    features = _left_pad(message_ints, seq_length)

    lb = LabelBinarizer()
    labels = lb.fit_transform(np.reshape(newsgroups_data.target, [-1]))
    return features, labels, len(sorted_split_words) + 1


def get_batches(x, y, batch_size=1):
    """Yield consecutive ``(x, y)`` slices of at most *batch_size* rows."""
    for start in range(0, len(y), batch_size):
        stop = start + batch_size
        yield x[start:stop], y[start:stop]


def plot(noOfWrongPred, dataPoints):
    """Plot the running error rate against the number of samples seen and save it as a PNG."""
    font_size = 14
    fig = plt.figure(dpi=100, figsize=(10, 6))
    mplt.rcParams.update({'font.size': font_size})
    plt.title("错误预测的分布", fontsize=font_size)
    plt.ylabel('错误率', fontsize=font_size)
    plt.xlabel('数据点数量', fontsize=font_size)
    plt.plot(dataPoints, noOfWrongPred, label='预测', color='blue', linewidth=1.8)
    plt.savefig('错误预测的分布.png')
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


def train_test():
    """Train an LSTM classifier online (predict each sample, then train on it).

    Each sample is first classified with the current weights, the running
    number of wrong predictions is updated, and then one optimisation step is
    taken on that same sample. The running error rate is plotted at the end.
    """
    features, labels, n_words = pre_process()
    print(features.shape)
    print(labels.shape)

    # Hyperparameters
    lstm_layers = 1
    batch_size = 1
    lstm_size = 200
    learning_rate = 0.01
    # Size of the embedding vectors (number of units in the embedding layer)
    embed_size = 300

    graph = tf.Graph()
    with graph.as_default():
        tf.set_random_seed(1)
        inputs_ = tf.placeholder(tf.int32, [None, None], name="inputs")
        labels_ = tf.placeholder(tf.float32, [None, None], name="labels")
        # output_keep_prob is the dropout applied to the RNN's outputs; it has
        # no effect on the calculation of the subsequent states.
        keep_prob = tf.placeholder(tf.float32, name="keep_prob")

        # Uniform initialisation in [-1, 1) (minval included, maxval excluded).
        embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1), trainable=True)
        embed = tf.nn.embedding_lookup(embedding, inputs_)
        print(embedding.shape)
        print(embed.shape)
        print(embed[0])

        def _make_cell():
            # One fresh cell per layer: reusing a single cell object for every
            # layer makes the layers share (or clash on) their variables.
            lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
            return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)

        cell = tf.contrib.rnn.MultiRNNCell([_make_cell() for _ in range(lstm_layers)])
        initial_state = cell.zero_state(batch_size, tf.float32)
        outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)

        # Only the output of the last time step feeds the classifier, which is
        # why the real tokens must sit at the end of the sequence (pre-padding).
        hidden = tf.layers.dense(outputs[:, -1], units=25, activation=tf.nn.relu)
        print(hidden.shape)
        logit = tf.contrib.layers.fully_connected(hidden, num_outputs=20, activation_fn=None)
        # Built once here; creating it inside the training loop would add a
        # new op to the graph for every sample.
        probabilities = tf.nn.softmax(logit)

        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=labels_))
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
        saver = tf.train.Saver()

    with tf.Session(graph=graph) as sess:
        tf.set_random_seed(1)
        sess.run(tf.global_variables_initializer())
        state = sess.run(initial_state)

        wrongPred = 0
        noOfWrongPreds = []
        dataPoints = []
        for iteration, (x, y) in enumerate(get_batches(features, labels, batch_size), 1):
            # Predict first, with dropout disabled, so the reported error
            # reflects the model rather than the dropout noise.
            predictions = sess.run(
                probabilities,
                feed_dict={inputs_: x, keep_prob: 1.0, initial_state: state},
            )
            print("----------------------------------------------------------")
            print("迭代次数: {}".format(iteration))
            # Only the first row is inspected; this assumes batch_size == 1.
            predicted_class = np.argmax(predictions[0], 0)
            true_class = np.argmax(y[0], 0)
            print(predicted_class)
            print(true_class)
            if predicted_class != true_class:
                wrongPred += 1
            print("错误预测数量: ", wrongPred)
            if iteration % 50 == 0:
                noOfWrongPreds.append(wrongPred / iteration)
                dataPoints.append(iteration)

            # Then train on the same sample, with dropout enabled.
            loss, _ = sess.run(
                [cost, optimizer],
                feed_dict={inputs_: x, labels_: y, keep_prob: 0.5, initial_state: state},
            )
            print("训练损失: {:.3f}".format(loss))

        # Make sure the target directory exists before writing the checkpoint.
        os.makedirs("checkpoints", exist_ok=True)
        saver.save(sess, "checkpoints/sentiment.ckpt")

        errorRate = wrongPred / len(labels)
        print("错误总数: ", wrongPred)
        print("错误率: ", errorRate)
        plot(noOfWrongPreds, dataPoints)


if __name__ == '__main__':
    train_test()
这是我用来填充所有句子的代码示例。
# Left-pad (pre-pad) every message with zeros up to seq_length tokens.
seq_length = 6577
num_messages = len(temp_post_text)
features = np.zeros([num_messages, seq_length], dtype=int)
for i, row in enumerate(message_ints):
    if not row:
        # An empty message stays all zeros; -len(row) would be 0 and
        # features[i, 0:] would address the whole row.
        continue
    # Messages longer than seq_length keep their first seq_length tokens.
    tokens = np.array(row)[:seq_length]
    # Write the tokens at the right-hand end so the zeros come first.
    features[i, -len(tokens):] = tokens
回答:
通常,当我们使用LSTM或RNN时,我们使用最终输出或隐藏状态,并将其传递以进行预测。您也在做同样的事情,如下行所示:
logit = tf.contrib.layers.fully_connected(hidden, num_outputs=20, activation_fn=None)
在这里,两种填充方法就有了区别。代码中用于预测的是最后一个时间步的输出(outputs[:, -1])。如果您使用第二种填充方法(后填充),LSTM在读完真实的词之后还要继续处理一长串填充的0,最终的隐藏状态主要由这些填充位置决定,前面真实词的信息会被冲刷掉;
而使用第一种方法(前填充),真实的词位于序列末尾,最终的隐藏状态能够正确反映句子的内容。