我在使用Keras实现情感分析模型。我已经创建并训练了模型,但现在我不确定如何对新数据进行预测,因为IMDB数据集已经是向量形式的(例如[22,33,4,…])。
那么,如何对新的句子如“我爱这部电影”进行预测呢?
from keras.callbacks import TensorBoard
from keras.datasets import imdb
from keras.layers import Convolution1D, Dense, Dropout, Flatten, LSTM
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing import sequence

# Load the IMDB dataset through Keras, with top_words set to 10000.
top_words = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# Pad every review sequence to the same length.
max_review_length = 1600
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

# Model: Keras embedding layer, then three 1-D convolutions, a flatten,
# and two fully connected layers (with dropout before each of them).
embedding_vecor_length = 300
model = Sequential([
    Embedding(top_words, embedding_vecor_length, input_length=max_review_length),
    Convolution1D(64, 3, padding='same'),
    Convolution1D(32, 3, padding='same'),
    Convolution1D(16, 3, padding='same'),
    Flatten(),
    Dropout(0.2),
    Dense(180, activation='sigmoid'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Log the run (including the graph) for TensorBoard.
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=3,
    callbacks=[tensorBoardCallback],
)

# Evaluate on the test set; scores[1] is the 'accuracy' metric requested in compile().
scores = model.evaluate(X_test, y_test, verbose=0)
print("准确率: %.2f%%" % (scores[1]*100))

model.save("trained_demo.h5")
回答:
你需要先获取“单词 → 索引”的映射字典。利用这个字典把句子中的每个单词转换成对应的整数索引,再把得到的序列填充到与训练数据相同的长度,最后交给模型进行预测。
from nltk import word_tokenize
from keras.preprocessing import sequence


def encode_review(text, word2index, num_words, index_from=3, start_char=1, oov_char=2):
    """Encode raw text the same way imdb.load_data() encodes its reviews.

    imdb.get_word_index() returns raw frequency ranks, but load_data() shifts
    every rank by ``index_from`` (ids 0, 1, 2 are reserved for padding, start
    and out-of-vocabulary), prepends ``start_char`` and replaces every id that
    is >= ``num_words`` with ``oov_char``. The defaults here mirror the
    load_data() defaults; pass different values if the training data was
    loaded with non-default arguments.

    Args:
        text: The sentence or review to encode.
        word2index: Mapping from lower-case word to frequency rank, as
            returned by imdb.get_word_index().
        num_words: Vocabulary size used when loading the training data.
        index_from: Offset added to every raw rank.
        start_char: Id placed at the beginning of the sequence.
        oov_char: Id used for unknown or too-rare words.

    Returns:
        A list of integer ids, not yet padded.
    """
    encoded = [start_char]
    # The vocabulary keys are lower-case, so normalise the input first.
    for word in word_tokenize(text.lower()):
        rank = word2index.get(word)
        word_id = oov_char if rank is None else rank + index_from
        # Ids outside the training vocabulary would index past the Embedding
        # table, so they are mapped to the OOV id as load_data() does.
        if word_id >= num_words:
            word_id = oov_char
        encoded.append(word_id)
    return encoded


# imdb, top_words, max_review_length and model come from the training script.
word2index = imdb.get_word_index()
test = encode_review("i love this movie", word2index, top_words)
test = sequence.pad_sequences([test], maxlen=max_review_length)
model.predict(test)