这是我的代码,已加载预训练的权重和嵌入矩阵
# Inference script for a Quora duplicate-question model.
#
# Loads a previously saved word-embedding matrix and trained model weights,
# tokenizes the question pairs found in QUESTION_PAIRS_FILE, rebuilds the
# network architecture, and writes the raw sigmoid predictions to "hero.csv".
from __future__ import print_function
import numpy as np
import pandas as pd
import csv, datetime, time, json
from zipfile import ZipFile
from os.path import expanduser, exists
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, TimeDistributed, Dense, Lambda, concatenate, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from keras.utils.data_utils import get_file
from keras import backend as K
from sklearn.model_selection import train_test_split

# Initialize global variables
KERAS_DATASETS_DIR = expanduser('~/.keras/datasets/')
QUESTION_PAIRS_FILE_URL = 'http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv'
QUESTION_PAIRS_FILE = 'test.csv'
GLOVE_ZIP_FILE_URL = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
GLOVE_ZIP_FILE = 'glove.840B.300d.zip'
GLOVE_FILE = 'glove.840B.300d.txt'
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'
MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 25
EMBEDDING_DIM = 300
MODEL_WEIGHTS_FILE = 'question_pairs_weights.h5'
VALIDATION_SPLIT = 0.1
TEST_SPLIT = 0.1
RNG_SEED = 13371447
NB_EPOCHS = 1
DROPOUT = 0.1
BATCH_SIZE = 32
OPTIMIZER = 'adam'

# Load the saved embedding matrix; the context manager closes the file
# handle once the array has been read.
with open(WORD_EMBEDDING_MATRIX_FILE, 'rb') as f:
    word_embedding_matrix = np.load(f)
with open(NB_WORDS_DATA_FILE, 'r') as f:
    nb_words = json.load(f)['nb_words']

print("Processing", QUESTION_PAIRS_FILE)

question1 = []
question2 = []
# newline='' lets the csv module handle line endings itself, so quoted
# fields that contain embedded newlines are parsed as a single value.
with open(KERAS_DATASETS_DIR + QUESTION_PAIRS_FILE, encoding='utf-8', newline='') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=',')
    for row in reader:
        question1.append(row['question1'])
        question2.append(row['question2'])

print('Question pairs: %d' % len(question1))
T1 = len(question1)
print(T1)

# Build tokenized word index
questions = question1 + question2
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)
question1_word_sequences = tokenizer.texts_to_sequences(question1)
question2_word_sequences = tokenizer.texts_to_sequences(question2)
word_index = tokenizer.word_index
print("Words in index: %d" % len(word_index))

# Prepare training data tensors (the word embedding matrix is loaded from
# WORD_EMBEDDING_MATRIX_FILE above rather than rebuilt here)
q1_data = pad_sequences(question1_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
q2_data = pad_sequences(question2_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)

# Define the model
# NOTE: from here on question1/question2 are rebound from the raw text lists
# to the model's input tensors.
question1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
question2 = Input(shape=(MAX_SEQUENCE_LENGTH,))

q1 = Embedding(nb_words + 1,
               EMBEDDING_DIM,
               weights=[word_embedding_matrix],
               input_length=MAX_SEQUENCE_LENGTH,
               trainable=False)(question1)
q1 = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(q1)
q1 = Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, ))(q1)

q2 = Embedding(nb_words + 1,
               EMBEDDING_DIM,
               weights=[word_embedding_matrix],
               input_length=MAX_SEQUENCE_LENGTH,
               trainable=False)(question2)
q2 = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(q2)
q2 = Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, ))(q2)

merged = concatenate([q1, q2])
# Four identical Dense -> Dropout -> BatchNormalization stages. Layers are
# created in the same order as before so the saved weights still line up.
for _ in range(4):
    merged = Dense(200, activation='relu')(merged)
    merged = Dropout(DROPOUT)(merged)
    merged = BatchNormalization()(merged)

is_duplicate = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[question1, question2], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer=OPTIMIZER, metrics=['accuracy'])
model.load_weights(MODEL_WEIGHTS_FILE)

# The sigmoid output is a value between 0 and 1 per question pair; it is
# written out unchanged, one row per pair.
temp = model.predict([q1_data, q2_data])
df = pd.DataFrame(temp)
df.to_csv("hero.csv", header=['is_duplicate'])
它给我的输出是这样的
test id  is_duplicate
0        0.585984
1        0.13437697
2        0.7458109
3        0.6282846
4        0.44692168
但我需要布尔值
test id  is_duplicate
0        1
1        0
2        1
3        1
请不要建议我对输出值进行四舍五入
我希望神经网络本身给我布尔值
这是可能的吗,能这样训练网络吗
如果可以,请建议我可以在代码中加入什么
提前感谢
回答:
网络不能直接输出布尔值:它的输出层使用了sigmoid激活函数,输出的是介于0和1之间的连续值,因此您只能对网络的输出进行阈值处理。假设您设置阈值为0.5,任何高于0.5的值将被分类为1,低于0.5的值将被分类为0,例如 `(temp > 0.5).astype(int)`。
此外,网络的输出值本身还可以作为网络对其预测结果“置信度”的估计。