I'm currently implementing a logistic regressor with TensorFlow, following TensorFlow's 'wide' tutorial: https://www.tensorflow.org/tutorials/wide
My code matches the tutorial almost exactly, yet when I run predict() on the model it guesses the negative class every time, which makes up roughly 77% of the data. How can I get the model to make at least some positive guesses? I'm not applying any regularization, so variance should be maximized. The documentation claims 84% accuracy, and I'm using exactly the same dataset. What could be going wrong? Here is the training code (with a sketch of how I'm checking the predictions after it):
import re
import tempfile

import pandas as pd
import tensorflow as tf


def train_logistic_model(training_path, response, predictors, num_labels):
    # Get csv
    df_train = pd.read_csv(training_path, header=0)

    # Sanitize column names
    unsanitized_column_names = df_train.columns.values
    column_names = []
    for col in unsanitized_column_names:
        column_names.append(re.sub('[^A-Za-z0-9]+', '', col))

    # Update dataframe with sanitized column names
    df_train = pd.read_csv(training_path, names=column_names, skiprows=1)

    # Slice off 10% of training data to test with
    split = int(len(df_train.index) * .9)
    df_test = df_train.loc[split:]
    df_train = df_train.loc[:split]

    response_name = column_names[response]
    LABEL_COLUMN = "label"
    df_train[LABEL_COLUMN] = (
        df_train[response_name].apply(lambda x: ">50K" in x)).astype(int)
    df_test[LABEL_COLUMN] = (
        df_test[response_name].apply(lambda x: ">50K" in x)).astype(int)
    del df_train[response_name]
    del df_test[response_name]

    # Remove NaN elements
    df_train = df_train.dropna(how='any', axis=0)
    df_test = df_test.dropna(how='any', axis=0)

    CATEGORICAL_COLUMNS = []
    CONTINUOUS_COLUMNS = []
    for key, value in predictors.items():
        if value == 'Categorical':
            CATEGORICAL_COLUMNS.append(column_names[key])
        elif value == 'Continuous':
            CONTINUOUS_COLUMNS.append(column_names[key])

    # Input builder function
    def input_fn(df):
        continuous_cols = {k: tf.constant(df[k].values)
                           for k in CONTINUOUS_COLUMNS}
        categorical_cols = {
            k: tf.SparseTensor(
                indices=[[i, 0] for i in range(df[k].size)],
                values=df[k].values,
                dense_shape=[df[k].size, 1])
            for k in CATEGORICAL_COLUMNS
        }
        # Merges the two dictionaries into one.
        feature_cols = {**continuous_cols, **categorical_cols}
        label = tf.constant(df[LABEL_COLUMN].values)
        return feature_cols, label

    def train_input_fn():
        return input_fn(df_train)

    def eval_input_fn_train():
        return input_fn(df_train)

    def eval_input_fn_test():
        return input_fn(df_test)

    cat_tensors = []
    for col in CATEGORICAL_COLUMNS:
        cat_tensors.append(tf.contrib.layers.sparse_column_with_hash_bucket(
            column_name=col, hash_bucket_size=100))

    cont_tensors = []
    for cont in CONTINUOUS_COLUMNS:
        cont_tensors.append(tf.contrib.layers.real_valued_column(cont))

    feature_columns = cat_tensors + cont_tensors

    model_dir = tempfile.mkdtemp()
    logistic_model = tf.contrib.learn.LinearClassifier(
        feature_columns=feature_columns,
        n_classes=num_labels,
        model_dir=model_dir)
    logistic_model.fit(input_fn=train_input_fn, steps=200)

    # Test the model on reserve data
    eval_result_test = logistic_model.evaluate(input_fn=eval_input_fn_test, steps=1)

    # Test the model on training data
    eval_result_train = logistic_model.evaluate(input_fn=eval_input_fn_train, steps=1)

    for key in sorted(eval_result_train):
        print("%s: %s" % (key, eval_result_train[key]))

    return eval_result_test, model_dir
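This is roughly how I'm inspecting the predictions (just a sketch, run inside the function above so that logistic_model and eval_input_fn_test are in scope):

    import collections

    # Sketch: count how often each class is predicted on the held-out slice.
    # list() handles predict() returning either an array or a generator,
    # depending on the tf.contrib.learn version.
    predicted_classes = list(logistic_model.predict(input_fn=eval_input_fn_test))
    print(collections.Counter(predicted_classes))  # every prediction comes back as class 0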
Answer:
I think you need to add crossed columns to make your linear model perform better.
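For example, something along these lines (a sketch based on the wide tutorial; "education" and "occupation" are assumed sanitized column names, so substitute whichever of your CATEGORICAL_COLUMNS you actually want to cross):

    # Sketch: build a crossed feature from two of the sparse columns.
    # The assumed names "education" and "occupation" must match entries
    # in CATEGORICAL_COLUMNS after your re.sub sanitization step.
    education = tf.contrib.layers.sparse_column_with_hash_bucket(
        "education", hash_bucket_size=1000)
    occupation = tf.contrib.layers.sparse_column_with_hash_bucket(
        "occupation", hash_bucket_size=1000)
    education_x_occupation = tf.contrib.layers.crossed_column(
        [education, occupation], hash_bucket_size=int(1e4))

    # Feed the crossed column to the classifier alongside the base columns.
    feature_columns = cat_tensors + cont_tensors + [education_x_occupation]

Crossed columns let the linear model learn weights for combinations of categorical values (as the wide tutorial does for education x occupation), which it cannot capture from the individual columns alone.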