I've run into a problem trying to use TensorFlow's tf.contrib.layers package for classification, and I can't make sense of it. From the examples I've looked at (for instance this one and its tutorial), the graph handling is done by the API, and I can download and run that same code in my environment with no issues.
However, when I run my own code, I get an error saying that my global step is not from the same graph as my loss, which seems strange: ValueError: Tensor("global_step:0", shape=(), dtype=int64_ref) must be from the same graph as Tensor("softmax_cross_entropy_loss/value:0", shape=(), dtype=float32).
The error occurs while constructing the train_op.
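For background (this is my reading of the error, not something from the tutorial): TensorFlow raises this ValueError whenever an op is asked to combine tensors that belong to two different tf.Graph objects. A tiny, self-contained sketch that triggers the same message, unrelated to my actual code:

import tensorflow as tf

# A tensor created in one graph...
g1 = tf.Graph()
with g1.as_default():
    a = tf.constant(1.0, name="a")

# ...mixed with a tensor from a second graph.
g2 = tf.Graph()
with g2.as_default():
    b = tf.constant(2.0, name="b")
    c = tf.add(a, b)  # ValueError: Tensor("a:0", ...) must be from the same graph as Tensor("b:0", ...)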
Here is my TensorFlow code (I do have some other code that handles loading the data, but it doesn't use anything from TensorFlow). Sorry the code is a bit of a mess right now: I've been pulling it apart trying to track down this error.
import numpy as np
import tensorflow as tf
from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
import data  # my data-loading module


def train(training_file, vocab_path, hidden_units=[10, 20, 10], estimator=tf.contrib.learn.DNNClassifier):
    """
    Given a training CSV file, train a Tensorflow neural network
    """
    training_set = data.load(training_file)

    vocab = tf.contrib.learn.preprocessing.VocabularyProcessor(data.DOC_LENGTH)
    vocab = vocab.restore(vocab_path)

    training_data = tf.one_hot(training_set.data, len(vocab.vocabulary_._mapping), dtype=tf.float32)
    training_targets = tf.constant(np.array(training_set.targets, dtype=np.int32))

    classifier = tf.contrib.learn.Estimator(model_fn=lambda features, targets, mode, params: model_fn(features, targets, mode, params, hidden_units))
    classifier.fit(input_fn=lambda: (training_data, training_targets), steps=2000)
    return classifier


def model_fn(features, targets, mode, params, hidden_units):
    if len(hidden_units) <= 0:
        raise ValueError("hidden_units must be an iterable of ints with length >= 1")

    # Define the network
    network = tf.contrib.layers.relu(features, hidden_units[0])
    for i in range(1, len(hidden_units)):
        network = tf.contrib.layers.relu(network, hidden_units[i])

    # Flatten the network
    network = tf.reshape(network, [-1, hidden_units[-1] * data.DOC_LENGTH])

    # Add dropout to enhance feature use
    network = tf.layers.dropout(inputs=network, rate=0.5, training=(mode == tf.contrib.learn.ModeKeys.TRAIN))

    # Calculate the logits
    logits = tf.contrib.layers.fully_connected(network, 15)

    loss = None
    train_op = None

    if mode != tf.contrib.learn.ModeKeys.INFER:
        targets = tf.cast(tf.one_hot(targets, 15, 1, 0), dtype=tf.float32)
        loss = tf.losses.softmax_cross_entropy(logits=logits, onehot_labels=targets)

    if mode == tf.contrib.learn.ModeKeys.TRAIN:
        # This train_op is what triggers the error
        train_op = tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=tf.train.get_global_step(),
            optimizer='Adam',
            learning_rate=0.01)

    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }

    return model_fn_lib.ModelFnOps(mode=mode, predictions=predictions, loss=loss, train_op=train_op)


def main(unusedargv):
    # ... parse arguments
    classifier = train(args.train_data, args.vocab)
    print(evaluate(classifier, args.train_data))
    print(evaluate(classifier, args.test_data))


if __name__ == "__main__":
    tf.app.run()
Here is the full stack trace:
File "categorize.py", line 126, in main
classifier = train(args.train_data, args.vocab)
File "categorize.py", line 39, in train
classifier.fit(input_fn=lambda: (training_data, training_targets), steps=2000)
File "/usr/local/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 280, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 426, in fit
loss = self._train_model(input_fn=input_fn, hooks=hooks)
File "/usr/local/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 934, in _train_model
model_fn_ops = self._call_legacy_get_train_ops(features, labels)
File "/usr/local/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 1003, in _call_legacy_get_train_ops
train_ops = self._get_train_ops(features, labels)
File "/usr/local/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 1162, in _get_train_ops
return self._call_model_fn(features, labels, model_fn_lib.ModeKeys.TRAIN)
File "/usr/local/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 1133, in _call_model_fn
model_fn_results = self._model_fn(features, labels, **kwargs)
File "categorize.py", line 37, in <lambda>
classifier = tf.contrib.learn.Estimator(model_fn=lambda features, targets, mode, params: model_fn(features, targets, mode, params, hidden_units))
File "categorize.py", line 73, in model_fn
learning_rate=0.01)
File "/usr/local/lib/python3.6/site-packages/tensorflow/contrib/layers/python/layers/optimizers.py", line 152, in optimize_loss
with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
File "/usr/local/Cellar/python3/3.6.0_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/contextlib.py", line 82, in __enter__
return next(self.gen)
File "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 1410, in variable_scope
g = ops._get_graph_from_inputs(values) # pylint: disable=protected-access
File "/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3968, in _get_graph_from_inputs
_assert_same_graph(original_graph_element, graph_element)
File "/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3907, in _assert_same_graph
"%s must be from the same graph as %s." % (item, original_item))
ValueError: Tensor("global_step:0", shape=(), dtype=int64_ref) must be from the same graph as Tensor("softmax_cross_entropy_loss/value:0", shape=(), dtype=float32).
Answer:
I've figured out the problem! It may be something specific to the Estimator interface, but basically I needed to move the definition of my TensorFlow variables into the Estimator. As far as I can tell, Estimator.fit builds its own graph and calls the input_fn inside it, so any tensors created beforehand live in a different graph from the loss and the global step. I ended up writing a method to do this, but it also works when I define the variables inside the lambda:
def train(training_file, vocab_path, hidden_units=[10, 20, 10]):
    """
    Given a training CSV file, train a Tensorflow neural network
    """
    training_set = data.load(training_file)

    vocab = tf.contrib.learn.preprocessing.VocabularyProcessor(data.DOC_LENGTH)
    vocab = vocab.restore(vocab_path)

    # Note: the variables are NOT defined here
    training_data = training_set.data
    training_targets = np.array(training_set.targets, dtype=np.int32)

    classifier = tf.contrib.learn.Estimator(model_fn=lambda features, targets, mode, params: model_fn(features, targets, mode, params, hidden_units))

    # Note: the variables ARE defined here, inside the input_fn
    classifier.fit(
        input_fn=lambda:
            (tf.one_hot(training_data, len(vocab.vocabulary_._mapping), dtype=tf.float32),
             tf.constant(training_targets)),
        steps=2000)
    return classifier
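For completeness, the method-based version I mentioned looks roughly like this (just a sketch; my_input_fn is an arbitrary name I'm using for illustration, the important part is that the tensors are only created when fit invokes the function):

def my_input_fn(training_data, training_targets, vocab):
    # These ops are created when Estimator.fit calls the input_fn,
    # i.e. in the same graph that holds the loss and the global step.
    features = tf.one_hot(training_data,
                          len(vocab.vocabulary_._mapping),
                          dtype=tf.float32)
    targets = tf.constant(training_targets)
    return features, targets

classifier.fit(
    input_fn=lambda: my_input_fn(training_data, training_targets, vocab),
    steps=2000)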