Skip to content

ERROR:tensorflow:Model diverged with loss = NaN. #11

@humf

Description

@humf

INFO:tensorflow:Using default config.
WARNING:tensorflow:Using temporary folder as model directory: /tmp/tmpzXWlhI
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc8d7933650>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_device_fn': None, '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/tmp/tmpzXWlhI', '_train_distribute': None, '_save_summary_steps': 100}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpzXWlhI/model.ckpt.
ERROR:tensorflow:Model diverged with loss = NaN.

NanLossDuringTrainingError                Traceback (most recent call last)
<ipython-input-...> in <module>()
32 optimizer=tf.train.AdamOptimizer(0.001))
33
---> 34 estimator.train(input_fn=train_input_fn, steps=200)
35 evaluation_input_fn = tf.contrib.timeseries.WholeDatasetInputFn(reader)
36 evaluation = estimator.evaluate(input_fn=evaluation_input_fn, steps=1)

/usr/local/anaconda2/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.pyc in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
364
365 saving_listeners = _check_listeners_type(saving_listeners)
--> 366 loss = self._train_model(input_fn, hooks, saving_listeners)
367 logging.info('Loss for final step: %s.', loss)
368 return self

/usr/local/anaconda2/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.pyc in _train_model(self, input_fn, hooks, saving_listeners)
1117 return self._train_model_distributed(input_fn, hooks, saving_listeners)
1118 else:
-> 1119 return self._train_model_default(input_fn, hooks, saving_listeners)
1120
1121 def _train_model_default(self, input_fn, hooks, saving_listeners):

/usr/local/anaconda2/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.pyc in _train_model_default(self, input_fn, hooks, saving_listeners)
1133 return self._train_with_estimator_spec(estimator_spec, worker_hooks,
1134 hooks, global_step_tensor,
-> 1135 saving_listeners)
1136
1137 def _train_model_distributed(self, input_fn, hooks, saving_listeners):

/usr/local/anaconda2/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.pyc in _train_with_estimator_spec(self, estimator_spec, worker_hooks, hooks, global_step_tensor, saving_listeners)
1334 loss = None
1335 while not mon_sess.should_stop():
-> 1336 _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
1337 return loss
1338

/usr/local/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.pyc in run(self, fetches, feed_dict, options, run_metadata)
575 feed_dict=feed_dict,
576 options=options,
--> 577 run_metadata=run_metadata)
578
579 def run_step_fn(self, step_fn):

/usr/local/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.pyc in run(self, fetches, feed_dict, options, run_metadata)
1051 feed_dict=feed_dict,
1052 options=options,
-> 1053 run_metadata=run_metadata)
1054 except _PREEMPTION_ERRORS as e:
1055 logging.info('An error was raised. This may be due to a preemption in '

/usr/local/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.pyc in run(self, *args, **kwargs)
1142 raise six.reraise(*original_exc_info)
1143 else:
-> 1144 raise six.reraise(*original_exc_info)
1145
1146

/usr/local/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.pyc in run(self, *args, **kwargs)
1127 def run(self, *args, **kwargs):
1128 try:
-> 1129 return self._sess.run(*args, **kwargs)
1130 except _PREEMPTION_ERRORS:
1131 raise

/usr/local/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.pyc in run(self, fetches, feed_dict, options, run_metadata)
1207 results=outputs[hook] if hook in outputs else None,
1208 options=options,
-> 1209 run_metadata=run_metadata))
1210 self._should_stop = self._should_stop or run_context.stop_requested
1211

/usr/local/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/basic_session_run_hooks.pyc in after_run(self, run_context, run_values)
633 if self._fail_on_nan_loss:
634 logging.error(failure_message)
--> 635 raise NanLossDuringTrainingError
636 else:
637 logging.warning(failure_message)

NanLossDuringTrainingError: NaN loss during training.

Metadata

Assignees

No one assigned

    Labels

    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions