From 2adffda3a6795fc85ab65239a136a184cbae911e Mon Sep 17 00:00:00 2001 From: EHTIISHAM Date: Sun, 24 Nov 2024 20:30:57 +0500 Subject: [PATCH 1/5] some files are updated but no estimator update finding any solution for it --- discrimination/run_discrimination.py | 4 +- lm/dataloader.py | 15 +-- lm/modeling.py | 152 ++++++++++++++------------- 3 files changed, 87 insertions(+), 84 deletions(-) diff --git a/discrimination/run_discrimination.py b/discrimination/run_discrimination.py index 86b7116c..c04cee9f 100644 --- a/discrimination/run_discrimination.py +++ b/discrimination/run_discrimination.py @@ -21,14 +21,14 @@ import numpy as np import tensorflow as tf -from tensorflow.python.lib.io import file_io +#from tensorflow.python.lib.io import file_io from lm.dataloader import classification_convert_examples_to_features, classification_input_fn_builder from lm.modeling import classification_model_fn_builder, GroverConfig from lm.utils import _save_np from sample.encoder import get_encoder -flags = tf.flags +flags = tf.compat.v1.flags FLAGS = flags.FLAGS diff --git a/lm/dataloader.py b/lm/dataloader.py index 283cb85e..04741653 100644 --- a/lm/dataloader.py +++ b/lm/dataloader.py @@ -19,7 +19,7 @@ def _decode_record(record, name_to_features): """Decodes a record to a TensorFlow example.""" - example = tf.parse_single_example(record, name_to_features) + example = tf.compat.v1.parse_single_example(record, name_to_features) # tf.Example only supports tf.int64, but the TPU only supports tf.int32. # So cast all int64 to int32. @@ -42,7 +42,7 @@ def input_fn(params): """The actual input function.""" batch_size = params["batch_size"] name_to_features = { - "input_ids": tf.FixedLenFeature([seq_length + 1], tf.int64), + "input_ids": tf.io.FixedLenFeature([seq_length + 1], tf.int64), } # For training, we want a lot of parallel reading and shuffling. @@ -57,6 +57,7 @@ def input_fn(params): # `sloppy` mode means that the interleaving is not exact. 
This adds # even more randomness to the training pipeline. + # tf.data.experimental.parallel_interleave will be removed in future versions need to use interleave instead d = d.apply( tf.data.experimental.parallel_interleave( tf.data.TFRecordDataset, @@ -91,13 +92,13 @@ def classification_convert_examples_to_features( chop_from_front_if_needed=True): """Convert a set of `InputExample`s to a TFRecord file.""" - writer = tf.python_io.TFRecordWriter(output_file) + writer = tf.io.TFRecordWriter(output_file) label_map = {label: i for i, label in enumerate(labels)} for (ex_index, example) in enumerate(examples): if ex_index % 10000 == 0: - tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + tf.compat.v1.logging.info("Writing example %d of %d" % (ex_index, len(examples))) # begin_summary is our [CLS] token tokens = example['ids'] + [encoder.begin_summary] @@ -134,9 +135,9 @@ def classification_input_fn_builder(input_file, seq_length, is_training, """Creates an `input_fn` closure to be passed to TPUEstimator.""" name_to_features = { - "input_ids": tf.FixedLenFeature([seq_length], tf.int64), - "label_ids": tf.FixedLenFeature([], tf.int64), - "is_real_example": tf.FixedLenFeature([], tf.int64), + "input_ids": tf.io.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.io.FixedLenFeature([], tf.int64), + "is_real_example": tf.io.FixedLenFeature([], tf.int64), } def input_fn(params): diff --git a/lm/modeling.py b/lm/modeling.py index 7e9e5330..c6b61303 100644 --- a/lm/modeling.py +++ b/lm/modeling.py @@ -84,7 +84,7 @@ def from_dict(cls, json_object): @classmethod def from_json_file(cls, json_file): """Constructs a `NewsConfig` from a json file of parameters.""" - with tf.gfile.GFile(json_file, "r") as reader: + with tf.io.gfile.GFile(json_file, "r") as reader: text = reader.read() return cls.from_dict(json.loads(text)) @@ -114,7 +114,7 @@ def mask_attention_for_ltr(attention_scores, attention_mask): def create_initializer(initializer_range=0.02): 
"""Creates a `truncated_normal_initializer` with the given range.""" - return tf.truncated_normal_initializer(stddev=initializer_range) + return tf.compat.v1.truncated_normal_initializer(stddev=initializer_range) def _attention_projection_and_transpose(x_flat, batch_size, seq_length, num_attention_heads, size_per_head, @@ -136,7 +136,7 @@ def _attention_projection_and_transpose(x_flat, batch_size, seq_length, num_atte (batch_size_seq_length, dim), size_per_head, num_attention_heads )) - projected = tf.layers.dense( + projected = tf.keras.layers.Dense( x_flat, num_attention_heads * size_per_head, name=name, @@ -212,8 +212,8 @@ def attention_layer(x_flat, attention_mask, batch_size, seq_length, size_per_hea # Multiply [batch_size, num_attention_heads, seq_length, size_per_head] with # [batch_size, num_attention_heads, size_per_head, seq_length+cached_length] -> # [batch_size, num_attention_heads, seq_length, seq_length+cached_length] - attention_scores = tf.matmul(query, key, transpose_b=True) - attention_scores = tf.multiply(attention_scores, + attention_scores = tf.linalg.matmul(query, key, transpose_b=True) + attention_scores = tf.math.multiply(attention_scores, 1.0 / math.sqrt(float(size_per_head))) attention_scores = mask_attention_for_ltr(attention_scores, attention_mask) attention_probs = tf.nn.softmax(attention_scores) @@ -226,13 +226,13 @@ def attention_layer(x_flat, attention_mask, batch_size, seq_length, size_per_hea # Multiply [batch_size, num_attention_heads, seq_length, seq_length+cached_length] with # [batch_size, num_attention_heads, seq_length+cached_length, size_per_head] -> # [batch_size, num_attention_heads, seq_length, size_per_head] -> - context_layer = tf.matmul(attention_probs, value) + context_layer = tf.linalg.matmul(attention_probs, value) # `context_layer` = [batch_size, seq_length, num_attention_heads, size_per_head] context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) context_layer = tf.reshape(context_layer, [batch_size * 
seq_length, num_attention_heads * size_per_head]) - context_layer_projected = tf.layers.dense( + context_layer_projected = tf.compat.v1.layers.dense( context_layer, num_attention_heads * size_per_head, kernel_initializer=create_initializer(initializer_range), @@ -255,7 +255,7 @@ def residual_mlp_layer(x_flat, intermediate_size, initializer_range=0.02, hidden batch_size_seq_length, hidden_size = get_shape_list(x_flat, expected_rank=2) x_norm = layer_norm(x_flat, name='mlp_ln0') - intermediate_output = tf.layers.dense( + intermediate_output = tf.compat.v1.layers.dense( x_norm, intermediate_size, activation=gelu, @@ -263,7 +263,7 @@ def residual_mlp_layer(x_flat, intermediate_size, initializer_range=0.02, hidden name='intermediate', ) - output_for_residual = tf.layers.dense( + output_for_residual = tf.compat.v1.layers.dense( intermediate_output, hidden_size, name='output', @@ -293,27 +293,27 @@ def embed(input_ids, """ (batch_size, seq_length) = get_shape_list(input_ids, expected_rank=2) - embedding_table = tf.get_variable( + embedding_table = tf.compat.v1.get_variable( name='word_embed', shape=[vocab_size, embedding_size], initializer=create_initializer(initializer_range), ) - assert_op = tf.assert_less_equal(tf.reduce_max(input_ids), vocab_size - 1) + assert_op = tf.compat.v1.assert_less_equal(tf.compat.v1.reduce_max(input_ids), vocab_size - 1) with tf.control_dependencies([assert_op]): if use_one_hot_embeddings: flat_input_ids = tf.reshape(input_ids, [-1]) one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) - output_flat = tf.matmul(one_hot_input_ids, embedding_table) + output_flat = tf.linalg.matmul(one_hot_input_ids, embedding_table) else: output_flat = tf.nn.embedding_lookup(embedding_table, input_ids) embedded_input = tf.reshape(output_flat, [batch_size, seq_length, embedding_size]) - assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) + assert_op = tf.compat.v1.assert_less_equal(seq_length, max_position_embeddings) with 
tf.control_dependencies([assert_op]): - full_position_embeddings = tf.get_variable( + full_position_embeddings = tf.compat.v1.get_variable( name='pos_embed', shape=[max_position_embeddings, embedding_size], initializer=create_initializer(initializer_range), @@ -335,7 +335,7 @@ def embed(input_ids, one_hot_pos_ids = tf.one_hot(flat_pos_ids, depth=max_position_embeddings) # [seq_length, full_position_embeddings], [full_position_embeddings, dim] - seq_embeds = tf.matmul(one_hot_pos_ids, full_position_embeddings) + seq_embeds = tf.linalg.matmul(one_hot_pos_ids, full_position_embeddings) embedded_input += seq_embeds[None] # embedded_input += tf.slice(full_position_embeddings[position_offset:], [0, 0], [seq_length, -1])[None] @@ -354,10 +354,10 @@ def _top_p_sample(logits, ignore_ids=None, num_samples=1, p=0.9): # TODO FIGURE OUT HOW TO DO THIS ON TPUS. IT'S HELLA SLOW RIGHT NOW, DUE TO ARGSORT I THINK """ - with tf.variable_scope('top_p_sample'): + with tf.compat.v1.variable_scope('top_p_sample'): batch_size, vocab_size = get_shape_list(logits, expected_rank=2) - probs = tf.nn.softmax(logits if ignore_ids is None else logits - tf.cast(ignore_ids[None], tf.float32) * 1e10, + probs = tf.nn.softmax(logits if ignore_ids is None else logits - tf.cast(ignore_ids[None], dtype =tf.float32) * 1e10, axis=-1) if isinstance(p, float) and p > 0.999999: @@ -366,13 +366,13 @@ def _top_p_sample(logits, ignore_ids=None, num_samples=1, p=0.9): return { 'probs': probs, 'sample': tf.random.categorical( - logits=logits if ignore_ids is None else logits - tf.cast(ignore_ids[None], tf.float32) * 1e10, + logits=logits if ignore_ids is None else logits - tf.cast(ignore_ids[None], dtype =tf.float32) * 1e10, num_samples=num_samples, dtype=tf.int32), } # [batch_size, vocab_perm] indices = tf.argsort(probs, direction='DESCENDING') - cumulative_probabilities = tf.math.cumsum(tf.batch_gather(probs, indices), axis=-1, exclusive=False) + cumulative_probabilities = 
tf.math.cumsum(tf.compat.v1.batch_gather(probs, indices), axis=-1, exclusive=False) # find the top pth index to cut off. careful we don't want to cutoff everything! # result will be [batch_size, vocab_perm] @@ -381,13 +381,13 @@ def _top_p_sample(logits, ignore_ids=None, num_samples=1, p=0.9): tf.logical_or(cumulative_probabilities < p_expanded, tf.range(vocab_size)[None] < 1)) # OPTION A - sample in the sorted space, then unsort. - logits_to_use = tf.batch_gather(logits, indices) - tf.cast(exclude_mask, tf.float32) * 1e10 + logits_to_use = tf.compat.v1.batch_gather(logits, indices) - tf.cast(exclude_mask, tf.float32) * 1e10 sample_perm = tf.random.categorical(logits=logits_to_use, num_samples=num_samples) - sample = tf.batch_gather(indices, sample_perm) + sample = tf.compat.v1.batch_gather(indices, sample_perm) # OPTION B - unsort first - Indices need to go back to 0 -> N-1 -- then sample # unperm_indices = tf.argsort(indices, direction='ASCENDING') - # include_mask_unperm = tf.batch_gather(include_mask, unperm_indices) + # include_mask_unperm = tf.compat.v1.batch_gather(include_mask, unperm_indices) # logits_to_use = logits - (1 - tf.cast(include_mask_unperm, tf.float32)) * 1e10 # sample = tf.random.categorical(logits=logits_to_use, num_samples=num_samples, dtype=tf.int32) @@ -408,7 +408,7 @@ def _top_k_sample(logits, ignore_ids=None, num_samples=1, k=10): # TODO FIGURE OUT HOW TO DO THIS ON TPUS. IT'S HELLA SLOW RIGHT NOW, DUE TO ARGSORT I THINK """ - with tf.variable_scope('top_p_sample'): + with tf.compat.v1.variable_scope('top_p_sample'): batch_size, vocab_size = get_shape_list(logits, expected_rank=2) probs = tf.nn.softmax(logits if ignore_ids is None else logits - tf.cast(ignore_ids[None], tf.float32) * 1e10, @@ -422,9 +422,9 @@ def _top_k_sample(logits, ignore_ids=None, num_samples=1, k=10): exclude_mask = tf.range(vocab_size)[None] >= k_expanded # OPTION A - sample in the sorted space, then unsort. 
- logits_to_use = tf.batch_gather(logits, indices) - tf.cast(exclude_mask, tf.float32) * 1e10 + logits_to_use = tf.compat.v1.batch_gather(logits, indices) - tf.cast(exclude_mask, tf.float32) * 1e10 sample_perm = tf.random.categorical(logits=logits_to_use, num_samples=num_samples) - sample = tf.batch_gather(indices, sample_perm) + sample = tf.compat.v1.batch_gather(indices, sample_perm) return { 'probs': probs, @@ -487,8 +487,8 @@ def __init__(self, assert features_ == (config.hidden_size // config.num_attention_heads) caches = tf.unstack(cache, axis=1) - with tf.variable_scope(scope, default_name='newslm', reuse=reuse): - with tf.variable_scope("embeddings"): + with tf.compat.v1.variable_scope(scope, default_name='newslm', reuse=reuse): + with tf.compat.v1.variable_scope("embeddings"): embeddings, self.embedding_table = embed(self.input_ids, config.vocab_size, config.hidden_size, position_offset=self.cache_length, @@ -505,7 +505,7 @@ def __init__(self, hidden_state = tf.reshape(embeddings, [self.batch_size * self.seq_length, self.config.hidden_size]) new_kvs = [] for layer_idx, layer_cache in enumerate(caches): - with tf.variable_scope('layer{:02d}'.format(layer_idx)): + with tf.compat.v1.variable_scope('layer{:02d}'.format(layer_idx)): # [batch_size * seq_length, hidden_size] attention_output, new_kv = attention_layer( hidden_state, @@ -531,7 +531,7 @@ def __init__(self, self.new_kvs = tf.stack(new_kvs, axis=1) if do_cache else None # Note that the hidden state is still flat (batch_size*hidden_size) - self.logits_flat = tf.matmul(self.hidden_state, self.embedding_table, transpose_b=True) + self.logits_flat = tf.linalg.matmul(self.hidden_state, self.embedding_table, transpose_b=True) # THE OUTPUT BIAS DOES NOT SPARK JOY # output_bias = tf.get_variable('output_bias', shape=[config.vocab_size], initializer=tf.zeros_initializer()) @@ -549,7 +549,7 @@ def lm_loss(self): target_ids_flat = tf.reshape(self.target_ids, [-1]) # 1 if it's valid and 0 otherwise. 
- label_weights = tf.cast(tf.not_equal(target_ids_flat, self.pad_token_id), dtype=self.logits_flat.dtype) + label_weights = tf.cast(tf.math.not_equal(target_ids_flat, self.pad_token_id), dtype=self.logits_flat.dtype) # [batch_size * seq_length, vocab_size] one_hot_labels = tf.one_hot(target_ids_flat, @@ -559,12 +559,12 @@ def lm_loss(self): # [batch_size * seq_length, vocab_size] logprobs_flat = tf.nn.log_softmax(self.logits_flat, axis=-1) - per_example_loss = -tf.reduce_sum(logprobs_flat * one_hot_labels, axis=[-1]) + per_example_loss = -tf.math.reduce_sum(logprobs_flat * one_hot_labels, axis=[-1]) # per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits_flat, labels=target_ids_flat) - numerator = tf.reduce_sum(label_weights * per_example_loss) - denominator = tf.reduce_sum(label_weights) + 1e-5 + numerator = tf.math.reduce_sum(label_weights * per_example_loss) + denominator = tf.math.reduce_sum(label_weights) + 1e-5 loss = numerator / denominator return loss @@ -574,7 +574,7 @@ def pooled_output(self, clf_token): :param clf_token: :return: """ - pool_idx = tf.cast(tf.argmax(tf.cast(tf.equal(self.input_ids, clf_token), tf.float32), 1), tf.int32) + pool_idx = tf.cast(tf.math.argmax(tf.cast(tf.math.equal(self.input_ids, clf_token), tf.float32), 1), tf.int32) return tf.gather(self.hidden_state, tf.range(self.batch_size, dtype=tf.int32) * self.seq_length + pool_idx) @@ -585,12 +585,12 @@ def model_fn_builder(config: GroverConfig, init_checkpoint, learning_rate, def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" - tf.logging.info("*** Features ***") + tf.compat.v1.logging.info("*** Features ***") for name in sorted(features.keys()): - tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + tf.compat.v1.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) input_ids = features["input_ids"] - + # this is not found in updates is_training = 
(mode == tf.estimator.ModeKeys.TRAIN) model = GroverModel( @@ -606,11 +606,11 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument if is_training: train_op, train_metrics = optimization_adafactor.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) - tvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) + tvars = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES) else: train_op = None train_metrics = {} - tvars = tf.trainable_variables() + tvars = tf.compat.v1.trainable_variables() initialized_variable_names = {} scaffold_fn = None @@ -619,22 +619,23 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint) if use_tpu: def tpu_scaffold(): - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - return tf.train.Scaffold() + tf.compat.v1.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.compat.v1.train.Scaffold() scaffold_fn = tpu_scaffold else: - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + tf.compat.v1.train.init_from_checkpoint(init_checkpoint, assignment_map) - tf.logging.info("**** Trainable Variables ****") + tf.compat.v1.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + tf.compat.v1.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) output_spec = None + # need to find it no updates found if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: output_spec = tf.contrib.tpu.TPUEstimatorSpec( @@ -650,12 +651,12 @@ def tpu_scaffold(): loss=total_loss, train_op=train_op, training_hooks=[ - tf.train.LoggingTensorHook({'loss': tf.metrics.mean(total_loss)[1]}, every_n_iter=100)], + tf.compat.v1.train.LoggingTensorHook({'loss': 
tf.compat.v1.metrics.mean(total_loss)[1]}, every_n_iter=100)], scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(total_loss): - loss = tf.metrics.mean(values=total_loss) + loss = tf.compat.v1.metrics.mean(values=total_loss) return { "eval_loss": loss, } @@ -668,11 +669,11 @@ def metric_fn(total_loss): eval_metrics=eval_metrics, scaffold_fn=scaffold_fn) else: - gt_logprobs = tf.squeeze(tf.batch_gather(model.log_probs, model.target_ids[:, :, None]), axis=2) + gt_logprobs = tf.compat.v1.squeeze(tf.compat.v1.batch_gather(model.log_probs, model.target_ids[:, :, None]), axis=2) # Need top-p required under topp sampling! better_than_gt = model.log_probs > gt_logprobs[:, :, None] - top_p_required = tf.reduce_sum(tf.cast(better_than_gt, tf.float32) * tf.exp(model.log_probs), axis=2) + top_p_required = tf.math.reduce_sum(tf.cast(better_than_gt, tf.float32) * tf.exp(model.log_probs), axis=2) # No top-p sampling for now, since this seems to be too slow on TPUs if use_tpu: @@ -687,7 +688,7 @@ def metric_fn(total_loss): _top_p_sample(model.logits_flat, num_samples=1, p=0.99)['sample'], get_shape_list(model.target_ids), ) - pred_logprobs = tf.squeeze(tf.batch_gather(model.log_probs, predictions[:, :, None]), axis=2) + pred_logprobs = tf.compat.v1.squeeze(tf.compat.v1.batch_gather(model.log_probs, predictions[:, :, None]), axis=2) output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, @@ -738,8 +739,8 @@ def sample_step(tokens, ignore_ids, news_config, batch_size=1, p_for_topp=0.95, else: sample_info = _top_p_sample(next_logits, ignore_ids=ignore_ids, num_samples=1, p=p_for_topp) - new_tokens = tf.squeeze(sample_info['sample'], 1) - new_probs = tf.squeeze(tf.batch_gather(sample_info['probs'], sample_info['sample']), 1) + new_tokens = tf.compat.v1.squeeze(sample_info['sample'], 1) + new_probs = tf.compat.v1.squeeze(tf.compat.v1.batch_gather(sample_info['probs'], sample_info['sample']), 1) return { 'new_tokens': new_tokens, 'new_probs': new_probs, @@ 
-775,7 +776,7 @@ def sample(news_config: GroverConfig, initial_context, eos_token, ignore_ids=Non if ignore_ids is None: ignore_ids = tf.constant([x == 0 for x in range(news_config.vocab_size)], dtype=tf.bool) - with tf.name_scope('sample_sequence'): + with tf.compat.v1.name_scope('sample_sequence'): # Initial call to get cache context_output = initialize_from_context(initial_context, ignore_ids=ignore_ids, news_config=news_config, p_for_topp=p_for_topp, @@ -797,10 +798,10 @@ def body(ctx, cache, probs): return [new_ids, new_cache, new_probs] def cond(ctx, cache, probs): - is_eos = tf.equal(ctx, eos_token) - return tf.math.logical_not(tf.reduce_all(tf.reduce_any(is_eos, axis=1))) + is_eos = tf.math.equal(ctx, eos_token) + return tf.math.logical_not(tf.math.reduce_all(tf.math.reduce_any(is_eos, axis=1))) - tokens, cache, probs = tf.while_loop( + tokens, cache, probs = tf.compat.v1.while_loop( cond=cond, body=body, maximum_iterations=1025 - get_shape_list(ctx)[1], loop_vars=[ctx, cache, probs], shape_invariants=[tf.TensorShape([batch_size, None]), @@ -823,9 +824,9 @@ def classification_model_fn_builder(config: GroverConfig, init_checkpoint, learn def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" - tf.logging.info("*** Features ***") + tf.compat.v1.logging.info("*** Features ***") for name in sorted(features.keys()): - tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + tf.compat.v1.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) input_ids = features["input_ids"] label_ids = features["label_ids"] @@ -833,7 +834,7 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) else: is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) - + # need to find this no update found is_training = (mode == tf.estimator.ModeKeys.TRAIN) # Create model with aux loss 
@@ -845,11 +846,11 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument chop_off_last_token=False, ) - with tf.variable_scope('classification'): + with tf.compat.v1.variable_scope('classification'): hidden_state = model.pooled_output(pool_token_id) if is_training: hidden_state = dropout(hidden_state, dropout_prob=0.1) - logits = tf.layers.dense( + logits = tf.compat.v1.layers.dense( hidden_state, num_labels, kernel_initializer=create_initializer(config.initializer_range), @@ -857,8 +858,8 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument ) log_probs = tf.nn.log_softmax(logits, axis=-1) one_hot_labels = tf.one_hot(label_ids, depth=num_labels, dtype=tf.float32) - per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) - class_loss = tf.reduce_mean(per_example_loss) + per_example_loss = -tf.math.reduce_sum(one_hot_labels * log_probs, axis=-1) + class_loss = tf.math.reduce_mean(per_example_loss) total_loss = lm_loss_coef * model.lm_loss() + class_loss @@ -866,16 +867,16 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument train_op, train_metrics = optimization_adafactor.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) # tvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - tvars = tf.trainable_variables() + tvars = tf.compat.v1.trainable_variables() train_metrics['minibatch_cls_loss'] = class_loss - train_metrics['minibatch_acc'] = tf.reduce_mean( - tf.cast(tf.equal(tf.argmax(logits, axis=-1, output_type=tf.int32), + train_metrics['minibatch_acc'] = tf.math.reduce_mean( + tf.cast(tf.math.equal(tf.math.argmax(logits, axis=-1, output_type=tf.int32), label_ids), tf.float32)) else: train_op = None train_metrics = {} - tvars = tf.trainable_variables() + tvars = tf.compat.v1.trainable_variables() initialized_variable_names = {} scaffold_fn = None @@ -884,22 +885,23 @@ def model_fn(features, labels, mode, params): # pylint: 
disable=unused-argument ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint) if use_tpu: def tpu_scaffold(): - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - return tf.train.Scaffold() + tf.compat.v1.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.compat.v1.train.Scaffold() scaffold_fn = tpu_scaffold else: - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + tf.compat.v1.train.init_from_checkpoint(init_checkpoint, assignment_map) - tf.logging.info("**** Trainable Variables ****") + tf.compat.v1.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + tf.compat.v1.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) output_spec = None + # need to find this no update found if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: output_spec = tf.contrib.tpu.TPUEstimatorSpec( @@ -915,15 +917,15 @@ def tpu_scaffold(): loss=total_loss, train_op=train_op, training_hooks=[ - tf.train.LoggingTensorHook({'loss': tf.metrics.mean(total_loss)[1]}, every_n_iter=100)], + tf.compat.v1.train.LoggingTensorHook({'loss': tf.compat.v1.metrics.mean(total_loss)[1]}, every_n_iter=100)], scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(per_example_loss, label_ids, logits, is_real_example): - predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) - accuracy = tf.metrics.accuracy( + predictions = tf.math.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.compat.v1.metrics.accuracy( labels=label_ids, predictions=predictions, weights=is_real_example) - loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) + loss = tf.compat.v1.metrics.mean(values=per_example_loss, weights=is_real_example) return { "eval_accuracy": accuracy, "eval_loss": loss, From 
5c4e16a43b48ea58f1e7144c89e109e9e99c2abd Mon Sep 17 00:00:00 2001 From: EHTIISHAM Date: Tue, 26 Nov 2024 10:21:56 +0500 Subject: [PATCH 2/5] except estimators all changes are done --- lm/modeling.py | 23 +++++++++------- lm/optimization_adafactor.py | 49 ++++++++++++++++++----------------- lm/train.py | 25 +++++++++--------- lm/utils.py | 35 +++++++++++++------------ lm/validate.py | 19 +++++++------- realnews/prepare_lm_data.py | 6 ++--- sample/contextual_generate.py | 15 ++++++----- 7 files changed, 90 insertions(+), 82 deletions(-) diff --git a/lm/modeling.py b/lm/modeling.py index c6b61303..9f2312d6 100644 --- a/lm/modeling.py +++ b/lm/modeling.py @@ -19,6 +19,7 @@ import six import tensorflow as tf +import tensorflow.compat.v1 as tf1 from lm import optimization_adafactor from lm.utils import get_assignment_map_from_checkpoint, get_shape_list, get_attention_mask, gelu, layer_norm, dropout, \ @@ -638,7 +639,7 @@ def tpu_scaffold(): # need to find it no updates found if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: - output_spec = tf.contrib.tpu.TPUEstimatorSpec( + output_spec = tf1.estimator.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, @@ -646,7 +647,7 @@ def tpu_scaffold(): prefix='training/'), scaffold_fn=scaffold_fn) else: - output_spec = tf.contrib.tpu.TPUEstimatorSpec( + output_spec = tf1.estimator.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, @@ -663,7 +664,7 @@ def metric_fn(total_loss): eval_metrics = (metric_fn, [total_loss]) - output_spec = tf.contrib.tpu.TPUEstimatorSpec( + output_spec = tf1.estimator.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, eval_metrics=eval_metrics, @@ -690,7 +691,7 @@ def metric_fn(total_loss): ) pred_logprobs = tf.compat.v1.squeeze(tf.compat.v1.batch_gather(model.log_probs, predictions[:, :, None]), axis=2) - output_spec = tf.contrib.tpu.TPUEstimatorSpec( + output_spec = tf1.estimator.tpu.TPUEstimatorSpec( mode=mode, predictions={'gt_logprobs': gt_logprobs, 
'top_p_required': top_p_required, @@ -904,7 +905,8 @@ def tpu_scaffold(): # need to find this no update found if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: - output_spec = tf.contrib.tpu.TPUEstimatorSpec( + # from here + output_spec = tf1.estimator.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, @@ -912,14 +914,14 @@ def tpu_scaffold(): prefix='training/'), scaffold_fn=scaffold_fn) else: - output_spec = tf.contrib.tpu.TPUEstimatorSpec( + output_spec = tf1.estimator.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, training_hooks=[ tf.compat.v1.train.LoggingTensorHook({'loss': tf.compat.v1.metrics.mean(total_loss)[1]}, every_n_iter=100)], scaffold_fn=scaffold_fn) - + # to here elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(per_example_loss, label_ids, logits, is_real_example): predictions = tf.math.argmax(logits, axis=-1, output_type=tf.int32) @@ -930,16 +932,16 @@ def metric_fn(per_example_loss, label_ids, logits, is_real_example): "eval_accuracy": accuracy, "eval_loss": loss, } - + # from here eval_metrics = (metric_fn, [per_example_loss, label_ids, logits, is_real_example]) - output_spec = tf.contrib.tpu.TPUEstimatorSpec( + output_spec = tf1.estimator.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, eval_metrics=eval_metrics, scaffold_fn=scaffold_fn) else: - output_spec = tf.contrib.tpu.TPUEstimatorSpec( + output_spec = tf1.estimator.tpu.TPUEstimatorSpec( mode=mode, predictions={'logits': logits, 'probs': tf.nn.softmax(logits, axis=-1)}, @@ -947,3 +949,4 @@ def metric_fn(per_example_loss, label_ids, logits, is_real_example): return output_spec return model_fn + # to here \ No newline at end of file diff --git a/lm/optimization_adafactor.py b/lm/optimization_adafactor.py index b8d03ed1..400d56c7 100644 --- a/lm/optimization_adafactor.py +++ b/lm/optimization_adafactor.py @@ -14,17 +14,18 @@ # limitations under the License. 
import re import tensorflow as tf +import tensorflow.compat.v1 as tf1 from lm.utils import get_shape_list def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): """Creates an optimizer training op.""" - global_step = tf.train.get_or_create_global_step() + global_step = tf1.train.get_or_create_global_step() - learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) + learning_rate = tf1.constant(value=init_lr, shape=[], dtype=tf.float32) # Implements linear decay of the learning rate. - learning_rate = tf.train.polynomial_decay( + learning_rate = tf1.train.polynomial_decay( learning_rate, global_step, num_train_steps, @@ -36,7 +37,7 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): # learning rate will be `global_step/num_warmup_steps * init_lr`. if num_warmup_steps: global_steps_int = tf.cast(global_step, tf.int32) - warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) + warmup_steps_int = tf1.constant(num_warmup_steps, dtype=tf.int32) global_steps_float = tf.cast(global_steps_int, tf.float32) warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) @@ -60,10 +61,10 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) if use_tpu: - optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) + optimizer = tf1.tpu.CrossShardOptimizer(optimizer) - tvars = tf.trainable_variables() - grads = tf.gradients(loss, tvars) + tvars = tf1.trainable_variables() + grads = tf1.gradients(loss, tvars) # You could do this, but instead we don't because a) it's slow and b) we already did the 'update clipping' # (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) @@ -125,7 +126,7 @@ def _parameter_scale(self, var): Returns: a Scalar """ - return tf.maximum(reduce_rms(var), self.epsilon2) + return tf.math.maximum(reduce_rms(var), self.epsilon2) def apply_gradients(self, grads_and_vars, global_step=None, 
name=None): """See base class.""" @@ -139,7 +140,7 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None): # decay_rate = 1 - tf.pow(tf.cast(tf.train.get_or_create_global_step(), tf.float32) + 1.0, -0.8) decay_rate = self.beta_2 - grad_squared = tf.square(grad) + self.epsilon1 + grad_squared = tf.math.square(grad) + self.epsilon1 update_scale = self.learning_rate # update_scale = self.learning_rate * tf.cast(self._parameter_scale(param), dtype=tf.float32) @@ -148,7 +149,7 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None): # This confounds the XLA rewriter and keeps it from fusing computations # across different variables. This fusion is a bad for HBM usage, since # it causes the gradients to persist in memory. - grad_squared_mean = tf.reduce_mean(grad_squared) + grad_squared_mean = tf.math.reduce_mean(grad_squared) decay_rate += grad_squared_mean * 1e-30 update_scale += grad_squared_mean * 1e-30 @@ -157,42 +158,42 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None): if self._use_factored(shape_list): num_rows, num_columns = shape_list - vr = tf.get_variable( + vr = tf1.get_variable( name=param_name + "/adafactor_vr", shape=[num_rows], dtype=tf.float32, trainable=False, - initializer=tf.zeros_initializer()) - vc = tf.get_variable( + initializer=tf1.zeros_initializer()) + vc = tf1.get_variable( name=param_name + "/adafactor_vc", shape=[num_columns], dtype=tf.float32, trainable=False, - initializer=tf.zeros_initializer()) + initializer=tf1.zeros_initializer()) - next_vr = decay_rate * vr + (1 - decay_rate) * tf.reduce_mean(grad_squared, 1) - next_vc = decay_rate * vc + (1 - decay_rate) * tf.reduce_mean(grad_squared, 0) + next_vr = decay_rate * vr + (1 - decay_rate) * tf.math.reduce_mean(grad_squared, 1) + next_vc = decay_rate * vc + (1 - decay_rate) * tf.math.reduce_mean(grad_squared, 0) - long_term_mean = tf.reduce_mean(next_vr, -1, keepdims=True) - r_factor = tf.rsqrt(next_vr / long_term_mean + 
self.epsilon1) - c_factor = tf.rsqrt(next_vc + self.epsilon1) + long_term_mean = tf.math.reduce_mean(next_vr, -1, keepdims=True) + r_factor = tf.math.rsqrt(next_vr / long_term_mean + self.epsilon1) + c_factor = tf.math.rsqrt(next_vc + self.epsilon1) update = grad * tf.expand_dims(r_factor, -1) * tf.expand_dims(c_factor, -2) assignments.append(vr.assign(next_vr, use_locking=self.use_locking)) assignments.append(vc.assign(next_vc, use_locking=self.use_locking)) else: - v = tf.get_variable( + v = tf1.get_variable( name=param_name + "/adafactor_v", shape=shape_list, dtype=tf.float32, trainable=False, - initializer=tf.zeros_initializer()) + initializer=tf1.zeros_initializer()) next_v = decay_rate * v + (1 - decay_rate) * grad_squared assignments.append(v.assign(next_v, use_locking=self.use_locking)) - update = grad * tf.rsqrt(next_v + self.epsilon1) + update = grad * tf.math.rsqrt(next_v + self.epsilon1) - clipping_denom = tf.maximum(1.0, reduce_rms(update) / self.clipping_rate) + clipping_denom = tf.math.maximum(1.0, reduce_rms(update) / self.clipping_rate) update /= clipping_denom # Do weight decay @@ -231,4 +232,4 @@ def _get_variable_name(self, param_name): def reduce_rms(x): - return tf.sqrt(tf.reduce_mean(tf.square(x))) + return tf.math.sqrt(tf.math.reduce_mean(tf.math.square(x))) diff --git a/lm/train.py b/lm/train.py index fbf9d3ad..77e0796a 100644 --- a/lm/train.py +++ b/lm/train.py @@ -16,11 +16,12 @@ """ Training script! 
""" import tensorflow as tf +import tensorflow.compat.v1 as tf1 from lm.dataloader import input_fn_builder from lm.modeling import model_fn_builder, GroverConfig -flags = tf.flags +flags = tf1.flags FLAGS = flags.FLAGS @@ -93,25 +94,25 @@ def main(_): - tf.logging.set_verbosity(tf.logging.INFO) + tf1.logging.set_verbosity(tf1.logging.INFO) news_config = GroverConfig.from_json_file(FLAGS.config_file) - tf.gfile.MakeDirs(FLAGS.output_dir) + tf1.gfile.MakeDirs(FLAGS.output_dir) input_files = [] for input_pattern in FLAGS.input_file.split(","): - input_files.extend(tf.gfile.Glob(input_pattern)) + input_files.extend(tf1.gfile.Glob(input_pattern)) - tf.logging.info("*** Input Files ***") + tf1.logging.info("*** Input Files ***") for input_file in input_files: - tf.logging.info(" %s" % input_file) + tf1.logging.info(" %s" % input_file) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: - tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) - + # from here is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, @@ -141,9 +142,9 @@ def main(_): eval_batch_size=FLAGS.train_batch_size, params={'model_dir': FLAGS.output_dir} ) - - tf.logging.info("***** Running training *****") - tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + # to here + tf1.logging.info("***** Running training *****") + tf1.logging.info(" Batch size = %d", FLAGS.train_batch_size) train_input_fn = input_fn_builder( input_files=input_files, seq_length=FLAGS.max_seq_length, @@ -154,4 +155,4 @@ def main(_): if __name__ == "__main__": flags.mark_flag_as_required("input_file") flags.mark_flag_as_required("output_dir") - tf.app.run() + tf1.app.run() diff --git a/lm/utils.py b/lm/utils.py index aa75c71b..470ee50a 100644 --- a/lm/utils.py +++ b/lm/utils.py @@ 
-18,6 +18,7 @@ import six import tensorflow as tf +import tensorflow.compat.v1 as tf1 import numpy as np from tensorflow.python.lib.io import file_io @@ -53,7 +54,7 @@ def assert_rank(tensor, expected_rank, name=None): actual_rank = tensor.shape.ndims if actual_rank not in expected_rank_dict: - scope_name = tf.get_variable_scope().name + scope_name = tf1.get_variable_scope().name raise ValueError( "For the tensor `%s` in scope `%s`, the actual rank " "`%d` (shape = %s) is not equal to the expected rank `%s`" % @@ -91,7 +92,7 @@ def get_shape_list(tensor, expected_rank=None, name=None): if not non_static_indexes: return shape - dyn_shape = tf.shape(tensor) + dyn_shape = tf1.shape(tensor) for index in non_static_indexes: shape[index] = dyn_shape[index] return shape @@ -109,20 +110,20 @@ def gelu(input_tensor): Returns: `input_tensor` with the GELU activation applied. """ - cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0))) + cdf = 0.5 * (1.0 + tf.math.erf(input_tensor / tf.math.sqrt(2.0))) return input_tensor * cdf def layer_norm(input_tensor, name=None, epsilon=1e-5): """Run layer normalization on the last dimension of the tensor.""" name2use = f'LayerNorm_{name}' if name is not None else name - with tf.variable_scope(name2use, default_name='LayerNorm'): + with tf1.variable_scope(name2use, default_name='LayerNorm'): dim = input_tensor.shape[-1].value - gamma = tf.get_variable('gamma', [dim], initializer=tf.constant_initializer(1)) - beta = tf.get_variable('beta', [dim], initializer=tf.constant_initializer(0)) - mean = tf.reduce_mean(input_tensor, axis=-1, keepdims=True) - std = tf.reduce_mean(tf.square(input_tensor - mean), axis=-1, keepdims=True) - input_tensor = (input_tensor - mean) * tf.rsqrt(std + epsilon) + gamma = tf1.get_variable('gamma', [dim], initializer=tf1.constant_initializer(1)) + beta = tf1.get_variable('beta', [dim], initializer=tf1.constant_initializer(0)) + mean = tf.math.reduce_mean(input_tensor, axis=-1, keepdims=True) + std = 
tf.math.reduce_mean(tf.math.square(input_tensor - mean), axis=-1, keepdims=True) + input_tensor = (input_tensor - mean) * tf.math.rsqrt(std + epsilon) input_tensor = input_tensor * gamma + beta return input_tensor @@ -149,8 +150,8 @@ def get_attention_mask(nd, ns, *, dtype): this is a TPU compatible version of tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd) where the lower right triangle contains 1s """ - i = tf.range(nd)[:, None] - j = tf.range(ns) + i = tf.ragged.range(nd)[:, None] + j = tf.ragged.range(ns) m = i >= j - ns + nd return tf.cast(m, dtype) @@ -214,21 +215,21 @@ def host_call_fn(global_step, *args): List of summary ops to run on the CPU host. """ step = global_step[0] - with tf.contrib.summary.create_file_writer( + with tf.summary.create_file_writer( logdir=model_dir, filename_suffix=".host_call").as_default(): - with tf.contrib.summary.always_record_summaries(): + with tf.summary.should_record_summaries(): for i, name in enumerate(metric_names): - tf.contrib.summary.scalar(prefix + name, args[i][0], step=step) + tf1.summary.scalar(prefix + name, args[i][0], step=step) - return tf.contrib.summary.all_summary_ops() + return tf1.summary.all_v2_summary_ops() # To log the current learning rate, and gradient norm for Tensorboard, the # summary op needs to be run on the host CPU via host_call. host_call # expects [batch_size, ...] Tensors, thus reshape to introduce a batch # dimension. These Tensors are implicitly concatenated to # [params['batch_size']]. 
- global_step_tensor = tf.reshape( + global_step_tensor = tf1.manip.reshape( tf.compat.v1.train.get_or_create_global_step(), [1]) - other_tensors = [tf.reshape(metric_dict[key], [1]) for key in metric_names] + other_tensors = [tf.manip.reshape(metric_dict[key], [1]) for key in metric_names] return host_call_fn, [global_step_tensor] + other_tensors diff --git a/lm/validate.py b/lm/validate.py index 0ac1668c..49694768 100644 --- a/lm/validate.py +++ b/lm/validate.py @@ -16,13 +16,14 @@ import os from lm.modeling import model_fn_builder, GroverConfig import tensorflow as tf +import tensorflow.compat.v1 as tf1 from lm.dataloader import input_fn_builder import numpy as np import tempfile import h5py from google.cloud import storage -flags = tf.flags +flags = tf1.flags FLAGS = flags.FLAGS @@ -126,20 +127,20 @@ def ind_where(array: np.ndarray, target, return_first_match=True, default_value= def main(_): - tf.logging.set_verbosity(tf.logging.INFO) + tf1.logging.set_verbosity(tf1.logging.INFO) news_config = GroverConfig.from_json_file(FLAGS.config_file) - tf.gfile.MakeDirs(FLAGS.output_dir) + tf1.gfile.MakeDirs(FLAGS.output_dir) input_files = [] for input_pattern in FLAGS.input_file.split(","): - input_files.extend(tf.gfile.Glob(input_pattern)) + input_files.extend(tf1.gfile.Glob(input_pattern)) - tf.logging.info("*** Input Files ***") + tf1.logging.info("*** Input Files ***") for input_file in input_files: - tf.logging.info(" %s" % input_file) - + tf1.logging.info(" %s" % input_file) + # from here tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( @@ -176,7 +177,7 @@ def main(_): predict_batch_size=FLAGS.batch_size, params={'model_dir': FLAGS.output_dir} ) - + # to here eval_input_fn = input_fn_builder( input_files=input_files, seq_length=FLAGS.max_seq_length, @@ -211,4 +212,4 @@ def main(_): if __name__ == "__main__": flags.mark_flag_as_required("input_file") 
flags.mark_flag_as_required("output_dir") - tf.app.run() + tf1.app.run() diff --git a/realnews/prepare_lm_data.py b/realnews/prepare_lm_data.py index 33fc6433..5e4365e4 100644 --- a/realnews/prepare_lm_data.py +++ b/realnews/prepare_lm_data.py @@ -82,14 +82,14 @@ def __init__(self, fn): self.s3client = boto3.client('s3', ) self.storage_dir = TemporaryDirectory() - self.writer = tf.python_io.TFRecordWriter(os.path.join(self.storage_dir.name, 'temp.tfrecord')) + self.writer = tf.io.TFRecordWriter(os.path.join(self.storage_dir.name, 'temp.tfrecord')) self.bucket_name, self.file_name = self.fn.split('s3://', 1)[1].split('/', 1) elif fn.startswith('gs://'): from google.cloud import storage self.s3client = None self.gclient = storage.Client() self.storage_dir = TemporaryDirectory() - self.writer = tf.python_io.TFRecordWriter(os.path.join(self.storage_dir.name, 'temp.tfrecord')) + self.writer = tf.io.TFRecordWriter(os.path.join(self.storage_dir.name, 'temp.tfrecord')) self.bucket_name, self.file_name = self.fn.split('gs://', 1)[1].split('/', 1) else: @@ -98,7 +98,7 @@ def __init__(self, fn): self.bucket_name = None self.file_name = None self.storage_dir = None - self.writer = tf.python_io.TFRecordWriter(fn) + self.writer = tf.io.TFRecordWriter(fn) def write(self, x): self.writer.write(x) diff --git a/sample/contextual_generate.py b/sample/contextual_generate.py index 85d55235..873f4b44 100644 --- a/sample/contextual_generate.py +++ b/sample/contextual_generate.py @@ -1,4 +1,5 @@ import tensorflow as tf +import tensorflow.compat.v1 as tf1 import numpy as np import sys import json @@ -101,19 +102,19 @@ with open(args.metadata_fn, 'r') as f: articles = [json.loads(l) for i, l in enumerate(f) if i % args.num_folds == args.fold] -tf_config = tf.ConfigProto(allow_soft_placement=True) +tf_config = tf1.ConfigProto(allow_soft_placement=True) -with tf.Session(config=tf_config, graph=tf.Graph()) as sess, \ +with tf1.Session(config=tf_config, graph=tf.Graph()) as sess, \ 
open(args.out_fn, 'w') as f_out: - initial_context = tf.placeholder(tf.int32, [batch_size_per_chunk, None]) - p_for_topp = tf.placeholder(tf.float32, [batch_size_per_chunk]) - eos_token = tf.placeholder(tf.int32, []) - ignore_ids = tf.placeholder(tf.bool, [news_config.vocab_size]) + initial_context = tf1.placeholder(tf.int32, [batch_size_per_chunk, None]) + p_for_topp = tf1.placeholder(tf.float32, [batch_size_per_chunk]) + eos_token = tf1.placeholder(tf.int32, []) + ignore_ids = tf1.placeholder(tf.bool, [news_config.vocab_size]) tokens, probs = sample(news_config=news_config, initial_context=initial_context, eos_token=eos_token, ignore_ids=ignore_ids, p_for_topp=p_for_topp, do_topk=False) - saver = tf.train.Saver() + saver = tf1.train.Saver() saver.restore(sess, args.model_ckpt) # Let's go! From 9422065258184a88dd6be3360489efdf8f561132 Mon Sep 17 00:00:00 2001 From: EHTIISHAM <100058262+EHTIISHAM@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:13:08 +0500 Subject: [PATCH 3/5] Update requirements-gpu.txt --- requirements-gpu.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/requirements-gpu.txt b/requirements-gpu.txt index 216df38b..a69664c0 100644 --- a/requirements-gpu.txt +++ b/requirements-gpu.txt @@ -1,8 +1,8 @@ -pandas==0.24.2 -regex==2019.4.14 -h5py==2.9.0 -numpy==1.16.2 -tensorboard==1.13.1 -tensorflow-gpu==1.13.1 -tqdm==4.31.1 -requests==2.22.0 \ No newline at end of file +pandas +regex +h5py +numpy==1.26.3 +tensorboard +tensorflow-gpu==2.16.1 +tqdm +requests From 8e086a88e634b28118c751f647b1f13a25dd910f Mon Sep 17 00:00:00 2001 From: EHTIISHAM <100058262+EHTIISHAM@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:20:42 +0500 Subject: [PATCH 4/5] Update requirements-gpu.txt --- requirements-gpu.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-gpu.txt b/requirements-gpu.txt index a69664c0..4a10d921 100644 --- a/requirements-gpu.txt +++ b/requirements-gpu.txt @@ -3,6 +3,6 @@ regex 
h5py numpy==1.26.3 tensorboard -tensorflow-gpu==2.16.1 +tensorflow==2.16.1 tqdm requests From 39eca7440c0c085ffc8a03101e6aea0d0141f3e7 Mon Sep 17 00:00:00 2001 From: EHTIISHAM <100058262+EHTIISHAM@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:33:46 +0500 Subject: [PATCH 5/5] Update optimization_adafactor.py --- lm/optimization_adafactor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm/optimization_adafactor.py b/lm/optimization_adafactor.py index 400d56c7..ff1c0ed9 100644 --- a/lm/optimization_adafactor.py +++ b/lm/optimization_adafactor.py @@ -86,7 +86,7 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): return train_op, train_metrics -class AdaFactorOptimizer(tf.train.Optimizer): +class AdaFactorOptimizer(tf1.train.Optimizer): """here's the optimizer we'll use""" def __init__(self,