From 2adffda3a6795fc85ab65239a136a184cbae911e Mon Sep 17 00:00:00 2001 From: EHTIISHAM Date: Sun, 24 Nov 2024 20:30:57 +0500 Subject: [PATCH 1/5] some files are updated but no estimator update finding any solution for it --- discrimination/run_discrimination.py | 4 +- lm/dataloader.py | 15 +-- lm/modeling.py | 152 ++++++++++++++------------- 3 files changed, 87 insertions(+), 84 deletions(-) diff --git a/discrimination/run_discrimination.py b/discrimination/run_discrimination.py index 86b7116c..c04cee9f 100644 --- a/discrimination/run_discrimination.py +++ b/discrimination/run_discrimination.py @@ -21,14 +21,14 @@ import numpy as np import tensorflow as tf -from tensorflow.python.lib.io import file_io +#from tensorflow.python.lib.io import file_io from lm.dataloader import classification_convert_examples_to_features, classification_input_fn_builder from lm.modeling import classification_model_fn_builder, GroverConfig from lm.utils import _save_np from sample.encoder import get_encoder -flags = tf.flags +flags = tf.compat.v1.flags FLAGS = flags.FLAGS diff --git a/lm/dataloader.py b/lm/dataloader.py index 283cb85e..04741653 100644 --- a/lm/dataloader.py +++ b/lm/dataloader.py @@ -19,7 +19,7 @@ def _decode_record(record, name_to_features): """Decodes a record to a TensorFlow example.""" - example = tf.parse_single_example(record, name_to_features) + example = tf.compat.v1.parse_single_example(record, name_to_features) # tf.Example only supports tf.int64, but the TPU only supports tf.int32. # So cast all int64 to int32. @@ -42,7 +42,7 @@ def input_fn(params): """The actual input function.""" batch_size = params["batch_size"] name_to_features = { - "input_ids": tf.FixedLenFeature([seq_length + 1], tf.int64), + "input_ids": tf.io.FixedLenFeature([seq_length + 1], tf.int64), } # For training, we want a lot of parallel reading and shuffling. @@ -57,6 +57,7 @@ def input_fn(params): # `sloppy` mode means that the interleaving is not exact. 
This adds # even more randomness to the training pipeline. + # tf.data.experimental.parallel_interleave will be removed in future versions need to use interleave instead d = d.apply( tf.data.experimental.parallel_interleave( tf.data.TFRecordDataset, @@ -91,13 +92,13 @@ def classification_convert_examples_to_features( chop_from_front_if_needed=True): """Convert a set of `InputExample`s to a TFRecord file.""" - writer = tf.python_io.TFRecordWriter(output_file) + writer = tf.io.TFRecordWriter(output_file) label_map = {label: i for i, label in enumerate(labels)} for (ex_index, example) in enumerate(examples): if ex_index % 10000 == 0: - tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + tf.compat.v1.logging.info("Writing example %d of %d" % (ex_index, len(examples))) # begin_summary is our [CLS] token tokens = example['ids'] + [encoder.begin_summary] @@ -134,9 +135,9 @@ def classification_input_fn_builder(input_file, seq_length, is_training, """Creates an `input_fn` closure to be passed to TPUEstimator.""" name_to_features = { - "input_ids": tf.FixedLenFeature([seq_length], tf.int64), - "label_ids": tf.FixedLenFeature([], tf.int64), - "is_real_example": tf.FixedLenFeature([], tf.int64), + "input_ids": tf.io.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.io.FixedLenFeature([], tf.int64), + "is_real_example": tf.io.FixedLenFeature([], tf.int64), } def input_fn(params): diff --git a/lm/modeling.py b/lm/modeling.py index 7e9e5330..c6b61303 100644 --- a/lm/modeling.py +++ b/lm/modeling.py @@ -84,7 +84,7 @@ def from_dict(cls, json_object): @classmethod def from_json_file(cls, json_file): """Constructs a `NewsConfig` from a json file of parameters.""" - with tf.gfile.GFile(json_file, "r") as reader: + with tf.io.gfile.GFile(json_file, "r") as reader: text = reader.read() return cls.from_dict(json.loads(text)) @@ -114,7 +114,7 @@ def mask_attention_for_ltr(attention_scores, attention_mask): def create_initializer(initializer_range=0.02): 
"""Creates a `truncated_normal_initializer` with the given range.""" - return tf.truncated_normal_initializer(stddev=initializer_range) + return tf.compat.v1.truncated_normal_initializer(stddev=initializer_range) def _attention_projection_and_transpose(x_flat, batch_size, seq_length, num_attention_heads, size_per_head, @@ -136,7 +136,7 @@ def _attention_projection_and_transpose(x_flat, batch_size, seq_length, num_atte (batch_size_seq_length, dim), size_per_head, num_attention_heads )) - projected = tf.layers.dense( + projected = tf.keras.layers.Dense( x_flat, num_attention_heads * size_per_head, name=name, @@ -212,8 +212,8 @@ def attention_layer(x_flat, attention_mask, batch_size, seq_length, size_per_hea # Multiply [batch_size, num_attention_heads, seq_length, size_per_head] with # [batch_size, num_attention_heads, size_per_head, seq_length+cached_length] -> # [batch_size, num_attention_heads, seq_length, seq_length+cached_length] - attention_scores = tf.matmul(query, key, transpose_b=True) - attention_scores = tf.multiply(attention_scores, + attention_scores = tf.linalg.matmul(query, key, transpose_b=True) + attention_scores = tf.math.multiply(attention_scores, 1.0 / math.sqrt(float(size_per_head))) attention_scores = mask_attention_for_ltr(attention_scores, attention_mask) attention_probs = tf.nn.softmax(attention_scores) @@ -226,13 +226,13 @@ def attention_layer(x_flat, attention_mask, batch_size, seq_length, size_per_hea # Multiply [batch_size, num_attention_heads, seq_length, seq_length+cached_length] with # [batch_size, num_attention_heads, seq_length+cached_length, size_per_head] -> # [batch_size, num_attention_heads, seq_length, size_per_head] -> - context_layer = tf.matmul(attention_probs, value) + context_layer = tf.linalg.matmul(attention_probs, value) # `context_layer` = [batch_size, seq_length, num_attention_heads, size_per_head] context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) context_layer = tf.reshape(context_layer, [batch_size * 
seq_length, num_attention_heads * size_per_head]) - context_layer_projected = tf.layers.dense( + context_layer_projected = tf.compat.v1.layers.dense( context_layer, num_attention_heads * size_per_head, kernel_initializer=create_initializer(initializer_range), @@ -255,7 +255,7 @@ def residual_mlp_layer(x_flat, intermediate_size, initializer_range=0.02, hidden batch_size_seq_length, hidden_size = get_shape_list(x_flat, expected_rank=2) x_norm = layer_norm(x_flat, name='mlp_ln0') - intermediate_output = tf.layers.dense( + intermediate_output = tf.compat.v1.layers.dense( x_norm, intermediate_size, activation=gelu, @@ -263,7 +263,7 @@ def residual_mlp_layer(x_flat, intermediate_size, initializer_range=0.02, hidden name='intermediate', ) - output_for_residual = tf.layers.dense( + output_for_residual = tf.compat.v1.layers.dense( intermediate_output, hidden_size, name='output', @@ -293,27 +293,27 @@ def embed(input_ids, """ (batch_size, seq_length) = get_shape_list(input_ids, expected_rank=2) - embedding_table = tf.get_variable( + embedding_table = tf.compat.v1.get_variable( name='word_embed', shape=[vocab_size, embedding_size], initializer=create_initializer(initializer_range), ) - assert_op = tf.assert_less_equal(tf.reduce_max(input_ids), vocab_size - 1) + assert_op = tf.compat.v1.assert_less_equal(tf.compat.v1.reduce_max(input_ids), vocab_size - 1) with tf.control_dependencies([assert_op]): if use_one_hot_embeddings: flat_input_ids = tf.reshape(input_ids, [-1]) one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) - output_flat = tf.matmul(one_hot_input_ids, embedding_table) + output_flat = tf.linalg.matmul(one_hot_input_ids, embedding_table) else: output_flat = tf.nn.embedding_lookup(embedding_table, input_ids) embedded_input = tf.reshape(output_flat, [batch_size, seq_length, embedding_size]) - assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) + assert_op = tf.compat.v1.assert_less_equal(seq_length, max_position_embeddings) with 
tf.control_dependencies([assert_op]): - full_position_embeddings = tf.get_variable( + full_position_embeddings = tf.compat.v1.get_variable( name='pos_embed', shape=[max_position_embeddings, embedding_size], initializer=create_initializer(initializer_range), @@ -335,7 +335,7 @@ def embed(input_ids, one_hot_pos_ids = tf.one_hot(flat_pos_ids, depth=max_position_embeddings) # [seq_length, full_position_embeddings], [full_position_embeddings, dim] - seq_embeds = tf.matmul(one_hot_pos_ids, full_position_embeddings) + seq_embeds = tf.linalg.matmul(one_hot_pos_ids, full_position_embeddings) embedded_input += seq_embeds[None] # embedded_input += tf.slice(full_position_embeddings[position_offset:], [0, 0], [seq_length, -1])[None] @@ -354,10 +354,10 @@ def _top_p_sample(logits, ignore_ids=None, num_samples=1, p=0.9): # TODO FIGURE OUT HOW TO DO THIS ON TPUS. IT'S HELLA SLOW RIGHT NOW, DUE TO ARGSORT I THINK """ - with tf.variable_scope('top_p_sample'): + with tf.compat.v1.variable_scope('top_p_sample'): batch_size, vocab_size = get_shape_list(logits, expected_rank=2) - probs = tf.nn.softmax(logits if ignore_ids is None else logits - tf.cast(ignore_ids[None], tf.float32) * 1e10, + probs = tf.nn.softmax(logits if ignore_ids is None else logits - tf.cast(ignore_ids[None], dtype =tf.float32) * 1e10, axis=-1) if isinstance(p, float) and p > 0.999999: @@ -366,13 +366,13 @@ def _top_p_sample(logits, ignore_ids=None, num_samples=1, p=0.9): return { 'probs': probs, 'sample': tf.random.categorical( - logits=logits if ignore_ids is None else logits - tf.cast(ignore_ids[None], tf.float32) * 1e10, + logits=logits if ignore_ids is None else logits - tf.cast(ignore_ids[None], dtype =tf.float32) * 1e10, num_samples=num_samples, dtype=tf.int32), } # [batch_size, vocab_perm] indices = tf.argsort(probs, direction='DESCENDING') - cumulative_probabilities = tf.math.cumsum(tf.batch_gather(probs, indices), axis=-1, exclusive=False) + cumulative_probabilities = 
tf.math.cumsum(tf.compat.v1.batch_gather(probs, indices), axis=-1, exclusive=False) # find the top pth index to cut off. careful we don't want to cutoff everything! # result will be [batch_size, vocab_perm] @@ -381,13 +381,13 @@ def _top_p_sample(logits, ignore_ids=None, num_samples=1, p=0.9): tf.logical_or(cumulative_probabilities < p_expanded, tf.range(vocab_size)[None] < 1)) # OPTION A - sample in the sorted space, then unsort. - logits_to_use = tf.batch_gather(logits, indices) - tf.cast(exclude_mask, tf.float32) * 1e10 + logits_to_use = tf.compat.v1.batch_gather(logits, indices) - tf.cast(exclude_mask, tf.float32) * 1e10 sample_perm = tf.random.categorical(logits=logits_to_use, num_samples=num_samples) - sample = tf.batch_gather(indices, sample_perm) + sample = tf.compat.v1.batch_gather(indices, sample_perm) # OPTION B - unsort first - Indices need to go back to 0 -> N-1 -- then sample # unperm_indices = tf.argsort(indices, direction='ASCENDING') - # include_mask_unperm = tf.batch_gather(include_mask, unperm_indices) + # include_mask_unperm = tf.compat.v1.batch_gather(include_mask, unperm_indices) # logits_to_use = logits - (1 - tf.cast(include_mask_unperm, tf.float32)) * 1e10 # sample = tf.random.categorical(logits=logits_to_use, num_samples=num_samples, dtype=tf.int32) @@ -408,7 +408,7 @@ def _top_k_sample(logits, ignore_ids=None, num_samples=1, k=10): # TODO FIGURE OUT HOW TO DO THIS ON TPUS. IT'S HELLA SLOW RIGHT NOW, DUE TO ARGSORT I THINK """ - with tf.variable_scope('top_p_sample'): + with tf.compat.v1.variable_scope('top_p_sample'): batch_size, vocab_size = get_shape_list(logits, expected_rank=2) probs = tf.nn.softmax(logits if ignore_ids is None else logits - tf.cast(ignore_ids[None], tf.float32) * 1e10, @@ -422,9 +422,9 @@ def _top_k_sample(logits, ignore_ids=None, num_samples=1, k=10): exclude_mask = tf.range(vocab_size)[None] >= k_expanded # OPTION A - sample in the sorted space, then unsort. 
- logits_to_use = tf.batch_gather(logits, indices) - tf.cast(exclude_mask, tf.float32) * 1e10 + logits_to_use = tf.compat.v1.batch_gather(logits, indices) - tf.cast(exclude_mask, tf.float32) * 1e10 sample_perm = tf.random.categorical(logits=logits_to_use, num_samples=num_samples) - sample = tf.batch_gather(indices, sample_perm) + sample = tf.compat.v1.batch_gather(indices, sample_perm) return { 'probs': probs, @@ -487,8 +487,8 @@ def __init__(self, assert features_ == (config.hidden_size // config.num_attention_heads) caches = tf.unstack(cache, axis=1) - with tf.variable_scope(scope, default_name='newslm', reuse=reuse): - with tf.variable_scope("embeddings"): + with tf.compat.v1.variable_scope(scope, default_name='newslm', reuse=reuse): + with tf.compat.v1.variable_scope("embeddings"): embeddings, self.embedding_table = embed(self.input_ids, config.vocab_size, config.hidden_size, position_offset=self.cache_length, @@ -505,7 +505,7 @@ def __init__(self, hidden_state = tf.reshape(embeddings, [self.batch_size * self.seq_length, self.config.hidden_size]) new_kvs = [] for layer_idx, layer_cache in enumerate(caches): - with tf.variable_scope('layer{:02d}'.format(layer_idx)): + with tf.compat.v1.variable_scope('layer{:02d}'.format(layer_idx)): # [batch_size * seq_length, hidden_size] attention_output, new_kv = attention_layer( hidden_state, @@ -531,7 +531,7 @@ def __init__(self, self.new_kvs = tf.stack(new_kvs, axis=1) if do_cache else None # Note that the hidden state is still flat (batch_size*hidden_size) - self.logits_flat = tf.matmul(self.hidden_state, self.embedding_table, transpose_b=True) + self.logits_flat = tf.linalg.matmul(self.hidden_state, self.embedding_table, transpose_b=True) # THE OUTPUT BIAS DOES NOT SPARK JOY # output_bias = tf.get_variable('output_bias', shape=[config.vocab_size], initializer=tf.zeros_initializer()) @@ -549,7 +549,7 @@ def lm_loss(self): target_ids_flat = tf.reshape(self.target_ids, [-1]) # 1 if it's valid and 0 otherwise. 
- label_weights = tf.cast(tf.not_equal(target_ids_flat, self.pad_token_id), dtype=self.logits_flat.dtype) + label_weights = tf.cast(tf.math.not_equal(target_ids_flat, self.pad_token_id), dtype=self.logits_flat.dtype) # [batch_size * seq_length, vocab_size] one_hot_labels = tf.one_hot(target_ids_flat, @@ -559,12 +559,12 @@ def lm_loss(self): # [batch_size * seq_length, vocab_size] logprobs_flat = tf.nn.log_softmax(self.logits_flat, axis=-1) - per_example_loss = -tf.reduce_sum(logprobs_flat * one_hot_labels, axis=[-1]) + per_example_loss = -tf.math.reduce_sum(logprobs_flat * one_hot_labels, axis=[-1]) # per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits_flat, labels=target_ids_flat) - numerator = tf.reduce_sum(label_weights * per_example_loss) - denominator = tf.reduce_sum(label_weights) + 1e-5 + numerator = tf.math.reduce_sum(label_weights * per_example_loss) + denominator = tf.math.reduce_sum(label_weights) + 1e-5 loss = numerator / denominator return loss @@ -574,7 +574,7 @@ def pooled_output(self, clf_token): :param clf_token: :return: """ - pool_idx = tf.cast(tf.argmax(tf.cast(tf.equal(self.input_ids, clf_token), tf.float32), 1), tf.int32) + pool_idx = tf.cast(tf.math.argmax(tf.cast(tf.math.equal(self.input_ids, clf_token), tf.float32), 1), tf.int32) return tf.gather(self.hidden_state, tf.range(self.batch_size, dtype=tf.int32) * self.seq_length + pool_idx) @@ -585,12 +585,12 @@ def model_fn_builder(config: GroverConfig, init_checkpoint, learning_rate, def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" - tf.logging.info("*** Features ***") + tf.compat.v1.logging.info("*** Features ***") for name in sorted(features.keys()): - tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + tf.compat.v1.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) input_ids = features["input_ids"] - + # this is not found in updates is_training = 
(mode == tf.estimator.ModeKeys.TRAIN) model = GroverModel( @@ -606,11 +606,11 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument if is_training: train_op, train_metrics = optimization_adafactor.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) - tvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) + tvars = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES) else: train_op = None train_metrics = {} - tvars = tf.trainable_variables() + tvars = tf.compat.v1.trainable_variables() initialized_variable_names = {} scaffold_fn = None @@ -619,22 +619,23 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint) if use_tpu: def tpu_scaffold(): - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - return tf.train.Scaffold() + tf.compat.v1.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.compat.v1.train.Scaffold() scaffold_fn = tpu_scaffold else: - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + tf.compat.v1.train.init_from_checkpoint(init_checkpoint, assignment_map) - tf.logging.info("**** Trainable Variables ****") + tf.compat.v1.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + tf.compat.v1.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) output_spec = None + # need to find it no updates found if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: output_spec = tf.contrib.tpu.TPUEstimatorSpec( @@ -650,12 +651,12 @@ def tpu_scaffold(): loss=total_loss, train_op=train_op, training_hooks=[ - tf.train.LoggingTensorHook({'loss': tf.metrics.mean(total_loss)[1]}, every_n_iter=100)], + tf.compat.v1.train.LoggingTensorHook({'loss': 
tf.compat.v1.metrics.mean(total_loss)[1]}, every_n_iter=100)], scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(total_loss): - loss = tf.metrics.mean(values=total_loss) + loss = tf.compat.v1.metrics.mean(values=total_loss) return { "eval_loss": loss, } @@ -668,11 +669,11 @@ def metric_fn(total_loss): eval_metrics=eval_metrics, scaffold_fn=scaffold_fn) else: - gt_logprobs = tf.squeeze(tf.batch_gather(model.log_probs, model.target_ids[:, :, None]), axis=2) + gt_logprobs = tf.compat.v1.squeeze(tf.compat.v1.batch_gather(model.log_probs, model.target_ids[:, :, None]), axis=2) # Need top-p required under topp sampling! better_than_gt = model.log_probs > gt_logprobs[:, :, None] - top_p_required = tf.reduce_sum(tf.cast(better_than_gt, tf.float32) * tf.exp(model.log_probs), axis=2) + top_p_required = tf.math.reduce_sum(tf.cast(better_than_gt, tf.float32) * tf.exp(model.log_probs), axis=2) # No top-p sampling for now, since this seems to be too slow on TPUs if use_tpu: @@ -687,7 +688,7 @@ def metric_fn(total_loss): _top_p_sample(model.logits_flat, num_samples=1, p=0.99)['sample'], get_shape_list(model.target_ids), ) - pred_logprobs = tf.squeeze(tf.batch_gather(model.log_probs, predictions[:, :, None]), axis=2) + pred_logprobs = tf.compat.v1.squeeze(tf.compat.v1.batch_gather(model.log_probs, predictions[:, :, None]), axis=2) output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, @@ -738,8 +739,8 @@ def sample_step(tokens, ignore_ids, news_config, batch_size=1, p_for_topp=0.95, else: sample_info = _top_p_sample(next_logits, ignore_ids=ignore_ids, num_samples=1, p=p_for_topp) - new_tokens = tf.squeeze(sample_info['sample'], 1) - new_probs = tf.squeeze(tf.batch_gather(sample_info['probs'], sample_info['sample']), 1) + new_tokens = tf.compat.v1.squeeze(sample_info['sample'], 1) + new_probs = tf.compat.v1.squeeze(tf.compat.v1.batch_gather(sample_info['probs'], sample_info['sample']), 1) return { 'new_tokens': new_tokens, 'new_probs': new_probs, @@ 
-775,7 +776,7 @@ def sample(news_config: GroverConfig, initial_context, eos_token, ignore_ids=Non if ignore_ids is None: ignore_ids = tf.constant([x == 0 for x in range(news_config.vocab_size)], dtype=tf.bool) - with tf.name_scope('sample_sequence'): + with tf.compat.v1.name_scope('sample_sequence'): # Initial call to get cache context_output = initialize_from_context(initial_context, ignore_ids=ignore_ids, news_config=news_config, p_for_topp=p_for_topp, @@ -797,10 +798,10 @@ def body(ctx, cache, probs): return [new_ids, new_cache, new_probs] def cond(ctx, cache, probs): - is_eos = tf.equal(ctx, eos_token) - return tf.math.logical_not(tf.reduce_all(tf.reduce_any(is_eos, axis=1))) + is_eos = tf.math.equal(ctx, eos_token) + return tf.math.logical_not(tf.math.reduce_all(tf.math.reduce_any(is_eos, axis=1))) - tokens, cache, probs = tf.while_loop( + tokens, cache, probs = tf.compat.v1.while_loop( cond=cond, body=body, maximum_iterations=1025 - get_shape_list(ctx)[1], loop_vars=[ctx, cache, probs], shape_invariants=[tf.TensorShape([batch_size, None]), @@ -823,9 +824,9 @@ def classification_model_fn_builder(config: GroverConfig, init_checkpoint, learn def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" - tf.logging.info("*** Features ***") + tf.compat.v1.logging.info("*** Features ***") for name in sorted(features.keys()): - tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + tf.compat.v1.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) input_ids = features["input_ids"] label_ids = features["label_ids"] @@ -833,7 +834,7 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) else: is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) - + # need to find this no update found is_training = (mode == tf.estimator.ModeKeys.TRAIN) # Create model with aux loss 
@@ -845,11 +846,11 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument chop_off_last_token=False, ) - with tf.variable_scope('classification'): + with tf.compat.v1.variable_scope('classification'): hidden_state = model.pooled_output(pool_token_id) if is_training: hidden_state = dropout(hidden_state, dropout_prob=0.1) - logits = tf.layers.dense( + logits = tf.compat.v1.layers.dense( hidden_state, num_labels, kernel_initializer=create_initializer(config.initializer_range), @@ -857,8 +858,8 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument ) log_probs = tf.nn.log_softmax(logits, axis=-1) one_hot_labels = tf.one_hot(label_ids, depth=num_labels, dtype=tf.float32) - per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) - class_loss = tf.reduce_mean(per_example_loss) + per_example_loss = -tf.math.reduce_sum(one_hot_labels * log_probs, axis=-1) + class_loss = tf.math.reduce_mean(per_example_loss) total_loss = lm_loss_coef * model.lm_loss() + class_loss @@ -866,16 +867,16 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument train_op, train_metrics = optimization_adafactor.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) # tvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) - tvars = tf.trainable_variables() + tvars = tf.compat.v1.trainable_variables() train_metrics['minibatch_cls_loss'] = class_loss - train_metrics['minibatch_acc'] = tf.reduce_mean( - tf.cast(tf.equal(tf.argmax(logits, axis=-1, output_type=tf.int32), + train_metrics['minibatch_acc'] = tf.math.reduce_mean( + tf.cast(tf.math.equal(tf.math.argmax(logits, axis=-1, output_type=tf.int32), label_ids), tf.float32)) else: train_op = None train_metrics = {} - tvars = tf.trainable_variables() + tvars = tf.compat.v1.trainable_variables() initialized_variable_names = {} scaffold_fn = None @@ -884,22 +885,23 @@ def model_fn(features, labels, mode, params): # pylint: 
disable=unused-argument ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint) if use_tpu: def tpu_scaffold(): - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - return tf.train.Scaffold() + tf.compat.v1.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.compat.v1.train.Scaffold() scaffold_fn = tpu_scaffold else: - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + tf.compat.v1.train.init_from_checkpoint(init_checkpoint, assignment_map) - tf.logging.info("**** Trainable Variables ****") + tf.compat.v1.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + tf.compat.v1.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) output_spec = None + # need to find this no update found if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: output_spec = tf.contrib.tpu.TPUEstimatorSpec( @@ -915,15 +917,15 @@ def tpu_scaffold(): loss=total_loss, train_op=train_op, training_hooks=[ - tf.train.LoggingTensorHook({'loss': tf.metrics.mean(total_loss)[1]}, every_n_iter=100)], + tf.compat.v1.train.LoggingTensorHook({'loss': tf.compat.v1.metrics.mean(total_loss)[1]}, every_n_iter=100)], scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(per_example_loss, label_ids, logits, is_real_example): - predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) - accuracy = tf.metrics.accuracy( + predictions = tf.math.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.compat.v1.metrics.accuracy( labels=label_ids, predictions=predictions, weights=is_real_example) - loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) + loss = tf.compat.v1.metrics.mean(values=per_example_loss, weights=is_real_example) return { "eval_accuracy": accuracy, "eval_loss": loss, From 
5c4e16a43b48ea58f1e7144c89e109e9e99c2abd Mon Sep 17 00:00:00 2001 From: EHTIISHAM Date: Tue, 26 Nov 2024 10:21:56 +0500 Subject: [PATCH 2/5] except estimators all changes are done --- lm/modeling.py | 23 +++++++++------- lm/optimization_adafactor.py | 49 ++++++++++++++++++----------------- lm/train.py | 25 +++++++++--------- lm/utils.py | 35 +++++++++++++------------ lm/validate.py | 19 +++++++------- realnews/prepare_lm_data.py | 6 ++--- sample/contextual_generate.py | 15 ++++++----- 7 files changed, 90 insertions(+), 82 deletions(-) diff --git a/lm/modeling.py b/lm/modeling.py index c6b61303..9f2312d6 100644 --- a/lm/modeling.py +++ b/lm/modeling.py @@ -19,6 +19,7 @@ import six import tensorflow as tf +import tensorflow.compat.v1 as tf1 from lm import optimization_adafactor from lm.utils import get_assignment_map_from_checkpoint, get_shape_list, get_attention_mask, gelu, layer_norm, dropout, \ @@ -638,7 +639,7 @@ def tpu_scaffold(): # need to find it no updates found if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: - output_spec = tf.contrib.tpu.TPUEstimatorSpec( + output_spec = tf1.estimator.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, @@ -646,7 +647,7 @@ def tpu_scaffold(): prefix='training/'), scaffold_fn=scaffold_fn) else: - output_spec = tf.contrib.tpu.TPUEstimatorSpec( + output_spec = tf1.estimator.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, @@ -663,7 +664,7 @@ def metric_fn(total_loss): eval_metrics = (metric_fn, [total_loss]) - output_spec = tf.contrib.tpu.TPUEstimatorSpec( + output_spec = tf1.estimator.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, eval_metrics=eval_metrics, @@ -690,7 +691,7 @@ def metric_fn(total_loss): ) pred_logprobs = tf.compat.v1.squeeze(tf.compat.v1.batch_gather(model.log_probs, predictions[:, :, None]), axis=2) - output_spec = tf.contrib.tpu.TPUEstimatorSpec( + output_spec = tf1.estimator.tpu.TPUEstimatorSpec( mode=mode, predictions={'gt_logprobs': gt_logprobs, 
'top_p_required': top_p_required, @@ -904,7 +905,8 @@ def tpu_scaffold(): # need to find this no update found if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: - output_spec = tf.contrib.tpu.TPUEstimatorSpec( + # from here + output_spec = tf1.estimator.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, @@ -912,14 +914,14 @@ def tpu_scaffold(): prefix='training/'), scaffold_fn=scaffold_fn) else: - output_spec = tf.contrib.tpu.TPUEstimatorSpec( + output_spec = tf1.estimator.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, training_hooks=[ tf.compat.v1.train.LoggingTensorHook({'loss': tf.compat.v1.metrics.mean(total_loss)[1]}, every_n_iter=100)], scaffold_fn=scaffold_fn) - + # to here elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(per_example_loss, label_ids, logits, is_real_example): predictions = tf.math.argmax(logits, axis=-1, output_type=tf.int32) @@ -930,16 +932,16 @@ def metric_fn(per_example_loss, label_ids, logits, is_real_example): "eval_accuracy": accuracy, "eval_loss": loss, } - + # from here eval_metrics = (metric_fn, [per_example_loss, label_ids, logits, is_real_example]) - output_spec = tf.contrib.tpu.TPUEstimatorSpec( + output_spec = tf1.estimator.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, eval_metrics=eval_metrics, scaffold_fn=scaffold_fn) else: - output_spec = tf.contrib.tpu.TPUEstimatorSpec( + output_spec = tf1.estimator.tpu.TPUEstimatorSpec( mode=mode, predictions={'logits': logits, 'probs': tf.nn.softmax(logits, axis=-1)}, @@ -947,3 +949,4 @@ def metric_fn(per_example_loss, label_ids, logits, is_real_example): return output_spec return model_fn + # to here \ No newline at end of file diff --git a/lm/optimization_adafactor.py b/lm/optimization_adafactor.py index b8d03ed1..400d56c7 100644 --- a/lm/optimization_adafactor.py +++ b/lm/optimization_adafactor.py @@ -14,17 +14,18 @@ # limitations under the License. 
import re import tensorflow as tf +import tensorflow.compat.v1 as tf1 from lm.utils import get_shape_list def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): """Creates an optimizer training op.""" - global_step = tf.train.get_or_create_global_step() + global_step = tf1.train.get_or_create_global_step() - learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) + learning_rate = tf1.constant(value=init_lr, shape=[], dtype=tf.float32) # Implements linear decay of the learning rate. - learning_rate = tf.train.polynomial_decay( + learning_rate = tf1.train.polynomial_decay( learning_rate, global_step, num_train_steps, @@ -36,7 +37,7 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): # learning rate will be `global_step/num_warmup_steps * init_lr`. if num_warmup_steps: global_steps_int = tf.cast(global_step, tf.int32) - warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) + warmup_steps_int = tf1.constant(num_warmup_steps, dtype=tf.int32) global_steps_float = tf.cast(global_steps_int, tf.float32) warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) @@ -60,10 +61,10 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) if use_tpu: - optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) + optimizer = tf1.tpu.CrossShardOptimizer(optimizer) - tvars = tf.trainable_variables() - grads = tf.gradients(loss, tvars) + tvars = tf1.trainable_variables() + grads = tf1.gradients(loss, tvars) # You could do this, but instead we don't because a) it's slow and b) we already did the 'update clipping' # (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) @@ -125,7 +126,7 @@ def _parameter_scale(self, var): Returns: a Scalar """ - return tf.maximum(reduce_rms(var), self.epsilon2) + return tf.math.maximum(reduce_rms(var), self.epsilon2) def apply_gradients(self, grads_and_vars, global_step=None, 
name=None): """See base class.""" @@ -139,7 +140,7 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None): # decay_rate = 1 - tf.pow(tf.cast(tf.train.get_or_create_global_step(), tf.float32) + 1.0, -0.8) decay_rate = self.beta_2 - grad_squared = tf.square(grad) + self.epsilon1 + grad_squared = tf.math.square(grad) + self.epsilon1 update_scale = self.learning_rate # update_scale = self.learning_rate * tf.cast(self._parameter_scale(param), dtype=tf.float32) @@ -148,7 +149,7 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None): # This confounds the XLA rewriter and keeps it from fusing computations # across different variables. This fusion is a bad for HBM usage, since # it causes the gradients to persist in memory. - grad_squared_mean = tf.reduce_mean(grad_squared) + grad_squared_mean = tf.math.reduce_mean(grad_squared) decay_rate += grad_squared_mean * 1e-30 update_scale += grad_squared_mean * 1e-30 @@ -157,42 +158,42 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None): if self._use_factored(shape_list): num_rows, num_columns = shape_list - vr = tf.get_variable( + vr = tf1.get_variable( name=param_name + "/adafactor_vr", shape=[num_rows], dtype=tf.float32, trainable=False, - initializer=tf.zeros_initializer()) - vc = tf.get_variable( + initializer=tf1.zeros_initializer()) + vc = tf1.get_variable( name=param_name + "/adafactor_vc", shape=[num_columns], dtype=tf.float32, trainable=False, - initializer=tf.zeros_initializer()) + initializer=tf1.zeros_initializer()) - next_vr = decay_rate * vr + (1 - decay_rate) * tf.reduce_mean(grad_squared, 1) - next_vc = decay_rate * vc + (1 - decay_rate) * tf.reduce_mean(grad_squared, 0) + next_vr = decay_rate * vr + (1 - decay_rate) * tf.math.reduce_mean(grad_squared, 1) + next_vc = decay_rate * vc + (1 - decay_rate) * tf.math.reduce_mean(grad_squared, 0) - long_term_mean = tf.reduce_mean(next_vr, -1, keepdims=True) - r_factor = tf.rsqrt(next_vr / long_term_mean + 
self.epsilon1) - c_factor = tf.rsqrt(next_vc + self.epsilon1) + long_term_mean = tf.math.reduce_mean(next_vr, -1, keepdims=True) + r_factor = tf.math.rsqrt(next_vr / long_term_mean + self.epsilon1) + c_factor = tf.math.rsqrt(next_vc + self.epsilon1) update = grad * tf.expand_dims(r_factor, -1) * tf.expand_dims(c_factor, -2) assignments.append(vr.assign(next_vr, use_locking=self.use_locking)) assignments.append(vc.assign(next_vc, use_locking=self.use_locking)) else: - v = tf.get_variable( + v = tf1.get_variable( name=param_name + "/adafactor_v", shape=shape_list, dtype=tf.float32, trainable=False, - initializer=tf.zeros_initializer()) + initializer=tf1.zeros_initializer()) next_v = decay_rate * v + (1 - decay_rate) * grad_squared assignments.append(v.assign(next_v, use_locking=self.use_locking)) - update = grad * tf.rsqrt(next_v + self.epsilon1) + update = grad * tf.math.rsqrt(next_v + self.epsilon1) - clipping_denom = tf.maximum(1.0, reduce_rms(update) / self.clipping_rate) + clipping_denom = tf.math.maximum(1.0, reduce_rms(update) / self.clipping_rate) update /= clipping_denom # Do weight decay @@ -231,4 +232,4 @@ def _get_variable_name(self, param_name): def reduce_rms(x): - return tf.sqrt(tf.reduce_mean(tf.square(x))) + return tf.math.sqrt(tf.math.reduce_mean(tf.math.square(x))) diff --git a/lm/train.py b/lm/train.py index fbf9d3ad..77e0796a 100644 --- a/lm/train.py +++ b/lm/train.py @@ -16,11 +16,12 @@ """ Training script! 
""" import tensorflow as tf +import tensorflow.compat.v1 as tf1 from lm.dataloader import input_fn_builder from lm.modeling import model_fn_builder, GroverConfig -flags = tf.flags +flags = tf1.flags FLAGS = flags.FLAGS @@ -93,25 +94,25 @@ def main(_): - tf.logging.set_verbosity(tf.logging.INFO) + tf1.logging.set_verbosity(tf1.logging.INFO) news_config = GroverConfig.from_json_file(FLAGS.config_file) - tf.gfile.MakeDirs(FLAGS.output_dir) + tf1.gfile.MakeDirs(FLAGS.output_dir) input_files = [] for input_pattern in FLAGS.input_file.split(","): - input_files.extend(tf.gfile.Glob(input_pattern)) + input_files.extend(tf1.gfile.Glob(input_pattern)) - tf.logging.info("*** Input Files ***") + tf1.logging.info("*** Input Files ***") for input_file in input_files: - tf.logging.info(" %s" % input_file) + tf1.logging.info(" %s" % input_file) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: - tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) - + # from here is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, @@ -141,9 +142,9 @@ def main(_): eval_batch_size=FLAGS.train_batch_size, params={'model_dir': FLAGS.output_dir} ) - - tf.logging.info("***** Running training *****") - tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + # to here + tf1.logging.info("***** Running training *****") + tf1.logging.info(" Batch size = %d", FLAGS.train_batch_size) train_input_fn = input_fn_builder( input_files=input_files, seq_length=FLAGS.max_seq_length, @@ -154,4 +155,4 @@ def main(_): if __name__ == "__main__": flags.mark_flag_as_required("input_file") flags.mark_flag_as_required("output_dir") - tf.app.run() + tf1.app.run() diff --git a/lm/utils.py b/lm/utils.py index aa75c71b..470ee50a 100644 --- a/lm/utils.py +++ b/lm/utils.py @@ 
-18,6 +18,7 @@ import six import tensorflow as tf +import tensorflow.compat.v1 as tf1 import numpy as np from tensorflow.python.lib.io import file_io @@ -53,7 +54,7 @@ def assert_rank(tensor, expected_rank, name=None): actual_rank = tensor.shape.ndims if actual_rank not in expected_rank_dict: - scope_name = tf.get_variable_scope().name + scope_name = tf1.get_variable_scope().name raise ValueError( "For the tensor `%s` in scope `%s`, the actual rank " "`%d` (shape = %s) is not equal to the expected rank `%s`" % @@ -91,7 +92,7 @@ def get_shape_list(tensor, expected_rank=None, name=None): if not non_static_indexes: return shape - dyn_shape = tf.shape(tensor) + dyn_shape = tf1.shape(tensor) for index in non_static_indexes: shape[index] = dyn_shape[index] return shape @@ -109,20 +110,20 @@ def gelu(input_tensor): Returns: `input_tensor` with the GELU activation applied. """ - cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0))) + cdf = 0.5 * (1.0 + tf.math.erf(input_tensor / tf.math.sqrt(2.0))) return input_tensor * cdf def layer_norm(input_tensor, name=None, epsilon=1e-5): """Run layer normalization on the last dimension of the tensor.""" name2use = f'LayerNorm_{name}' if name is not None else name - with tf.variable_scope(name2use, default_name='LayerNorm'): + with tf1.variable_scope(name2use, default_name='LayerNorm'): dim = input_tensor.shape[-1].value - gamma = tf.get_variable('gamma', [dim], initializer=tf.constant_initializer(1)) - beta = tf.get_variable('beta', [dim], initializer=tf.constant_initializer(0)) - mean = tf.reduce_mean(input_tensor, axis=-1, keepdims=True) - std = tf.reduce_mean(tf.square(input_tensor - mean), axis=-1, keepdims=True) - input_tensor = (input_tensor - mean) * tf.rsqrt(std + epsilon) + gamma = tf1.get_variable('gamma', [dim], initializer=tf1.constant_initializer(1)) + beta = tf1.get_variable('beta', [dim], initializer=tf1.constant_initializer(0)) + mean = tf.math.reduce_mean(input_tensor, axis=-1, keepdims=True) + std = 
tf.math.reduce_mean(tf.math.square(input_tensor - mean), axis=-1, keepdims=True) + input_tensor = (input_tensor - mean) * tf.math.rsqrt(std + epsilon) input_tensor = input_tensor * gamma + beta return input_tensor @@ -149,8 +150,8 @@ def get_attention_mask(nd, ns, *, dtype): this is a TPU compatible version of tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd) where the lower right triangle contains 1s """ - i = tf.range(nd)[:, None] - j = tf.range(ns) + i = tf.ragged.range(nd)[:, None] + j = tf.ragged.range(ns) m = i >= j - ns + nd return tf.cast(m, dtype) @@ -214,21 +215,21 @@ def host_call_fn(global_step, *args): List of summary ops to run on the CPU host. """ step = global_step[0] - with tf.contrib.summary.create_file_writer( + with tf.summary.create_file_writer( logdir=model_dir, filename_suffix=".host_call").as_default(): - with tf.contrib.summary.always_record_summaries(): + with tf.summary.should_record_summaries(): for i, name in enumerate(metric_names): - tf.contrib.summary.scalar(prefix + name, args[i][0], step=step) + tf1.summary.scalar(prefix + name, args[i][0], step=step) - return tf.contrib.summary.all_summary_ops() + return tf1.summary.all_v2_summary_ops() # To log the current learning rate, and gradient norm for Tensorboard, the # summary op needs to be run on the host CPU via host_call. host_call # expects [batch_size, ...] Tensors, thus reshape to introduce a batch # dimension. These Tensors are implicitly concatenated to # [params['batch_size']]. 
- global_step_tensor = tf.reshape( + global_step_tensor = tf1.manip.reshape( tf.compat.v1.train.get_or_create_global_step(), [1]) - other_tensors = [tf.reshape(metric_dict[key], [1]) for key in metric_names] + other_tensors = [tf.manip.reshape(metric_dict[key], [1]) for key in metric_names] return host_call_fn, [global_step_tensor] + other_tensors diff --git a/lm/validate.py b/lm/validate.py index 0ac1668c..49694768 100644 --- a/lm/validate.py +++ b/lm/validate.py @@ -16,13 +16,14 @@ import os from lm.modeling import model_fn_builder, GroverConfig import tensorflow as tf +import tensorflow.compat.v1 as tf1 from lm.dataloader import input_fn_builder import numpy as np import tempfile import h5py from google.cloud import storage -flags = tf.flags +flags = tf1.flags FLAGS = flags.FLAGS @@ -126,20 +127,20 @@ def ind_where(array: np.ndarray, target, return_first_match=True, default_value= def main(_): - tf.logging.set_verbosity(tf.logging.INFO) + tf1.logging.set_verbosity(tf1.logging.INFO) news_config = GroverConfig.from_json_file(FLAGS.config_file) - tf.gfile.MakeDirs(FLAGS.output_dir) + tf1.gfile.MakeDirs(FLAGS.output_dir) input_files = [] for input_pattern in FLAGS.input_file.split(","): - input_files.extend(tf.gfile.Glob(input_pattern)) + input_files.extend(tf1.gfile.Glob(input_pattern)) - tf.logging.info("*** Input Files ***") + tf1.logging.info("*** Input Files ***") for input_file in input_files: - tf.logging.info(" %s" % input_file) - + tf1.logging.info(" %s" % input_file) + # from here tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( @@ -176,7 +177,7 @@ def main(_): predict_batch_size=FLAGS.batch_size, params={'model_dir': FLAGS.output_dir} ) - + # to here eval_input_fn = input_fn_builder( input_files=input_files, seq_length=FLAGS.max_seq_length, @@ -211,4 +212,4 @@ def main(_): if __name__ == "__main__": flags.mark_flag_as_required("input_file") 
flags.mark_flag_as_required("output_dir") - tf.app.run() + tf1.app.run() diff --git a/realnews/prepare_lm_data.py b/realnews/prepare_lm_data.py index 33fc6433..5e4365e4 100644 --- a/realnews/prepare_lm_data.py +++ b/realnews/prepare_lm_data.py @@ -82,14 +82,14 @@ def __init__(self, fn): self.s3client = boto3.client('s3', ) self.storage_dir = TemporaryDirectory() - self.writer = tf.python_io.TFRecordWriter(os.path.join(self.storage_dir.name, 'temp.tfrecord')) + self.writer = tf.io.TFRecordWriter(os.path.join(self.storage_dir.name, 'temp.tfrecord')) self.bucket_name, self.file_name = self.fn.split('s3://', 1)[1].split('/', 1) elif fn.startswith('gs://'): from google.cloud import storage self.s3client = None self.gclient = storage.Client() self.storage_dir = TemporaryDirectory() - self.writer = tf.python_io.TFRecordWriter(os.path.join(self.storage_dir.name, 'temp.tfrecord')) + self.writer = tf.io.TFRecordWriter(os.path.join(self.storage_dir.name, 'temp.tfrecord')) self.bucket_name, self.file_name = self.fn.split('gs://', 1)[1].split('/', 1) else: @@ -98,7 +98,7 @@ def __init__(self, fn): self.bucket_name = None self.file_name = None self.storage_dir = None - self.writer = tf.python_io.TFRecordWriter(fn) + self.writer = tf.io.TFRecordWriter(fn) def write(self, x): self.writer.write(x) diff --git a/sample/contextual_generate.py b/sample/contextual_generate.py index 85d55235..873f4b44 100644 --- a/sample/contextual_generate.py +++ b/sample/contextual_generate.py @@ -1,4 +1,5 @@ import tensorflow as tf +import tensorflow.compat.v1 as tf1 import numpy as np import sys import json @@ -101,19 +102,19 @@ with open(args.metadata_fn, 'r') as f: articles = [json.loads(l) for i, l in enumerate(f) if i % args.num_folds == args.fold] -tf_config = tf.ConfigProto(allow_soft_placement=True) +tf_config = tf1.ConfigProto(allow_soft_placement=True) -with tf.Session(config=tf_config, graph=tf.Graph()) as sess, \ +with tf1.Session(config=tf_config, graph=tf.Graph()) as sess, \ 
open(args.out_fn, 'w') as f_out: - initial_context = tf.placeholder(tf.int32, [batch_size_per_chunk, None]) - p_for_topp = tf.placeholder(tf.float32, [batch_size_per_chunk]) - eos_token = tf.placeholder(tf.int32, []) - ignore_ids = tf.placeholder(tf.bool, [news_config.vocab_size]) + initial_context = tf1.placeholder(tf.int32, [batch_size_per_chunk, None]) + p_for_topp = tf1.placeholder(tf.float32, [batch_size_per_chunk]) + eos_token = tf1.placeholder(tf.int32, []) + ignore_ids = tf1.placeholder(tf.bool, [news_config.vocab_size]) tokens, probs = sample(news_config=news_config, initial_context=initial_context, eos_token=eos_token, ignore_ids=ignore_ids, p_for_topp=p_for_topp, do_topk=False) - saver = tf.train.Saver() + saver = tf1.train.Saver() saver.restore(sess, args.model_ckpt) # Let's go! From 9422065258184a88dd6be3360489efdf8f561132 Mon Sep 17 00:00:00 2001 From: EHTIISHAM <100058262+EHTIISHAM@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:13:08 +0500 Subject: [PATCH 3/5] Update requirements-gpu.txt --- requirements-gpu.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/requirements-gpu.txt b/requirements-gpu.txt index 216df38b..a69664c0 100644 --- a/requirements-gpu.txt +++ b/requirements-gpu.txt @@ -1,8 +1,8 @@ -pandas==0.24.2 -regex==2019.4.14 -h5py==2.9.0 -numpy==1.16.2 -tensorboard==1.13.1 -tensorflow-gpu==1.13.1 -tqdm==4.31.1 -requests==2.22.0 \ No newline at end of file +pandas +regex +h5py +numpy==1.26.3 +tensorboard +tensorflow-gpu==2.16.1 +tqdm +requests From 8e086a88e634b28118c751f647b1f13a25dd910f Mon Sep 17 00:00:00 2001 From: EHTIISHAM <100058262+EHTIISHAM@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:20:42 +0500 Subject: [PATCH 4/5] Update requirements-gpu.txt --- requirements-gpu.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-gpu.txt b/requirements-gpu.txt index a69664c0..4a10d921 100644 --- a/requirements-gpu.txt +++ b/requirements-gpu.txt @@ -3,6 +3,6 @@ regex 
h5py numpy==1.26.3 tensorboard -tensorflow-gpu==2.16.1 +tensorflow==2.16.1 tqdm requests From 39eca7440c0c085ffc8a03101e6aea0d0141f3e7 Mon Sep 17 00:00:00 2001 From: EHTIISHAM <100058262+EHTIISHAM@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:33:46 +0500 Subject: [PATCH 5/5] Update optimization_adafactor.py --- lm/optimization_adafactor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm/optimization_adafactor.py b/lm/optimization_adafactor.py index 400d56c7..ff1c0ed9 100644 --- a/lm/optimization_adafactor.py +++ b/lm/optimization_adafactor.py @@ -86,7 +86,7 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): return train_op, train_metrics -class AdaFactorOptimizer(tf.train.Optimizer): +class AdaFactorOptimizer(tf1.train.Optimizer): """here's the optimizer we'll use""" def __init__(self,