4 changes: 2 additions & 2 deletions discrimination/run_discrimination.py
@@ -21,14 +21,14 @@

import numpy as np
import tensorflow as tf
from tensorflow.python.lib.io import file_io
#from tensorflow.python.lib.io import file_io

from lm.dataloader import classification_convert_examples_to_features, classification_input_fn_builder
from lm.modeling import classification_model_fn_builder, GroverConfig
from lm.utils import _save_np
from sample.encoder import get_encoder

flags = tf.flags
flags = tf.compat.v1.flags

FLAGS = flags.FLAGS

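For context, a minimal standalone sketch (not part of the PR) of the flag pattern that run_discrimination.py keeps working through the compat layer; the flag name below is a hypothetical placeholder, not one of the script's real flags.

import tensorflow as tf

flags = tf.compat.v1.flags
FLAGS = flags.FLAGS

# Hypothetical flag for illustration only; the real script defines its own set.
flags.DEFINE_string("input_glob", None, "Glob pattern for input TFRecord files.")

def main(_):
    tf.compat.v1.logging.info("input_glob = %s", FLAGS.input_glob)

if __name__ == "__main__":
    tf.compat.v1.app.run(main)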
15 changes: 8 additions & 7 deletions lm/dataloader.py
@@ -19,7 +19,7 @@

def _decode_record(record, name_to_features):
"""Decodes a record to a TensorFlow example."""
example = tf.parse_single_example(record, name_to_features)
example = tf.compat.v1.parse_single_example(record, name_to_features)

# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
# So cast all int64 to int32.
@@ -42,7 +42,7 @@ def input_fn(params):
"""The actual input function."""
batch_size = params["batch_size"]
name_to_features = {
"input_ids": tf.FixedLenFeature([seq_length + 1], tf.int64),
"input_ids": tf.io.FixedLenFeature([seq_length + 1], tf.int64),
}

# For training, we want a lot of parallel reading and shuffling.
@@ -57,6 +57,7 @@

# `sloppy` mode means that the interleaving is not exact. This adds
# even more randomness to the training pipeline.
# tf.data.experimental.parallel_interleave will be removed in a future version; use tf.data.Dataset.interleave instead
d = d.apply(
tf.data.experimental.parallel_interleave(
tf.data.TFRecordDataset,
@@ -91,13 +92,13 @@ def classification_convert_examples_to_features(
chop_from_front_if_needed=True):
"""Convert a set of `InputExample`s to a TFRecord file."""

writer = tf.python_io.TFRecordWriter(output_file)
writer = tf.io.TFRecordWriter(output_file)

label_map = {label: i for i, label in enumerate(labels)}

for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
tf.compat.v1.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

# begin_summary is our [CLS] token
tokens = example['ids'] + [encoder.begin_summary]
@@ -134,9 +135,9 @@ def classification_input_fn_builder(input_file, seq_length, is_training,
"""Creates an `input_fn` closure to be passed to TPUEstimator."""

name_to_features = {
"input_ids": tf.FixedLenFeature([seq_length], tf.int64),
"label_ids": tf.FixedLenFeature([], tf.int64),
"is_real_example": tf.FixedLenFeature([], tf.int64),
"input_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
"label_ids": tf.io.FixedLenFeature([], tf.int64),
"is_real_example": tf.io.FixedLenFeature([], tf.int64),
}

def input_fn(params):
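As the new comment in input_fn notes, tf.data.experimental.parallel_interleave is slated for removal. The sketch below (not part of this PR; the file pattern, cycle length, and sequence length are placeholder values) shows a roughly equivalent pipeline built with Dataset.interleave and the tf.io parsing names this file now uses.

import tensorflow as tf

seq_length = 1024  # placeholder; the real value comes from FLAGS

name_to_features = {
    "input_ids": tf.io.FixedLenFeature([seq_length + 1], tf.int64),
}

def _decode_record(record):
    example = tf.io.parse_single_example(record, name_to_features)
    # tf.Example only supports int64; cast to int32 for TPU friendliness.
    return {k: tf.cast(v, tf.int32) for k, v in example.items()}

def build_dataset(file_pattern, batch_size):
    files = tf.data.Dataset.list_files(file_pattern, shuffle=True)
    # interleave with num_parallel_calls and deterministic=False stands in for
    # tf.data.experimental.parallel_interleave(..., sloppy=True).
    d = files.interleave(
        tf.data.TFRecordDataset,
        cycle_length=4,
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=False)
    d = d.map(_decode_record, num_parallel_calls=tf.data.AUTOTUNE)
    return d.batch(batch_size, drop_remainder=True)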
175 changes: 90 additions & 85 deletions lm/modeling.py

Large diffs are not rendered by default.

51 changes: 26 additions & 25 deletions lm/optimization_adafactor.py
@@ -14,17 +14,18 @@
# limitations under the License.
import re
import tensorflow as tf
import tensorflow.compat.v1 as tf1
from lm.utils import get_shape_list


def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
"""Creates an optimizer training op."""
global_step = tf.train.get_or_create_global_step()
global_step = tf1.train.get_or_create_global_step()

learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
learning_rate = tf1.constant(value=init_lr, shape=[], dtype=tf.float32)

# Implements linear decay of the learning rate.
learning_rate = tf.train.polynomial_decay(
learning_rate = tf1.train.polynomial_decay(
learning_rate,
global_step,
num_train_steps,
@@ -36,7 +37,7 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
# learning rate will be `global_step/num_warmup_steps * init_lr`.
if num_warmup_steps:
global_steps_int = tf.cast(global_step, tf.int32)
warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
warmup_steps_int = tf1.constant(num_warmup_steps, dtype=tf.int32)

global_steps_float = tf.cast(global_steps_int, tf.float32)
warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
@@ -60,10 +61,10 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

if use_tpu:
optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
optimizer = tf1.tpu.CrossShardOptimizer(optimizer)

tvars = tf.trainable_variables()
grads = tf.gradients(loss, tvars)
tvars = tf1.trainable_variables()
grads = tf1.gradients(loss, tvars)

# You could do this, but instead we don't because a) it's slow and b) we already did the 'update clipping'
# (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
@@ -85,7 +86,7 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
return train_op, train_metrics


class AdaFactorOptimizer(tf.train.Optimizer):
class AdaFactorOptimizer(tf1.train.Optimizer):
"""here's the optimizer we'll use"""

def __init__(self,
@@ -125,7 +126,7 @@ def _parameter_scale(self, var):
Returns:
a Scalar
"""
return tf.maximum(reduce_rms(var), self.epsilon2)
return tf.math.maximum(reduce_rms(var), self.epsilon2)

def apply_gradients(self, grads_and_vars, global_step=None, name=None):
"""See base class."""
@@ -139,7 +140,7 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None):

# decay_rate = 1 - tf.pow(tf.cast(tf.train.get_or_create_global_step(), tf.float32) + 1.0, -0.8)
decay_rate = self.beta_2
grad_squared = tf.square(grad) + self.epsilon1
grad_squared = tf.math.square(grad) + self.epsilon1

update_scale = self.learning_rate
# update_scale = self.learning_rate * tf.cast(self._parameter_scale(param), dtype=tf.float32)
@@ -148,7 +149,7 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None):
# This confounds the XLA rewriter and keeps it from fusing computations
# across different variables. This fusion is a bad for HBM usage, since
# it causes the gradients to persist in memory.
grad_squared_mean = tf.reduce_mean(grad_squared)
grad_squared_mean = tf.math.reduce_mean(grad_squared)
decay_rate += grad_squared_mean * 1e-30
update_scale += grad_squared_mean * 1e-30

@@ -157,42 +158,42 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None):
if self._use_factored(shape_list):
num_rows, num_columns = shape_list

vr = tf.get_variable(
vr = tf1.get_variable(
name=param_name + "/adafactor_vr",
shape=[num_rows],
dtype=tf.float32,
trainable=False,
initializer=tf.zeros_initializer())
vc = tf.get_variable(
initializer=tf1.zeros_initializer())
vc = tf1.get_variable(
name=param_name + "/adafactor_vc",
shape=[num_columns],
dtype=tf.float32,
trainable=False,
initializer=tf.zeros_initializer())
initializer=tf1.zeros_initializer())

next_vr = decay_rate * vr + (1 - decay_rate) * tf.reduce_mean(grad_squared, 1)
next_vc = decay_rate * vc + (1 - decay_rate) * tf.reduce_mean(grad_squared, 0)
next_vr = decay_rate * vr + (1 - decay_rate) * tf.math.reduce_mean(grad_squared, 1)
next_vc = decay_rate * vc + (1 - decay_rate) * tf.math.reduce_mean(grad_squared, 0)

long_term_mean = tf.reduce_mean(next_vr, -1, keepdims=True)
r_factor = tf.rsqrt(next_vr / long_term_mean + self.epsilon1)
c_factor = tf.rsqrt(next_vc + self.epsilon1)
long_term_mean = tf.math.reduce_mean(next_vr, -1, keepdims=True)
r_factor = tf.math.rsqrt(next_vr / long_term_mean + self.epsilon1)
c_factor = tf.math.rsqrt(next_vc + self.epsilon1)
update = grad * tf.expand_dims(r_factor, -1) * tf.expand_dims(c_factor, -2)

assignments.append(vr.assign(next_vr, use_locking=self.use_locking))
assignments.append(vc.assign(next_vc, use_locking=self.use_locking))
else:
v = tf.get_variable(
v = tf1.get_variable(
name=param_name + "/adafactor_v",
shape=shape_list,
dtype=tf.float32,
trainable=False,
initializer=tf.zeros_initializer())
initializer=tf1.zeros_initializer())
next_v = decay_rate * v + (1 - decay_rate) * grad_squared

assignments.append(v.assign(next_v, use_locking=self.use_locking))
update = grad * tf.rsqrt(next_v + self.epsilon1)
update = grad * tf.math.rsqrt(next_v + self.epsilon1)

clipping_denom = tf.maximum(1.0, reduce_rms(update) / self.clipping_rate)
clipping_denom = tf.math.maximum(1.0, reduce_rms(update) / self.clipping_rate)
update /= clipping_denom

# Do weight decay
@@ -231,4 +232,4 @@ def _get_variable_name(self, param_name):


def reduce_rms(x):
return tf.sqrt(tf.reduce_mean(tf.square(x)))
return tf.math.sqrt(tf.math.reduce_mean(tf.math.square(x)))
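A small, self-contained illustration (assuming TF 2.x eager execution; the clipping_rate value here is arbitrary) of the RMS-based update clipping that apply_gradients performs, written with the tf.math.* names the diff adopts.

import tensorflow as tf

def reduce_rms(x):
    return tf.math.sqrt(tf.math.reduce_mean(tf.math.square(x)))

def clip_update(update, clipping_rate=2.0):
    # Shrink the update only when its RMS exceeds clipping_rate.
    clipping_denom = tf.math.maximum(1.0, reduce_rms(update) / clipping_rate)
    return update / clipping_denom

update = tf.constant([3.0, -4.0])   # RMS ~ 3.54 > 2.0, so the update gets scaled
print(clip_update(update).numpy())  # roughly [1.70, -2.26]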
25 changes: 13 additions & 12 deletions lm/train.py
@@ -16,11 +16,12 @@
""" Training script! """

import tensorflow as tf
import tensorflow.compat.v1 as tf1

from lm.dataloader import input_fn_builder
from lm.modeling import model_fn_builder, GroverConfig

flags = tf.flags
flags = tf1.flags

FLAGS = flags.FLAGS

@@ -93,25 +94,25 @@


def main(_):
tf.logging.set_verbosity(tf.logging.INFO)
tf1.logging.set_verbosity(tf1.logging.INFO)

news_config = GroverConfig.from_json_file(FLAGS.config_file)

tf.gfile.MakeDirs(FLAGS.output_dir)
tf1.gfile.MakeDirs(FLAGS.output_dir)

input_files = []
for input_pattern in FLAGS.input_file.split(","):
input_files.extend(tf.gfile.Glob(input_pattern))
input_files.extend(tf1.gfile.Glob(input_pattern))

tf.logging.info("*** Input Files ***")
tf1.logging.info("*** Input Files ***")
for input_file in input_files:
tf.logging.info(" %s" % input_file)
tf1.logging.info(" %s" % input_file)

tpu_cluster_resolver = None
if FLAGS.use_tpu and FLAGS.tpu_name:
tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

# from here
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
cluster=tpu_cluster_resolver,
@@ -141,9 +142,9 @@ def main(_):
eval_batch_size=FLAGS.train_batch_size,
params={'model_dir': FLAGS.output_dir}
)

tf.logging.info("***** Running training *****")
tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
# to here
tf1.logging.info("***** Running training *****")
tf1.logging.info(" Batch size = %d", FLAGS.train_batch_size)
train_input_fn = input_fn_builder(
input_files=input_files,
seq_length=FLAGS.max_seq_length,
@@ -154,4 +155,4 @@ def main(_):
if __name__ == "__main__":
flags.mark_flag_as_required("input_file")
flags.mark_flag_as_required("output_dir")
tf.app.run()
tf1.app.run()
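A hedged sketch (not taken from the PR) of the resolver construction that replaces the removed tf.contrib namespace in main(); the commented usage values are placeholders.

import tensorflow as tf

def make_tpu_resolver(tpu_name, tpu_zone=None, gcp_project=None):
    # tf.distribute.cluster_resolver.TPUClusterResolver is the TF 2.x home of the
    # class formerly reached via tf.contrib.cluster_resolver.
    return tf.distribute.cluster_resolver.TPUClusterResolver(
        tpu=tpu_name, zone=tpu_zone, project=gcp_project)

# Example (requires a real Cloud TPU and credentials):
# resolver = make_tpu_resolver(FLAGS.tpu_name, FLAGS.tpu_zone, FLAGS.gcp_project)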
35 changes: 18 additions & 17 deletions lm/utils.py
@@ -18,6 +18,7 @@

import six
import tensorflow as tf
import tensorflow.compat.v1 as tf1
import numpy as np
from tensorflow.python.lib.io import file_io

@@ -53,7 +54,7 @@ def assert_rank(tensor, expected_rank, name=None):

actual_rank = tensor.shape.ndims
if actual_rank not in expected_rank_dict:
scope_name = tf.get_variable_scope().name
scope_name = tf1.get_variable_scope().name
raise ValueError(
"For the tensor `%s` in scope `%s`, the actual rank "
"`%d` (shape = %s) is not equal to the expected rank `%s`" %
@@ -91,7 +92,7 @@ def get_shape_list(tensor, expected_rank=None, name=None):
if not non_static_indexes:
return shape

dyn_shape = tf.shape(tensor)
dyn_shape = tf1.shape(tensor)
for index in non_static_indexes:
shape[index] = dyn_shape[index]
return shape
@@ -109,20 +110,20 @@ def gelu(input_tensor):
Returns:
`input_tensor` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0)))
cdf = 0.5 * (1.0 + tf.math.erf(input_tensor / tf.math.sqrt(2.0)))
return input_tensor * cdf


def layer_norm(input_tensor, name=None, epsilon=1e-5):
"""Run layer normalization on the last dimension of the tensor."""
name2use = f'LayerNorm_{name}' if name is not None else name
with tf.variable_scope(name2use, default_name='LayerNorm'):
with tf1.variable_scope(name2use, default_name='LayerNorm'):
dim = input_tensor.shape[-1].value
gamma = tf.get_variable('gamma', [dim], initializer=tf.constant_initializer(1))
beta = tf.get_variable('beta', [dim], initializer=tf.constant_initializer(0))
mean = tf.reduce_mean(input_tensor, axis=-1, keepdims=True)
std = tf.reduce_mean(tf.square(input_tensor - mean), axis=-1, keepdims=True)
input_tensor = (input_tensor - mean) * tf.rsqrt(std + epsilon)
gamma = tf1.get_variable('gamma', [dim], initializer=tf1.constant_initializer(1))
beta = tf1.get_variable('beta', [dim], initializer=tf1.constant_initializer(0))
mean = tf.math.reduce_mean(input_tensor, axis=-1, keepdims=True)
std = tf.math.reduce_mean(tf.math.square(input_tensor - mean), axis=-1, keepdims=True)
input_tensor = (input_tensor - mean) * tf.math.rsqrt(std + epsilon)
input_tensor = input_tensor * gamma + beta
return input_tensor

@@ -149,8 +150,8 @@ def get_attention_mask(nd, ns, *, dtype):
this is a TPU compatible version of tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd)
where the lower right triangle contains 1s
"""
i = tf.range(nd)[:, None]
j = tf.range(ns)
i = tf.ragged.range(nd)[:, None]
j = tf.ragged.range(ns)
m = i >= j - ns + nd
return tf.cast(m, dtype)

@@ -214,21 +215,21 @@ def host_call_fn(global_step, *args):
List of summary ops to run on the CPU host.
"""
step = global_step[0]
with tf.contrib.summary.create_file_writer(
with tf.summary.create_file_writer(
logdir=model_dir, filename_suffix=".host_call").as_default():
with tf.contrib.summary.always_record_summaries():
with tf.summary.should_record_summaries():
for i, name in enumerate(metric_names):
tf.contrib.summary.scalar(prefix + name, args[i][0], step=step)
tf1.summary.scalar(prefix + name, args[i][0], step=step)

return tf.contrib.summary.all_summary_ops()
return tf1.summary.all_v2_summary_ops()

# To log the current learning rate, and gradient norm for Tensorboard, the
# summary op needs to be run on the host CPU via host_call. host_call
# expects [batch_size, ...] Tensors, thus reshape to introduce a batch
# dimension. These Tensors are implicitly concatenated to
# [params['batch_size']].
global_step_tensor = tf.reshape(
global_step_tensor = tf1.manip.reshape(
tf.compat.v1.train.get_or_create_global_step(), [1])
other_tensors = [tf.reshape(metric_dict[key], [1]) for key in metric_names]
other_tensors = [tf.manip.reshape(metric_dict[key], [1]) for key in metric_names]

return host_call_fn, [global_step_tensor] + other_tensors
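A small sketch (the logdir, tag, and value are placeholders) of writing a scalar with the TF 2.x summary API that host_call_fn is being moved toward.

import tensorflow as tf

writer = tf.summary.create_file_writer("logs", filename_suffix=".host_call")
with writer.as_default():
    # record_if replaces the old contrib always_record_summaries context.
    with tf.summary.record_if(True):
        tf.summary.scalar("training/loss", 0.25, step=0)
writer.flush()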