4 changes: 2 additions & 2 deletions discrimination/run_discrimination.py
@@ -21,14 +21,14 @@

import numpy as np
import tensorflow as tf
from tensorflow.python.lib.io import file_io
#from tensorflow.python.lib.io import file_io

from lm.dataloader import classification_convert_examples_to_features, classification_input_fn_builder
from lm.modeling import classification_model_fn_builder, GroverConfig
from lm.utils import _save_np
from sample.encoder import get_encoder

flags = tf.flags
flags = tf.compat.v1.flags

FLAGS = flags.FLAGS

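For context, a minimal standalone sketch (not part of the PR) of the flag pattern that run_discrimination.py keeps working through the compat layer; the flag name below is a hypothetical placeholder, not one of the script's real flags.

import tensorflow as tf

flags = tf.compat.v1.flags
FLAGS = flags.FLAGS

# Hypothetical flag for illustration only; the real script defines its own set.
flags.DEFINE_string("input_glob", None, "Glob pattern for input TFRecord files.")

def main(_):
    tf.compat.v1.logging.info("input_glob = %s", FLAGS.input_glob)

if __name__ == "__main__":
    tf.compat.v1.app.run(main)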
15 changes: 8 additions & 7 deletions lm/dataloader.py
@@ -19,7 +19,7 @@

def _decode_record(record, name_to_features):
"""Decodes a record to a TensorFlow example."""
example = tf.parse_single_example(record, name_to_features)
example = tf.compat.v1.parse_single_example(record, name_to_features)

# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
# So cast all int64 to int32.
@@ -42,7 +42,7 @@ def input_fn(params):
"""The actual input function."""
batch_size = params["batch_size"]
name_to_features = {
"input_ids": tf.FixedLenFeature([seq_length + 1], tf.int64),
"input_ids": tf.io.FixedLenFeature([seq_length + 1], tf.int64),
}

# For training, we want a lot of parallel reading and shuffling.
@@ -57,6 +57,7 @@

# `sloppy` mode means that the interleaving is not exact. This adds
# even more randomness to the training pipeline.
# tf.data.experimental.parallel_interleave will be removed in a future version; use tf.data.Dataset.interleave instead
d = d.apply(
tf.data.experimental.parallel_interleave(
tf.data.TFRecordDataset,
@@ -91,13 +92,13 @@ def classification_convert_examples_to_features(
chop_from_front_if_needed=True):
"""Convert a set of `InputExample`s to a TFRecord file."""

writer = tf.python_io.TFRecordWriter(output_file)
writer = tf.io.TFRecordWriter(output_file)

label_map = {label: i for i, label in enumerate(labels)}

for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
tf.compat.v1.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

# begin_summary is our [CLS] token
tokens = example['ids'] + [encoder.begin_summary]
@@ -134,9 +135,9 @@ def classification_input_fn_builder(input_file, seq_length, is_training,
"""Creates an `input_fn` closure to be passed to TPUEstimator."""

name_to_features = {
"input_ids": tf.FixedLenFeature([seq_length], tf.int64),
"label_ids": tf.FixedLenFeature([], tf.int64),
"is_real_example": tf.FixedLenFeature([], tf.int64),
"input_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
"label_ids": tf.io.FixedLenFeature([], tf.int64),
"is_real_example": tf.io.FixedLenFeature([], tf.int64),
}

def input_fn(params):
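As the new comment in input_fn notes, tf.data.experimental.parallel_interleave is slated for removal. The sketch below (not part of this PR; the file pattern, cycle length, and sequence length are placeholder values) shows a roughly equivalent pipeline built with Dataset.interleave and the tf.io parsing names this file now uses.

import tensorflow as tf

seq_length = 1024  # placeholder; the real value comes from FLAGS

name_to_features = {
    "input_ids": tf.io.FixedLenFeature([seq_length + 1], tf.int64),
}

def _decode_record(record):
    example = tf.io.parse_single_example(record, name_to_features)
    # tf.Example only supports int64; cast to int32 for TPU friendliness.
    return {k: tf.cast(v, tf.int32) for k, v in example.items()}

def build_dataset(file_pattern, batch_size):
    files = tf.data.Dataset.list_files(file_pattern, shuffle=True)
    # interleave with num_parallel_calls and deterministic=False stands in for
    # tf.data.experimental.parallel_interleave(..., sloppy=True).
    d = files.interleave(
        tf.data.TFRecordDataset,
        cycle_length=4,
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=False)
    d = d.map(_decode_record, num_parallel_calls=tf.data.AUTOTUNE)
    return d.batch(batch_size, drop_remainder=True)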
175 changes: 90 additions & 85 deletions lm/modeling.py

Large diffs are not rendered by default.

51 changes: 26 additions & 25 deletions lm/optimization_adafactor.py
@@ -14,17 +14,18 @@
# limitations under the License.
import re
import tensorflow as tf
import tensorflow.compat.v1 as tf1
from lm.utils import get_shape_list


def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
"""Creates an optimizer training op."""
global_step = tf.train.get_or_create_global_step()
global_step = tf1.train.get_or_create_global_step()

learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
learning_rate = tf1.constant(value=init_lr, shape=[], dtype=tf.float32)

# Implements linear decay of the learning rate.
learning_rate = tf.train.polynomial_decay(
learning_rate = tf1.train.polynomial_decay(
learning_rate,
global_step,
num_train_steps,
@@ -36,7 +37,7 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
# learning rate will be `global_step/num_warmup_steps * init_lr`.
if num_warmup_steps:
global_steps_int = tf.cast(global_step, tf.int32)
warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
warmup_steps_int = tf1.constant(num_warmup_steps, dtype=tf.int32)

global_steps_float = tf.cast(global_steps_int, tf.float32)
warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
@@ -60,10 +61,10 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

if use_tpu:
optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
optimizer = tf1.tpu.CrossShardOptimizer(optimizer)

tvars = tf.trainable_variables()
grads = tf.gradients(loss, tvars)
tvars = tf1.trainable_variables()
grads = tf1.gradients(loss, tvars)

# You could do this, but instead we don't because a) it's slow and b) we already did the 'update clipping'
# (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
@@ -85,7 +86,7 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
return train_op, train_metrics


class AdaFactorOptimizer(tf.train.Optimizer):
class AdaFactorOptimizer(tf1.train.Optimizer):
"""here's the optimizer we'll use"""

def __init__(self,
@@ -125,7 +126,7 @@ def _parameter_scale(self, var):
Returns:
a Scalar
"""
return tf.maximum(reduce_rms(var), self.epsilon2)
return tf.math.maximum(reduce_rms(var), self.epsilon2)

def apply_gradients(self, grads_and_vars, global_step=None, name=None):
"""See base class."""
@@ -139,7 +140,7 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None):

# decay_rate = 1 - tf.pow(tf.cast(tf.train.get_or_create_global_step(), tf.float32) + 1.0, -0.8)
decay_rate = self.beta_2
grad_squared = tf.square(grad) + self.epsilon1
grad_squared = tf.math.square(grad) + self.epsilon1

update_scale = self.learning_rate
# update_scale = self.learning_rate * tf.cast(self._parameter_scale(param), dtype=tf.float32)
@@ -148,7 +149,7 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None):
# This confounds the XLA rewriter and keeps it from fusing computations
# across different variables. This fusion is a bad for HBM usage, since
# it causes the gradients to persist in memory.
grad_squared_mean = tf.reduce_mean(grad_squared)
grad_squared_mean = tf.math.reduce_mean(grad_squared)
decay_rate += grad_squared_mean * 1e-30
update_scale += grad_squared_mean * 1e-30

@@ -157,42 +158,42 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None):
if self._use_factored(shape_list):
num_rows, num_columns = shape_list

vr = tf.get_variable(
vr = tf1.get_variable(
name=param_name + "/adafactor_vr",
shape=[num_rows],
dtype=tf.float32,
trainable=False,
initializer=tf.zeros_initializer())
vc = tf.get_variable(
initializer=tf1.zeros_initializer())
vc = tf1.get_variable(
name=param_name + "/adafactor_vc",
shape=[num_columns],
dtype=tf.float32,
trainable=False,
initializer=tf.zeros_initializer())
initializer=tf1.zeros_initializer())

next_vr = decay_rate * vr + (1 - decay_rate) * tf.reduce_mean(grad_squared, 1)
next_vc = decay_rate * vc + (1 - decay_rate) * tf.reduce_mean(grad_squared, 0)
next_vr = decay_rate * vr + (1 - decay_rate) * tf.math.reduce_mean(grad_squared, 1)
next_vc = decay_rate * vc + (1 - decay_rate) * tf.math.reduce_mean(grad_squared, 0)

long_term_mean = tf.reduce_mean(next_vr, -1, keepdims=True)
r_factor = tf.rsqrt(next_vr / long_term_mean + self.epsilon1)
c_factor = tf.rsqrt(next_vc + self.epsilon1)
long_term_mean = tf.math.reduce_mean(next_vr, -1, keepdims=True)
r_factor = tf.math.rsqrt(next_vr / long_term_mean + self.epsilon1)
c_factor = tf.math.rsqrt(next_vc + self.epsilon1)
update = grad * tf.expand_dims(r_factor, -1) * tf.expand_dims(c_factor, -2)

assignments.append(vr.assign(next_vr, use_locking=self.use_locking))
assignments.append(vc.assign(next_vc, use_locking=self.use_locking))
else:
v = tf.get_variable(
v = tf1.get_variable(
name=param_name + "/adafactor_v",
shape=shape_list,
dtype=tf.float32,
trainable=False,
initializer=tf.zeros_initializer())
initializer=tf1.zeros_initializer())
next_v = decay_rate * v + (1 - decay_rate) * grad_squared

assignments.append(v.assign(next_v, use_locking=self.use_locking))
update = grad * tf.rsqrt(next_v + self.epsilon1)
update = grad * tf.math.rsqrt(next_v + self.epsilon1)

clipping_denom = tf.maximum(1.0, reduce_rms(update) / self.clipping_rate)
clipping_denom = tf.math.maximum(1.0, reduce_rms(update) / self.clipping_rate)
update /= clipping_denom

# Do weight decay
@@ -231,4 +232,4 @@ def _get_variable_name(self, param_name):


def reduce_rms(x):
return tf.sqrt(tf.reduce_mean(tf.square(x)))
return tf.math.sqrt(tf.math.reduce_mean(tf.math.square(x)))
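A small, self-contained illustration (assuming TF 2.x eager execution; the clipping_rate value here is arbitrary) of the RMS-based update clipping that apply_gradients performs, written with the tf.math.* names the diff adopts.

import tensorflow as tf

def reduce_rms(x):
    return tf.math.sqrt(tf.math.reduce_mean(tf.math.square(x)))

def clip_update(update, clipping_rate=2.0):
    # Shrink the update only when its RMS exceeds clipping_rate.
    clipping_denom = tf.math.maximum(1.0, reduce_rms(update) / clipping_rate)
    return update / clipping_denom

update = tf.constant([3.0, -4.0])   # RMS ~ 3.54 > 2.0, so the update gets scaled
print(clip_update(update).numpy())  # roughly [1.70, -2.26]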
25 changes: 13 additions & 12 deletions lm/train.py
@@ -16,11 +16,12 @@
""" Training script! """

import tensorflow as tf
import tensorflow.compat.v1 as tf1

from lm.dataloader import input_fn_builder
from lm.modeling import model_fn_builder, GroverConfig

flags = tf.flags
flags = tf1.flags

FLAGS = flags.FLAGS

@@ -93,25 +94,25 @@


def main(_):
tf.logging.set_verbosity(tf.logging.INFO)
tf1.logging.set_verbosity(tf1.logging.INFO)

news_config = GroverConfig.from_json_file(FLAGS.config_file)

tf.gfile.MakeDirs(FLAGS.output_dir)
tf1.gfile.MakeDirs(FLAGS.output_dir)

input_files = []
for input_pattern in FLAGS.input_file.split(","):
input_files.extend(tf.gfile.Glob(input_pattern))
input_files.extend(tf1.gfile.Glob(input_pattern))

tf.logging.info("*** Input Files ***")
tf1.logging.info("*** Input Files ***")
for input_file in input_files:
tf.logging.info(" %s" % input_file)
tf1.logging.info(" %s" % input_file)

tpu_cluster_resolver = None
if FLAGS.use_tpu and FLAGS.tpu_name:
tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

# from here
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
cluster=tpu_cluster_resolver,
@@ -141,9 +142,9 @@ def main(_):
eval_batch_size=FLAGS.train_batch_size,
params={'model_dir': FLAGS.output_dir}
)

tf.logging.info("***** Running training *****")
tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
# to here
tf1.logging.info("***** Running training *****")
tf1.logging.info(" Batch size = %d", FLAGS.train_batch_size)
train_input_fn = input_fn_builder(
input_files=input_files,
seq_length=FLAGS.max_seq_length,
@@ -154,4 +155,4 @@ def main(_):
if __name__ == "__main__":
flags.mark_flag_as_required("input_file")
flags.mark_flag_as_required("output_dir")
tf.app.run()
tf1.app.run()
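A hedged sketch (not taken from the PR) of the resolver construction that replaces the removed tf.contrib namespace in main(); the commented usage values are placeholders.

import tensorflow as tf

def make_tpu_resolver(tpu_name, tpu_zone=None, gcp_project=None):
    # tf.distribute.cluster_resolver.TPUClusterResolver is the TF 2.x home of the
    # class formerly reached via tf.contrib.cluster_resolver.
    return tf.distribute.cluster_resolver.TPUClusterResolver(
        tpu=tpu_name, zone=tpu_zone, project=gcp_project)

# Example (requires a real Cloud TPU and credentials):
# resolver = make_tpu_resolver(FLAGS.tpu_name, FLAGS.tpu_zone, FLAGS.gcp_project)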
35 changes: 18 additions & 17 deletions lm/utils.py
@@ -18,6 +18,7 @@

import six
import tensorflow as tf
import tensorflow.compat.v1 as tf1
import numpy as np
from tensorflow.python.lib.io import file_io

@@ -53,7 +54,7 @@ def assert_rank(tensor, expected_rank, name=None):

actual_rank = tensor.shape.ndims
if actual_rank not in expected_rank_dict:
scope_name = tf.get_variable_scope().name
scope_name = tf1.get_variable_scope().name
raise ValueError(
"For the tensor `%s` in scope `%s`, the actual rank "
"`%d` (shape = %s) is not equal to the expected rank `%s`" %
@@ -91,7 +92,7 @@ def get_shape_list(tensor, expected_rank=None, name=None):
if not non_static_indexes:
return shape

dyn_shape = tf.shape(tensor)
dyn_shape = tf1.shape(tensor)
for index in non_static_indexes:
shape[index] = dyn_shape[index]
return shape
@@ -109,20 +110,20 @@ def gelu(input_tensor):
Returns:
`input_tensor` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0)))
cdf = 0.5 * (1.0 + tf.math.erf(input_tensor / tf.math.sqrt(2.0)))
return input_tensor * cdf


def layer_norm(input_tensor, name=None, epsilon=1e-5):
"""Run layer normalization on the last dimension of the tensor."""
name2use = f'LayerNorm_{name}' if name is not None else name
with tf.variable_scope(name2use, default_name='LayerNorm'):
with tf1.variable_scope(name2use, default_name='LayerNorm'):
dim = input_tensor.shape[-1].value
gamma = tf.get_variable('gamma', [dim], initializer=tf.constant_initializer(1))
beta = tf.get_variable('beta', [dim], initializer=tf.constant_initializer(0))
mean = tf.reduce_mean(input_tensor, axis=-1, keepdims=True)
std = tf.reduce_mean(tf.square(input_tensor - mean), axis=-1, keepdims=True)
input_tensor = (input_tensor - mean) * tf.rsqrt(std + epsilon)
gamma = tf1.get_variable('gamma', [dim], initializer=tf1.constant_initializer(1))
beta = tf1.get_variable('beta', [dim], initializer=tf1.constant_initializer(0))
mean = tf.math.reduce_mean(input_tensor, axis=-1, keepdims=True)
std = tf.math.reduce_mean(tf.math.square(input_tensor - mean), axis=-1, keepdims=True)
input_tensor = (input_tensor - mean) * tf.math.rsqrt(std + epsilon)
input_tensor = input_tensor * gamma + beta
return input_tensor

@@ -149,8 +150,8 @@ def get_attention_mask(nd, ns, *, dtype):
this is a TPU compatible version of tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd)
where the lower right triangle contains 1s
"""
i = tf.range(nd)[:, None]
j = tf.range(ns)
i = tf.ragged.range(nd)[:, None]
j = tf.ragged.range(ns)
m = i >= j - ns + nd
return tf.cast(m, dtype)

@@ -214,21 +215,21 @@ def host_call_fn(global_step, *args):
List of summary ops to run on the CPU host.
"""
step = global_step[0]
with tf.contrib.summary.create_file_writer(
with tf.summary.create_file_writer(
logdir=model_dir, filename_suffix=".host_call").as_default():
with tf.contrib.summary.always_record_summaries():
with tf.summary.should_record_summaries():
for i, name in enumerate(metric_names):
tf.contrib.summary.scalar(prefix + name, args[i][0], step=step)
tf1.summary.scalar(prefix + name, args[i][0], step=step)

return tf.contrib.summary.all_summary_ops()
return tf1.summary.all_v2_summary_ops()

# To log the current learning rate, and gradient norm for Tensorboard, the
# summary op needs to be run on the host CPU via host_call. host_call
# expects [batch_size, ...] Tensors, thus reshape to introduce a batch
# dimension. These Tensors are implicitly concatenated to
# [params['batch_size']].
global_step_tensor = tf.reshape(
global_step_tensor = tf1.manip.reshape(
tf.compat.v1.train.get_or_create_global_step(), [1])
other_tensors = [tf.reshape(metric_dict[key], [1]) for key in metric_names]
other_tensors = [tf.manip.reshape(metric_dict[key], [1]) for key in metric_names]

return host_call_fn, [global_step_tensor] + other_tensors
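A small sketch (the logdir, tag, and value are placeholders) of writing a scalar with the TF 2.x summary API that host_call_fn is being moved toward.

import tensorflow as tf

writer = tf.summary.create_file_writer("logs", filename_suffix=".host_call")
with writer.as_default():
    # record_if replaces the old contrib always_record_summaries context.
    with tf.summary.record_if(True):
        tf.summary.scalar("training/loss", 0.25, step=0)
writer.flush()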