3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
functions/*/node_modules
**/.DS_STORE
manifest.json
**/*_sa.json
learning/tfx/metadata/*
learning/tfx/pipelines/*
6 changes: 3 additions & 3 deletions functions/recordPlaybackRate/package-lock.json

Some generated files are not rendered by default.

128 changes: 128 additions & 0 deletions learning/bert_model.py
@@ -0,0 +1,128 @@
# Lint as: python2, python3
# Copyright 2020 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Configurable fine-tuning BERT models for various tasks."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from typing import Text, Optional, List, Union

import tensorflow as tf
import tensorflow.keras as keras


def build_bert_classifier(bert_layer: tf.keras.layers.Layer,
max_len: int,
num_classes: int,
dropout: float = 0.1,
activation: Optional[Text] = None):
"""BERT Keras model for classification.

  Connects configurable fully connected layers on top of the BERT
  pooled_output.

Args:
    bert_layer: A tensorflow_hub.KerasLayer instance of a BERT layer.
max_len: The maximum length of preprocessed tokens.
num_classes: Number of unique classes in the labels. Determines the output
shape of the classification layer.
dropout: Dropout rate to be used for the classification layer.
activation: Activation function to use. If you don't specify anything, no
      activation is applied (i.e. "linear" activation: a(x) = x).

Returns:
A Keras model.
"""
input_layer_names = ["input_word_ids", "input_mask", "segment_ids"]

input_layers = [
keras.layers.Input(shape=(max_len,), dtype=tf.int64, name=name)
for name in input_layer_names
]

converted_layers = [tf.cast(k, tf.int32) for k in input_layers]

pooled_output, _ = bert_layer(converted_layers)
output = keras.layers.Dropout(dropout)(pooled_output)
output = keras.layers.Dense(num_classes, activation=activation)(output)
model = keras.Model(input_layers, output)
return model
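
# A minimal shape-check sketch for build_bert_classifier (not part of the
# module). The TF Hub handle and the sizes below are assumptions for
# illustration, not this pipeline's actual configuration:
#
#   import numpy as np
#   import tensorflow_hub as hub
#   bert_layer = hub.KerasLayer(
#       "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
#       trainable=True)
#   model = build_bert_classifier(bert_layer, max_len=128, num_classes=2)
#   ids = np.zeros((1, 128), dtype=np.int64)
#   logits = model.predict([ids, ids, ids])  # -> logits of shape (1, 2)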


def compile_bert_classifier(
model: tf.keras.Model,
    loss: tf.keras.losses.Loss = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True),
learning_rate: float = 2e-5,
    metrics: Optional[List[Union[Text, tf.keras.metrics.Metric]]] = None):
"""Compile the BERT classifier using suggested parameters.

Args:
    model: A Keras model, most likely the output of build_bert_classifier.
    loss: A tf.keras.losses.Loss. The suggested loss function expects integer
      labels (e.g. 0, 1, 2). If the labels are one-hot encoded, consider using
      tf.keras.losses.CategoricalCrossentropy with from_logits set to True.
    learning_rate: Suggested learning rate to be used in
      tf.keras.optimizers.Adam. The three suggested learning rates for
      fine-tuning are [2e-5, 3e-5, 5e-5].
    metrics: A list of strings or tf.keras.metrics.Metric objects. Defaults to
      ['sparse_categorical_accuracy'] when None.

Returns:
None.
"""
if metrics is None:
metrics = ["sparse_categorical_accuracy"]

model.compile(
optimizer=tf.keras.optimizers.Adam(learning_rate),
loss=loss,
metrics=metrics)
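
# A minimal sketch of compiling with a non-default learning rate (assumes
# `model` came from build_bert_classifier above):
#
#   compile_bert_classifier(model, learning_rate=3e-5)
#   # Equivalent to model.compile with Adam(3e-5), sparse categorical
#   # crossentropy from logits, and sparse_categorical_accuracy.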


def build_and_compile_bert_classifier(
bert_layer: tf.keras.layers.Layer,
max_len: int,
num_classes: int,
learning_rate: float = 5e-5,
    metrics: Optional[List[Union[Text, tf.keras.metrics.Metric]]] = None):
"""Build and compile keras BERT classification model.

Apart from the necessary inputs, use default/suggested parameters in build
and compile BERT classifier functions.

Args:
    bert_layer: A tensorflow_hub.KerasLayer instance of a BERT layer.
max_len: The maximum length of preprocessed tokens.
num_classes: Number of unique classes in the labels. Determines the output
shape of the classification layer.
    learning_rate: Suggested learning rate to be used in
      tf.keras.optimizers.Adam. The three suggested learning rates for
      fine-tuning are [2e-5, 3e-5, 5e-5].
    metrics: A list of strings or tf.keras.metrics.Metric objects. Defaults to
      ['sparse_categorical_accuracy'] when None.

Returns:
    A compiled Keras BERT classification model.
"""
if metrics is None:
metrics = ["sparse_categorical_accuracy"]

model = build_bert_classifier(bert_layer, max_len, num_classes)

compile_bert_classifier(model, learning_rate=learning_rate, metrics=metrics)
return model
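
# A minimal end-to-end usage sketch. The hub handle and the
# train_dataset/eval_dataset names are assumptions for illustration:
#
#   import tensorflow_hub as hub
#   bert_layer = hub.KerasLayer(
#       "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
#       trainable=True)
#   model = build_and_compile_bert_classifier(
#       bert_layer, max_len=128, num_classes=2, learning_rate=2e-5)
#   model.fit(train_dataset, validation_data=eval_dataset, epochs=3)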

182 changes: 182 additions & 0 deletions learning/bert_tokenizer.py
@@ -0,0 +1,182 @@
# Lint as: python2, python3
# Copyright 2020 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepressing using tensorflow_text BertTokenizer."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from typing import Text

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

from tensorflow.python.eager.context import eager_mode # pylint: disable=g-direct-tensorflow-import


_CLS = '[CLS]'
_PAD = '[PAD]'
_SEP = '[SEP]'


class BertPreprocessor(object):
"""Bert Tokenizer built ontop of tensorflow_text.BertTokenizer."""

def __init__(self, model_link: Text):
self._model_link = model_link
self._model = hub.KerasLayer(model_link)
self._find_special_tokens()

def _find_special_tokens(self):
"""Find the special token ID's for [CLS] [PAD] [SEP].

Since each Bert model is trained on different vocabulary, it's important
to find the special token indices pertaining to that model.
Since in Transform, tensorflow_hub.KerasLayer loads a symbolic tensor, turn
on eager mode to get the actual vocab_file location.
"""

with eager_mode():
model = hub.KerasLayer(self._model_link)
vocab = model.resolved_object.vocab_file.asset_path.numpy()
self._do_lower_case = model.resolved_object.do_lower_case.numpy()
with tf.io.gfile.GFile(vocab, 'r') as f:
lines = f.read().split('\n')
self._sep_id = lines.index(_SEP)
self._cls_id = lines.index(_CLS)
self._pad_id = lines.index(_PAD)
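
  # For reference: with the standard English uncased BERT vocab, the lookups
  # above yield _pad_id=0, _cls_id=101, _sep_id=102 (an assumption; the IDs
  # vary per model vocabulary, which is why they are looked up here).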

def tokenize_single_sentence_unpad(self,
sequence: tf.Tensor,
max_len: int = 128,
add_cls: bool = True,
add_sep: bool = True):
"""Tokenize a sentence with the BERT model vocab file and without padding.

Add special tokens according to config.

Args:
sequence: Tensor of shape [batch_size, 1].
      max_len: The maximum number of tokens to keep after truncation.
add_cls: Whether to add CLS token at the front of each sequence.
add_sep: Whether to add SEP token at the end of each sequence.

Returns:
word_ids: Ragged tokenized sequences [batch_size, None].
"""
vocab_file_path = self._model.resolved_object.vocab_file.asset_path
tokenizer = text.BertTokenizer(
vocab_file_path,
lower_case=self._do_lower_case,
token_out_type=tf.int64)
word_ids = tokenizer.tokenize(sequence)
    # BertTokenizer nests wordpieces one level deeper than words; merge_dims
    # flattens the last two dimensions into a single token dimension.
word_ids = word_ids.merge_dims(-2, -1)
if add_cls:
cls_token = tf.fill([tf.shape(sequence)[0], 1],
tf.constant(self._cls_id, dtype=tf.int64))

word_ids = tf.concat([cls_token, word_ids], 1)

if add_sep:
sep_token = tf.fill([tf.shape(sequence)[0], 1],
tf.constant(self._sep_id, dtype=tf.int64))

word_ids = word_ids[:, :max_len - 1]
word_ids = tf.concat([word_ids, sep_token], 1)

return word_ids
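
  # Example (a sketch): for input [["hello world"]] with add_cls and add_sep,
  # the result is a ragged [1, None] tensor holding
  # [CLS-id, wordpiece ids of "hello world", SEP-id]; padding to max_len is
  # applied later by tokenize_single_sentence_pad.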

def tokenize_single_sentence_pad(self,
sequence: tf.Tensor,
max_len: int = 128,
add_cls: bool = True,
add_sep: bool = True):
"""Tokenize a single sentence according to the vocab used by the Bert model.

Add special tokens according to config.

Args:
sequence: Tensor of shape [batch_size, 1].
max_len: The number of tokens after padding and truncating.
add_cls: Whether to add CLS token at the front of each sequence.
add_sep: Whether to add SEP token at the end of each sequence.

Returns:
word_ids: Tokenized sequences [batch_size, max_len].
input_mask: Mask padded tokens [batch_size, max_len].
segment_ids: Distinguish multiple sequences [batch_size, max_len].
"""
word_ids = self.tokenize_single_sentence_unpad(sequence, max_len, add_cls,
add_sep)

word_ids = word_ids.to_tensor(
shape=[None, max_len],
default_value=tf.constant(self._pad_id, dtype=tf.int64))

input_mask = tf.cast(tf.not_equal(word_ids, self._pad_id), tf.int64)
segment_ids = tf.fill(tf.shape(input_mask), tf.constant(0, dtype=tf.int64))

return word_ids, input_mask, segment_ids

def tokenize_sentence_pair(self, sequence_a: tf.Tensor, sequence_b: tf.Tensor,
max_len: int):
"""Tokenize a sequence pair.

    Tokenize each sequence with self.tokenize_single_sentence_unpad: add a CLS
    token in front of the first sequence, and add SEP tokens between the two
    sequences and at the end of the second sequence.

Args:
      sequence_a: Tensor of shape [batch_size, 1].
      sequence_b: Tensor of shape [batch_size, 1].
max_len: The length of the concatenated tokenized sentences.

Returns:
word_ids: Tokenized sequences [batch_size, max_len].
input_mask: Mask padded tokens [batch_size, max_len].
segment_ids: Distinguish multiple sequences [batch_size, max_len].
"""
    # TODO(dzats): The issue here is nuanced. Depending on the dataset, one
    # might want to keep the entire first sentence, or the second. Consider
    # alternate truncation strategies.
sentence_len = max_len // 2
word_id_a = self.tokenize_single_sentence_unpad(
sequence_a,
sentence_len,
True,
True,
)

word_id_b = self.tokenize_single_sentence_unpad(
sequence_b,
sentence_len,
False,
True,
)

word_ids = tf.concat([word_id_a, word_id_b], 1)
word_ids = word_ids.to_tensor(
shape=[None, max_len],
default_value=tf.constant(self._pad_id, dtype=tf.int64))

input_mask = tf.cast(tf.not_equal(word_ids, self._pad_id), tf.int64)
    # word_id_a < 0 is always False, so this builds a ragged tensor of zeros
    # with word_id_a's shape; to_tensor below fills the remaining positions
    # (the second segment and any padding) with ones.
segment_ids = tf.cast(word_id_a < 0, tf.int64)
segment_ids = segment_ids.to_tensor(
shape=[None, max_len], default_value=tf.constant(1, dtype=tf.int64))
return word_ids, input_mask, segment_ids
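
# A minimal usage sketch. The hub handle is an assumption; any BERT
# SavedModel exposing resolved_object.vocab_file and do_lower_case works:
#
#   preprocessor = BertPreprocessor(
#       "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2")
#   sentences = tf.constant([["the quick brown fox jumped over the lazy dog"]])
#   word_ids, input_mask, segment_ids = (
#       preprocessor.tokenize_single_sentence_pad(sentences, max_len=128))
#   # Each output has shape [1, 128], matching the input signature that
#   # build_bert_classifier in learning/bert_model.py expects.
#
#   pair_ids, pair_mask, pair_segments = preprocessor.tokenize_sentence_pair(
#       tf.constant([["first sentence"]]), tf.constant([["second sentence"]]),
#       max_len=128)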
