Python Code #1

Requirements

## tensorflow==1.15.2
## tensor2tensor==1.14
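
The pinned versions above can be installed with pip before running the script (shell command, shown here as a comment):

# pip install tensorflow==1.15.2 tensor2tensor==1.14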

Import Packages

import re
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems
from tensor2tensor.utils import registry
from tensor2tensor import problems

import os
import numpy as np
import pandas as pd
import tensorflow as tf
import shutil

from azureml.core import Workspace, Dataset  # Azure ML SDK; used in generate_samples below

Enable TF Eager execution

tfe = tf.contrib.eager
tfe.enable_eager_execution()
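
A one-line sanity check confirms eager mode took effect; enable_eager_execution must run before any graph ops are built:

assert tf.executing_eagerly()  # raises if eager mode is not active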

Other setup

Modes = tf.estimator.ModeKeys

#%%

Required Folder Creation

HOME_PATH = "C:/Raju/Translation_model/Translation_t2t/t2t/"
data_dir = os.path.expanduser(HOME_PATH + "data") # This folder contains the dataset
tmp_dir = os.path.expanduser(HOME_PATH + "tmp") # This folder contains temporary data, if any
train_dir = os.path.expanduser(HOME_PATH + "train") # This folder contains the model checkpoints produced during training
export_dir = os.path.expanduser(HOME_PATH + "export") # This folder contains the exported model for production
translations_dir = os.path.expanduser(HOME_PATH + "translation") # This folder contains all translated sequences
event_dir = os.path.expanduser(HOME_PATH + "event") # This folder contains event files used to test the BLEU score
usr_dir = os.path.expanduser(HOME_PATH + "user") # This folder contains the custom data we want to add
checkpoint_dir = os.path.expanduser(HOME_PATH + "checkpoints")

#%%

Remove data and train folders left over from a previous run

shutil.rmtree(data_dir, ignore_errors=True)
shutil.rmtree(train_dir, ignore_errors=True)

#%%

Creating folders

tf.io.gfile.makedirs(data_dir)
tf.io.gfile.makedirs(tmp_dir)
tf.io.gfile.makedirs(export_dir)
tf.io.gfile.makedirs(translations_dir)
tf.io.gfile.makedirs(train_dir)
tf.io.gfile.makedirs(event_dir)
tf.io.gfile.makedirs(usr_dir)
tf.io.gfile.makedirs(checkpoint_dir)

#%%

@registry.register_problem
class translationsig(text_problems.Text2TextProblem):
  """Predict RX SIG using Standardized SIG."""

  @property
  def approx_vocab_size(self):
    return 2**14  # ~16k

  @property
  def is_generate_per_split(self):
    # generate_data will shard the data into TRAIN and EVAL for us.
    return True

  @property
  def dataset_splits(self):
    """Splits of data to produce and number of output shards for each."""
    # 2 of 10 shards -> 20% evaluation data
    return [{
        "split": problem.DatasetSplit.TRAIN,
        "shards": 8,
    }, {
        "split": problem.DatasetSplit.EVAL,
        "shards": 2,
    }]

  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    del data_dir
    del tmp_dir
    del dataset_split

    # subscription_id, resource_group and workspace_name are Azure ML
    # credentials and must be defined before this runs.
    workspace = Workspace(subscription_id, resource_group, workspace_name)

    dataset = Dataset.get_by_name(workspace, name='Sample_DL')
    sig_data = dataset.to_pandas_dataframe()

    print(sig_data.shape)

    ## Data pre-processing

    ## Select the two columns we need and drop duplicate rows
    sig_data = sig_data[['Standardized_SIG', 'IC+_Pharmacist SIG']]
    sig_data = sig_data.drop_duplicates()

    ## Clean column names (regex=False treats '+' literally instead of as a regex metacharacter)
    sig_data.columns = sig_data.columns.str.replace(' ', '_')
    sig_data.columns = sig_data.columns.str.replace('+', '', regex=False)

    ## Data cleaning: drop the final character of every Standardized_SIG
    ## (regex '.' matches any char; use r'\.$' to strip only a trailing period),
    ## strip trailing quotes from the target, then trim whitespace.
    sig_data.Standardized_SIG = sig_data.Standardized_SIG.map(lambda x: re.sub(r'.$', "", x))
    sig_data.IC_Pharmacist_SIG = sig_data.IC_Pharmacist_SIG.map(lambda x: re.sub(r'"$', "", x))
    sig_data.Standardized_SIG = sig_data.Standardized_SIG.str.strip()
    sig_data.IC_Pharmacist_SIG = sig_data.IC_Pharmacist_SIG.str.strip()

    sig_data.Standardized_SIG = sig_data.Standardized_SIG.str.upper()
    sig_data.IC_Pharmacist_SIG = sig_data.IC_Pharmacist_SIG.str.upper()

    ## Drop empty or punctuation-only targets and rows with missing values
    sig_data = sig_data[sig_data.IC_Pharmacist_SIG != '.']
    sig_data = sig_data[sig_data.IC_Pharmacist_SIG != '']
    sig_data = sig_data[sig_data.IC_Pharmacist_SIG != '...']
    sig_data = sig_data.dropna()

    ## Normalize common tokens in the target
    sig_data.IC_Pharmacist_SIG = sig_data.IC_Pharmacist_SIG.apply(lambda x: re.sub(r'\bTAKE\b', "Tk", x))
    sig_data.IC_Pharmacist_SIG = sig_data.IC_Pharmacist_SIG.apply(lambda x: re.sub(r'\bONE\b', "1", x))

    print("Final shape: {}".format(sig_data.shape))

    for sig in range(len(sig_data)):
      yield {
          "inputs": sig_data.Standardized_SIG.iloc[sig],
          "targets": sig_data.IC_Pharmacist_SIG.iloc[sig],
      }
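
Before running the full T2T pipeline, the generator can be smoke-tested directly (a sketch, left commented out; it reaches the Azure ML workspace, so subscription_id, resource_group and workspace_name must already be defined):

# Hypothetical smoke test: pull the first two samples from the generator.
# gen = translationsig().generate_samples(None, None, None)  # the three args are unused
# for _, sample in zip(range(2), gen):
#     print(sample["inputs"], "->", sample["targets"])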

#%%

Model name and parameter selection

PROBLEM = "translationsig" # Custom ESIG translation problem
MODEL = "transformer" # Our model
HPARAMS = "transformer_base" # Default hyperparameters for the model
# If you have only one GPU, use transformer_big_single_gpu instead
#%%

Setup helper functions for encoding and decoding

def encode(input_str, output_str=None):
  """Input str to features dict, ready for inference."""
  inputs = encoders["inputs"].encode(input_str) + [1]  # add EOS id
  batch_inputs = tf.reshape(inputs, [1, -1, 1])  # Make it 3D.
  return {"inputs": batch_inputs}

def decode(integers):
  """List of ints to str."""
  integers = list(np.squeeze(integers))
  if 1 in integers:
    integers = integers[:integers.index(1)]  # truncate at EOS
  return encoders["inputs"].decode(np.squeeze(integers))
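
Once encoders is built from the generated data (two cells below), these helpers can be smoke-tested with a round trip (a sketch, commented out because encoders does not exist yet at this point):

# ids = encode("TAKE 1 TABLET 2 TIMES DAILY")
# print(decode(ids["inputs"].numpy()))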

#%%

Data generation

print('Generating data')
t2t_problem = problems.problem(PROBLEM)  # look up our registered problem by name
t2t_problem.generate_data(data_dir, tmp_dir)

print("Data Generated.")

#%%

Get the encoders from the problem

encoders = t2t_problem.feature_encoders(data_dir)

example = tfe.Iterator(t2t_problem.dataset(Modes.TRAIN, data_dir)).next()
inputs = [int(x) for x in example["inputs"].numpy()] # Cast to ints.
targets = [int(x) for x in example["targets"].numpy()] # Cast to ints.

Example inputs as int-tensor.

print("Inputs, encoded:")
print(inputs)

Example inputs as a sentence.

print("Inputs, decoded:")
print(decode(inputs))

Example targets as int-tensor.

print("Targets, encoded:")
print(targets)

Example targets as a sentence.

print("Targets, decoded:")
print(decode(targets))

#%%
from tensor2tensor import models

#print(problems.available()) #Show all problems
print(registry.list_models()) #Show all registered models

#%%
from tensor2tensor.utils.trainer_lib import create_hparams

Init Hparams object from T2T Problem

hparams = create_hparams(HPARAMS)
# For inference, the problem can be attached here as well:
# hparams = create_hparams(HPARAMS, data_dir=data_dir, problem_name=PROBLEM)

print(hparams.to_json())
#%%

Make changes to Hparams

hparams.batch_size = 500
hparams.learning_rate_warmup_steps = 4500
hparams.learning_rate = .4
save_checkpoints_steps = 1000  # plain variable, passed to create_run_config below

print(hparams.to_json())
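
The same overrides can also be expressed as a parse string, which is convenient when settings come from a flag or config file (an equivalent sketch using the standard HParams.parse API):

hparams.parse("batch_size=500,learning_rate_warmup_steps=4500,learning_rate=0.4")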
#%%
from tensor2tensor.utils.trainer_lib import create_run_config, create_experiment

Init Run Config for Model Training

RUN_CONFIG = create_run_config(
    model_dir=train_dir,  # Location where the model files are stored
    model_name=MODEL,
    save_checkpoints_steps=save_checkpoints_steps
    # This function takes more params for controlling how often to save checkpoints, and more.
)

# Create Tensorflow Experiment Object

tensorflow_exp_fn = create_experiment(
    run_config=RUN_CONFIG,
    hparams=hparams,
    model_name=MODEL,
    problem_name=PROBLEM,
    data_dir=data_dir,
    train_steps=400,  # Total number of train steps across all epochs
    eval_steps=100  # Number of steps to perform for each evaluation
)

tensorflow_exp_fn.train_and_evaluate()
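
While train_and_evaluate runs, progress can be monitored by pointing TensorBoard at the train folder (shell command, shown here as a comment):

# tensorboard --logdir C:/Raju/Translation_model/Translation_t2t/t2t/train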

#%%

translate_model = registry.model(MODEL)(hparams, Modes.PREDICT)  # hparams should carry the problem; see the create_hparams note above
#%%
Path to the vocab file, used to encode inputs and decode model outputs

vocab_name = "vocab.sig_translator.16384.subwords"  # check data_dir for the exact file name generated for the problem
vocab_file = os.path.join(data_dir, vocab_name)

Locate the trained checkpoint

ckpt_name = "transformer_esig"

# gs_ckpt = os.path.join(gs_ckpt_dir, ckpt_name)  # leftover from a GCS setup; gs_ckpt_dir is not defined in this script

ckpt_path = tf.train.latest_checkpoint(os.path.join(checkpoint_dir, ckpt_name))
print(ckpt_path)

ckpt_path = tf.train.latest_checkpoint(train_dir)  # fall back to the freshly trained model
print(ckpt_path)
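
If the restore step below misbehaves, the checkpoint contents can be inspected directly (a quick diagnostic sketch):

print(tf.train.list_variables(ckpt_path)[:5])  # first few (name, shape) pairs stored in the checkpoint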

#%%
def translate(inputs):
  encoded_inputs = encode(inputs)
  with tfe.restore_variables_on_create(ckpt_path):
    model_output = translate_model.infer(encoded_inputs)["outputs"]
  return decode(model_output)
#%%
inputs = "TAKE 1 TABLET 2 TIMES DAILY"

outputs = translate(inputs)

print("Inputs: %s" % inputs)
print("Outputs: %s" % outputs)
