Replace tf.constant for TF #19

jplu · 2020-04-24T15:32:06Z

Replace the plain tf.constant tensors with tf.ragged.constant, which allows examples of different sizes in a tf.data.Dataset.

Training now works with TF. Here is the same example as the PyTorch (PT) one in the Colab notebook:

import tensorflow as tf
import nlp
from transformers import BertTokenizerFast, TFBertForQuestionAnswering

# Load our training dataset (the first 1% of the SQuAD train split) and the
# fast BERT tokenizer for the 'bert-base-cased' checkpoint.
train_dataset = nlp.load('squad', split="train[:1%]")
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

def get_correct_alignement(context, answer):
    """Locate the character span of the first gold answer inside ``context``.

    The gold label position may be off by one or two characters, so the
    annotated start is tried first, then one and two characters earlier.

    Args:
        context: The passage text.
        answer: Mapping with the lists ``'answer_start'`` and ``'text'``;
            only the first entry of each list is used.

    Returns:
        A ``(start_idx, end_idx)`` tuple such that
        ``context[start_idx:end_idx]`` equals the answer text.

    Raises:
        ValueError: If the answer text is not found at the annotated offset
            nor at one or two characters before it.
    """
    gold_start = answer['answer_start'][0]
    text = answer['text'][0]
    for shift in (0, 1, 2):
        start_idx = gold_start - shift
        if start_idx < 0:
            # A negative start would silently wrap around to the end of the
            # string when slicing; it can never be a valid character offset.
            break
        end_idx = start_idx + len(text)
        if context[start_idx:end_idx] == text:
            return start_idx, end_idx
    raise ValueError(
        f"Answer {text!r} not found in context at offset {gold_start} "
        "(also tried one and two characters earlier)"
    )

# Tokenize our training dataset
def convert_to_features(example_batch):
    """Encode a batch of examples and attach answer token positions.

    Each (context, question) pair is tokenized with the module-level
    ``tokenizer``. For every example, the character span of the answer is
    resolved with ``get_correct_alignement`` and converted to token indices
    through ``encodings.char_to_token``. Each position is stored wrapped in
    a one-element list.

    Args:
        example_batch: Batch mapping with ``'context'``, ``'question'`` and
            ``'answers'`` entries.

    Returns:
        The encodings produced by the tokenizer, extended with
        ``'start_positions'`` and ``'end_positions'`` when both lists are
        non-empty.
    """
    contexts = example_batch['context']

    # Tokenize contexts and questions (as pairs of inputs)
    pairs = list(zip(contexts, example_batch['question']))
    encodings = tokenizer.batch_encode_plus(pairs, pad_to_max_length=True)

    # Map each answer's character span onto token indices using the fast
    # tokenizer's alignment helpers.
    token_starts = []
    token_ends = []
    for row, (context, answer) in enumerate(zip(contexts, example_batch['answers'])):
        char_start, char_end = get_correct_alignement(context, answer)
        token_starts.append([encodings.char_to_token(row, char_start)])
        # char_end is exclusive, so the last answer character is char_end - 1.
        token_ends.append([encodings.char_to_token(row, char_end - 1)])

    if token_starts and token_ends:
        encodings.update({
            'start_positions': token_starts,
            'end_positions': token_ends,
        })
    return encodings

# Tokenize the whole dataset batch by batch and add the answer positions.
train_dataset = train_dataset.map(convert_to_features, batched=True)

# Expose only these columns, formatted as TensorFlow tensors.
columns = ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']
train_dataset.set_format(type='tensorflow', columns=columns)
# The first three columns are the model inputs; the last two are the labels.
features = {x: train_dataset[x] for x in columns[:3]} 
# Labels are keyed 'output_1' / 'output_2' (presumably the Keras output names
# of the model's start and end logits -- confirm against the model).
labels = {"output_1": train_dataset["start_positions"]}
labels["output_2"] = train_dataset["end_positions"]
# Build a tf.data.Dataset of (features, labels) pairs in batches of 8.
tfdataset = tf.data.Dataset.from_tensor_slices((features, labels)).batch(8)
model = TFBertForQuestionAnswering.from_pretrained("bert-base-cased")
# The loss takes raw logits and is left unreduced (Reduction.NONE).
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE, from_logits=True)
opt = tf.keras.optimizers.Adam(learning_rate=3e-5)
# Same loss and equal weight for both outputs.
model.compile(optimizer=opt,
              loss={'output_1': loss_fn, 'output_2': loss_fn},
              loss_weights={'output_1': 1., 'output_2': 1.},
              metrics=['accuracy'])
# Short smoke-test run: a single epoch limited to 3 steps.
model.fit(tfdataset, epochs=1, steps_per_epoch=3)

… allows to have examples of different size in a tf.data.Dataset

thomwolf · 2020-04-25T21:18:40Z

Awesome!

Replace simple tf.constant type of Tensor to tf.ragged.constant which…

1558038

… allows to have examples of different size in a tf.data.Dataset

thomwolf merged commit 4b95da7 into master Apr 25, 2020

jplu deleted the new-tf-type branch April 29, 2020 09:27

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Replace tf.constant for TF #19

Replace tf.constant for TF #19

jplu commented Apr 24, 2020

thomwolf commented Apr 25, 2020

Replace tf.constant for TF #19

Replace tf.constant for TF #19

Conversation

jplu commented Apr 24, 2020

thomwolf commented Apr 25, 2020