[TensorFlow Cert] Q4 — Text Data (Tokenizer, LSTM)

2021, Oct 27

Sarcasm text data: binary classification of news headlines (sarcastic or not).

  • Layers used: Embedding, LSTM, Bidirectional, Dense
import json
import urllib
import urllib.request

import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

def solution_model():
    """Download the sarcasm headlines dataset, train a bidirectional-LSTM
    binary classifier, and return the model with its best weights restored.

    The dataset is fetched to ./sarcasm.json; checkpoints are written to
    ./my_checkpoint_ckpt. After training, the weights from the epoch with
    the lowest validation loss are loaded back into the model.

    Returns:
        tf.keras.Model: trained classifier with a single sigmoid output
        (probability that a headline is sarcastic).
    """
    url = 'https://storage.googleapis.com/download.tensorflow.org/data/sarcasm.json'
    urllib.request.urlretrieve(url, 'sarcasm.json')

    # Preprocessing / model hyperparameters.
    vocab_size = 1000        # keep only the 1000 most frequent words
    embedding_dim = 16
    max_length = 120         # pad/truncate every headline to 120 tokens
    trunc_type = 'post'
    padding_type = 'post'
    oov_tok = "<OOV>"        # placeholder for out-of-vocabulary words
    training_size = 20000    # first 20k records train, the rest validate

    with open('sarcasm.json') as f:
        datas = json.load(f)

    # Each record carries a 'headline' string and an 'is_sarcastic' 0/1 flag.
    sentences = [data['headline'] for data in datas]
    labels = [data['is_sarcastic'] for data in datas]

    # Train / validation split; labels go straight to numpy arrays,
    # which is what model.fit expects.
    train_sentences = sentences[:training_size]
    train_labels = np.array(labels[:training_size])
    validation_sentences = sentences[training_size:]
    validation_labels = np.array(labels[training_size:])

    # Fit the tokenizer on the training sentences only, so words unseen
    # during training map to the OOV token at validation time.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    tokenizer.fit_on_texts(train_sentences)

    # Text -> integer sequences -> fixed-length padded arrays.
    train_sequences = tokenizer.texts_to_sequences(train_sentences)
    validation_sequences = tokenizer.texts_to_sequences(validation_sentences)
    train_padded = pad_sequences(train_sequences, maxlen=max_length,
                                 truncating=trunc_type, padding=padding_type)
    validation_padded = pad_sequences(validation_sequences, maxlen=max_length,
                                      truncating=trunc_type, padding=padding_type)

    # Stacked bidirectional LSTMs: the intermediate layers must return
    # full sequences so the next LSTM receives sequence input.
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=max_length),
        Bidirectional(LSTM(64, return_sequences=True)),
        Bidirectional(LSTM(64, return_sequences=True)),
        Bidirectional(LSTM(64)),
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid'),  # P(headline is sarcastic)
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

    # Save only the best weights (lowest validation loss) while training.
    checkpoint_path = 'my_checkpoint_ckpt'
    checkpoint = ModelCheckpoint(filepath=checkpoint_path,
                                 save_weights_only=True,
                                 save_best_only=True,
                                 monitor='val_loss',
                                 verbose=1)

    model.fit(train_padded, train_labels,
              validation_data=(validation_padded, validation_labels),
              callbacks=[checkpoint],
              epochs=10)

    # Restore the best checkpoint rather than the final epoch's weights.
    model.load_weights(checkpoint_path)

    return model
    

if __name__ == '__main__':
    # Train the sarcasm classifier and persist the full model (HDF5).
    trained = solution_model()
    trained.save("TF4-sarcasm.h5")