#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
    Open in Colab
Copyright 2019 The TensorFlow Authors.
Licensed under the Apache License, Version 2.0 (the "License");
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/bbc-text.csv \
    -O /tmp/bbc-text.csv
     --2020-07-12 13:59:26-- https://storage.googleapis.com/laurencemoroney-blog.appspot.
     Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.76.128, 64.233.1
     Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.76.128|:443...
     HTTP request sent, awaiting response... 200 OK
     Length: 5057493 (4.8M) [application/octet-stream]
     Saving to: ‘/tmp/bbc-text.csv’
     /tmp/bbc-text.csv   100%[===================>]   4.82M   --.-KB/s   in 0.03s
     2020-07-12 13:59:27 (179 MB/s) - ‘/tmp/bbc-text.csv’ saved [5057493/5057493]
# Hyperparameters shared by the tokenizer, the padding step and the model.
vocab_size = 1000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
training_portion = 0.8
# Accumulators filled while reading the CSV file (two independent lists).
sentences, labels = [], []
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
print(len(stopwords))
# Expected Output
# 153
       153
# Read the CSV file: column 0 of each row goes to `labels`, column 1 (with
# stop words stripped) goes to `sentences`.
with open("/tmp/bbc-text.csv", 'r') as bbc_file:
    rows = csv.reader(bbc_file, delimiter=',')
    next(rows)  # discard the first row (presumably the header -- confirm)
    for record in rows:
        labels.append(record[0])
        text = record[1]
        # Only space-delimited occurrences are replaced, each by a single space.
        for stop in stopwords:
            text = text.replace(" " + stop + " ", " ")
        sentences.append(text)
print(len(labels))
print(len(sentences))
print(sentences[0])
# Expected Output
# 2225
# 2225
# tv future hands viewers home theatre systems   plasma high-definition tvs   digital video r
       2225
       2225
       tv future hands viewers home theatre systems   plasma high-definition tvs   digital vid
# Split by position: the first `training_portion` of the rows is the training
# set, everything after that index is the validation set.
train_size = int(training_portion * len(sentences))
train_sentences, validation_sentences = sentences[:train_size], sentences[train_size:]
train_labels, validation_labels = labels[:train_size], labels[train_size:]
print(train_size)
print(len(train_sentences))
print(len(train_labels))
print(len(validation_sentences))
print(len(validation_labels))
#   Expected output (if training_portion=.8)
#   1780
#   1780
#   1780
#   445
#   445
     1780
     1780
     1780
     445
     445
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
# Build the vocabulary from the training sentences only.
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
# Convert each training sentence to a list of word indices, then pad to max_length.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
# NOTE(review): `truncating` is not passed here, so pad_sequences uses its own
# default instead of the `trunc_type` setting defined above -- confirm this is intended.
train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)
print(len(train_sequences[0]))
print(len(train_padded[0]))
print(len(train_sequences[1]))
print(len(train_padded[1]))
print(len(train_sequences[10]))
print(len(train_padded[10]))
#   Expected Output
#   449
#   120
#   200
#   120
#   192
#   120
       449
       120
       200
       120
       192
       120
validation_sequences = tokenizer.texts_to_sequences(validation_sentences)
validation_padded = pad_sequences(validation_sequences, padding=padding_type, maxlen=max_le
print(len(validation_sequences))
print(validation_padded.shape)
# Expected output
# 445
# (445, 120)
       445
       (445, 120)
# Labels are encoded with their own tokenizer, fitted on every label in the file.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)
# Encode the training labels first, then the validation labels, as NumPy arrays.
training_label_seq, validation_label_seq = (
    np.array(label_tokenizer.texts_to_sequences(label_split))
    for label_split in (train_labels, validation_labels)
)
print(training_label_seq[0])
print(training_label_seq[1])
print(training_label_seq[2])
print(training_label_seq.shape)
print(validation_label_seq[0])
print(validation_label_seq[1])
print(validation_label_seq[2])
print(validation_label_seq.shape)
#   Expected output
#   [4]
#   [2]
#   [1]
#   (1780, 1)
#   [5]
#   [4]
#   [3]
#   (445, 1)
       [4]
       [2]
       [1]
       (1780, 1)
       [5]
       [4]
       [3]
       (445, 1)
# Embedding -> average over the sequence axis -> small dense classifier.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(6, activation='softmax'))
# Integer class labels are used directly, hence the sparse categorical loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()
#   Expected Output
#   Layer (type)                 Output Shape              Param #
#   =================================================================
#   embedding (Embedding)        (None, 120, 16)           16000
#   _________________________________________________________________
#   global_average_pooling1d (Gl (None, 16)                0
#   _________________________________________________________________
#   dense (Dense)                (None, 24)                408
#   _________________________________________________________________
#   dense_1 (Dense)              (None, 6)                 150
#   =================================================================
#   Total params: 16,558
#   Trainable params: 16,558
#   Non-trainable params: 0
     Model: "sequential"
     _________________________________________________________________
     Layer (type)                 Output Shape              Param #
     =================================================================
     embedding (Embedding)        (None, 120, 16)           16000
     _________________________________________________________________
     global_average_pooling1d (Gl (None, 16)                0
     _________________________________________________________________
     dense (Dense)                (None, 24)                408
     _________________________________________________________________
     dense_1 (Dense)              (None, 6)                 150
     =================================================================
     Total params: 16,558
     Trainable params: 16,558
     Non-trainable params: 0
     _________________________________________________________________
num_epochs = 30
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(v
     Epoch 1/30
     56/56 - 0s - loss: 1.7632 - accuracy:    0.2315 - val_loss: 1.7279 - val_accuracy: 0.229
     Epoch 2/30
     56/56 - 0s - loss: 1.6797 - accuracy:    0.2326 - val_loss: 1.6301 - val_accuracy: 0.269
     Epoch 3/30
     56/56 - 0s - loss: 1.5684 - accuracy:    0.4124 - val_loss: 1.5137 - val_accuracy: 0.429
     Epoch 4/30
     56/56 - 0s - loss: 1.4253 - accuracy:    0.4809 - val_loss: 1.3512 - val_accuracy: 0.525
     Epoch 5/30
     56/56 - 0s - loss: 1.2274 - accuracy:    0.5961 - val_loss: 1.1528 - val_accuracy: 0.671
     Epoch 6/30
     56/56 - 0s - loss: 1.0137 - accuracy:    0.7511 - val_loss: 0.9600 - val_accuracy: 0.813
     Epoch 7/30
     56/56 - 0s - loss: 0.8240 - accuracy:    0.8579 - val_loss: 0.8014 - val_accuracy: 0.860
     Epoch 8/30
     56/56 - 0s - loss: 0.6696 - accuracy:    0.9107 - val_loss: 0.6733 - val_accuracy: 0.885
     Epoch 9/30
     56/56 - 0s - loss: 0.5459 - accuracy:    0.9281 - val_loss: 0.5711 - val_accuracy: 0.901
     Epoch 10/30
     56/56 - 0s - loss: 0.4440 - accuracy:    0.9438 - val_loss: 0.4865 - val_accuracy: 0.921
     Epoch 11/30
     56/56 - 0s - loss: 0.3640 - accuracy:    0.9567 - val_loss: 0.4199 - val_accuracy: 0.921
     Epoch 12/30
     56/56 - 0s - loss: 0.3000 - accuracy:    0.9596 - val_loss: 0.3700 - val_accuracy: 0.921
     Epoch 13/30
     56/56 - 0s - loss: 0.2512 - accuracy:    0.9691 - val_loss: 0.3320 - val_accuracy: 0.923
     Epoch 14/30
     56/56 - 0s - loss: 0.2149 - accuracy:    0.9725 - val_loss: 0.3016 - val_accuracy: 0.928
     Epoch 15/30
     56/56 - 0s - loss: 0.1848 - accuracy:    0.9747 - val_loss: 0.2825 - val_accuracy: 0.928
     Epoch 16/30
     56/56 - 0s - loss: 0.1620 - accuracy:    0.9781 - val_loss: 0.2639 - val_accuracy: 0.932
     Epoch 17/30
     56/56 - 0s - loss: 0.1425 - accuracy:    0.9815 - val_loss: 0.2504 - val_accuracy: 0.934
     Epoch 18/30
     56/56 - 0s - loss: 0.1274 - accuracy:    0.9815 - val_loss: 0.2400 - val_accuracy: 0.932
     Epoch 19/30
     56/56 - 0s - loss: 0.1130 - accuracy:    0.9848 - val_loss: 0.2305 - val_accuracy: 0.932
     Epoch 20/30
     56/56 - 0s - loss: 0.1015 - accuracy:    0.9871 - val_loss: 0.2225 - val_accuracy: 0.932
     Epoch 21/30
     56/56 - 0s - loss: 0.0916 - accuracy:    0.9865 - val_loss: 0.2196 - val_accuracy: 0.934
     Epoch 22/30
     56/56 - 0s - loss: 0.0827 - accuracy:    0.9899 - val_loss: 0.2122 - val_accuracy: 0.934
     Epoch 23/30
     56/56 - 0s - loss: 0.0741 - accuracy:    0.9916 - val_loss: 0.2092 - val_accuracy: 0.932
     Epoch 24/30
     56/56 - 0s - loss: 0.0678 - accuracy:    0.9921 - val_loss: 0.2040 - val_accuracy: 0.934
     Epoch 25/30
     56/56 - 0s - loss: 0.0610 - accuracy:    0.9955 - val_loss: 0.2015 - val_accuracy: 0.937
     Epoch 26/30
     56/56 - 0s - loss: 0.0555 - accuracy:    0.9961 - val_loss: 0.1992 - val_accuracy: 0.939
     Epoch 27/30
     56/56 - 0s - loss: 0.0504 - accuracy:    0.9966 - val_loss: 0.1952 - val_accuracy: 0.941
     Epoch 28/30
     56/56 - 0s - loss: 0.0460 - accuracy:    0.9983 - val_loss: 0.1953 - val_accuracy: 0.939
     Epoch 29/30
     56/56 - 0s - loss: 0.0419 - accuracy:    0.9994 - val_loss: 0.1911 - val_accuracy: 0.948
     Epoch 30/30
     56/56 - 0s - loss: 0.0385 - accuracy:    0.9994 - val_loss: 0.1905 - val_accuracy: 0.939
import matplotlib.pyplot as plt
def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")
# Invert the tokenizer's word -> index mapping so indices can be mapped back to words.
reverse_word_index = {index: token for token, index in word_index.items()}


def decode_sentence(text):
    """Return the words for the index sequence ``text`` joined by spaces.

    Indices missing from ``reverse_word_index`` are rendered as '?'.
    """
    return ' '.join(reverse_word_index.get(index, '?') for index in text)
# Layer 0 of the model is the Embedding layer; the first entry of its weight
# list is the embedding matrix.
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)
# Expected output
# (1000, 16)
     (1000, 16)
import io
# Export the learned embeddings as two parallel TSV files: vecs.tsv holds one
# tab-separated vector per line and meta.tsv holds the matching word per line.
# The `with` block closes both files even if a lookup or write raises part-way.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    # Iteration starts at 1, so row 0 of `weights` is not exported
    # (presumably because the tokenizer never assigns index 0 to a word -- confirm).
    for word_num in range(1, vocab_size):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
# Offer the exported files for download, but only when the google.colab
# package can be imported; otherwise this step is silently skipped.
try:
  from google.colab import files
except ImportError:
  # google.colab is not importable here: nothing to download through.
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')