#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Chapter 4 Exercises


# In[2]:


import numpy as np


# In[3]:


import pandas as pd


# In[4]:


# Turn off MathJax rendering of DataFrame HTML output so that '$' characters
# in the email text are displayed literally instead of being typeset as math.
pd.set_option("display.html.use_mathjax", False)


# In[5]:


# Prep for Exercise 1


# In[6]:


# Load the tab-separated corpus: column 0 is an id/label, column 1 the email text.
corpus_df = pd.read_csv('DataSets/Ask0729-fixed.txt', sep='\t', engine='python', header=None)


# In[7]:


# Inspect the raw data
corpus_df.head(25)


# In[8]:


# BUG FIX: DataFrame.drop is not in-place by default, so the original bare call
# was a no-op whose result was discarded. Assign it back. Column 1 keeps its
# label after the drop, so the selection below is unaffected.
corpus_df = corpus_df.drop([0], axis=1)


# In[9]:


# Keep only the email text, lower-cased (corpus_df becomes a Series)
corpus_df = corpus_df[1].str.lower()


# In[10]:


corpus_df.head()


# In[11]:


from io import StringIO


# In[12]:


# Rebuild a DataFrame (index column + text column) from the lower-cased Series
frame = corpus_df.to_frame().reset_index()


# In[13]:


frame.head()


# In[15]:


# Materialize the email texts as a plain Python list
corpus = list(frame[1])


# In[16]:


# Exercise 1: Divide the sentences into prefixes and suffixes.


# In[17]:


# Exercise 1: build every (prefix, suffix) split of each email. For a cut after
# character index `cut`, the prefix is email[:cut+1] and the suffix is the rest
# of the email wrapped in the start token '\t' and the end token '\n'.
# Order matches a nested loop: all cuts of email 0, then all cuts of email 1, ...
prefix_sentences = [
    email[:cut + 1]
    for email in corpus
    for cut in range(len(email))
]
suffix_sentences = [
    '\t' + email[cut + 1:] + '\n'
    for email in corpus
    for cut in range(len(email))
]


# In[20]:


# Exercise 2: Create the vocabulary and the mappings


# In[21]:


# Vocabulary of every character seen in the corpus, seeded with the start
# ('\t') and end ('\n') tokens used to delimit suffixes.
vocabulary = set(['\t', '\n'])

# set.update already ignores characters that are present, so the explicit
# `if char not in vocabulary` membership test of the original loop was
# redundant; updating with the whole string adds each character once.
for email in corpus:
    vocabulary.update(email)


# In[23]:


# Fix a deterministic character order (set iteration order is arbitrary)
vocabulary = sorted(vocabulary)


# In[24]:


# Forward lookup: character -> integer index
char_to_idx = {char: idx for idx, char in enumerate(vocabulary)}


# In[25]:


# Reverse lookup: integer index -> character
idx_to_char = dict(enumerate(vocabulary))


# In[26]:


# The longest prefix / suffix fixes the timestep dimension of the tensors below
max_len_prefix_sent = max(len(prefix) for prefix in prefix_sentences)
max_len_suffix_sent = max(len(suffix) for suffix in suffix_sentences)

# One-hot tensors of shape (num samples, max timesteps, vocabulary size):
# encoder input, decoder input, and decoder target (same shape as decoder input).
vocab_size = len(vocabulary)
input_data_prefix = np.zeros(
    (len(prefix_sentences), max_len_prefix_sent, vocab_size), dtype='float32')
input_data_suffix = np.zeros(
    (len(suffix_sentences), max_len_suffix_sent, vocab_size), dtype='float32')
target_data = np.zeros(
    (len(suffix_sentences), max_len_suffix_sent, vocab_size), dtype='float32')


# In[31]:


# Exercise 3: Initialize the input and target vectors


# In[32]:


# Exercise 3: one-hot encode the encoder input, decoder input, and decoder
# target tensors.
for i in range(len(prefix_sentences)):
    # Encoder input: one-hot each character of the prefix
    for k, ch in enumerate(prefix_sentences[i]):
        input_data_prefix[i, k, char_to_idx[ch]] = 1

    # Decoder input: one-hot each character of the suffix
    for k, ch in enumerate(suffix_sentences[i]):
        input_data_suffix[i, k, char_to_idx[ch]] = 1

        # BUG FIX: this must run for EVERY character of the suffix, not once per
        # sentence. The original `if` sat outside the loop, so only the final
        # character of each suffix was ever written into target_data. The target
        # is the decoder input shifted one timestep left, dropping the start '\t'.
        if k > 0:
            target_data[i, k - 1, char_to_idx[ch]] = 1


# In[33]:


# Prep for Exercise 4


# In[34]:


import tensorflow as tf
from tensorflow import keras


# In[35]:


from keras import layers


# In[36]:


from keras.layers import Input
from keras.layers import LSTM
from keras.layers import Dense


# In[37]:


# Exercise 4: Building the encoder


# In[38]:


# Encoder input: one one-hot vector of size len(vocabulary) per timestep; the
# sequence length is left as None so prefixes of any length are accepted
encoder_input = Input(shape=(None, len(vocabulary)))


# In[39]:


# Encoder LSTM with 256 units; return_state=True makes the layer return its
# final hidden and cell states so they can seed the decoder
encoder_LSTM = LSTM(256, return_state = True)


# In[40]:


print(tf.__version__)


# In[41]:


# Run the encoder; with return_state=True the call returns
# (outputs, final hidden state, final cell state)
encoder_outputs, encoder_h, encoder_c = encoder_LSTM(encoder_input)


# In[42]:


# Only the final states are needed to condition the decoder;
# encoder_outputs is deliberately discarded
encoder_states = [ encoder_h, encoder_c ]


# In[43]:


# Exercise 5: Build the decoder


# In[44]:


# Decoder input: the teacher-forced suffix, one one-hot vector per timestep,
# variable sequence length
decoder_input = Input(shape=(None, len(vocabulary)))


# In[45]:


# Decoder LSTM with 256 units; return_sequences=True yields an output at every
# timestep (needed for per-character predictions), return_state=True exposes
# the states for later inference use
decoder_LSTM = LSTM(256, return_sequences = True, return_state = True)


# In[46]:


# Run the decoder, seeded with the encoder's final states; the returned
# states are not needed during training and are discarded
decoder_out, _, _ = decoder_LSTM(decoder_input, initial_state = encoder_states)


# In[47]:


# Project each decoder timestep onto the vocabulary with a softmax, giving a
# next-character probability distribution
decoder_dense = Dense(len(vocabulary), activation = 'softmax')


# In[48]:


# Apply the dense projection to every decoder timestep
decoder_out = decoder_dense(decoder_out)


# In[49]:


# Prep for Exercise 6
from keras import Model


# In[50]:


# Exercise 6: Train the encoder and decoder


# In[51]:


# Seq2seq training model: consumes the prefix (encoder input) and the
# teacher-forced suffix (decoder input), outputs a next-character
# distribution at each decoder timestep
model = Model(inputs = [encoder_input, decoder_input], outputs = [decoder_out])


# In[52]:


# Categorical cross-entropy matches the one-hot targets and softmax output
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy')


# In[53]:


model.summary()


# In[54]:


# Fit the model for a single epoch, holding out 20% for validation.
# NOTE(review): batch_size=1 trains one sample at a time, which is very slow —
# presumably chosen to sidestep padding/masking across variable-length
# sequences; confirm before scaling up.
model.fit(x = [input_data_prefix, input_data_suffix], y = target_data, batch_size = 1, epochs = 1, validation_split = 0.2)


# In[55]:


import platform


# In[56]:


# Record the interpreter version for reproducibility of the environment
print(platform.python_version())


# In[57]:


# TensorFlow was installed following these instructions: https://developer.apple.com/metal/tensorflow-plugin/


# In[ ]:




