#!/usr/bin/env python
# coding: utf-8

import zipfile

import pandas as pd

# Load the competition data straight out of the downloaded archive.
# The context manager guarantees the zip handle is closed even if one
# of the reads raises (the original left the handle open on error).
with zipfile.ZipFile('./tabular-playground-series-may-2022.zip') as zf:
    with zf.open('train.csv') as f:
        train_df = pd.read_csv(f, index_col='id')
    with zf.open('test.csv') as f:
        test_df = pd.read_csv(f, index_col='id')

import numpy as np

# Separate the binary classification target from the features.
y_train_full = train_df.pop('target')

# f_00 .. f_06: continuous features that arrive already standardized.
normalized_columns = [f'f_{i:02}' for i in range(7)]

# f_07 .. f_18 plus f_29, f_30: integer-valued (discrete) features.
discrete_columns = [f'f_{i:02}' for i in range(7, 19)] + ['f_29', 'f_30']

# f_19 .. f_26 plus f_28: continuous features that still need scaling.
unnormalized_columns = [f'f_{i}' for i in range(19, 27)] + ['f_28']

def transform_f27(df, n_chars=10):
    """Expand the fixed-length string feature 'f_27' into per-character
    integer columns.

    Adds columns 'f_27_0' .. f'f_27_{n_chars-1}' to *df* in place, where
    column i holds the alphabet index of character i of each row's
    'f_27' string (ord(c) - ord('A'): 'A' -> 0, 'B' -> 1, ...).

    Mutates *df* and returns None.

    The original used np.apply_along_axis with nested lambdas, which
    invokes a Python-level function once per row; extracting one
    character position at a time via the .str accessor is simpler and
    avoids the per-row dispatch.
    """
    base = ord('A')
    for i in range(n_chars):
        # .str[i] pulls character i from every row as a Series.
        df[f'f_27_{i}'] = df['f_27'].str[i].map(ord) - base


# Expand the f_27 string feature for both splits (adds the columns
# f_27_0 .. f_27_9 to each frame in place).
transform_f27(train_df)


# Notebook-cell residue: in a plain script this computes the summary
# statistics and discards them (no print). Harmless; it also acts as a
# sanity check that the new columns exist.
train_df[list(f'f_27_{i}' for i in range(10))].describe()


transform_f27(test_df)


test_df[list(f'f_27_{i}' for i in range(10))].describe()


from sklearn.preprocessing import StandardScaler


from sklearn.model_selection import train_test_split


# Fit the scaler on the unscaled continuous columns of the full
# training frame.
# NOTE(review): this fits BEFORE the train/validation split below, so
# validation-row statistics leak into the scaler. Consider splitting
# first and fitting on X_train only — but note that changes the
# trained model's numbers.
std = StandardScaler().fit(train_df[unnormalized_columns])


# Standardize the training features in place.
train_df[unnormalized_columns] = std.transform(train_df[unnormalized_columns])


# Apply the same (train-derived) scaling to the test set.
test_df[unnormalized_columns] = std.transform(test_df[unnormalized_columns])


# Stratified hold-out split (default 75/25) with a fixed seed for
# reproducibility.
X_train, X_valid, y_train, y_valid = train_test_split(train_df, y_train_full, random_state=314, stratify=y_train_full)


# All continuous inputs: pre-normalized plus freshly standardized.
numeric_columns = normalized_columns + unnormalized_columns


# The ten integer columns derived from the f_27 string.
extra_columns = list(f'f_27_{i}' for i in range(10))

import tensorflow as tf
from tensorflow import keras as tfk

# tf.config.set_visible_devices([], &#39;GPU&#39;)

def make_model():
    """Build the three-input Keras binary classifier.

    Inputs (in this order):
      1. numeric features  — shape (len(numeric_columns),), continuous
      2. discrete features — shape (len(discrete_columns),), integer codes
      3. f_27 char codes   — shape (10,), integer codes

    Returns an uncompiled ``tfk.Model`` emitting a single sigmoid
    probability.
    """
    # Fixed seed so repeated calls produce identically-initialized
    # weights. Layer-creation order below matters for reproducibility.
    tf.random.set_seed(314)
    numeric_input = tfk.layers.Input(shape=(len(numeric_columns),))
    
    LEN_DIS = len(discrete_columns)
    discrete_input = tfk.layers.Input(shape=(LEN_DIS,))
    
    extra_input = tfk.layers.Input(shape=(10,))
    
    # Dense tower over the continuous features.
    x1 = tfk.Sequential([
        tfk.layers.Dense(128, activation='relu'),
        tfk.layers.BatchNormalization(),
        tfk.layers.Dropout(0.2),
        tfk.layers.Dense(32, activation='relu')
    ])(numeric_input)
    
    EMBEDDING_SIZE = 64
    
    # One embedding table per integer-input group. Vocabulary sizes 17
    # and 20 presumably cover the observed value ranges of the discrete
    # and f_27-derived features respectively — TODO confirm against data.
    d = tfk.layers.Embedding(17, EMBEDDING_SIZE, embeddings_initializer=tfk.initializers.GlorotNormal())(discrete_input)
    e = tfk.layers.Embedding(20, EMBEDDING_SIZE, embeddings_initializer=tfk.initializers.GlorotNormal())(extra_input)
    
    LEN_EX = 10  # number of f_27-derived columns
    
    # Stack both embedding outputs along the feature axis and treat the
    # result as a (features, embedding, 1) "image": the first Conv2D
    # mixes across all features, the second across the embedding dims.
    x2 = tfk.Sequential([
        tfk.layers.Reshape((LEN_DIS + LEN_EX, EMBEDDING_SIZE, 1)),
        tfk.layers.Conv2D(filters=128, kernel_size=(LEN_DIS + LEN_EX, 1), activation='relu'),
        tfk.layers.BatchNormalization(),
        tfk.layers.Dropout(0.2),
        tfk.layers.Conv2D(filters=32, kernel_size=(1, EMBEDDING_SIZE), activation='relu'),
        tfk.layers.Flatten(),
    ])(tf.concat([d, e], axis=-2))
    
    # MLP head over the concatenated dense and convolutional towers;
    # sigmoid output for the binary target.
    classifier = tfk.Sequential([
        tfk.layers.BatchNormalization(),
        tfk.layers.Dropout(0.2),
        tfk.layers.Dense(64, activation='relu'),
        tfk.layers.BatchNormalization(),
        tfk.layers.Dropout(0.2),
        tfk.layers.Dense(32, activation='relu'),
        tfk.layers.Dense(1, activation='sigmoid'),
    ])
    
    output = classifier(tfk.layers.concatenate([x1, x2]))
    
    model = tfk.Model(inputs=[numeric_input, discrete_input, extra_input], outputs=output)
    
    return model


# Build and inspect the network.
model = make_model()


model.summary()


# Binary cross-entropy with light label smoothing; track accuracy and
# AUC (the latter named 'auc' so callbacks/logs can reference it).
model.compile(
    optimizer=tfk.optimizers.Adam(learning_rate=1e-2),
    loss=tfk.losses.BinaryCrossentropy(label_smoothing=1e-3),
    metrics=[
        tfk.metrics.BinaryAccuracy(),
        tfk.metrics.AUC(name='auc'),
    ],
)


BATCH_SIZE=512

# Train with learning-rate reduction on validation-loss plateau and
# early stopping that restores the best-validation weights. Inputs are
# passed as three arrays matching make_model()'s input order.
history = model.fit(
    x=[X_train[numeric_columns].values, X_train[discrete_columns].values, X_train[extra_columns].values],
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=40,
    validation_data=([X_valid[numeric_columns].values, X_valid[discrete_columns].values, X_valid[extra_columns].values], y_valid),
    callbacks=[
        tfk.callbacks.ReduceLROnPlateau(patience=2, verbose=1),
        tfk.callbacks.EarlyStopping(patience=4, restore_best_weights=True, verbose=1),
    ],
    verbose=1,
)
