!wget -c "https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed" -O 5.tar.gz !mkdir language_model !tar -xzf 5.tar.gz -C language_model/ import tensorflow as tf from tensorflow import keras import hyperopt import hyperopt.pyll from hyperopt.pyll import scope import numpy as np from pathlib import Path import joblib import json import pandas as pd import functools def build_encoder( input_dim: int, output_dim: int, layer_multiplier: int, # FIXME: add parameter docs num_layers: int, activation: str, activity_l1: float, ) -> keras.Model: if not output_dim < input_dim: raise ValueError("output_dim must be less than input_dim") input_layer = keras.layers.Input(shape=(input_dim,)) hidden = input_layer layer_dim = input_dim // layer_multiplier for _ in range(num_layers): hidden = keras.layers.Dense( layer_dim, activation=activation, activity_regularizer=keras.regularizers.l1(activity_l1) if activity_l1 > 1e-7 else None, )(hidden) layer_dim //= layer_multiplier layer_dim = max(layer_dim, output_dim) last_layer = keras.layers.Dense(output_dim, activation=activation)(hidden) encoder = tf.keras.Model(inputs=input_layer, outputs=last_layer, name="encoder") return encoder def build_decoder( input_dim: int, output_dim: int, layer_multiplier: int, # FIXME: add parameter docs num_layers: int, activation: str, ) -> tf.keras.Model: """Build a decoder model. The returned model has not been compiled""" # FIXME: add parameter docs if not input_dim < output_dim: raise ValueError("input_dim must be less than output_dim") input_layer = keras.layers.Input(shape=(input_dim,)) hidden = input_layer layer_dim = input_dim * layer_multiplier for _ in range(num_layers): hidden = keras.layers.Dense(layer_dim, activation=activation)(hidden) layer_dim *= layer_multiplier layer_dim = min(output_dim, layer_dim) last_layer = keras.layers.Dense(output_dim, activation="tanh")(hidden) decoder = tf.keras.Model(inputs=input_layer, outputs=last_layer, name="decoder") return decoder def build_stacked_autoencoder( # pylint: disable=too-many-arguments, too-many-locals input_dim: int, encoded_dim: int, layer_multiplier: int, num_layers: int, activation: str, encoder_activity_l1: float, learning_rate: float, log2_batch_size: int, early_stopping_patience: int, lr_reduce_factor: float, lr_reduce_factor_patience: int, epochs: int=2000 ) -> keras.models.Sequential: """Returns a stacked autoencoder that can be used to encode `input_dim` dimensional data into `encoded_dim`-dimensional data. If compiled == True, the returned model is compiled with mse loss, mae metric and Nadam optimizer (lr=1e-3)""" encoder = build_encoder( input_dim, encoded_dim, layer_multiplier, num_layers, activation, encoder_activity_l1 ) decoder = build_decoder(encoded_dim, input_dim, layer_multiplier, num_layers, activation) stacked_autoencoder = keras.models.Sequential([encoder, decoder]) keras_callback = [ keras.callbacks.EarlyStopping( patience=early_stopping_patience, monitor="loss", mode="min", min_delta=1e-5, restore_best_weights=True, verbose=1, ) ] if lr_reduce_factor > 0: keras_callback.append( keras.callbacks.ReduceLROnPlateau( monitor="loss", factor=lr_reduce_factor, patience=lr_reduce_factor_patience, min_lr=1e-8, verbose=0, ) ) keras_params = dict( # FIXME: these should be deserialized from a configuration file epochs=epochs, callbacks=keras_callback, batch_size=int(2**log2_batch_size), verbose=0, ) stacked_autoencoder.compile( loss="mae", optimizer=keras.optimizers.Nadam(learning_rate=learning_rate), metrics=["mae"], ) return stacked_autoencoder, keras_params BATCH_TRIALS = 100 AUTOENCODER_HYPER_SPACE = { "layer_multiplier": hyperopt.pyll.scope.int( hyperopt.hp.quniform("layer_multiplier", 2, 10, 1) ), "num_layers": hyperopt.pyll.scope.int(hyperopt.hp.quniform("num_layers", 1, 5, 1)), "activation": hyperopt.hp.choice("activation", ["relu", "selu", "elu", "gelu"]), "encoder_activity_l1": hyperopt.hp.loguniform( "encoder_activity_l1", np.log(1.0e-8), np.log(1.0e-1) ), "learning_rate": hyperopt.hp.loguniform("learning_rate", np.log(1.0e-4), np.log(1.0e-1)), "log2_batch_size": hyperopt.hp.quniform("log2_batch_size", 3, 9, 1), "early_stopping_patience": hyperopt.pyll.scope.int( hyperopt.hp.quniform("early_stopping_patience", 5, 50, 5) ), "lr_reduce_factor_patience": hyperopt.pyll.scope.int( hyperopt.hp.quniform("lr_reduce_factor_patience", 5, 50, 5) ), "lr_reduce_factor": hyperopt.hp.uniform("lr_reduce_factor", -0.1, 0.9), } ALGO=[ (0.15, hyperopt.rand.suggest), (0.7, hyperopt.tpe.suggest), (0.15, hyperopt.anneal.suggest), ] embed = tf.saved_model.load("language_model/") def fit(X, y=None, **kwargs): # pylint: disable=invalid-name,unused-argument autoencoder, keras_params = build_stacked_autoencoder( input_dim=512, encoded_dim=5, **kwargs, ) embedding_vectors = embed(X).numpy() _history = autoencoder.fit( embedding_vectors, embedding_vectors, **keras_params, ) return _history def hyperparams_optimizer( # pylint: disable=too-many-locals data, num_rounds, ): """Find best autoencoder hyperparameters""" def hyperopt_objective(params): status = hyperopt.STATUS_FAIL mae = np.min(fit(data,data,**params).history["mae"]) # pylint: disable=protected-access if np.isfinite(mae): status = hyperopt.STATUS_OK return {"loss": mae, "status": status} trials_filename = "trials.pkl" try: trials = joblib.load(trials_filename) evals_loaded_trials = len(trials.statuses()) rstate = np.random.default_rng(evals_loaded_trials) except FileNotFoundError: trials = hyperopt.Trials() rstate = np.random.default_rng(0) runs_number = num_rounds // BATCH_TRIALS for i in range(runs_number): print(f"Batch n {(i+1):d}") best_hp = hyperopt.fmin( fn=hyperopt_objective, space=AUTOENCODER_HYPER_SPACE, max_evals=BATCH_TRIALS, algo=functools.partial(hyperopt.mix.suggest,p_suggest=ALGO), trials = trials, rstate = rstate ) best_params = hyperopt.space_eval(AUTOENCODER_HYPER_SPACE, best_hp) best_params["mae"] = trials.best_trial["result"]["loss"] print(best_params) best_params_filename = "best_params.json" with open(best_params_filename, "w", encoding="utf-8") as fh: json.dump(best_params, fh, indent=4) joblib.dump(trials, trials_filename, compress=("gzip", 3)) print("Optimization results.") print(best_params) def load_descriptions(): """Load text from CSV files""" t1 = "1.csv.gz" t2 = "2.csv.gz" df = pd.concat( [ pd.read_csv(t1), # pd.read_csv(t2), ], ) return df.Description.drop_duplicates().tolist() descriptions = load_descriptions() hyperparams_optimizer(descriptions, num_rounds=1000)