TensorFlow LSTM gives different results when changing batch size on an M1 Max Mac

When I run the same code on my M1 Mac with tensorflow-metal and in Google Colab, I get different results: on the Mac, the predictions for the same input change depending on the batch size.

The code: https://colab.research.google.com/drive/13GzSfToUvmmGHaROS-sGCu9mY1n_2FYf?usp=sharing

import tensorflow as tf
import numpy as np
import pandas as pd

# Minimal reproduction: build a small LSTM + Dense model, run predict() on the
# same input with two different batch sizes, and compare the outputs for the
# first sample. No fit() call appears in this snippet, so the predictions come
# from whatever weights the layers start with.

# Setup model
# Per-sample input shape handed to tf.keras.Input (batch axis not included).
input_shape = (10, 5)

model_tst = tf.keras.Sequential()

model_tst.add(tf.keras.Input(shape=input_shape))
# 100 LSTM units; return_sequences=True keeps one output per step of the
# 10-long axis instead of only the final one.
model_tst.add(tf.keras.layers.LSTM(100,  return_sequences=True))
model_tst.add(tf.keras.layers.Dense(2, activation="sigmoid"))

model_tst.summary()

# Optimizer, loss and metric are configured here but not exercised by the
# predict() calls below.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
model_tst.compile(
        loss=loss,
        optimizer=optimizer,
        # metrics=[tf.keras.metrics.BinaryCrossentropy()
        metrics=["mse"
        ]
)

# Generate step data
# 11 identical samples of shape (10, 5): all ones, except positions 8 and 9
# along the 10-long axis, which are set to 99 (the "step"). Despite the
# variable name, the input is deterministic, not random.
random_input = np.ones((11, 10, 5))
random_input[:, 8:, :] = 99

# Predictions
# Same input, two batch sizes. Only the first sample's output ([0, :, :]) is
# kept from each run; reshape(10, 2) pins the expected per-sample shape.
random_output2 = model_tst.predict(random_input, batch_size=1)[0, :, :].reshape(10, 2)
random_output3 = model_tst.predict(random_input, batch_size=10)[0, :, :].reshape(10, 2)

# Compare results
# Element-wise difference between the two runs for the first sample.
diff2 = random_output3 - random_output2
# Bare expression: the transposed table is only displayed when this is the last
# statement of a notebook cell / REPL input; as a plain script it prints nothing.
pd.DataFrame(diff2).T

Output on Mac:

Output on Google Colab:

If I reduce the number of nodes in the LSTM I can get the problem to disappear:

import tensorflow as tf
import numpy as np
import pandas as pd

# Control case for the batch-size comparison: same script shape as a 100-unit
# run, but the LSTM layer here has only 2 units. No fit() call appears in this
# snippet, so the predictions come from whatever weights the layers start with.

# Setup model
# Per-sample input shape handed to tf.keras.Input (batch axis not included).
input_shape = (10, 5)

model_tst = tf.keras.Sequential()

model_tst.add(tf.keras.Input(shape=input_shape))
# Only 2 LSTM units in this variant; return_sequences=True keeps one output
# per step of the 10-long axis instead of only the final one.
model_tst.add(tf.keras.layers.LSTM(2,  return_sequences=True))
model_tst.add(tf.keras.layers.Dense(2, activation="sigmoid"))

model_tst.summary()

# Optimizer, loss and metric are configured here but not exercised by the
# predict() calls below.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
model_tst.compile(
        loss=loss,
        optimizer=optimizer,
        # metrics=[tf.keras.metrics.BinaryCrossentropy()
        metrics=["mse"
        ]
)

# Generate step data
# 11 identical samples of shape (10, 5): all ones, except positions 8 and 9
# along the 10-long axis, which are set to 99 (the "step"). Despite the
# variable name, the input is deterministic, not random.
random_input = np.ones((11, 10, 5))
random_input[:, 8:, :] = 99

# Predictions
# Same input, two batch sizes. Only the first sample's output ([0, :, :]) is
# kept from each run; reshape(10, 2) pins the expected per-sample shape.
random_output2 = model_tst.predict(random_input, batch_size=1)[0, :, :].reshape(10, 2)
random_output3 = model_tst.predict(random_input, batch_size=10)[0, :, :].reshape(10, 2)

# Compare results
# Element-wise difference between the two runs for the first sample.
diff2 = random_output3 - random_output2
# Bare expression: the transposed table is only displayed when this is the last
# statement of a notebook cell / REPL input; as a plain script it prints nothing.
pd.DataFrame(diff2).T

-> outputs are the same in this case.

I guess this has to do with how calculations are getting passed to Apple silicon.

Are there any debugging steps I should try to resolve this problem?

Info:

deleting

Update: If I disable the GPU using the following code, the differences become very small, on the order of 1e-8 instead of 8e-2. This seems to be a GPU driver problem.

import tensorflow as tf

# Disable all GPUS
# Passing an empty device list for the 'GPU' type hides every GPU from
# TensorFlow.
# NOTE(review): presumably this must run before TensorFlow initializes its
# devices (i.e. before any model or tensor is created) - confirm against the
# tf.config documentation.
tf.config.set_visible_devices([], 'GPU')
# Sanity check: none of the devices that remain visible may be a GPU.
visible_devices = tf.config.get_visible_devices()
for device in visible_devices:
    assert device.device_type != 'GPU'


import tensorflow as tf
import numpy as np
import pandas as pd

# Batch-size comparison with a 100-unit LSTM: run predict() on the same input
# with two different batch sizes and compare the outputs for the first sample.
# No fit() call appears in this snippet, so the predictions come from whatever
# weights the layers start with.

# Setup model
# Per-sample input shape handed to tf.keras.Input (batch axis not included).
input_shape = (10, 5)

model_tst = tf.keras.Sequential()

model_tst.add(tf.keras.Input(shape=input_shape))
# 100 LSTM units; return_sequences=True keeps one output per step of the
# 10-long axis instead of only the final one.
model_tst.add(tf.keras.layers.LSTM(100,  return_sequences=True))
model_tst.add(tf.keras.layers.Dense(2, activation="sigmoid"))

model_tst.summary()

# Optimizer, loss and metric are configured here but not exercised by the
# predict() calls below.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
model_tst.compile(
        loss=loss,
        optimizer=optimizer,
        # metrics=[tf.keras.metrics.BinaryCrossentropy()
        metrics=["mse"
        ]
)

# Generate step data
# 11 identical samples of shape (10, 5): all ones, except positions 8 and 9
# along the 10-long axis, which are set to 99 (the "step"). Despite the
# variable name, the input is deterministic, not random.
random_input = np.ones((11, 10, 5))
random_input[:, 8:, :] = 99

# Predictions
# Same input, two batch sizes. Only the first sample's output ([0, :, :]) is
# kept from each run; reshape(10, 2) pins the expected per-sample shape.
random_output2 = model_tst.predict(random_input, batch_size=1)[0, :, :].reshape(10, 2)
random_output3 = model_tst.predict(random_input, batch_size=10)[0, :, :].reshape(10, 2)

# Compare results
# Element-wise difference between the two runs for the first sample.
diff2 = random_output3 - random_output2
# Bare expression: the transposed table is only displayed when this is the last
# statement of a notebook cell / REPL input; as a plain script it prints nothing.
pd.DataFrame(diff2).T

Hi @Trevor16gordon,

Thanks for reporting this issue. I will look into this on our side to see if we can discover the cause. I'll update here once I have news.

TensorFlow LSTM gives different results when changing batch size on an M1 Max Mac
 
 
Q