InvalidArgumentError: Cannot assign a device for operation

import tensorflow as tf
import tensorflow_recommenders as tfrs
from typing import Dict, Text

# unique_user_ids, unique_movie_titles, train and test come from the usual
# MovieLens data preparation (not shown here).

class RankingModel(tf.keras.Model):

  def __init__(self):
    super().__init__()
    embedding_dimension = 32

    # Compute embeddings for users.
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])

    # Compute embeddings for movies.
    self.movie_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
    ])

    # Compute predictions.
    self.ratings = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
    ])

  def call(self, inputs):
    user_id, movie_title = inputs

    user_embedding = self.user_embeddings(user_id)
    movie_embedding = self.movie_embeddings(movie_title)

    return self.ratings(tf.concat([user_embedding, movie_embedding], axis=1))

task = tfrs.tasks.Ranking(
  loss=tf.keras.losses.MeanSquaredError(),
  metrics=[tf.keras.metrics.RootMeanSquaredError()]
)

class MovielensModel(tfrs.models.Model):

  def __init__(self):
    super().__init__()
    self.ranking_model: tf.keras.Model = RankingModel()
    self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
      loss=tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )

  def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
    return self.ranking_model(
      (features["user_id"], features["movie_title"]))

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    labels = features.pop("user_rating")

    rating_predictions = self(features)

    # The task computes the loss and the metrics.
    return self.task(labels=labels, predictions=rating_predictions)

model = MovielensModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

model.fit(cached_train, epochs=3)


InvalidArgumentError                      Traceback (most recent call last)
Input In [40], in <cell line: 5>()
      1 #physical_devices = tf.config.list_physical_devices('GPU')
      2 #tf.config.set_visible_devices(physical_devices[0], 'GPU')
      3 #with tf.device("GPU"):
      4 #model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
----> 5 model.fit(cached_train, epochs=3)

File /opt/homebrew/Caskroom/miniforge/base/envs/mlp/lib/python3.8/site-packages/keras/utils/traceback_utils.py:67, in filter_traceback.<locals>.error_handler(*args, **kwargs)
     65 except Exception as e:  # pylint: disable=broad-except
     66   filtered_tb = _process_traceback_frames(e.__traceback__)
---> 67   raise e.with_traceback(filtered_tb) from None
     68 finally:
     69   del filtered_tb

File /opt/homebrew/Caskroom/miniforge/base/envs/mlp/lib/python3.8/site-packages/tensorflow/python/eager/execute.py:54, in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
     52 try:
     53   ctx.ensure_initialized()
---> 54   tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
     55                                       inputs, attrs, num_outputs)
     56 except core._NotOkStatusException as e:
     57   if name is not None:

InvalidArgumentError: Cannot assign a device for operation movielens_model_1/ranking_model_3/sequential_9/embedding_6/embedding_lookup: Could not satisfy explicit device specification '' because the node {{colocation_node movielens_model_1/ranking_model_3/sequential_9/embedding_6/embedding_lookup}} was colocated with a group of nodes that required incompatible device '/job:localhost/replica:0/task:0/device:GPU:0'. All available devices [/job:localhost/replica:0/task:0/device:CPU:0, /job:localhost/replica:0/task:0/device:GPU:0].
Colocation Debug Info:
Colocation group had the following types and supported devices:
Root Member(assigned_device_name_index_=2 requested_device_name_='/job:localhost/replica:0/task:0/device:GPU:0' assigned_device_name_='/job:localhost/replica:0/task:0/device:GPU:0' resource_device_name_='/job:localhost/replica:0/task:0/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]
ResourceSparseApplyAdagradV2: CPU
UnsortedSegmentSum: GPU CPU
StridedSlice: GPU CPU
Const: GPU CPU
Shape: GPU CPU
_Arg: GPU CPU
Unique: GPU CPU
Identity: GPU CPU
ResourceGather: GPU CPU

Colocation members, user-requested devices, and framework assigned devices, if any:
  movielens_model_1_ranking_model_3_sequential_9_embedding_6_embedding_lookup_4370 (_Arg)  framework assigned device=/job:localhost/replica:0/task:0/device:GPU:0
  adagrad_adagrad_update_resourcesparseapplyadagradv2_accum (_Arg)  framework assigned device=/job:localhost/replica:0/task:0/device:GPU:0
  movielens_model_1/ranking_model_3/sequential_9/embedding_6/embedding_lookup (ResourceGather)
  movielens_model_1/ranking_model_3/sequential_9/embedding_6/embedding_lookup/Identity (Identity)
  Adagrad/Adagrad/update/Unique (Unique) /job:localhost/replica:0/task:0/device:GPU:0
  Adagrad/Adagrad/update/Shape (Shape) /job:localhost/replica:0/task:0/device:GPU:0
  Adagrad/Adagrad/update/strided_slice/stack (Const) /job:localhost/replica:0/task:0/device:GPU:0
  Adagrad/Adagrad/update/strided_slice/stack_1 (Const) /job:localhost/replica:0/task:0/device:GPU:0
  Adagrad/Adagrad/update/strided_slice/stack_2 (Const) /job:localhost/replica:0/task:0/device:GPU:0
  Adagrad/Adagrad/update/strided_slice (StridedSlice) /job:localhost/replica:0/task:0/device:GPU:0
  Adagrad/Adagrad/update/UnsortedSegmentSum (UnsortedSegmentSum) /job:localhost/replica:0/task:0/device:GPU:0
  Adagrad/Adagrad/update/ResourceSparseApplyAdagradV2 (ResourceSparseApplyAdagradV2) /job:localhost/replica:0/task:0/device:GPU:0

 [[{{node movielens_model_1/ranking_model_3/sequential_9/embedding_6/embedding_lookup}}]] [Op:__inference_train_function_4593]

Replies

Hi @Vishal89,

The error you are seeing is caused by ResourceSparseApplyAdagradV2 lacking a GPU implementation. We are working on adding it to the roster of available operations, but in the meantime I recommend either switching to a different optimizer, such as Adam, or running training on the CPU to avoid the colocation error.
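
For reference, here is a minimal sketch of both workarounds, reusing the MovielensModel and cached_train from the code above. The learning rate of 0.1 is simply carried over from the Adagrad setup and may need retuning for Adam, and hiding the GPU only takes effect if it is done before TensorFlow initializes the device, so run each option in a fresh session.

import tensorflow as tf

# Option 1 (sketch): use Adam, whose update ops run on the GPU,
# so the embedding lookup and the optimizer update can stay colocated there.
model = MovielensModel()
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.1))
model.fit(cached_train, epochs=3)

# Option 2 (sketch): keep Adagrad but hide the GPU so everything runs on the CPU.
# Call this immediately after importing TensorFlow, before building the model.
tf.config.set_visible_devices([], "GPU")
model = MovielensModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
model.fit(cached_train, epochs=3)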