我正在使用 TensorFlow 和 TensorFlow Recommenders (TFRS) 开发推荐系统,在我的RecommendationModel 中初始化 FactorizedTopK 指标期间遇到了一个令人困惑的问题。具体来说,当模型尝试在 tfrs.metrics.FactorizedTopK 的 Streaming 层中添加名为“counter”的权重时,就会出现错误。我按照以下文档来制作我的推荐模型:https://www.tensorflow.org/recommenders/examples/deep_recommenders
我的开发环境是 AWS SageMaker,使用的 TensorFlow 版本是 2.13.0。
这是我的模型代码的相关部分:
# Project only the fields the recommendation model consumes out of the raw dataset.
programs = tf_dataset.map(
    lambda x: {
        key: x[key]
        for key in ("program_id", "name", "Country", "Studylvl", "majors")
    }
)
# Peek at one example (index 20) to sanity-check the mapped features.
desired_index = 20
desired_data = next(iter(programs.skip(desired_index).take(1)))
for label, key in [
    ("Program ID", "program_id"),
    ("Name", "name"),
    ("Country", "Country"),
    ("Study Level", "Studylvl"),
    ("Majors", "majors"),
]:
    print(f"{label}:", desired_data[key].numpy().decode())
Program ID: 157027
Name: m.s.e in robotics
Country: united states of america
Study Level: postgraduate
Majors: automation science and engineering, biorobotics, control and dynamical systems, medical robotics and computer integrated surgical , perception and cognitive systems, general robotics
class ProgramModel(tf.keras.Model):
    """Embeds the raw program features into one concatenated feature vector.

    Each categorical feature gets a StringLookup + Embedding tower; the
    free-text features ("name" and "majors") additionally get a
    TextVectorization + Embedding + mean-pooling tower.
    """

    def __init__(self):
        super().__init__()
        max_tokens = 10_000
        embedding_dimension = 32

        def lookup_tower(vocabulary):
            # Categorical tower: string -> integer index -> dense embedding.
            # The "+ 1" embedding row accounts for the StringLookup OOV bucket.
            return tf.keras.Sequential([
                tf.keras.layers.StringLookup(
                    vocabulary=vocabulary, mask_token=None),
                tf.keras.layers.Embedding(
                    len(vocabulary) + 1, embedding_dimension),
            ])

        # NOTE: tower creation order is kept identical to the original code so
        # Keras auto-generated layer names stay stable.
        self.program_id_embedding = lookup_tower(unique_program_id)
        self.name_embedding = lookup_tower(unique_program_name)

        # Free-text tower for program names; mask_zero lets the pooling layer
        # ignore padding tokens.
        self.name_text_vectorizer = tf.keras.layers.TextVectorization(
            max_tokens=max_tokens, output_mode='int', output_sequence_length=32)
        self.name_text_embedding = tf.keras.Sequential([
            self.name_text_vectorizer,
            tf.keras.layers.Embedding(
                max_tokens, embedding_dimension, mask_zero=True),
            tf.keras.layers.GlobalAveragePooling1D(),
        ])
        self.name_text_vectorizer.adapt(unique_program_name)

        self.country_embedding = lookup_tower(unique_countries)
        self.study_lvl_embedding = lookup_tower(unique_study_lvl)

        # Free-text tower for the comma-separated majors string.
        self.major_text_vectorizer = tf.keras.layers.TextVectorization(
            max_tokens=max_tokens, output_mode='int', output_sequence_length=32)
        self.major_text_embedding = tf.keras.Sequential([
            self.major_text_vectorizer,
            tf.keras.layers.Embedding(
                max_tokens, embedding_dimension, mask_zero=True),
            tf.keras.layers.GlobalAveragePooling1D(),
        ])
        self.major_text_vectorizer.adapt(majors)

    def call(self, inputs):
        # Concatenate every tower's output along the feature axis.
        towers = [
            self.country_embedding(inputs["Country"]),
            self.study_lvl_embedding(inputs["Studylvl"]),
            self.name_embedding(inputs["name"]),
            self.name_text_embedding(inputs["name"]),
            self.major_text_embedding(inputs["majors"]),
            self.program_id_embedding(inputs["program_id"]),
        ]
        return tf.concat(towers, axis=1)
class CandidateModel(tf.keras.Model):
    """Candidate tower: program feature embeddings followed by a dense stack."""

    def __init__(self, layer_sizes):
        super().__init__()
        self.embedding_model = ProgramModel()
        # Hidden layers use ReLU + BatchNorm; the final layer is linear so the
        # output embedding is unconstrained.
        self.dense_layers = tf.keras.Sequential()
        hidden_sizes, final_sizes = layer_sizes[:-1], layer_sizes[-1:]
        for size in hidden_sizes:
            self.dense_layers.add(
                tf.keras.layers.Dense(size, activation="relu"))
            self.dense_layers.add(tf.keras.layers.BatchNormalization())
        for size in final_sizes:
            self.dense_layers.add(tf.keras.layers.Dense(size))

    def call(self, inputs):
        return self.dense_layers(self.embedding_model(inputs))
class RecommendationModel(tfrs.models.Model):
    """Two-tower retrieval model with a FactorizedTopK evaluation metric."""

    # NOTE(review): constructing FactorizedTopK below is where the reported
    # "Cannot convert '('c', 'o', 'u', 'n', 't', 'e', 'r')' to a shape" error
    # fires. The traceback shows TFRS's Streaming layer calling
    # add_weight("counter", ...) with the name as the first positional
    # argument, while the installed Keras treats the first positional
    # argument as `shape` — presumably a TFRS / Keras-3 version mismatch in
    # this environment; confirm by pinning a compatible Keras (e.g. the
    # tf-keras legacy package) rather than changing this model code.
    def __init__(self, layer_sizes):
        super().__init__()
        self.query_model = QueryModel(layer_sizes)
        self.candidate_model = CandidateModel(layer_sizes)
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=programs.batch(128).map(self.candidate_model)
            )
        )

    def compute_loss(self, features, training=False):
        # The query tower sees every feature except the candidate identifier.
        query_features = {
            key: features[key]
            for key in ("Country", "Studylvl", "name", "majors")
        }
        query_embeddings = self.query_model(query_features)
        # The candidate tower additionally consumes the program id.
        candidate_features = dict(query_features)
        candidate_features["program_id"] = features["program_id"]
        candidate_embeddings = self.candidate_model(candidate_features)
        return self.task(query_embeddings, candidate_embeddings)
# Build and compile the two-tower model, then train it with held-out validation.
model = RecommendationModel([128, 64, 32])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001))

model.fit(
    x=train.batch(2000),
    validation_data=test.batch(500),
    epochs=20,
    verbose=True,
)
在尝试初始化推荐模型时,我遇到以下 ValueError:
ValueError: Cannot convert '('c', 'o', 'u', 'n', 't', 'e', 'r')' to a shape. Found invalid entry 'c' of type '<class 'str'>'.
这是完整的错误日志:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[64], line 1
----> 1 model = RecommendationModel([128, 64, 32])
2 model.compile(
3 optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
4 )
6 # Train the model
Cell In[63], line 7, in RecommendationModel.__init__(self, layer_sizes)
4 self.query_model = QueryModel(layer_sizes)
5 self.candidate_model = CandidateModel(layer_sizes)
6 self.task = tfrs.tasks.Retrieval(
----> 7 metrics= tfrs.metrics.FactorizedTopK(
8 candidates=programs.batch(128).map(self.candidate_model)
9 )
10 )
File /usr/local/lib/python3.9/site-packages/tensorflow_recommenders/metrics/factorized_top_k.py:79, in FactorizedTopK.__init__(self, candidates, ks, name)
75 super().__init__(name=name)
77 if isinstance(candidates, tf.data.Dataset):
78 candidates = (
---> 79 layers.factorized_top_k.Streaming(k=max(ks))
80 .index_from_dataset(candidates)
81 )
83 self._ks = ks
84 self._candidates = candidates
File /usr/local/lib/python3.9/site-packages/tensorflow_recommenders/layers/factorized_top_k.py:376, in Streaming.__init__(self, query_model, k, handle_incomplete_batches, num_parallel_calls, sorted_order)
373 self._num_parallel_calls = num_parallel_calls
374 self._sorted = sorted_order
--> 376 self._counter = self.add_weight("counter", dtype=tf.int32, trainable=False)
File /usr/local/lib/python3.9/site-packages/keras/src/layers/layer.py:499, in Layer.add_weight(self, shape, initializer, dtype, trainable, regularizer, constraint, name)
497 initializer = initializers.get(initializer)
498 with backend.name_scope(self.name, caller=self):
--> 499 variable = backend.Variable(
500 initializer=initializer,
501 shape=shape,
502 dtype=dtype,
503 trainable=trainable,
504 name=name,
505 )
506 # Will be added to layer.losses
507 variable.regularizer = regularizers.get(regularizer)
File /usr/local/lib/python3.9/site-packages/keras/src/backend/common/variables.py:74, in KerasVariable.__init__(self, initializer, shape, dtype, trainable, name)
72 else:
73 if callable(initializer):
---> 74 shape = self._validate_shape(shape)
75 value = initializer(shape, dtype=dtype)
76 else:
File /usr/local/lib/python3.9/site-packages/keras/src/backend/common/variables.py:97, in KerasVariable._validate_shape(self, shape)
96 def _validate_shape(self, shape):
---> 97 shape = standardize_shape(shape)
98 if None in shape:
99 raise ValueError(
100 "Shapes used to initialize variables must be "
101 "fully-defined (no `None` dimensions). Received: "
102 f"shape={shape} for variable path='{self.path}'"
103 )
File /usr/local/lib/python3.9/site-packages/keras/src/backend/common/variables.py:426, in standardize_shape(shape)
424 continue
425 if not is_int_dtype(type(e)):
--> 426 raise ValueError(
427 f"Cannot convert '{shape}' to a shape. "
428 f"Found invalid entry '{e}' of type '{type(e)}'. "
429 )
430 if e < 0:
431 raise ValueError(
432 f"Cannot convert '{shape}' to a shape. "
433 "Negative dimensions are not allowed."
434 )
ValueError: Cannot convert '('c', 'o', 'u', 'n', 't', 'e', 'r')' to a shape. Found invalid entry 'c' of type '<class 'str'>'.
此错误表明在 TensorFlow 或 TFRS 内部代码中的权重初始化期间解释参数存在问题,但我不知道如何解决它。我已经确认我的输入不包含任何 NaN 值或其他明显问题,并且我的学习率似乎是合理的。
调试一段时间后,我意识到我只在 AWS SageMaker 上遇到此问题,无论我使用仅 CPU 实例还是启用了 GPU 支持的实例(例如 ml.g4dn.xlarge)。这个问题似乎是 SageMaker 环境特有的,因为我在 Google Colab 或本地等其他环境中没有遇到过这个问题。
有人遇到过类似的问题或对可能出现的问题有建议吗?我使用的是 TensorFlow 2.13.0。任何见解或指导将不胜感激!
我有完全相同的问题(以及由此产生的错误消息),尝试运行 https://www.tensorflow.org/recommenders/examples/basic_retrieval 上发布的推荐代码(与您参考的教程来自同一 TensorFlow 官方文档)。
我的 Python 版本是 3.11.8,我的 TensorFlow 版本是 2.16.1。
每当我尝试通过本地 Windows 10 Pro 命令行运行上述代码,但尚未尝试在 Colab 或其他环境中运行它时,我都会收到此错误。不知道为什么会发生这种情况,也不知道如何避免这种情况。