无论我使用哪种模型,我都会不断收到此错误,所以我想知道是否有人可以告诉我发生了什么事?我该如何解决这个问题?
这个模型的输入数据是:http://vision.stanford.edu/aditya86/ImageNetDogs/
很可能问题一定源于这部分,但想知道我的代码的哪一部分可以解决这个问题?:
(0) INVALID_ARGUMENT: Expected dimension in the range [0, 0), but got 0
[[{{node ArgMax}}]]
[[IteratorGetNext]]
[[IteratorGetNext/_1321]]
(1) INVALID_ARGUMENT: Expected dimension in the range [0, 0), but got 0
[[{{node ArgMax}}]]
[[IteratorGetNext]]
我在带 GPU 的 HPC 系统上运行这段代码。 考虑到我已经对它进行了 QAd,我认为我的数据预处理应该没问题。
下面的代码片段 这是代码生成的错误。
Traceback (most recent call last):
File "/mnt/lustre/indy2lfs/work/mdisspt/mdisspt/y2136744/modelzoo/fc_dog_model/tf/run.py", line 292, in <module>
main()
File "/mnt/lustre/indy2lfs/work/mdisspt/mdisspt/y2136744/modelzoo/fc_dog_model/tf/run.py", line 281, in main
run(
File "/mnt/lustre/indy2lfs/work/mdisspt/mdisspt/y2136744/modelzoo/fc_dog_model/tf/run.py", line 226, in run
est.train(
File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 360, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1186, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1217, in _train_model_default
return self._train_with_estimator_spec(estimator_spec, worker_hooks,
File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1533, in _train_with_estimator_spec
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/training/monitored_session.py", line 782, in run
return self._sess.run(
File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/training/monitored_session.py", line 1311, in run
return self._sess.run(
File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/training/monitored_session.py", line 1416, in run
raise six.reraise(*original_exc_info)
File "/mnt/lustre/indy2lfs/sw/miniconda3/4.12.0-py39-gpu/lib/python3.9/site-packages/six.py", line 719, in reraise
raise value
File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/training/monitored_session.py", line 1401, in run
return self._sess.run(*args, **kwargs)
File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/training/monitored_session.py", line 1469, in run
outputs = _WrappedSession.run(
File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/training/monitored_session.py", line 1232, in run
return self._sess.run(*args, **kwargs)
File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/client/session.py", line 967, in run
result = self._run(None, fetches, feed_dict, options_ptr,
File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/client/session.py", line 1190, in _run
results = self._do_run(handle, final_targets, final_fetches,
File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/client/session.py", line 1370, in _do_run
return self._do_call(_run_fn, feeds, fetches, targets, options,
File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/client/session.py", line 1396, in _do_call
raise type(e)(node_def, op, message) # pylint: disable=no-value-for-parameter
tensorflow.python.framework.errors_impl.InvalidArgumentError: Graph execution error:
2 root error(s) found.
(0) INVALID_ARGUMENT: Expected dimension in the range [0, 0), but got 0
[[{{node ArgMax}}]]
[[IteratorGetNext]]
[[IteratorGetNext/_1321]]
(1) INVALID_ARGUMENT: Expected dimension in the range [0, 0), but got 0
[[{{node ArgMax}}]]
[[IteratorGetNext]]
0 successful operations.
0 derived errors ignored.
srun: error: r2i4n1: task 0: Exited with exit code 1
srun: launch/slurm: _step_signal: Terminating StepId=4084847.0
运行.py
def run(
args, params, model_fn, train_input_fn=None, eval_input_fn=None,
):
dtype = tf.keras.mixed_precision.Policy(
'mixed_float16', # Important: This is required.
)
tf.keras.mixed_precision.set_global_policy(dtype)
# update and validate runtime params
runconfig_params = params["runconfig"]
update_params_from_args(args, runconfig_params)
validate_params(params)
# save params for reproducibility
save_params(params, model_dir=runconfig_params["model_dir"])
# get runtime configurations
use_cs = is_cs(runconfig_params)
csrunconfig_dict = get_csrunconfig_dict(runconfig_params)
stack_params = get_custom_stack_params(params)
# prep cs1 run environment, run config and estimator
check_env(runconfig_params)
est_config = CSRunConfig(
cs_ip=runconfig_params["cs_ip"],
stack_params=stack_params,
**csrunconfig_dict,
)
model= model_fn()
est = tf.keras.estimator.model_to_estimator(
keras_model=model,
model_dir=runconfig_params["model_dir"],
# config=est_config,
# params=params,
)
# execute based on mode
elif runconfig_params["mode"] == "train":
# est.compile(input_fn=train_input_fn)
est.train(
input_fn=train_input_fn,
steps=runconfig_params["steps"],
max_steps=runconfig_params["max_steps"],
# use_cs=use_cs,
)
def main():
"""
Main function
"""
dtype = Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(dtype)
tf.keras.backend.set_floatx('float16')
default_model_dir = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "model_dir"
)
parser = create_arg_parser(default_model_dir)
args = parser.parse_args(sys.argv[1:])
params = get_params(args.params)
print(params)
summary_context = (
cs_disable_summaries if args.multireplica else cs_enable_summaries
)
with summary_context():
run(
args=args,
params=params,
model_fn=model_fn,
train_input_fn=train_input_fn,
# eval_input_fn=eval_input_fn,
)
if __name__ == "__main__":
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
main()
模型.py
def model_fn():
dtype = Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(dtype)
# tf.keras.backend.set_floatx('float16')
inputs = tf.keras.Input(shape=(331,331,3))
# Entry block
x = layers.Conv2D(128, 3, strides=2, padding="same")(inputs)
x = layers.BatchNormalization()(x)
x = layers.Activation("relu")(x)
previous_block_activation = x # Set aside residual
for size in [256, 512, 728]:
x = layers.Activation("relu")(x)
x = layers.SeparableConv2D(size, 3, padding="same")(x)
x = layers.BatchNormalization()(x)
x = layers.Activation("relu")(x)
x = layers.SeparableConv2D(size, 3, padding="same")(x)
x = layers.BatchNormalization()(x)
x = layers.MaxPooling2D(3, strides=2, padding="same")(x)
# Project residual
residual = layers.Conv2D(size, 1, strides=2, padding="same")(
previous_block_activation
)
x = layers.add([x, residual]) # Add back residual
previous_block_activation = x # Set aside next residual
x = layers.SeparableConv2D(1024, 3, padding="same")(x)
x = layers.BatchNormalization()(x)
x = layers.Activation("relu")(x)
x = layers.GlobalAveragePooling2D()(x)
activation = "softmax"
units = 1
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation=activation)(x)
estimator_model = tf.keras.Model(inputs, outputs)
estimator_model.compile(
optimizer=tf.keras.optimizers.Adam(),
loss="categorical_crossentropy",
# metrics=['accuracy']
)
estimator_model.summary()
return estimator_model
数据.py
def input_fn(params, mode=tf.estimator.ModeKeys.TRAIN):
"""
:param <dict> params: dict containing input parameters for creating dataset.
Expects the following fields:
- "data_dir" (string): path to the data files to use.
- "batch_size" (int): batch size
- "to_float16" (bool): whether to convert to float16 or not
- "drop_last_batch" (bool): whether to drop the last batch or not
"""
params = {
'train_input': {
'shuffle': True,
'data_dir': 'dog_breed_dataset', # Place to store data
'batch_size': 32,
'num_parallel_calls': 0 # 0 means AUTOTUNE
}
}
training = mode == tf.estimator.ModeKeys.TRAIN
evaluating = mode == tf.estimator.ModeKeys.EVAL
ds = None
input_params = params["train_input"]
data_dir = input_params["data_dir"]
# setting num_parallel_calls to 0 implies AUTOTUNE
num_parallel_calls = input_params.get("num_parallel_calls", 0)
batch_size = (
input_params.get("train_batch_size")
if training
else input_params.get("eval_batch_size")
)
if batch_size is None:
batch_size = input_params["batch_size"]
list_ds = tf.data.Dataset.list_files(str(data_dir+'/*/*'), shuffle=False)
class_names = np.array(sorted([item.split('/')[-1] for item in glob.glob(data_dir + '/*')]))
val_size = int(image_count * 0.2)
def get_label(file_path):
# Convert the path to a list of path components
parts = tf.strings.split(file_path, os.path.sep)
one_hot = parts[-2] == class_names
one_hot=tf.cast(one_hot, tf.int32)
return tf.argmax(one_hot)
# return one_hot
def decode_img(img):
# Convert the compressed string to a 3D uint8 tensor
img = tf.io.decode_jpeg(img, channels=3)
img = tf.cast(img, tf.float16)
img = (img/225)
# img = tf.keras.applications.mobilenet.preprocess_input(img)
# Resize the image to the desired size
return tf.image.resize(img, [image_param['img_height'], image_param["img_width"]])
def process_path(file_path):
label = get_label(file_path)
# Load the raw data from the file as a string
img = tf.io.read_file(file_path)
img = decode_img(img)
return img, label
if training and input_params["shuffle"]:
list_ds = list_ds.shuffle(image_count, reshuffle_each_iteration=False)
if training:
ds = list_ds.skip(val_size)
ds = ds.repeat()
else:
ds = list_ds.take(val_size)
ds = ds.map(
process_path,
num_parallel_calls=num_parallel_calls
if num_parallel_calls > 0
else tf.data.experimental.AUTOTUNE,
)
return ds
def train_input_fn(params=None):
return input_fn(params, mode=tf.estimator.ModeKeys.TRAIN)