INVALID_ARGUMENT：预期维度在 [0, 0) 范围内，但得到 0

Question

无论我使用哪种模型，我都会不断收到此错误，所以我想知道是否有人可以告诉我发生了什么事？我该如何解决这个问题？

这个模型的输入数据是：http://vision.stanford.edu/aditya86/ImageNetDogs/

很可能问题一定源于这部分，但想知道我的代码的哪一部分可以解决这个问题？：

  (0) INVALID_ARGUMENT: Expected dimension in the range [0, 0), but got 0
     [[{{node ArgMax}}]]
     [[IteratorGetNext]]
     [[IteratorGetNext/_1321]]
  (1) INVALID_ARGUMENT: Expected dimension in the range [0, 0), but got 0
     [[{{node ArgMax}}]]
     [[IteratorGetNext]]

我在带 GPU 的 HPC 系统上运行这段代码。考虑到我已经对它进行了 QAd，我认为我的数据预处理应该没问题。

下面的代码片段这是代码生成的错误。

Traceback (most recent call last):
  File "/mnt/lustre/indy2lfs/work/mdisspt/mdisspt/y2136744/modelzoo/fc_dog_model/tf/run.py", line 292, in <module>
    main()
  File "/mnt/lustre/indy2lfs/work/mdisspt/mdisspt/y2136744/modelzoo/fc_dog_model/tf/run.py", line 281, in main
    run(
  File "/mnt/lustre/indy2lfs/work/mdisspt/mdisspt/y2136744/modelzoo/fc_dog_model/tf/run.py", line 226, in run
    est.train(
  File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 360, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1186, in _train_model
    return self._train_model_default(input_fn, hooks, saving_listeners)
  File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1217, in _train_model_default
    return self._train_with_estimator_spec(estimator_spec, worker_hooks,
  File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1533, in _train_with_estimator_spec
    _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
  File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/training/monitored_session.py", line 782, in run
    return self._sess.run(
  File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/training/monitored_session.py", line 1311, in run
    return self._sess.run(
  File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/training/monitored_session.py", line 1416, in run
    raise six.reraise(*original_exc_info)
  File "/mnt/lustre/indy2lfs/sw/miniconda3/4.12.0-py39-gpu/lib/python3.9/site-packages/six.py", line 719, in reraise
    raise value
  File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/training/monitored_session.py", line 1401, in run
    return self._sess.run(*args, **kwargs)
  File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/training/monitored_session.py", line 1469, in run
    outputs = _WrappedSession.run(
  File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/training/monitored_session.py", line 1232, in run
    return self._sess.run(*args, **kwargs)
  File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/client/session.py", line 967, in run
    result = self._run(None, fetches, feed_dict, options_ptr,
  File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/client/session.py", line 1190, in _run
    results = self._do_run(handle, final_targets, final_fetches,
  File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/client/session.py", line 1370, in _do_run
    return self._do_call(_run_fn, feeds, fetches, targets, options,
  File "/mnt/lustre/indy2lfs/sw/horovod/0.25.0-gpu/python/3.9.13/lib/python3.9/site-packages/tensorflow/python/client/session.py", line 1396, in _do_call
    raise type(e)(node_def, op, message)  # pylint: disable=no-value-for-parameter
tensorflow.python.framework.errors_impl.InvalidArgumentError: Graph execution error:

2 root error(s) found.
  (0) INVALID_ARGUMENT: Expected dimension in the range [0, 0), but got 0
     [[{{node ArgMax}}]]
     [[IteratorGetNext]]
     [[IteratorGetNext/_1321]]
  (1) INVALID_ARGUMENT: Expected dimension in the range [0, 0), but got 0
     [[{{node ArgMax}}]]
     [[IteratorGetNext]]
0 successful operations.
0 derived errors ignored.
srun: error: r2i4n1: task 0: Exited with exit code 1
srun: launch/slurm: _step_signal: Terminating StepId=4084847.0

运行.py

def run(
    args, params, model_fn, train_input_fn=None, eval_input_fn=None,
):
    dtype = tf.keras.mixed_precision.Policy(
      'mixed_float16', # Important: This is required.
    )

    tf.keras.mixed_precision.set_global_policy(dtype)

    # update and validate runtime params
    runconfig_params = params["runconfig"]
    update_params_from_args(args, runconfig_params)
    validate_params(params)
    # save params for reproducibility
    save_params(params, model_dir=runconfig_params["model_dir"])

    # get runtime configurations
    use_cs = is_cs(runconfig_params)
    csrunconfig_dict = get_csrunconfig_dict(runconfig_params)
    stack_params = get_custom_stack_params(params)

    # prep cs1 run environment, run config and estimator
    check_env(runconfig_params)
    est_config = CSRunConfig(
        cs_ip=runconfig_params["cs_ip"],
        stack_params=stack_params,
        **csrunconfig_dict,
    )
    model= model_fn()
    est = tf.keras.estimator.model_to_estimator(
        keras_model=model,
        model_dir=runconfig_params["model_dir"],
        # config=est_config,
        # params=params,
    )

    # execute based on mode
    elif runconfig_params["mode"] == "train":
        # est.compile(input_fn=train_input_fn)
        est.train(
            input_fn=train_input_fn,
            steps=runconfig_params["steps"],
            max_steps=runconfig_params["max_steps"],
            # use_cs=use_cs,
        )
def main():
    """
    Main function
    """
    dtype = Policy('mixed_float16')
    tf.keras.mixed_precision.set_global_policy(dtype)
    tf.keras.backend.set_floatx('float16')

    default_model_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "model_dir"
    )
    parser = create_arg_parser(default_model_dir)
    args = parser.parse_args(sys.argv[1:])
    params = get_params(args.params)
    print(params)
    summary_context = (
        cs_disable_summaries if args.multireplica else cs_enable_summaries
    )
    with summary_context():
        run(
            args=args,
            params=params,
            model_fn=model_fn,
            train_input_fn=train_input_fn,
            # eval_input_fn=eval_input_fn,
        )


if __name__ == "__main__":
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    main()

模型.py

def model_fn():
    dtype = Policy('mixed_float16')
    tf.keras.mixed_precision.set_global_policy(dtype)
    # tf.keras.backend.set_floatx('float16')
    
    inputs = tf.keras.Input(shape=(331,331,3))
    # Entry block
    x = layers.Conv2D(128, 3, strides=2, padding="same")(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)

    previous_block_activation = x  # Set aside residual

    for size in [256, 512, 728]:
        x = layers.Activation("relu")(x)
        x = layers.SeparableConv2D(size, 3, padding="same")(x)
        x = layers.BatchNormalization()(x)

        x = layers.Activation("relu")(x)
        x = layers.SeparableConv2D(size, 3, padding="same")(x)
        x = layers.BatchNormalization()(x)

        x = layers.MaxPooling2D(3, strides=2, padding="same")(x)

        # Project residual
        residual = layers.Conv2D(size, 1, strides=2, padding="same")(
            previous_block_activation
        )
        x = layers.add([x, residual])  # Add back residual
        previous_block_activation = x  # Set aside next residual

    x = layers.SeparableConv2D(1024, 3, padding="same")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)

    x = layers.GlobalAveragePooling2D()(x)
    activation = "softmax"
    units = 1

    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation=activation)(x)

    estimator_model = tf.keras.Model(inputs, outputs)

    estimator_model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss="categorical_crossentropy",
        #   metrics=['accuracy']
        )
    estimator_model.summary()

    return estimator_model

数据.py

def input_fn(params, mode=tf.estimator.ModeKeys.TRAIN):
    """
    :param <dict> params: dict containing input parameters for creating dataset.
    Expects the following fields:
    
    - "data_dir" (string): path to the data files to use.
    - "batch_size" (int): batch size
    - "to_float16" (bool): whether to convert to float16 or not
    - "drop_last_batch" (bool): whether to drop the last batch or not
    """
    params = {
    'train_input': {
        'shuffle': True,
        'data_dir': 'dog_breed_dataset', # Place to store data
        'batch_size': 32,
        'num_parallel_calls': 0   # 0 means AUTOTUNE
        }
    }   

    training = mode == tf.estimator.ModeKeys.TRAIN
    evaluating = mode == tf.estimator.ModeKeys.EVAL
    ds = None
    input_params = params["train_input"]
    data_dir = input_params["data_dir"]

    # setting num_parallel_calls to 0 implies AUTOTUNE
    num_parallel_calls = input_params.get("num_parallel_calls", 0)

    batch_size = (
        input_params.get("train_batch_size")
        if training
        else input_params.get("eval_batch_size")
    )
    if batch_size is None:
        batch_size = input_params["batch_size"]


    list_ds = tf.data.Dataset.list_files(str(data_dir+'/*/*'), shuffle=False)
    class_names = np.array(sorted([item.split('/')[-1] for item in glob.glob(data_dir + '/*')]))

    val_size = int(image_count * 0.2)

    def get_label(file_path):
        # Convert the path to a list of path components
        parts = tf.strings.split(file_path, os.path.sep)
        one_hot = parts[-2] == class_names
        one_hot=tf.cast(one_hot, tf.int32)
        return tf.argmax(one_hot)
        # return one_hot
         
    def decode_img(img):
        # Convert the compressed string to a 3D uint8 tensor
        img = tf.io.decode_jpeg(img, channels=3)
        img = tf.cast(img, tf.float16)
        img = (img/225)
        # img = tf.keras.applications.mobilenet.preprocess_input(img)
        # Resize the image to the desired size
        return tf.image.resize(img, [image_param['img_height'], image_param["img_width"]])

    def process_path(file_path):
        label = get_label(file_path)
        # Load the raw data from the file as a string
        img = tf.io.read_file(file_path)
        img = decode_img(img)
        return img, label

    if training and input_params["shuffle"]:
        list_ds = list_ds.shuffle(image_count, reshuffle_each_iteration=False)
    if training:
        ds = list_ds.skip(val_size)
        ds = ds.repeat()
    else:
        ds = list_ds.take(val_size)

    ds = ds.map(
        process_path, 
        num_parallel_calls=num_parallel_calls
        if num_parallel_calls > 0
        else tf.data.experimental.AUTOTUNE,
        )

    return ds


def train_input_fn(params=None):
    return input_fn(params, mode=tf.estimator.ModeKeys.TRAIN)

INVALID_ARGUMENT：预期维度在 [0, 0) 范围内，但得到 0

问题描述投票：0回答：0

最新问题

INVALID_ARGUMENT：预期维度在 [0, 0) 范围内，但得到 0

问题描述 投票：0回答：0

最新问题

问题描述投票：0回答：0