Google Vertex AI 超参数调整:遇到 500 内部错误

问题描述 投票:0回答:3

我尝试按照此处描述的方法,使用 Python SDK 在 Vertex AI 上运行超参数调整作业。大约 2 小时前,我成功提交过一次作业。后来我发现代码中有一些错误导致那次运行失败,于是回去修复了代码并重新运行,结果得到了如下错误。

  Traceback (most recent call last):
  File "/workspace/.pip-modules/lib/python3.8/site-packages/google/api_core/grpc_helpers.py", line 67, in error_remapped_callable
    return callable_(*args, **kwargs)
  File "/workspace/.pip-modules/lib/python3.8/site-packages/grpc/_channel.py", line 946, in __call__
    return _end_unary_response_blocking(state, call, False, None)
  File "/workspace/.pip-modules/lib/python3.8/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking
    raise _InactiveRpcError(state)
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
        status = StatusCode.INTERNAL
        details = "Internal error encountered."
        debug_error_string = "{"created":"@1623393121.374988331","description":"Error received from peer ipv4:142.251.33.74:443","file":"src/core/lib/surface/call.cc","file_line":1066,"grpc_message":"Internal error encountered.","grpc_status":13}"
>

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/workspace/ariel_ml_2021/hparam_tuning.py", line 140, in <module>
    create_hyperparameter_tuning_job_python_package()
  File "/workspace/ariel_ml_2021/hparam_tuning.py", line 133, in create_hyperparameter_tuning_job_python_package
    response = client.create_hyperparameter_tuning_job(
  File "/workspace/.pip-modules/lib/python3.8/site-packages/google/cloud/aiplatform_v1/services/job_service/client.py", line 1363, in create_hyperparameter_tuning_job
    response = rpc(request, retry=retry, timeout=timeout, metadata=metadata,)
  File "/workspace/.pip-modules/lib/python3.8/site-packages/google/api_core/gapic_v1/method.py", line 145, in __call__
    return wrapped_func(*args, **kwargs)
  File "/workspace/.pip-modules/lib/python3.8/site-packages/google/api_core/grpc_helpers.py", line 69, in error_remapped_callable
    six.raise_from(exceptions.from_grpc_error(exc), exc)
  File "<string>", line 3, in raise_from
google.api_core.exceptions.InternalServerError: 500 Internal error encountered.

我原以为这个错误可能是我对 Python 代码的修改引起的,所以我换回原始副本(不做任何更改)再次尝试,但错误仍然存在。如果需要的话,超参数调整的代码如下。

from google.cloud import aiplatform


def _conditional_parameter_spec(parameter_id, value_spec_key, value_spec, parent_values):
    """Build one conditional (child) parameter spec as a plain dict.

    Args:
        parameter_id: Identifier of the child hyperparameter.
        value_spec_key: Name of the value-spec field to set, e.g.
            ``"discrete_value_spec"`` or ``"double_value_spec"``.
        value_spec: Dict placed under ``value_spec_key``.
        parent_values: Values of the parent parameter for which this child
            parameter is active.

    Returns:
        A dict with ``"parameter_spec"`` and ``"parent_discrete_values"`` keys.
    """
    linear_scale = aiplatform.gapic.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE
    return {
        "parameter_spec": {
            "parameter_id": parameter_id,
            value_spec_key: value_spec,
            "scale_type": linear_scale,
        },
        # Copy so that every spec owns its own list, as in the hand-written dicts.
        "parent_discrete_values": {"values": list(parent_values)},
    }


def create_hyperparameter_tuning_job_python_package(
    project: str = "<my_project_id>",
    display_name: str = "<some_description>",
    executor_image_uri: str = "us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-4:latest",
    package_uri: str = "gs://<bucket_name>/",
    python_module: str = "train_second",  # located at gs://<bucket_name>/train_second.py
    location: str = "us-central1",
    api_endpoint: str = "us-central1-aiplatform.googleapis.com",
):
    """Create a hyperparameter tuning job that runs a Python training package.

    The study maximizes the ``ariel_score`` metric. Its single top-level
    parameter is ``batch_size``; the parameters ``H1``, ``H2``, ``H3``, ``D1``,
    ``mean``, ``std`` and ``lr`` are attached to it as conditional parameters
    that are active for every ``batch_size`` value.

    Args:
        project: Project id used to build the parent resource name.
        display_name: Display name of the tuning job.
        executor_image_uri: Container image that executes the Python package.
        package_uri: Cloud Storage URI passed as the only entry of
            ``package_uris``.
            NOTE(review): this presumably has to point at a packaged archive
            file rather than a bare bucket/directory -- confirm against the
            Vertex AI documentation.
        python_module: Module inside the package to run.
        location: Region used to build the parent resource name.
        api_endpoint: Regional API endpoint the client connects to.

    Returns:
        The response returned by ``create_hyperparameter_tuning_job``.
    """
    client = aiplatform.gapic.JobServiceClient(
        client_options={"api_endpoint": api_endpoint}
    )

    metric = {
        "metric_id": "ariel_score",
        "goal": aiplatform.gapic.StudySpec.MetricSpec.GoalType.MAXIMIZE,
    }

    batch_sizes = [10, 25, 50, 100]

    # (parameter_id, value-spec field, value spec) for every conditional child
    # of ``batch_size``; the order matches the order sent to the service.
    child_definitions = [
        ("H1", "discrete_value_spec", {"values": [4, 8, 16, 32, 64, 128, 256, 512, 1024]}),
        ("H2", "discrete_value_spec", {"values": [64, 128, 256, 512, 1024]}),
        ("H3", "discrete_value_spec", {"values": [4, 8, 16, 32, 64, 128, 256, 512, 1024]}),
        ("D1", "double_value_spec", {"min_value": 0.01, "max_value": 0.5}),
        ("mean", "discrete_value_spec", {"values": [0., 1.]}),
        ("std", "double_value_spec", {"min_value": 0.005, "max_value": 0.5}),
        (
            "lr",
            "discrete_value_spec",
            {"values": [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3]},
        ),
    ]

    parameter = {
        "parameter_id": "batch_size",
        "discrete_value_spec": {"values": list(batch_sizes)},
        "scale_type": aiplatform.gapic.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE,
        "conditional_parameter_specs": [
            _conditional_parameter_spec(param_id, spec_key, spec, batch_sizes)
            for param_id, spec_key, spec in child_definitions
        ],
    }

    # Trial job spec
    machine_spec = {
        "machine_type": "e2-standard-4",
    }
    worker_pool_spec = {
        "machine_spec": machine_spec,
        "replica_count": 1,
        "python_package_spec": {
            "executor_image_uri": executor_image_uri,
            "package_uris": [package_uri],
            "python_module": python_module,
            "args": [],
        },
    }

    # hparam tuning job
    hyperparameter_tuning_job = {
        "display_name": display_name,
        "max_trial_count": 2,
        "parallel_trial_count": 2,
        "study_spec": {
            "metrics": [metric],
            "parameters": [parameter],
        },
        "trial_job_spec": {"worker_pool_specs": [worker_pool_spec]},
    }

    parent = f"projects/{project}/locations/{location}"
    response = client.create_hyperparameter_tuning_job(
        parent=parent, hyperparameter_tuning_job=hyperparameter_tuning_job
    )
    # Plain string: the original f-string had no placeholders.
    print("response:", response)
    # Hand the response back so callers can inspect the created job.
    return response


# Script entry point: submit the tuning job using the function's default arguments.
if __name__ == "__main__":
    create_hyperparameter_tuning_job_python_package()

提前致谢。

google-cloud-ml
3个回答
1
投票

us-central1 的端点似乎遇到了问题。解决方法是使用另一个端点,例如 us-east1,问题就解决了。


0
投票

您的 `package_uri` 不正确。它应该指向包含 Python 包的文件(即包含所有代码的 tar.bz 文件),而不是目录或存储桶。


0
投票

我也遇到了同样的错误。由于错误信息不够明确,我花了很多时间调试。事实证明,服务帐户需要有足够的权限来读取 Artifact Registry 标签,即需要被授予 `roles/artifactregistry.reader` 角色。

这个 github 问题有帮助:https://github.com/googleapis/python-aiplatform/issues/2181#issuecomment-1621594750

© www.soinside.com 2019 - 2024. All rights reserved.