将应用部署到 GKE 时出现 ImagePullBackOff 错误

问题描述 投票:0回答:1

我正在尝试使用 Helm 将 Docker 化的 Python 应用程序部署到 GKE,镜像托管在 GCP Artifact Registry 上。我遇到了这个错误:

rpc error: code = Unknown desc = failed to pull and unpack image "us-central1-docker.pkg.dev/myapp-dev-411610/myapp-repo/myapp-app:latest": failed to resolve reference "us-central1-docker.pkg.dev/myapp-dev-xxxxxx/myapp-repo/myapp-app:latest": failed to authorize: failed to fetch oauth token: unexpected status from GET request to https://us-central1-docker.pkg.dev/v2/token?scope=repository%3Amyapp-dev-xxxxxx%2Fmyapp-repo%2Fmyapp-app%3Apull&service=us-central1-docker.pkg.dev: 403 Forbidden

我的 Terraform 配置基于 https://antonputra.com/google/create-gke-cluster-using-terraform/ 如下:

# GKE cluster: private, VPC-native, with Workload Identity enabled.
# The default node pool is removed so node pools are managed as separate
# resources below (the two google_container_node_pool resources).
resource "google_container_cluster" "primary" {
    name = "primary"
    location = "us-central1-b" # region or availability zone. Should be a region for prod
    remove_default_node_pool = true
    initial_node_count = 1
    network = google_compute_network.main.self_link
    subnetwork = google_compute_subnetwork.private.self_link
    logging_service = "logging.googleapis.com/kubernetes"
    monitoring_service = "monitoring.googleapis.com/kubernetes"
    networking_mode = "VPC_NATIVE"

    addons_config {
        http_load_balancing {
            disabled = true
        }
        horizontal_pod_autoscaling {
            disabled = false
        }
    }

    release_channel {
        channel = "REGULAR"
    }

    # Workload Identity pool for the project; lets k8s service accounts
    # impersonate GCP service accounts without exported keys.
    workload_identity_config {
        workload_pool = "myapp-dev-xxxxxx.svc.id.goog"
    }

    # Secondary ranges must exist on the subnetwork referenced above.
    ip_allocation_policy {
        cluster_secondary_range_name = "k8s-pod-range"
        services_secondary_range_name = "k8s-service-range"
    }

    # Private nodes (no public IPs) but a public control-plane endpoint.
    # NOTE(review): private nodes reach Artifact Registry via Private Google
    # Access / Cloud NAT — confirm one of those is configured on the subnet.
    private_cluster_config {
        enable_private_nodes = true
        enable_private_endpoint = false
        master_ipv4_cidr_block = "172.16.0.0/28"
    }
}

# Service account attached to both GKE node pools. The kubelet pulls
# container images using THIS identity, so it needs
# roles/artifactregistry.reader on the project (see the answer's fix below).
resource "google_service_account" "kubernetes" {
    account_id = "kubernetes"
}

# On-demand ("general") node pool for workloads that must not be preempted.
resource "google_container_node_pool" "general" {
    name = "general"
    cluster = google_container_cluster.primary.id
    node_count = 1

    management {
        auto_repair = true
        auto_upgrade = true

    }

    node_config {
        preemptible = false
        machine_type = "e2-small"

        labels = {
            role = "general"
        }

        # Nodes run as this service account. The broad cloud-platform scope
        # delegates all authorization decisions to IAM, so the account itself
        # must hold the roles needed (e.g. artifactregistry.reader for pulls).
        service_account = google_service_account.kubernetes.email
        oauth_scopes = [
            "https://www.googleapis.com/auth/cloud-platform"
        ]
    }
}

# Preemptible ("spot") node pool, autoscaled 0-10, tainted so only workloads
# that explicitly tolerate preemption are scheduled here.
resource "google_container_node_pool" "spot" {
    name = "spot"
    cluster = google_container_cluster.primary.id
    node_count = 1

    management {
        auto_repair = true
        auto_upgrade = true

    }

    autoscaling {
        min_node_count = 0
        max_node_count = 10
    }

    node_config {
        preemptible = true # Do not use for prod pool - can be taken offline by GCP at any time
        machine_type = "e2-small"
        
        labels = {
            team = "devops"
        }

        # Pods must carry a matching toleration to land on this pool.
        taint {
            key = "instance_type"
            value = "spot"
            effect = "NO_SCHEDULE"
        }

        # Same node identity as the general pool; image pulls use this account.
        service_account = google_service_account.kubernetes.email
        oauth_scopes = [
            "https://www.googleapis.com/auth/cloud-platform"
        ]
    }
}

# Dedicated service account whose exported JSON key backs the
# image-pull secret (kubernetes_secret.artifact_registry_json_key).
resource "google_service_account" "artifact_registry_service_account" {
  account_id   = "artifact-registry-svc-acct"
  display_name = "Artifact Registry Service Account"
}

# Grant the pull-secret service account read access to Artifact Registry.
# NOTE(review): this covers pulls done WITH the JSON key; the node pools'
# "kubernetes" service account needs the same role for kubelet pulls.
resource "google_project_iam_member" "artifact_registry_service_account_reader" {
  project = "myapp-dev-xxxxxx"
  role    = "roles/artifactregistry.reader"
  member  = "serviceAccount:${google_service_account.artifact_registry_service_account.email}"
}

# JSON key for the pull-secret service account. The exported `private_key`
# attribute is base64-encoded and must be base64decode()d before use as a
# docker password. The key is stored in Terraform state — treat state as secret.
resource "google_service_account_key" "artifact_registry_service_account_key" {
  service_account_id = google_service_account.artifact_registry_service_account.name
  public_key_type    = "TYPE_X509_PEM_FILE"
}

# Persist the decoded service-account key locally (e.g. for `docker login`).
# Fix: restrict file permissions — local_file defaults to 0777, which leaves
# private key material world-readable. Keep this file out of version control.
resource "local_file" "artifact_registry_service_account_key_file" {
  content         = base64decode(google_service_account_key.artifact_registry_service_account_key.private_key)
  filename        = "${path.module}/artifact-registry-service-account-key.json"
  file_permission = "0600" # owner read/write only
}

# Image-pull secret for Artifact Registry, referenced by imagePullSecrets in
# the Helm values. Two fixes versus the original:
#   1. `type = "kubernetes.io/dockerconfigjson"` — kubelet requires this
#      secret type for imagePullSecrets; an Opaque secret is ignored.
#   2. `private_key` from google_service_account_key is base64-encoded and
#      must be decoded before being used as the docker password / auth
#      string; embedding it raw yields invalid credentials (403 Forbidden).
resource "kubernetes_secret" "artifact_registry_json_key" {
  metadata {
    name = "artifact-registry-json-key"
  }

  type = "kubernetes.io/dockerconfigjson"

  data = {
    ".dockerconfigjson" = jsonencode({
      "auths" = {
        "us-central1-docker.pkg.dev" = {
          "username" = "_json_key"
          "password" = base64decode(google_service_account_key.artifact_registry_service_account_key.private_key)
          "email" = google_service_account.artifact_registry_service_account.email
          "auth" = base64encode("_json_key:${base64decode(google_service_account_key.artifact_registry_service_account_key.private_key)}")
        }
      }
    })
  }
}

# Docker-format Artifact Registry repository hosting the application image
# (us-central1-docker.pkg.dev/<project>/myapp-repo).
resource "google_artifact_registry_repository" "docker_repository" {
  provider = google
  location = "us-central1"
  repository_id = "myapp-repo"
  description = "Docker Repository"
  format = "DOCKER"

  labels = {
    env = "dev"
  }
}

在我的 Helm values.yaml 中我放入了:

# Container image pulled from GCP Artifact Registry.
image:
  repository: us-central1-docker.pkg.dev/myapp-dev-xxxxxx/myapp-repo/myapp-app
  pullPolicy: IfNotPresent
  # Overrides the image tag whose default is the chart appVersion.
  tag: "latest"

# Must match the metadata.name of the kubernetes_secret created in Terraform,
# and that secret must be of type kubernetes.io/dockerconfigjson.
imagePullSecrets:
  - name: artifact-registry-json-key

我可以使用 docker 成功构建镜像并推送到 Artifact Registry,镜像已位于我新创建的仓库中。我可以使用我的主帐户以及 terraform 文件中声明的服务帐户通过 CLI 拉取镜像。如果我尝试通过 UI 部署到 GKE,会遇到同样的错误,所以我不确定为什么即使使用具有默认 Owner 和 Organization Admin 角色的凭据也无法工作。

我的预感是这个 Secret 不知为何没有被使用(我尝试用 kubectl 而不是 Helm 进行部署,得到了相同的错误),或者 docker CLI 与 Kubernetes 拉取镜像的方式存在差异,因此一个可以工作,另一个不行。

难道服务帐户需要额外的权限?但如果是这种情况,为什么我的所有者凭据在通过 UI 部署时不起作用?我正在努力思考可能导致此问题的其他问题。

值得注意的是,我在使用 GCR 时也遇到了同样的问题,但由于其已被弃用而转用 Artifact Registry。尝试在 Cloud Run 上部署时,它能够正常拉取镜像,因此问题似乎与 GKE 有关。

docker kubernetes google-cloud-platform google-kubernetes-engine kubernetes-helm
1个回答
0
投票

使用上面 DazWilkin 的评论解决了这个问题。我将以下内容添加到我的 terraform 配置中:

# The actual fix: grant the NODE service account ("kubernetes", attached to
# both node pools) read access to Artifact Registry. The kubelet pulls images
# with the node's identity, so this binding — not the separate pull-secret
# account — is what resolves the ImagePullBackOff / 403.
# NOTE(review): prefer google_service_account.kubernetes.email over a
# hard-coded address so the binding tracks the resource.
resource "google_project_iam_member" "k8s_account_artifact_registry_reader" {
  project = "myapp-dev-xxxxxx"
  role    = "roles/artifactregistry.reader"
  member  = "serviceAccount:[email protected]"
}
© www.soinside.com 2019 - 2024. All rights reserved.