为什么我的 ecs 任务无法从 docker hub 拉取容器?

问题描述 投票:0回答:1

我正在尝试使用 terraform 在 ecs fargate 上部署 mlflow。我正在尝试设置一个带有 nat 网关的私有子网,以便 ecs 任务能够从 docker hub 中提取 mlflow 映像。这是我的地形:

    data "aws_region" "current" {}

resource "aws_iam_role" "ecs_task" {
  name = "mlflow-dev-ecs-task"
  tags = local.tags

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Principal = {
          Service = "ecs-tasks.amazonaws.com"
        }
        Effect = "Allow"
      },
    ]
  })
}

resource "aws_iam_role" "ecs_execution" {
  name = "mlflow-dev-ecs-execution"
  tags = local.tags

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Principal = {
          Service = "ecs-tasks.amazonaws.com"
        }
        Effect = "Allow"
      },
    ]
  })
}

resource "aws_iam_role_policy_attachment" "ecs_execution" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
  role       = aws_iam_role.ecs_execution.name
}

resource "aws_security_group" "ecs_service" {
  name = "mlflow-dev-ecs-service"
  tags = local.tags

  vpc_id = "vpc-XXXXXXXX"

  ingress {
    from_port       = local.service_port
    to_port         = local.service_port
    protocol        = "tcp"
    security_groups = [aws_security_group.lb.id]
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }
}

resource "aws_cloudwatch_log_group" "mlflow" {
  name              = "/aws/ecs/mlflow-dev"
  retention_in_days = 90
  tags              = local.tags
}

resource "aws_ecs_cluster" "mlflow" {
  name = "mlflow-dev"
  tags = local.tags
}

resource "aws_ecs_task_definition" "mlflow" {
  family = "mlflow-dev"
  tags   = local.tags
  container_definitions = jsonencode(concat([
    {
      name      = "mlflow"
      image     = "ghcr.io/mlflow/mlflow"
      essential = true

      # As of version 1.9.1, MLflow doesn't support specifying the backend store uri as an environment variable. ECS doesn't allow evaluating secret environment variables from within the command. Therefore, we are forced to override the entrypoint and assume the docker image has a shell we can use to interpolate the secret at runtime.
      entryPoint = ["sh", "-c"]
      command = [
        "/bin/sh -c \"mlflow server --host=0.0.0.0 --port=${local.service_port} --default-artifact-root=s3://${local.artifact_bucket_id}/ --backend-store-uri=mysql+pymysql://${aws_rds_cluster.backend_store.master_username}:`echo -n $DB_PASSWORD`@${aws_rds_cluster.backend_store.endpoint}:${aws_rds_cluster.backend_store.port}/${aws_rds_cluster.backend_store.database_name} --gunicorn-opts '' \""
      ]
      portMappings = [{ containerPort = local.service_port }]
      secrets = [
        {
          name      = "DB_PASSWORD"
          valueFrom = aws_secretsmanager_secret.db_password.arn
        },
      ]
      logConfiguration = {
        logDriver     = "awslogs"
        secretOptions = null
        options = {
          "awslogs-group"         = aws_cloudwatch_log_group.mlflow.name
          "awslogs-region"        = data.aws_region.current.name
          "awslogs-stream-prefix" = "cis"
        }
      }
    },
  ], []))

  network_mode             = "awsvpc"
  task_role_arn            = aws_iam_role.ecs_task.arn
  execution_role_arn       = aws_iam_role.ecs_execution.arn
  requires_compatibilities = ["FARGATE"]
  # is this overkill?
  cpu                      = 2048
  memory                   = 4096
}

resource "aws_subnet" "mlflow-dev-service-subnet" {
  vpc_id                  = "vpc-XXXXXXXX"
  cidr_block              = "XXXXXXXX"  
  map_public_ip_on_launch = "false" //it makes this a public subnet
  availability_zone       = "eu-west-1a"
  tags = {
    Name = "mlflow-dev-service-subnet"
  }
}

resource "aws_ecs_service" "mlflow" {
  name             = "mlflow-dev"
  cluster          = aws_ecs_cluster.mlflow.id
  task_definition  = aws_ecs_task_definition.mlflow.arn
  desired_count    = 2
  launch_type      = "FARGATE"
  platform_version = "1.4.0"


  network_configuration {
    subnets         = [aws_subnet.mlflow-dev-service-subnet.id]
    security_groups = [aws_security_group.ecs_service.id]
    assign_public_ip = false
  }

  load_balancer {
    target_group_arn = aws_lb_target_group.mlflow.arn
    container_name   = "mlflow"
    container_port   = local.service_port
  }

  lifecycle {
    ignore_changes = [desired_count]
  }

  depends_on = [
    aws_lb.mlflow,
  ]
}

resource "aws_appautoscaling_target" "mlflow" {
  service_namespace  = "ecs"
  resource_id        = "service/${aws_ecs_cluster.mlflow.name}/${aws_ecs_service.mlflow.name}"
  scalable_dimension = "ecs:service:DesiredCount"
  max_capacity       = 2
  min_capacity       = 2
}

resource "aws_security_group" "lb" {
  name   = "mlflow-dev-lb"
  tags   = local.tags
  vpc_id = "vpc-XXXXXXXXXX"
}

resource "aws_security_group_rule" "lb_ingress_http" {
  description       = "Only allow load balancer to reach the ECS service on the right port"
  type              = "ingress"
  from_port         = 80
  to_port           = 80
  protocol          = "tcp"
  # should be cidr range of the vpc
  # vpc.vpc_cidr_block
  cidr_blocks       = ["XXX.XX.0.0/16"] # cidr block of mlflow-dev-service-subnet
  security_group_id = aws_security_group.lb.id
}

resource "aws_security_group_rule" "lb_ingress_https" {
  description       = "Only allow load balancer to reach the ECS service on the right port"
  type              = "ingress"
  from_port         = 443
  to_port           = 443
  protocol          = "tcp"
  cidr_blocks       = ["XXX.XX.0.0/16"] # cidr block of mlflow-dev-service-subnet
  security_group_id = aws_security_group.lb.id
}

resource "aws_security_group_rule" "lb_egress" {
  description              = "Only allow load balancer to reach the ECS service on the right port"
  type                     = "egress"
  from_port                = local.service_port
  to_port                  = local.service_port
  protocol                 = "tcp"
  source_security_group_id = aws_security_group.ecs_service.id
  security_group_id        = aws_security_group.lb.id
}

resource "aws_lb" "mlflow" {
  name               = "mlflow-dev"
  tags               = local.tags
  internal           = true
  load_balancer_type = "application"
  security_groups    = [aws_security_group.lb.id]
  subnets            = [aws_subnet.mlflow-dev-service-subnet.id, "subnet-0ae9eae7be10c1603"]
}

resource "aws_lb_target_group" "mlflow" {
  name        = "mlflow-dev"
  port        = local.service_port
  protocol    = "HTTP"
  vpc_id      = "vpc-XXXXXXXXX"
  target_type = "ip"

  health_check {
    protocol = "HTTP"
    matcher  = "200-202"
    path     = "/health"
  }
}

resource "aws_lb_listener" "http" {
  load_balancer_arn = aws_lb.mlflow.arn
  port              = "80"
  protocol          = "HTTP"

  default_action {
    target_group_arn = aws_lb_target_group.mlflow.id
    type             = "forward"
  }
}

resource "aws_eip" "nat_gateway" {
  vpc = true
}

resource "aws_nat_gateway" "nat_gateway" {
  allocation_id = aws_eip.nat_gateway.id
  subnet_id = aws_subnet.mlflow-dev-service-subnet.id
  tags = {
    "Name" = "DevNatGateway"
  }
}

output "nat_gateway_ip" {
  value = aws_eip.nat_gateway.public_ip
}

resource "aws_route_table" "instance" {
  vpc_id = "vpc-XXXXXXXXX"
  route {
    cidr_block = "0.0.0.0/0"
    nat_gateway_id = aws_nat_gateway.nat_gateway.id
  }
}

resource "aws_route_table_association" "instance" {
  subnet_id = aws_subnet.mlflow-dev-service-subnet.id
  route_table_id = aws_route_table.instance.id
}

每次部署上述内容时,ecs 任务都会抛出以下错误:

CannotPullContainerError: pull image manifest has been retried 5 time(s): failed to resolve ref ghcr.io/mlflow/mlflow:latest: failed to do request: Head "https://ghcr.io/v2/mlflow/mlflow/manifests/latest": dial tcp XXX.XX.XXX.34:443: i/o timeout

我尝试按照此示例设置 NAT 网关(https://dev.betterdoc.org/infrastruct/2020/02/04/setting-up-a-nat-gateway-on-aws-using -terraform.html),我不知道哪里出了问题。任何帮助将不胜感激。

terraform amazon-ecs amazon-vpc aws-nat-gateway
1个回答
0
投票

您正在创建一个私有子网来部署 ECS 服务,然后在同一子网内创建 NAT 网关。 NAT 网关不能在私有子网中工作。 NAT 网关本身必须位于公共子网(具有到 Internet 网关的路由的子网)中。您的 NAT 网关当前没有通往 Internet 的路由,因此无法将任何传出流量转发到 Internet。

您的配置需要是:

  • 具有到 Internet 网关的路由的公共子网
  • 公有子网内的 NAT 网关
  • 具有到 NAT 网关的路由的私有子网
  • 部署在私有子网中的ECS服务
© www.soinside.com 2019 - 2024. All rights reserved.