SageMaker batch transform with my own pre-trained model

Problem description · Votes: 0 · Answers: 2

I'm trying to use a SageMaker batch transform job to run inference for YOLO-NAS, using a pre-trained model with pre-trained weights.

But I'm getting this error:

python3: can't open file '//serve': [Errno 2] No such file or directory

I have no idea what '//serve' is. I never reference or use it anywhere, and I can't find any documentation about it.

Some more details about my case:

  1. The data is just images, each containing a single object (called "crops")

  2. seq-gil-transformer.py sits at s3://benny-test/indexer/yolo-nas/1/code

  3. /opt/ml/processing/input is not the right path. I know that, and I'm still working out which path I should use, but the code crashes before it even gets there...

I have my own custom image, as follows:

FROM python:3.10-bullseye

RUN apt update && \
    apt install ffmpeg libsm6 libxext6 libgl1 -y && \
    rm -rf /var/lib/apt/lists/*

ADD req.txt req.txt

RUN pip3 install -r req.txt

ENTRYPOINT ["python3"]

I have my own inference code, called "seq-gil-transformer.py":

(I know it can be improved; for now I'm just trying to get the logic working and understand SageMaker, and I'll clean up the code afterwards.)

import super_gradients
import torch
from torch import nn
import json
from collections import OrderedDict
from typing import List
from pathlib import Path
import PIL 
import numpy as np
import time
import cProfile
import trace
import io
import psutil
import matplotlib.pyplot as plt
import sys
import pandas as pd
import argparse


# getting images from local path
def get_images(path, device):
    crops_path_list = Path(path).glob("*.png")
    images = []
    for crop in crops_path_list:
        image = PIL.Image.open(str(crop)).convert("RGB")
        image = np.asarray(image)
        image = np.ascontiguousarray(image.transpose(2, 0, 1))
        image  = torch.from_numpy(np.array(image)).unsqueeze(0).to(dtype=torch.float32, device=device)
        images.append(image)
    return images

# save the pre-trained model's backbone locally
# (the file name has to match what load_yolo_nas loads below)
def create_yolo_nas_indexer(device="cpu"):
    yolo_nas = super_gradients.training.models.get("yolo_nas_l", pretrained_weights="coco").to(device)
    torch.save(yolo_nas.backbone, f"yolo_nas_l_{device}.pth.tar")

# load the model.
# uses our own module named 'ModelWrapper';
# you can see it at the end of this post
def load_yolo_nas(device="cpu", output_layers=["stage2", "stage3", "context_module"]):
    yolo_nas = torch.load(f"yolo_nas_l_{device}.pth.tar").to(device)
    wrapped_yolo_nas = ModelWrapper(yolo_nas, device=device).to(device)
    wrapped_yolo_nas.add_output_layers(output_layers)
    return wrapped_yolo_nas

# run inference per image (crop) using the ModelWrapper-wrapped
# YOLO-NAS model from Deci-AI's super-gradients
def primitive_c2v(crop: torch.Tensor, model) -> list:
    output = model(crop)[0]
    return output

if __name__ == "__main__":
    engine = "cpu"
    crops = get_images("/opt/ml/processing/input", engine)
    
    H = W = 64
    C = 3
    
    outputs = []
    create_yolo_nas_indexer(engine)
    model = load_yolo_nas(engine)
    model.eval()
    list_o = []
    for crop in crops:
        start_time = time.time()
        output = primitive_c2v(crop, model)
        output = [l.detach().numpy() for l in output]
        list_o.append(output)

    # print a bit of the output just to be sure
    # everything went well.
    # with my data I get a list of lists of numpy arrays;
    # each inner list has length 4
    for l in list_o:
        print(len(l))
    

The SageMaker code itself is:

import boto3
import sagemaker
from sagemaker import get_execution_role

sagemaker_session = sagemaker.Session() 
role = get_execution_role()
bucket = 'benny-test'

model = sagemaker.model.Model(
    source_dir="s3://benny-test/indexer/yolo-nas/1/code",
    entry_point="seq-gil-transformer.py",
    image_uri='*.dkr.ecr.eu-west-1.amazonaws.com/sagemaker:super-gredients-0.1',
    role=role,
    name="yolo-nas-cpu")

transformer = model.transformer(
    instance_count=1,
    instance_type="ml.m5.4xlarge"
)
transformer.transform(
    data="s3://benny-test/indexer/Raw/dummy_set/images"
)
transformer.wait()

ModelWrapper reference

This code is not required, but I'm putting it here for reference and reproducibility :)

def remove_all_hooks_recursive(model: nn.Module) -> None:
    for name, child in model.named_children():
        if child is not None:
            # independent checks (not elif), so that all three hook dicts are cleared
            if hasattr(child, "_forward_hooks"):
                child._forward_hooks = OrderedDict()
            if hasattr(child, "_forward_pre_hooks"):
                child._forward_pre_hooks = OrderedDict()
            if hasattr(child, "_backward_hooks"):
                child._backward_hooks = OrderedDict()
            remove_all_hooks_recursive(child)

def add_all_modules_to_model_dict_recursive(model, module_dict, prefix=''):
    """Recursively adds all modules in a PyTorch model to a hierarchical dictionary."""
    for name, module in model.named_children():
        full_name = prefix + '.' + name if prefix else name
        full_name = full_name if full_name != "_model" else ""
        module_dict[full_name] = module
        if isinstance(module, nn.Module):
            add_all_modules_to_model_dict_recursive(module, module_dict, full_name)

class StopModel(Exception):
    def __init__(self):
        super().__init__()

def forward_hook(model_wrapper, layer_name, model=None):
    def hook(module, input, output):
        model_wrapper.selected_out[layer_name] = output
        if model is not None:
            _, code = model(output)
            model_wrapper.selected_out[f"code_{layer_name}"] = code
        if model_wrapper.stop_at_last_hook and layer_name == model_wrapper.last_layer:
            raise StopModel()
    return hook

class ModelWrapper(nn.Module):
    def __init__(self, model, stop_at_last_hook=False, device=None):
        super().__init__()
        self.stop_at_last_hook = stop_at_last_hook
        self.model = model
        self.model.eval()
        self.output_layers = []
        self.selected_out = OrderedDict()
        self.fhooks = []
        self.layer_size_dict = {}
        self.layer_stride_dict = {}
        self.model_dict = self.add_all_modules_to_model_dict()
        self.device = device or torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.last_layer = None
    @classmethod
    def from_cfg(cls, cfg_path: str):
        with open(cfg_path, "r") as f:
            cfg_dict = json.load(f)
        return cls(**cfg_dict)
    def add_output_layers(self, output_layers: List[str]):
        self.last_layer = output_layers[-1]
        for output_layer in output_layers:
            if output_layer not in self.model_dict:
                raise ValueError(f"Model does not have layer: {output_layer}")
        self.output_layers = output_layers
        for layer_name, module in self.model_dict.items():
            if layer_name in self.output_layers:
                self.fhooks.append(module.register_forward_hook(forward_hook(self, layer_name)))
        self.compute_output_layer_parameters()
    def compute_output_layer_parameters(self):
        random_input = torch.rand(1, 3, 64, 64) #TODO: CHANGE TO ZEROS and avoid seed usage
        random_input = random_input.to(self.device)
        self.forward(random_input)
        for layer_name, output_value in self.selected_out.items():
            if isinstance(output_value, (list, tuple)):
                self.layer_size_dict[layer_name] = None
                self.layer_stride_dict[layer_name] = None
            else:
                self.layer_size_dict[layer_name] = output_value.shape[1]
                self.layer_stride_dict[layer_name] = int(64 / output_value.shape[2])
    def print_all_modules(self, print_module_str=False):
        for layer_name, module in self.model_dict.items():
            layer_txt = layer_name
            if print_module_str:
                layer_txt += f": {str(module)}"
            if layer_name in self.output_layers:
                layer_txt += " (SET AS AN OUTPUT LAYER)"
            print(layer_txt)
    def forward(self, x):
        # TODO: find a way to run the model only for the selected out
        if self.stop_at_last_hook:
            try:
                self.model(x)
            except Exception as e:
                if not isinstance(e, StopModel):
                    raise e
            out = None
        else:
            out = self.model(x)
        return out, self.selected_out
    def inference(self, image, name):
        bbox_list = self.model.inference(image, name)
        return bbox_list, self.selected_out
    def add_all_modules_to_model_dict(self):
        model_dict = {}
        add_all_modules_to_model_dict_recursive(self.model, model_dict)
        return model_dict
    def remove_all_hooks(self):
        remove_all_hooks_recursive(self.model)
        self.selected_out = OrderedDict()
python machine-learning deep-learning amazon-sagemaker mlops
2 Answers

0 votes

This error message indicates that the python3 command was invoked with a wrong file path rather than the path of your Python script; it is not an error from inside the Python script itself.

For example, if you run the following command in a local environment, you will get exactly the same error message:

$ python3 //serve
python3: can't open file '//serve': [Errno 2] No such file or directory

Please investigate where the python3 command is being executed and check whether the correct Python script path is being passed to it.
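
In this particular setup, the wrong path most likely comes from how SageMaker launches the container: for hosting and batch transform it starts the image with the argument serve (effectively docker run <image> serve). Because the Dockerfile declares ENTRYPOINT ["python3"], that argument is handed to the interpreter as a script path, which reproduces the error exactly (the image name below is a placeholder):

$ docker run <your-image> serve
python3: can't open file '//serve': [Errno 2] No such file or directory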


0 votes

SageMaker batch inference and endpoints work the same way: they expect a web server with GET [ping] and POST [invocations] routes to come up. The problem is that when SageMaker starts, it runs a file named 'serve', which in my case was missing. To be clear, this is as far as I understand it. By "my case" I mean that we don't use an estimator before inference; you can see that I defined my own pre-trained model like this:

model = sagemaker.model.Model(
    image_uri='*.dkr.ecr.eu-west-1.amazonaws.com/sagemaker:yolo-nas-cpu-infra-0.1.28',
    role=role,
    name="yolo-nas-cpu-infra-v0-1-28-dt"+str(date_time_in_numbers),
)

transformer = model.transformer(
    instance_count=1,
    instance_type="ml.m5.4xlarge",
)

transformer.transform(
    data="s3://benny-test/indexer/yolo-nas/sagemaker-bs/",
    content_type="application/json"
)

TL;DR: I used Flask, nginx, and gunicorn, with a serve file that configures and runs nginx and gunicorn.
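
For reference, here is a minimal sketch of the kind of serve program SageMaker expects; the Flask-only setup and the run_inference stub are illustrative assumptions (my real serve file fronts the app with nginx and gunicorn as noted above). SageMaker probes GET /ping for health and posts input batches to POST /invocations on port 8080:

#!/usr/bin/env python3
# serve -- minimal illustrative sketch of the web server SageMaker expects
import flask

app = flask.Flask(__name__)

def run_inference(payload):
    # hypothetical stub: a real version would load the model once and run it
    return '{"status": "ok"}'

@app.route("/ping", methods=["GET"])
def ping():
    # health check: SageMaker waits for a 200 here before sending any data
    return flask.Response(status=200)

@app.route("/invocations", methods=["POST"])
def invocations():
    payload = flask.request.data  # raw request body from the batch transform job
    result = run_inference(payload)
    return flask.Response(response=result, status=200, mimetype="application/json")

if __name__ == "__main__":
    # in production nginx + gunicorn sit in front; Flask's dev server is for illustration only
    app.run(host="0.0.0.0", port=8080)

The Dockerfile also has to make this file what actually runs when the container is started with the serve argument, e.g. by copying it into the image, marking it executable, putting its directory on PATH, and removing the ENTRYPOINT ["python3"] line (or pointing the ENTRYPOINT directly at the serve script).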
