我正在 aws-sagemaker 容器中部署我的 pytorch 模型并使用 Gunicorn 服务器进行推理
我的导入（import）如下
依赖版本
matplotlib = 3.0;
pandas = 2.0;
PyTorch = 2.0.1;
tqdm = 4.0;
cuda = 11.7
import logging
import os
import shutil
import pandas as pd
import torch
from time import perf_counter
import json
# Disable sendfile-based fast copy in shutil (falls back to plain
# read/write copy) — works around copy failures seen on some container
# filesystems. NOTE(review): this touches a private shutil attribute.
shutil._USE_CP_SENDFILE = False
# Pin this process to the first GPU; must be set before CUDA initializes.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch import cuda
from awsme import create_cloud_watch
from pandarallel import pandarallel
# Start pandarallel workers once at import time; progress bars are
# suppressed so they do not spam the server logs.
pandarallel.initialize(progress_bar=False)
# Inference device: use the GPU when visible, otherwise fall back to CPU.
device = 'cuda' if cuda.is_available() else 'cpu'
def invoke(request):
    """Score a batch of records posted to the SageMaker /invocations endpoint.

    Parses the ``"title"`` payload into a DataFrame, runs it through the
    relevance model via a DataLoader, and attaches the predicted scores
    as a ``"score"`` column.

    Args:
        request: Decoded JSON request body; expected to carry a "title"
            key holding a list of records (schema assumed from the
            caller — TODO confirm).
    """
    import io  # local import: only needed for the read_json shim below

    logger.info('Rawinput-_requ: {}'.format(request))
    start_secs = perf_counter()
    request = request["title"]
    batch_size = 10
    num_workers = 2
    # FIX: Gunicorn workers and DataLoader workers both rely on
    # multiprocessing. The default "file_descriptor" tensor-sharing
    # strategy exchanges fds over a local socket, which fails under
    # Gunicorn with BlockingIOError [Errno 11] when num_workers > 0.
    # Sharing tensors via the file system skips the fd handshake.
    torch.multiprocessing.set_sharing_strategy('file_system')
    logger.info('Request: {}'.format(request))
    # pandas 2.x deprecates passing a literal JSON string; wrap in StringIO.
    df_test = pd.read_json(io.StringIO(json.dumps(request)))
    # Dummy label column: the dataset requires one even at inference time.
    df_test['standardRelevancy'] = 1
    test_dataset = RelevanceDataset(df_test,
                                    tokenizer=tokenizer,
                                    label_column="standardRelevancy",
                                    max_length=100,
                                    do_lower_case=True)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=batch_size,
                                 shuffle=False,
                                 num_workers=num_workers,
                                 pin_memory=True)
    emb_scores_list, labels_list = run_eval_epoch(test_dataloader)
    df_test["score"] = emb_scores_list
`invoke` 会调用下面的方法
def run_eval_epoch(dataloader):
    """Run one inference pass over ``dataloader`` and collect scores.

    Args:
        dataloader: Yields dicts of tensors accepted by ``model`` (must
            include a "labels" key).

    Returns:
        tuple[list[float], list]: per-example sigmoid score taken from
        the last logit column, and the corresponding labels.
    """
    emb_scores_list = []
    labels_list = []
    model.eval()  # disable dropout / batch-norm updates for inference
    with torch.no_grad():  # no autograd bookkeeping needed at eval time
        for batch_data in tqdm(dataloader):
            # Move every tensor in the batch to the inference device.
            batch_data = {x: y.to(device) for x, y in batch_data.items()}
            outputs = model(**batch_data, return_dict=True)
            # Element-wise sigmoid (the original name said "softmax" but
            # the op was always sigmoid); keep only the last logit column.
            batch_scores = torch.sigmoid(outputs.logits)
            emb_scores_list.extend(batch_scores[:, -1].cpu().numpy().tolist())
            labels_list.extend(batch_data["labels"].cpu().numpy().tolist())
            del outputs  # release GPU memory eagerly between batches
    return emb_scores_list, labels_list
当我尝试调用
def invoke(request):
时,我看到以下异常
File "/opt/jk/python3.8/lib/python3.8/multiprocessing/resource_sharer.py", line 142, in _serve
with self._listener.accept() as conn:
File "/opt/jk/python3.8/lib/python3.8/multiprocessing/connection.py", line 465, in accept
deliver_challenge(c, self._authkey)
File "/opt/jk/python3.8/lib/python3.8/multiprocessing/connection.py", line 740, in deliver_challenge
response = connection.recv_bytes(256) # reject large message
File "/opt/jk/python3.8/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/opt/jk/python3.8/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/opt/jk/python3.8/lib/python3.8/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
BlockingIOError: [Errno 11] Resource temporarily unavailable
0%| | 0/1 [00:00<?, ?it/s]
ERROR 2023-10-18 08:00:01,626: invoke:152: Error occurred
Traceback (most recent call last):
File "/opt/jk/lib/python3.8/site-packages/jk_test_jk_dummy_model_deploy1/service.py", line 142, in invoke
emb_scores_list, labels_list = run_eval_epoch(test_dataloader)
File "/opt/jk/lib/python3.8/site-packages/jk_test_jk_dummy_model_deploy1/service.py", line 93, in run_eval_epoch
for i, batch_data in enumerate(tqdm(dataloader)):
File "/opt/jk/lib/python3.8/site-packages/tqdm/std.py", line 1195, in __iter__
for obj in iterable:
File "/opt/jk/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 633, in __next__
data = self._next_data()
File "/opt/jk/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1328, in _next_data
idx, data = self._get_data()
File "/opt/jk/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1294, in _get_data
success, data = self._try_get_data()
File "/opt/jk/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1132, in _try_get_data
data = self._data_queue.get(timeout=timeout)
File "/opt/jk/python3.8/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/opt/jk/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 307, in rebuild_storage_fd
fd = df.detach()
File "/opt/jk/python3.8/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/opt/jk/python3.8/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/opt/jk/python3.8/lib/python3.8/multiprocessing/connection.py", line 508, in Client
answer_challenge(c, authkey)
File "/opt/jk/python3.8/lib/python3.8/multiprocessing/connection.py", line 752, in answer_challenge
message = connection.recv_bytes(256) # reject large message
File "/opt/jk/python3.8/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/opt/jk/python3.8/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/opt/jk/python3.8/lib/python3.8/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
BlockingIOError: [Errno 11] Resource temporarily unavailable
240.10.1.5 - - [18/Oct/2023:08:00:01 +0000] "POST /invocations HTTP/1.1" 200 4 "-" "python-requests/2.31.0"
当我设置
num_workers=0
时，我得到的结果没有任何问题。如果我设置 num_workers > 0，
然后我看到了上面的问题
此外，当我在 Jupyter Notebook 中运行上面的代码时，我没有看到这个问题。
当我部署在gunicorn 内部时,就会发生这种情况。不幸的是我无法控制gunicorn设置。我只能控制 pytorch 相关的东西
我尝试了大多数互联网解决方案,例如
torch.set_num_threads(1)
set_start_method('spawn')
等等
有关此问题的任何指示都会有帮助
谢谢 Jk
Google Bard 帮助解决了这个问题 :)
根据 Bard 的说法，这是因为
是的，PyTorch 数据加载器（DataLoader）与 Gunicorn 之间存在已知的兼容性问题。这个问题是由 Gunicorn 处理多进程（multiprocessing）的方式引起的。
Gunicorn 使用工作进程模型来处理请求。每个工作进程都有自己的 Python 解释器和 PyTorch 库的副本。当在某个工作进程中创建数据加载器时，它会尝试与其他工作进程共享其数据。这可能会导致 BlockingIOError: [Errno 11] 资源暂时不可用 错误。
我们可以逐一尝试 Bard 给出的多个解决方案。对我有用的最简单的一个是设置此参数
torch.multiprocessing.set_sharing_strategy('file_system')