集群信息:
Kubernetes version:
root@k8s-eu-1-master:~# kubectl version
Client Version: v1.28.2
Kustomize Version: v5.0.4-0.20230601165947-6ce0bf390ce3
Server Version: v1.28.2
使用的云:Contabo Cloud(裸机)
安装方法:按照以下教程中的步骤安装:https://www.linuxtechi.com/install-kubernetes-on-ubuntu-22-04/?utm_content=cmp-true
主机操作系统:Ubuntu 22.04
CNI 和版本:
root@k8s-eu-1-master:~# ls /etc/cni/net.d/
10-flannel.conflist
root@k8s-eu-1-master:~# cat /etc/cni/net.d/10-flannel.conflist
{
"name": "cbr0",
"cniVersion": "0.3.1",
"plugins": [
{
"type": "flannel",
"delegate": {
"hairpinMode": true,
"isDefaultGateway": true
}
},
{
"type": "portmap",
"capabilities": {
"portMappings": true
}
}
]
}
CRI 和版本:
Container Runtime : containerd
root@k8s-eu-1-master:~# cat /etc/containerd/config.toml | grep version
version = 2
Pod 在 Running 状态和 CrashLoopBackOff 状态之间来回切换:
root@k8s-eu-1-master:~# kubectl get pods -n kube-system
NAME READY STATUS RESTARTS AGE
coredns-5dd5756b68-g2bkc 1/1 Running 0 2d4h
coredns-5dd5756b68-gt7xt 1/1 Running 0 2d4h
etcd-k8s-eu-1-master 1/1 Running 1 (2d2h ago) 2d4h
kube-apiserver-k8s-eu-1-master 1/1 Running 1 (2d2h ago) 2d4h
kube-controller-manager-k8s-eu-1-master 1/1 Running 1 (2d2h ago) 2d4h
kube-proxy-7mj86 1/1 Running 1 (2d2h ago) 2d4h
kube-proxy-7nvv5 1/1 Running 1 (2d2h ago) 2d3h
kube-proxy-fq6vz 1/1 Running 1 (2d2h ago) 2d4h
kube-proxy-n2nm5 1/1 Running 1 (2d2h ago) 2d3h
kube-proxy-qhvrn 1/1 Running 1 (2d2h ago) 2d4h
kube-proxy-tbrn4 1/1 Running 1 (2d2h ago) 2d3h
kube-scheduler-k8s-eu-1-master 1/1 Running 1 (2d2h ago) 2d4h
root@k8s-eu-1-master:~# kubectl get pods
NAME READY STATUS RESTARTS AGE
arango-deployment-operator-7f59876f78-7djdr 0/1 CrashLoopBackOff 87 (11s ago) 4h58m
arango-storage-operator-6c7fdf5586-gjcrp 0/1 CrashLoopBackOff 83 (98s ago) 4h44m
root@k8s-eu-1-master:~# kubectl describe pod arango-deployment-operator-7f59876f78-7djdr
Name: arango-deployment-operator-7f59876f78-7djdr
Namespace: default
Priority: 0
Service Account: arango-deployment-operator
Node: k8s-eu-1-worker-2/xx.xxx.xxx.xxx
Start Time: Thu, 19 Oct 2023 12:56:41 +0200
Labels: app.kubernetes.io/instance=deployment
app.kubernetes.io/managed-by=Tiller
app.kubernetes.io/name=kube-arangodb
helm.sh/chart=kube-arangodb-1.2.34
pod-template-hash=7f59876f78
release=deployment
Annotations: <none>
Status: Running
IP: 10.244.0.6
IPs:
IP: 10.244.0.6
Controlled By: ReplicaSet/arango-deployment-operator-7f59876f78
Containers:
operator:
Container ID: containerd://344e2967054112557a9333332f99a8ca1dc3312285c808c727de6468f8c73381
Image: arangodb/kube-arangodb:1.2.34
Image ID: docker.io/arangodb/kube-arangodb@sha256:a25d031e87ba5b0f3038ce9f346553b69760a3a065fe608727cde188602b59e8
Port: 8528/TCP
Host Port: 0/TCP
Args:
--scope=legacy
--operator.deployment
--mode.single
--chaos.allowed=false
--log.level=debug
State: Waiting
Reason: CrashLoopBackOff
Last State: Terminated
Reason: Error
Exit Code: 137
Started: Thu, 19 Oct 2023 17:39:23 +0200
Finished: Thu, 19 Oct 2023 17:40:22 +0200
Ready: False
Restart Count: 83
Liveness: http-get https://:8528/health delay=5s timeout=1s period=10s #success=1 #failure=3
Readiness: http-get https://:8528/ready delay=5s timeout=1s period=10s #success=1 #failure=3
Environment:
MY_POD_NAMESPACE: default (v1:metadata.namespace)
MY_POD_NAME: arango-deployment-operator-7f59876f78-7djdr (v1:metadata.name)
MY_POD_IP: (v1:status.podIP)
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-g4fbd (ro)
Conditions:
Type Status
Initialized True
Ready False
ContainersReady False
PodScheduled True
Volumes:
kube-api-access-g4fbd:
Type: Projected (a volume that contains injected data from multiple sources)
TokenExpirationSeconds: 3607
ConfigMapName: kube-root-ca.crt
ConfigMapOptional: <nil>
DownwardAPI: true
QoS Class: BestEffort
Node-Selectors: <none>
Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 5s
node.kubernetes.io/unreachable:NoExecute op=Exists for 5s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Warning Unhealthy 48m (x215 over 4h48m) kubelet Liveness probe failed: Get "https://10.244.0.6:8528/health": dial tcp 10.244.0.6:8528: connect: connection refused
Normal Pulling 28m (x77 over 4h48m) kubelet Pulling image "arangodb/kube-arangodb:1.2.34"
Warning Unhealthy 13m (x565 over 4h48m) kubelet Readiness probe failed: Get "https://10.244.0.6:8528/ready": dial tcp 10.244.0.6:8528: connect: connection refused
Warning BackOff 3m28s (x968 over 4h42m) kubelet Back-off restarting failed container operator in pod arango-deployment-operator-7f59876f78-7djdr_default(d1d6ec8e-b413-4ab8-84d7-8f6686cd3a8a)
root@k8s-eu-1-master:~# kubectl logs arango-deployment-operator-7f59876f78-7djdr
2023-10-19T15:45:24Z INF nice to meet you operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature agency-poll (deployment.feature.agency-poll) is disabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature deployment-spec-defaults-restore (deployment.feature.deployment-spec-defaults-restore) is enabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature encryption-rotation (deployment.feature.encryption-rotation) is disabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature enforced-resign-leadership (deployment.feature.enforced-resign-leadership) is enabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature ephemeral-volumes (deployment.feature.ephemeral-volumes) is disabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature failover-leadership (deployment.feature.failover-leadership) is disabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature force-rebuild-out-synced-shards (deployment.feature.force-rebuild-out-synced-shards) is disabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature graceful-shutdown (deployment.feature.graceful-shutdown) is enabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature init-containers-copy-resources (deployment.feature.init-containers-copy-resources) is enabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature jwt-rotation (deployment.feature.jwt-rotation) is enabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature local-storage.pass-reclaim-policy (deployment.feature.local-storage.pass-reclaim-policy) is disabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature local-volume-replacement-check (deployment.feature.local-volume-replacement-check) is disabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature maintenance (deployment.feature.maintenance) is enabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature metrics-exporter (deployment.feature.metrics-exporter) is enabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature optional-graceful-shutdown (deployment.feature.optional-graceful-shutdown) is disabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature random-pod-names (deployment.feature.random-pod-names) is disabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature rebalancer-v2 (deployment.feature.rebalancer-v2) is disabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature restart-policy-always (deployment.feature.restart-policy-always) is disabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature secured-containers (deployment.feature.secured-containers) is disabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature sensitive-information-protection (deployment.feature.sensitive-information-protection) is disabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature short-pod-names (deployment.feature.short-pod-names) is disabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature timezone-management (deployment.feature.timezone-management) is disabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature tls-rotation (deployment.feature.tls-rotation) is enabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature tls-sni (deployment.feature.tls-sni) is enabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature upgrade-version-check (deployment.feature.upgrade-version-check) is enabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature upgrade-version-check-v2 (deployment.feature.upgrade-version-check-v2) is disabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Operator Feature version.3-10 (deployment.feature.version.3-10) is disabled. operator-id=7djdr
2023-10-19T15:45:24Z INF Starting arangodb-operator (Community), version 1.2.34 build 05e58812 operator-id=7djdr pod-name=arango-deployment-operator-7f59876f78-7djdr pod-namespace=default
2023-10-19T15:45:54Z INF Get Operations is not allowed. Continue crd=arangojobs.apps.arangodb.com operator-id=7djdr
为什么这两个 Arango Pod 的状态会突然从 Running 变为 CrashLoopBackOff?
root@k8s-eu-1-master:~# kubectl get pods
NAME READY STATUS RESTARTS AGE
arango-deployment-operator-7f59876f78-7djdr 0/1 CrashLoopBackOff 87 (100s ago) 4h59m
arango-storage-operator-6c7fdf5586-gjcrp 0/1 CrashLoopBackOff 83 (3m7s ago) 4h45m
root@k8s-eu-1-master:~#
root@k8s-eu-1-master:~# kubectl get pods
NAME READY STATUS RESTARTS AGE
arango-deployment-operator-7f59876f78-7djdr 0/1 CrashLoopBackOff 89 (4m47s ago) 5h9m
arango-storage-operator-6c7fdf5586-gjcrp 0/1 Running 86 (6m4s ago) 4h55m
root@k8s-eu-1-master:~#
如何让它们恢复正常工作?如何防止这种“突然”的状态变化再次发生?
从 Pod 描述中可以看到,容器已被终止,退出代码为 137(即 128 + 9,表示进程被 SIGKILL 信号强制终止)。这通常意味着容器没有获得启动和运行所需的足够内存。
A 137 code is issued when a process is terminated externally because of its memory consumption. The operating system's out of memory manager (OOM) intervenes to stop the program before it destabilizes the host. Pods running in Kubernetes will show a status of OOMKilled when they encounter a 137 exit code.
要解决这个问题,我建议您为容器配置资源请求和限制(resource requests and limits)。