What happened?
When some abnormal situation occurred, the pod got stuck restarting its container, and the container got stuck in a stopping state, for example:
[root@node2 ~]# kubectl get po -n monitoring -owide | grep prometheus-k8s
prometheus-k8s-0 5/6 Running 11 (42h ago) 2d17h 21.100.82.133 node4 <none> <none>
prometheus-k8s-1 5/6 Running 13 (2d2h ago) 2d17h 21.100.89.244 node2 <none> <none>
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Warning Unhealthy 39s (x16114 over 22h) kubelet Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of e8ba4071dd21768653cd400d7921998ed203cf536bc9a8702191fa98f01fa081 is running failed: container process not found
[root@node2 userdata]# crictl ps -a | grep 141d581
141d581cecdfe c7f2a0fdd328b8359731c24c43b7002ed80b517a02f1a99b5eba2145ddb60561 2 days ago Running prometheus
I checked the conmon PID and the container PID; they are both gone:
[root@node2 ~]# cd /var/run/containers/storage/overlay-containers/141d581cecdfef42f66adb6deb51e411572e5a8495a6bebf79e8afe61d81d3af/
[root@node2 141d581cecdfef42f66adb6deb51e411572e5a8495a6bebf79e8afe61d81d3af]# ls
userdata
[root@node2 141d581cecdfef42f66adb6deb51e411572e5a8495a6bebf79e8afe61d81d3af]# cd userdata/
[root@node2 userdata]# ls
config.json conmon-pidfile ctl pidfile run winsz
[root@node2 userdata]# cat conmon-pidfile
524557[root@node2 userdata]#
[root@node2 userdata]# cat pidfile
524599[root@node2 userdata]# ps -ef | grep 524599
root 2385378 2144759 0 14:41 pts/0 00:00:00 grep --color=auto 524599
[root@node2 userdata]# ps -ef | grep 524557
root 2391999 2144759 0 14:41 pts/0 00:00:00 grep --color=auto 524557
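For reference, the manual ps check above can also be done programmatically. Below is a minimal Go sketch (not CRI-O code; the path and PID file names are taken from the transcript above) that uses signal 0 to test whether the PIDs recorded in conmon-pidfile and pidfile still exist:

package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
	"syscall"
)

// pidAlive reads a PID from pidfile and sends signal 0 to it; signal 0
// performs error checking only, and ESRCH means the process no longer exists.
func pidAlive(pidfile string) (bool, error) {
	data, err := os.ReadFile(pidfile)
	if err != nil {
		return false, err
	}
	pid, err := strconv.Atoi(strings.TrimSpace(string(data)))
	if err != nil {
		return false, err
	}
	switch err := syscall.Kill(pid, 0); err {
	case nil:
		return true, nil
	case syscall.ESRCH:
		return false, nil // process is gone
	case syscall.EPERM:
		return true, nil // process exists but we may not signal it
	default:
		return false, err
	}
}

func main() {
	// Per-container userdata directory taken from the transcript above.
	dir := "/var/run/containers/storage/overlay-containers/141d581cecdfef42f66adb6deb51e411572e5a8495a6bebf79e8afe61d81d3af/userdata"
	for _, f := range []string{"conmon-pidfile", "pidfile"} {
		alive, err := pidAlive(dir + "/" + f)
		fmt.Printf("%s: alive=%v err=%v\n", f, alive, err)
	}
}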
But the exit status file has already been written:
[root@node2 userdata]# pwd
/var/lib/containers/storage/overlay-containers/141d581cecdfef42f66adb6deb51e411572e5a8495a6bebf79e8afe61d81d3af/userdata
[root@node2 userdata]#
[root@node2 userdata]# ls
config.json exit state.json
[root@node2 userdata]# cat exit
137[root@node2 userdata]#
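For completeness, the exit file can be read programmatically too; a minimal Go sketch, assuming the path shown in the output above (this is not CRI-O's internal API). The recorded value 137 corresponds to 128 + 9, i.e. the container process was killed with SIGKILL:

package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

func main() {
	// Exit file path taken from the output above; it holds the exit status
	// recorded for this container.
	path := "/var/lib/containers/storage/overlay-containers/141d581cecdfef42f66adb6deb51e411572e5a8495a6bebf79e8afe61d81d3af/userdata/exit"
	data, err := os.ReadFile(path)
	if err != nil {
		fmt.Println("exit file not readable:", err)
		return
	}
	code, err := strconv.Atoi(strings.TrimSpace(string(data)))
	if err != nil {
		fmt.Println("unexpected exit file contents:", string(data))
		return
	}
	fmt.Println("recorded exit code:", code) // 137 = 128 + SIGKILL(9)
}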
Checking the CRI-O logs shows:
Aug 14 13:44:14 node2 crio[2507]: time="2025-08-14 13:44:14.245436869+08:00" level=info msg="Stopping container: 141d581cecdfef42f66adb6deb51e411572e5a8495a6bebf79e8afe61d81d3af (timeout: 600s)" id=32cc7a3b-3139-4279-b5b1-945c6da7820b name=/runtime.v1.RuntimeService/StopContainer
Aug 14 13:56:14 node2 crio[2507]: time="2025-08-14 13:56:14.912372883+08:00" level=info msg="Stopping container: 141d581cecdfef42f66adb6deb51e411572e5a8495a6bebf79e8afe61d81d3af (timeout: 600s)" id=9bd0c820-21f4-4fbb-bd28-1e4a393babbf name=/runtime.v1.RuntimeService/StopContainer
Aug 14 14:08:15 node2 crio[2507]: time="2025-08-14 14:08:15.594914573+08:00" level=info msg="Stopping container: 141d581cecdfef42f66adb6deb51e411572e5a8495a6bebf79e8afe61d81d3af (timeout: 600s)" id=c3b771c4-7d33-42fe-a4be-743bf3d4c089 name=/runtime.v1.RuntimeService/StopContainer
Aug 14 14:20:16 node2 crio[2507]: time="2025-08-14 14:20:16.140494686+08:00" level=info msg="Stopping container: 141d581cecdfef42f66adb6deb51e411572e5a8495a6bebf79e8afe61d81d3af (timeout: 600s)" id=3066933c-4378-4341-a1f6-328d4a9675c6 name=/runtime.v1.RuntimeService/StopContainer
Aug 14 14:32:16 node2 crio[2507]: time="2025-08-14 14:32:16.871985579+08:00" level=info msg="Stopping container: 141d581cecdfef42f66adb6deb51e411572e5a8495a6bebf79e8afe61d81d3af (timeout: 600s)" id=803a7c30-1920-4615-a856-fd3d80526d5a name=/runtime.v1.RuntimeService/StopContainer
Aug 14 14:44:17 node2 crio[2507]: time="2025-08-14 14:44:17.459770555+08:00" level=info msg="Stopping container: 141d581cecdfef42f66adb6deb51e411572e5a8495a6bebf79e8afe61d81d3af (timeout: 600s)" id=06e53805-b139-417f-bf1f-b4acbed4a9dc name=/runtime.v1.RuntimeService/StopContainer
Aug 14 14:56:18 node2 crio[2507]: time="2025-08-14 14:56:18.118468835+08:00" level=info msg="Stopping container: 141d581cecdfef42f66adb6deb51e411572e5a8495a6bebf79e8afe61d81d3af (timeout: 600s)" id=59110570-082c-4838-9e16-294d37564c87 name=/runtime.v1.RuntimeService/StopContainer
Aug 14 15:08:18 node2 crio[2507]: time="2025-08-14 15:08:18.911555124+08:00" level=info msg="Stopping container: 141d581cecdfef42f66adb6deb51e411572e5a8495a6bebf79e8afe61d81d3af (timeout: 600s)" id=57ef935f-431c-49ba-a90a-f29cb80e779d name=/runtime.v1.RuntimeService/StopContainer
Aug 14 15:20:19 node2 crio[2507]: time="2025-08-14 15:20:19.508352107+08:00" level=info msg="Stopping container: 141d581cecdfef42f66adb6deb51e411572e5a8495a6bebf79e8afe61d81d3af (timeout: 600s)" id=d5c68afa-86c5-4246-a622-64c8a7e768f1 name=/runtime.v1.RuntimeService/StopContainer
The container status shows:
{
  "status": {
    "id": "141d581cecdfef42f66adb6deb51e411572e5a8495a6bebf79e8afe61d81d3af",
    "metadata": {
      "attempt": 3,
      "name": "prometheus"
    },
    "state": "CONTAINER_RUNNING",
    "createdAt": "2025-08-12T11:42:09.707991891+08:00",
    "startedAt": "2025-08-12T11:42:09.723073428+08:00",
    "finishedAt": "0001-01-01T00:00:00Z",
    "exitCode": 0,
    "image": {
      "annotations": {},
      "image": "image.xxx.io/xxx@sha256:16812e80c5a71a141817cde5e8527af55752d4bbbf657694d72711bf44c76caf",
      "runtimeHandler": "",
      "userSpecifiedImage": ""
    },
    "imageRef": "image.xxx.io/xxx@sha256:16812e80c5a71a141817cde5e8527af55752d4bbbf657694d72711bf44c76caf",
    "reason": "",
    "message": "",
    "labels": {
      "io.kubernetes.container.name": "prometheus",
      "io.kubernetes.pod.name": "prometheus-k8s-1",
      "io.kubernetes.pod.namespace": "monitoring",
      "io.kubernetes.pod.uid": "a50460a0-1b6e-4550-ade1-b88ff7a85b16"
    },
    "annotations": {
      "io.kubernetes.container.hash": "eb454ac7",
      "io.kubernetes.container.restartCount": "3",
      "io.kubernetes.container.terminationMessagePath": "/dev/termination-log",
      "io.kubernetes.container.terminationMessagePolicy": "FallbackToLogsOnError",
      "io.kubernetes.pod.terminationGracePeriod": "600"
    },
It seems like CRI-O is stuck in some goroutine responsible for stopping the container, such as:
cri-o/internal/oci/runtime_oci.go (line 932 in 3d8e440):
func (r *runtimeOCI) StopLoopForContainer(ctx context.Context, c *Container, bm kwait.BackoffManager) {
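For illustration only, and not CRI-O's actual StopLoopForContainer (the function, channel, and fallback checks below are hypothetical): a stop loop that only waits for an exit notification can hang indefinitely if that notification is missed, even though the container and conmon PIDs are already gone and the exit status has been written. A periodic fallback check would let it recover:

package main

import (
	"context"
	"fmt"
	"time"
)

// stopLoop is a hypothetical sketch, not CRI-O code: it blocks until the exit
// notification arrives, the context is cancelled, or a periodic fallback
// check observes that the container process is already gone.
func stopLoop(ctx context.Context, exitCh <-chan struct{}, pidGone, exitFileWritten func() bool) {
	ticker := time.NewTicker(2 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-exitCh:
			return // normal path: the exit was observed
		case <-ticker.C:
			// Fallback: if the PID is gone and the exit status is written,
			// treat the container as stopped instead of waiting forever.
			if pidGone() && exitFileWritten() {
				return
			}
		}
	}
}

func main() {
	// Simulate the stuck case from this report: the exit notification never
	// arrives, but the process is gone and the exit file exists.
	exitCh := make(chan struct{}) // never closed
	stopLoop(context.Background(), exitCh,
		func() bool { return true },
		func() bool { return true })
	fmt.Println("container treated as stopped via the fallback check")
}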
Running crictl rm to remove the running prometheus container lets it recover automatically.
I forgot to capture the CRI-O goroutine dump; I will post more information when I get it.
What did you expect to happen?
CRI-O should stop the container successfully when the PID is gone and the exit status has been written.
How can we reproduce it (as minimally and precisely as possible)?
Rarely; I have only encountered this situation twice, after upgrading to 1.29.13.
Anything else we need to know?
No response
CRI-O and Kubernetes version
$ crio --version
1.29.13
$ kubectl version --output=json
1.29.6
OS version
# On Linux:
$ cat /etc/os-release
# paste output here
$ uname -a
5.15.67-2