44 changes: 24 additions & 20 deletions pkg/kubelet/kubelet.go
@@ -1887,7 +1887,7 @@ func (kl *Kubelet) SyncPod(_ context.Context, updateType kubetypes.SyncPodType,
 		// TODO(vinaykul,InPlacePodVerticalScaling): Investigate doing this in HandlePodUpdates + periodic SyncLoop scan
 		// See: https://github.com/kubernetes/kubernetes/pull/102884#discussion_r663160060
 		if kl.podWorkers.CouldHaveRunningContainers(pod.UID) && !kubetypes.IsStaticPod(pod) {
-			kl.handlePodResourcesResize(pod)
+			pod = kl.handlePodResourcesResize(pod)
 		}
 	}

@@ -2629,13 +2629,14 @@ func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, *v1.Pod, v1.PodResizeStatus)
 		klog.ErrorS(err, "getNodeAnyway function failed")
 		return false, nil, ""
 	}
+	podCopy := pod.DeepCopy()
 	cpuAvailable := node.Status.Allocatable.Cpu().MilliValue()
 	memAvailable := node.Status.Allocatable.Memory().Value()
-	cpuRequests := resource.GetResourceRequest(pod, v1.ResourceCPU)
-	memRequests := resource.GetResourceRequest(pod, v1.ResourceMemory)
+	cpuRequests := resource.GetResourceRequest(podCopy, v1.ResourceCPU)
+	memRequests := resource.GetResourceRequest(podCopy, v1.ResourceMemory)
 	if cpuRequests > cpuAvailable || memRequests > memAvailable {
-		klog.V(3).InfoS("Resize is not feasible as request exceeds allocatable node resources", "pod", pod.Name)
-		return false, nil, v1.PodResizeStatusInfeasible
+		klog.V(3).InfoS("Resize is not feasible as request exceeds allocatable node resources", "pod", podCopy.Name)
+		return false, podCopy, v1.PodResizeStatusInfeasible
 	}

 	// Treat the existing pod needing resize as a new pod with desired resources seeking admit.
@@ -2647,13 +2648,12 @@ func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, *v1.Pod, v1.PodResizeStatus)
 		}
 	}

-	if ok, failReason, failMessage := kl.canAdmitPod(otherActivePods, pod); !ok {
+	if ok, failReason, failMessage := kl.canAdmitPod(otherActivePods, podCopy); !ok {
 		// Log reason and return. Let the next sync iteration retry the resize
-		klog.V(3).InfoS("Resize cannot be accommodated", "pod", pod.Name, "reason", failReason, "message", failMessage)
-		return false, nil, v1.PodResizeStatusDeferred
+		klog.V(3).InfoS("Resize cannot be accommodated", "pod", podCopy.Name, "reason", failReason, "message", failMessage)
+		return false, podCopy, v1.PodResizeStatusDeferred
 	}

-	podCopy := pod.DeepCopy()
 	for _, container := range podCopy.Spec.Containers {
 		idx, found := podutil.GetIndexOfContainerStatus(podCopy.Status.ContainerStatuses, container.Name)
 		if found {
@@ -2665,9 +2665,9 @@ func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, *v1.Pod, v1.PodResizeStatus)
 	return true, podCopy, v1.PodResizeStatusInProgress
 }

-func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod) {
+func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod) *v1.Pod {
 	if pod.Status.Phase != v1.PodRunning {
-		return
+		return pod
 	}
 	podResized := false
 	for _, container := range pod.Spec.Containers {
@@ -2689,31 +2689,35 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod) {
 		}
 	}
 	if !podResized {
-		return
+		return pod
 	}

 	kl.podResizeMutex.Lock()
 	defer kl.podResizeMutex.Unlock()
 	fit, updatedPod, resizeStatus := kl.canResizePod(pod)
+	if updatedPod == nil {
+		return pod
+	}
 	if fit {
 		// Update pod resource allocation checkpoint
 		if err := kl.statusManager.SetPodAllocation(updatedPod); err != nil {
 			//TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? Investigate
-			klog.ErrorS(err, "SetPodAllocation failed", "pod", klog.KObj(pod))
+			klog.ErrorS(err, "SetPodAllocation failed", "pod", klog.KObj(updatedPod))
+			return pod
 		}
-		*pod = *updatedPod
 	}
 	if resizeStatus != "" {
 		// Save resize decision to checkpoint
-		if err := kl.statusManager.SetPodResizeStatus(pod.UID, resizeStatus); err != nil {
+		if err := kl.statusManager.SetPodResizeStatus(updatedPod.UID, resizeStatus); err != nil {
 			//TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? Investigate
-			klog.ErrorS(err, "SetPodResizeStatus failed", "pod", klog.KObj(pod))
+			klog.ErrorS(err, "SetPodResizeStatus failed", "pod", klog.KObj(updatedPod))
+			return pod
 		}
-		pod.Status.Resize = resizeStatus
+		updatedPod.Status.Resize = resizeStatus
 	}
-	kl.podManager.UpdatePod(pod)
-	kl.statusManager.SetPodStatus(pod, pod.Status)
-	return
+	kl.podManager.UpdatePod(updatedPod)
+	kl.statusManager.SetPodStatus(updatedPod, updatedPod.Status)
Review thread:

@smarterclayton (Contributor), Mar 20, 2023:
Are you expecting the SyncPod loop to continue with the updated pod values? If so, you need to return updatedPod here and ensure this is called before other pod actions.

Also note: calling UpdatePod like this isn't safe. If another actor updates the pod between this call and the time you resync, the value of the pod in the pod manager will be overwritten by that value (which may be older); or worse, you might overwrite a newer pod spec that was delivered. That might cause, for instance, a pod that was previously completed to be seen as newly created, and the kubelet to run it twice.

That is the root of my concern here: you can't use podManager as a store of temporary state (the kubelet uses it as a cache of upstream sources of truth). If you need a store of truth for "changes that have been requested AND accepted by the kubelet", we need to create a new map to hold them, which probably involves designing this into the state model of the kubelet, as I mentioned.

The approach here will be enough for alpha, but it probably relies on the update from the API actually propagating quickly in order to be correct.
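A minimal sketch of what such a dedicated store might look like, assuming a hypothetical map keyed by pod UID; the names are illustrative and not an existing kubelet API:

```go
// Hypothetical sketch of a store for "changes requested AND accepted by the
// kubelet", kept separate from podManager (which only caches upstream sources
// of truth). Names are illustrative, not part of the kubelet today.
package kubelet

import (
	"sync"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
)

type acceptedResizeStore struct {
	mu       sync.RWMutex
	accepted map[types.UID]v1.ResourceList // accepted resource requests per pod UID
}

func newAcceptedResizeStore() *acceptedResizeStore {
	return &acceptedResizeStore{accepted: map[types.UID]v1.ResourceList{}}
}

// Record remembers the requests the kubelet admitted for a pod's resize.
func (s *acceptedResizeStore) Record(uid types.UID, requests v1.ResourceList) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.accepted[uid] = requests
}

// Get returns the accepted requests for a pod, if any, without consulting podManager.
func (s *acceptedResizeStore) Get(uid types.UID) (v1.ResourceList, bool) {
	s.mu.RLock()
	defer s.mu.RUnlock()
	requests, ok := s.accepted[uid]
	return requests, ok
}

// Delete drops the entry once the resize has been reconciled or the pod is gone.
func (s *acceptedResizeStore) Delete(uid types.UID) {
	s.mu.Lock()
	defer s.mu.Unlock()
	delete(s.accepted, uid)
}
```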

@vinaykul (Member, Author) replied:
I have modified the code to return the updated pod; it looks like a fairly safe change for now. PTAL.
The implementation does not rely on quick propagation, though it is desired.

My initial approach in the KEP was to not do this in SyncPod at all, but to do it in HandlePodUpdates coupled with event-based retries for Deferred resizes (e.g. revisit Deferred resizes upon pod deletion; IMHO that makes sense here because there is a fair chance a resize attempt will succeed after other resource-consuming pods have departed the node, versus periodic retries). IIRC there were concerns about the reliability, and it is conventional/accepted for K8s to retry even when it may not change the result. We also discussed this point during early reviews but opted to make it a beta TODO so we could focus on this change; tracking issue #109547.

What are your thoughts on my initially suggested approach (HandlePodUpdates + event-based retries)?
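A rough sketch of what that event-based retry could look like, assuming a hypothetical helper invoked after pods are removed from the node (e.g. from HandlePodRemoves); this is not code from the PR:

```go
// Hypothetical helper: after pods depart the node, re-evaluate pods whose
// resize was previously marked Deferred, since freed capacity may now allow
// the resize to be admitted. Illustrative only, not part of this PR.
func (kl *Kubelet) retryDeferredResizes() {
	for _, pod := range kl.podManager.GetPods() {
		if pod.Status.Resize == v1.PodResizeStatusDeferred {
			// handlePodResourcesResize re-runs the canResizePod admission check
			// and updates the checkpoint/status if the resize now fits.
			kl.handlePodResourcesResize(pod)
		}
	}
}
```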

A project Member replied:
Filed #116826 to address this in the next version.

@vinaykul (Member, Author), Mar 22, 2023:

> That is the root of my concern here: you can't use podManager as a store of temporary state (the kubelet uses it as a cache of upstream sources of truth). If you need a store of truth for "changes that have been requested AND accepted by the kubelet", we need to create a new map to hold them, which probably involves designing this into the state model of the kubelet, as I mentioned.

Yes, I plan to revisit computePodResizeAction for this purpose. Another possibility is to borrow from the original implementation of the initially approved KEP: patch the PodStatus and proceed only upon a successful patch. That has the downside of an additional call to the API server, which could get rate-limited. This approach intends to leverage pod status as a persistent, reliable source of truth (xref: KEP 2527).
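A sketch of that patch-first alternative, assuming a hypothetical helper built on client-go's Pods().Patch against the status subresource; illustrative only, not what this PR implements:

```go
// Hypothetical sketch: persist the resize decision in pod status via the API
// server first, and let the caller proceed only if the patch succeeds. The
// drawback noted above is the extra API call, which could be rate-limited.
package main

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
)

func patchResizeStatus(ctx context.Context, client kubernetes.Interface, pod *v1.Pod, status v1.PodResizeStatus) error {
	// Strategic-merge patch on the status subresource; if this returns an
	// error, the decision was not persisted and should not be acted upon yet.
	patch := []byte(fmt.Sprintf(`{"status":{"resize":%q}}`, string(status)))
	_, err := client.CoreV1().Pods(pod.Namespace).Patch(
		ctx, pod.Name, types.StrategicMergePatchType, patch, metav1.PatchOptions{}, "status")
	return err
}
```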

+	return updatedPod
 }

 // LatestLoopEntryTime returns the last time in the sync loop monitor.
6 changes: 4 additions & 2 deletions pkg/kubelet/kubelet_test.go
@@ -2585,8 +2585,10 @@ func TestHandlePodResourcesResize(t *testing.T) {
 		tt.pod.Spec.Containers[0].Resources.Requests = tt.newRequests
 		tt.pod.Status.ContainerStatuses[0].AllocatedResources = v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem1000M}
 		kubelet.handlePodResourcesResize(tt.pod)
-		assert.Equal(t, tt.expectedAllocations, tt.pod.Status.ContainerStatuses[0].AllocatedResources, tt.name)
-		assert.Equal(t, tt.expectedResize, tt.pod.Status.Resize, tt.name)
+		updatedPod, found := kubelet.podManager.GetPodByName(tt.pod.Namespace, tt.pod.Name)
+		assert.True(t, found, "expected to find pod %s", tt.pod.Name)
+		assert.Equal(t, tt.expectedAllocations, updatedPod.Status.ContainerStatuses[0].AllocatedResources, tt.name)
+		assert.Equal(t, tt.expectedResize, updatedPod.Status.Resize, tt.name)
 		testKubelet.fakeKubeClient.ClearActions()
 	}
 }
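Since handlePodResourcesResize now returns the updated pod, an equivalent check could also be written against the return value instead of fetching from podManager; a sketch reusing the test's existing fixtures (the PR keeps the podManager lookup, which additionally verifies that UpdatePod was called):

```go
// Sketch: assert directly on the pod returned by handlePodResourcesResize,
// relying on the tt, kubelet, and assert identifiers from the test loop above.
updatedPod := kubelet.handlePodResourcesResize(tt.pod)
assert.Equal(t, tt.expectedAllocations, updatedPod.Status.ContainerStatuses[0].AllocatedResources, tt.name)
assert.Equal(t, tt.expectedResize, updatedPod.Status.Resize, tt.name)
```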