From 79c384654f39cf6e87ea3b31a90dcca0bf6b9fe3 Mon Sep 17 00:00:00 2001 From: Roshani Narasimhan Date: Tue, 13 May 2025 11:11:38 -0700 Subject: [PATCH 1/6] Update README.md (#21) --- README.md | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 53cabf15..a1dd467c 100644 --- a/README.md +++ b/README.md @@ -29,10 +29,16 @@ The user workload is typically on a Vertex AI notebook, so users can connect to ### Install a released version To install the latest released version of PathwaysJob version on your cluster, run the following command: ```sh -VERSION=v0.1.0 +VERSION=v0.1.1 kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/$VERSION/install.yaml ``` +To uninstall the latest released version of PathwaysJob version on your cluster, run the following command: +```sh +VERSION=v0.1.1 +kubectl delete -f https://github.com/google/pathways-job/releases/download/$VERSION/install.yaml +``` + ### Build and install from source To build PathwaysJob from source and install it on your cluster, run the following commands: **Build and push your image to the location specified by `IMAGE`:** @@ -71,37 +77,16 @@ You can apply the examples from the config/samples: kubectl apply -k config/samples/.yaml ``` ->**NOTE**: Ensure that the examples has default values to test it out. - -### To Uninstall -**Delete the instances (CRs) from the cluster:** - -```sh -kubectl delete -k config/samples/.yaml -``` - -**Delete the APIs(CRDs) from the cluster:** - -```sh -make deploy IMG=$IMAGE -``` - -### Create instances of your solution -You can apply the examples from the config/samples: - -```sh -kubectl apply -k config/samples/.yaml -``` >**NOTE**: Refer to the examples showcasing PathwaysJob features. -Ensure that the examples has default values to test it out. - -### Delete the instances (CRs) from the cluster:** +>Ensure that the examples have default values to test it out. 
+**Delete the instances (CRs) from the cluster:** +You can delete the examples from the config/samples, applied above: ```sh kubectl delete -k config/samples/.yaml ``` -**UnDeploy the controller from the cluster:** +**Undeploy the controller from the cluster:** ```sh make undeploy From ed9781e0c578075b81cd0a352edfcd92d0ed8793 Mon Sep 17 00:00:00 2001 From: Roshani Narasimhan Date: Wed, 28 May 2025 11:59:46 -0700 Subject: [PATCH 2/6] Update README.md , fix contributing.md link (#22) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a1dd467c..89e40c82 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,7 @@ make undeploy ## Contributing -We welcome contributions! Please look at [contributing.md](/usr/local/google/home/roshanin/pathways-job/docs/contributing.md). +We welcome contributions! Please look at [contributing.md](https://github.com/google/pathways-job/blob/main/docs/contributing.md). More information can be found via the [Kubebuilder Documentation](https://book.kubebuilder.io/introduction.html) ## License From 04cf9f20aa4b8b08ae3b6214fa1a4c12e8a68ad1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 6 Jun 2025 13:21:06 -0700 Subject: [PATCH 3/6] Bump golang.org/x/net in the go_modules group across 1 directory (#19) Bumps the go_modules group with 1 update in the / directory: [golang.org/x/net](https://github.com/golang/net). Updates `golang.org/x/net` from 0.36.0 to 0.38.0 - [Commits](https://github.com/golang/net/compare/v0.36.0...v0.38.0) --- updated-dependencies: - dependency-name: golang.org/x/net dependency-version: 0.38.0 dependency-type: indirect dependency-group: go_modules ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Roshani Narasimhan --- go.mod | 10 +++++----- go.sum | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/go.mod b/go.mod index 11175b04..4669ca77 100644 --- a/go.mod +++ b/go.mod @@ -77,12 +77,12 @@ require ( go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect - golang.org/x/net v0.36.0 // indirect + golang.org/x/net v0.38.0 // indirect golang.org/x/oauth2 v0.24.0 // indirect - golang.org/x/sync v0.11.0 // indirect - golang.org/x/sys v0.30.0 // indirect - golang.org/x/term v0.29.0 // indirect - golang.org/x/text v0.22.0 // indirect + golang.org/x/sync v0.12.0 // indirect + golang.org/x/sys v0.31.0 // indirect + golang.org/x/term v0.30.0 // indirect + golang.org/x/text v0.23.0 // indirect golang.org/x/time v0.7.0 // indirect golang.org/x/tools v0.28.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect diff --git a/go.sum b/go.sum index a3ccb100..a4d96868 100644 --- a/go.sum +++ b/go.sum @@ -163,26 +163,26 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.36.0 h1:vWF2fRbw4qslQsQzgFqZff+BItCvGFQqKzKIzx1rmoA= -golang.org/x/net v0.36.0/go.mod h1:bFmbeoIPfrw4sMHNhb4J9f6+tPziuGjq7Jk/38fxi1I= +golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= +golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE= golang.org/x/oauth2 
v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w= -golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw= +golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= -golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU= -golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s= +golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik= +golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y= +golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= -golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= +golang.org/x/text v0.23.0 
h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= +golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= From 17df5bcd7281fb8f6993643b8dcd88438a84f620 Mon Sep 17 00:00:00 2001 From: Roshani Narasimhan Date: Fri, 6 Jun 2025 14:40:40 -0700 Subject: [PATCH 4/6] Place pathways-head pod exclusively on one CPU node. (#23) --- internal/controller/pathwaysjob_controller.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/internal/controller/pathwaysjob_controller.go b/internal/controller/pathwaysjob_controller.go index e9aca8c9..5beb1b78 100644 --- a/internal/controller/pathwaysjob_controller.go +++ b/internal/controller/pathwaysjob_controller.go @@ -227,7 +227,7 @@ func (r *PathwaysJobReconciler) createJobSet(ctx context.Context, pw *pathwaysjo Spec: jobsetv1alpha2.JobSetSpec{ StartupPolicy: &jobsetv1alpha2.StartupPolicy{ StartupPolicyOrder: jobsetv1alpha2.InOrder, - }, // create jobs in the order specified in JobSet. + }, // create jobs in the order specified in JobSet - pathways-head first. FailurePolicy: &jobsetv1alpha2.FailurePolicy{ MaxRestarts: pw.Spec.MaxRestarts, }, @@ -475,6 +475,7 @@ func makeImageTagUsingPathwaysVersion(pw *pathwaysjob.PathwaysJob) string { // Construct success policy based on deployment mode and user workload spec. func MakeSuccessPolicy(pw *pathwaysjob.PathwaysJob) *jobsetv1alpha2.SuccessPolicy { + // Mark the Job successful if pathways-head pod succeeds. 
userJobName := PathwaysHeadJobName if isUserPodProvided(pw) { return &jobsetv1alpha2.SuccessPolicy{Operator: jobsetv1alpha2.OperatorAll, TargetReplicatedJobs: []string{userJobName}} @@ -824,7 +825,8 @@ func MakePathwaysHeadPodSpec(pw *pathwaysjob.PathwaysJob) *corev1.PodSpec { if isUserPodProvided(pw) { // Inject Pathways RM and proxy into the user provided pod spec // in the form of initContainers. The user container is the main container, - // whose success or failure will be tracked. + // whose success or failure will be tracked to determine the success of the + // pathways-head pod. // Ensure DNSPolicy and HostNetwork are set as needed. RMContainerSpec, _ := MakeResourceManagerContainer(pw, true) ProxyContainerSpec, _ := MakeProxyContainer(pw, true) @@ -875,6 +877,11 @@ func MakePathwaysHeadReplicatedJob(pathwaysHeadPodSpec corev1.PodSpec) jobsetv1a Name: PathwaysHeadJobName, Replicas: 1, Template: batchv1.JobTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + "alpha.jobset.sigs.k8s.io/exclusive-topology": "kubernetes.io/hostname", + }, // needed so that head pods are placed exclusively on CPU nodes. + }, Spec: batchv1.JobSpec{ BackoffLimit: ptr.To(int32(0)), Completions: ptr.To(int32(1)), From 58adfe27fedaa53231693c8a8d4ef9c9354516c6 Mon Sep 17 00:00:00 2001 From: Roshani Narasimhan Date: Mon, 16 Jun 2025 09:52:19 -0700 Subject: [PATCH 5/6] Exclusive CPU nodes only for default mode. 
(#24) --- internal/controller/pathwaysjob_controller.go | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/internal/controller/pathwaysjob_controller.go b/internal/controller/pathwaysjob_controller.go index 5beb1b78..301564c2 100644 --- a/internal/controller/pathwaysjob_controller.go +++ b/internal/controller/pathwaysjob_controller.go @@ -872,15 +872,20 @@ func injectJAXBackendTargetIntoMainContainer(pw *pathwaysjob.PathwaysJob, pathwa } -func MakePathwaysHeadReplicatedJob(pathwaysHeadPodSpec corev1.PodSpec) jobsetv1alpha2.ReplicatedJob { +func MakePathwaysHeadReplicatedJob(pw *pathwaysjob.PathwaysJob, pathwaysHeadPodSpec corev1.PodSpec) jobsetv1alpha2.ReplicatedJob { + var annotations map[string]string + annotations = nil + if pw.Spec.Controller.DeploymentMode == pathwaysjob.Default { + annotations = map[string]string{ + "alpha.jobset.sigs.k8s.io/exclusive-topology": "kubernetes.io/hostname", + } // needed so that head pods are placed exclusively on CPU nodes. + } pathwaysHeadJob := jobsetv1alpha2.ReplicatedJob{ Name: PathwaysHeadJobName, Replicas: 1, Template: batchv1.JobTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - "alpha.jobset.sigs.k8s.io/exclusive-topology": "kubernetes.io/hostname", - }, // needed so that head pods are placed exclusively on CPU nodes. + Annotations: annotations, }, Spec: batchv1.JobSpec{ BackoffLimit: ptr.To(int32(0)), @@ -905,12 +910,12 @@ func MakePathwaysHeadJobForColocateHeadWithWorkersDeployment(ctx context.Context podSpec.Affinity = affinitySpec podSpec.Tolerations = tolerations - return MakePathwaysHeadReplicatedJob(podSpec), nil + return MakePathwaysHeadReplicatedJob(pw, podSpec), nil } // Construct pathways-head replicated job containing Pathways RM, Pathways Proxy and the user job containers for the 'default' deployment mode. // In the default mode, the Pathways head pod is placed on CPU nodes. 
func MakePathwaysHeadJobForDefaultDeployment(ctx context.Context, pw *pathwaysjob.PathwaysJob) (jobsetv1alpha2.ReplicatedJob, error) { podSpec := *MakePathwaysHeadPodSpec(pw) - return MakePathwaysHeadReplicatedJob(podSpec), nil + return MakePathwaysHeadReplicatedJob(pw, podSpec), nil } From adcade7d9eb5b9ba6d7da793e76b2cca2c8e23e1 Mon Sep 17 00:00:00 2001 From: Roshani Narasimhan Date: Mon, 23 Jun 2025 10:21:36 -0700 Subject: [PATCH 6/6] Releasing v0.1.2, with reliable placement of pathways-head. (#25) --- Makefile | 2 +- README.md | 4 ++-- config/manager/kustomization.yaml | 2 +- release/install.yaml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index f04416b4..6f6fb95e 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Image URL to use all building/pushing image targets IMAGE_REGISTRY ?= us-docker.pkg.dev/cloud-tpu-v2-images/pathways-job IMAGE_NAME ?= pathwaysjob-controller -IMAGE_TAG ?= v0.1.1 +IMAGE_TAG ?= v0.1.2 IMG ?= $(IMAGE_REGISTRY)/$(IMAGE_NAME):$(IMAGE_TAG) # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary. 
ENVTEST_K8S_VERSION = 1.30.0 diff --git a/README.md b/README.md index 89e40c82..d6bde57f 100644 --- a/README.md +++ b/README.md @@ -29,13 +29,13 @@ The user workload is typically on a Vertex AI notebook, so users can connect to ### Install a released version To install the latest released version of PathwaysJob version on your cluster, run the following command: ```sh -VERSION=v0.1.1 +VERSION=v0.1.2 kubectl apply --server-side -f https://github.com/google/pathways-job/releases/download/$VERSION/install.yaml ``` To uninstall the latest released version of PathwaysJob version on your cluster, run the following command: ```sh -VERSION=v0.1.1 +VERSION=v0.1.2 kubectl delete -f https://github.com/google/pathways-job/releases/download/$VERSION/install.yaml ``` diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index ebcfa24b..9b986dc0 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -19,4 +19,4 @@ kind: Kustomization images: - name: controller newName: us-docker.pkg.dev/cloud-tpu-v2-images/pathways-job/pathwaysjob-controller - newTag: v0.1.1 + newTag: v0.1.2 diff --git a/release/install.yaml b/release/install.yaml index fa70f1ec..c36363ec 100644 --- a/release/install.yaml +++ b/release/install.yaml @@ -8795,7 +8795,7 @@ spec: - --health-probe-bind-address=:8081 command: - /manager - image: us-docker.pkg.dev/cloud-tpu-v2-images/pathways-job/pathwaysjob-controller:v0.1.1 + image: us-docker.pkg.dev/cloud-tpu-v2-images/pathways-job/pathwaysjob-controller:v0.1.2 imagePullPolicy: Always livenessProbe: httpGet: