Move ci-kubernetes-e2e-gce-scale-resource-size to experiments

serathius · serathius · commit 9027242552ba · 2025-10-16T16:32:13.000+02:00
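We already have a dedicated file and dashboard (sig-scalability-experiments)
for experimental periodic jobs. That is a better place for this job than the
regular sig-scalability periodic jobs file and the sig-scalability-gce and
google-gce dashboards.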
diff --git a/config/jobs/kubernetes/sig-scalability/sig-scalability-experimental-periodic-jobs.yaml b/config/jobs/kubernetes/sig-scalability/sig-scalability-experimental-periodic-jobs.yaml
@@ -184,3 +184,112 @@ periodics:
           limits:
             cpu: 2
             memory: "2Gi"
+# Experimental 5000-node scale test with larger resource sizes, as proposed in https://github.com/kubernetes/kubernetes/issues/134375
+- cron: '1 17 2-31/2 * *' # Runs at 17:01 UTC (9:01 PST) on even days of the month
+  name: ci-kubernetes-e2e-gce-scale-resource-size
+  tags:
+  - "perfDashPrefix: gce-5000Nodes-ResourceSize"
+  - "perfDashBuildsCount: 270"
+  - "perfDashJobType: performance"
+  cluster: k8s-infra-prow-build
+  labels:
+    preset-service-account: "true"
+    preset-k8s-ssh: "true"
+    preset-e2e-scalability-common: "true"
+    preset-e2e-scalability-periodics: "true"
+    preset-e2e-scalability-periodics-master: "true"
+  decorate: true
+  decoration_config:
+    timeout: 450m # 30m more than the --timeout=420m passed to kubernetes_e2e.py below
+  extra_refs:
+  - org: kubernetes
+    repo: kubernetes
+    base_ref: master
+    path_alias: k8s.io/kubernetes
+  - org: kubernetes
+    repo: perf-tests
+    base_ref: master
+    path_alias: k8s.io/perf-tests
+  annotations:
+    testgrid-dashboards: sig-scalability-experiments
+    testgrid-tab-name: gce-master-scale-resource-size
+    description: "Experimental tests for larger resource size as proposed in https://github.com/kubernetes/kubernetes/issues/134375"
+  spec:
+    volumes:
+    - name: cache-secret
+      secret:
+        secretName: scale-pull-cache-token
+    containers:
+    - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20251016-39cf27682d-master
+      volumeMounts:
+      - name: cache-secret
+        readOnly: true
+        mountPath: /etc/registry-auth
+      env:
+      - name: KUBERNETES_REGISTRY_PULL_THROUGH_HOST
+        value: https://us-central1-docker.pkg.dev/v2/k8s-infra-e2e-scale-5k-project/k8s-5k-scale-cache/
+      - name: KUBERNETES_REGISTRY_PULL_THROUGH_BASIC_AUTH_TOKEN_PATH
+        value: /etc/registry-auth/token
+      command:
+      - runner.sh
+      - /workspace/scenarios/kubernetes_e2e.py
+      args:
+      - --cluster=gce-scale-cluster
+      - --env=HEAPSTER_MACHINE_TYPE=e2-standard-32
+      # TODO(mborsz): Adjust or remove this override once the coredns
+      # memory usage regression is understood.
+      - --env=KUBE_DNS_MEMORY_LIMIT=300Mi
+      - --extract=ci/fast/latest-fast
+      - --gcp-nodes=5000
+      - --gcp-project-type=scalability-scale-project
+      - --gcp-zone=us-east1-b
+      - --provider=gce
+      - --metadata-sources=cl2-metadata.json
+      - --env=CL2_LOAD_TEST_THROUGHPUT=50
+      - --env=CL2_DELETE_TEST_THROUGHPUT=50
+      - --env=CL2_RATE_LIMIT_POD_CREATION=false
+      - --env=KUBE_CONTROLLER_MANAGER_TEST_ARGS=--authorization-always-allow-paths=/healthz,/readyz,/livez,/metrics --endpointslice-updates-batch-period=500ms --endpoint-updates-batch-period=500ms
+      # Overrides CONTROLLER_MANAGER_TEST_ARGS from preset-e2e-scalability-periodics.
+      - --env=CONTROLLER_MANAGER_TEST_ARGS=--authorization-always-allow-paths=/healthz,/readyz,/livez,/metrics --profiling --contention-profiling --kube-api-qps=100 --kube-api-burst=100
+      # Overrides SCHEDULER_TEST_ARGS from preset-e2e-scalability-periodics.
+      # TODO(#1311): Clean this up after the experiment. The higher QPS/burst
+      #   should hugely decrease pod-startup-latency across the whole test:
+      #   individual controllers have separate QPS limits, so this lets the
+      #   scheduler keep up with the load from deployment, daemonset and job
+      #   controllers performing pod creations at once.
+      - --env=SCHEDULER_TEST_ARGS=--authorization-always-allow-paths=/healthz,/readyz,/livez,/metrics --profiling --contention-profiling --kube-api-qps=500 --kube-api-burst=500
+      # With APF, only the sum of --max-requests-inflight and --max-mutating-requests-inflight matters, so --max-mutating-requests-inflight is set to 0.
+      - --env=APISERVER_TEST_ARGS=--max-requests-inflight=640 --max-mutating-requests-inflight=0
+      - --env=CL2_ENABLE_API_AVAILABILITY_MEASUREMENT=true
+      - --env=CL2_API_AVAILABILITY_PERCENTAGE_THRESHOLD=99.5
+      - --env=CL2_DAEMONSET_POD_PAYLOAD_SIZE=1024
+      - --env=CL2_DEPLOYMENT_POD_PAYLOAD_SIZE=1024
+      - --env=CL2_STATEFULSET_POD_PAYLOAD_SIZE=1024
+      - --env=CL2_JOB_POD_PAYLOAD_SIZE=1024
+      - --test=false
+      - --test-cmd=$GOPATH/src/k8s.io/perf-tests/run-e2e.sh
+      - --test-cmd-args=cluster-loader2
+      - --test-cmd-args=--experimental-gcp-snapshot-prometheus-disk=true
+      - --test-cmd-args=--experimental-prometheus-disk-snapshot-name=$(JOB_NAME)-$(BUILD_ID)
+      - --test-cmd-args=--experimental-prometheus-snapshot-to-report-dir=true
+      - --test-cmd-args=--nodes=5000
+      - --test-cmd-args=--prometheus-scrape-node-exporter
+      - --test-cmd-args=--provider=gce
+      - --test-cmd-args=--report-dir=$(ARTIFACTS)
+      - --test-cmd-args=--testconfig=testing/load/config.yaml
+      - --test-cmd-args=--testconfig=testing/huge-service/config.yaml
+      - --test-cmd-args=--testconfig=testing/access-tokens/config.yaml
+      - --test-cmd-args=--testoverrides=./testing/experiments/enable_restart_count_check.yaml
+      - --test-cmd-args=--testoverrides=./testing/experiments/ignore_known_gce_container_restarts.yaml
+      - --test-cmd-args=--testoverrides=./testing/overrides/5000_nodes.yaml
+      - --test-cmd-name=ClusterLoaderV2
+      - --timeout=420m
+      - --use-logexporter
+      - --logexporter-gcs-path=gs://k8s-infra-scalability-tests-logs/$(JOB_NAME)/$(BUILD_ID)
+      resources:
+        requests:
+          cpu: 6
+          memory: "16Gi"
+        limits:
+          cpu: 6
+          memory: "16Gi"
diff --git a/config/jobs/kubernetes/sig-scalability/sig-scalability-periodic-jobs.yaml b/config/jobs/kubernetes/sig-scalability/sig-scalability-periodic-jobs.yaml
@@ -1166,113 +1166,3 @@ periodics:
         limits:
           cpu: 3
           memory: "8Gi"
-
-# Exploratory tests for resource size limit as proposed in https://github.com/kubernetes/kubernetes/issues/134375
-- cron: '1 17 2-31/2 * *' # Run on even days at 9:01PST (17:01 UTC)
-  name: ci-kubernetes-e2e-gce-scale-resource-size
-  tags:
-  - "perfDashPrefix: gce-5000Nodes-ResourceSize"
-  - "perfDashBuildsCount: 270"
-  - "perfDashJobType: performance"
-  cluster: k8s-infra-prow-build
-  labels:
-    preset-service-account: "true"
-    preset-k8s-ssh: "true"
-    preset-e2e-scalability-common: "true"
-    preset-e2e-scalability-periodics: "true"
-    preset-e2e-scalability-periodics-master: "true"
-  decorate: true
-  decoration_config:
-    timeout: 450m
-  extra_refs:
-  - org: kubernetes
-    repo: kubernetes
-    base_ref: master
-    path_alias: k8s.io/kubernetes
-  - org: kubernetes
-    repo: perf-tests
-    base_ref: master
-    path_alias: k8s.io/perf-tests
-  annotations:
-    testgrid-dashboards: sig-scalability-gce, google-gce
-    testgrid-tab-name: gce-master-scale-resource-size
-    description: "Exploratory tests for resource size limit as proposed in https://github.com/kubernetes/kubernetes/issues/134375"
-  spec:
-    volumes:
-    - name: cache-secret
-      secret:
-        secretName: scale-pull-cache-token
-    containers:
-    - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20251016-39cf27682d-master
-      volumeMounts:
-      - name: cache-secret
-        readOnly: true
-        mountPath: /etc/registry-auth
-      env:
-      - name: KUBERNETES_REGISTRY_PULL_THROUGH_HOST
-        value: https://us-central1-docker.pkg.dev/v2/k8s-infra-e2e-scale-5k-project/k8s-5k-scale-cache/
-      - name: KUBERNETES_REGISTRY_PULL_THROUGH_BASIC_AUTH_TOKEN_PATH
-        value: /etc/registry-auth/token
-      command:
-      - runner.sh
-      - /workspace/scenarios/kubernetes_e2e.py
-      args:
-      - --cluster=gce-scale-cluster
-      - --env=HEAPSTER_MACHINE_TYPE=e2-standard-32
-      # TODO(mborsz): Adjust or remove this change once we understand coredns
-      # memory usage regression.
-      - --env=KUBE_DNS_MEMORY_LIMIT=300Mi
-      - --extract=ci/fast/latest-fast
-      - --gcp-nodes=5000
-      - --gcp-project-type=scalability-scale-project
-      - --gcp-zone=us-east1-b
-      - --provider=gce
-      - --metadata-sources=cl2-metadata.json
-      - --env=CL2_LOAD_TEST_THROUGHPUT=50
-      - --env=CL2_DELETE_TEST_THROUGHPUT=50
-      - --env=CL2_RATE_LIMIT_POD_CREATION=false
-      - --env=KUBE_CONTROLLER_MANAGER_TEST_ARGS=--authorization-always-allow-paths=/healthz,/readyz,/livez,/metrics --endpointslice-updates-batch-period=500ms --endpoint-updates-batch-period=500ms
-      # Overrides CONTROLLER_MANAGER_TEST_ARGS from preset-e2e-scalability-periodics.
-      - --env=CONTROLLER_MANAGER_TEST_ARGS=--authorization-always-allow-paths=/healthz,/readyz,/livez,/metrics --profiling --contention-profiling --kube-api-qps=100 --kube-api-burst=100
-      # Overrides SCHEDULER_TEST_ARGS from preset-e2e-scalability-periodics.
-      # TODO(#1311): Clean this up after the experiment - it should allow
-      #   to hugely decrease pod-startup-latency across the whole test.
-      #   Given that individual controllers have separate QPS limits, we allow
-      #   scheduler to keep up with the load from deployment, daemonset and job
-      #   performing pod creations at once.
-      - --env=SCHEDULER_TEST_ARGS=--authorization-always-allow-paths=/healthz,/readyz,/livez,/metrics --profiling --contention-profiling --kube-api-qps=500 --kube-api-burst=500
-      # With APF only sum of --max-requests-inflight and --max-mutating-requests-inflight matters, so set --max-mutating-requests-inflight to 0.
-      - --env=APISERVER_TEST_ARGS=--max-requests-inflight=640 --max-mutating-requests-inflight=0
-      - --env=CL2_ENABLE_API_AVAILABILITY_MEASUREMENT=true
-      - --env=CL2_API_AVAILABILITY_PERCENTAGE_THRESHOLD=99.5
-      - --env=CL2_DAEMONSET_POD_PAYLOAD_SIZE=1024
-      - --env=CL2_DEPLOYMENT_POD_PAYLOAD_SIZE=1024
-      - --env=CL2_STATEFULSET_POD_PAYLOAD_SIZE=1024
-      - --env=CL2_JOB_POD_PAYLOAD_SIZE=1024
-      - --test=false
-      - --test-cmd=$GOPATH/src/k8s.io/perf-tests/run-e2e.sh
-      - --test-cmd-args=cluster-loader2
-      - --test-cmd-args=--experimental-gcp-snapshot-prometheus-disk=true
-      - --test-cmd-args=--experimental-prometheus-disk-snapshot-name=$(JOB_NAME)-$(BUILD_ID)
-      - --test-cmd-args=--experimental-prometheus-snapshot-to-report-dir=true
-      - --test-cmd-args=--nodes=5000
-      - --test-cmd-args=--prometheus-scrape-node-exporter
-      - --test-cmd-args=--provider=gce
-      - --test-cmd-args=--report-dir=$(ARTIFACTS)
-      - --test-cmd-args=--testconfig=testing/load/config.yaml
-      - --test-cmd-args=--testconfig=testing/huge-service/config.yaml
-      - --test-cmd-args=--testconfig=testing/access-tokens/config.yaml
-      - --test-cmd-args=--testoverrides=./testing/experiments/enable_restart_count_check.yaml
-      - --test-cmd-args=--testoverrides=./testing/experiments/ignore_known_gce_container_restarts.yaml
-      - --test-cmd-args=--testoverrides=./testing/overrides/5000_nodes.yaml
-      - --test-cmd-name=ClusterLoaderV2
-      - --timeout=420m
-      - --use-logexporter
-      - --logexporter-gcs-path=gs://k8s-infra-scalability-tests-logs/$(JOB_NAME)/$(BUILD_ID)
-      resources:
-        requests:
-          cpu: 6
-          memory: "16Gi"
-        limits:
-          cpu: 6
-          memory: "16Gi"