diff --git a/config/jobs/kubernetes/sig-scalability/sig-scalability-experimental-periodic-jobs.yaml b/config/jobs/kubernetes/sig-scalability/sig-scalability-experimental-periodic-jobs.yaml
index 176023203ddb..6657e75df028 100644
--- a/config/jobs/kubernetes/sig-scalability/sig-scalability-experimental-periodic-jobs.yaml
+++ b/config/jobs/kubernetes/sig-scalability/sig-scalability-experimental-periodic-jobs.yaml
@@ -184,3 +184,112 @@ periodics:
         limits:
           cpu: 2
           memory: "2Gi"
+# Experimental tests for larger resource size as proposed in https://github.com/kubernetes/kubernetes/issues/134375
+- cron: '1 17 2-31/2 * *' # Run on even days at 9:01PST (17:01 UTC)
+  name: ci-kubernetes-e2e-gce-scale-resource-size
+  tags:
+  - "perfDashPrefix: gce-5000Nodes-ResourceSize"
+  - "perfDashBuildsCount: 270"
+  - "perfDashJobType: performance"
+  cluster: k8s-infra-prow-build
+  labels:
+    preset-service-account: "true"
+    preset-k8s-ssh: "true"
+    preset-e2e-scalability-common: "true"
+    preset-e2e-scalability-periodics: "true"
+    preset-e2e-scalability-periodics-master: "true"
+  decorate: true
+  decoration_config:
+    timeout: 450m
+  extra_refs:
+  - org: kubernetes
+    repo: kubernetes
+    base_ref: master
+    path_alias: k8s.io/kubernetes
+  - org: kubernetes
+    repo: perf-tests
+    base_ref: master
+    path_alias: k8s.io/perf-tests
+  annotations:
+    testgrid-dashboards: sig-scalability-experiments
+    testgrid-tab-name: gce-master-scale-resource-size
+    description: "Experimental tests for larger resource size as proposed in https://github.com/kubernetes/kubernetes/issues/134375"
+  spec:
+    volumes:
+    - name: cache-secret
+      secret:
+        secretName: scale-pull-cache-token
+    containers:
+    - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20251016-39cf27682d-master
+      volumeMounts:
+      - name: cache-secret
+        readOnly: true
+        mountPath: /etc/registry-auth
+      env:
+      - name: KUBERNETES_REGISTRY_PULL_THROUGH_HOST
+        value: https://us-central1-docker.pkg.dev/v2/k8s-infra-e2e-scale-5k-project/k8s-5k-scale-cache/
+      - name: KUBERNETES_REGISTRY_PULL_THROUGH_BASIC_AUTH_TOKEN_PATH
+        value: /etc/registry-auth/token
+      command:
+      - runner.sh
+      - /workspace/scenarios/kubernetes_e2e.py
+      args:
+      - --cluster=gce-scale-cluster
+      - --env=HEAPSTER_MACHINE_TYPE=e2-standard-32
+      # TODO(mborsz): Adjust or remove this change once we understand coredns
+      # memory usage regression.
+      - --env=KUBE_DNS_MEMORY_LIMIT=300Mi
+      - --extract=ci/fast/latest-fast
+      - --gcp-nodes=5000
+      - --gcp-project-type=scalability-scale-project
+      - --gcp-zone=us-east1-b
+      - --provider=gce
+      - --metadata-sources=cl2-metadata.json
+      - --env=CL2_LOAD_TEST_THROUGHPUT=50
+      - --env=CL2_DELETE_TEST_THROUGHPUT=50
+      - --env=CL2_RATE_LIMIT_POD_CREATION=false
+      - --env=KUBE_CONTROLLER_MANAGER_TEST_ARGS=--authorization-always-allow-paths=/healthz,/readyz,/livez,/metrics --endpointslice-updates-batch-period=500ms --endpoint-updates-batch-period=500ms
+      # Overrides CONTROLLER_MANAGER_TEST_ARGS from preset-e2e-scalability-periodics.
+      - --env=CONTROLLER_MANAGER_TEST_ARGS=--authorization-always-allow-paths=/healthz,/readyz,/livez,/metrics --profiling --contention-profiling --kube-api-qps=100 --kube-api-burst=100
+      # Overrides SCHEDULER_TEST_ARGS from preset-e2e-scalability-periodics.
+      # TODO(#1311): Clean this up after the experiment - it should allow
+      # to hugely decrease pod-startup-latency across the whole test.
+      # Given that individual controllers have separate QPS limits, we allow
+      # scheduler to keep up with the load from deployment, daemonset and job
+      # performing pod creations at once.
+      - --env=SCHEDULER_TEST_ARGS=--authorization-always-allow-paths=/healthz,/readyz,/livez,/metrics --profiling --contention-profiling --kube-api-qps=500 --kube-api-burst=500
+      # With APF only sum of --max-requests-inflight and --max-mutating-requests-inflight matters, so set --max-mutating-requests-inflight to 0.
+      - --env=APISERVER_TEST_ARGS=--max-requests-inflight=640 --max-mutating-requests-inflight=0
+      - --env=CL2_ENABLE_API_AVAILABILITY_MEASUREMENT=true
+      - --env=CL2_API_AVAILABILITY_PERCENTAGE_THRESHOLD=99.5
+      - --env=CL2_DAEMONSET_POD_PAYLOAD_SIZE=1024
+      - --env=CL2_DEPLOYMENT_POD_PAYLOAD_SIZE=1024
+      - --env=CL2_STATEFULSET_POD_PAYLOAD_SIZE=1024
+      - --env=CL2_JOB_POD_PAYLOAD_SIZE=1024
+      - --test=false
+      - --test-cmd=$GOPATH/src/k8s.io/perf-tests/run-e2e.sh
+      - --test-cmd-args=cluster-loader2
+      - --test-cmd-args=--experimental-gcp-snapshot-prometheus-disk=true
+      - --test-cmd-args=--experimental-prometheus-disk-snapshot-name=$(JOB_NAME)-$(BUILD_ID)
+      - --test-cmd-args=--experimental-prometheus-snapshot-to-report-dir=true
+      - --test-cmd-args=--nodes=5000
+      - --test-cmd-args=--prometheus-scrape-node-exporter
+      - --test-cmd-args=--provider=gce
+      - --test-cmd-args=--report-dir=$(ARTIFACTS)
+      - --test-cmd-args=--testconfig=testing/load/config.yaml
+      - --test-cmd-args=--testconfig=testing/huge-service/config.yaml
+      - --test-cmd-args=--testconfig=testing/access-tokens/config.yaml
+      - --test-cmd-args=--testoverrides=./testing/experiments/enable_restart_count_check.yaml
+      - --test-cmd-args=--testoverrides=./testing/experiments/ignore_known_gce_container_restarts.yaml
+      - --test-cmd-args=--testoverrides=./testing/overrides/5000_nodes.yaml
+      - --test-cmd-name=ClusterLoaderV2
+      - --timeout=420m
+      - --use-logexporter
+      - --logexporter-gcs-path=gs://k8s-infra-scalability-tests-logs/$(JOB_NAME)/$(BUILD_ID)
+      resources:
+        requests:
+          cpu: 6
+          memory: "16Gi"
+        limits:
+          cpu: 6
+          memory: "16Gi"
diff --git a/config/jobs/kubernetes/sig-scalability/sig-scalability-periodic-jobs.yaml b/config/jobs/kubernetes/sig-scalability/sig-scalability-periodic-jobs.yaml
index d15a2c4828d3..7f3c1f58a30f 100644
--- a/config/jobs/kubernetes/sig-scalability/sig-scalability-periodic-jobs.yaml
+++ b/config/jobs/kubernetes/sig-scalability/sig-scalability-periodic-jobs.yaml
@@ -1166,113 +1166,3 @@ periodics:
         limits:
           cpu: 3
           memory: "8Gi"
-
-# Exploratory tests for resource size limit as proposed in https://github.com/kubernetes/kubernetes/issues/134375
-- cron: '1 17 2-31/2 * *' # Run on even days at 9:01PST (17:01 UTC)
-  name: ci-kubernetes-e2e-gce-scale-resource-size
-  tags:
-  - "perfDashPrefix: gce-5000Nodes-ResourceSize"
-  - "perfDashBuildsCount: 270"
-  - "perfDashJobType: performance"
-  cluster: k8s-infra-prow-build
-  labels:
-    preset-service-account: "true"
-    preset-k8s-ssh: "true"
-    preset-e2e-scalability-common: "true"
-    preset-e2e-scalability-periodics: "true"
-    preset-e2e-scalability-periodics-master: "true"
-  decorate: true
-  decoration_config:
-    timeout: 450m
-  extra_refs:
-  - org: kubernetes
-    repo: kubernetes
-    base_ref: master
-    path_alias: k8s.io/kubernetes
-  - org: kubernetes
-    repo: perf-tests
-    base_ref: master
-    path_alias: k8s.io/perf-tests
-  annotations:
-    testgrid-dashboards: sig-scalability-gce, google-gce
-    testgrid-tab-name: gce-master-scale-resource-size
-    description: "Exploratory tests for resource size limit as proposed in https://github.com/kubernetes/kubernetes/issues/134375"
-  spec:
-    volumes:
-    - name: cache-secret
-      secret:
-        secretName: scale-pull-cache-token
-    containers:
-    - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20251016-39cf27682d-master
-      volumeMounts:
-      - name: cache-secret
-        readOnly: true
-        mountPath: /etc/registry-auth
-      env:
-      - name: KUBERNETES_REGISTRY_PULL_THROUGH_HOST
-        value: https://us-central1-docker.pkg.dev/v2/k8s-infra-e2e-scale-5k-project/k8s-5k-scale-cache/
-      - name: KUBERNETES_REGISTRY_PULL_THROUGH_BASIC_AUTH_TOKEN_PATH
-        value: /etc/registry-auth/token
-      command:
-      - runner.sh
-      - /workspace/scenarios/kubernetes_e2e.py
-      args:
-      - --cluster=gce-scale-cluster
-      - --env=HEAPSTER_MACHINE_TYPE=e2-standard-32
-      # TODO(mborsz): Adjust or remove this change once we understand coredns
-      # memory usage regression.
-      - --env=KUBE_DNS_MEMORY_LIMIT=300Mi
-      - --extract=ci/fast/latest-fast
-      - --gcp-nodes=5000
-      - --gcp-project-type=scalability-scale-project
-      - --gcp-zone=us-east1-b
-      - --provider=gce
-      - --metadata-sources=cl2-metadata.json
-      - --env=CL2_LOAD_TEST_THROUGHPUT=50
-      - --env=CL2_DELETE_TEST_THROUGHPUT=50
-      - --env=CL2_RATE_LIMIT_POD_CREATION=false
-      - --env=KUBE_CONTROLLER_MANAGER_TEST_ARGS=--authorization-always-allow-paths=/healthz,/readyz,/livez,/metrics --endpointslice-updates-batch-period=500ms --endpoint-updates-batch-period=500ms
-      # Overrides CONTROLLER_MANAGER_TEST_ARGS from preset-e2e-scalability-periodics.
-      - --env=CONTROLLER_MANAGER_TEST_ARGS=--authorization-always-allow-paths=/healthz,/readyz,/livez,/metrics --profiling --contention-profiling --kube-api-qps=100 --kube-api-burst=100
-      # Overrides SCHEDULER_TEST_ARGS from preset-e2e-scalability-periodics.
-      # TODO(#1311): Clean this up after the experiment - it should allow
-      # to hugely decrease pod-startup-latency across the whole test.
-      # Given that individual controllers have separate QPS limits, we allow
-      # scheduler to keep up with the load from deployment, daemonset and job
-      # performing pod creations at once.
-      - --env=SCHEDULER_TEST_ARGS=--authorization-always-allow-paths=/healthz,/readyz,/livez,/metrics --profiling --contention-profiling --kube-api-qps=500 --kube-api-burst=500
-      # With APF only sum of --max-requests-inflight and --max-mutating-requests-inflight matters, so set --max-mutating-requests-inflight to 0.
-      - --env=APISERVER_TEST_ARGS=--max-requests-inflight=640 --max-mutating-requests-inflight=0
-      - --env=CL2_ENABLE_API_AVAILABILITY_MEASUREMENT=true
-      - --env=CL2_API_AVAILABILITY_PERCENTAGE_THRESHOLD=99.5
-      - --env=CL2_DAEMONSET_POD_PAYLOAD_SIZE=1024
-      - --env=CL2_DEPLOYMENT_POD_PAYLOAD_SIZE=1024
-      - --env=CL2_STATEFULSET_POD_PAYLOAD_SIZE=1024
-      - --env=CL2_JOB_POD_PAYLOAD_SIZE=1024
-      - --test=false
-      - --test-cmd=$GOPATH/src/k8s.io/perf-tests/run-e2e.sh
-      - --test-cmd-args=cluster-loader2
-      - --test-cmd-args=--experimental-gcp-snapshot-prometheus-disk=true
-      - --test-cmd-args=--experimental-prometheus-disk-snapshot-name=$(JOB_NAME)-$(BUILD_ID)
-      - --test-cmd-args=--experimental-prometheus-snapshot-to-report-dir=true
-      - --test-cmd-args=--nodes=5000
-      - --test-cmd-args=--prometheus-scrape-node-exporter
-      - --test-cmd-args=--provider=gce
-      - --test-cmd-args=--report-dir=$(ARTIFACTS)
-      - --test-cmd-args=--testconfig=testing/load/config.yaml
-      - --test-cmd-args=--testconfig=testing/huge-service/config.yaml
-      - --test-cmd-args=--testconfig=testing/access-tokens/config.yaml
-      - --test-cmd-args=--testoverrides=./testing/experiments/enable_restart_count_check.yaml
-      - --test-cmd-args=--testoverrides=./testing/experiments/ignore_known_gce_container_restarts.yaml
-      - --test-cmd-args=--testoverrides=./testing/overrides/5000_nodes.yaml
-      - --test-cmd-name=ClusterLoaderV2
-      - --timeout=420m
-      - --use-logexporter
-      - --logexporter-gcs-path=gs://k8s-infra-scalability-tests-logs/$(JOB_NAME)/$(BUILD_ID)
-      resources:
-        requests:
-          cpu: 6
-          memory: "16Gi"
-        limits:
-          cpu: 6
-          memory: "16Gi"