Commit 02c6763

Merge pull request #7208 from GeorgianaElena/eks-upgrade
[projectpythia] EKS upgrade, k8s upgrade, separate nodepools
2 parents: 0ae74da + 5030443

5 files changed: +34, -183 lines

config/clusters/projectpythia/prod.values.yaml

Lines changed: 3 additions & 0 deletions

```diff
@@ -19,6 +19,9 @@ jupyterhub:
       templateVars:
         org:
           name: ProjectPythia
+  singleuser:
+    nodeSelector:
+      2i2c/hub-name: prod
   hub:
     config:
       GitHubOAuthenticator:
```
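This pins prod user servers to the prod hub's own nodepool: the chart applies `singleuser.nodeSelector` to every user pod, so only nodes labeled `2i2c/hub-name: prod` are eligible to run them. The staging and pythia-binder files below receive the matching change for their hubs. A minimal sketch of the pod spec fragment this yields, assuming no other nodeSelector entries are configured for this hub:

```yaml
# Sketch: the scheduling constraint rendered onto prod user pods
# (assumes no other nodeSelector entries are configured for this hub)
spec:
  nodeSelector:
    2i2c/hub-name: prod # only nodes carrying this label are eligible
```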

config/clusters/projectpythia/pythia-binder.values.yaml

Lines changed: 1 addition & 0 deletions

```diff
@@ -19,6 +19,7 @@ jupyterhub:
       # Schedule users on the smallest instance
       # https://github.com/2i2c-org/infrastructure/issues/4241
       node.kubernetes.io/instance-type: r5.xlarge
+      2i2c/hub-name: pythia-binder
     storage:
       type: none
       extraVolumeMounts: []
```
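This hub already pinned user pods to the smallest instance via `node.kubernetes.io/instance-type`, so here the hub-name label joins an existing selector. Kubernetes ANDs all nodeSelector entries, meaning a pythia-binder pod can only land on a node that satisfies both:

```yaml
# Sketch: the combined selector on pythia-binder user pods;
# the scheduler requires a node matching every entry
nodeSelector:
  node.kubernetes.io/instance-type: r5.xlarge
  2i2c/hub-name: pythia-binder
```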

config/clusters/projectpythia/staging.values.yaml

Lines changed: 3 additions & 0 deletions

```diff
@@ -17,6 +17,9 @@ jupyterhub:
       templateVars:
         org:
           name: projectpythia staging
+  singleuser:
+    nodeSelector:
+      2i2c/hub-name: staging
   hub:
     config:
       GitHubOAuthenticator:
```

config/clusters/projectpythia/support.values.yaml

Lines changed: 5 additions & 0 deletions

```diff
@@ -35,3 +35,8 @@ cluster-autoscaler:
 
 calico:
   enabled: true
+
+# FIXME: remove this once eksctl is fixed
+nvidiaDevicePlugin:
+  aws:
+    enabled: true
```
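The `nvidiaDevicePlugin.aws` toggle (marked FIXME as a stopgap until eksctl is fixed) presumably deploys NVIDIA's device plugin so that GPU nodes, such as this cluster's `g4dn.xlarge` nodegroup, advertise the `nvidia.com/gpu` extended resource. Pods then claim a GPU with a standard Kubernetes resource request, for example:

```yaml
# Sketch: a standard Kubernetes GPU request; schedulable once the
# device plugin advertises nvidia.com/gpu on the GPU nodes
resources:
  limits:
    nvidia.com/gpu: 1
```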

eksctl/projectpythia.jsonnet

Lines changed: 22 additions & 183 deletions

```diff
@@ -1,185 +1,24 @@
-/*
-    This file is a jsonnet template of a eksctl's cluster configuration file,
-    that is used with the eksctl CLI to both update and initialize an AWS EKS
-    based cluster.
-
-    This file has in turn been generated from eksctl/template.jsonnet which is
-    relevant to compare with for changes over time.
-
-    To use jsonnet to generate an eksctl configuration file from this, do:
-
-        jsonnet projectpythia.jsonnet > projectpythia.eksctl.yaml
-
-    References:
-    - https://eksctl.io/usage/schema/
-*/
-local ng = import './libsonnet/nodegroup.jsonnet';
-
-// place all cluster nodes here
-local clusterRegion = 'us-west-2';
-local masterAzs = ['us-west-2a', 'us-west-2b', 'us-west-2c'];
-local nodeAz = 'us-west-2a';
-
-// Node definitions for notebook nodes. Config here is merged
-// with our notebook node definition.
-// A `node.kubernetes.io/instance-type label is added, so pods
-// can request a particular kind of node with a nodeSelector
-local notebookNodes = [
-  { instanceType: 'r5.xlarge' },
-  { instanceType: 'r5.4xlarge' },
-  { instanceType: 'r5.16xlarge' },
-  {
-    instanceType: 'g4dn.xlarge',
-    minSize: 0,
-    tags+: {
-      'k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu': '1',
-      'k8s.io/cluster-autoscaler/node-template/label/k8s.amazonaws.com/accelerator': 'nvidia-tesla-t4',
-    },
-    taints+: {
-      'nvidia.com/gpu': 'present:NoSchedule',
-    },
-    labels+: {
-      '2i2c/has-gpu': 'true',
-      'k8s.amazonaws.com/accelerator': 'nvidia-tesla-t4',
-    },
-    // Allow provisioning GPUs across all AZs, to prevent situation where all
-    // GPUs in a single AZ are in use and no new nodes can be spawned
-    availabilityZones: masterAzs,
-  },
-];
-local daskNodes = [];
-
-
-{
-  apiVersion: 'eksctl.io/v1alpha5',
-  kind: 'ClusterConfig',
-  metadata+: {
-    name: 'projectpythia',
-    region: clusterRegion,
-    version: '1.32',
-    tags+: {
-      ManagedBy: '2i2c',
-      '2i2c.org/cluster-name': $.metadata.name,
-    },
-  },
-  availabilityZones: masterAzs,
-  iam: {
-    withOIDC: true,
-  },
-  // If you add an addon to this config, run the create addon command.
-  //
-  //    eksctl create addon --config-file=projectpythia.eksctl.yaml
-  //
-  addons: [
-    { version: 'latest', tags: $.metadata.tags } + addon
-    for addon in
-      [
-        { name: 'coredns' },
-        { name: 'kube-proxy' },
-        {
-          // vpc-cni is a Amazon maintained container networking interface
-          // (CNI), where a CNI is required for k8s networking. The aws-node
-          // DaemonSet in kube-system stems from installing this.
-          //
-          // Related docs: https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/network-plugins/
-          //               https://docs.aws.amazon.com/eks/latest/userguide/managing-vpc-cni.html
-          //
-          name: 'vpc-cni',
-          attachPolicyARNs: ['arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy'],
-          // configurationValues ref: https://github.com/aws/amazon-vpc-cni-k8s/blob/HEAD/charts/aws-vpc-cni/values.yaml
-          configurationValues: |||
-            enableNetworkPolicy: "false"
-          |||,
-        },
-        {
-          // aws-ebs-csi-driver ensures that our PVCs are bound to PVs that
-          // couple to AWS EBS based storage, without it expect to see pods
-          // mounting a PVC failing to schedule and PVC resources that are
-          // unbound.
-          //
-          // Related docs: https://docs.aws.amazon.com/eks/latest/userguide/managing-ebs-csi.html
-          //
-          name: 'aws-ebs-csi-driver',
-          wellKnownPolicies: {
-            ebsCSIController: true,
-          },
-          // We enable detailed metrics collection to watch for issues with
-          // jupyterhub-home-nfs~
-          // configurationValues ref: https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/HEAD/charts/aws-ebs-csi-driver/values.yaml
-          configurationValues: |||
-            defaultStorageClass:
-              enabled: true
-            controller:
-              enableMetrics: true
-            node:
-              enableMetrics: true
-          |||,
-        },
-      ]
+local cluster = import './libsonnet/cluster.jsonnet';
+
+local c = cluster.makeCluster(
+  name='projectpythia',
+  region='us-west-2',
+  nodeAz='us-west-2a',
+  version='1.34',
+  coreNodeInstanceType='r8i-flex.large',
+  notebookCPUInstanceTypes=[
+    'r5.xlarge',
+    'r5.4xlarge',
+    'r5.16xlarge',
   ],
-  nodeGroups: [
-    n { clusterName: $.metadata.name }
-    for n in
-      [
-        ng {
-          namePrefix: 'core',
-          nameSuffix: 'a',
-          nameIncludeInstanceType: false,
-          availabilityZones: [nodeAz],
-          instanceType: 'r5.xlarge',
-          minSize: 1,
-          maxSize: 6,
-          labels+: {
-            'hub.jupyter.org/node-purpose': 'core',
-            'k8s.dask.org/node-purpose': 'core',
-          },
-        },
-      ] + [
-        ng {
-          namePrefix: 'nb',
-          availabilityZones: [nodeAz],
-          minSize: 0,
-          maxSize: 500,
-          instanceType: n.instanceType,
-          labels+: {
-            'hub.jupyter.org/node-purpose': 'user',
-            'k8s.dask.org/node-purpose': 'scheduler',
-          },
-          taints+: {
-            'hub.jupyter.org_dedicated': 'user:NoSchedule',
-            'hub.jupyter.org/dedicated': 'user:NoSchedule',
-          },
-          tags+: {
-            '2i2c:node-purpose': 'user',
-          },
-        } + n
-        for n in notebookNodes
-      ] + (
-        if daskNodes != null then
-          [
-            ng {
-              namePrefix: 'dask',
-              availabilityZones: [nodeAz],
-              minSize: 0,
-              maxSize: 500,
-              labels+: {
-                'k8s.dask.org/node-purpose': 'worker',
-              },
-              taints+: {
-                'k8s.dask.org_dedicated': 'worker:NoSchedule',
-                'k8s.dask.org/dedicated': 'worker:NoSchedule',
-              },
-              tags+: {
-                '2i2c:node-purpose': 'worker',
-              },
-              instancesDistribution+: {
-                onDemandBaseCapacity: 0,
-                onDemandPercentageAboveBaseCapacity: 0,
-                spotAllocationStrategy: 'capacity-optimized',
-              },
-            } + n
-            for n in daskNodes
-          ] else []
-      )
+  daskInstanceTypes=[],
+  hubs=['staging', 'prod', 'pythia-binder'],
+  notebookGPUNodeGroups=[
+    {
+      instanceType: 'g4dn.xlarge',
+    },
   ],
-}
+  nodeGroupGenerations=['a'],
+);
+
+c
```
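The hand-written eksctl config is replaced by a call to a shared `makeCluster` helper, which keeps cluster definitions uniform and carries the upgrades: Kubernetes 1.34 (up from 1.32) and an `r8i-flex.large` core node (previously `r5.xlarge`). `libsonnet/cluster.jsonnet` itself is not part of this diff, so the following is an illustration only. Assuming the helper preserves the old file's output structure and stamps each hub's nodegroups with the `2i2c/hub-name` label that the values files above select, the rendered `projectpythia.eksctl.yaml` might contain fragments like:

```yaml
# Hypothetical fragment of the generated projectpythia.eksctl.yaml;
# the real output depends on libsonnet/cluster.jsonnet (not shown in this diff)
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: projectpythia
  region: us-west-2
  version: "1.34"
nodeGroups:
  - name: nb-prod-r5-xlarge-a # assumed naming: per hub, per instance type, generation 'a'
    availabilityZones: [us-west-2a]
    instanceType: r5.xlarge
    minSize: 0
    labels:
      hub.jupyter.org/node-purpose: user
      2i2c/hub-name: prod # matched by singleuser.nodeSelector in prod.values.yaml
```

One nodegroup per hub and instance type is what makes the per-hub nodeSelector pins above meaningful: staging, prod, and pythia-binder users scale on separate nodepools instead of sharing nodes.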
