Skip to content

Commit 3ac3f5b

Browse files
committed
k8s: Terraform deployment for GKE clusters
This provides a Terraform configuration for deploying our Kubernetes clusters to GKE. We deploy an identical cluster to each of a list of regions, with one small node for admin purposes (due to a requirement not to use spot instances for the main node group) and two autoscaling groups: one with small 8-core nodes for most jobs and one with bigger nodes for the more resource-intensive ones. This is different from our current scheme, where each cluster has a single node group and we direct jobs in Jenkins. With this scheme we allow the Kubernetes scheduler to place jobs, or we can still direct them to specific node sizes using nodeSelector in the jobs and the labels that are assigned to the node groups. This is a more Kubernetes way of doing things and decouples further from Jenkins. Signed-off-by: Mark Brown <[email protected]>
1 parent ffe5e7a commit 3ac3f5b

File tree

4 files changed

+170
-0
lines changed

4 files changed

+170
-0
lines changed

k8s/gke/gke.tf

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
# FIXME: For real deployment we should store the terraform state
2+
# in cloud storage rather than just the current directory, terraform
3+
# supports Google Cloud Storage directly. This means configuration
4+
# doesn't need to be on a single machine somewhere.
5+
#
6+
# See https://developer.hashicorp.com/terraform/language/settings/backends/gcs
7+
#
8+
#terraform {
9+
# backend "gcs" {
10+
# bucket = "kernelci-tf"
11+
# prefix = "workers"
12+
# NOTE: the gcs backend takes bucket/prefix; resource_group_name,
13+
# storage_account_name, container_name and key are azurerm settings.
14+
# }
15+
#}
16+
17+
#variable "gke_username" {
18+
# default = ""
19+
# description = "gke username"
20+
#}
21+
22+
#variable "gke_password" {
23+
# default = ""
24+
# description = "gke password"
25+
#}
26+
27+
# Regions to deploy into. Every resource in this configuration that uses
# for_each = local.regions is instantiated once per entry.
locals {
  regions = toset(["us-central1", "europe-west2"])
}
33+
34+
# GKE cluster
35+
resource "google_container_cluster" "primary" {
  # One cluster per entry in local.regions, named after its region.
  for_each = local.regions

  name     = "${each.key}-workers"
  location = each.key

  # We can't create a cluster with no node pool defined, but we want to only use
  # separately managed node pools. So we create the smallest possible default
  # node pool and immediately delete it.
  remove_default_node_pool = true
  initial_node_count       = 1

  # Reference the VPC and subnet resources instead of rebuilding their names
  # as strings: the resulting names are the same, but the references give
  # Terraform the dependency edges it needs to create the network and subnet
  # before the cluster (and destroy them after it).
  network    = google_compute_network.vpc[each.key].name
  subnetwork = google_compute_subnetwork.subnet[each.key].name
}
50+
51+
# Smaller nodes for most jobs
52+
resource "google_container_node_pool" "small_nodes" {
  # One pool of small workers per entry in local.regions.
  for_each = local.regions

  name     = "${each.key}-small-node-pool"
  location = each.key

  # Reference the cluster resource rather than repeating its name as a
  # string, so Terraform creates the cluster before this node pool.
  cluster = google_container_cluster.primary[each.key].name

  node_config {
    oauth_scopes = [
      "https://www.googleapis.com/auth/logging.write",
      "https://www.googleapis.com/auth/monitoring",
    ]

    # Node labels; jobs can be steered to this pool with a nodeSelector on
    # "kernelci/worker-size".
    labels = {
      "kernelci/worker"      = "worker"
      "kernelci/worker-size" = "small"
    }

    # Standard machine, 8 vCPUs, 30G memory
    machine_type = "n1-standard-8"

    # Spot VMs are the successor to preemptible VMs, so only `spot` is
    # requested instead of setting both flags on the same pool.
    spot = true

    # Network tags may only contain lowercase letters, digits and hyphens,
    # so these use '-' where the node labels above use '/'.
    tags = [
      "kernelci-worker",
      "kernelci-small-worker",
    ]

    metadata = {
      disable-legacy-endpoints = "true"
    }
  }

  autoscaling {
    min_node_count = 1
    max_node_count = 10
  }
}
89+
90+
# Bigger nodes for all*config jobs
91+
resource "google_container_node_pool" "big_nodes" {
  # One pool of big workers per entry in local.regions.
  for_each = local.regions

  name     = "${each.key}-big-node-pool"
  location = each.key

  # Reference the cluster resource rather than repeating its name as a
  # string, so Terraform creates the cluster before this node pool.
  cluster = google_container_cluster.primary[each.key].name

  node_config {
    oauth_scopes = [
      "https://www.googleapis.com/auth/logging.write",
      "https://www.googleapis.com/auth/monitoring",
    ]

    # Node labels; jobs can be steered to this pool with a nodeSelector on
    # "kernelci/worker-size".
    labels = {
      "kernelci/worker"      = "worker"
      "kernelci/worker-size" = "big"
    }

    # Standard machine, 32 vCPUs, 128G memory
    machine_type = "n2-standard-32"

    # Spot VMs are the successor to preemptible VMs, so only `spot` is
    # requested instead of setting both flags on the same pool.
    spot = true

    # Network tags may only contain lowercase letters, digits and hyphens,
    # so these use '-' where the node labels above use '/'.
    tags = [
      "kernelci-worker",
      "kernelci-big-worker",
    ]

    metadata = {
      disable-legacy-endpoints = "true"
    }
  }

  autoscaling {
    min_node_count = 1
    max_node_count = 10
  }
}

k8s/gke/outputs.tf

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Echo back the project ID this configuration was applied with.
output "project_id" {
  description = "GCloud Project ID"
  value       = var.project_id
}

k8s/gke/versions.tf

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
terraform {
  # Minimum Terraform CLI version accepted by this configuration.
  required_version = ">= 0.14"

  required_providers {
    # Google provider, pinned to one exact release.
    google = {
      version = "4.27.0"
      source  = "hashicorp/google"
    }
  }
}
11+

k8s/gke/vpc.tf

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Google Cloud project ID; passed to the google provider as its project and
# re-exported as an output.
variable "project_id" {
  description = "project id"

  # Declared explicitly so a value of the wrong shape (list, map, ...) is
  # rejected at plan time instead of reaching the provider.
  type = string
}
4+
5+
# Default region handed to the google provider. The per-region resources in
# this configuration take their region/location from local.regions instead.
variable "region" {
  description = "region"

  # Declared explicitly so a value of the wrong shape (list, map, ...) is
  # rejected at plan time instead of reaching the provider.
  type = string
}
8+
9+
# Google provider configuration: project and default region both come from
# input variables.
provider "google" {
  region  = var.region
  project = var.project_id
}
13+
14+
# VPC
15+
resource "google_compute_network" "vpc" {
  # One VPC per entry in local.regions, named "<region>-vpc".
  for_each = local.regions
  name     = "${each.key}-vpc"

  # Subnets are declared explicitly rather than auto-created.
  auto_create_subnetworks = false
}
20+
21+
# Subnet
22+
resource "google_compute_subnetwork" "subnet" {
  # One subnet per entry in local.regions, attached to that region's VPC.
  for_each = local.regions

  name   = "${each.key}-subnet"
  region = each.key

  # Every region reuses the same range; each subnet lives in its own VPC.
  ip_cidr_range = "10.10.0.0/24"

  # For a set, each.key and each.value are the same string.
  network = google_compute_network.vpc[each.key].name
}

0 commit comments

Comments
 (0)