From 77ca58274c3636a6935f607f402deb3956702482 Mon Sep 17 00:00:00 2001 From: Jacob Lerche Date: Fri, 10 May 2019 14:08:11 -0700 Subject: [PATCH 01/21] Initial progress --- deploy/gcp/.gitignore | 2 + deploy/gcp/README.md | 0 deploy/gcp/data.tf | 4 ++ deploy/gcp/main.tf | 101 ++++++++++++++++++++++++++++++++++++++++ deploy/gcp/variables.tf | 4 ++ 5 files changed, 111 insertions(+) create mode 100644 deploy/gcp/.gitignore create mode 100644 deploy/gcp/README.md create mode 100644 deploy/gcp/data.tf create mode 100644 deploy/gcp/main.tf create mode 100644 deploy/gcp/variables.tf diff --git a/deploy/gcp/.gitignore b/deploy/gcp/.gitignore new file mode 100644 index 00000000000..9e9c81cf46c --- /dev/null +++ b/deploy/gcp/.gitignore @@ -0,0 +1,2 @@ +.terraform +*.tfstate* diff --git a/deploy/gcp/README.md b/deploy/gcp/README.md new file mode 100644 index 00000000000..e69de29bb2d diff --git a/deploy/gcp/data.tf b/deploy/gcp/data.tf new file mode 100644 index 00000000000..c8abb1b2060 --- /dev/null +++ b/deploy/gcp/data.tf @@ -0,0 +1,4 @@ +data "google_compute_image" "bastion_image" { + family = "ubuntu-1804-lts" + project = "ubuntu-os-cloud" +} \ No newline at end of file diff --git a/deploy/gcp/main.tf b/deploy/gcp/main.tf new file mode 100644 index 00000000000..f9b8f5c3ef1 --- /dev/null +++ b/deploy/gcp/main.tf @@ -0,0 +1,101 @@ +variable "GCP_CREDENTIALS_PATH" {} +variable "GCP_REGION" {} +variable "GCP_PROJECT" {} + +provider "google" { + credentials = "${file("${var.GCP_CREDENTIALS_PATH}")}" + region = "${var.GCP_REGION}" +} + +locals { + credential_path = "${path.module}/credentials" + kubeconfig = "${local.credential_path}/kubeconfig_${var.cluster_name}" + key_file = "${local.credential_path}/${var.cluster_name}-node-key.pem" + bastion_key_file = "${local.credential_path}/${var.cluster_name}-bastion-key.pem" +} + +resource "null_resource" "prepare-dir" { + provisioner "local-exec" { + command = "mkdir -p ${local.credential_path}" + } +} + +resource "google_compute_network" "vpc_network" { + name = "vpc-network" + auto_create_subnetworks = false + project = "${var.GCP_PROJECT}" +} + +resource "google_compute_subnetwork" "private_subnet" { + ip_cidr_range = "10.0.1.0/24" + name = "private-subnet" + network = "${google_compute_network.vpc_network.self_link}" + project = "${var.GCP_PROJECT}" +} + +resource "google_compute_subnetwork" "public_subnet" { + ip_cidr_range = "10.0.4.0/24" + name = "public-subnet" + network = "${google_compute_network.vpc_network.self_link}" + project = "${var.GCP_PROJECT}" +} + +resource "google_container_cluster" "cluster" { + name = "the-cluster" // turn this into var + network = "${google_compute_network.vpc_network.self_link}" + subnetwork = "${google_compute_subnetwork.private_subnet.self_link}" + location = "${var.GCP_REGION}" + project = "${var.GCP_PROJECT}" + + private_cluster_config { + enable_private_endpoint = false + enable_private_nodes = true + master_ipv4_cidr_block = "172.16.0.0/28" + } + + ip_allocation_policy { + use_ip_aliases = true + } + +// remove_default_node_pool = true + initial_node_count = 1 +} + +//resource "google_container_node_pool" "node_pool" { +// cluster = "${google_container_cluster.cluster.name}" +// location = "${google_container_cluster.cluster.location}" +// name = "the-node-pool" +// initial_node_count = "3" +// +// node_config { +// machine_type = "n1-standard-1" +// } +// +//} + +resource "tls_private_key" "bastion" { + algorithm = "RSA" + rsa_bits = 4096 +} + +resource "google_compute_project_metadata_item" "ssh-key" { + key 
= "ssh-key" + value = "${tls_private_key.bastion.public_key_openssh}" +} + +resource "google_compute_instance" "bastion" { + machine_type = "f1-micro" + name = "bastion" + "boot_disk" { + initialize_params { + image = "ubuntu-os-cloud/ubuntu-1804-lts" + } + } + "network_interface" { + subnetwork = "${google_compute_subnetwork.public_subnet.self_link}" + } + + metadata { + ssh-key = "${}" + } +} \ No newline at end of file diff --git a/deploy/gcp/variables.tf b/deploy/gcp/variables.tf new file mode 100644 index 00000000000..c9f16091fc8 --- /dev/null +++ b/deploy/gcp/variables.tf @@ -0,0 +1,4 @@ +variable "cluster_name" { + description = "TiDB clustername" + default = "tidb-cluster" +} \ No newline at end of file From 2b0224cf138c0f0456880cebf7a2afb877cbe1de Mon Sep 17 00:00:00 2001 From: Jacob Lerche Date: Fri, 10 May 2019 17:39:51 -0700 Subject: [PATCH 02/21] Adds infrastructure --- deploy/gcp/main.tf | 92 +++++++++++++++++++++++++++++++++------------- 1 file changed, 67 insertions(+), 25 deletions(-) diff --git a/deploy/gcp/main.tf b/deploy/gcp/main.tf index f9b8f5c3ef1..c4acef9f0c1 100644 --- a/deploy/gcp/main.tf +++ b/deploy/gcp/main.tf @@ -27,14 +27,22 @@ resource "google_compute_network" "vpc_network" { } resource "google_compute_subnetwork" "private_subnet" { - ip_cidr_range = "10.0.1.0/24" + ip_cidr_range = "172.31.252.0/22" name = "private-subnet" network = "${google_compute_network.vpc_network.self_link}" project = "${var.GCP_PROJECT}" + secondary_ip_range { + ip_cidr_range = "172.30.0.0/16" + range_name = "pods-${var.GCP_REGION}" + } + secondary_ip_range { + ip_cidr_range = "172.31.224.0/20" + range_name = "services-${var.GCP_REGION}" + } } resource "google_compute_subnetwork" "public_subnet" { - ip_cidr_range = "10.0.4.0/24" + ip_cidr_range = "172.29.252.0/22" name = "public-subnet" network = "${google_compute_network.vpc_network.self_link}" project = "${var.GCP_PROJECT}" @@ -50,40 +58,74 @@ resource "google_container_cluster" "cluster" { private_cluster_config { enable_private_endpoint = false enable_private_nodes = true - master_ipv4_cidr_block = "172.16.0.0/28" + master_ipv4_cidr_block = "172.31.64.0/28" } ip_allocation_policy { use_ip_aliases = true } -// remove_default_node_pool = true + remove_default_node_pool = true initial_node_count = 1 } -//resource "google_container_node_pool" "node_pool" { -// cluster = "${google_container_cluster.cluster.name}" -// location = "${google_container_cluster.cluster.location}" -// name = "the-node-pool" -// initial_node_count = "3" -// -// node_config { -// machine_type = "n1-standard-1" -// } -// -//} - -resource "tls_private_key" "bastion" { - algorithm = "RSA" - rsa_bits = 4096 +resource "google_container_node_pool" "pd_pool" { + project = "${var.GCP_PROJECT}" + cluster = "${google_container_cluster.cluster.name}" + location = "${google_container_cluster.cluster.location}" + name = "pd-pool" + initial_node_count = "1" + + node_config { + machine_type = "n1-standard-1" + local_ssd_count = 1 + } + +} + +resource "google_container_node_pool" "tikv_pool" { + project = "${var.GCP_PROJECT}" + cluster = "${google_container_cluster.cluster.name}" + location = "${google_container_cluster.cluster.location}" + name = "tikv-pool" + initial_node_count = "1" + + node_config { + machine_type = "n1-standard-1" + local_ssd_count = 1 + } + +} + +resource "google_container_node_pool" "tidb_pool" { + project = "${var.GCP_PROJECT}" + cluster = "${google_container_cluster.cluster.name}" + location = "${google_container_cluster.cluster.location}" + name = 
"tidb-pool" + initial_node_count = "1" + + node_config { + machine_type = "n1-standard-1" + } + } -resource "google_compute_project_metadata_item" "ssh-key" { - key = "ssh-key" - value = "${tls_private_key.bastion.public_key_openssh}" +resource "google_compute_firewall" "allow_ssh_bastion" { + name = "allow-ssh-bastion" + network = "${google_compute_network.vpc_network.self_link}" + project = "${var.GCP_PROJECT}" + + allow { + protocol = "tcp" + ports = ["22"] + } + source_ranges = ["0.0.0.0/0"] + target_tags = ["bastion"] } resource "google_compute_instance" "bastion" { + project = "${var.GCP_PROJECT}" + zone = "${var.GCP_REGION}-a" machine_type = "f1-micro" name = "bastion" "boot_disk" { @@ -93,9 +135,9 @@ resource "google_compute_instance" "bastion" { } "network_interface" { subnetwork = "${google_compute_subnetwork.public_subnet.self_link}" + access_config {} } + tags = ["bastion"] - metadata { - ssh-key = "${}" - } + metadata_startup_script = "sudo apt-get install -y mysql-client && curl -s https://packagecloud.io/install/repositories/akopytov/sysbench/script.rpm.sh | bash && sudo apt-get -y install sysbench" } \ No newline at end of file From e073432534499d56d1a075561605aef1a7a99236 Mon Sep 17 00:00:00 2001 From: Jacob Lerche Date: Tue, 14 May 2019 08:55:15 -0700 Subject: [PATCH 03/21] Some changes to main.tf and adds manifests and charts --- deploy/gcp/charts/tidb-cluster | 1 + deploy/gcp/charts/tidb-operator | 1 + deploy/gcp/main.tf | 41 ++++++++++++++++++++++++++- deploy/gcp/manifests/crd.yaml | 1 + deploy/gcp/manifests/tiller-rbac.yaml | 1 + 5 files changed, 44 insertions(+), 1 deletion(-) create mode 120000 deploy/gcp/charts/tidb-cluster create mode 120000 deploy/gcp/charts/tidb-operator create mode 120000 deploy/gcp/manifests/crd.yaml create mode 120000 deploy/gcp/manifests/tiller-rbac.yaml diff --git a/deploy/gcp/charts/tidb-cluster b/deploy/gcp/charts/tidb-cluster new file mode 120000 index 00000000000..326d3821047 --- /dev/null +++ b/deploy/gcp/charts/tidb-cluster @@ -0,0 +1 @@ +../../../charts/tidb-cluster \ No newline at end of file diff --git a/deploy/gcp/charts/tidb-operator b/deploy/gcp/charts/tidb-operator new file mode 120000 index 00000000000..a45f172da27 --- /dev/null +++ b/deploy/gcp/charts/tidb-operator @@ -0,0 +1 @@ +../../../charts/tidb-operator \ No newline at end of file diff --git a/deploy/gcp/main.tf b/deploy/gcp/main.tf index c4acef9f0c1..aa3f18a5a49 100644 --- a/deploy/gcp/main.tf +++ b/deploy/gcp/main.tf @@ -61,12 +61,25 @@ resource "google_container_cluster" "cluster" { master_ipv4_cidr_block = "172.31.64.0/28" } + master_auth { + username = "" + password = "" + } + + master_authorized_networks_config { + cidr_blocks { + cidr_block = "0.0.0.0/0" + } + } + ip_allocation_policy { use_ip_aliases = true } remove_default_node_pool = true initial_node_count = 1 + + min_master_version = "latest" } resource "google_container_node_pool" "pd_pool" { @@ -79,6 +92,7 @@ resource "google_container_node_pool" "pd_pool" { node_config { machine_type = "n1-standard-1" local_ssd_count = 1 + } } @@ -93,6 +107,7 @@ resource "google_container_node_pool" "tikv_pool" { node_config { machine_type = "n1-standard-1" local_ssd_count = 1 + } } @@ -140,4 +155,28 @@ resource "google_compute_instance" "bastion" { tags = ["bastion"] metadata_startup_script = "sudo apt-get install -y mysql-client && curl -s https://packagecloud.io/install/repositories/akopytov/sysbench/script.rpm.sh | bash && sudo apt-get -y install sysbench" -} \ No newline at end of file +} + +resource "null_resource" 
"get-credentials" { + provisioner "local-exec" { + command = "gcloud container clusters get-credentials ${google_container_cluster.cluster.name} --region ${var.GCP_REGION}" + environment { + KUBECONFIG= "${local.kubeconfig}" + } + } +} + +resource "null_resource" "setup-env" { + depends_on = ["google_container_cluster.cluster", "null_resource.get-credentials"] + + provisioner "local-exec" { + working_dir = "${path.module}" + command = < Date: Tue, 14 May 2019 12:44:23 -0700 Subject: [PATCH 04/21] Final touches, everything works --- deploy/gcp/.gitignore | 2 + deploy/gcp/data.tf | 13 +- deploy/gcp/main.tf | 123 +++++- deploy/gcp/manifests/gke-storage.yml | 1 + .../manifests/local-volume-provisioner.yaml | 128 ++++++ .../templates/tidb-cluster-values.yaml.tpl | 407 ++++++++++++++++++ deploy/gcp/variables.tf | 38 +- 7 files changed, 695 insertions(+), 17 deletions(-) create mode 120000 deploy/gcp/manifests/gke-storage.yml create mode 100644 deploy/gcp/manifests/local-volume-provisioner.yaml create mode 100644 deploy/gcp/templates/tidb-cluster-values.yaml.tpl diff --git a/deploy/gcp/.gitignore b/deploy/gcp/.gitignore index 9e9c81cf46c..955562c1e46 100644 --- a/deploy/gcp/.gitignore +++ b/deploy/gcp/.gitignore @@ -1,2 +1,4 @@ .terraform *.tfstate* +credentials +rendered diff --git a/deploy/gcp/data.tf b/deploy/gcp/data.tf index c8abb1b2060..985af921484 100644 --- a/deploy/gcp/data.tf +++ b/deploy/gcp/data.tf @@ -1,4 +1,9 @@ -data "google_compute_image" "bastion_image" { - family = "ubuntu-1804-lts" - project = "ubuntu-os-cloud" -} \ No newline at end of file +data "template_file" "tidb_cluster_values" { + template = "${file("${path.module}/templates/tidb-cluster-values.yaml.tpl")}" + vars { + cluster_version = "${var.tidb_version}" + pd_replicas = "${var.pd_count}" + tikv_replicas = "${var.tikv_count}" + tidb_replicas = "${var.tidb_count}" + } +} diff --git a/deploy/gcp/main.tf b/deploy/gcp/main.tf index aa3f18a5a49..5eda677db7c 100644 --- a/deploy/gcp/main.tf +++ b/deploy/gcp/main.tf @@ -5,13 +5,20 @@ variable "GCP_PROJECT" {} provider "google" { credentials = "${file("${var.GCP_CREDENTIALS_PATH}")}" region = "${var.GCP_REGION}" + project = "${var.GCP_PROJECT}" +} + +// required for taints on node pools +provider "google-beta" { + credentials = "${file("${var.GCP_CREDENTIALS_PATH}")}" + region = "${var.GCP_REGION}" + project = "${var.GCP_PROJECT}" } locals { credential_path = "${path.module}/credentials" kubeconfig = "${local.credential_path}/kubeconfig_${var.cluster_name}" - key_file = "${local.credential_path}/${var.cluster_name}-node-key.pem" - bastion_key_file = "${local.credential_path}/${var.cluster_name}-bastion-key.pem" + tidb_cluster_values_path = "${path.module}/rendered/tidb-cluster-values.yaml" } resource "null_resource" "prepare-dir" { @@ -55,11 +62,11 @@ resource "google_container_cluster" "cluster" { location = "${var.GCP_REGION}" project = "${var.GCP_PROJECT}" - private_cluster_config { - enable_private_endpoint = false - enable_private_nodes = true - master_ipv4_cidr_block = "172.31.64.0/28" - } +// private_cluster_config { +// enable_private_endpoint = false +// enable_private_nodes = true +// master_ipv4_cidr_block = "172.31.64.0/28" +// } master_auth { username = "" @@ -82,45 +89,91 @@ resource "google_container_cluster" "cluster" { min_master_version = "latest" } + resource "google_container_node_pool" "pd_pool" { + provider = "google-beta" project = "${var.GCP_PROJECT}" cluster = "${google_container_cluster.cluster.name}" location = 
"${google_container_cluster.cluster.location}" name = "pd-pool" - initial_node_count = "1" + initial_node_count = "${var.pd_count}" node_config { - machine_type = "n1-standard-1" + machine_type = "${var.pd_instance_type}" local_ssd_count = 1 + taint { + effect = "NO_SCHEDULE" + key = "dedicated" + value = "pd" + } + labels { + dedicated = "pd" + } + oauth_scopes = ["storage-ro", "logging-write", "monitoring"] } } resource "google_container_node_pool" "tikv_pool" { + provider = "google-beta" project = "${var.GCP_PROJECT}" cluster = "${google_container_cluster.cluster.name}" location = "${google_container_cluster.cluster.location}" name = "tikv-pool" - initial_node_count = "1" + initial_node_count = "${var.tikv_count}" node_config { - machine_type = "n1-standard-1" + machine_type = "${var.tikv_instance_type}" local_ssd_count = 1 + taint { + effect = "NO_SCHEDULE" + key = "dedicated" + value = "tikv" + } + labels { + dedicated = "tikv" + } + oauth_scopes = ["storage-ro", "logging-write", "monitoring"] } } resource "google_container_node_pool" "tidb_pool" { + provider = "google-beta" project = "${var.GCP_PROJECT}" cluster = "${google_container_cluster.cluster.name}" location = "${google_container_cluster.cluster.location}" name = "tidb-pool" + initial_node_count = "${var.tidb_count}" + + node_config { + machine_type = "${var.tidb_instance_type}" + taint { + effect = "NO_SCHEDULE" + key = "dedicated" + value = "tidb" + } + labels { + dedicated = "tidb" + } + tags = ["tidb"] + oauth_scopes = ["storage-ro", "logging-write", "monitoring"] + } + +} + +resource "google_container_node_pool" "monitor_pool" { + project = "${var.GCP_PROJECT}" + cluster = "${google_container_cluster.cluster.name}" + location = "${google_container_cluster.cluster.location}" + name = "monitor-pool" initial_node_count = "1" node_config { - machine_type = "n1-standard-1" + machine_type = "${var.monitor_instance_type}" + oauth_scopes = ["storage-ro", "logging-write", "monitoring"] } } @@ -138,6 +191,19 @@ resource "google_compute_firewall" "allow_ssh_bastion" { target_tags = ["bastion"] } +resource "google_compute_firewall" "allow_mysql_from_bastion" { + name = "allow-mysql-from-bastion" + network = "${google_compute_network.vpc_network.self_link}" + project = "${var.GCP_PROJECT}" + + allow { + protocol = "tcp" + ports = ["4000"] + } + source_tags = ["bastion"] + target_tags = ["tidb"] +} + resource "google_compute_instance" "bastion" { project = "${var.GCP_PROJECT}" zone = "${var.GCP_REGION}-a" @@ -166,6 +232,12 @@ resource "null_resource" "get-credentials" { } } +resource "local_file" "tidb-cluster-values" { + depends_on = ["data.template_file.tidb_cluster_values"] + filename = "${local.tidb_cluster_values_path}" + content = "${data.template_file.tidb_cluster_values.rendered}" +} + resource "null_resource" "setup-env" { depends_on = ["google_container_cluster.cluster", "null_resource.get-credentials"] @@ -174,9 +246,36 @@ resource "null_resource" "setup-env" { command = < 8, default thread pool size for coprocessors + # will be set to tikv.resources.limits.cpu * 0.8. + # readpoolCoprocessorConcurrency: 8 + + # scheduler's worker pool size, should increase it in heavy write cases, + # also should less than total cpu cores. 
+ # storageSchedulerWorkerPoolSize: 4 + +tidb: + replicas: ${tidb_replicas} + # The secret name of root password, you can create secret with following command: + # kubectl create secret generic tidb-secret --from-literal=root= --namespace= + # If unset, the root password will be empty and you can set it after connecting + # passwordSecretName: tidb-secret + # initSql is the SQL statements executed after the TiDB cluster is bootstrapped. + # initSql: |- + # create database app; + image: "pingcap/tidb:${cluster_version}" + # Image pull policy. + imagePullPolicy: IfNotPresent + logLevel: info + preparedPlanCacheEnabled: false + preparedPlanCacheCapacity: 100 + # Enable local latches for transactions. Enable it when + # there are lots of conflicts between transactions. + txnLocalLatchesEnabled: false + txnLocalLatchesCapacity: "10240000" + # The limit of concurrent executed sessions. + tokenLimit: "1000" + # Set the memory quota for a query in bytes. Default: 32GB + memQuotaQuery: "34359738368" + # The limitation of the number for the entries in one transaction. + # If using TiKV as the storage, the entry represents a key/value pair. + # WARNING: Do not set the value too large, otherwise it will make a very large impact on the TiKV cluster. + # Please adjust this configuration carefully. + txnEntryCountLimit: "300000" + # The limitation of the size in byte for the entries in one transaction. + # If using TiKV as the storage, the entry represents a key/value pair. + # WARNING: Do not set the value too large, otherwise it will make a very large impact on the TiKV cluster. + # Please adjust this configuration carefully. + txnTotalSizeLimit: "104857600" + # enableBatchDml enables batch commit for the DMLs + enableBatchDml: false + # check mb4 value in utf8 is used to control whether to check the mb4 characters when the charset is utf8. + checkMb4ValueInUtf8: true + # treat-old-version-utf8-as-utf8mb4 use for upgrade compatibility. Set to true will treat old version table/column UTF8 charset as UTF8MB4. + treatOldVersionUtf8AsUtf8mb4: true + # lease is schema lease duration, very dangerous to change only if you know what you do. + lease: 45s + # Max CPUs to use, 0 use number of CPUs in the machine. + maxProcs: 0 + resources: + limits: {} + # cpu: 16000m + # memory: 16Gi + requests: {} + # cpu: 12000m + # memory: 12Gi + nodeSelector: + dedicated: tidb + # kind: tidb + # zone: cn-bj1-01,cn-bj1-02 + # region: cn-bj1 + tolerations: + - key: dedicated + operator: Equal + value: tidb + effect: "NoSchedule" + maxFailoverCount: 3 + service: + type: LoadBalancer + exposeStatus: true + annotations: + cloud.google.com/load-balancer-type: "Internal" + # separateSlowLog: true + slowLogTailer: + image: busybox:1.26.2 + resources: + limits: + cpu: 100m + memory: 50Mi + requests: + cpu: 20m + memory: 5Mi + + # tidb plugin configuration + plugin: + # enable plugin or not + enable: false + # the start argument to specify the folder containing + directory: /plugins + # the start argument to specify the plugin id (name "-" version) that needs to be loaded, e.g. 'conn_limit-1'. + list: ["whitelist-1"] + +# mysqlClient is used to set password for TiDB +# it must has Python MySQL client installed +mysqlClient: + image: tnir/mysqlclient + imagePullPolicy: IfNotPresent + +monitor: + create: true + # Also see rbac.create + # If you set rbac.create to false, you need to provide a value here. + # If you set rbac.create to true, you should leave this empty. 
+ # serviceAccount: + persistent: true + storageClassName: pd-ssd + storage: 500Gi + grafana: + create: true + image: grafana/grafana:6.0.1 + imagePullPolicy: IfNotPresent + logLevel: info + resources: + limits: {} + # cpu: 8000m + # memory: 8Gi + requests: {} + # cpu: 4000m + # memory: 4Gi + username: admin + password: admin + config: + # Configure Grafana using environment variables except GF_PATHS_DATA, GF_SECURITY_ADMIN_USER and GF_SECURITY_ADMIN_PASSWORD + # Ref https://grafana.com/docs/installation/configuration/#using-environment-variables + GF_AUTH_ANONYMOUS_ENABLED: "true" + GF_AUTH_ANONYMOUS_ORG_NAME: "Main Org." + GF_AUTH_ANONYMOUS_ORG_ROLE: "Viewer" + # if grafana is running behind a reverse proxy with subpath http://foo.bar/grafana + # GF_SERVER_DOMAIN: foo.bar + # GF_SERVER_ROOT_URL: "%(protocol)s://%(domain)s/grafana/" + service: + type: LoadBalancer + prometheus: + image: prom/prometheus:v2.2.1 + imagePullPolicy: IfNotPresent + logLevel: info + resources: + limits: {} + # cpu: 8000m + # memory: 8Gi + requests: {} + # cpu: 4000m + # memory: 4Gi + service: + type: NodePort + reserveDays: 12 + # alertmanagerURL: "" + nodeSelector: {} + # kind: monitor + # zone: cn-bj1-01,cn-bj1-02 + # region: cn-bj1 + tolerations: [] + # - key: node-role + # operator: Equal + # value: tidb + # effect: "NoSchedule" + +binlog: + pump: + create: false + replicas: 1 + image: "pingcap/tidb-binlog:${cluster_version}" + imagePullPolicy: IfNotPresent + logLevel: info + # storageClassName is a StorageClass provides a way for administrators to describe the "classes" of storage they offer. + # different classes might map to quality-of-service levels, or to backup policies, + # or to arbitrary policies determined by the cluster administrators. + # refer to https://kubernetes.io/docs/concepts/storage/storage-classes + storageClassName: local-storage + storage: 10Gi + syncLog: true + # a integer value to control expiry date of the binlog data, indicates for how long (in days) the binlog data would be stored. + # must bigger than 0 + gc: 7 + # number of seconds between heartbeat ticks (in 2 seconds) + heartbeatInterval: 2 + + drainer: + create: false + image: "pingcap/tidb-binlog:${cluster_version}" + imagePullPolicy: IfNotPresent + logLevel: info + # storageClassName is a StorageClass provides a way for administrators to describe the "classes" of storage they offer. + # different classes might map to quality-of-service levels, or to backup policies, + # or to arbitrary policies determined by the cluster administrators. 
+ # refer to https://kubernetes.io/docs/concepts/storage/storage-classes + storageClassName: local-storage + storage: 10Gi + # parallel worker count (default 16) + workerCount: 16 + # the interval time (in seconds) of detect pumps' status (default 10) + detectInterval: 10 + # disbale detect causality + disableDetect: false + # disable dispatching sqls that in one same binlog; if set true, work-count and txn-batch would be useless + disableDispatch: false + # # disable sync these schema + ignoreSchemas: "INFORMATION_SCHEMA,PERFORMANCE_SCHEMA,mysql,test" + # if drainer donesn't have checkpoint, use initial commitTS to initial checkpoint + initialCommitTs: 0 + # enable safe mode to make syncer reentrant + safeMode: false + # number of binlog events in a transaction batch (default 20) + txnBatch: 20 + # downstream storage, equal to --dest-db-type + # valid values are "mysql", "pb", "kafka" + destDBType: pb + mysql: {} + # host: "127.0.0.1" + # user: "root" + # password: "" + # port: 3306 + # # Time and size limits for flash batch write + # timeLimit: "30s" + # sizeLimit: "100000" + kafka: {} + # only need config one of zookeeper-addrs and kafka-addrs, will get kafka address if zookeeper-addrs is configed. + # zookeeperAddrs: "127.0.0.1:2181" + # kafkaAddrs: "127.0.0.1:9092" + # kafkaVersion: "0.8.2.0" + +scheduledBackup: + create: false + binlogImage: "pingcap/tidb-binlog:${cluster_version}" + binlogImagePullPolicy: IfNotPresent + # https://github.com/tennix/tidb-cloud-backup + mydumperImage: pingcap/tidb-cloud-backup:latest + mydumperImagePullPolicy: IfNotPresent + # storageClassName is a StorageClass provides a way for administrators to describe the "classes" of storage they offer. + # different classes might map to quality-of-service levels, or to backup policies, + # or to arbitrary policies determined by the cluster administrators. + # refer to https://kubernetes.io/docs/concepts/storage/storage-classes + storageClassName: local-storage + storage: 100Gi + # https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#schedule + schedule: "0 0 * * *" + # https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#suspend + suspend: false + # https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#jobs-history-limits + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 1 + # https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#starting-deadline + startingDeadlineSeconds: 3600 + # https://github.com/maxbube/mydumper/blob/master/docs/mydumper_usage.rst#options + options: "--chunk-filesize=100" + # secretName is the name of the secret which stores user and password used for backup + # Note: you must give the user enough privilege to do the backup + # you can create the secret by: + # kubectl create secret generic backup-secret --from-literal=user=root --from-literal=password= + secretName: backup-secret + # backup to gcp + gcp: {} + # bucket: "" + # secretName is the name of the secret which stores the gcp service account credentials json file + # The service account must have read/write permission to the above bucket. 
+ # Read the following document to create the service account and download the credentials file as credentials.json: + # https://cloud.google.com/docs/authentication/production#obtaining_and_providing_service_account_credentials_manually + # And then create the secret by: kubectl create secret generic gcp-backup-secret --from-file=./credentials.json + # secretName: gcp-backup-secret + + # backup to ceph object storage + ceph: {} + # endpoint: "" + # bucket: "" + # secretName is the name of the secret which stores ceph object store access key and secret key + # You can create the secret by: + # kubectl create secret generic ceph-backup-secret --from-literal=access_key= --from-literal=secret_key= + # secretName: ceph-backup-secret + +metaInstance: "{{ $labels.instance }}" +metaType: "{{ $labels.type }}" +metaValue: "{{ $value }}" diff --git a/deploy/gcp/variables.tf b/deploy/gcp/variables.tf index c9f16091fc8..98f6565c97b 100644 --- a/deploy/gcp/variables.tf +++ b/deploy/gcp/variables.tf @@ -1,4 +1,40 @@ variable "cluster_name" { description = "TiDB clustername" default = "tidb-cluster" -} \ No newline at end of file +} + +variable "tidb_version" { + description = "TiDB version" + default = "v2.1.8" +} + +variable "pd_count" { + description = "Number of PD nodes per availability zone" + default = 1 +} + +variable "tikv_count" { + description = "Number of TiKV nodes per availability zone" + default = 1 +} + +variable "tidb_count" { + description = "Number of TiDB nodes per availability zone" + default = 1 +} + +variable "pd_instance_type" { + default = "n1-standard-1" +} + +variable "tikv_instance_type" { + default = "n1-standard-1" +} + +variable "tidb_instance_type" { + default = "n1-standard-1" +} + +variable "monitor_instance_type" { + default = "n1-standard-1" +} From d8d07c95aca0d5c6c29d8084d17f2f2f998c20cb Mon Sep 17 00:00:00 2001 From: Jacob Lerche Date: Tue, 14 May 2019 14:59:11 -0700 Subject: [PATCH 05/21] Adds readme --- deploy/gcp/README.md | 99 +++++++++++++++++++++++++++++++++++++++++ deploy/gcp/data.tf | 11 +++-- deploy/gcp/main.tf | 10 +---- deploy/gcp/outputs.tf | 19 ++++++++ deploy/gcp/variables.tf | 26 ++++++++--- 5 files changed, 149 insertions(+), 16 deletions(-) create mode 100644 deploy/gcp/outputs.tf diff --git a/deploy/gcp/README.md b/deploy/gcp/README.md index e69de29bb2d..ecb1144c305 100644 --- a/deploy/gcp/README.md +++ b/deploy/gcp/README.md @@ -0,0 +1,99 @@ +# Deploy TiDB Operator and TiDB cluster on GCP GKE + +## Requirements: +* [gcloud](https://cloud.google.com/sdk/install) +* [terraform](https://www.terraform.io/downloads.html) +* [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/#install-kubectl) >= 1.11 +* [helm](https://github.com/helm/helm/blob/master/docs/install.md#installing-the-helm-client) >= 2.9.0 +* [jq](https://stedolan.github.io/jq/download/) + +## Configure gcloud + +https://cloud.google.com/sdk/docs/initializing + +## Setup + +The default setup will create a new VPC, two subnetworks, and an f1-micro instance as a bastion machine. The GKE cluster is created with the following instance types as worker nodes: + +* 3 n1-standard-4 instances for PD +* 3 n1-highmem-8 instances for TiKV +* 3 n1-standard-16 instances for TiDB +* 3 n1-standard-2 instances for monitor + +> *NOTE*: The number of nodes created depends on how many availability zones there are in the chosen region. Most have 3 zones, but us-central1 has 4. See https://cloud.google.com/compute/docs/regions-zones/ for more information. 
+ +The terraform script expects three environment variables. You can let Terraform prompt you for them, or `export` them ahead of time. If you choose to export them, they are: + +* `TF_VAR_GCP_CREDENTIALS_PATH`: Path to a valid GCP credentials file +* `TF_VAR_GCP_REGION`: The region to create the resources in, for example: `us-west1` +* `TF_VAR_GCP_PROJECT`: The name of the GCP project + +It is generally considered a good idea to create a service account to be used by Terraform. See https://cloud.google.com/iam/docs/creating-managing-service-accounts for more information on how to manage them. + +The service account should have sufficient permissions to create resources in the project. The `Project Editor` primitive will accomplish this. + +If the GCP project is new, make sure the relevant APIs are enabled: + +```bash +gcloud services enable cloudresourcemanager.googleapis.com && \ +gcloud services enable cloudbilling.googleapis.com && \ +gcloud services enable iam.googleapis.com && \ +gcloud services enable compute.googleapis.com && \ +gcloud services enable container.googleapis.com +``` + +Now we can launch the script: + +```bash +git clone https://github.com/pingcap/tidb-operator +$ cd tidb-operator/deploy/gcp +$ terraform init +$ terraform apply +``` + +After `terraform apply` is successful, the TiDB cluster can be accessed by SSHing into the bastion machine and connecting via MySQL: +```bash +gcloud compute ssh bastion --zone +mysql -h -P 4000 -u root +``` + +It is possible to interact with the cluster using `kubectl` and `helm` with the kubeconfig file `credentials/kubeconfig_`. The default `cluster_name` is `my-cluster`, it can be changed in `variables.tf` +```bash +# By specifying --kubeconfig argument +kubectl --kubeconfig credentials/kubeconfig_ get po -n tidb +helm --kubeconfig credentials/kubeconfig_ ls + +# Or setting KUBECONFIG environment variable +export KUBECONFIG=$PWD/credentials/kubeconfig_ +kubectl get po -n tidb +helm ls +``` + +When done, the infrastructure can be torn down by running `terraform destroy` + + +> *NOTE*: Any provisioned disks will have to be manually deleted after `terraform destroy`, assuming you do not need the data on the volumes anymore. + +## Upgrade TiDB cluster + +To upgrade TiDB cluster, modify `tidb_version` variable to a higher version in variables.tf and run `terraform apply`. + +> *Note*: The upgrading doesn't finish immediately. You can watch the upgrading process by `watch kubectl --kubeconfig credentials/kubeconfig_ get po -n tidb` + +## Scale TiDB cluster + +To scale TiDB cluster, modify `tikv_count` or `tidb_count` to your desired count, and then run `terraform apply`. + +> *Note*: Currently, scaling in is not supported since we cannot determine which node to scale. Scaling out needs a few minutes to complete, you can watch the scaling out by `watch kubectl --kubeconfig credentials/kubeconfig_ get po -n tidb` + +> *Note*: Incrementing the node count will create a node per GCP availability zones. + +## Customize + +### Customize GCP resources + +GCP allows attaching a local SSD to any instance type that is `n1-standard-1` or greater. This allows for good customizability. + +### Customize TiDB Parameters + +Currently, there are not too many parameters exposed to be customized. However, you can modify `templates/tidb-cluster-values.yaml.tpl` before deploying. If you modify it after the cluster is created and then run `terraform apply`, it will not take effect unless the pod(s) is manually deleted. 
\ No newline at end of file diff --git a/deploy/gcp/data.tf b/deploy/gcp/data.tf index 985af921484..b7bb9bb7b7a 100644 --- a/deploy/gcp/data.tf +++ b/deploy/gcp/data.tf @@ -2,8 +2,13 @@ data "template_file" "tidb_cluster_values" { template = "${file("${path.module}/templates/tidb-cluster-values.yaml.tpl")}" vars { cluster_version = "${var.tidb_version}" - pd_replicas = "${var.pd_count}" - tikv_replicas = "${var.tikv_count}" - tidb_replicas = "${var.tidb_count}" + pd_replicas = "${var.pd_replica_count}" + tikv_replicas = "${var.tikv_replica_count}" + tidb_replicas = "${var.tidb_replica_count}" } } + +data "external" "tidb_ilb_ip" { + depends_on = ["null_resource.deploy-tidb-cluster"] + program = ["bash", "-c", "kubectl --kubeconfig ${local.kubeconfig} get svc -n tidb tidb-cluster-tidb -o json | jq '.status.loadBalancer.ingress[0]'"] +} diff --git a/deploy/gcp/main.tf b/deploy/gcp/main.tf index 5eda677db7c..667a6e8ca4b 100644 --- a/deploy/gcp/main.tf +++ b/deploy/gcp/main.tf @@ -56,18 +56,12 @@ resource "google_compute_subnetwork" "public_subnet" { } resource "google_container_cluster" "cluster" { - name = "the-cluster" // turn this into var + name = "${var.cluster_name}" network = "${google_compute_network.vpc_network.self_link}" subnetwork = "${google_compute_subnetwork.private_subnet.self_link}" location = "${var.GCP_REGION}" project = "${var.GCP_PROJECT}" -// private_cluster_config { -// enable_private_endpoint = false -// enable_private_nodes = true -// master_ipv4_cidr_block = "172.31.64.0/28" -// } - master_auth { username = "" password = "" @@ -207,7 +201,7 @@ resource "google_compute_firewall" "allow_mysql_from_bastion" { resource "google_compute_instance" "bastion" { project = "${var.GCP_PROJECT}" zone = "${var.GCP_REGION}-a" - machine_type = "f1-micro" + machine_type = "${var.bastion_instance_type}" name = "bastion" "boot_disk" { initialize_params { diff --git a/deploy/gcp/outputs.tf b/deploy/gcp/outputs.tf new file mode 100644 index 00000000000..2ca2ff58f31 --- /dev/null +++ b/deploy/gcp/outputs.tf @@ -0,0 +1,19 @@ +output "region" { + value = "${var.GCP_REGION}" +} + +output "cluster_id" { + value = "${google_container_cluster.cluster.id}" +} + +output "kubeconfig_file" { + value = "${local.kubeconfig}" +} + +output "tidb_version" { + value = "${var.tidb_version}" +} + +output "tidb_ilb_ip" { + value = "${data.external.tidb_ilb_ip.result["ip"]}" +} diff --git a/deploy/gcp/variables.tf b/deploy/gcp/variables.tf index 98f6565c97b..a4755fd6637 100644 --- a/deploy/gcp/variables.tf +++ b/deploy/gcp/variables.tf @@ -1,6 +1,6 @@ variable "cluster_name" { description = "TiDB clustername" - default = "tidb-cluster" + default = "my-cluster" } variable "tidb_version" { @@ -13,6 +13,18 @@ variable "pd_count" { default = 1 } +variable "pd_replica_count" { + default = 3 +} + +variable "tikv_replica_count" { + default = 3 +} + +variable "tidb_replica_count" { + default = 3 +} + variable "tikv_count" { description = "Number of TiKV nodes per availability zone" default = 1 @@ -24,17 +36,21 @@ variable "tidb_count" { } variable "pd_instance_type" { - default = "n1-standard-1" + default = "n1-standard-4" } variable "tikv_instance_type" { - default = "n1-standard-1" + default = "n1-highmem-8" } variable "tidb_instance_type" { - default = "n1-standard-1" + default = "n1-standard-16" } variable "monitor_instance_type" { - default = "n1-standard-1" + default = "n1-standard-2" +} + +variable "bastion_instance_type" { + default = "f1-micro" } From 9fd192774cced4486071c09b0ba01500404cf81a Mon Sep 17 
00:00:00 2001 From: Jacob Lerche Date: Tue, 14 May 2019 15:50:21 -0700 Subject: [PATCH 06/21] Cleans up README a little --- deploy/gcp/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deploy/gcp/README.md b/deploy/gcp/README.md index ecb1144c305..664b34416ad 100644 --- a/deploy/gcp/README.md +++ b/deploy/gcp/README.md @@ -46,9 +46,9 @@ Now we can launch the script: ```bash git clone https://github.com/pingcap/tidb-operator -$ cd tidb-operator/deploy/gcp -$ terraform init -$ terraform apply +cd tidb-operator/deploy/gcp +terraform init +terraform apply ``` After `terraform apply` is successful, the TiDB cluster can be accessed by SSHing into the bastion machine and connecting via MySQL: From 35b9056b34d6fe3eab9f254edb83be6ef33fe11a Mon Sep 17 00:00:00 2001 From: Jacob Lerche Date: Tue, 14 May 2019 20:03:32 -0700 Subject: [PATCH 07/21] Adds firewall rule to allow ssh from bastion to GKE nodes --- deploy/gcp/main.tf | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/deploy/gcp/main.tf b/deploy/gcp/main.tf index 667a6e8ca4b..7b9c9f1b7c4 100644 --- a/deploy/gcp/main.tf +++ b/deploy/gcp/main.tf @@ -103,7 +103,7 @@ resource "google_container_node_pool" "pd_pool" { labels { dedicated = "pd" } - + tags = ["pd"] oauth_scopes = ["storage-ro", "logging-write", "monitoring"] } @@ -128,6 +128,7 @@ resource "google_container_node_pool" "tikv_pool" { labels { dedicated = "tikv" } + tags = ["tikv"] oauth_scopes = ["storage-ro", "logging-write", "monitoring"] } @@ -167,6 +168,7 @@ resource "google_container_node_pool" "monitor_pool" { node_config { machine_type = "${var.monitor_instance_type}" + tags = ["monitor"] oauth_scopes = ["storage-ro", "logging-write", "monitoring"] } @@ -198,6 +200,19 @@ resource "google_compute_firewall" "allow_mysql_from_bastion" { target_tags = ["tidb"] } +resource "google_compute_firewall" "allow_ssh_from_bastion" { + name = "allow-mysql-from-bastion" + network = "${google_compute_network.vpc_network.self_link}" + project = "${var.GCP_PROJECT}" + + allow { + protocol = "tcp" + ports = ["22"] + } + source_tags = ["bastion"] + target_tags = ["tidb", "tikv", "pd", "monitor"] +} + resource "google_compute_instance" "bastion" { project = "${var.GCP_PROJECT}" zone = "${var.GCP_REGION}-a" From adb5854a08a561bbca8485c5ec816da9ef7e58de Mon Sep 17 00:00:00 2001 From: Jacob Lerche Date: Tue, 14 May 2019 20:05:13 -0700 Subject: [PATCH 08/21] Fixes name of ssh from bastion firewall rule --- deploy/gcp/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/gcp/main.tf b/deploy/gcp/main.tf index 7b9c9f1b7c4..d150872839f 100644 --- a/deploy/gcp/main.tf +++ b/deploy/gcp/main.tf @@ -201,7 +201,7 @@ resource "google_compute_firewall" "allow_mysql_from_bastion" { } resource "google_compute_firewall" "allow_ssh_from_bastion" { - name = "allow-mysql-from-bastion" + name = "allow-ssh-from-bastion" network = "${google_compute_network.vpc_network.self_link}" project = "${var.GCP_PROJECT}" From cad1678a6b95a3354de905240b3538d8b6ba2655 Mon Sep 17 00:00:00 2001 From: Jacob Lerche Date: Wed, 15 May 2019 08:09:03 -0700 Subject: [PATCH 09/21] Adds tweaks and formatting --- deploy/gcp/README.md | 4 +- deploy/gcp/data.tf | 9 +- deploy/gcp/main.tf | 164 ++++++++++-------- deploy/gcp/outputs.tf | 4 + .../templates/tidb-cluster-values.yaml.tpl | 2 +- deploy/gcp/variables.tf | 10 +- 6 files changed, 106 insertions(+), 87 deletions(-) diff --git a/deploy/gcp/README.md b/deploy/gcp/README.md index 664b34416ad..f66f6c5327a 
100644 --- a/deploy/gcp/README.md +++ b/deploy/gcp/README.md @@ -45,7 +45,7 @@ gcloud services enable container.googleapis.com Now we can launch the script: ```bash -git clone https://github.com/pingcap/tidb-operator +git clone --depth=1 https://github.com/pingcap/tidb-operator cd tidb-operator/deploy/gcp terraform init terraform apply @@ -82,7 +82,7 @@ To upgrade TiDB cluster, modify `tidb_version` variable to a higher version in v ## Scale TiDB cluster -To scale TiDB cluster, modify `tikv_count` or `tidb_count` to your desired count, and then run `terraform apply`. +To scale TiDB cluster, modify `tikv_count`, `tikv_replica_count`, `tidb_count`, and `tidb_replica_count` to your desired count, and then run `terraform apply`. > *Note*: Currently, scaling in is not supported since we cannot determine which node to scale. Scaling out needs a few minutes to complete, you can watch the scaling out by `watch kubectl --kubeconfig credentials/kubeconfig_ get po -n tidb` diff --git a/deploy/gcp/data.tf b/deploy/gcp/data.tf index b7bb9bb7b7a..4a6de8d9e63 100644 --- a/deploy/gcp/data.tf +++ b/deploy/gcp/data.tf @@ -1,10 +1,11 @@ data "template_file" "tidb_cluster_values" { template = "${file("${path.module}/templates/tidb-cluster-values.yaml.tpl")}" - vars { + + vars { cluster_version = "${var.tidb_version}" - pd_replicas = "${var.pd_replica_count}" - tikv_replicas = "${var.tikv_replica_count}" - tidb_replicas = "${var.tidb_replica_count}" + pd_replicas = "${var.pd_replica_count}" + tikv_replicas = "${var.tikv_replica_count}" + tidb_replicas = "${var.tidb_replica_count}" } } diff --git a/deploy/gcp/main.tf b/deploy/gcp/main.tf index d150872839f..19bdccfc4fd 100644 --- a/deploy/gcp/main.tf +++ b/deploy/gcp/main.tf @@ -4,20 +4,20 @@ variable "GCP_PROJECT" {} provider "google" { credentials = "${file("${var.GCP_CREDENTIALS_PATH}")}" - region = "${var.GCP_REGION}" - project = "${var.GCP_PROJECT}" + region = "${var.GCP_REGION}" + project = "${var.GCP_PROJECT}" } // required for taints on node pools provider "google-beta" { credentials = "${file("${var.GCP_CREDENTIALS_PATH}")}" - region = "${var.GCP_REGION}" - project = "${var.GCP_PROJECT}" + region = "${var.GCP_REGION}" + project = "${var.GCP_PROJECT}" } locals { - credential_path = "${path.module}/credentials" - kubeconfig = "${local.credential_path}/kubeconfig_${var.cluster_name}" + credential_path = "${path.module}/credentials" + kubeconfig = "${local.credential_path}/kubeconfig_${var.cluster_name}" tidb_cluster_values_path = "${path.module}/rendered/tidb-cluster-values.yaml" } @@ -28,39 +28,41 @@ resource "null_resource" "prepare-dir" { } resource "google_compute_network" "vpc_network" { - name = "vpc-network" + name = "vpc-network" auto_create_subnetworks = false - project = "${var.GCP_PROJECT}" + project = "${var.GCP_PROJECT}" } resource "google_compute_subnetwork" "private_subnet" { ip_cidr_range = "172.31.252.0/22" - name = "private-subnet" - network = "${google_compute_network.vpc_network.self_link}" - project = "${var.GCP_PROJECT}" + name = "private-subnet" + network = "${google_compute_network.vpc_network.self_link}" + project = "${var.GCP_PROJECT}" + secondary_ip_range { ip_cidr_range = "172.30.0.0/16" - range_name = "pods-${var.GCP_REGION}" + range_name = "pods-${var.GCP_REGION}" } + secondary_ip_range { ip_cidr_range = "172.31.224.0/20" - range_name = "services-${var.GCP_REGION}" + range_name = "services-${var.GCP_REGION}" } } resource "google_compute_subnetwork" "public_subnet" { ip_cidr_range = "172.29.252.0/22" - name = "public-subnet" - 
network = "${google_compute_network.vpc_network.self_link}" - project = "${var.GCP_PROJECT}" + name = "public-subnet" + network = "${google_compute_network.vpc_network.self_link}" + project = "${var.GCP_PROJECT}" } resource "google_container_cluster" "cluster" { - name = "${var.cluster_name}" - network = "${google_compute_network.vpc_network.self_link}" + name = "${var.cluster_name}" + network = "${google_compute_network.vpc_network.self_link}" subnetwork = "${google_compute_subnetwork.private_subnet.self_link}" - location = "${var.GCP_REGION}" - project = "${var.GCP_PROJECT}" + location = "${var.GCP_REGION}" + project = "${var.GCP_PROJECT}" master_auth { username = "" @@ -78,173 +80,183 @@ resource "google_container_cluster" "cluster" { } remove_default_node_pool = true - initial_node_count = 1 + initial_node_count = 1 min_master_version = "latest" } - resource "google_container_node_pool" "pd_pool" { - provider = "google-beta" - project = "${var.GCP_PROJECT}" - cluster = "${google_container_cluster.cluster.name}" - location = "${google_container_cluster.cluster.location}" - name = "pd-pool" + provider = "google-beta" + project = "${var.GCP_PROJECT}" + cluster = "${google_container_cluster.cluster.name}" + location = "${google_container_cluster.cluster.location}" + name = "pd-pool" initial_node_count = "${var.pd_count}" node_config { - machine_type = "${var.pd_instance_type}" + machine_type = "${var.pd_instance_type}" local_ssd_count = 1 + taint { effect = "NO_SCHEDULE" - key = "dedicated" - value = "pd" + key = "dedicated" + value = "pd" } + labels { dedicated = "pd" } - tags = ["pd"] + + tags = ["pd"] oauth_scopes = ["storage-ro", "logging-write", "monitoring"] } - } resource "google_container_node_pool" "tikv_pool" { - provider = "google-beta" - project = "${var.GCP_PROJECT}" - cluster = "${google_container_cluster.cluster.name}" - location = "${google_container_cluster.cluster.location}" - name = "tikv-pool" + provider = "google-beta" + project = "${var.GCP_PROJECT}" + cluster = "${google_container_cluster.cluster.name}" + location = "${google_container_cluster.cluster.location}" + name = "tikv-pool" initial_node_count = "${var.tikv_count}" node_config { - machine_type = "${var.tikv_instance_type}" + machine_type = "${var.tikv_instance_type}" local_ssd_count = 1 + taint { effect = "NO_SCHEDULE" - key = "dedicated" - value = "tikv" + key = "dedicated" + value = "tikv" } + labels { dedicated = "tikv" } - tags = ["tikv"] - oauth_scopes = ["storage-ro", "logging-write", "monitoring"] + tags = ["tikv"] + oauth_scopes = ["storage-ro", "logging-write", "monitoring"] } - } resource "google_container_node_pool" "tidb_pool" { - provider = "google-beta" - project = "${var.GCP_PROJECT}" - cluster = "${google_container_cluster.cluster.name}" - location = "${google_container_cluster.cluster.location}" - name = "tidb-pool" + provider = "google-beta" + project = "${var.GCP_PROJECT}" + cluster = "${google_container_cluster.cluster.name}" + location = "${google_container_cluster.cluster.location}" + name = "tidb-pool" initial_node_count = "${var.tidb_count}" node_config { machine_type = "${var.tidb_instance_type}" + taint { effect = "NO_SCHEDULE" - key = "dedicated" - value = "tidb" + key = "dedicated" + value = "tidb" } + labels { dedicated = "tidb" } - tags = ["tidb"] + + tags = ["tidb"] oauth_scopes = ["storage-ro", "logging-write", "monitoring"] } - } resource "google_container_node_pool" "monitor_pool" { - project = "${var.GCP_PROJECT}" - cluster = "${google_container_cluster.cluster.name}" - 
location = "${google_container_cluster.cluster.location}" - name = "monitor-pool" + project = "${var.GCP_PROJECT}" + cluster = "${google_container_cluster.cluster.name}" + location = "${google_container_cluster.cluster.location}" + name = "monitor-pool" initial_node_count = "1" node_config { machine_type = "${var.monitor_instance_type}" - tags = ["monitor"] + tags = ["monitor"] oauth_scopes = ["storage-ro", "logging-write", "monitoring"] } - } resource "google_compute_firewall" "allow_ssh_bastion" { - name = "allow-ssh-bastion" + name = "allow-ssh-bastion" network = "${google_compute_network.vpc_network.self_link}" project = "${var.GCP_PROJECT}" allow { protocol = "tcp" - ports = ["22"] + ports = ["22"] } + source_ranges = ["0.0.0.0/0"] - target_tags = ["bastion"] + target_tags = ["bastion"] } resource "google_compute_firewall" "allow_mysql_from_bastion" { - name = "allow-mysql-from-bastion" + name = "allow-mysql-from-bastion" network = "${google_compute_network.vpc_network.self_link}" project = "${var.GCP_PROJECT}" allow { protocol = "tcp" - ports = ["4000"] + ports = ["4000"] } + source_tags = ["bastion"] target_tags = ["tidb"] } resource "google_compute_firewall" "allow_ssh_from_bastion" { - name = "allow-ssh-from-bastion" + name = "allow-ssh-from-bastion" network = "${google_compute_network.vpc_network.self_link}" project = "${var.GCP_PROJECT}" allow { protocol = "tcp" - ports = ["22"] + ports = ["22"] } + source_tags = ["bastion"] target_tags = ["tidb", "tikv", "pd", "monitor"] } resource "google_compute_instance" "bastion" { - project = "${var.GCP_PROJECT}" - zone = "${var.GCP_REGION}-a" + project = "${var.GCP_PROJECT}" + zone = "${var.GCP_REGION}-a" machine_type = "${var.bastion_instance_type}" - name = "bastion" + name = "bastion" + "boot_disk" { initialize_params { image = "ubuntu-os-cloud/ubuntu-1804-lts" } } + "network_interface" { - subnetwork = "${google_compute_subnetwork.public_subnet.self_link}" - access_config {} + subnetwork = "${google_compute_subnetwork.public_subnet.self_link}" + access_config = {} } + tags = ["bastion"] - metadata_startup_script = "sudo apt-get install -y mysql-client && curl -s https://packagecloud.io/install/repositories/akopytov/sysbench/script.rpm.sh | bash && sudo apt-get -y install sysbench" + metadata_startup_script = "sudo apt-get install -y mysql-client && curl -s https://packagecloud.io/install/repositories/akopytov/sysbench/script.deb.sh | bash && sudo apt-get -y install sysbench" } resource "null_resource" "get-credentials" { provisioner "local-exec" { command = "gcloud container clusters get-credentials ${google_container_cluster.cluster.name} --region ${var.GCP_REGION}" + environment { - KUBECONFIG= "${local.kubeconfig}" + KUBECONFIG = "${local.kubeconfig}" } } } resource "local_file" "tidb-cluster-values" { depends_on = ["data.template_file.tidb_cluster_values"] - filename = "${local.tidb_cluster_values_path}" - content = "${data.template_file.tidb_cluster_values.rendered}" + filename = "${local.tidb_cluster_values_path}" + content = "${data.template_file.tidb_cluster_values.rendered}" } resource "null_resource" "setup-env" { @@ -252,6 +264,7 @@ resource "null_resource" "setup-env" { provisioner "local-exec" { working_dir = "${path.module}" + command = < Date: Wed, 15 May 2019 09:22:25 -0700 Subject: [PATCH 10/21] Adds reclaim policy to delete disks --- deploy/gcp/data.tf | 5 +++++ deploy/gcp/main.tf | 1 + deploy/gcp/outputs.tf | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/deploy/gcp/data.tf b/deploy/gcp/data.tf index 
4a6de8d9e63..6986042aec9 100644 --- a/deploy/gcp/data.tf +++ b/deploy/gcp/data.tf @@ -13,3 +13,8 @@ data "external" "tidb_ilb_ip" { depends_on = ["null_resource.deploy-tidb-cluster"] program = ["bash", "-c", "kubectl --kubeconfig ${local.kubeconfig} get svc -n tidb tidb-cluster-tidb -o json | jq '.status.loadBalancer.ingress[0]'"] } + +data "external" "monitor_ilb_ip" { + depends_on = ["null_resource.deploy-tidb-cluster"] + program = ["bash", "-c", "kubectl --kubeconfig ${local.kubeconfig} get svc -n tidb tidb-cluster-grafana -o json | jq '.status.loadBalancer.ingress[0]'"] +} diff --git a/deploy/gcp/main.tf b/deploy/gcp/main.tf index 19bdccfc4fd..f06823b32be 100644 --- a/deploy/gcp/main.tf +++ b/deploy/gcp/main.tf @@ -295,6 +295,7 @@ until kubectl get po -n tidb -lapp.kubernetes.io/component=tidb | grep Running; echo "Wait for TiDB pod running" sleep 5 done +kubectl get pv -l app.kubernetes.io/namespace=tidb -o name | xargs -I {} kubectl patch {} -p '{"spec":{"persistentVolumeReclaimPolicy":"Delete"}}' EOS environment { diff --git a/deploy/gcp/outputs.tf b/deploy/gcp/outputs.tf index 92a209fdae2..fd7ed97b21a 100644 --- a/deploy/gcp/outputs.tf +++ b/deploy/gcp/outputs.tf @@ -21,3 +21,7 @@ output "tidb_version" { output "tidb_ilb_ip" { value = "${data.external.tidb_ilb_ip.result["ip"]}" } + +output "monitor_ilb_ip" { + value = "${data.external.monitor_ilb_ip.result["ip"]}" +} From c9bcbaab0f060c0765c197469a2a720f4a10d5c5 Mon Sep 17 00:00:00 2001 From: Jacob Lerche Date: Wed, 15 May 2019 09:25:03 -0700 Subject: [PATCH 11/21] Updates readme to reflect disk reclaim policy --- deploy/gcp/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/deploy/gcp/README.md b/deploy/gcp/README.md index f66f6c5327a..5228df7d83a 100644 --- a/deploy/gcp/README.md +++ b/deploy/gcp/README.md @@ -72,8 +72,6 @@ helm ls When done, the infrastructure can be torn down by running `terraform destroy` -> *NOTE*: Any provisioned disks will have to be manually deleted after `terraform destroy`, assuming you do not need the data on the volumes anymore. - ## Upgrade TiDB cluster To upgrade TiDB cluster, modify `tidb_version` variable to a higher version in variables.tf and run `terraform apply`. 
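
For reference, the upgrade flow described in the README hunk above can be driven entirely from the command line. The sketch below is illustrative only: the `v2.1.9` tag and the default `my-cluster` name are assumed values, and it presumes the `TF_VAR_GCP_*` variables are already exported as described earlier in the README.

```bash
# Illustrative upgrade flow (assumed version tag and default cluster name).
# Override tidb_version at apply time instead of editing variables.tf:
terraform apply -var 'tidb_version=v2.1.9'

# The rolling upgrade is not instantaneous; watch the pods until they return to Running:
watch kubectl --kubeconfig credentials/kubeconfig_my-cluster get po -n tidb
```
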
From 5f300b9138ab301b880d0ca8740512010ae02ae3 Mon Sep 17 00:00:00 2001 From: Jacob Lerche Date: Wed, 15 May 2019 13:18:13 -0700 Subject: [PATCH 12/21] Adds on-destroy to change persistent volume claimpolicy to delete --- deploy/gcp/main.tf | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/deploy/gcp/main.tf b/deploy/gcp/main.tf index f06823b32be..81516da15dc 100644 --- a/deploy/gcp/main.tf +++ b/deploy/gcp/main.tf @@ -251,6 +251,16 @@ resource "null_resource" "get-credentials" { KUBECONFIG = "${local.kubeconfig}" } } + provisioner "local-exec" { + when = "destroy" + command = < Date: Thu, 16 May 2019 18:04:05 -0700 Subject: [PATCH 13/21] Removes superfluous command changing reclaim policy --- deploy/gcp/main.tf | 1 - 1 file changed, 1 deletion(-) diff --git a/deploy/gcp/main.tf b/deploy/gcp/main.tf index 81516da15dc..f833830d0ed 100644 --- a/deploy/gcp/main.tf +++ b/deploy/gcp/main.tf @@ -305,7 +305,6 @@ until kubectl get po -n tidb -lapp.kubernetes.io/component=tidb | grep Running; echo "Wait for TiDB pod running" sleep 5 done -kubectl get pv -l app.kubernetes.io/namespace=tidb -o name | xargs -I {} kubectl patch {} -p '{"spec":{"persistentVolumeReclaimPolicy":"Delete"}}' EOS environment { From f428090c22b06399d5d2024c6f71ba9d04a2d45e Mon Sep 17 00:00:00 2001 From: Jacob Lerche Date: Thu, 16 May 2019 20:31:48 -0700 Subject: [PATCH 14/21] Adds monitor node count variable --- deploy/gcp/main.tf | 5 ++++- deploy/gcp/variables.tf | 5 +++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/deploy/gcp/main.tf b/deploy/gcp/main.tf index f833830d0ed..aa906058d62 100644 --- a/deploy/gcp/main.tf +++ b/deploy/gcp/main.tf @@ -170,7 +170,7 @@ resource "google_container_node_pool" "monitor_pool" { cluster = "${google_container_cluster.cluster.name}" location = "${google_container_cluster.cluster.location}" name = "monitor-pool" - initial_node_count = "1" + initial_node_count = "${var.monitor_count}" node_config { machine_type = "${var.monitor_instance_type}" @@ -251,12 +251,15 @@ resource "null_resource" "get-credentials" { KUBECONFIG = "${local.kubeconfig}" } } + provisioner "local-exec" { when = "destroy" + command = < Date: Fri, 17 May 2019 17:02:50 -0700 Subject: [PATCH 15/21] Adds startup script daemonset to properly change open fd on pd and tikv nodes --- deploy/gcp/main.tf | 3 ++ deploy/gcp/manifests/startup-script.yaml | 41 ++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 deploy/gcp/manifests/startup-script.yaml diff --git a/deploy/gcp/main.tf b/deploy/gcp/main.tf index aa906058d62..87e873a1ee6 100644 --- a/deploy/gcp/main.tf +++ b/deploy/gcp/main.tf @@ -95,6 +95,7 @@ resource "google_container_node_pool" "pd_pool" { node_config { machine_type = "${var.pd_instance_type}" + image_type = "UBUNTU" local_ssd_count = 1 taint { @@ -122,6 +123,7 @@ resource "google_container_node_pool" "tikv_pool" { node_config { machine_type = "${var.tikv_instance_type}" + image_type = "UBUNTU" local_ssd_count = 1 taint { @@ -282,6 +284,7 @@ resource "null_resource" "setup-env" { kubectl create clusterrolebinding cluster-admin-binding --clusterrole cluster-admin --user $$(gcloud config get-value account) kubectl create serviceaccount --namespace kube-system tiller kubectl apply -f manifests/crd.yaml +kubectl apply -f manifests/startup-script.yaml kubectl apply -f manifests/local-volume-provisioner.yaml kubectl apply -f manifests/gke-storage.yml kubectl apply -f manifests/tiller-rbac.yaml diff --git a/deploy/gcp/manifests/startup-script.yaml 
b/deploy/gcp/manifests/startup-script.yaml new file mode 100644 index 00000000000..eb734aae896 --- /dev/null +++ b/deploy/gcp/manifests/startup-script.yaml @@ -0,0 +1,41 @@ +kind: DaemonSet +apiVersion: extensions/v1beta1 +metadata: + name: startup-script + namespace: kube-system + labels: + app: startup-script +spec: + template: + metadata: + labels: + app: startup-script + spec: + tolerations: + - key: dedicated + operator: Equal + value: pd + effect: "NoSchedule" + - key: dedicated + operator: Equal + value: tikv + effect: "NoSchedule" + hostPID: true + containers: + - name: startup-script + image: gcr.io/google-containers/startup-script:v1 + imagePullPolicy: Always + securityContext: + privileged: true + env: + - name: STARTUP_SCRIPT + value: | + #! /bin/bash + set -o errexit + set -o pipefail + set -o nounset + echo 'root soft nofile 1000000' >> /etc/security/limits.d/99-tidb.conf + echo 'root hard nofile 1000000' >> /etc/security/limits.d/99-tidb.conf + echo 'root soft core unlimited' >> /etc/security/limits.d/99-tidb.conf + echo 'root soft stack 10240' >> /etc/security/limits.d/99-tidb.conf + echo done From 157f039ab32d05dce0839d83fd7ecc67eea02bd8 Mon Sep 17 00:00:00 2001 From: Jacob Lerche Date: Fri, 17 May 2019 21:36:17 -0700 Subject: [PATCH 16/21] Streamlines startup daemonset and adds Linux Guest Environment installation --- deploy/gcp/data.tf | 19 ++++-- deploy/gcp/main.tf | 32 +++++++--- deploy/gcp/manifests/startup-script.yaml | 62 ++++++++++++------- deploy/gcp/outputs.tf | 8 +++ .../templates/tidb-cluster-values.yaml.tpl | 2 +- deploy/gcp/variables.tf | 11 +++- 6 files changed, 94 insertions(+), 40 deletions(-) diff --git a/deploy/gcp/data.tf b/deploy/gcp/data.tf index 6986042aec9..0595c5e681a 100644 --- a/deploy/gcp/data.tf +++ b/deploy/gcp/data.tf @@ -2,10 +2,11 @@ data "template_file" "tidb_cluster_values" { template = "${file("${path.module}/templates/tidb-cluster-values.yaml.tpl")}" vars { - cluster_version = "${var.tidb_version}" - pd_replicas = "${var.pd_replica_count}" - tikv_replicas = "${var.tikv_replica_count}" - tidb_replicas = "${var.tidb_replica_count}" + cluster_version = "${var.tidb_version}" + pd_replicas = "${var.pd_replica_count}" + tikv_replicas = "${var.tikv_replica_count}" + tidb_replicas = "${var.tidb_replica_count}" + operator_version = "${var.tidb_operator_version}" } } @@ -18,3 +19,13 @@ data "external" "monitor_ilb_ip" { depends_on = ["null_resource.deploy-tidb-cluster"] program = ["bash", "-c", "kubectl --kubeconfig ${local.kubeconfig} get svc -n tidb tidb-cluster-grafana -o json | jq '.status.loadBalancer.ingress[0]'"] } + +data "external" "tidb_port" { + depends_on = ["null_resource.deploy-tidb-cluster"] + program = ["bash", "-c", "kubectl --kubeconfig ${local.kubeconfig} get svc -n tidb tidb-cluster-tidb -o json | jq '.spec.ports | .[] | select( .name == \"mysql-client\") | {port: .port|tostring}'"] +} + +data "external" "monitor_port" { + depends_on = ["null_resource.deploy-tidb-cluster"] + program = ["bash", "-c", "kubectl --kubeconfig ${local.kubeconfig} get svc -n tidb tidb-cluster-grafana -o json | jq '.spec.ports | .[] | select( .name == \"grafana\") | {port: .port|tostring}'"] +} diff --git a/deploy/gcp/main.tf b/deploy/gcp/main.tf index 87e873a1ee6..6d029256626 100644 --- a/deploy/gcp/main.tf +++ b/deploy/gcp/main.tf @@ -36,7 +36,7 @@ resource "google_compute_network" "vpc_network" { resource "google_compute_subnetwork" "private_subnet" { ip_cidr_range = "172.31.252.0/22" name = "private-subnet" - network = 
"${google_compute_network.vpc_network.self_link}" + network = "${google_compute_network.vpc_network.name}" project = "${var.GCP_PROJECT}" secondary_ip_range { @@ -48,25 +48,34 @@ resource "google_compute_subnetwork" "private_subnet" { ip_cidr_range = "172.31.224.0/20" range_name = "services-${var.GCP_REGION}" } + + lifecycle { + ignore_changes = ["secondary_ip_range"] + } } resource "google_compute_subnetwork" "public_subnet" { ip_cidr_range = "172.29.252.0/22" name = "public-subnet" - network = "${google_compute_network.vpc_network.self_link}" + network = "${google_compute_network.vpc_network.name}" project = "${var.GCP_PROJECT}" } resource "google_container_cluster" "cluster" { name = "${var.cluster_name}" - network = "${google_compute_network.vpc_network.self_link}" - subnetwork = "${google_compute_subnetwork.private_subnet.self_link}" + network = "${google_compute_network.vpc_network.name}" + subnetwork = "${google_compute_subnetwork.private_subnet.name}" location = "${var.GCP_REGION}" project = "${var.GCP_PROJECT}" master_auth { username = "" password = "" + + // due to https://github.com/terraform-providers/terraform-provider-google/issues/3369 + client_certificate_config { + issue_client_certificate = false + } } master_authorized_networks_config { @@ -83,6 +92,10 @@ resource "google_container_cluster" "cluster" { initial_node_count = 1 min_master_version = "latest" + + lifecycle { + ignore_changes = ["master_auth"] // see above linked issue + } } resource "google_container_node_pool" "pd_pool" { @@ -95,7 +108,7 @@ resource "google_container_node_pool" "pd_pool" { node_config { machine_type = "${var.pd_instance_type}" - image_type = "UBUNTU" + image_type = "UBUNTU" local_ssd_count = 1 taint { @@ -123,7 +136,7 @@ resource "google_container_node_pool" "tikv_pool" { node_config { machine_type = "${var.tikv_instance_type}" - image_type = "UBUNTU" + image_type = "UBUNTU" local_ssd_count = 1 taint { @@ -259,7 +272,6 @@ resource "null_resource" "get-credentials" { command = <> /etc/security/limits.d/99-tidb.conf - echo 'root hard nofile 1000000' >> /etc/security/limits.d/99-tidb.conf - echo 'root soft core unlimited' >> /etc/security/limits.d/99-tidb.conf - echo 'root soft stack 10240' >> /etc/security/limits.d/99-tidb.conf - echo done + #!/usr/bin/env bash + set -euo pipefail + apt-get update + apt-get install -y software-properties-common + apt-add-repository universe + apt-get update + declare -a PKG_LIST=(python-google-compute-engine \ + python3-google-compute-engine \ + google-compute-engine-oslogin \ + gce-compute-image-packages) + for pkg in ${PKG_LIST[@]}; do + apt-get install -y $pkg || echo "Not available: $pkg" + done + mount | grep -v nobarrier | awk '/ssd/{print $1}' | xargs -i mount {} -o remount,nobarrier + cat < /etc/security/limits.d/99-tidb.conf + root soft nofile 1000000 + root hard nofile 1000000 + root soft core unlimited + root soft stack 10240 + EOF + volumeMounts: + - mountPath: /mnt/disks + name: local-ssd + mountPropagation: Bidirectional + tolerations: + - effect: NoSchedule + operator: Exists + volumes: + - name: local-ssd + hostPath: + path: /mnt/disks \ No newline at end of file diff --git a/deploy/gcp/outputs.tf b/deploy/gcp/outputs.tf index fd7ed97b21a..69dd56a0158 100644 --- a/deploy/gcp/outputs.tf +++ b/deploy/gcp/outputs.tf @@ -25,3 +25,11 @@ output "tidb_ilb_ip" { output "monitor_ilb_ip" { value = "${data.external.monitor_ilb_ip.result["ip"]}" } + +output "how_to_ssh_to_bastion" { + value = "gcloud compute ssh bastion --zone ${var.GCP_REGION}-a" +} + 
+output "how_to_connect_to_mysql_from_bastion" { + value = "mysql -h ${data.external.tidb_ilb_ip.result["ip"]} -P ${data.external.tidb_port.result["port"]} -u root" +} diff --git a/deploy/gcp/templates/tidb-cluster-values.yaml.tpl b/deploy/gcp/templates/tidb-cluster-values.yaml.tpl index 5ec677f06cf..496c786f071 100644 --- a/deploy/gcp/templates/tidb-cluster-values.yaml.tpl +++ b/deploy/gcp/templates/tidb-cluster-values.yaml.tpl @@ -30,7 +30,7 @@ services: type: ClusterIP discovery: - image: pingcap/tidb-operator:v1.0.0-beta.2 + image: pingcap/tidb-operator:${operator_version} imagePullPolicy: IfNotPresent resources: limits: diff --git a/deploy/gcp/variables.tf b/deploy/gcp/variables.tf index 99097fdac9c..ec01b8d6320 100644 --- a/deploy/gcp/variables.tf +++ b/deploy/gcp/variables.tf @@ -8,9 +8,9 @@ variable "tidb_version" { default = "v2.1.8" } -variable "pd_count" { - description = "Number of PD nodes per availability zone" - default = 1 +variable "tidb_operator_version" { + description = "TiDB operator version" + default = "v1.0.0-beta.2" } variable "pd_replica_count" { @@ -25,6 +25,11 @@ variable "tidb_replica_count" { default = 3 } +variable "pd_count" { + description = "Number of PD nodes per availability zone" + default = 1 +} + variable "tikv_count" { description = "Number of TiKV nodes per availability zone" default = 1 From 2b145ab4f4bb622bc42caf49c486d872f13a22a2 Mon Sep 17 00:00:00 2001 From: Jacob Lerche Date: Mon, 20 May 2019 20:25:22 -0700 Subject: [PATCH 17/21] Adds note about default set up exceeding GCP default cpu quota --- deploy/gcp/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/deploy/gcp/README.md b/deploy/gcp/README.md index 5228df7d83a..9877b7d43a2 100644 --- a/deploy/gcp/README.md +++ b/deploy/gcp/README.md @@ -21,6 +21,7 @@ The default setup will create a new VPC, two subnetworks, and an f1-micro instan * 3 n1-standard-2 instances for monitor > *NOTE*: The number of nodes created depends on how many availability zones there are in the chosen region. Most have 3 zones, but us-central1 has 4. See https://cloud.google.com/compute/docs/regions-zones/ for more information. +> *NOTE*: The default setup, as listed above, will exceed the default CPU quota of a GCP project. The terraform script expects three environment variables. You can let Terraform prompt you for them, or `export` them ahead of time. If you choose to export them, they are: From 1c5b36da692002fa52b15ed25af19cf16f2dfaa6 Mon Sep 17 00:00:00 2001 From: Jacob Lerche Date: Mon, 20 May 2019 20:40:30 -0700 Subject: [PATCH 18/21] Adds link to GCP quota documentation page --- deploy/gcp/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/gcp/README.md b/deploy/gcp/README.md index 9877b7d43a2..a6ea7e7fd16 100644 --- a/deploy/gcp/README.md +++ b/deploy/gcp/README.md @@ -21,7 +21,7 @@ The default setup will create a new VPC, two subnetworks, and an f1-micro instan * 3 n1-standard-2 instances for monitor > *NOTE*: The number of nodes created depends on how many availability zones there are in the chosen region. Most have 3 zones, but us-central1 has 4. See https://cloud.google.com/compute/docs/regions-zones/ for more information. -> *NOTE*: The default setup, as listed above, will exceed the default CPU quota of a GCP project. +> *NOTE*: The default setup, as listed above, will exceed the default CPU quota of a GCP project. To increase your project's quota, please follow the instructions [here](https://cloud.google.com/compute/quotas). 
The default setup will require at least 91 CPUs, more if you need to scale out. The terraform script expects three environment variables. You can let Terraform prompt you for them, or `export` them ahead of time. If you choose to export them, they are: From 88f32c6be66f27c3beb4388629419b09cee96036 Mon Sep 17 00:00:00 2001 From: Jacob Lerche Date: Mon, 20 May 2019 20:41:36 -0700 Subject: [PATCH 19/21] Fixes formatting --- deploy/gcp/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/deploy/gcp/README.md b/deploy/gcp/README.md index a6ea7e7fd16..c05486a1f83 100644 --- a/deploy/gcp/README.md +++ b/deploy/gcp/README.md @@ -21,6 +21,7 @@ The default setup will create a new VPC, two subnetworks, and an f1-micro instan * 3 n1-standard-2 instances for monitor > *NOTE*: The number of nodes created depends on how many availability zones there are in the chosen region. Most have 3 zones, but us-central1 has 4. See https://cloud.google.com/compute/docs/regions-zones/ for more information. + > *NOTE*: The default setup, as listed above, will exceed the default CPU quota of a GCP project. To increase your project's quota, please follow the instructions [here](https://cloud.google.com/compute/quotas). The default setup will require at least 91 CPUs, more if you need to scale out. The terraform script expects three environment variables. You can let Terraform prompt you for them, or `export` them ahead of time. If you choose to export them, they are: From 843d3e92e11bb711ee7bb6692b8b4d6ffb7ac86e Mon Sep 17 00:00:00 2001 From: Jacob Lerche Date: Tue, 21 May 2019 12:34:02 -0700 Subject: [PATCH 20/21] Adds clarifications to README, adds tidb and monitor port to outputs --- deploy/gcp/README.md | 4 ++-- deploy/gcp/main.tf | 4 ++++ deploy/gcp/outputs.tf | 8 ++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/deploy/gcp/README.md b/deploy/gcp/README.md index c05486a1f83..0715ea95af7 100644 --- a/deploy/gcp/README.md +++ b/deploy/gcp/README.md @@ -26,11 +26,11 @@ The default setup will create a new VPC, two subnetworks, and an f1-micro instan The terraform script expects three environment variables. You can let Terraform prompt you for them, or `export` them ahead of time. If you choose to export them, they are: -* `TF_VAR_GCP_CREDENTIALS_PATH`: Path to a valid GCP credentials file +* `TF_VAR_GCP_CREDENTIALS_PATH`: Path to a valid GCP credentials file. It is generally considered a good idea to create a service account to be used by Terraform. See [this page](https://cloud.google.com/iam/docs/creating-managing-service-accounts) for more information on how to manage them. See [this page](https://cloud.google.com/iam/docs/creating-managing-service-account-keys) for creating and managing service account keys which, when downloaded, will be the needed credentials file. * `TF_VAR_GCP_REGION`: The region to create the resources in, for example: `us-west1` * `TF_VAR_GCP_PROJECT`: The name of the GCP project -It is generally considered a good idea to create a service account to be used by Terraform. See https://cloud.google.com/iam/docs/creating-managing-service-accounts for more information on how to manage them. + The service account should have sufficient permissions to create resources in the project. The `Project Editor` primitive will accomplish this. 
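For readers who want a concrete starting point, here is a minimal sketch of preparing a Terraform service account and exporting the three expected variables. All names (`my-project`, `terraform-tidb`, `credentials.json`, `us-west1`) are placeholders, not values mandated by the scripts; the `roles/editor` binding corresponds to the Project Editor primitive mentioned above.
```bash
# Placeholder project ID; substitute your own.
export GCP_PROJECT=my-project

# Optional: create a dedicated service account for Terraform, grant it Project Editor,
# and download a key to use as the credentials file.
gcloud iam service-accounts create terraform-tidb --display-name "terraform-tidb"
gcloud projects add-iam-policy-binding "$GCP_PROJECT" \
  --member "serviceAccount:terraform-tidb@${GCP_PROJECT}.iam.gserviceaccount.com" \
  --role roles/editor
gcloud iam service-accounts keys create credentials.json \
  --iam-account "terraform-tidb@${GCP_PROJECT}.iam.gserviceaccount.com"

# Export the three variables the Terraform scripts expect.
export TF_VAR_GCP_CREDENTIALS_PATH="$PWD/credentials.json"
export TF_VAR_GCP_REGION=us-west1
export TF_VAR_GCP_PROJECT="$GCP_PROJECT"
```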
diff --git a/deploy/gcp/main.tf b/deploy/gcp/main.tf index 6d029256626..a10aeeb5a9a 100644 --- a/deploy/gcp/main.tf +++ b/deploy/gcp/main.tf @@ -316,6 +316,10 @@ EOS resource "null_resource" "deploy-tidb-cluster" { depends_on = ["null_resource.setup-env", "local_file.tidb-cluster-values", "google_container_node_pool.pd_pool", "google_container_node_pool.tikv_pool", "google_container_node_pool.tidb_pool"] + triggers { + values = "${data.template_file.tidb_cluster_values.rendered}" + } + provisioner "local-exec" { command = < Date: Tue, 21 May 2019 18:37:02 -0700 Subject: [PATCH 21/21] Adds section on how to delete nodes that are automatically replicated across zones in a regional cluster --- deploy/gcp/README.md | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/deploy/gcp/README.md b/deploy/gcp/README.md index 0715ea95af7..db9575b59ed 100644 --- a/deploy/gcp/README.md +++ b/deploy/gcp/README.md @@ -20,7 +20,7 @@ The default setup will create a new VPC, two subnetworks, and an f1-micro instan * 3 n1-standard-16 instances for TiDB * 3 n1-standard-2 instances for monitor -> *NOTE*: The number of nodes created depends on how many availability zones there are in the chosen region. Most have 3 zones, but us-central1 has 4. See https://cloud.google.com/compute/docs/regions-zones/ for more information. +> *NOTE*: The number of nodes created depends on how many availability zones there are in the chosen region. Most have 3 zones, but us-central1 has 4. See https://cloud.google.com/compute/docs/regions-zones/ for more information. Please refer to the `Customize` section for information on how to customize node pools in a regional cluster. > *NOTE*: The default setup, as listed above, will exceed the default CPU quota of a GCP project. To increase your project's quota, please follow the instructions [here](https://cloud.google.com/compute/quotas). The default setup will require at least 91 CPUs, more if you need to scale out. @@ -84,7 +84,7 @@ To upgrade TiDB cluster, modify `tidb_version` variable to a higher version in v To scale TiDB cluster, modify `tikv_count`, `tikv_replica_count`, `tidb_count`, and `tidb_replica_count` to your desired count, and then run `terraform apply`. -> *Note*: Currently, scaling in is not supported since we cannot determine which node to scale. Scaling out needs a few minutes to complete, you can watch the scaling out by `watch kubectl --kubeconfig credentials/kubeconfig_ get po -n tidb` +> *Note*: Currently, scaling in is not supported since we cannot determine which node to remove. Scaling out needs a few minutes to complete, you can watch the scaling out by `watch kubectl --kubeconfig credentials/kubeconfig_ get po -n tidb` > *Note*: Incrementing the node count will create a node per GCP availability zones. @@ -96,4 +96,31 @@ GCP allows attaching a local SSD to any instance type that is `n1-standard-1` or ### Customize TiDB Parameters -Currently, there are not too many parameters exposed to be customized. However, you can modify `templates/tidb-cluster-values.yaml.tpl` before deploying. If you modify it after the cluster is created and then run `terraform apply`, it will not take effect unless the pod(s) is manually deleted. \ No newline at end of file +Currently, there are not too many parameters exposed to be customized. However, you can modify `templates/tidb-cluster-values.yaml.tpl` before deploying. 
If you modify it after the cluster is created and then run `terraform apply`, it will not take effect unless the pod(s) are manually deleted. + +### Customizing node pools + +The cluster is created as a regional cluster, as opposed to a zonal one. This means that GKE replicates node pools to each availability zone. This is desirable for high availability; however, for monitoring services such as Grafana, it is potentially unnecessary. It is possible to manually remove nodes via `gcloud` if desired. + +> *NOTE*: GKE node pools are managed instance groups, so a node deleted with `gcloud compute instances delete` will be automatically recreated and added back to the cluster. + +Suppose we wish to delete a node from the monitor pool. We can run +```bash +$ gcloud compute instance-groups managed list | grep monitor +``` +and the result will look something like this: +```bash +gke-my-cluster-monitor-pool-08578e18-grp us-west1-b zone gke-my-cluster-monitor-pool-08578e18 0 0 gke-my-cluster-monitor-pool-08578e18 no +gke-my-cluster-monitor-pool-7e31100f-grp us-west1-c zone gke-my-cluster-monitor-pool-7e31100f 1 1 gke-my-cluster-monitor-pool-7e31100f no +gke-my-cluster-monitor-pool-78a961e5-grp us-west1-a zone gke-my-cluster-monitor-pool-78a961e5 1 1 gke-my-cluster-monitor-pool-78a961e5 no +``` +The first column is the name of the managed instance group, and the second column is the zone it was created in. We also need the name of an instance in that group, which we can get as follows: +```bash +$ gcloud compute instance-groups managed list-instances gke-my-cluster-monitor-pool-08578e18-grp --zone us-west1-b +NAME ZONE STATUS ACTION INSTANCE_TEMPLATE VERSION_NAME LAST_ERROR +gke-my-cluster-monitor-pool-08578e18-c7vd us-west1-b RUNNING NONE gke-my-cluster-monitor-pool-08578e18 +``` +Now we can delete the instance: +```bash +$ gcloud compute instance-groups managed delete-instances gke-my-cluster-monitor-pool-08578e18-grp --instances=gke-my-cluster-monitor-pool-08578e18-c7vd --zone us-west1-b +``` \ No newline at end of file
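After deleting an instance from the managed instance group, it can be useful to confirm that GCP and Kubernetes agree on the new node count. A minimal sketch, assuming the placeholder cluster name `my-cluster` and the `credentials/kubeconfig_<cluster_name>` path used elsewhere in this guide:
```bash
# Placeholder cluster name; substitute your own.
export KUBECONFIG=credentials/kubeconfig_my-cluster

# The managed instance group for that zone should now report one fewer instance.
gcloud compute instance-groups managed list | grep monitor

# The corresponding Kubernetes node (named after the instance) should disappear shortly afterwards.
kubectl get nodes | grep monitor-pool
```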