merge: #10132

10132: Upgrade GKE Prometheus setup to prometheus-community/kube-prometheus-stack r=npepinpe a=npepinpe

## Description

This PR updates the `prometheus-values.yaml` we use to set up our monitoring stack on our GKE clusters. These are the latest values used, adapted for the new chart. At the same time, I've already migrated us from the old deprecated chart to the new chart (prometheus-community/kube-prometheus-stack), and upgraded from 9.x to 16.0.0.

In order to migrate, I did the following (based on [this issue from our SREs](https://github.com/camunda-cloud/monitoring/issues/524)):

- [x] Modify the PV reclaim policy to `Retain` instead of `Delete`; this allows us to delete the old PVC but keep the persistent volume, retaining our data.
- [x] Pre-create the PVC that the new chart expects; the chart will then pick it up on creation and won't create a new one, and we keep the old PV/data intact.
- [x] Follow these unofficial [upgrade instructions](https://github.com/prometheus-community/helm-charts/issues/250) (see the linked comment); essentially we need to re-create the CRDs as `helm upgrade` doesn't install CRDs, so we need to pick up the CRDs from the updated operator version.
- [x] Migrate from the old chart to the new chart using `helm upgrade metrics --debug --namespace default --dependency-update -f prometheus-operator-values.yml --version 10.0.0 prometheus-community/kube-prometheus-stack` (first run with `--dry-run` to ensure the PVC and so on will be kept).
- [x] Once done, [follow the upgrade instructions](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack#upgrading-chart) for each major version upgrade as you go along, using the command above but updating the version. This was done until version 16.0.0, which removes the last component using deprecated APIs (kube-state-metrics).

With that done, we could then upgrade the Kubernetes clusters to 1.23 without any issues.
The next time we need to do all of this will be when upgrading to k8s 1.25, which removes further APIs. While it's possible to upgrade k8s first and then fix the Helm release, it's easier to first upgrade the charts to make sure nothing is using the deprecated APIs, and then upgrade k8s.

One last thing: we could upgrade to 17.x and remove our pinned version of Grafana to upgrade Grafana to 8.x (like we have in SaaS). To do that, just edit the values file, remove the pinned tag for Grafana, update the necessary CRDs as described in the chart README (link is above), and then run `helm upgrade metrics --debug --namespace default --dependency-update -f prometheus-operator-values.yml --version 17.0.0 prometheus-community/kube-prometheus-stack`.

## Related issues

closes #9074

Co-authored-by: Nicolas Pepin-Perreault <nicolas.pepin-perreault@camunda.com>
camunda · Aug 30, 2022 · 7bfa617 · 7bfa617
2 parents 1480f7e + c90aff8
commit 7bfa617
Show file tree

Hide file tree

Showing 2 changed files with 97 additions and 6 deletions.
diff --git a/...docs/setup/prometheus-operator-values.yml → ...setup/zeebe-cluster-prometheus-values.yml b/...docs/setup/prometheus-operator-values.yml → ...setup/zeebe-cluster-prometheus-values.yml
@@ -1,5 +1,5 @@
 alertmanager:
-  enabled: false
+  enabled: true
 
 grafana:
   image:
@@ -9,6 +9,9 @@ grafana:
     userKey: admin-user
     passwordKey: admin-password
   grafana.ini:
+    server:
+      # REPLACE THIS WITH THE ACTUAL ROOT URL
+      root_url: "http://34.77.165.228"
     auth.github:
       enabled: true
       allow_sign_up: true
@@ -53,23 +56,28 @@ grafana:
 
 prometheusOperator:
   admissionWebhooks:
-    enabled: false
+    patch:
+      image:
+        # the default image in newer chart versions, and required for k8s 1.22
+        repository: k8s.gcr.io/ingress-nginx/kube-webhook-certgen
+        tag: v1.2.0
+
+thanosRuler:
+  enabled: false
 
 prometheus:
   prometheusSpec:
     retention: 30d
     storageSpec:
       volumeClaimTemplate:
-        metadata:
-          name: prometheus-data
+        spec:
+          storageClassName: ssd
           selector:
             matchLabels:
               app: prometheus
-        spec:
           accessModes:
             - ReadWriteOnce
           resources:
             requests:
               storage: 150Gi
-          storageClassName: ssd
 
diff --git a/benchmarks/docs/setup/zeebe-long-running-prometheus-values.yml b/benchmarks/docs/setup/zeebe-long-running-prometheus-values.yml
@@ -0,0 +1,83 @@
+alertmanager:
+  enabled: true
+
+grafana:
+  image:
+    tag: 7.4.5
+  admin:
+    existingSecret: grafana-admin-password
+    userKey: admin-user
+    passwordKey: admin-password
+  grafana.ini:
+    server:
+      # REPLACE THIS WITH THE ACTUAL ROOT URL
+      root_url: "http://35.189.240.202/"
+    auth.github:
+      enabled: true
+      allow_sign_up: true
+      scopes: user:email,read:org
+      auth_url: https://github.com/login/oauth/authorize
+      token_url: https://github.com/login/oauth/access_token
+      api_url: https://api.github.com/user
+      allowed_organizations: zeebe-io camunda camunda-cloud
+      client_id: "$__file{/etc/secrets/auth-github-oauth/client_id}"
+      client_secret: "$__file{/etc/secrets/auth-github-oauth/client_secret}"
+      role_attribute_path: "editor"
+  extraSecretMounts:
+    - name: auth-github-oauth
+      secretName: auth-github-oauth
+      defaultMode: 0440
+      mountPath: /etc/secrets/auth-github-oauth
+      readOnly: true
+  dashboardProviders:
+    dashboardproviders.yaml:
+      apiVersion: 1
+      providers:
+        - name: default
+          orgId: 1
+          folder:
+          type: file
+          disableDeletion: true
+          editable: false
+          options:
+            path: /var/lib/grafana/dashboards/default
+  dashboards:
+    default:
+      zeebe:
+        url: https://raw.githubusercontent.com/camunda/zeebe/main/monitor/grafana/zeebe.json
+      zeebe-overview:
+        url: https://raw.githubusercontent.com/camunda/zeebe/main/monitor/grafana/zeebe-overview.json
+  persistence:
+    enabled: true
+    storageClassName: ssd
+  sidecar:
+    dashboards:
+      searchNamespace: ALL
+
+prometheusOperator:
+  admissionWebhooks:
+    patch:
+      image:
+        # the default image in newer chart versions, and required for k8s 1.22
+        repository: k8s.gcr.io/ingress-nginx/kube-webhook-certgen
+        tag: v1.2.0
+
+thanosRuler:
+  enabled: false
+
+prometheus:
+  prometheusSpec:
+    retention: 30d
+    storageSpec:
+      volumeClaimTemplate:
+        spec:
+          storageClassName: ssd
+          selector:
+            matchLabels:
+              app: prometheus
+          accessModes:
+            - ReadWriteOnce
+          resources:
+            requests:
+              storage: 150Gi
+