diff --git a/helm/templates/prometheus-rule.yaml b/helm/templates/prometheus-rule.yaml
new file mode 100644
index 0000000..9fd6010
--- /dev/null
+++ b/helm/templates/prometheus-rule.yaml
@@ -0,0 +1,30 @@
+{{- /*
+PrometheusRule carrying the alert rules from .Values.metrics.prometheusRule.rules.
+Rendered only when .Values.metrics.prometheusRule.enabled is true.
+Release name and namespace are quoted so that values which look like numbers
+or booleans still render as YAML strings.
+*/ -}}
+{{- if .Values.metrics.prometheusRule.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: {{ .Chart.Name | trimSuffix "-chart" | trunc 44 }}-controller-rules
+  namespace: {{ .Release.Namespace | quote }}
+  labels:
+    app.kubernetes.io/name: {{ include "ack-rds-controller.app.name" . }}
+    app.kubernetes.io/instance: {{ .Release.Name | quote }}
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+    k8s-app: {{ include "ack-rds-controller.app.name" . }}
+    helm.sh/chart: {{ include "ack-rds-controller.chart.name-version" . }}
+    {{- with .Values.metrics.prometheusRule.additionalLabels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+spec:
+  groups:
+    - name: {{ include "ack-rds-controller.app.name" . }}
+      rules:
+        {{- with .Values.metrics.prometheusRule.rules }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+{{- end }}
diff --git a/helm/templates/service-monitor.yaml b/helm/templates/service-monitor.yaml
new file mode 100644
index 0000000..4c82a5a
--- /dev/null
+++ b/helm/templates/service-monitor.yaml
@@ -0,0 +1,42 @@
+{{- /*
+ServiceMonitor that scrapes the "metricsport" port of Services in the release
+namespace matching the labels under spec.selector.matchLabels.
+Rendered only when .Values.metrics.serviceMonitor.enabled is true.
+Release name, namespace, interval and scrapeTimeout are quoted so that values
+which look like numbers or booleans still render as YAML strings.
+*/ -}}
+{{- if .Values.metrics.serviceMonitor.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ .Chart.Name | trimSuffix "-chart" | trunc 44 }}-controller-metrics
+  namespace: {{ .Release.Namespace | quote }}
+  labels:
+    app.kubernetes.io/name: {{ include "ack-rds-controller.app.name" . }}
+    app.kubernetes.io/instance: {{ .Release.Name | quote }}
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+    k8s-app: {{ include "ack-rds-controller.app.name" . }}
+    helm.sh/chart: {{ include "ack-rds-controller.chart.name-version" . }}
+    {{- with .Values.metrics.serviceMonitor.additionalLabels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: {{ include "ack-rds-controller.app.name" . }}
+      app.kubernetes.io/instance: {{ .Release.Name | quote }}
+      app.kubernetes.io/managed-by: Helm
+      k8s-app: {{ include "ack-rds-controller.app.name" . }}
+  namespaceSelector:
+    matchNames:
+      - {{ .Release.Namespace | quote }}
+  endpoints:
+    - port: metricsport
+      {{- with .Values.metrics.serviceMonitor.interval }}
+      interval: {{ . | quote }}
+      {{- end }}
+      {{- with .Values.metrics.serviceMonitor.scrapeTimeout }}
+      scrapeTimeout: {{ . | quote }}
+      {{- end }}
+{{- end }}
diff --git a/helm/values.schema.json b/helm/values.schema.json
index c3f56a0..e186962 100644
--- a/helm/values.schema.json
+++ b/helm/values.schema.json
@@ -104,6 +104,59 @@
             "type"
           ],
           "type": "object"
+        },
+        "serviceMonitor": {
+          "description": "Prometheus ServiceMonitor settings",
+          "properties": {
+            "enabled": {
+              "type": "boolean"
+            },
+            "additionalLabels": {
+              "type": "object"
+            },
+            "interval": {
+              "type": "string"
+            },
+            "scrapeTimeout": {
+              "type": "string"
+            }
+          },
+          "type": "object"
+        },
+        "prometheusRule": {
+          "description": "Prometheus PrometheusRule settings",
+          "properties": {
+            "enabled": {
+              "type": "boolean"
+            },
+            "additionalLabels": {
+              "type": "object"
+            },
+            "rules": {
+              "type": "array",
+              "items": {
+                "type": "object",
+                "properties": {
+                  "alert": {
+                    "type": "string"
+                  },
+                  "expr": {
+                    "type": "string"
+                  },
+                  "for": {
+                    "type": "string"
+                  },
+                  "labels": {
+                    "type": "object"
+                  },
+                  "annotations": {
+                    "type": "object"
+                  }
+                }
+              }
+            }
+          },
+          "type": "object"
         }
       },
       "required": [
diff --git a/helm/values.yaml b/helm/values.yaml
index 8f2aea3..fef14bc 100644
--- a/helm/values.yaml
+++ b/helm/values.yaml
@@ -77,6 +77,37 @@ metrics:
     # Which Type to use for the Kubernetes Service?
     # See: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types
     type: "ClusterIP"
+  serviceMonitor:
+    # Set to true to automatically create a Prometheus ServiceMonitor resource
+    # Requires the Prometheus Operator CRDs to be installed
+    enabled: false
+    # Additional labels for the ServiceMonitor (e.g., for Prometheus selector)
+    additionalLabels: {}
+    # Scrape interval as a duration string; set to "" to omit the field
+    interval: 30s
+    # Scrape timeout as a duration string; set to "" to omit the field
+    scrapeTimeout: 10s
+  prometheusRule:
+    # Set to true to automatically create a Prometheus PrometheusRule resource
+    # Requires the Prometheus Operator CRDs to be installed
+    enabled: false
+    # Additional labels for the PrometheusRule (e.g., for Prometheus selector)
+    additionalLabels: {}
+    # Alert rules (can be overridden or extended)
+    # NOTE: `{{ $labels.controller }}` below is Prometheus alert templating, not
+    # Helm templating; values files are not rendered by Helm, so it reaches the
+    # PrometheusRule unchanged.
+    rules:
+      # Fires when the reconcile error rate of a controller, averaged over 10m,
+      # stays above 0.5 for 5m. Assumes the scrape job is named
+      # "rds-controller-metrics" - confirm against the metrics Service name.
+      - alert: RDSControllerSyncErrors
+        expr: 'sum by (controller) (rate(controller_runtime_reconcile_errors_total{job="rds-controller-metrics"}[10m])) > 0.5'
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          description: 'RDS controller having sync errors in the last 10 minutes for controller {{ $labels.controller }}'
+          summary: RDS controller having sync errors with one or more objects
 
 resources:
   requests: