feat: add service monitor and error alert integration guideline
This commit is contained in:
parent
fbed05d60c
commit
937f6eab9b
@ -1,32 +1,91 @@
|
||||
# Prometheus Alert Rule Config
|
||||
# 1. Prometheus Alert Rule Configuration
|
||||
|
||||
Add `prometheusrule.yaml` to `<helm-pkg>/templates`.
|
||||
see
|
||||
```
|
||||
## 1.1. Add `prometheusrule.yaml` to `<helm-pkg>/templates`.
|
||||
|
||||
Example:
|
||||
|
||||
> Replace `metrics` with your service name; see `freeleaps-ops/freeleaps/helm-pkg/metrics` for a reference.
|
||||
```yaml
|
||||
{{- /*
|
||||
Copyright Broadcom, Inc. All Rights Reserved.
|
||||
SPDX-License-Identifier: APACHE-2.0
|
||||
*/}}
|
||||
|
||||
{{- if and .Values.metrics.enabled .Values.metrics.prometheusRule.enabled }}
|
||||
{{- if .Values.metrics.prometheusRule.enabled }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ include "common.names.fullname" . }}
|
||||
namespace: {{ default (include "common.names.namespace" .) .Values.metrics.prometheusRule.namespace | quote}}
|
||||
labels: {{- include "common.labels.standard" ( dict "customLabels" .Values.commonLabels "context" $ ) | nindent 4 }}
|
||||
{{- if .Values.metrics.prometheusRule.additionalLabels }}
|
||||
{{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.commonAnnotations }}
|
||||
annotations: {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 4 }}
|
||||
name: {{ .Values.metrics.prometheusRule.name }}
|
||||
namespace: {{ .Values.metrics.prometheusRule.namespace | quote }}
|
||||
{{- with .Values.metrics.prometheusRule.labels }}
|
||||
labels:
|
||||
{{- toYaml . | nindent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
{{- with .Values.metrics.prometheusRule.rules }}
|
||||
- name: {{ template "common.names.name" $ }}
|
||||
rules: {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
|
||||
- name: {{ $.Values.metrics.prometheusRule.name }}
|
||||
rules:
|
||||
{{- range . }}
|
||||
- alert: {{ .alert }}
|
||||
expr: {{ .expr | quote }}
|
||||
{{- if .for }}
|
||||
for: {{ .for }}
|
||||
{{- end }}
|
||||
{{- if .labels }}
|
||||
labels:
|
||||
{{- toYaml .labels | nindent 12 }}
|
||||
{{- end }}
|
||||
{{- if .annotations }}
|
||||
annotations:
|
||||
{{- toYaml .annotations | nindent 12 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
```
|
||||
|
||||
```
|
||||
## 1.2. Add prometheusrule configuration to values.{alpha/prod}.yaml
|
||||
Example:
|
||||
|
||||
> See freeleaps-ops/freeleaps/helm-pkg/metrics
|
||||
|
||||
```yaml
|
||||
prometheusRule:
|
||||
name: freepeals-metrics
|
||||
enabled: true
|
||||
namespace: "freeleaps-monitoring-system"
|
||||
labels:
|
||||
release: kube-prometheus-stack
|
||||
rules:
|
||||
- alert: FreeleapsMetricsServiceDown
|
||||
expr: up{job="metrics-service"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
service: metrics-service
|
||||
annotations:
|
||||
summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})"
|
||||
description: "Freeleaps Metrics service has been down for more than 1 minutes."
|
||||
runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
|
||||
|
||||
- alert: FreeleapsMetricsServiceHighErrorRate
|
||||
expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: metrics-service
|
||||
annotations:
|
||||
summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})"
|
||||
description: "Freeleaps Metrics service error rate is {{ $value }} errors per second."
|
||||
runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
|
||||
```
|
||||
|
||||
## 1.3. Verify Alert Rule Configuration is Effective
|
||||
|
||||
> Forward the Prometheus UI to your local machine (e.g. with a port-forward)
|
||||

|
||||
|
||||
> The newly added rules should appear in the list, which confirms that they have taken effect
|
||||
|
||||

|
||||
BIN
docs/asserts/image4.png
Normal file
BIN
docs/asserts/image4.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 569 KiB |
BIN
docs/asserts/image5.png
Normal file
BIN
docs/asserts/image5.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 341 KiB |
Loading…
Reference in New Issue
Block a user