{{- /* prometheusrule.yaml */ -}}
  {{- /*
PrometheusRule manifest for the collector.

Rendered only when both .Values.prometheusRule.enabled and
.Values.serviceMonitor.enabled are truthy.

spec.groups is assembled from two optional parts, in this order:
  1. the groups given in .Values.prometheusRule.groups (emitted via toYaml), and
  2. the built-in "collectorRules" group, emitted when
     .Values.prometheusRule.defaultRules.enabled is truthy.
Both parts are written at the same indentation (4 columns) so that they form a
single YAML sequence when both are enabled.

Alert descriptions wrap their Prometheus placeholders ($labels, $value) in a
Go-template raw string (backticks) so Helm emits them verbatim instead of
trying to evaluate them at chart render time.
*/ -}}
{{- if and .Values.prometheusRule.enabled .Values.serviceMonitor.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "opentelemetry-collector.fullname" . }}
  labels:
    {{- include "opentelemetry-collector.labels" . | nindent 4 }}
    {{- /* Extra label values go through tpl (with the root context) and are then quoted. */}}
    {{- range $key, $value := .Values.prometheusRule.extraLabels }}
    {{- printf "%s: %s" $key (tpl $value $ | quote) | nindent 4 }}
    {{- end }}
spec:
  groups:
    {{- /* User-supplied rule groups, copied through unchanged. */}}
    {{- if .Values.prometheusRule.groups }}
    {{- toYaml .Values.prometheusRule.groups | nindent 4 }}
    {{- end }}
    {{- if .Values.prometheusRule.defaultRules.enabled }}
    - name: collectorRules
      rules:
        {{- /* Receiver side: refused spans / metric points. */}}
        - alert: ReceiverDroppedSpans
          expr: rate(otelcol_receiver_refused_spans[5m]) > 0
          for: 2m
          labels:
            severity: critical
          annotations:
            description: '{{`The {{ $labels.receiver }} receiver is dropping spans at a rate of {{ humanize $value }} per second `}}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#receive-failures'
        - alert: ReceiverDroppedMetrics
          expr: rate(otelcol_receiver_refused_metric_points[5m]) > 0
          for: 2m
          labels:
            severity: critical
          annotations:
            description: '{{`The {{ $labels.receiver }} receiver is dropping metrics at a rate of {{ humanize $value }} per second `}}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#receive-failures'
        {{- /* Processor side: dropped spans / metric points. */}}
        - alert: ProcessorDroppedSpans
          expr: rate(otelcol_processor_dropped_spans[5m]) > 0
          for: 2m
          labels:
            severity: critical
          annotations:
            description: '{{`The {{ $labels.processor }} processor is dropping spans at a rate of {{ humanize $value }} per second `}}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#data-loss'
        - alert: ProcessorDroppedMetrics
          expr: rate(otelcol_processor_dropped_metric_points[5m]) > 0
          for: 2m
          labels:
            severity: critical
          annotations:
            description: '{{`The {{ $labels.processor }} processor is dropping metrics at a rate of {{ humanize $value }} per second `}}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#data-loss'
        {{- /*
        Exporter side: failed sends of spans / metric points.
        NOTE(review): these two link to the #receive-failures anchor, not an
        exporter-specific one; confirm that section is the intended runbook.
        */}}
        - alert: ExporterDroppedSpans
          expr: rate(otelcol_exporter_send_failed_spans[5m]) > 0
          for: 2m
          labels:
            severity: critical
          annotations:
            description: '{{`The {{ $labels.exporter }} exporter is dropping spans at a rate of {{ humanize $value }} per second `}}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#receive-failures'
        - alert: ExporterDroppedMetrics
          expr: rate(otelcol_exporter_send_failed_metric_points[5m]) > 0
          for: 2m
          labels:
            severity: critical
          annotations:
            description: '{{`The {{ $labels.exporter }} exporter is dropping metrics at a rate of {{ humanize $value }} per second `}}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#receive-failures'
        {{- /* Exporter queue size above a fixed threshold of 5000 for one minute. */}}
        - alert: ExporterQueueSize
          expr: otelcol_exporter_queue_size > 5000
          for: 1m
          labels:
            severity: warning
          annotations:
            description: '{{`The {{ $labels.exporter }} queue has reached a size of {{ $value }} `}}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#queue-length'
        {{- /*
        One enqueue-failure rule per signal. All three generated rules share
        the alert name SendQueueFailed and the same static labels; they differ
        only in the metric used by expr and in the signal name appended to the
        description.
        */}}
        {{- $signals := list "spans" "metric_points" "log_records" }}
        {{- range $signal := $signals }}
        - alert: SendQueueFailed
          expr: rate(otelcol_exporter_enqueue_failed_{{ $signal }}[5m]) > 0
          for: 1m
          labels:
            severity: warning
          annotations:
            description: '{{`The {{ $labels.exporter }} sending queue failed to accept {{ $value }} `}} {{ $signal }}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#queue-length'
        {{- end }}
    {{- end }}
{{- end }}