prometheusrule.yaml

# Alerting rules for the collector. Rendered only when both the PrometheusRule and the
# ServiceMonitor are enabled, since the rules rely on the collector's scraped metrics.
{{- if and .Values.prometheusRule.enabled .Values.serviceMonitor.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "opentelemetry-collector.fullname" . }}
  namespace: {{ template "opentelemetry-collector.namespace" . }}
  labels:
    {{- include "opentelemetry-collector.labels" . | nindent 4 }}
    {{- range $key, $value := .Values.prometheusRule.extraLabels }}
    {{- printf "%s: %s" $key (tpl $value $ | quote) | nindent 4 }}
    {{- end }}
spec:
  groups:
    # User-supplied rule groups are appended verbatim.
    {{- if .Values.prometheusRule.groups }}
    {{- toYaml .Values.prometheusRule.groups | nindent 4 }}
    {{- end }}
    # Built-in rules covering receiver, processor, and exporter data loss plus queue health.
    {{- if .Values.prometheusRule.defaultRules.enabled }}
    - name: collectorRules
      rules:
        - alert: ReceiverDroppedSpans
          expr: rate(otelcol_receiver_refused_spans[5m]) > 0
          for: 2m
          labels:
            severity: critical
          annotations:
            description: '{{`The {{ $labels.receiver }} receiver is dropping spans at a rate of {{ humanize $value }} per second`}}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#receive-failures'
        - alert: ReceiverDroppedMetrics
          expr: rate(otelcol_receiver_refused_metric_points[5m]) > 0
          for: 2m
          labels:
            severity: critical
          annotations:
            description: '{{`The {{ $labels.receiver }} receiver is dropping metrics at a rate of {{ humanize $value }} per second`}}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#receive-failures'
        - alert: ReceiverDroppedLogs
          expr: rate(otelcol_receiver_refused_log_records[5m]) > 0
          for: 5m
          labels:
            severity: critical
          annotations:
            description: '{{`The {{ $labels.receiver }} receiver is dropping logs at a rate of {{ humanize $value }} per second`}}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#receive-failures'
        - alert: ProcessorDroppedSpans
          expr: rate(otelcol_processor_dropped_spans[5m]) > 0
          for: 2m
          labels:
            severity: critical
          annotations:
            description: '{{`The {{ $labels.processor }} processor is dropping spans at a rate of {{ humanize $value }} per second`}}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#data-loss'
        - alert: ProcessorDroppedMetrics
          expr: rate(otelcol_processor_dropped_metric_points[5m]) > 0
          for: 2m
          labels:
            severity: critical
          annotations:
            description: '{{`The {{ $labels.processor }} processor is dropping metrics at a rate of {{ humanize $value }} per second`}}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#data-loss'
        - alert: ProcessorDroppedLogs
          expr: rate(otelcol_processor_dropped_log_records[5m]) > 0
          for: 5m
          labels:
            severity: critical
          annotations:
            description: '{{`The {{ $labels.processor }} processor is dropping logs at a rate of {{ humanize $value }} per second`}}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#data-loss'
        - alert: ExporterDroppedSpans
          expr: rate(otelcol_exporter_send_failed_spans[5m]) > 0
          for: 2m
          labels:
            severity: critical
          annotations:
            description: '{{`The {{ $labels.exporter }} exporter is dropping spans at a rate of {{ humanize $value }} per second`}}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#data-egress'
        - alert: ExporterDroppedMetrics
          expr: rate(otelcol_exporter_send_failed_metric_points[5m]) > 0
          for: 2m
          labels:
            severity: critical
          annotations:
            description: '{{`The {{ $labels.exporter }} exporter is dropping metrics at a rate of {{ humanize $value }} per second`}}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#data-egress'
        - alert: ExporterDroppedLogs
          expr: rate(otelcol_exporter_send_failed_log_records[5m]) > 0
          for: 5m
          labels:
            severity: critical
          annotations:
            description: '{{`The {{ $labels.exporter }} exporter is dropping logs at a rate of {{ humanize $value }} per second`}}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#data-egress'
        - alert: ExporterQueueSize
          expr: otelcol_exporter_queue_size > 5000
          for: 1m
          labels:
            severity: warning
          annotations:
            description: '{{`The {{ $labels.exporter }} queue has reached a size of {{ $value }}`}}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#queue-length'
        # One SendQueueFailed alert is generated per signal type.
        {{- $signals := list "spans" "metric_points" "log_records" }}
        {{- range $signal := $signals }}
        - alert: SendQueueFailed{{ $signal }}
          expr: rate(otelcol_exporter_enqueue_failed_{{ $signal }}[5m]) > 0
          for: 1m
          labels:
            severity: warning
          annotations:
            description: '{{`The {{ $labels.exporter }} sending queue failed to accept {{ $value }}`}} {{ $signal }}'
            runbook_url: 'https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/monitoring.md#queue-length'
        {{- end }}
    {{- end }}
{{- end }}
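
For reference, a values.yaml fragment along these lines would make the template render both the built-in collectorRules group and one custom group. This is a minimal sketch based only on the values the template reads (.Values.serviceMonitor.enabled and .Values.prometheusRule.*); the release label and the CollectorAbsent rule, including its job name, are illustrative assumptions rather than chart defaults.

# Example values (sketch) -- enables the ServiceMonitor and this PrometheusRule.
serviceMonitor:
  enabled: true
prometheusRule:
  enabled: true
  defaultRules:
    enabled: true                        # render the built-in collectorRules group
  extraLabels:
    release: kube-prometheus-stack       # assumption: label your Prometheus Operator selects on
  groups:                                # extra rule groups, appended verbatim via toYaml
    - name: customRules
      rules:
        - alert: CollectorAbsent
          # assumption: job label produced by the chart's ServiceMonitor in this cluster
          expr: absent(up{job="opentelemetry-collector"})
          for: 5m
          labels:
            severity: warning
          annotations:
            description: 'No scrapes from the OpenTelemetry Collector for 5 minutes.'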