|
/etc/prometheus/alerts/alert_healthchecks.yml > Selfmonitoring
|
| Labels |
State |
Active Since |
Value |
|
alertname="SelfMonitoringAlwaysFiring"
application="leonard_healthchecks"
severity="info"
|
firing |
2025-09-10 18:53:31.797458603 +0000 UTC |
19 |
|
|
/etc/prometheus/alerts/alert_loadbalancing.yml > lowpref
|
|
|
|
/etc/prometheus/alerts/blackbox-exporter.yml > BlackboxExporter
|
|
|
# Fires (severity: critical) when probe_success has been 0 for 15 minutes.
# The description annotation also renders the sample value and label set.
alert: BlackboxProbeFailed
expr: probe_success == 0
for: 15m
labels:
  severity: critical
annotations:
  description: |-
    Probe failed
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox probe failed (instance {{ $labels.instance }})
|
|
|
|
|
|
|
|
|
|
|
|
/etc/prometheus/alerts/general.yml > general
|
|
|
|
|
|
|
|
|
|
|
|
/etc/prometheus/alerts/node-exporter.yml > NodeExporter
|
| Labels |
State |
Active Since |
Value |
|
alertname="HostRequiresReboot"
instance="ffs14"
job="node"
nodename="ffs14"
severity="info"
|
firing |
2025-11-20 20:44:17.079934382 +0000 UTC |
1 |
| Annotations |
- description
- ffs14 requires a reboot.
VALUE = 1
LABELS = map[instance:ffs14 job:node nodename:ffs14]
- summary
- Host requires reboot (instance ffs14)
|
|
alertname="HostRequiresReboot"
instance="prometheus01"
job="node"
nodename="prometheus01"
severity="info"
|
firing |
2025-11-17 05:45:17.079934382 +0000 UTC |
1 |
| Annotations |
- description
- prometheus01 requires a reboot.
VALUE = 1
LABELS = map[instance:prometheus01 job:node nodename:prometheus01]
- summary
- Host requires reboot (instance prometheus01)
|
|
alertname="HostRequiresReboot"
instance="ffs10"
job="node"
nodename="ffs10"
severity="info"
|
firing |
2025-11-20 20:44:32.079934382 +0000 UTC |
1 |
| Annotations |
- description
- ffs10 requires a reboot.
VALUE = 1
LABELS = map[instance:ffs10 job:node nodename:ffs10]
- summary
- Host requires reboot (instance ffs10)
|
|
alertname="HostRequiresReboot"
instance="monitor01"
job="node"
nodename="monitor01"
severity="info"
|
firing |
2025-11-20 20:44:17.079934382 +0000 UTC |
1 |
| Annotations |
- description
- monitor01 requires a reboot.
VALUE = 1
LABELS = map[instance:monitor01 job:node nodename:monitor01]
- summary
- Host requires reboot (instance monitor01)
|
|
alertname="HostRequiresReboot"
instance="gw09n04"
job="node"
nodename="gw09n04"
severity="info"
|
firing |
2025-11-20 20:44:32.079934382 +0000 UTC |
1 |
| Annotations |
- description
- gw09n04 requires a reboot.
VALUE = 1
LABELS = map[instance:gw09n04 job:node nodename:gw09n04]
- summary
- Host requires reboot (instance gw09n04)
|
|
alertname="HostRequiresReboot"
instance="gw04n05"
job="node"
nodename="gw04n05"
severity="info"
|
firing |
2025-12-02 23:19:17.079934382 +0000 UTC |
1 |
| Annotations |
- description
- gw04n05 requires a reboot.
VALUE = 1
LABELS = map[instance:gw04n05 job:node nodename:gw04n05]
- summary
- Host requires reboot (instance gw04n05)
|
|
alertname="HostRequiresReboot"
instance="ffs11"
job="node"
nodename="ffs11"
severity="info"
|
firing |
2025-11-26 06:35:17.079934382 +0000 UTC |
1 |
| Annotations |
- description
- ffs11 requires a reboot.
VALUE = 1
LABELS = map[instance:ffs11 job:node nodename:ffs11]
- summary
- Host requires reboot (instance ffs11)
|
|
alertname="HostRequiresReboot"
instance="ffs08"
job="node"
nodename="ffs08"
severity="info"
|
firing |
2025-11-20 20:43:17.079934382 +0000 UTC |
1 |
| Annotations |
- description
- ffs08 requires a reboot.
VALUE = 1
LABELS = map[instance:ffs08 job:node nodename:ffs08]
- summary
- Host requires reboot (instance ffs08)
|
|
alertname="HostRequiresReboot"
instance="ffs13"
job="node"
nodename="ffs13"
severity="info"
|
firing |
2025-11-20 20:43:17.079934382 +0000 UTC |
1 |
| Annotations |
- description
- ffs13 requires a reboot.
VALUE = 1
LABELS = map[instance:ffs13 job:node nodename:ffs13]
- summary
- Host requires reboot (instance ffs13)
|
|
alertname="HostRequiresReboot"
instance="ffs05"
job="node"
nodename="ffs05"
severity="info"
|
firing |
2025-11-20 20:44:17.079934382 +0000 UTC |
1 |
| Annotations |
- description
- ffs05 requires a reboot.
VALUE = 1
LABELS = map[instance:ffs05 job:node nodename:ffs05]
- summary
- Host requires reboot (instance ffs05)
|
|
alertname="HostRequiresReboot"
instance="monitor02"
job="node"
nodename="monitor02"
severity="info"
|
firing |
2025-11-20 20:44:32.079934382 +0000 UTC |
1 |
| Annotations |
- description
- monitor02 requires a reboot.
VALUE = 1
LABELS = map[instance:monitor02 job:node nodename:monitor02]
- summary
- Host requires reboot (instance monitor02)
|
|
| Labels |
State |
Active Since |
Value |
|
alertname="HostUnusualDiskIo"
device="zd48"
instance="ffs11"
job="node"
nodename="ffs11"
severity="warning"
|
pending |
2025-12-06 03:19:47.079934382 +0000 UTC |
0.8714444444443668 |
| Annotations |
- description
- Time spent in IO is too high on ffs11. Check storage for issues.
VALUE = 0.8714444444443668
LABELS = map[device:zd48 instance:ffs11 job:node nodename:ffs11]
- summary
- Host unusual disk IO (instance ffs11)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Fires (severity: critical) when node_filesystem_device_error has been 1
# for 2 minutes. The description names the instance and mountpoint labels.
alert: HostFilesystemDeviceError
expr: node_filesystem_device_error == 1
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    {{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host filesystem device error (instance {{ $labels.instance }})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/etc/prometheus/alerts/postfix.yml > postfix_smtp_status_deferred
|
|
|
|
/etc/prometheus/alerts/pve.yml > pve-guest-alerts
|
|
|
|
|
|
/etc/prometheus/alerts/smartctl-exporter.yml > SmartctlExporter
|
# Fires (severity: critical) when smartctl_device_critical_warning has been
# above 0 for 15 minutes.
alert: SmartCriticalWarning
expr: smartctl_device_critical_warning > 0
for: 15m
labels:
  severity: critical
annotations:
  description: |-
    device has critical warning (instance {{ $labels.instance }})
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Smart critical warning (instance {{ $labels.instance }})
|
# Fires (severity: critical) when smartctl_device_temperature has been
# above 80 for 2 minutes.
alert: SmartDeviceTemperatureCritical
expr: smartctl_device_temperature > 80
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    Device temperature critical (instance {{ $labels.instance }})
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Smart device temperature critical (instance {{ $labels.instance }})
|
# Fires (severity: warning) when smartctl_device_temperature has been
# above 60 for 2 minutes.
alert: SmartDeviceTemperatureWarning
expr: smartctl_device_temperature > 60
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Device temperature warning (instance {{ $labels.instance }})
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Smart device temperature warning (instance {{ $labels.instance }})
|
# Fires (severity: critical) when smartctl_device_media_errors has been
# above 0 for 15 minutes.
alert: SmartMediaErrors
expr: smartctl_device_media_errors > 0
for: 15m
labels:
  severity: critical
annotations:
  description: |-
    device has media errors (instance {{ $labels.instance }})
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Smart media errors (instance {{ $labels.instance }})
|
|
|