Alerts


/etc/prometheus/prometheus/alert_rules.yml > blackbox-alerts
EndpointDown (6 active)
# EndpointDown: fires once probe_success has been 0 for a full minute.
# Routed as a critical availability alert for the apps team.
alert: EndpointDown
expr: probe_success == 0
for: 1m
labels:
  severity: critical
  category: availability
  team: apps
annotations:
  summary: "\U0001F310 SERVICIO CAÍDO: {{ $labels.instance }}"
  description: >-
    El endpoint {{ $labels.instance }} no responde. El probe de monitoreo
    no recibe código HTTP 2xx.
  action: >-
    1) Verificar estado de la aplicación | 2) Revisar balanceador de carga |
    3) Verificar certificado SSL | 4) Revisar logs de la aplicación |
    5) Verificar conectividad de red
  runbook_url: https://wiki.celuwebcloud.com/runbooks/endpoint-down
Labels State Active Since Value
alertname="EndpointDown" category="availability" instance="https://cw20-api-web-lego.celuwebcloud.com/api/auth/business_unit?company_code=1008&device=mobile" job="blackbox-http" severity="critical" team="apps" firing 2026-03-20 02:52:27.590845081 +0000 UTC 0
alertname="EndpointDown" category="availability" instance="https://cw20-api-web.celuwebcloud.com/api/auth/business_unit?company_code=1008&device=mobile" job="blackbox-http" severity="critical" team="apps" firing 2026-03-20 02:52:27.590845081 +0000 UTC 0
alertname="EndpointDown" category="availability" instance="https://cw20-api-web.celuwebdev.com/api/auth/business_unit?company_code=1008&device=mobile" job="blackbox-http" severity="critical" team="apps" firing 2026-03-20 02:52:27.590845081 +0000 UTC 0
alertname="EndpointDown" category="availability" instance="https://cw20-api-web.sandboxcw.net/api/auth/business_unit?company_code=1000&device=mobile" job="blackbox-http" severity="critical" team="apps" firing 2026-03-20 02:52:27.590845081 +0000 UTC 0
alertname="EndpointDown" category="availability" instance="https://pideky.celuwebcloud.com/Api/api/Status" job="blackbox-http" severity="critical" team="apps" firing 2026-03-20 02:52:27.590845081 +0000 UTC 0
alertname="EndpointDown" category="availability" instance="https://cw20-api-web-etix.celuwebcloud.com/api/auth/business_unit?company_code=1008&device=mobile" job="blackbox-http" severity="critical" team="apps" firing 2026-03-20 02:52:27.590845081 +0000 UTC 0
HighLatency (0 active)
# HighLatency: warns when probe_duration_seconds stays above 5 for 3 minutes.
# $value is in seconds, so humanizeDuration renders it directly.
alert: HighLatency
expr: probe_duration_seconds > 5
for: 3m
labels:
  severity: warning
  category: performance
  team: apps
annotations:
  summary: 'Latencia alta en {{ $labels.instance }}'
  description: 'El tiempo de respuesta es {{ $value | humanizeDuration }}.'
  action: 'Verificar carga del servidor, base de datos, queries lentos.'
SSLCertificateExpired (0 active)
# SSLCertificateExpired: fires immediately (no `for`) when the earliest
# certificate expiry reported by the probe is already in the past.
# The expression yields the remaining lifetime in days (negative = expired).
alert: SSLCertificateExpired
expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 0
labels:
  severity: critical
  category: security
  team: infra
annotations:
  summary: "\U0001F534 CERTIFICADO SSL EXPIRADO - {{ $labels.instance }}"
  description: >-
    El certificado SSL para {{ $labels.instance }} ha EXPIRADO. Los usuarios
    están viendo errores de seguridad AHORA.
  # The blackbox-http instance label is already a full URL including the
  # scheme, so it is passed to curl as-is; prefixing "https://" again would
  # produce "https://https://...".
  action: >-
    URGENTE: 1) Renovar certificado: certbot renew --force-renewal |
    2) Reiniciar nginx/apache | 3) Verificar con: curl -v {{ $labels.instance }} |
    4) Si usa CDN, invalidar caché
SSLCertificateExpiringSoon (0 active)
# SSLCertificateExpiringSoon: warns when the earliest certificate expiry is
# fewer than 30 days away for at least 1 minute.
# The expression divides by 86400, so $value is a number of DAYS. It must not
# be piped through humanizeDuration (which expects seconds and would render
# e.g. 29 days as "29s"); it is printed as a plain number of days instead.
alert: SSLCertificateExpiringSoon
expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 30
for: 1m
labels:
  severity: warning
  category: security
  team: infra
annotations:
  summary: '🔒 Certificado SSL expira en {{ printf "%.1f" $value }} días'
  description: >-
    El certificado SSL para {{ $labels.instance }} expira en
    {{ printf "%.1f" $value }} días. Los usuarios verán advertencia de seguridad.
  action: >-
    1) Renovar certificado con certbot: certbot renew | 2) O contactar proveedor
    SSL | 3) Verificar auto-renovación configurada | 4) Reiniciar servicio web
    después de renovar
  days_remaining: '{{ $value }}'
/etc/prometheus/prometheus/alert_rules.yml > hardware-alerts
HighTemperature (0 active)
# HighTemperature: critical alert when node_hwmon_temp_celsius stays above 80
# for 5 minutes. The description reports the sensor label and the raw reading.
alert: HighTemperature
expr: node_hwmon_temp_celsius > 80
for: 5m
labels:
  severity: critical
  category: hardware
  team: infra
annotations:
  summary: 'Temperatura alta en {{ $labels.instance }}'
  description: 'La temperatura del sensor {{ $labels.sensor }} está a {{ $value }}°C'
  action: >-
    URGENTE: Verificar ventiladores, limpiar polvo del servidor. Apagar si
    supera 85°C.
MediumTemperature (0 active)
# MediumTemperature: warning tier of the temperature alerts; triggers when
# node_hwmon_temp_celsius stays above 70 for 10 minutes.
alert: MediumTemperature
expr: node_hwmon_temp_celsius > 70
for: 10m
labels:
  severity: warning
  category: hardware
  team: infra
annotations:
  summary: 'Temperatura elevada en {{ $labels.instance }}'
  description: 'La temperatura está a {{ $value }}°C. Monitorear de cerca.'
  action: 'Verificar ventilación del servidor/data center.'
RAIDDegraded (0 active)
# RAIDDegraded: fires immediately (no `for`) when an md array has fewer
# active member disks than it requires.
# The previous selector node_md_state{state="degraded"} relied on a state
# value that node_exporter's mdadm collector does not export (TODO confirm
# against the deployed exporter version), so the rule could never fire.
# `ignoring(state)` lets the per-state disk count match the state-less
# "required" series; the result keeps the instance and device labels.
alert: RAIDDegraded
expr: >-
  node_md_disks_required
  - ignoring(state) node_md_disks{state="active"} > 0
labels:
  severity: critical
  category: hardware
  team: infra
annotations:
  summary: 'RAID degradado en {{ $labels.instance }}'
  # The mdadm metrics label the array as `device`; `md_device` rendered empty.
  description: 'El array RAID {{ $labels.device }} está en estado degradado.'
  action: 'URGENTE: Reemplazar disco fallido inmediatamente. Verificar: cat /proc/mdstat'
/etc/prometheus/prometheus/alert_rules.yml > info-alerts
NodeRebooted (0 active)
# NodeRebooted: informational alert raised while the time elapsed since
# node_boot_time_seconds is under 300 seconds. $value is that elapsed time
# in seconds, which is why humanizeDuration is appropriate here.
alert: NodeRebooted
expr: (time() - node_boot_time_seconds) < 300
labels:
  severity: info
  category: maintenance
  team: infra
annotations:
  summary: 'Reinicio detectado en {{ $labels.instance }}'
  description: 'El sistema se reinició hace {{ $value | humanizeDuration }}.'
  action: 'Verificar si fue reinicio planificado o inesperado.'
RebootRequired (0 active)
# RebootRequired: informational alert raised whenever node_reboot_required
# is greater than zero (no `for` delay).
alert: RebootRequired
expr: node_reboot_required > 0
labels:
  severity: info
  category: maintenance
  team: infra
annotations:
  summary: 'Reinicio requerido en {{ $labels.instance }}'
  description: 'Hay actualizaciones del kernel pendientes que requieren reinicio.'
  action: >-
    Planificar ventana de mantenimiento para reiniciar y aplicar actualizaciones
    de seguridad.
/etc/prometheus/prometheus/alert_rules.yml > linux-critical
HostDown (14 active)
# HostDown: critical alert when the `up` series of a target in the
# linux-servers or windows-servers job has been 0 for 2 minutes.
alert: HostDown
expr: up{job=~"linux-servers|windows-servers"} == 0
for: 2m
labels:
  severity: critical
  category: availability
  team: infra
annotations:
  summary: "\U0001F6A8 SERVIDOR CAÍDO: {{ $labels.instance }}"
  description: >-
    El servidor {{ $labels.instance }} no ha respondido a pings de monitoreo
    en los últimos 2 minutos.
  action: >-
    1) Verificar si el servidor está encendido | 2) Comprobar conectividad de
    red (ping) | 3) Verificar servicio node_exporter/windows_exporter |
    4) Revisar firewall
  runbook_url: https://wiki.celuwebcloud.com/runbooks/host-down
Labels State Active Since Value
alertname="HostDown" category="availability" exporter="node_exporter" instance="CW20_SQL_STD_PROD:9100" job="linux-servers" os="linux" severity="critical" team="infra" firing 2026-03-20 02:52:26.072733514 +0000 UTC 0
alertname="HostDown" category="availability" exporter="windows_exporter" instance="APP-RAMO:9182" job="windows-servers" os="windows" severity="critical" team="infra" firing 2026-03-20 02:52:26.072733514 +0000 UTC 0
alertname="HostDown" category="availability" exporter="node_exporter" instance="HERCULES:9100" job="linux-servers" os="linux" severity="critical" team="infra" firing 2026-03-20 02:52:26.072733514 +0000 UTC 0
alertname="HostDown" category="availability" exporter="node_exporter" instance="PFSENSE:9100" job="linux-servers" os="linux" severity="critical" team="infra" firing 2026-03-20 02:52:26.072733514 +0000 UTC 0
alertname="HostDown" category="availability" exporter="node_exporter" instance="OLLAMA-CLOUD:9100" job="linux-servers" os="linux" severity="critical" team="infra" firing 2026-03-20 02:52:26.072733514 +0000 UTC 0
alertname="HostDown" category="availability" exporter="node_exporter" instance="CW20_PRD_QA_DEV_AIRFLOW:9100" job="linux-servers" os="linux" severity="critical" team="infra" firing 2026-03-20 02:52:26.072733514 +0000 UTC 0
alertname="HostDown" category="availability" exporter="node_exporter" instance="CW20_QA_FINAL_API1:9100" job="linux-servers" os="linux" severity="critical" team="infra" firing 2026-03-20 02:52:26.072733514 +0000 UTC 0
alertname="HostDown" category="availability" exporter="node_exporter" instance="CW20_QA_DEV_HERRAMIENTAS:9100" job="linux-servers" os="linux" severity="critical" team="infra" firing 2026-03-20 02:52:26.072733514 +0000 UTC 0
alertname="HostDown" category="availability" exporter="node_exporter" instance="SICOLSA:9100" job="linux-servers" os="linux" severity="critical" team="infra" firing 2026-03-20 02:52:26.072733514 +0000 UTC 0
alertname="HostDown" category="availability" exporter="windows_exporter" instance="POTENTIA:9182" job="windows-servers" os="windows" severity="critical" team="infra" firing 2026-03-20 02:52:26.072733514 +0000 UTC 0
alertname="HostDown" category="availability" exporter="node_exporter" instance="HELPDESK:9100" job="linux-servers" os="linux" severity="critical" team="infra" firing 2026-03-20 02:52:26.072733514 +0000 UTC 0
alertname="HostDown" category="availability" exporter="node_exporter" instance="CW20_PRD_FINAL_API1_WEB1:9100" job="linux-servers" os="linux" severity="critical" team="infra" firing 2026-03-20 02:52:26.072733514 +0000 UTC 0
alertname="HostDown" category="availability" exporter="node_exporter" instance="CW20_DEV_QA_FINAL_API1_WEB:9100" job="linux-servers" os="linux" severity="critical" team="infra" firing 2026-03-20 02:52:26.072733514 +0000 UTC 0
alertname="HostDown" category="availability" exporter="node_exporter" instance="ERPFORPYMES:9100" job="linux-servers" os="linux" severity="critical" team="infra" firing 2026-03-20 02:52:26.072733514 +0000 UTC 0
DiskFull (0 active)
# DiskFull: critical alert when the root filesystem ("/") is more than 95%
# used for 2 minutes. The expression multiplies by 100, so $value is a
# percentage in the 0-100 range.
alert: DiskFull
expr: >-
  (node_filesystem_size_bytes{mountpoint="/"}
  - node_filesystem_free_bytes{mountpoint="/"})
  / node_filesystem_size_bytes{mountpoint="/"} * 100 > 95
for: 2m
labels:
  severity: critical
  category: storage
  os: linux
  team: infra
annotations:
  summary: 'Disco raíz casi lleno en {{ $labels.instance }}'
  # humanizePercentage expects a 0-1 ratio and would print 95 as "9500%";
  # the value is already a percentage, so it is formatted with printf.
  # The former "Quedan ... libres" sentence was removed because it printed
  # the USED percentage as if it were the free space.
  description: >-
    El disco raíz está {{ printf "%.1f" $value }}% lleno. Espacio
    libre crítico.
  # Block scalar instead of a single-quoted string: the command contains
  # single quotes ('*.log'), which would terminate a single-quoted scalar.
  action: >-
    1) Limpiar logs: sudo find /var/log -type f -name '*.log' -mtime
    +7 -delete | 2) Limpiar paquetes: sudo apt autoremove && sudo apt autoclean
    | 3) Verificar docker: docker system prune -a
  runbook_url: https://wiki.celuwebcloud.com/runbooks/disk-full
  usage_percent: '{{ $value }}'
HighCPUUsage (0 active)
# HighCPUUsage: critical alert when the per-instance CPU utilisation
# (100 minus the averaged idle share) stays above 90% for 5 minutes.
# rate() is used instead of irate(): irate only looks at the last two samples
# in the window and is meant for graphing, which makes alerts flap on spikes.
alert: HighCPUUsage
expr: >-
  100
  - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
  > 90
for: 5m
labels:
  severity: critical
  category: performance
  team: infra
annotations:
  summary: "\U0001F525 CPU CRÍTICO en {{ $labels.instance }}"
  # $value is already a 0-100 percentage; humanizePercentage expects a 0-1
  # ratio and would print e.g. "9200%", so printf is used instead.
  description: >-
    El uso de CPU ha estado por encima del 90% durante más de 5 minutos.
    Valor actual: {{ printf "%.1f" $value }}%
  action: >-
    1) Conectar vía SSH y ejecutar: top o htop | 2) Identificar procesos de
    alto consumo | 3) Si es un servicio específico: sudo systemctl restart <servicio>
    | 4) Considerar escalar recursos si es carga legítima
  cpu_percent: '{{ $value }}'
  runbook_url: https://wiki.celuwebcloud.com/runbooks/high-cpu
MemoryCritical (0 active)
# MemoryCritical: critical alert when used memory (MemTotal - MemAvailable)
# exceeds 95% of MemTotal for 3 minutes. $value is a 0-100 percentage.
alert: MemoryCritical
expr: >-
  (node_memory_MemTotal_bytes
  - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 95
for: 3m
labels:
  severity: critical
  category: performance
  team: infra
annotations:
  summary: ⚠️ MEMORIA CRÍTICA en {{ $labels.instance }}
  # humanizePercentage expects a 0-1 ratio; the value is already 0-100, so it
  # is formatted with printf to avoid output such as "9600%".
  description: >-
    El uso de memoria está por encima del 95%. Valor actual:
    {{ printf "%.1f" $value }}%. Riesgo de usar swap o que OOM killer termine
    procesos.
  # "sudo echo 3 > file" fails for non-root users because the redirection is
  # performed by the calling shell, not by sudo; the write is wrapped in
  # "sudo sh -c" so it actually runs as root.
  action: >-
    1) Identificar procesos: ps aux --sort=-%mem | head -10 | 2) Liberar caché:
    sudo sync && sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches' | 3) Considerar
    reiniciar servicios de alto consumo | 4) Verificar memory leaks en aplicaciones
  memory_percent: '{{ $value }}'
  runbook_url: https://wiki.celuwebcloud.com/runbooks/memory-critical
/etc/prometheus/prometheus/alert_rules.yml > linux-warning
DiskSpaceLow (1 active)
# DiskSpaceLow: warning when the root filesystem ("/") is more than 80% used
# for 5 minutes. $value is a 0-100 percentage (e.g. 85.22).
alert: DiskSpaceLow
expr: >-
  (node_filesystem_size_bytes{mountpoint="/"}
  - node_filesystem_free_bytes{mountpoint="/"})
  / node_filesystem_size_bytes{mountpoint="/"} * 100 > 80
for: 5m
labels:
  severity: warning
  category: storage
  team: infra
annotations:
  summary: 'Espacio en disco bajo en {{ $labels.instance }}'
  # humanizePercentage expects a 0-1 ratio and would render 85.22 as "8522%";
  # the value is already a percentage, so it is formatted with printf.
  description: 'El disco raíz está {{ printf "%.1f" $value }}% lleno.'
  action: >-
    Planificar limpieza de disco. Revisar logs, archivos temporales, backups
    antiguos.
Labels State Active Since Value
alertname="DiskSpaceLow" category="storage" device="/dev/nvme0n1p1" exporter="node_exporter" fstype="xfs" instance="AUTH-POS:9100" job="linux-servers" mountpoint="/" os="linux" severity="warning" team="infra" firing 2026-03-20 02:52:38.584607583 +0000 UTC 85.22314598924574
DiskWillFillIn4Hours (0 active)
# DiskWillFillIn4Hours: warning when a linear extrapolation of the last hour
# of free bytes on "/" drops below zero within 4 * 3600 seconds, and that
# prediction holds for 10 minutes.
alert: DiskWillFillIn4Hours
expr: >-
  predict_linear(node_filesystem_free_bytes{mountpoint="/"}[1h],
  4 * 3600) < 0
for: 10m
labels:
  severity: warning
  category: storage
  team: infra
annotations:
  summary: 'El disco se llenará pronto en {{ $labels.instance }}'
  description: >-
    Basado en la tendencia actual, el disco se llenará en las próximas
    4 horas.
  action: >-
    Acción preventiva urgente. Investigar qué está consumiendo espacio
    rápidamente.
HighCPUUsageWarning (0 active)
# HighCPUUsageWarning: warning tier of the CPU alerts; triggers when the
# per-instance CPU utilisation stays above 75% for 10 minutes.
# rate() replaces irate(): irate only uses the last two samples of the range
# and is intended for graphs, not for alert conditions.
alert: HighCPUUsageWarning
expr: >-
  100
  - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
  > 75
for: 10m
labels:
  severity: warning
  category: performance
  team: infra
annotations:
  summary: 'Uso de CPU elevado en {{ $labels.instance }}'
  # $value is already a 0-100 percentage; humanizePercentage would multiply
  # it by 100 again, so printf is used.
  description: >-
    El uso de CPU ha estado por encima del 75% durante más de 10 minutos.
    Valor: {{ printf "%.1f" $value }}%
  action: 'Monitorear tendencia. Identificar si es carga normal o anomalía.'
HighLoadAverage (0 active)
# HighLoadAverage: warning when the 1-minute load average exceeds twice the
# number of CPUs (counted as idle-mode CPU series per instance) for 10 minutes.
# The right-hand side is aggregated down to the `instance` label only, while
# node_load1 also carries target labels such as job/exporter/os. Without an
# explicit matching clause the label sets never line up and the rule cannot
# fire. `on(instance)` matches on the shared label; `group_left` keeps the
# left-hand labels on the resulting alert.
alert: HighLoadAverage
expr: >-
  node_load1
  > on(instance) group_left()
  (count by(instance) (node_cpu_seconds_total{mode="idle"}) * 2)
for: 10m
labels:
  severity: warning
  category: performance
  team: infra
annotations:
  summary: 'Carga del sistema alta en {{ $labels.instance }}'
  description: >-
    La carga promedio (1m) es {{ $value }}, superior al doble del número
    de cores.
  action: 'Muchos procesos en cola. Verificar I/O de disco o procesos bloqueados.'
HighSwapUsage (0 active)
# HighSwapUsage: warning when swap is configured (SwapTotal > 0) and more
# than 80% of it has been in use for 5 minutes.
# Because the left operand of `and` is the SwapTotal comparison, the alert
# value is the SwapTotal sample rather than the usage percentage.
alert: HighSwapUsage
expr: >-
  (node_memory_SwapTotal_bytes > 0)
  and
  (node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / node_memory_SwapTotal_bytes
  * 100 > 80
for: 5m
labels:
  severity: warning
  category: performance
  team: infra
annotations:
  summary: 'Uso de swap elevado en {{ $labels.instance }}'
  description: 'El uso de swap está por encima del 80%. Esto indica presión de memoria.'
  action: >-
    El sistema está usando swap pesadamente. Considerar aumentar RAM o investigar
    memory leaks.
MemoryHigh (0 active)
# MemoryHigh: warning tier of the memory alerts; triggers when used memory
# (MemTotal - MemAvailable) exceeds 85% of MemTotal for 5 minutes.
alert: MemoryHigh
expr: >-
  (node_memory_MemTotal_bytes
  - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
for: 5m
labels:
  severity: warning
  category: performance
  team: infra
annotations:
  summary: 'Uso de memoria elevado en {{ $labels.instance }}'
  # $value is already a 0-100 percentage; humanizePercentage expects a 0-1
  # ratio, so the value is formatted with printf instead.
  description: >-
    El uso de memoria está por encima del 85%. Valor:
    {{ printf "%.1f" $value }}%
  action: >-
    Revisar procesos con alto consumo de memoria. Verificar leaks de memoria
    en aplicaciones.
SlowDiskIO (0 active)
# SlowDiskIO: warning when the per-second rate of
# node_disk_io_time_seconds_total over 5m stays above 0.8 for 5 minutes.
alert: SlowDiskIO
expr: rate(node_disk_io_time_seconds_total[5m]) > 0.8
for: 5m
labels:
  severity: warning
  category: storage
  team: infra
annotations:
  summary: 'I/O de disco lento en {{ $labels.instance }}'
  description: >-
    El tiempo de I/O del disco está por encima del 80%. Posible cuello
    de botella.
  action: >-
    Verificar qué procesos están haciendo I/O intensivo con 'iotop'.
    Considerar SSD o RAID.
/etc/prometheus/prometheus/alert_rules.yml > network-alerts
HighNetworkTrafficRX (0 active)
# HighNetworkTrafficRX: informational alert when the 5m receive rate stays
# above 1e+08 bytes per second for 10 minutes.
alert: HighNetworkTrafficRX
expr: rate(node_network_receive_bytes_total[5m]) > 1e+08
for: 10m
labels:
  severity: info
  category: network
  team: infra
annotations:
  summary: 'Alto tráfico de entrada en {{ $labels.instance }}'
  description: 'Tráfico de red entrante está por encima de 100MB/s.'
  action: 'Verificar si es tráfico legítimo o posible ataque DDoS.'
HighNetworkTrafficTX (0 active)
# HighNetworkTrafficTX: informational alert when the 5m transmit rate stays
# above 1e+08 bytes per second for 10 minutes.
alert: HighNetworkTrafficTX
expr: rate(node_network_transmit_bytes_total[5m]) > 1e+08
for: 10m
labels:
  severity: info
  category: network
  team: infra
annotations:
  summary: 'Alto tráfico de salida en {{ $labels.instance }}'
  description: 'Tráfico de red saliente está por encima de 100MB/s.'
  action: 'Verificar si es tráfico legítimo o posible exfiltración de datos.'
NetworkErrors (0 active)
# NetworkErrors: warning when either the receive-error rate or the
# transmit-error rate (5m window) stays above 10 per second for 5 minutes.
alert: NetworkErrors
expr: >-
  rate(node_network_receive_errs_total[5m]) > 10
  or
  rate(node_network_transmit_errs_total[5m]) > 10
for: 5m
labels:
  severity: warning
  category: network
  team: infra
annotations:
  summary: 'Errores de red en {{ $labels.instance }}'
  description: 'Se detectan errores de transmisión/recepción de red.'
  action: >-
    Verificar cables, interfaces de red, switch. Usar 'ethtool' para
    diagnóstico.
TooManyNetworkConnections (0 active)
# TooManyNetworkConnections: warning when node_netstat_Tcp_CurrEstab stays
# above 10000 for 5 minutes; the description reports the raw count.
alert: TooManyNetworkConnections
expr: node_netstat_Tcp_CurrEstab > 10000
for: 5m
labels:
  severity: warning
  category: network
  team: infra
annotations:
  summary: 'Muchas conexiones TCP en {{ $labels.instance }}'
  description: >-
    Hay {{ $value }} conexiones TCP establecidas. Esto puede indicar un
    ataque o leak de conexiones.
  action: >-
    Verificar conexiones con 'ss -s' o 'netstat -an'. Buscar
    patrones inusuales.
/etc/prometheus/prometheus/alert_rules.yml > process-alerts
TooManyProcesses (0 active)
# TooManyProcesses: warning when fewer than 100 process slots remain below
# the system limit for 5 minutes.
# The previous expression subtracted node_processes_state{state="R"}: that
# series carries an extra `state` label, so it never matched the limit series,
# and it counts only RUNNING processes rather than all of them.
# node_processes_pids is used instead (presumably the total PID count exposed
# by the same node_exporter processes collector - verify it is enabled).
alert: TooManyProcesses
expr: node_processes_max_processes - node_processes_pids < 100
for: 5m
labels:
  severity: warning
  category: processes
  team: infra
annotations:
  summary: 'Límite de procesos cercano en {{ $labels.instance }}'
  description: 'Quedan menos de 100 procesos disponibles del límite del sistema.'
  action: 'Verificar fuga de procesos (fork bombs, aplicaciones con leaks).'
ZombieProcesses (0 active)
# ZombieProcesses: warning when at least one process has been in state "Z"
# for 5 minutes; the description reports the count.
alert: ZombieProcesses
expr: node_processes_state{state="Z"} > 0
for: 5m
labels:
  severity: warning
  category: processes
  team: infra
annotations:
  summary: 'Procesos zombie en {{ $labels.instance }}'
  description: 'Hay {{ $value }} procesos zombie en el sistema.'
  # Double-quoted on purpose: the command contains single quotes ('Z'), which
  # would terminate a single-quoted scalar and make the YAML invalid.
  action: "Identificar proceso padre que dejó huérfanos: ps aux | grep 'Z'"
/etc/prometheus/prometheus/alert_rules.yml > windows-alerts
WindowsDiskFull (2 active)
# WindowsDiskFull: critical alert when a Windows logical disk is more than
# 90% used for 5 minutes. $value is a 0-100 percentage (e.g. 94.68).
alert: WindowsDiskFull
expr: >-
  (windows_logical_disk_size_bytes
  - windows_logical_disk_free_bytes) / windows_logical_disk_size_bytes * 100
  > 90
for: 5m
labels:
  severity: critical
  category: storage
  os: windows
  team: infra
annotations:
  summary: 'Disco lleno en {{ $labels.instance }} - Volumen {{ $labels.volume }}'
  # humanizePercentage expects a 0-1 ratio and would render 94.68 as "9468%";
  # the value is already a percentage, so it is formatted with printf.
  description: >-
    Volumen {{ $labels.volume }} en {{ $labels.instance }} tiene
    {{ printf "%.1f" $value }}% de uso (umbral: 90%). Se requiere acción
    inmediata.
  action: >-
    1) Ejecutar cleanmgr como administrador | 2) Vaciar Papelera de reciclaje
    | 3) Limpiar logs de IIS: C:/inetpub/logs | 4) Limpiar Event Viewer logs |
    5) Desinstalar programas no usados
  runbook_url: https://wiki.celuwebcloud.com/runbooks/disk-full
  usage_percent: '{{ $value }}'
Labels State Active Since Value
alertname="WindowsDiskFull" category="storage" exporter="windows_exporter" instance="CW20_QA_DEV_FINAL_BD1:9182" job="windows-servers" os="windows" severity="critical" team="infra" volume="D:" firing 2026-03-20 02:52:47.826883481 +0000 UTC 94.68224550560335
alertname="WindowsDiskFull" category="storage" exporter="windows_exporter" instance="VANGUARD:9182" job="windows-servers" os="windows" severity="critical" team="infra" volume="D:" firing 2026-03-20 02:52:47.826883481 +0000 UTC 96.67037652697947
WindowsHighMemory (3 active)
# WindowsHighMemory: critical alert when used physical memory (total minus
# free) exceeds 90% of the total for 5 minutes. $value is a 0-100 percentage.
alert: WindowsHighMemory
expr: >-
  (windows_cs_physical_memory_bytes
  - windows_os_physical_memory_free_bytes) / windows_cs_physical_memory_bytes
  * 100 > 90
for: 5m
labels:
  severity: critical
  category: performance
  team: infra
annotations:
  summary: 'Memoria crítica en Windows {{ $labels.instance }}'
  # humanizePercentage expects a 0-1 ratio and would render 91.08 as "9108%";
  # the value is already a percentage, so it is formatted with printf.
  description: 'Uso de memoria: {{ printf "%.1f" $value }}%'
  action: 'Reiniciar servicios de alto consumo. Considerar aumentar RAM.'
Labels State Active Since Value
alertname="WindowsHighMemory" category="performance" exporter="windows_exporter" instance="CW20_PRD_FINAL_BD1:9182" job="windows-servers" os="windows" severity="critical" team="infra" firing 2026-03-22 08:09:47.826883481 +0000 UTC 91.08384304135572
alertname="WindowsHighMemory" category="performance" exporter="windows_exporter" instance="CW20_QA_DEV_FINAL_BD1:9182" job="windows-servers" os="windows" severity="critical" team="infra" firing 2026-03-20 02:52:47.826883481 +0000 UTC 93.15900855422527
alertname="WindowsHighMemory" category="performance" exporter="windows_exporter" instance="SQL-IMPORTADORAS:9182" job="windows-servers" os="windows" severity="critical" team="infra" firing 2026-03-20 02:52:47.826883481 +0000 UTC 93.09981281219888
WindowsServerDown (2 active)
# WindowsServerDown: critical alert when the `up` series of a target in the
# windows-servers job has been 0 for 2 minutes.
# NOTE(review): HostDown also selects job "windows-servers" with the same
# condition and delay, so both rules fire for the same Windows host.
alert: WindowsServerDown
expr: up{job="windows-servers"} == 0
for: 2m
labels:
  severity: critical
  category: availability
  team: infra
annotations:
  summary: 'Servidor Windows {{ $labels.instance }} está caído'
  description: 'El servidor Windows no responde al exporter.'
  action: 'Verificar servicio windows_exporter, firewall, y estado del servidor.'
Labels State Active Since Value
alertname="WindowsServerDown" category="availability" exporter="windows_exporter" instance="APP-RAMO:9182" job="windows-servers" os="windows" severity="critical" team="infra" firing 2026-03-20 02:52:47.826883481 +0000 UTC 0
alertname="WindowsServerDown" category="availability" exporter="windows_exporter" instance="POTENTIA:9182" job="windows-servers" os="windows" severity="critical" team="infra" firing 2026-03-20 02:52:47.826883481 +0000 UTC 0
WindowsHighCPU (0 active)
# WindowsHighCPU: warning when the per-instance CPU utilisation (100 minus
# the averaged idle share) stays above 85% for 10 minutes.
# windows_cpu_time_total is a cumulative counter (the `_total` suffix), so it
# must go through rate() to become a per-second idle fraction. Averaging the
# raw counter produced an ever-growing number and a negative "utilisation"
# that could never exceed the threshold.
alert: WindowsHighCPU
expr: >-
  100
  - (avg by(instance) (rate(windows_cpu_time_total{mode="idle"}[5m])) * 100)
  > 85
for: 10m
labels:
  severity: warning
  category: performance
  team: infra
annotations:
  summary: 'CPU elevada en Windows {{ $labels.instance }}'
  # $value is already a 0-100 percentage; humanizePercentage expects a 0-1
  # ratio, so the value is formatted with printf instead.
  description: 'Uso de CPU: {{ printf "%.1f" $value }}%'
  action: 'Verificar procesos en Task Manager o con Get-Process en PowerShell.'
WindowsServiceStopped (0 active)
# WindowsServiceStopped: critical alert when the "running" state series of
# one of the listed services has been 0 for 1 minute.
# The name matcher is case-insensitive ((?i:...)): windows_exporter may
# report service names in lower case (TODO confirm for the deployed version),
# in which case the upper-case-only pattern would silently match nothing.
# The pattern still matches every name the original one did.
alert: WindowsServiceStopped
expr: >-
  windows_service_state{name=~"(?i:MSSQLSERVER|W3SVC|ADWS|NTDS|DNS)",state="running"}
  == 0
for: 1m
labels:
  severity: critical
  category: services
  team: infra
annotations:
  summary: 'Servicio crítico detenido en {{ $labels.instance }}'
  description: 'El servicio {{ $labels.name }} está detenido.'
  action: 'Iniciar servicio inmediatamente: net start {{ $labels.name }}'
WindowsUnexpectedReboot (0 active)
# WindowsUnexpectedReboot: warning raised while the difference between the
# current time and windows_system_system_up_time is between 0 and 300
# (exclusive on both ends). There is no `for` delay.
alert: WindowsUnexpectedReboot
expr: >-
  (time() - windows_system_system_up_time) < 300
  and
  (time() - windows_system_system_up_time) > 0
labels:
  severity: warning
  category: availability
  team: infra
annotations:
  summary: 'Reinicio reciente detectado en {{ $labels.instance }}'
  description: 'El servidor Windows se reinició hace menos de 5 minutos.'
  action: >-
    Verificar logs del sistema para determinar causa del reinicio. Event Viewer
    > System.