infrastructure-monitoring
Set up comprehensive infrastructure monitoring with Prometheus, Grafana, and alerting systems for metrics, health checks, and performance tracking.
Installation
Copy to your project
cp -r skills/infrastructure-monitoring/ /your-project/.claude/skills/infrastructure-monitoring/
Infrastructure Monitoring
Overview
Implement comprehensive infrastructure monitoring to track system health, performance metrics, and resource utilization with alerting and visualization across your entire stack.
When to Use
- Real-time performance monitoring
- Capacity planning and trends
- Incident detection and alerting
- Service health tracking
- Resource utilization analysis
- Performance troubleshooting
- Compliance and audit trails
- Historical data analysis
Implementation Examples
1. Prometheus Configuration
# prometheus.yml
# Prometheus server configuration: global timing, the Alertmanager endpoint,
# the rule files to load, and one scrape job per monitored component.

global:
  # Default scrape and rule-evaluation cadence; individual jobs may override
  # scrape_interval (see the "application" job below).
  scrape_interval: 15s
  evaluation_interval: 15s
  # Labels added to series/alerts when communicating with external systems
  # (Alertmanager, remote storage, federation).
  external_labels:
    monitor: "infrastructure-monitor"
    environment: "production"

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # NOTE(review): "localhost" only resolves to Alertmanager when it
            # runs on the same host/pod as Prometheus - confirm for your
            # deployment and use the Alertmanager service address otherwise.
            - "localhost:9093"

# Rule files
rule_files:
  - "alerts.yml"
  - "rules.yml"

scrape_configs:
  # Prometheus itself
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "localhost:9090"

  # Node Exporter for system metrics
  - job_name: "node"
    static_configs:
      - targets:
          - "node1.internal:9100"
          - "node2.internal:9100"
          - "node3.internal:9100"
    relabel_configs:
      # Copy the scrape address into the "instance" label explicitly.
      - source_labels: [__address__]
        target_label: instance

  # Docker container metrics
  - job_name: "docker"
    metrics_path: "/metrics"
    static_configs:
      - targets:
          - "localhost:9323"

  # Kubernetes metrics
  - job_name: "kubernetes-apiservers"
    kubernetes_sd_configs:
      - role: endpoints
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    # "authorization" supersedes the legacy bearer_token_file field; the
    # type defaults to Bearer, so the token file is used exactly as before.
    authorization:
      credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    relabel_configs:
      # The source label values are joined with ";" before matching, so this
      # keeps only the "https" port of the "kubernetes" service in the
      # "default" namespace and drops every other discovered endpoint.
      - source_labels:
          - __meta_kubernetes_namespace
          - __meta_kubernetes_service_name
          - __meta_kubernetes_endpoint_port_name
        action: keep
        regex: "default;kubernetes;https"

  # Application metrics
  - job_name: "application"
    metrics_path: "/metrics"
    # Scraped more often than the global default; the timeout must stay
    # below the interval.
    scrape_interval: 10s
    scrape_timeout: 5s
    static_configs:
      - targets:
          - "app1.internal:8080"
          - "app2.internal:8080"
          - "app3.internal:8080"

  # PostgreSQL metrics
  - job_name: "postgres"
    static_configs:
      - targets:
          - "postgres-exporter.internal:9187"

  # Redis metrics
  - job_name: "redis"
    static_configs:
      - targets:
          - "redis-exporter.internal:9121"

  # RabbitMQ metrics
  - job_name: "rabbitmq"
    static_configs:
      - targets:
          - "rabbitmq.internal:15692"
2. Alert Rules
# alerts.yml
# Prometheus alerting rules, split into application-level and
# infrastructure-level groups. Each group is evaluated every 30s.
groups:
  - name: application_alerts
    interval: 30s
    rules:
      - alert: HighErrorRate
        # Fraction of requests answered with a 5xx status, per job. A ratio
        # (not a raw requests-per-second rate) is required for the 0.05
        # threshold to mean "5%" and for humanizePercentage to be correct.
        expr: |-
          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
            /
          sum by (job) (rate(http_requests_total[5m]))
            > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }}"

      - alert: HighLatency
        # 95th-percentile request duration above 1 second.
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High request latency"
          description: "P95 latency is {{ $value }}s"

      - alert: ServiceDown
        # "up" is 0 when the last scrape of the target failed.
        expr: up{job="application"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.instance }} is down"
          description: "Service has been unreachable for 1 minute"

  - name: infrastructure_alerts
    interval: 30s
    rules:
      - alert: HighCPUUsage
        # 100 minus the average idle percentage across all CPUs of a host.
        expr: (100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value }}%"

      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is {{ $value }}%"

      - alert: LowDiskSpace
        # Pseudo and read-only filesystem types are excluded from the check.
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|vfat"} / node_filesystem_size_bytes) * 100 < 10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Available disk space is {{ $value }}%"

      - alert: NodeNotReady
        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Kubernetes node {{ $labels.node }} is not ready"
          description: "Node has been unready for 5 minutes"

      - alert: PodCrashLooping
        # increase() yields the number of restarts inside the window, which
        # is what the description reports; rate() is per second, so the old
        # "> 0.1" threshold meant roughly 90 restarts in 15 minutes.
        # The threshold of 2 is a starting point - tune it per environment.
        expr: increase(kube_pod_container_status_restarts_total[15m]) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Pod {{ $labels.pod }} is crash looping"
          description: 'Pod has restarted {{ printf "%.0f" $value }} times in 15 minutes'
3. Alertmanager Configuration
# alertmanager.yml
# Alertmanager configuration: global defaults, notification templates,
# the routing tree that picks a receiver by severity, and the receivers.
global:
  resolve_timeout: 5m
  # Placeholder - replace with the real Slack incoming-webhook URL.
  # NOTE(review): the webhook URL is a credential; prefer
  # slack_api_url_file (or another secret mechanism) over committing it.
  slack_api_url: "YOUR_SLACK_WEBHOOK_URL"

# Template files
templates:
  - "/etc/alertmanager/templates/*.tmpl"

# Routing tree
route:
  # Fallback receiver for alerts that match none of the child routes.
  receiver: "default"
  group_by: ["alertname", "cluster", "service"]
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    # Critical alerts
    # "matchers" replaces the deprecated "match" field.
    - matchers:
        - 'severity = "critical"'
      receiver: "critical-team"
      # Keep evaluating the sibling routes below after this one matches.
      continue: true
      group_wait: 10s
      repeat_interval: 1h

    # Warning alerts
    - matchers:
        - 'severity = "warning"'
      receiver: "warning-channel"
      group_wait: 1m

# Receivers
receivers:
  - name: "default"
    slack_configs:
      - channel: "#alerts"
        title: "Alert: {{ .GroupLabels.alertname }}"
        text: "{{ range .Alerts }}{{ .Annotations.description }}{{ end }}"

  - name: "critical-team"
    slack_configs:
      - channel: "#critical-alerts"
        title: "CRITICAL: {{ .GroupLabels.alertname }}"
    email_configs:
      - to: "oncall@mycompany.com"
        from: "alertmanager@mycompany.com"
        smarthost: "smtp.mycompany.com:587"
        auth_username: "alertmanager@mycompany.com"
        # Placeholder - never commit a real password.
        # NOTE(review): newer Alertmanager releases accept
        # auth_password_file to read it from a mounted secret; confirm
        # your version supports it before switching.
        auth_password: "secret"

  - name: "warning-channel"
    slack_configs:
      - channel: "#warnings"
        title: "Warning: {{ .GroupLabels.alertname }}"
4. Grafana Dashboard
{
  "dashboard": {
    "title": "Infrastructure Overview",
    "panels": [
      {
        "title": "CPU Usage",
        "type": "graph",
        "targets": [
          { "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)" }
        ],
        "alert": {
          "name": "CPU Usage Alert",
          "conditions": [
            { "evaluator": { "type": "gt", "params": [80] } }
          ]
        }
      },
      {
        "title": "Memory Usage",
        "type": "graph",
        "targets": [
          { "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100" }
        ]
      },
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          { "expr": "rate(http_requests_total[5m])" }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "targets": [
          { "expr": "rate(http_requests_total{status=~\"5..\"}[5m])" }
        ]
      },
      {
        "title": "Response Time P95",
        "type": "graph",
        "targets": [
          { "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))" }
        ]
      },
      {
        "title": "Disk Usage",
        "type": "graph",
        "targets": [
          { "expr": "(1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100" }
        ]
      }
    ]
  }
}
5. Monitoring Deployment
#!/bin/bash
# deploy-monitoring.sh - Deploy Prometheus and Grafana
#
# Applies the Prometheus, Alertmanager and Grafana manifests (looked up
# relative to the current working directory) into the monitoring namespace,
# waits for each deployment to roll out, then port-forwards Prometheus and
# Grafana to localhost. The script stays in the foreground while the
# port-forwards are active; press Ctrl-C to stop them and exit.
set -euo pipefail

NAMESPACE="monitoring"
# NOTE(review): these two versions are not referenced anywhere in this
# script; the image tags actually deployed come from the manifests.
# Keep them in sync with the manifests or remove them.
PROMETHEUS_VERSION="v2.40.0"
GRAFANA_VERSION="9.3.2"

# PIDs of the background port-forward processes, terminated on exit.
pf_pids=()

# Stop any port-forwards this script started.
cleanup() {
  # The ${arr[@]+...} form avoids an "unbound variable" error under
  # `set -u` on older bash versions when the array is still empty.
  kill ${pf_pids[@]+"${pf_pids[@]}"} 2>/dev/null || true
}
trap cleanup EXIT
# Turn signals into a normal exit so the EXIT trap always runs.
trap 'exit 130' INT
trap 'exit 143' TERM

# Apply each given manifest file into the monitoring namespace, in order.
apply_manifests() {
  local manifest
  for manifest in "$@"; do
    kubectl apply -f "$manifest" -n "$NAMESPACE"
  done
}

echo "Creating monitoring namespace..."
# Rendering with --dry-run and piping to apply makes namespace creation
# idempotent: a plain `kubectl create` fails if the namespace exists.
kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -

# Deploy Prometheus
echo "Deploying Prometheus..."
apply_manifests prometheus-configmap.yaml prometheus-deployment.yaml prometheus-service.yaml

# Deploy Alertmanager
echo "Deploying Alertmanager..."
apply_manifests alertmanager-configmap.yaml alertmanager-deployment.yaml alertmanager-service.yaml

# Deploy Grafana
echo "Deploying Grafana..."
apply_manifests grafana-deployment.yaml grafana-service.yaml

# Wait for deployments
echo "Waiting for deployments to be ready..."
for deployment in prometheus alertmanager grafana; do
  kubectl rollout status "deployment/${deployment}" -n "$NAMESPACE" --timeout=5m
done

# Port forward for access
echo "Port forwarding to services..."
kubectl port-forward -n "$NAMESPACE" svc/prometheus 9090:9090 &
pf_pids+=("$!")
kubectl port-forward -n "$NAMESPACE" svc/grafana 3000:3000 &
pf_pids+=("$!")

echo "Monitoring stack deployed successfully!"
echo "Prometheus: http://localhost:9090"
echo "Grafana: http://localhost:3000"
echo "Press Ctrl-C to stop port forwarding."

# Block until the port-forwards end; without this the script would exit
# and leave them running as orphaned background processes.
wait
Monitoring Best Practices
✅ DO
- Monitor key business metrics
- Set appropriate alert thresholds
- Use consistent naming conventions
- Implement dashboards for visualization
- Keep data retention reasonable
- Use labels for better querying
- Test alerting paths regularly
- Document alert meanings
❌ DON’T
- Alert on every metric change
- Ignore alert noise
- Store too much unnecessary data
- Set unrealistic thresholds
- Mix metrics from different sources without labels that identify their origin
- Forget to test alert routing
- Alert without runbooks
- Over-instrument without purpose