alert-management
Implement comprehensive alert management with PagerDuty, escalation policies, and incident coordination. Use when setting up alerting systems, managing on-call schedules, or coordinating incident response.
Installation
Copy to your project:

```bash
cp -r skills/alert-management/ /your-project/.claude/skills/alert-management/
```
Alert Management
Overview
Design and implement sophisticated alert management systems with PagerDuty integration, escalation policies, alert routing, and incident coordination.
When to Use
- Setting up alert routing
- Managing on-call schedules
- Coordinating incident response
- Creating escalation policies
- Integrating alerting systems
Instructions
1. PagerDuty Client Integration
```javascript
// pagerduty-client.js
const axios = require("axios");

class PagerDutyClient {
  constructor(apiToken) {
    this.apiToken = apiToken;
    this.baseUrl = "https://api.pagerduty.com";
    this.eventUrl = "https://events.pagerduty.com/v2/enqueue";
    this.client = axios.create({
      baseURL: this.baseUrl,
      headers: {
        Authorization: `Token token=${apiToken}`,
        Accept: "application/vnd.pagerduty+json;version=2",
      },
    });
  }

  async triggerEvent(config) {
    const event = {
      routing_key: config.routingKey,
      event_action: config.eventAction || "trigger",
      dedup_key: config.dedupKey || `event-${Date.now()}`,
      payload: {
        summary: config.summary,
        timestamp: new Date().toISOString(),
        severity: config.severity || "error",
        source: config.source || "Monitoring System",
        component: config.component,
        custom_details: config.customDetails || {},
      },
    };

    try {
      // Events API v2 authenticates via the routing key, not the REST API token
      const response = await axios.post(this.eventUrl, event);
      return response.data;
    } catch (error) {
      console.error("Failed to trigger PagerDuty event:", error);
      throw error;
    }
  }

  async resolveEvent(dedupKey) {
    const event = {
      routing_key: process.env.PAGERDUTY_ROUTING_KEY,
      event_action: "resolve",
      dedup_key: dedupKey,
    };

    try {
      return await axios.post(this.eventUrl, event);
    } catch (error) {
      console.error("Failed to resolve event:", error);
      throw error;
    }
  }

  async getServices() {
    const response = await this.client.get("/services");
    return response.data.services;
  }

  async getEscalationPolicies() {
    const response = await this.client.get("/escalation_policies");
    return response.data.escalation_policies;
  }

  async createIncident(config) {
    const incident = {
      type: "incident",
      title: config.title,
      service: {
        id: config.serviceId,
        type: "service_reference",
      },
      escalation_policy: {
        id: config.escalationPolicyId,
        type: "escalation_policy_reference",
      },
      body: {
        type: "incident_body",
        details: config.details || "",
      },
    };

    try {
      // The REST API expects the payload wrapped in an "incident" key
      const response = await this.client.post(
        "/incidents",
        { incident },
        { headers: { From: process.env.PAGERDUTY_EMAIL } },
      );
      return response.data.incident;
    } catch (error) {
      console.error("Failed to create incident:", error);
      throw error;
    }
  }

  async acknowledgeIncident(incidentId) {
    try {
      const response = await this.client.put(
        `/incidents/${incidentId}`,
        { incident: { type: "incident_reference", status: "acknowledged" } },
        { headers: { From: process.env.PAGERDUTY_EMAIL } },
      );
      return response.data.incident;
    } catch (error) {
      console.error("Failed to acknowledge:", error);
      throw error;
    }
  }

  async resolveIncident(incidentId) {
    try {
      const response = await this.client.put(
        `/incidents/${incidentId}`,
        { incident: { type: "incident_reference", status: "resolved" } },
        { headers: { From: process.env.PAGERDUTY_EMAIL } },
      );
      return response.data.incident;
    } catch (error) {
      console.error("Failed to resolve:", error);
      throw error;
    }
  }
}

module.exports = PagerDutyClient;
```
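A minimal usage sketch for the client above. The environment variable names (`PAGERDUTY_API_TOKEN`, `PAGERDUTY_ROUTING_KEY`) and the alert details are illustrative assumptions:

```javascript
// trigger-example.js — illustrative only; env variable names are assumptions
const PagerDutyClient = require("./pagerduty-client");

const pd = new PagerDutyClient(process.env.PAGERDUTY_API_TOKEN);

async function main() {
  // Trigger an event via the Events API v2
  const result = await pd.triggerEvent({
    routingKey: process.env.PAGERDUTY_ROUTING_KEY,
    summary: "High error rate on checkout API",
    severity: "critical",
    component: "checkout-api",
    dedupKey: "checkout-api-error-rate",
    customDetails: { errorRate: 0.12, threshold: 0.05 },
  });
  console.log("Triggered:", result.dedup_key);

  // Later, resolve the same event by its dedup key
  await pd.resolveEvent("checkout-api-error-rate");
}

main().catch(console.error);
```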
2. Alertmanager Configuration
```yaml
# /etc/alertmanager/alertmanager.yml
# Note: Alertmanager does not expand ${...} environment variables itself;
# render this file (e.g., with envsubst) before starting the container.
global:
  resolve_timeout: 5m
  slack_api_url: "${SLACK_WEBHOOK_URL}"

templates:
  - "/etc/alertmanager/templates/*.tmpl"

route:
  receiver: "default"
  group_by: ["alertname", "cluster", "service"]
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 4h
  routes:
    - match:
        severity: critical
      receiver: pagerduty
      continue: true
      group_wait: 0s
    - match:
        severity: warning
      receiver: slack
    - match:
        service: payment-service
      receiver: payment-team
      group_wait: 30s

receivers:
  - name: "default"
    slack_configs:
      - channel: "#alerts"
        title: "Alert: {{ .GroupLabels.alertname }}"
  - name: "pagerduty"
    pagerduty_configs:
      - service_key: "${PAGERDUTY_SERVICE_KEY}"
        description: "{{ .GroupLabels.alertname }}"
  - name: "slack"
    slack_configs:
      - channel: "#alerts"
        title: "Warning: {{ .GroupLabels.alertname }}"
  - name: "payment-team"
    pagerduty_configs:
      - service_key: "${PAYMENT_PAGERDUTY_KEY}"
    slack_configs:
      - channel: "#payment-alerts"

inhibit_rules:
  - source_match:
      severity: "critical"
    target_match:
      severity: "warning"
    equal: ["alertname", "service"]
```
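To exercise the routing tree without waiting for Prometheus to fire a rule, you can post a synthetic alert to Alertmanager's v2 API. A sketch, assuming Alertmanager is reachable on `localhost:9093` as in the Docker Compose stack below; the label values are illustrative:

```javascript
// send-test-alert.js — assumes Alertmanager listens on localhost:9093
const axios = require("axios");

async function sendTestAlert() {
  const alert = {
    labels: {
      alertname: "HighErrorRate",
      severity: "critical", // matches the "pagerduty" route above
      service: "checkout-api",
      cluster: "prod",
    },
    annotations: {
      summary: "Synthetic test alert to verify routing",
    },
    startsAt: new Date().toISOString(),
    generatorURL: "http://prometheus:9090/graph",
  };

  // Alertmanager's v2 API accepts an array of alerts
  await axios.post("http://localhost:9093/api/v2/alerts", [alert]);
  console.log("Test alert sent; check the #alerts channel and PagerDuty");
}

sendTestAlert().catch(console.error);
```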
3. Alert Handler Middleware
```javascript
// alert-handler.js
const axios = require("axios");
const PagerDutyClient = require("./pagerduty-client");

const pdClient = new PagerDutyClient(process.env.PAGERDUTY_API_TOKEN);

class AlertHandler {
  constructor() {
    this.alertCache = new Map();
    this.deduplicationWindow = 300000; // 5 minutes
  }

  shouldSendAlert(dedupKey) {
    const cacheEntry = this.alertCache.get(dedupKey);
    if (!cacheEntry) return true;
    const timeSinceLastAlert = Date.now() - cacheEntry.timestamp;
    return timeSinceLastAlert >= this.deduplicationWindow;
  }

  recordAlert(dedupKey) {
    this.alertCache.set(dedupKey, { timestamp: Date.now() });
  }

  determineSeverity(value, thresholds) {
    if (value >= thresholds.critical) return "critical";
    if (value >= thresholds.warning) return "warning";
    return "info";
  }

  async sendAlert(config) {
    // Derive a stable dedup key from the alert name so repeats within the
    // deduplication window are suppressed
    const dedupKey = config.dedupKey || `alert-${config.alertName}`;

    try {
      if (!this.shouldSendAlert(dedupKey)) {
        console.log("Alert recently sent, skipping");
        return;
      }

      const event = {
        routingKey: config.routingKey,
        eventAction: config.eventAction || "trigger",
        dedupKey: dedupKey,
        summary: config.summary,
        severity: config.severity,
        source: config.source || "Monitoring System",
        component: config.component,
        customDetails: {
          ...config.customDetails,
          alertName: config.alertName,
          timestamp: new Date().toISOString(),
        },
      };

      const result = await pdClient.triggerEvent(event);
      this.recordAlert(dedupKey);
      console.log("Alert sent", {
        alertName: config.alertName,
        severity: config.severity,
      });
      return result;
    } catch (error) {
      console.error("Failed to send alert:", error);
      // Fall back to Slack if PagerDuty is unreachable
      await this.sendSlackAlert(config);
    }
  }

  async sendSlackAlert(config) {
    const webhookUrl = process.env.SLACK_WEBHOOK_URL;
    const message = {
      color: config.severity === "critical" ? "danger" : "warning",
      title: config.summary,
      text: config.customDetails?.description || "",
      fields: [
        { title: "Severity", value: config.severity, short: true },
        { title: "Component", value: config.component, short: true },
      ],
    };

    try {
      await axios.post(webhookUrl, { attachments: [message] });
    } catch (error) {
      console.error("Failed to send Slack alert:", error);
    }
  }

  async resolveAlert(dedupKey) {
    try {
      await pdClient.resolveEvent(dedupKey);
      console.log("Alert resolved");
    } catch (error) {
      console.error("Failed to resolve alert:", error);
    }
  }
}

module.exports = new AlertHandler();
```
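A usage sketch tying `determineSeverity` to `sendAlert`, assuming a CPU-usage check; the thresholds, host name, and routing key are illustrative:

```javascript
// cpu-alert-example.js — thresholds and routing key are illustrative
const alertHandler = require("./alert-handler");

async function checkCpu(cpuPercent) {
  const severity = alertHandler.determineSeverity(cpuPercent, {
    warning: 75,
    critical: 90,
  });
  if (severity === "info") return; // nothing worth paging on

  await alertHandler.sendAlert({
    alertName: "HighCpuUsage",
    routingKey: process.env.PAGERDUTY_ROUTING_KEY,
    summary: `CPU usage at ${cpuPercent}% on app-server-1`,
    severity,
    component: "app-server",
    customDetails: { cpuPercent, host: "app-server-1" },
  });
}

checkCpu(93).catch(console.error);
```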
4. Alert Routing Engine
```javascript
// alert-router.js
class AlertRouter {
  constructor() {
    this.routes = [];
  }

  addRoute(rule) {
    this.routes.push({
      priority: rule.priority || 0,
      condition: rule.condition,
      handler: rule.handler,
      escalation: rule.escalation,
    });
    this.routes.sort((a, b) => b.priority - a.priority);
  }

  async route(alert) {
    for (const route of this.routes) {
      if (route.condition(alert)) {
        return await route.handler(alert, route.escalation);
      }
    }
    return this.defaultHandler(alert);
  }

  async defaultHandler(alert) {
    console.log("Routing to default handler:", alert.name);
    return { routed: true, handler: "default" };
  }
}

// Usage
const router = new AlertRouter();

router.addRoute({
  priority: 100,
  condition: (alert) =>
    alert.severity === "critical" && alert.component === "database",
  handler: async (alert) => {
    console.log("Routing critical database alert to DBA team");
    return { team: "dba", escalation: "immediate" };
  },
});

router.addRoute({
  priority: 90,
  condition: (alert) => alert.component === "payment-service",
  handler: async (alert) => {
    console.log("Routing to payment team");
    return { team: "payment", escalation: "payment-policy" };
  },
});

router.addRoute({
  priority: 10,
  condition: (alert) => alert.severity === "warning",
  handler: async (alert) => {
    console.log("Routing warning to Slack");
    return { handler: "slack-only" };
  },
});

module.exports = router;
```
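One way to connect the router to the alert handler. This glue code is a sketch, not part of the skill's files; the alert shape (`name`, `summary`, `severity`, `component`, `details`) is an assumption:

```javascript
// route-and-send.js — illustrative glue between alert-router and alert-handler
const router = require("./alert-router");
const alertHandler = require("./alert-handler");

async function handleIncomingAlert(alert) {
  // Decide which team and escalation applies...
  const decision = await router.route(alert);

  // ...then page via PagerDuty unless the route is Slack-only
  if (decision.handler === "slack-only") {
    await alertHandler.sendSlackAlert({
      summary: alert.summary,
      severity: alert.severity,
      component: alert.component,
    });
  } else {
    await alertHandler.sendAlert({
      alertName: alert.name,
      routingKey: process.env.PAGERDUTY_ROUTING_KEY,
      summary: alert.summary,
      severity: alert.severity,
      component: alert.component,
      customDetails: { ...alert.details, routedTo: decision.team || "default" },
    });
  }
  return decision;
}

module.exports = { handleIncomingAlert };
```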
5. Docker Compose Alert Stack
```yaml
# docker-compose.yml
version: "3.8"

services:
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml

  alertmanager:
    image: prom/alertmanager:latest
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    environment:
      SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL}
      PAGERDUTY_SERVICE_KEY: ${PAGERDUTY_SERVICE_KEY}
    depends_on:
      - prometheus

  alert-handler:
    build: .
    environment:
      PAGERDUTY_API_TOKEN: ${PAGERDUTY_API_TOKEN}
      SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL}
    ports:
      - "3000:3000"
    depends_on:
      - alertmanager
```
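The `alert-handler` service is assumed to expose an HTTP endpoint that Alertmanager (or another source) can call. A minimal sketch of such a receiver using Express (an assumption; any HTTP framework works). The `/alerts` path and port 3000 are assumptions, and Alertmanager would need a `webhook_config` receiver pointing at this URL to actually deliver alerts here:

```javascript
// server.js — sketch of the alert-handler container's entrypoint
const express = require("express");
const { handleIncomingAlert } = require("./route-and-send");

const app = express();
app.use(express.json());

// Alertmanager webhook payloads carry an "alerts" array with labels/annotations
app.post("/alerts", async (req, res) => {
  const alerts = req.body.alerts || [];
  for (const a of alerts) {
    await handleIncomingAlert({
      name: a.labels.alertname,
      summary: a.annotations?.summary || a.labels.alertname,
      severity: a.labels.severity || "warning",
      component: a.labels.service || "unknown",
      details: a.annotations,
    });
  }
  res.status(202).json({ received: alerts.length });
});

app.listen(3000, () => console.log("alert-handler listening on :3000"));
```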
Best Practices
✅ DO
- Set appropriate thresholds
- Implement alert deduplication
- Use clear alert names
- Include runbook links (see the sketch after this list)
- Configure escalation properly
- Test alert rules
- Monitor alert quality
- Set repeat intervals
- Track alert metrics
- Document alert meanings
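For example, a clearly named alert can carry its runbook link in the event's custom details. The URLs and field names below are illustrative:

```javascript
// runbook-example.js — URLs and field names are illustrative
const alertHandler = require("./alert-handler");

alertHandler
  .sendAlert({
    alertName: "CheckoutLatencyP99High", // clear, specific alert name
    routingKey: process.env.PAGERDUTY_ROUTING_KEY,
    summary: "checkout-api p99 latency above 2s for 10 minutes",
    severity: "warning",
    component: "checkout-api",
    customDetails: {
      runbook_url: "https://wiki.example.com/runbooks/checkout-latency",
      dashboard: "https://grafana.example.com/d/checkout",
    },
  })
  .catch(console.error);
```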
❌ DON’T
- Alert on every anomaly
- Ignore alert fatigue
- Set thresholds arbitrarily
- Skip runbooks
- Alert without action
- Disable alerts in production
- Use vague alert names
- Forget escalation policies
- Re-alert too frequently
Alert Severity Levels
- Critical: Immediate action required, customer impact
- Warning: Investigation needed, potential issues
- Info: Informational, no action required
Key Metrics
- Alert volume
- Resolution time
- False positive rate
- Escalation frequency
- MTTD (Mean Time to Detection)
- MTTR (Mean Time to Resolution)
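MTTD and MTTR can be derived from incident timestamps. A small sketch, assuming each incident record carries `startedAt` (when the problem began), `detectedAt` (when the alert fired), and `resolvedAt` ISO timestamps; the field names and sample data are assumptions:

```javascript
// alert-metrics.js — field names (startedAt, detectedAt, resolvedAt) are assumptions
function meanMinutes(incidents, fromField, toField) {
  const durations = incidents
    .filter((i) => i[fromField] && i[toField])
    .map((i) => (new Date(i[toField]) - new Date(i[fromField])) / 60000);
  if (durations.length === 0) return null;
  return durations.reduce((sum, d) => sum + d, 0) / durations.length;
}

// Sample incident records
const incidents = [
  { startedAt: "2024-05-01T09:55:00Z", detectedAt: "2024-05-01T10:00:00Z", resolvedAt: "2024-05-01T10:40:00Z" },
  { startedAt: "2024-05-02T02:00:00Z", detectedAt: "2024-05-02T02:10:00Z", resolvedAt: "2024-05-02T03:10:00Z" },
];

console.log("MTTD (min):", meanMinutes(incidents, "startedAt", "detectedAt")); // 7.5
console.log("MTTR (min):", meanMinutes(incidents, "detectedAt", "resolvedAt")); // 50
```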