Monitoring Setup Examples
Configure comprehensive monitoring with Ops Atlas.
Dashboard Configuration
System Overview Dashboard
json
{
"name": "System Overview",
"widgets": [
{
"type": "metric",
"title": "Total Requests",
"query": "sum(requests_total)",
"position": { "x": 0, "y": 0, "w": 3, "h": 2 }
},
{
"type": "metric",
"title": "Error Rate",
"query": "avg(error_rate)",
"threshold": { "warning": 1, "critical": 5 },
"position": { "x": 3, "y": 0, "w": 3, "h": 2 }
},
{
"type": "chart",
"title": "Request Rate",
"query": "sum(requests_per_sec) by service",
"chart_type": "line",
"position": { "x": 0, "y": 2, "w": 6, "h": 4 }
},
{
"type": "table",
"title": "Service Status",
"query": "service_status",
"position": { "x": 6, "y": 0, "w": 6, "h": 6 }
}
]
}
Alert Rules
Critical Alerts
yaml
# alerts/critical.yml
# Critical-severity alert rules. Each item of `alerts` is one rule; every rule
# in this file lists its notification channels explicitly.
alerts:
  - name: Service Down
    condition: service_status == "down"
    duration: 1m
    severity: critical
    notifications:
      - slack
      - pagerduty
    annotations:
      summary: "Service {{ $labels.service }} is down"
      runbook: "https://wiki.example.com/runbooks/service-down"
  - name: High Error Rate
    condition: error_rate > 5
    duration: 5m
    severity: critical
    notifications:
      - slack
      - email
    annotations:
      summary: "Error rate is {{ $value }}% on {{ $labels.service }}"
  # NOTE(review): this rule has no `annotations`, unlike the two above —
  # confirm that is intended.
  - name: Database Connection Failed
    condition: db_connection_status == 0
    duration: 30s
    severity: critical
    notifications:
      - slack
      - pagerduty
Warning Alerts
yaml
# alerts/warnings.yml
# Warning-severity alert rules. Each item of `alerts` is one rule.
alerts:
  - name: High CPU Usage
    condition: cpu_percent > 80
    duration: 10m
    severity: warning
    notifications:
      - slack
    annotations:
      summary: "CPU at {{ $value }}% on {{ $labels.service }}"
  - name: High Memory Usage
    condition: memory_percent > 85
    duration: 10m
    severity: warning
    notifications:
      - slack
  # NOTE(review): the two rules below declare no `notifications` list —
  # confirm whether a default channel applies or one should be added.
  - name: Slow Response Time
    condition: response_time_ms > 500
    duration: 5m
    severity: warning
    annotations:
      summary: "Response time is {{ $value }}ms"
  - name: Disk Space Low
    condition: disk_percent > 80
    duration: 30m
    severity: warning
Notification Templates
Slack Template
json
{
"attachments": [
{
"color": "{{ if eq .Severity \"critical\" }}danger{{ else }}warning{{ end }}",
"title": "{{ .AlertName }}",
"title_link": "{{ .DashboardURL }}",
"text": "{{ .Summary }}",
"fields": [
{
"title": "Service",
"value": "{{ .Service }}",
"short": true
},
{
"title": "Severity",
"value": "{{ .Severity }}",
"short": true
},
{
"title": "Started",
"value": "{{ .StartedAt }}",
"short": true
}
],
"footer": "Ops Atlas",
"ts": {{ .Timestamp }}
}
]
}
Health Checks
yaml
# health-checks.yml
# One health check per service: an HTTP check, a TCP check and a Redis check.
services:
  - name: api-gateway
    type: http
    url: https://api.example.com/health
    interval: 30s
    timeout: 5s
    expected_status: 200
    # Single-quoted so the JSON text stays a string instead of being parsed
    # as a YAML flow mapping.
    expected_body: '{"status":"ok"}'
  - name: database
    type: tcp
    host: db.example.com
    port: 5432
    interval: 10s
    timeout: 3s
  # NOTE(review): no `timeout` on this check, unlike the other two — confirm
  # that is intended.
  - name: redis
    type: redis
    url: redis://redis.example.com:6379
    interval: 10s
SLO Configuration
yaml
# slos.yml
# Service-level objectives. Each item pairs a `target` and a `window` with an
# `indicator` mapping.
# NOTE(review): `threshold` is assumed to belong to `indicator` because it
# directly follows the indicator's other keys — confirm against the schema.
slos:
  - name: API Availability
    target: 99.9
    window: 30d
    indicator:
      type: availability
      service: api-gateway
  - name: API Latency
    target: 95
    window: 30d
    indicator:
      type: latency
      service: api-gateway
      threshold: 200ms
  - name: Error Budget
    target: 99.5
    window: 7d
    indicator:
      type: error_rate
      # Quoted: a bare * would be read as a YAML alias indicator.
      service: "*"
      threshold: 0.5