Skip to content

Monitoring Setup Examples

Configure comprehensive monitoring with Ops Atlas.

Dashboard Configuration

System Overview Dashboard

json
{
  "name": "System Overview",
  "widgets": [
    {
      "type": "metric",
      "title": "Total Requests",
      "query": "sum(requests_total)",
      "position": { "x": 0, "y": 0, "w": 3, "h": 2 }
    },
    {
      "type": "metric",
      "title": "Error Rate",
      "query": "avg(error_rate)",
      "threshold": { "warning": 1, "critical": 5 },
      "position": { "x": 3, "y": 0, "w": 3, "h": 2 }
    },
    {
      "type": "chart",
      "title": "Request Rate",
      "query": "sum(requests_per_sec) by service",
      "chart_type": "line",
      "position": { "x": 0, "y": 2, "w": 6, "h": 4 }
    },
    {
      "type": "table",
      "title": "Service Status",
      "query": "service_status",
      "position": { "x": 6, "y": 0, "w": 6, "h": 6 }
    }
  ]
}

Alert Rules

Critical Alerts

yaml
# alerts/critical.yml
# Critical-severity alert rules.
# Each rule names a condition, a duration, the channels to notify, and a
# summary annotation.
# NOTE(review): duration is presumably how long the condition must hold
# before the alert fires — confirm against the alerting engine.
alerts:
  - name: Service Down
    condition: service_status == "down"
    duration: 1m
    severity: critical
    notifications:
      - slack
      - pagerduty
    annotations:
      summary: "Service {{ $labels.service }} is down"
      runbook: "https://wiki.example.com/runbooks/service-down"

  - name: High Error Rate
    condition: error_rate > 5
    duration: 5m
    severity: critical
    notifications:
      - slack
      - email
    annotations:
      summary: "Error rate is {{ $value }}% on {{ $labels.service }}"

  - name: Database Connection Failed
    condition: db_connection_status == 0
    duration: 30s
    severity: critical
    notifications:
      - slack
      - pagerduty
    # A summary is set so that notification templates which render the
    # alert summary as their message text do not produce an empty body.
    annotations:
      summary: "Database connection check is failing"

Warning Alerts

yaml
# alerts/warnings.yml
# Warning-severity alert rules.
# Every rule carries a summary annotation so that anything rendering the
# alert summary has text to show.
alerts:
  - name: High CPU Usage
    condition: cpu_percent > 80
    duration: 10m
    severity: warning
    notifications:
      - slack
    annotations:
      summary: "CPU at {{ $value }}% on {{ $labels.service }}"

  - name: High Memory Usage
    condition: memory_percent > 85
    duration: 10m
    severity: warning
    notifications:
      - slack
    annotations:
      summary: "Memory usage is {{ $value }}%"

  # NOTE(review): no notifications are listed for this rule — confirm that
  # a default channel applies, or add one (the rules above use slack).
  - name: Slow Response Time
    condition: response_time_ms > 500
    duration: 5m
    severity: warning
    annotations:
      summary: "Response time is {{ $value }}ms"

  # NOTE(review): no notifications are listed for this rule — confirm that
  # a default channel applies, or add one (the rules above use slack).
  - name: Disk Space Low
    condition: disk_percent > 80
    duration: 30m
    severity: warning
    annotations:
      summary: "Disk usage is {{ $value }}%"

Notification Templates

Slack Template

json
{
  "attachments": [
    {
      "color": "{{ if eq .Severity \"critical\" }}danger{{ else }}warning{{ end }}",
      "title": "{{ .AlertName }}",
      "title_link": "{{ .DashboardURL }}",
      "text": "{{ .Summary }}",
      "fields": [
        {
          "title": "Service",
          "value": "{{ .Service }}",
          "short": true
        },
        {
          "title": "Severity",
          "value": "{{ .Severity }}",
          "short": true
        },
        {
          "title": "Started",
          "value": "{{ .StartedAt }}",
          "short": true
        }
      ],
      "footer": "Ops Atlas",
      "ts": "{{ .Timestamp }}"
    }
  ]
}

Health Checks

yaml
# health-checks.yml
# One list entry per monitored service. Each entry names the check type,
# the target to reach, and how often to run it.
services:
  # HTTP check; expected_status and expected_body describe the response
  # the check looks for.
  - name: api-gateway
    type: http
    url: 'https://api.example.com/health'
    interval: 30s
    timeout: 5s
    expected_status: 200
    expected_body: '{"status":"ok"}'

  # TCP check addressed by host and port.
  - name: database
    type: tcp
    host: db.example.com
    port: 5432
    interval: 10s
    timeout: 3s

  # Redis check addressed by URL.
  # NOTE(review): unlike the two checks above, no timeout is set here —
  # confirm that a default applies.
  - name: redis
    type: redis
    url: 'redis://redis.example.com:6379'
    interval: 10s

SLO Configuration

yaml
# slos.yml
# Service-level objectives. Each entry pairs a target and a time window
# with the indicator that is measured.
# NOTE(review): targets look like percentages — confirm the unit.
slos:
  # Availability objective for the api-gateway service.
  - name: API Availability
    target: 99.9
    window: 30d
    indicator:
      type: availability
      service: api-gateway

  # Latency objective for the api-gateway service, with a 200ms threshold.
  - name: API Latency
    target: 95
    window: 30d
    indicator:
      type: latency
      service: api-gateway
      threshold: '200ms'

  # Error-rate objective; the service selector is the literal string "*".
  - name: Error Budget
    target: 99.5
    window: 7d
    indicator:
      type: error_rate
      service: '*'
      threshold: 0.5

Released under the MIT License.