Prometheus Integration
Export async-inspect metrics to Prometheus for monitoring and alerting.
Quick Start
use async_inspect::{Inspector, Config};
use async_inspect::integrations::PrometheusExporter;
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Create inspector
let inspector = Inspector::new(Config::default());
// Start Prometheus exporter
let exporter = PrometheusExporter::new(inspector.clone());
exporter.start_server("0.0.0.0:9090").await?;
println!("Metrics available at http://localhost:9090/metrics");
// Your application code
Ok(())
}
Installation
Add the prometheus-export feature:
[dependencies]
async-inspect = { version = "0.1", features = ["prometheus-export"] }
Exported Metrics
Task Metrics
async_inspect_tasks_total
Type: Counter
Description: Total number of tasks created
Labels:
name: Task function name
# Rate of task creation
rate(async_inspect_tasks_total[5m])
# Top 10 most created tasks
topk(10, sum by (name) (async_inspect_tasks_total))
async_inspect_tasks_by_state
Type: Gauge
Description: Current number of tasks in each state
Labels:
state: running, blocked, completed, failed
# Currently running tasks
async_inspect_tasks_by_state{state="running"}
# Percentage of blocked tasks
sum(async_inspect_tasks_by_state{state="blocked"})
/ sum(async_inspect_tasks_by_state) * 100
async_inspect_task_duration_seconds
Type: Histogram
Description: Task execution duration
Labels:
name: Task function name
state: Final state (completed, failed)
Buckets: 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0
# 99th percentile task duration
histogram_quantile(0.99,
sum(rate(async_inspect_task_duration_seconds_bucket[5m])) by (le, name)
)
# Average task duration by name
sum(rate(async_inspect_task_duration_seconds_sum[5m])) by (name)
/ sum(rate(async_inspect_task_duration_seconds_count[5m])) by (name)
Event Metrics
async_inspect_events_total
Type: Counter
Description: Total number of events recorded
Labels:
type: poll, wake, drop, spawn
# Event rate by type
rate(async_inspect_events_total[5m])
# Most active event types
topk(5, sum by (type) (rate(async_inspect_events_total[5m])))
async_inspect_poll_count
Type: Histogram
Description: Number of polls per task
Labels:
name: Task function name
# Tasks with excessive polling (p99 polls per task above 1000)
histogram_quantile(0.99, sum(rate(async_inspect_poll_count_bucket[5m])) by (le, name)) > 1000
Deadlock Metrics
async_inspect_deadlocks_detected
Type: Counter
Description: Number of deadlocks detected
# Any deadlocks in last 5 minutes?
increase(async_inspect_deadlocks_detected[5m]) > 0
Resource Metrics
async_inspect_memory_bytes
Type: Gauge
Description: Memory used by async-inspect
Labels:
component: tasks, events, metadata
# Total memory usage
sum(async_inspect_memory_bytes)
# Memory by component
async_inspect_memory_bytes
Configuration
Custom Port
exporter.start_server("0.0.0.0:9091").await?;
Custom Endpoint Path
let exporter = PrometheusExporter::builder()
.path("/custom/metrics") // Default: /metrics
.port(9090)
.build(inspector.clone());
Update Interval
let exporter = PrometheusExporter::builder()
.update_interval(Duration::from_secs(5)) // Default: 1s
.build(inspector.clone());
Custom Registry
use prometheus::Registry;
let registry = Registry::new();
let exporter = PrometheusExporter::with_registry(
inspector.clone(),
registry
);
Grafana Dashboard
Import Pre-built Dashboard
- Download dashboard JSON: async-inspect-dashboard.json
- Grafana → Dashboards → Import
- Upload JSON file
- Select Prometheus data source
Dashboard Panels
The included dashboard has:
-
Overview
- Total tasks
- Active tasks
- Task creation rate
- Deadlock count
-
Performance
- Task duration (p50, p95, p99)
- Slowest tasks
- Poll count distribution
-
Task States
- Running tasks (time series)
- Blocked tasks (time series)
- Completed vs Failed ratio
-
Events
- Event rate by type
- Poll/Wake ratio
- Spawn rate
Custom Dashboard Example
{
"panels": [
{
"title": "Active Tasks",
"targets": [
{
"expr": "async_inspect_tasks_by_state{state=\"running\"}"
}
]
},
{
"title": "Task Duration (p99)",
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(async_inspect_task_duration_seconds_bucket[5m])) by (le, name))"
}
]
}
]
}
Alerting
Prometheus Alert Rules
groups:
- name: async_inspect_alerts
rules:
# Alert on high number of running tasks
- alert: HighTaskCount
expr: async_inspect_tasks_by_state{state="running"} > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "High number of concurrent tasks"
description: "{{ $value }} tasks currently running"
# Alert on deadlocks
- alert: DeadlockDetected
expr: increase(async_inspect_deadlocks_detected[5m]) > 0
labels:
severity: critical
annotations:
summary: "Deadlock detected in async tasks"
description: "{{ $value }} deadlock(s) detected"
# Alert on slow tasks
- alert: SlowTasksDetected
expr: |
histogram_quantile(0.99,
sum(rate(async_inspect_task_duration_seconds_bucket[5m])) by (le, name)
) > 10
for: 5m
labels:
severity: warning
annotations:
summary: "Tasks taking longer than expected"
description: "p99 duration: {{ $value }}s"
# Alert on high blocked task percentage
- alert: HighBlockedTaskPercentage
expr: |
sum(async_inspect_tasks_by_state{state="blocked"})
/ sum(async_inspect_tasks_by_state) * 100 > 50
for: 10m
labels:
severity: warning
annotations:
summary: "More than 50% of tasks are blocked"
description: "{{ $value }}% of tasks blocked"
# Alert on memory usage
- alert: HighMemoryUsage
expr: sum(async_inspect_memory_bytes) > 1e9 # 1GB
for: 5m
labels:
severity: warning
annotations:
summary: "async-inspect using excessive memory"
description: "Memory usage: {{ $value | humanize }}B"
Recording Rules
Pre-aggregate expensive queries:
groups:
- name: async_inspect_recording
interval: 10s
rules:
# Task creation rate
- record: async_inspect:task_creation_rate:5m
expr: rate(async_inspect_tasks_total[5m])
# Average task duration
- record: async_inspect:task_duration_avg:5m
expr: |
sum(rate(async_inspect_task_duration_seconds_sum[5m]))
/ sum(rate(async_inspect_task_duration_seconds_count[5m]))
# Blocked task percentage
- record: async_inspect:blocked_percentage
expr: |
sum(async_inspect_tasks_by_state{state="blocked"})
/ sum(async_inspect_tasks_by_state) * 100
Advanced Usage
Multiple Inspectors
let inspector1 = Inspector::new(Config::default());
let inspector2 = Inspector::new(Config::default());
// Single exporter for both
let exporter = PrometheusExporter::multi(vec![
inspector1.clone(),
inspector2.clone(),
]);
Custom Metrics
Add your own metrics to the same registry:
use prometheus::{Counter, Registry};
let registry = Registry::new();
// Your custom metric
let requests = Counter::new("http_requests_total", "Total requests")?;
registry.register(Box::new(requests.clone()))?;
// async-inspect metrics
let exporter = PrometheusExporter::with_registry(
inspector.clone(),
registry.clone()
);
Filtering Metrics
let exporter = PrometheusExporter::builder()
.enable_task_metrics(true)
.enable_event_metrics(false) // Disable event metrics
.enable_memory_metrics(true)
.build(inspector.clone());
Integration with Existing Monitoring
Kubernetes
apiVersion: v1
kind: Service
metadata:
name: myapp
labels:
app: myapp
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9090"
prometheus.io/path: "/metrics"
spec:
selector:
app: myapp
ports:
- name: metrics
port: 9090
targetPort: 9090
Docker Compose
version: '3'
services:
app:
build: .
ports:
- "9090:9090"
labels:
- "prometheus.scrape=true"
- "prometheus.port=9090"
prometheus:
image: prom/prometheus
ports:
- "9091:9090"
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
command:
- '--config.file=/etc/prometheus/prometheus.yml'
prometheus.yml:
scrape_configs:
- job_name: 'async-inspect'
static_configs:
- targets: ['app:9090']
Systemd Service
[Unit]
Description=My Async App
After=network.target
[Service]
Type=simple
ExecStart=/usr/local/bin/myapp
Environment="ASYNC_INSPECT_METRICS_PORT=9090"
Restart=always
[Install]
WantedBy=multi-user.target
Best Practices
1. Use Recording Rules
Pre-aggregate expensive queries:
# Instead of calculating p99 in Grafana
- record: async_inspect:duration_p99:5m
expr: histogram_quantile(0.99, ...)
2. Limit Cardinality
Avoid high-cardinality labels:
// ❌ BAD - too many unique values
task_duration{task_id="12345"}
// ✅ GOOD - bounded cardinality
task_duration{name="fetch_user"}
3. Set Appropriate Scrape Intervals
scrape_configs:
- job_name: 'async-inspect'
scrape_interval: 15s # Balance freshness vs load
4. Retention
Configure retention based on needs:
# Prometheus
--storage.tsdb.retention.time=30d
--storage.tsdb.retention.size=50GB
Troubleshooting
Metrics not appearing
-
Check that the endpoint is accessible:
curl http://localhost:9090/metrics
-
Verify that Prometheus is scraping:
# Check targets in Prometheus UI
http://localhost:9091/targets
-
Enable debug logging:
env_logger::Builder::from_env(
env_logger::Env::default().default_filter_or("async_inspect=debug")
).init();
High scrape duration
Reduce metrics:
let exporter = PrometheusExporter::builder()
.enable_event_metrics(false) // Disable if not needed
.build(inspector.clone());
Examples
Complete example: examples/prometheus_integration.rs