Skip to main content

ML Engine API

Base URL: http://localhost:8080/api/v1/ml

All endpoints require JWT authentication.


POST /api/v1/ml/analyze

Comprehensive incident analysis: root cause, blast radius, and predicted future anomalies.

Request

{
"anomaly_id": "anom-7f3d",
"service_id": "api-gateway",
"metric_name": "cpu_usage_percent",
"timestamp": "2026-04-10T12:00:00Z",
"anomaly_score": 0.95
}

Response 200

{
"anomaly_id": "anom-7f3d",
"service_id": "api-gateway",
"analyzed_at": "2026-04-10T12:00:20Z",
"root_cause": {
"category": "infrastructure",
"confidence": 0.92,
"summary": "CPU saturation caused by memory pressure from uncached DB queries",
"evidence": {
"cpu_high": 0.90,
"memory_high": 0.85
},
"suggested_actions": [
"Scale to 5 pods",
"Restart pods to clear leaked resources"
],
"historical_matches": [
{
"incident_id": "inc-march-14",
"resolution": "Scaled from 3 to 5 pods",
"similarity": 0.89,
"time_to_recovery_mins": 12
}
]
},
"features": {
"raw_value": 87.3,
"derivative_value": 2.1,
"second_derivative": 0.3,
"hour_of_day": 12,
"day_of_week": 4,
"is_weekend": false,
"correlated_metrics": {
"memory_usage_percent": 0.82,
"error_rate": 0.71
}
},
"blast_radius": ["payment-service", "user-service"],
"causal_relationships": [
{
"cause_metric": "cpu_usage_percent",
"effect_metric": "error_rate",
"strength": 0.78,
"lag_minutes": 2
}
],
"predicted_anomalies": [
{
"metric_name": "memory_usage_percent",
"confidence_score": 0.84,
"prediction_reason": "Memory trending toward 90% threshold",
"suggested_runbook": "clear-caches",
"lead_time_minutes": 15
}
]
}

GET /api/v1/ml/predict

Predict upcoming anomalies for a service.

Query parameters

| Param | Type | Default | Description |
| --- | --- | --- | --- |
| service_id | string | required | Service to predict for |
| horizon_minutes | int | 30 | Prediction window |

Response 200

{
"service_id": "database",
"generated_at": "2026-04-10T12:00:00Z",
"horizon_minutes": 30,
"predictions": [
{
"metric_name": "cpu_usage_percent",
"confidence_score": 0.78,
"prediction_reason": "CPU trending toward high values",
"suggested_runbook": "scale-up-compute",
"lead_time_minutes": 15
}
]
}

Detect metrics slowly degrading toward critical thresholds.

Query parameters

| Param | Type | Description |
| --- | --- | --- |
| service_id | string | Optional filter |
| lookback_hours | int | Historical window (default: 24) |

Response 200

{
"trends": [
{
"service_id": "payment-api",
"metric_name": "memory_usage_percent",
"current_value": 72.4,
"trend_slope_per_hour": 0.8,
"estimated_breach_minutes": 45,
"threshold": 90.0,
"confidence": 0.91
}
]
}

GET /api/v1/ml/forecast

Time-series forecast with confidence intervals.

Query parameters

| Param | Type | Default | Description |
| --- | --- | --- | --- |
| service_id | string | required | Target service |
| metric | string | required | Metric name |
| horizon_hours | int | 24 | Forecast window |
| algorithm | string | auto | arima, exponential_smoothing, or auto |

Response 200

{
"service_id": "payment-api",
"metric_name": "request_rate",
"algorithm": "arima",
"forecast": [
{
"timestamp": "2026-04-11T00:00:00Z",
"predicted_value": 1250.3,
"lower_bound": 1100.0,
"upper_bound": 1400.0,
"confidence": 0.95
}
]
}

GET /api/v1/ml/causal

Discover causal relationships between metrics for a service.

Response 200

{
"service_id": "checkout-service",
"relationships": [
{
"cause_metric": "db_query_latency_ms",
"effect_metric": "response_time_ms",
"strength": 0.91,
"lag_minutes": 2,
"confidence": 0.88
}
]
}

POST /api/v1/ml/validate-causality

Test whether a suspected causal relationship is statistically valid.

Request

{
"service_id": "payment-api",
"cause_metric": "cpu_usage_percent",
"effect_metric": "error_rate",
"lookback_hours": 72
}

Response 200

{
"is_causal": true,
"strength": 0.84,
"lag_minutes": 3,
"confidence": 0.91,
"p_value": 0.003,
"evidence": "cpu_usage_percent consistently precedes error_rate by 3±1 minutes across 7 incidents"
}

GET /api/v1/ml/blast-radius

Estimate the blast radius for a service failure.

Query parameters

| Param | Type | Description |
| --- | --- | --- |
| service_id | string | Origin service |
| metric | string | Optional — metric causing the failure |

Response 200

{
"origin_service": "database",
"directly_affected": ["payment-api", "user-service"],
"transitively_affected": ["checkout-service", "notification-service"],
"estimated_severity": "high",
"confidence": 0.87
}

POST /api/v1/ml/train

Train a new ML model. Requires Admin role.

Request

{
"service_id": "api-gateway",
"training_window_days": 30,
"model_type": "isolation_forest"
}

Response 202

{
"job_id": "train-job-abc123",
"status": "queued",
"estimated_duration_minutes": 5
}

POST /api/v1/ml/promote

Promote a shadow model to production. Requires Admin role.

Request

{
"model_id": "model-abc123",
"service_id": "api-gateway"
}

GET /api/v1/ml/drift

Check model performance drift for a service.

Response 200

{
"service_id": "api-gateway",
"model_id": "model-abc123",
"drift_score": 0.12,
"drift_detected": false,
"kl_divergence": 0.08,
"last_trained": "2026-03-15T00:00:00Z",
"recommendation": "Model is healthy."
}