Canary Releases erklärt
Canary Releases: Sichere, schrittweise Deployments
Canary Releases ermöglichen das Testen neuer Versionen mit echten Nutzern bei minimalem Risiko. Lernen Sie Traffic-Splitting, Monitoring und automatische Rollbacks.
Das Konzept
┌─────────────────────────────────────────────────────────────┐
│                       CANARY RELEASE                        │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  PHASE 1: Canary Start (5% Traffic)                         │
│                ┌───────────────┐                            │
│                │ LOAD BALANCER │                            │
│                └───────┬───────┘                            │
│                  ┌─────┴─────┐                              │
│                 95%          5%                             │
│                  ▼           ▼                              │
│              ┌───────┐   ┌───────┐                          │
│              │ v1.0  │   │ v1.1  │ ← Canary (neue Version)  │
│              │Stable │   │Canary │                          │
│              └───────┘   └───────┘                          │
│                                                             │
│  PHASE 2: Graduelle Erhöhung                                │
│  5% → 10% → 25% → 50% → 100%                                │
│                                                             │
│  Bei Problemen: Sofort auf 0% zurück                        │
└─────────────────────────────────────────────────────────────┘
Canary vs Blue-Green vs Rolling
| Strategie | Risiko | Ressourcen | Rollback |
|---|---|---|---|
| Canary | Minimal (nur X% betroffen) | Gering (wenige Canary-Instanzen) | Traffic-Umleitung |
| Blue-Green | Mittel (100% Switch) | Hoch (doppelte Infrastruktur) | Sofortiger Switch |
| Rolling | Mittel (schrittweise Pods) | Gering (keine Extra-Kapazität) | Rollback Deployment |
Nginx Canary mit Weight
# /etc/nginx/conf.d/canary.conf
upstream backend_stable {
server stable-1:8080 weight=95;
server stable-2:8080 weight=95;
}
upstream backend_canary {
server canary-1:8080 weight=5;
}
# Kombinierter Upstream mit Gewichtung
upstream backend {
server stable-1:8080 weight=95;
server stable-2:8080 weight=95;
server canary-1:8080 weight=5;
}
server {
listen 80;
server_name app.example.com;
location / {
proxy_pass http://backend;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
# Header für Debugging
add_header X-Backend-Server $upstream_addr;
}
}
Cookie-basiertes Canary
# Bestimmte User gezielt zur Canary routen
map $cookie_canary $backend_pool {
"true" canary;
default stable;
}
upstream stable {
server stable-1:8080;
server stable-2:8080;
}
upstream canary {
server canary-1:8080;
}
server {
listen 80;
location / {
proxy_pass http://$backend_pool;
# Canary-Cookie für Beta-Tester setzen
# (via separate Admin-Aktion)
}
}
// Canary für interne Mitarbeiter
// JavaScript: Canary-Cookie setzen
function enableCanary() {
document.cookie = "canary=true; path=/; max-age=86400";
location.reload();
}
function disableCanary() {
document.cookie = "canary=; path=/; max-age=0";
location.reload();
}
// Admin-Panel Button
<button onclick="enableCanary()">Enable Canary</button>
Kubernetes Canary mit Ingress
# stable-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: myapp-stable
spec:
replicas: 9 # 90% der Pods
selector:
matchLabels:
app: myapp
track: stable
template:
metadata:
labels:
app: myapp
track: stable
spec:
containers:
- name: myapp
image: myapp:v1.0
ports:
- containerPort: 8080
# canary-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: myapp-canary
spec:
replicas: 1 # 10% der Pods
selector:
matchLabels:
app: myapp
track: canary
template:
metadata:
labels:
app: myapp
track: canary
spec:
containers:
- name: myapp
image: myapp:v1.1
ports:
- containerPort: 8080
# service.yaml - Routet zu beiden Tracks
apiVersion: v1
kind: Service
metadata:
name: myapp
spec:
selector:
app: myapp # Matched beide: stable UND canary
ports:
- port: 80
targetPort: 8080
# Traffic-Split basiert auf Replica-Anzahl:
# 9 stable + 1 canary = 10% Canary Traffic
Istio Traffic Splitting
# VirtualService für präzises Traffic-Splitting
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
name: myapp
spec:
hosts:
- myapp
http:
- route:
- destination:
host: myapp
subset: stable
weight: 95
- destination:
host: myapp
subset: canary
weight: 5
---
apiVersion: networking.istio.io/v1beta1
kind: DestinationRule
metadata:
name: myapp
spec:
host: myapp
subsets:
- name: stable
labels:
track: stable
- name: canary
labels:
track: canary
#!/bin/bash
# canary-promote.sh - Canary schrittweise erhöhen
STEPS=(5 10 25 50 75 100)
for weight in "${STEPS[@]}"; do
stable_weight=$((100 - weight))
echo "Setting canary to ${weight}%..."
kubectl patch virtualservice myapp --type=json -p="[
{\"op\": \"replace\", \"path\": \"/spec/http/0/route/0/weight\", \"value\": $stable_weight},
{\"op\": \"replace\", \"path\": \"/spec/http/0/route/1/weight\", \"value\": $weight}
]"
echo "Waiting 5 minutes for metrics..."
sleep 300
# Error-Rate prüfen
ERROR_RATE=$(curl -s "http://prometheus:9090/api/v1/query?query=rate(http_errors_total{version='canary'}[5m])" | jq '.data.result[0].value[1]')
if (( $(echo "$ERROR_RATE > 0.01" | bc -l) )); then
echo "Error rate too high (${ERROR_RATE}), rolling back!"
kubectl patch virtualservice myapp --type=json -p="[
{\"op\": \"replace\", \"path\": \"/spec/http/0/route/0/weight\", \"value\": 100},
{\"op\": \"replace\", \"path\": \"/spec/http/0/route/1/weight\", \"value\": 0}
]"
exit 1
fi
echo "Canary at ${weight}% - metrics OK"
done
echo "Canary promotion complete!"
Argo Rollouts (GitOps)
# Argo Rollout mit automatischer Canary-Analyse
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
name: myapp
spec:
replicas: 10
selector:
matchLabels:
app: myapp
template:
metadata:
labels:
app: myapp
spec:
containers:
- name: myapp
image: myapp:v1.1
ports:
- containerPort: 8080
strategy:
canary:
steps:
- setWeight: 5
- pause: {duration: 5m}
- setWeight: 10
- pause: {duration: 5m}
- setWeight: 25
- pause: {duration: 10m}
- setWeight: 50
- pause: {duration: 10m}
- setWeight: 100
# Automatische Analyse
analysis:
templates:
- templateName: success-rate
startingStep: 2
args:
- name: service-name
value: myapp
# AnalysisTemplate für Canary-Bewertung
apiVersion: argoproj.io/v1alpha1
kind: AnalysisTemplate
metadata:
name: success-rate
spec:
args:
- name: service-name
metrics:
- name: success-rate
interval: 1m
count: 5
successCondition: result[0] >= 0.99
failureLimit: 3
provider:
prometheus:
address: http://prometheus:9090
query: |
sum(rate(http_requests_total{service="{{args.service-name}}",status=~"2.."}[5m])) /
sum(rate(http_requests_total{service="{{args.service-name}}"}[5m]))
Monitoring und Metriken
// Wichtige Canary-Metriken
const canaryMetrics = {
// Vergleiche Canary vs Stable
errorRate: {
canary: 'rate(http_errors_total{track="canary"}[5m])',
stable: 'rate(http_errors_total{track="stable"}[5m])',
threshold: 'canary <= stable * 1.1' // Max 10% mehr Fehler
},
latency: {
canary: 'histogram_quantile(0.99, rate(http_duration_bucket{track="canary"}[5m]))',
stable: 'histogram_quantile(0.99, rate(http_duration_bucket{track="stable"}[5m]))',
threshold: 'canary <= stable * 1.2' // Max 20% langsamer
},
saturation: {
canary: 'rate(http_requests_total{track="canary"}[5m])',
// Canary sollte proportional Traffic erhalten
}
};
// Alerting
const alerts = [
{
name: 'CanaryHighErrorRate',
expr: 'rate(http_errors_total{track="canary"}[5m]) > 0.01',
for: '2m',
severity: 'critical',
action: 'auto_rollback'
},
{
name: 'CanaryHighLatency',
expr: 'histogram_quantile(0.99, rate(http_duration_bucket{track="canary"}[5m])) > 2',
for: '5m',
severity: 'warning',
action: 'notify_oncall'
}
];
Feature Flags + Canary
// Kombination: Canary für Code, Feature Flags für Features
class FeatureService {
constructor(private userId: string) {}
isCanaryUser(): boolean {
// User ist auf Canary-Infrastruktur
return process.env.DEPLOYMENT_TRACK === 'canary';
}
isFeatureEnabled(feature: string): boolean {
// Feature Flag (unabhängig von Infrastruktur)
return this.featureFlags.isEnabled(feature, this.userId);
}
// Progressives Rollout
shouldShowNewCheckout(): boolean {
// Canary-User UND Feature-Flag aktiviert
return this.isCanaryUser() && this.isFeatureEnabled('new_checkout');
}
}
// Vorteile:
// - Canary testet Infrastruktur/Performance
// - Feature Flags testen neue Features
// - Beide können unabhängig gerollt werden
💡 Best Practices:
1. Klein starten (1-5%) und langsam erhöhen
2. Mindestens 5-10 Minuten pro Stufe warten
3. Automatische Rollback-Trigger bei Metriken-Abweichung
4. A/B-Vergleich: Canary vs Stable Metriken
5. Canary zuerst auf interne User/Regionen limitieren