Complete pre-launch checklist for deploying Groq-powered applications to production. Covers API key security, model selection, rate limit planning, fallback strategies, and monitoring setup.
.env files)gsk_ pattern in codellama-3.3-70b-versatile)llama-3.1-8b-instant)max_tokens set to actual expected output size (not context max)retry-after header implementedp-queue or similar)Groq.APIError, Groq.APIConnectionError)async function completionWithFallback(messages: any[]) {
try {
return await groq.chat.completions.create({
model: "llama-3.3-70b-versatile",
messages,
timeout: 15_000,
});
} catch (err: any) {
if (err.status === 429 || err.status >= 500) {
console.warn("Groq primary failed, trying fallback model");
try {
return await groq.chat.completions.create({
model: "llama-3.1-8b-instant",
messages,
timeout: 10_000,
});
} catch {
console.error("Groq fully unavailable, degrading gracefully");
return { choices: [{ message: { content: "Service temporarily unavailable. Please try again." } }] };
}
}
throw err;
}
}
// /api/health or /healthz
export async function GET() {
const checks: Record<string, any> = { status: "healthy" };
const start = performance.now();
try {
await groq.chat.completions.create({
model: "llama-3.1-8b-instant",
messages: [{ role: "user", content: "OK" }],
max_tokens: 1,
temperature: 0,
});
checks.groq = { status: "connected", latencyMs: Math.round(performance.now() - start) };
} catch (err: any) {
checks.status = "degraded";
checks.groq = { status: "error", error: err.status || err.message };
}
return Response.json(checks, { status: checks.status === "healthy" ? 200 : 503 });
}
groq-incident-runbook)set -euo pipefail
# Pre-flight checks
echo "1. Groq API status..."
curl -sf https://status.groq.com > /dev/null && echo "OK" || echo "ISSUE"
echo "2. Production key valid..."
curl -sf https://api.groq.com/openai/v1/models \
-H "Authorization: Bearer $GROQ_API_KEY_PROD" | jq '.data | length'
echo "3. Health endpoint..."
curl -sf https://your-app.com/api/health | jq .
echo "4. Rate limit headroom..."
curl -si https://api.groq.com/openai/v1/chat/completions \
-H "Authorization: Bearer $GROQ_API_KEY_PROD" \
-H "Content-Type: application/json" \
-d '{"model":"llama-3.1-8b-instant","messages":[{"role":"user","content":"ping"}],"max_tokens":1}' \
2>/dev/null | grep -i "x-ratelimit-remaining"
| Alert | Condition | Severity |
|---|---|---|
| API errors spike | 5xx rate > 5/min | P1 |
| Latency degraded | p95 > 1000ms | P2 |
| Rate limited | 429 count > 5/min | P2 |
| Auth failure | Any 401 error | P1 |
| Spending near cap | >90% of monthly budget | P3 |
For version upgrades and model migrations, see the groq-upgrade-migration guide.