Production architecture for AI inference, fine-tuning, and batch processing with Together AI's OpenAI-compatible API. Designed for teams routing requests across 100+ open-source models (Llama, Mixtral, Qwen, FLUX) with intelligent model selection, response caching, fine-tune pipeline management, and cost optimization via batch inference at a 50% discount. Key design drivers: model routing for cost/quality tradeoffs, inference caching for repeated queries, fine-tune lifecycle management, and graceful degradation across models.
Application ──→ Model Router ──→ Cache (Redis) ──→ Together API (v1)
      ↓                                               /chat/completions
Queue (Bull) ──→ Batch Worker                         /completions
      ↓                                               /images/generations
Fine-Tune Manager ──→ Together API                    /fine-tunes
      ↓                                               /models
Cost Tracker ──→ Analytics Dashboard
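The Model Router in the diagram has no implementation shown elsewhere in this section; a minimal in-memory sketch, assuming a static routing table keyed by task and the ModelRoute shape defined in the data-model list below (a production router would refresh pricing and latency from /models):

class ModelRouter {
  private routes = new Map<string, ModelRoute[]>();

  // Called at startup and by the fine-tune pipeline when a new model goes live
  registerModel(modelId: string, meta: { task: string; cost: number; latencyP50Ms?: number; qualityScore?: number }): void {
    const list = this.routes.get(meta.task) ?? [];
    list.push({ modelId, task: meta.task, costPerToken: meta.cost, latencyP50Ms: meta.latencyP50Ms ?? 0, qualityScore: meta.qualityScore ?? 0.5 });
    this.routes.set(meta.task, list);
  }

  // Cost/quality tradeoff: batch picks the cheapest model, realtime the fastest, standard the highest quality
  selectModel(task: string, priority: 'realtime' | 'standard' | 'batch'): string {
    const candidates = this.routes.get(task);
    if (!candidates?.length) throw new Error(`No model registered for task: ${task}`);
    const ranked = [...candidates].sort((a, b) =>
      priority === 'batch' ? a.costPerToken - b.costPerToken
        : priority === 'realtime' ? a.latencyP50Ms - b.latencyP50Ms
        : b.qualityScore - a.qualityScore);
    return ranked[0].modelId;
  }
}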
class InferenceService {
constructor(private together: TogetherClient, private cache: CacheLayer, private router: ModelRouter, private costTracker: CostTracker) {}
async complete(request: InferenceRequest): Promise<InferenceResponse> {
const model = this.router.selectModel(request.task, request.priority);
const cacheKey = `inference:${model}:${this.hashPrompt(request.prompt ?? JSON.stringify(request.messages))}`;
const cached = await this.cache.get(cacheKey);
if (cached && request.allowCached) return cached;
const response = await this.together.chatCompletions({ model, messages: request.messages, temperature: request.temperature ?? 0.7 });
// Per the cache policy below: only deterministic (temp=0) responses are cached unless the caller opts in
if ((request.temperature ?? 0.7) === 0 || request.allowCached) await this.cache.set(cacheKey, response, CACHE_CONFIG.inference.ttl);
await this.costTracker.record(model, response.usage);
return response;
}
async submitBatch(requests: InferenceRequest[]): Promise<string> {
const batchId = await this.together.createBatch(requests.map(r => ({
model: this.router.selectModel(r.task, 'batch'), messages: r.messages })));
return batchId; // 50% cost reduction for batch processing
}
}
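A hypothetical call site showing both paths (the together, cache, router, and costTracker instances, plus backfillRequests, are assumed to exist):

// Illustrative wiring only — constructor arguments are assumed instances of the classes above
const inference = new InferenceService(together, cache, router, costTracker);

const answer = await inference.complete({
  task: 'chat',
  priority: 'realtime',
  messages: [{ role: 'user', content: 'Summarize the latest incident report.' }],
  temperature: 0,    // deterministic, so the response is cacheable
  allowCached: true,
});

// Non-urgent work goes through the batch path for the 50% discount
const batchId = await inference.submitBatch(backfillRequests);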
const CACHE_CONFIG = {
inference: { ttl: 3600, prefix: 'infer' }, // 1 hr — deterministic prompts (temp=0) cache well
embeddings: { ttl: 86400, prefix: 'embed' }, // 24 hr — embeddings are stable for same input
modelList: { ttl: 3600, prefix: 'models' }, // 1 hr — available models change infrequently
fineTune: { ttl: 60, prefix: 'ft' }, // 1 min — training status needs near-real-time
batchStatus: { ttl: 30, prefix: 'batch' }, // 30s — batch completion polling
};
// Cache only temp=0 responses by default; stochastic responses bypass cache unless explicitly opted in
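The CacheLayer itself is unspecified; a minimal Redis-backed sketch (ioredis is an assumption — any JSON-capable store works) that the TTLs above plug into:

import Redis from 'ioredis';

// Minimal CacheLayer sketch: JSON values with per-entry TTLs taken from CACHE_CONFIG
class CacheLayer {
  private redis = new Redis(process.env.REDIS_URL!);

  async get<T>(key: string): Promise<T | null> {
    const raw = await this.redis.get(key);
    return raw ? (JSON.parse(raw) as T) : null;
  }

  async set(key: string, value: unknown, ttlSeconds: number): Promise<void> {
    await this.redis.set(key, JSON.stringify(value), 'EX', ttlSeconds);
  }
}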
class InferencePipeline {
constructor(private together: TogetherClient, private router: ModelRouter, private costTracker: CostTracker) {}
private queue = new Bull('together-events', process.env.REDIS_URL!);
async onFineTuneComplete(event: FineTuneEvent): Promise<void> {
await this.queue.add('deploy-model', event, { attempts: 3, backoff: { type: 'exponential', delay: 5000 } });
}
async processFineTuneEvent(event: FineTuneEvent): Promise<void> {
if (event.status === 'completed') {
await this.router.registerModel(event.modelId, { task: event.task, cost: event.inferCostPerToken });
await this.runEvalSuite(event.modelId, event.evalDataset);
}
if (event.status === 'failed') await this.notifyTeam(event.error);
}
async processBatchComplete(batchId: string): Promise<void> {
const results = await this.together.getBatchResults(batchId);
await this.storeResults(results);
await this.costTracker.recordBatch(batchId, results.usage);
}
}
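The pipeline only produces queue jobs; consumers are registered separately at service startup. A sketch, assuming an already-constructed pipeline instance and a hypothetical pendingBatchIds() lookup:

import Bull from 'bull';

const events = new Bull('together-events', process.env.REDIS_URL!);

// Fine-tune completions enqueued by onFineTuneComplete are consumed here
events.process('deploy-model', async (job) => {
  await pipeline.processFineTuneEvent(job.data as FineTuneEvent);
});

// Poll open batches on the 30s cadence implied by CACHE_CONFIG.batchStatus
void events.add('poll-batches', {}, { repeat: { every: 30_000 } });
events.process('poll-batches', async () => {
  for (const batchId of await pendingBatchIds()) {  // hypothetical lookup of submitted-but-unfinished batches
    await pipeline.processBatchComplete(batchId);
  }
});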
interface InferenceRequest { task: 'chat' | 'code' | 'embedding' | 'image'; messages: Message[]; prompt?: string; temperature?: number; priority: 'realtime' | 'standard' | 'batch'; allowCached?: boolean; }
interface ModelRoute { modelId: string; task: string; costPerToken: number; latencyP50Ms: number; qualityScore: number; }
interface FineTuneJob { id: string; baseModel: string; trainingFile: string; status: 'pending' | 'running' | 'completed' | 'failed'; epochs: number; learningRate: number; }
interface CostRecord { model: string; promptTokens: number; completionTokens: number; costUsd: number; timestamp: string; }
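Two names used above have no definitions in this section; minimal shapes inferred from their call sites (the OpenAI-style usage fields and the storage/pricing interfaces are assumptions):

interface FineTuneEvent {
  modelId: string;
  task: string;
  status: 'pending' | 'running' | 'completed' | 'failed';
  inferCostPerToken: number;
  evalDataset: string;
  error?: string;
}

interface TokenUsage { prompt_tokens: number; completion_tokens: number; }

// Minimal CostTracker sketch: writes CostRecord rows for the analytics dashboard
class CostTracker {
  constructor(
    private store: { insert(record: CostRecord): Promise<void> },  // assumed persistence interface
    private pricePerTokenUsd: Record<string, number>,              // assumed per-model pricing table
  ) {}

  async record(model: string, usage: TokenUsage, discount = 1): Promise<void> {
    const tokens = usage.prompt_tokens + usage.completion_tokens;
    await this.store.insert({
      model,
      promptTokens: usage.prompt_tokens,
      completionTokens: usage.completion_tokens,
      costUsd: discount * tokens * (this.pricePerTokenUsd[model] ?? 0),
      timestamp: new Date().toISOString(),
    });
  }

  async recordBatch(batchId: string, usage: TokenUsage & { model?: string }): Promise<void> {
    // Batch inference is billed at 50% of the on-demand rate; batchId could be stored alongside for audit joins
    await this.record(usage.model ?? `batch:${batchId}`, usage, 0.5);
  }
}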
| Component | Failure Mode | Recovery |
|---|---|---|
| Inference request | Model overloaded (500) | Fallback to alternative model in same task category |
| API client | Rate limited (429 Too Many Requests) | Token bucket with exponential backoff, queue overflow to batch |
| Fine-tune job | Training divergence | Auto-stop on loss plateau, notify team with checkpoint artifacts |
| Batch processing | Partial batch failure | Retry failed items individually, report partial results |
| Model routing | Selected model deprecated | Auto-reroute to replacement model, alert team to update config |
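The first two recovery paths can be wrapped around any single inference call; a sketch, assuming errors from the client expose an HTTP status field:

// Sketch of the fallback + backoff policy from the table above
async function completeWithFallback(
  call: (model: string) => Promise<InferenceResponse>,
  models: string[],   // primary model first, then alternatives in the same task category
  maxRetries = 3,
): Promise<InferenceResponse> {
  for (const model of models) {
    for (let attempt = 0; attempt < maxRetries; attempt++) {
      try {
        return await call(model);
      } catch (err: any) {
        if (err?.status === 429) {
          await new Promise(resolve => setTimeout(resolve, 1000 * 2 ** attempt));  // exponential backoff
          continue;
        }
        if (err?.status >= 500) break;  // model overloaded — fall through to the next model
        throw err;                      // other client errors are not retryable
      }
    }
  }
  throw new Error('All candidate models failed or stayed rate limited');
}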
See together-deploy-integration.