Skills Development ElevenLabs TTS Production Reference Architecture

ElevenLabs TTS Production Reference Architecture

v20260423
elevenlabs-reference-architecture
This comprehensive guide provides a production-grade reference architecture for building robust Text-to-Speech (TTS) and voice applications using the ElevenLabs API. It covers detailed project structure, service layer design (including TTS orchestration, voice management, and caching), configuration handling, and best practices for streaming and multi-model orchestration, ensuring scalability and reliability in a commercial SaaS environment.
Get Skill
242 downloads
Overview

ElevenLabs Reference Architecture

Overview

Production-ready architecture for ElevenLabs TTS/voice applications. Covers project layout, service layers, caching, streaming, and multi-model orchestration.

Prerequisites

  • Understanding of layered architecture patterns
  • ElevenLabs SDK knowledge (see elevenlabs-sdk-patterns)
  • TypeScript project with async patterns
  • Redis (optional, for distributed caching)

Instructions

Step 1: Project Structure

my-elevenlabs-service/
├── src/
│   ├── elevenlabs/
│   │   ├── client.ts            # Singleton client with retry config
│   │   ├── config.ts            # Environment-aware configuration
│   │   ├── models.ts            # Model selection logic
│   │   ├── errors.ts            # Error classification (see sdk-patterns)
│   │   └── types.ts             # TypeScript interfaces
│   ├── services/
│   │   ├── tts-service.ts       # Text-to-Speech orchestration
│   │   ├── voice-service.ts     # Voice management (clone, list, settings)
│   │   ├── audio-service.ts     # SFX, isolation, transcription
│   │   └── cache-service.ts     # Audio caching layer
│   ├── api/
│   │   ├── routes/
│   │   │   ├── tts.ts           # POST /api/tts
│   │   │   ├── voices.ts        # GET/POST /api/voices
│   │   │   ├── webhooks.ts      # POST /webhooks/elevenlabs
│   │   │   └── health.ts        # GET /health
│   │   └── middleware/
│   │       ├── rate-limit.ts    # Request throttling
│   │       └── auth.ts          # Your app's auth (not ElevenLabs auth)
│   ├── queue/
│   │   ├── tts-queue.ts         # Async TTS job processing
│   │   └── workers.ts           # Queue workers
│   └── monitoring/
│       ├── metrics.ts           # Latency, error rate, quota tracking
│       └── alerts.ts            # Budget and health alerts
├── tests/
│   ├── unit/
│   │   ├── tts-service.test.ts
│   │   └── cache-service.test.ts
│   └── integration/
│       └── tts-smoke.test.ts
├── config/
│   ├── development.json
│   ├── staging.json
│   └── production.json
└── .env.example

Step 2: Configuration Layer

// src/elevenlabs/config.ts
/**
 * Process-wide configuration shape for the ElevenLabs integration.
 * Built by `loadConfig()` from a NODE_ENV-specific profile plus
 * environment variables.
 */
export interface ElevenLabsConfig {
  /** ElevenLabs API key, read from the ELEVENLABS_API_KEY env var. */
  apiKey: string;
  /** Deployment environment; selects a profile from ENV_CONFIGS. */
  environment: "development" | "staging" | "production";
  /** Fallback values used when a request does not specify its own. */
  defaults: {
    modelId: string;
    voiceId: string;
    /** Encoded format string, e.g. "mp3_22050_32" (dev) or "mp3_44100_128" (prod). */
    outputFormat: string;
    // snake_case fields mirror the ElevenLabs API payload names.
    voiceSettings: {
      stability: number;
      similarity_boost: number;
      style: number;
      speed: number;
    };
  };
  /** Client throughput and resilience knobs. */
  performance: {
    maxConcurrency: number; // size of the p-queue (max in-flight requests)
    timeoutMs: number;      // per-request timeout, converted to seconds for the SDK
    maxRetries: number;     // SDK-level retry attempts
  };
  /** Audio cache limits (consumed by cache-service). */
  cache: {
    enabled: boolean;
    maxSizeMB: number;  // hard cap on cached audio size
    ttlSeconds: number; // entry time-to-live
  };
}

/** Everything in a profile except the per-process fields loadConfig() fills in. */
type EnvProfile = Omit<ElevenLabsConfig, "apiKey" | "environment">;

/**
 * Per-environment profiles, keyed by NODE_ENV. Environments without an
 * entry here (e.g. "staging") fall back to the development profile in
 * `loadConfig()`.
 */
const ENV_CONFIGS: Record<string, EnvProfile> = {
  development: {
    defaults: {
      modelId: "eleven_flash_v2_5",    // Cheap + fast for dev
      voiceId: "21m00Tcm4TlvDq8ikWAM", // Rachel
      outputFormat: "mp3_22050_32",     // Small files
      voiceSettings: { stability: 0.5, similarity_boost: 0.75, style: 0, speed: 1 },
    },
    performance: { maxConcurrency: 2, timeoutMs: 30_000, maxRetries: 1 },
    cache: { enabled: true, maxSizeMB: 50, ttlSeconds: 3600 },
  },
  production: {
    defaults: {
      modelId: "eleven_multilingual_v2", // High quality for prod
      voiceId: "21m00Tcm4TlvDq8ikWAM",
      outputFormat: "mp3_44100_128",     // High quality
      voiceSettings: { stability: 0.5, similarity_boost: 0.75, style: 0, speed: 1 },
    },
    performance: { maxConcurrency: 10, timeoutMs: 60_000, maxRetries: 3 },
    cache: { enabled: true, maxSizeMB: 500, ttlSeconds: 86_400 },
  },
};

/**
 * Build the effective configuration for the current process.
 *
 * Improvements over the naive version:
 * - Fails fast with a clear error when ELEVENLABS_API_KEY is unset,
 *   instead of smuggling `undefined` through a non-null assertion.
 * - Normalizes NODE_ENV to the declared environment union instead of
 *   casting through `any`.
 * - Profiles are complete (`EnvProfile`), so no `as ElevenLabsConfig`
 *   assertion is needed on the result.
 *
 * @returns The fully-populated configuration object.
 * @throws Error if ELEVENLABS_API_KEY is not set.
 */
export function loadConfig(): ElevenLabsConfig {
  const raw = process.env.NODE_ENV || "development";
  // Unknown environments (and "staging", which has no profile above)
  // are treated as development.
  const environment: ElevenLabsConfig["environment"] =
    raw === "production" || raw === "staging" ? raw : "development";
  const envConfig = ENV_CONFIGS[raw] || ENV_CONFIGS.development;

  const apiKey = process.env.ELEVENLABS_API_KEY;
  if (!apiKey) {
    throw new Error(
      "ELEVENLABS_API_KEY is not set; refusing to start without credentials"
    );
  }

  return {
    apiKey,
    environment,
    ...envConfig,
  };
}

Step 3: TTS Service Layer

// src/services/tts-service.ts
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
import PQueue from "p-queue";
import { loadConfig } from "../elevenlabs/config";
import { classifyError } from "../elevenlabs/errors";

/**
 * Text-to-Speech orchestration service.
 *
 * - Applies environment defaults (model, voice, output format, voice
 *   settings) from `loadConfig()` when a request does not override them.
 * - Bounds concurrent API calls with a p-queue sized from config.
 * - Splits long text on sentence boundaries so each request stays under
 *   the per-request character limit.
 */
export class TTSService {
  private client: ElevenLabsClient;
  private queue: PQueue; // caps in-flight requests to the configured concurrency
  private config: ReturnType<typeof loadConfig>;

  constructor() {
    this.config = loadConfig();
    this.client = new ElevenLabsClient({
      apiKey: this.config.apiKey,
      maxRetries: this.config.performance.maxRetries,
      timeoutInSeconds: this.config.performance.timeoutMs / 1000,
    });
    this.queue = new PQueue({
      concurrency: this.config.performance.maxConcurrency,
    });
  }

  /**
   * Generate speech for `text`.
   *
   * @param text    Text to synthesize.
   * @param options Optional overrides for voice, model, output format and
   *                streaming mode; unset fields fall back to config defaults.
   * @returns A ReadableStream when `options.streaming` is true, otherwise
   *          the complete audio buffer.
   * @throws The error produced by `classifyError` on API failure.
   */
  async generate(text: string, options?: {
    voiceId?: string;
    modelId?: string;
    outputFormat?: string;
    streaming?: boolean;
  }): Promise<ReadableStream | Buffer> {
    const voiceId = options?.voiceId || this.config.defaults.voiceId;
    const modelId = options?.modelId || this.config.defaults.modelId;
    const format = options?.outputFormat || this.config.defaults.outputFormat;

    return this.queue.add(async () => {
      const start = performance.now();

      try {
        if (options?.streaming) {
          return await this.client.textToSpeech.stream(voiceId, {
            text,
            model_id: modelId,
            output_format: format,
            voice_settings: this.config.defaults.voiceSettings,
          });
        }

        const audio = await this.client.textToSpeech.convert(voiceId, {
          text,
          model_id: modelId,
          output_format: format,
          voice_settings: this.config.defaults.voiceSettings,
        });

        const latency = performance.now() - start;
        console.log(`[TTS] ${text.length} chars, ${modelId}, ${latency.toFixed(0)}ms`);
        return audio;
      } catch (error) {
        // Normalize SDK/network errors into the app's error taxonomy.
        throw classifyError(error);
      }
      // p-queue's add() types include `| void`; the task always returns,
      // so narrow back to the real result type.
    }) as Promise<ReadableStream | Buffer>;
  }

  /**
   * Synthesize text longer than a single request allows by splitting it
   * into sentence-aligned chunks and generating each sequentially, so the
   * resulting buffers come back in document order.
   */
  async generateLongText(text: string, voiceId?: string): Promise<Buffer[]> {
    const chunks = this.splitText(text, 4500); // Stay under 5000 limit
    const results: Buffer[] = [];

    for (let i = 0; i < chunks.length; i++) {
      const audio = await this.generate(chunks[i], {
        voiceId,
        // Pass context for natural prosody across chunks
      });
      results.push(audio as Buffer);
    }

    return results;
  }

  /**
   * Split `text` into chunks of at most `maxChars`, preferring sentence
   * boundaries (. ! ?) so prosody stays natural across chunks.
   *
   * Fixes two defects in the naive splitter:
   * - Trailing text without terminal punctuation is kept (the old regex
   *   silently dropped it).
   * - A single sentence longer than `maxChars` is hard-split, so no chunk
   *   ever exceeds the API limit.
   */
  private splitText(text: string, maxChars: number): string[] {
    const chunks: string[] = [];
    // Sentences ending in punctuation, plus any unpunctuated remainder.
    const sentences = text.match(/[^.!?]+[.!?]+|[^.!?]+$/g) || [text];
    let current = "";

    for (const sentence of sentences) {
      if (sentence.length > maxChars) {
        // Flush the pending chunk, then hard-split the oversized sentence.
        if (current) {
          chunks.push(current.trim());
          current = "";
        }
        for (let i = 0; i < sentence.length; i += maxChars) {
          const piece = sentence.slice(i, i + maxChars).trim();
          if (piece) chunks.push(piece);
        }
      } else if ((current + sentence).length > maxChars) {
        if (current) chunks.push(current.trim());
        current = sentence;
      } else {
        current += sentence;
      }
    }
    if (current) chunks.push(current.trim());
    return chunks;
  }
}

Step 4: Voice Management Service

// src/services/voice-service.ts
/**
 * Thin wrapper around the ElevenLabs voices API: listing, cloning,
 * settings management, and deletion. Every method delegates to the shared
 * client supplied at construction time.
 */
export class VoiceService {
  constructor(private readonly client: ElevenLabsClient) {}

  /** List all voices, optionally restricted to one category. */
  async listVoices(filter?: { category?: "premade" | "cloned" | "generated" }) {
    const response = await this.client.voices.getAll();
    const wanted = filter?.category;
    return wanted
      ? response.voices.filter((voice) => voice.category === wanted)
      : response.voices;
  }

  /** Create a cloned voice from one or more audio samples. */
  async cloneVoice(name: string, description: string, audioFiles: NodeJS.ReadableStream[]) {
    const payload = {
      name,
      description,
      files: audioFiles,
    };
    return this.client.voices.add(payload);
  }

  /** Fetch the current settings (stability, similarity, …) for a voice. */
  async getVoiceSettings(voiceId: string) {
    return this.client.voices.getSettings(voiceId);
  }

  /** Persist new stability / similarity settings for a voice. */
  async updateVoiceSettings(voiceId: string, settings: {
    stability: number;
    similarity_boost: number;
  }) {
    return this.client.voices.editSettings(voiceId, settings);
  }

  /** Permanently remove a voice from the account. */
  async deleteVoice(voiceId: string) {
    return this.client.voices.delete(voiceId);
  }
}

Step 5: Data Flow Diagram

                         ┌──────────────┐
                         │   Client     │
                         │  (Browser/   │
                         │   Mobile)    │
                         └──────┬───────┘
                                │
                         ┌──────▼───────┐
                         │   API Layer  │
                         │   /api/tts   │
                         │   /api/voice │
                         └──────┬───────┘
                                │
                    ┌───────────┼───────────┐
                    │           │           │
             ┌──────▼──┐ ┌─────▼─────┐ ┌──▼──────┐
             │  Cache   │ │   TTS     │ │  Voice  │
             │ Service  │ │  Service  │ │ Service │
             └──────┬───┘ └─────┬─────┘ └────────┘
                    │           │
              ┌─────▼─┐  ┌─────▼──────────┐
              │ Redis/ │  │ Concurrency    │
              │ LRU    │  │ Queue (p-queue)│
              └────────┘  └─────┬──────────┘
                                │
                         ┌──────▼───────┐
                         │  ElevenLabs  │
                         │  Client SDK  │
                         │  (singleton) │
                         └──────┬───────┘
                                │
                    ┌───────────┼───────────┐
                    │           │           │
             ┌──────▼──┐ ┌─────▼─────┐ ┌──▼──────┐
             │ /v1/tts  │ │ /v1/voices│ │ /v1/sfx │
             │ REST/WS  │ │  REST     │ │  REST   │
             └──────────┘ └───────────┘ └─────────┘
                    ElevenLabs API (api.elevenlabs.io)

Step 6: Health Check Composition

// src/api/routes/health.ts
/**
 * Aggregate health endpoint. Probes ElevenLabs connectivity, quota usage,
 * and the cache in parallel; a probe that rejects simply reports as null
 * instead of failing the whole check.
 *
 * Status semantics:
 * - "unhealthy": ElevenLabs is unreachable.
 * - "degraded":  reachable, but quota usage is above 90%.
 * - "healthy":   everything else.
 */
export async function healthCheck() {
  const [elevenlabsResult, quotaResult, cacheResult] = await Promise.allSettled([
    checkElevenLabsConnectivity(),
    checkQuotaStatus(),
    checkCacheHealth(),
  ]);

  // Collapse a settled result to its value, or null if the probe failed.
  const settled = <T>(result: PromiseSettledResult<T>) =>
    result.status === "fulfilled" ? result.value : null;

  const elevenlabs = settled(elevenlabsResult);
  const quota = settled(quotaResult);
  const cache = settled(cacheResult);

  let status: "healthy" | "degraded" | "unhealthy" = "healthy";
  if (!elevenlabs) {
    status = "unhealthy";
  } else if (quota && quota.pctUsed > 90) {
    status = "degraded";
  }

  return {
    status,
    services: { elevenlabs, quota, cache },
    timestamp: new Date().toISOString(),
  };
}

Architecture Decisions

Decision Choice Rationale
Client pattern Singleton One connection pool, shared retry config
Concurrency p-queue Respects plan limits, prevents 429
Caching LRU (local) or Redis (distributed) Repeated content is common in TTS
Long text Sentence-boundary splitting Preserves natural speech prosody
Error handling Classification + retry Different strategies for 429 vs 401 vs 500
Model selection Environment-based Flash in dev (cheap), Multilingual in prod (quality)
Streaming HTTP streaming + WebSocket HTTP for simple, WS for LLM integration

Error Handling

Issue Cause Solution
Circular dependencies Wrong layering Services depend on client, never reverse
Cold start latency Client initialization Pre-warm the client during server startup
Memory pressure Unbounded audio cache Set maxSizeMB on cache
Type errors SDK version mismatch Pin SDK version in package.json

Resources

Next Steps

Start with elevenlabs-install-auth for setup, then apply this architecture. Use elevenlabs-core-workflow-a and elevenlabs-core-workflow-b for feature implementation.

Info
Category Development
Name elevenlabs-reference-architecture
Version v20260423
Size 13.36KB
Updated At 2026-04-28
Language