The Reliability Challenge
Agentic systems face reliability challenges that go beyond those of traditional applications.
The Core Problems
1. LLM Non-Determinism
```typescript
// Same prompt, different outputs
const result1 = await agent.generate("Summarize this article");
const result2 = await agent.generate("Summarize this article");

// result1 !== result2 (even with temperature=0)
```
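You cannot make the model fully deterministic, but you can make its output predictable in shape. A minimal sketch of that idea, validating output against a schema and retrying on failure (the `zod` schema, `tryParseJson` helper, and prompt are illustrative, not part of this chapter's code):

```typescript
import { z } from 'zod';

// Constrain the shape of the response, even if the wording varies between runs
const SummarySchema = z.object({
  title: z.string(),
  bulletPoints: z.array(z.string()).min(1),
});

async function generateSummary(article: string, maxAttempts = 3) {
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    const raw = await agent.generate(
      `Summarize this article as JSON with "title" and "bulletPoints": ${article}`
    );
    const parsed = SummarySchema.safeParse(tryParseJson(raw));
    if (parsed.success) return parsed.data; // stable shape despite non-determinism
    console.warn(`Attempt ${attempt}: output failed validation, retrying`);
  }
  throw new Error('Could not get schema-valid output');
}

function tryParseJson(text: string): unknown {
  try {
    return JSON.parse(text);
  } catch {
    return null;
  }
}
```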
2. External API Failures
```text
// Your agent depends on multiple services
Google Calendar API → Down (503)
HubSpot CRM API     → Rate limited (429)
OpenAI API          → Timeout
Slack API           → Success ✓

// Result: Partial failure - what do you do?
```
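A useful low-level building block here is `Promise.allSettled`, which records every outcome instead of rejecting on the first failure. The sketch below is illustrative; the four fetch functions are placeholders for the real Calendar, CRM, OpenAI, and Slack calls:

```typescript
// Gather all integration results at once and inspect partial failures
const [calendar, crm, llm, slack] = await Promise.allSettled([
  fetchCalendarEvents(),
  fetchCrmContacts(),
  callOpenAI(),
  postToSlack(),
]);

const failed = [calendar, crm, llm, slack].filter(
  (r) => r.status === 'rejected'
);

if (failed.length > 0) {
  // Partial failure: decide whether to degrade gracefully or abort
  console.warn(`${failed.length}/4 integrations failed`);
}
```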
3. Rate Limits and Quotas
```text
// OpenAI: 10,000 TPM (tokens per minute)
// HubSpot: 100 requests per 10 seconds
// Google Calendar: 1,000,000 queries per day

// Your agent makes 50 API calls in a workflow
// One rate limit breaks the entire pipeline
```
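Retrying on 429s (covered below) helps, but you can also avoid hitting the limit in the first place by pacing requests client-side. A minimal sketch, assuming a 100-requests-per-10-seconds quota like HubSpot's; production code would more likely use a library such as bottleneck or p-limit:

```typescript
// Minimal sliding-window limiter: waits until a slot is free before proceeding
class SimpleRateLimiter {
  private timestamps: number[] = [];

  constructor(private maxRequests: number, private windowMs: number) {}

  async acquire(): Promise<void> {
    const now = Date.now();
    // Drop timestamps that have left the window
    this.timestamps = this.timestamps.filter((t) => now - t < this.windowMs);

    if (this.timestamps.length >= this.maxRequests) {
      const waitMs = this.windowMs - (now - this.timestamps[0]);
      await new Promise((resolve) => setTimeout(resolve, waitMs));
      return this.acquire(); // re-check after waiting
    }

    this.timestamps.push(Date.now());
  }
}

// Usage: pace HubSpot calls at ~100 requests / 10 s
const hubspotLimiter = new SimpleRateLimiter(100, 10_000);
await hubspotLimiter.acquire();
// ...make the HubSpot API call here
```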
Why This Matters
Without proper error handling:
- 🔴 Agent crashes leave users confused
- 🔴 Partial data corruption (some tools succeed, others fail)
- 🔴 Cascading failures across systems
- 🔴 No visibility into what went wrong
- 🔴 Manual intervention required for recovery
With proper error handling:
- ✅ Graceful degradation
- ✅ Automatic retries
- ✅ Clear error messages
- ✅ Partial success handling
- ✅ Self-healing systems
Retry Strategies
Exponential Backoff
The gold standard for retrying failed requests.
```typescript
// lib/retry.ts
export class RetryStrategy {
  async withExponentialBackoff<T>(
    fn: () => Promise<T>,
    options: RetryOptions = {}
  ): Promise<T> {
    const {
      maxRetries = 3,
      baseDelay = 1000,
      maxDelay = 30000,
      backoffMultiplier = 2,
      retryableErrors = [429, 500, 502, 503, 504],
    } = options;

    let lastError: Error;

    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        return await fn();
      } catch (error) {
        lastError = error as Error;

        // Don't retry on non-retryable errors
        if (!this.isRetryable(error, retryableErrors)) {
          throw error;
        }

        // Last attempt - throw
        if (attempt === maxRetries) {
          throw new Error(
            `Max retries (${maxRetries}) exceeded. Last error: ${lastError.message}`
          );
        }

        // Calculate delay with exponential backoff
        const delay = Math.min(
          baseDelay * Math.pow(backoffMultiplier, attempt),
          maxDelay
        );

        // Add jitter to prevent thundering herd
        const jitter = Math.random() * 0.3 * delay;
        const totalDelay = delay + jitter;

        console.log(
          `Retry attempt ${attempt + 1}/${maxRetries} after ${Math.round(totalDelay)}ms`
        );

        await this.sleep(totalDelay);
      }
    }

    throw lastError!;
  }

  private isRetryable(error: any, retryableCodes: number[]): boolean {
    // HTTP errors
    if (error.status && retryableCodes.includes(error.status)) {
      return true;
    }

    // Network errors
    if (error.code === 'ECONNRESET' || error.code === 'ETIMEDOUT') {
      return true;
    }

    // OpenAI specific errors
    if (error.type === 'insufficient_quota' || error.type === 'server_error') {
      return true;
    }

    return false;
  }

  private sleep(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
}

// Usage
const retry = new RetryStrategy();

const result = await retry.withExponentialBackoff(
  () => openai.chat.completions.create({ /* ... */ }),
  { maxRetries: 5 }
);
```
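The `RetryOptions` type isn't defined above; a minimal definition consistent with the destructured defaults would be:

```typescript
// Assumed shape of RetryOptions, matching the defaults destructured above
interface RetryOptions {
  maxRetries?: number;         // total retry attempts (default 3)
  baseDelay?: number;          // initial delay in ms (default 1000)
  maxDelay?: number;           // cap on any single delay in ms (default 30000)
  backoffMultiplier?: number;  // growth factor per attempt (default 2)
  retryableErrors?: number[];  // HTTP status codes worth retrying
}
```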
Circuit Breaker
Prevent cascading failures by "breaking" the circuit after repeated failures.
```typescript
// lib/circuit-breaker.ts
export class CircuitBreaker {
  private state: 'CLOSED' | 'OPEN' | 'HALF_OPEN' = 'CLOSED';
  private failureCount = 0;
  private lastFailureTime?: Date;
  private successCount = 0;

  constructor(
    private options: {
      failureThreshold: number;  // Open after N failures
      resetTimeout: number;      // Try again after N ms
      successThreshold: number;  // Close after N successes in HALF_OPEN
    } = {
      failureThreshold: 5,
      resetTimeout: 60000, // 1 minute
      successThreshold: 2,
    }
  ) {}

  async execute<T>(fn: () => Promise<T>): Promise<T> {
    if (this.state === 'OPEN') {
      // Check if we should try again
      if (this.shouldAttemptReset()) {
        this.state = 'HALF_OPEN';
        console.log('Circuit breaker: HALF_OPEN - attempting reset');
      } else {
        throw new Error(
          `Circuit breaker is OPEN. Too many failures. Retry after ${this.getRetryAfter()}ms`
        );
      }
    }

    try {
      const result = await fn();

      // Success handling
      if (this.state === 'HALF_OPEN') {
        this.successCount++;
        if (this.successCount >= this.options.successThreshold) {
          this.reset();
          console.log('Circuit breaker: CLOSED - service recovered');
        }
      } else {
        this.reset();
      }

      return result;
    } catch (error) {
      this.recordFailure();
      throw error;
    }
  }

  private recordFailure(): void {
    this.failureCount++;
    this.lastFailureTime = new Date();
    this.successCount = 0;

    if (this.failureCount >= this.options.failureThreshold) {
      this.state = 'OPEN';
      console.error(
        `Circuit breaker: OPEN - ${this.failureCount} consecutive failures`
      );
    }
  }

  private shouldAttemptReset(): boolean {
    if (!this.lastFailureTime) return false;
    const timeSinceFailure = Date.now() - this.lastFailureTime.getTime();
    return timeSinceFailure >= this.options.resetTimeout;
  }

  private getRetryAfter(): number {
    if (!this.lastFailureTime) return 0;
    const elapsed = Date.now() - this.lastFailureTime.getTime();
    return Math.max(0, this.options.resetTimeout - elapsed);
  }

  private reset(): void {
    this.state = 'CLOSED';
    this.failureCount = 0;
    this.successCount = 0;
    this.lastFailureTime = undefined;
  }

  getState() {
    return {
      state: this.state,
      failureCount: this.failureCount,
      successCount: this.successCount,
    };
  }
}

// Usage
const openAICircuit = new CircuitBreaker();

const result = await openAICircuit.execute(async () => {
  return await openai.chat.completions.create({ /* ... */ });
});
```
Combined: Retry + Circuit Breaker
```typescript
// lib/resilient-call.ts
export class ResilientCall {
  private circuitBreaker: CircuitBreaker;
  private retryStrategy: RetryStrategy;

  constructor() {
    this.circuitBreaker = new CircuitBreaker();
    this.retryStrategy = new RetryStrategy();
  }

  async execute<T>(fn: () => Promise<T>): Promise<T> {
    return this.circuitBreaker.execute(async () => {
      return this.retryStrategy.withExponentialBackoff(fn);
    });
  }
}

// Usage in agents
const resilient = new ResilientCall();

const result = await resilient.execute(() =>
  openai.chat.completions.create({ /* ... */ })
);
```
Fallback Patterns
Model Fallbacks
When primary model fails, fall back to alternatives.
```typescript
// lib/model-fallback.ts
export class ModelFallback {
  private models = [
    { provider: 'openai', model: 'gpt-4o', priority: 1 },
    { provider: 'openai', model: 'gpt-4o-mini', priority: 2 },
    { provider: 'anthropic', model: 'claude-3-5-sonnet', priority: 3 },
  ];

  async generate(prompt: string): Promise<string> {
    let lastError: Error;

    for (const config of this.models) {
      try {
        console.log(`Trying ${config.provider}/${config.model}...`);
        const result = await this.callModel(config, prompt);
        console.log(`✓ Success with ${config.provider}/${config.model}`);
        return result;
      } catch (error) {
        lastError = error as Error;
        console.warn(
          `✗ Failed with ${config.provider}/${config.model}: ${lastError.message}`
        );
        // Continue to next model
      }
    }

    throw new Error(
      `All models failed. Last error: ${lastError!.message}`
    );
  }

  private async callModel(
    config: { provider: string; model: string },
    prompt: string
  ): Promise<string> {
    switch (config.provider) {
      case 'openai':
        return this.callOpenAI(config.model, prompt);
      case 'anthropic':
        return this.callAnthropic(config.model, prompt);
      default:
        throw new Error(`Unknown provider: ${config.provider}`);
    }
  }

  private async callOpenAI(model: string, prompt: string): Promise<string> {
    const response = await openai.chat.completions.create({
      model,
      messages: [{ role: 'user', content: prompt }],
    });
    return response.choices[0].message.content || '';
  }

  private async callAnthropic(model: string, prompt: string): Promise<string> {
    // Anthropic implementation
    return '';
  }
}
```
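Usage follows the same pattern as the other wrappers (the prompt below is illustrative):

```typescript
// Illustrative usage of the fallback chain above
const fallback = new ModelFallback();
const answer = await fallback.generate('Draft a follow-up email for the renewal call');
```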
Tool Fallbacks
When tool execution fails, try alternatives.
```typescript
// lib/tool-fallback.ts
export class ToolFallback {
  async searchWeb(query: string): Promise<SearchResult[]> {
    const strategies = [
      { name: 'Tavily', fn: () => this.searchWithTavily(query) },
      { name: 'Brave', fn: () => this.searchWithBrave(query) },
      { name: 'DuckDuckGo', fn: () => this.searchWithDDG(query) },
    ];

    for (const strategy of strategies) {
      try {
        console.log(`Searching with ${strategy.name}...`);
        const results = await strategy.fn();
        if (results.length > 0) {
          return results;
        }
      } catch (error) {
        console.warn(`${strategy.name} search failed:`, error);
      }
    }

    // All failed - return empty results with warning
    console.error('All search providers failed');
    return [];
  }

  async sendNotification(message: string): Promise<void> {
    const channels = [
      { name: 'Slack', fn: () => this.sendSlack(message) },
      { name: 'Email', fn: () => this.sendEmail(message) },
      { name: 'Webhook', fn: () => this.sendWebhook(message) },
    ];

    let succeeded = false;

    for (const channel of channels) {
      try {
        await channel.fn();
        console.log(`✓ Sent via ${channel.name}`);
        succeeded = true;
        break; // Success - stop trying
      } catch (error) {
        console.warn(`✗ ${channel.name} failed:`, error);
      }
    }

    if (!succeeded) {
      throw new Error('Failed to send notification via any channel');
    }
  }
}
```
Default Responses
Provide safe defaults when all else fails.
```typescript
// lib/default-responses.ts
export class DefaultResponseHandler {
  async generateSafely(
    agent: Agent,
    prompt: string,
    options?: {
      defaultResponse?: string;
      timeout?: number;
    }
  ): Promise<string> {
    const timeout = options?.timeout || 30000;
    const defaultResponse =
      options?.defaultResponse ||
      "I'm having trouble processing your request right now. Please try again later.";

    try {
      // Race between generation and timeout
      const result = await Promise.race([
        agent.generate(prompt),
        this.timeoutPromise(timeout),
      ]);

      if (typeof result === 'string' && result.length > 0) {
        return result;
      }

      // Empty response - use default
      return defaultResponse;
    } catch (error) {
      console.error('Agent generation failed:', error);

      // Log error for monitoring
      this.logError(error, { prompt, agent: agent.name });

      return defaultResponse;
    }
  }

  private timeoutPromise(ms: number): Promise<never> {
    return new Promise((_, reject) =>
      setTimeout(() => reject(new Error('Timeout')), ms)
    );
  }

  private logError(error: any, context: any): void {
    // Send to error tracking service
    console.error('Error context:', context);
  }
}
```
Error Recovery
Graceful Degradation
Handle partial failures without breaking the entire workflow.
```typescript
// lib/graceful-degradation.ts
export class MeetingBriefingWithFallback {
  async generateBriefing(meetingId: string): Promise<Briefing> {
    const results = await this.gatherContextWithFallbacks(meetingId);

    // Generate briefing with whatever data we got
    const briefing = await this.generateFromPartialData(results);

    // Add warnings about missing data
    briefing.warnings = this.generateWarnings(results);

    return briefing;
  }

  private async gatherContextWithFallbacks(
    meetingId: string
  ): Promise<PartialContext> {
    const results: PartialContext = {
      meeting: null,
      crm: null,
      support: null,
      slack: null,
      errors: [],
    };

    // Try to get meeting details
    try {
      results.meeting = await this.getMeetingDetails(meetingId);
    } catch (error) {
      results.errors.push({
        source: 'calendar',
        error: (error as Error).message,
      });
    }

    // Try to get CRM data
    try {
      results.crm = await this.getCRMData(meetingId);
    } catch (error) {
      results.errors.push({
        source: 'crm',
        error: (error as Error).message,
      });
    }

    // Try to get support tickets
    try {
      results.support = await this.getSupportTickets(meetingId);
    } catch (error) {
      results.errors.push({
        source: 'support',
        error: (error as Error).message,
      });
    }

    // Try to get Slack history
    try {
      results.slack = await this.getSlackHistory(meetingId);
    } catch (error) {
      results.errors.push({
        source: 'slack',
        error: (error as Error).message,
      });
    }

    return results;
  }

  private async generateFromPartialData(
    context: PartialContext
  ): Promise<Briefing> {
    const prompt = this.buildPromptFromPartial(context);
    const result = await agent.generate(prompt);

    return {
      summary: result.summary,
      keyPoints: result.keyPoints,
      dataCompleteness: this.calculateCompleteness(context),
    };
  }

  private generateWarnings(context: PartialContext): string[] {
    const warnings: string[] = [];

    if (!context.crm) {
      warnings.push('⚠️ CRM data unavailable - missing account context');
    }
    if (!context.support) {
      warnings.push('⚠️ Support tickets unavailable - may miss recent issues');
    }
    if (!context.slack) {
      warnings.push('⚠️ Slack history unavailable - missing recent conversations');
    }
    if (context.errors.length > 0) {
      warnings.push(
        `⚠️ ${context.errors.length} data sources failed to load`
      );
    }

    return warnings;
  }

  private calculateCompleteness(context: PartialContext): number {
    const sources = [
      context.meeting,
      context.crm,
      context.support,
      context.slack,
    ];
    const available = sources.filter(Boolean).length;
    return (available / sources.length) * 100;
  }
}
```
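The `PartialContext` and `Briefing` types aren't defined in the excerpt; shapes inferred from how the class uses them might look like this:

```typescript
// Assumed shapes for PartialContext and Briefing, inferred from the code above
interface PartialContext {
  meeting: unknown | null;
  crm: unknown | null;
  support: unknown | null;
  slack: unknown | null;
  errors: { source: string; error: string }[];
}

interface Briefing {
  summary: string;
  keyPoints: string[];
  dataCompleteness: number; // percentage of sources that loaded
  warnings?: string[];
}
```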
Partial Success Handling
```typescript
// lib/partial-success.ts
export class BatchProcessor {
  async processBatch<T, R>(
    items: T[],
    processor: (item: T) => Promise<R>
  ): Promise<BatchResult<R>> {
    const results: R[] = [];
    const errors: BatchError[] = [];

    for (let i = 0; i < items.length; i++) {
      const item = items[i];
      try {
        const result = await processor(item);
        results.push(result);
      } catch (error) {
        errors.push({
          index: i,
          item,
          error: (error as Error).message,
        });

        // Continue processing despite error
        console.warn(`Failed to process item ${i}:`, error);
      }
    }

    return {
      results,
      errors,
      successCount: results.length,
      failureCount: errors.length,
      totalCount: items.length,
      successRate: (results.length / items.length) * 100,
    };
  }
}

// Usage
const processor = new BatchProcessor();

const result = await processor.processBatch(
  articles,
  async (article) => await summarizeArticle(article)
);

console.log(`Processed ${result.successCount}/${result.totalCount} articles`);

if (result.errors.length > 0) {
  console.warn(`${result.errors.length} failures:`, result.errors);

  // Retry failed items
  const retryResult = await processor.processBatch(
    result.errors.map(e => e.item),
    async (article) => await summarizeArticle(article)
  );
}
```
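The `BatchResult` and `BatchError` types aren't shown; minimal definitions consistent with the code above:

```typescript
// Assumed shapes for the batch processing result types
interface BatchError {
  index: number;
  item: any;     // the input item that failed
  error: string; // message captured from the thrown error
}

interface BatchResult<R> {
  results: R[];
  errors: BatchError[];
  successCount: number;
  failureCount: number;
  totalCount: number;
  successRate: number; // percentage, 0-100
}
```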
Monitoring & Alerting
Health Checks
```typescript
// app/api/health/route.ts
import { NextResponse } from 'next/server';

export async function GET() {
  const checks = {
    openai: await checkOpenAI(),
    database: await checkDatabase(),
    slack: await checkSlack(),
    hubspot: await checkHubSpot(),
  };

  const allHealthy = Object.values(checks).every((c) => c.healthy);

  return NextResponse.json(
    {
      status: allHealthy ? 'healthy' : 'degraded',
      timestamp: new Date().toISOString(),
      checks,
    },
    { status: allHealthy ? 200 : 503 }
  );
}

async function checkOpenAI(): Promise<HealthCheck> {
  const start = Date.now();
  try {
    await openai.models.list();
    return {
      healthy: true,
      latency: Date.now() - start,
      message: 'OK',
    };
  } catch (error) {
    return {
      healthy: false,
      latency: Date.now() - start,
      message: (error as Error).message,
    };
  }
}

async function checkDatabase(): Promise<HealthCheck> {
  const start = Date.now();
  try {
    await db.$queryRaw`SELECT 1`;
    return {
      healthy: true,
      latency: Date.now() - start,
      message: 'OK',
    };
  } catch (error) {
    return {
      healthy: false,
      latency: Date.now() - start,
      message: (error as Error).message,
    };
  }
}
```
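The `HealthCheck` type isn't shown; a minimal shape matching what each probe returns:

```typescript
// Assumed shape of the HealthCheck result returned by each probe
interface HealthCheck {
  healthy: boolean;
  latency: number; // ms taken by the probe
  message: string; // 'OK' or the error message
}
```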
Error Tracking
```typescript
// lib/error-tracker.ts
import * as Sentry from '@sentry/node';

export class ErrorTracker {
  static init() {
    Sentry.init({
      dsn: process.env.SENTRY_DSN,
      environment: process.env.NODE_ENV,
      tracesSampleRate: 0.1,
    });
  }

  static captureAgentError(
    error: Error,
    context: {
      agent: string;
      prompt?: string;
      tools?: string[];
      duration?: number;
    }
  ) {
    Sentry.captureException(error, {
      tags: {
        type: 'agent_error',
        agent: context.agent,
      },
      contexts: {
        agent: {
          name: context.agent,
          tools: context.tools,
          duration: context.duration,
        },
      },
      extra: {
        prompt: context.prompt?.substring(0, 500), // First 500 chars
      },
    });
  }

  static captureToolError(
    error: Error,
    context: {
      tool: string;
      args: any;
      duration?: number;
    }
  ) {
    Sentry.captureException(error, {
      tags: {
        type: 'tool_error',
        tool: context.tool,
      },
      extra: {
        args: context.args,
        duration: context.duration,
      },
    });
  }

  static trackPerformance(
    operation: string,
    duration: number,
    metadata?: Record<string, any>
  ) {
    Sentry.metrics.distribution(
      `agent.${operation}.duration`,
      duration,
      {
        tags: metadata,
      }
    );
  }
}

// Usage in agents
try {
  const start = Date.now();
  const result = await agent.generate(prompt);

  ErrorTracker.trackPerformance(
    'generation',
    Date.now() - start,
    { agent: agent.name }
  );

  return result;
} catch (error) {
  ErrorTracker.captureAgentError(error as Error, {
    agent: agent.name,
    prompt,
  });
  throw error;
}
```
SLA Monitoring
```typescript
// lib/sla-monitor.ts
export class SLAMonitor {
  private metrics: Map<string, Metric[]> = new Map();

  recordOperation(
    operation: string,
    duration: number,
    success: boolean
  ): void {
    if (!this.metrics.has(operation)) {
      this.metrics.set(operation, []);
    }

    this.metrics.get(operation)!.push({
      duration,
      success,
      timestamp: new Date(),
    });

    // Keep only last 1000 entries
    const metrics = this.metrics.get(operation)!;
    if (metrics.length > 1000) {
      metrics.shift();
    }
  }

  getStats(operation: string): SLAStats {
    const metrics = this.metrics.get(operation) || [];

    if (metrics.length === 0) {
      return {
        availability: 0,
        p50: 0,
        p95: 0,
        p99: 0,
        errorRate: 0,
        totalRequests: 0,
      };
    }

    const successes = metrics.filter((m) => m.success);
    const durations = metrics.map((m) => m.duration).sort((a, b) => a - b);

    return {
      availability: (successes.length / metrics.length) * 100,
      p50: this.percentile(durations, 50),
      p95: this.percentile(durations, 95),
      p99: this.percentile(durations, 99),
      errorRate: ((metrics.length - successes.length) / metrics.length) * 100,
      totalRequests: metrics.length,
    };
  }

  checkSLA(operation: string, sla: SLA): SLAStatus {
    const stats = this.getStats(operation);
    const violations: string[] = [];

    if (stats.availability < sla.minAvailability) {
      violations.push(
        `Availability: ${stats.availability.toFixed(2)}% < ${sla.minAvailability}%`
      );
    }

    if (stats.p95 > sla.maxP95Latency) {
      violations.push(
        `P95 latency: ${stats.p95}ms > ${sla.maxP95Latency}ms`
      );
    }

    if (stats.errorRate > sla.maxErrorRate) {
      violations.push(
        `Error rate: ${stats.errorRate.toFixed(2)}% > ${sla.maxErrorRate}%`
      );
    }

    return {
      healthy: violations.length === 0,
      violations,
      stats,
    };
  }

  private percentile(sorted: number[], p: number): number {
    const index = Math.ceil((sorted.length * p) / 100) - 1;
    return sorted[index] || 0;
  }
}

// Usage
const monitor = new SLAMonitor();

// Record operations
const start = Date.now();
try {
  await agent.generate(prompt);
  monitor.recordOperation('agent.generate', Date.now() - start, true);
} catch (error) {
  monitor.recordOperation('agent.generate', Date.now() - start, false);
}

// Check SLA
const status = monitor.checkSLA('agent.generate', {
  minAvailability: 99.9, // 99.9% uptime
  maxP95Latency: 5000,   // 5s P95
  maxErrorRate: 1,       // 1% error rate
});

if (!status.healthy) {
  // Alert team
  await slack.postMessage({
    channel: '#alerts',
    text: `SLA violation: ${status.violations.join(', ')}`,
  });
}
```
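The monitoring types referenced above aren't defined in the excerpt; shapes consistent with the code might be:

```typescript
// Assumed shapes for the SLA monitoring types used above
interface Metric {
  duration: number;
  success: boolean;
  timestamp: Date;
}

interface SLAStats {
  availability: number; // % of successful operations
  p50: number;          // latency percentiles in ms
  p95: number;
  p99: number;
  errorRate: number;    // % of failed operations
  totalRequests: number;
}

interface SLA {
  minAvailability: number; // e.g. 99.9
  maxP95Latency: number;   // ms
  maxErrorRate: number;    // %
}

interface SLAStatus {
  healthy: boolean;
  violations: string[];
  stats: SLAStats;
}
```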
Production Checklist
Must-Haves
```typescript
// Complete production-ready agent wrapper
export class ProductionAgent {
  private circuit: CircuitBreaker;
  private retry: RetryStrategy;
  private fallback: ModelFallback;
  private monitor: SLAMonitor;

  constructor() {
    // Wire up the resilience layers from earlier in the chapter
    this.circuit = new CircuitBreaker();
    this.retry = new RetryStrategy();
    this.fallback = new ModelFallback();
    this.monitor = new SLAMonitor();
  }

  async generate(prompt: string): Promise<string> {
    const start = Date.now();
    let success = false;

    try {
      // Circuit breaker + retry
      const result = await this.circuit.execute(async () => {
        return this.retry.withExponentialBackoff(async () => {
          // Try primary model with fallbacks
          return await this.fallback.generate(prompt);
        });
      });

      success = true;
      return result;
    } catch (error) {
      // Track error
      ErrorTracker.captureAgentError(error as Error, {
        agent: 'production-agent',
        prompt,
        duration: Date.now() - start,
      });

      // Return safe default
      return "I'm experiencing technical difficulties. Please try again.";
    } finally {
      // Record metrics
      this.monitor.recordOperation(
        'agent.generate',
        Date.now() - start,
        success
      );
    }
  }
}
```
Key Takeaways
- Retry with exponential backoff - Handle transient failures
- Circuit breakers - Prevent cascading failures
- Fallback chains - Multiple models, tools, channels
- Graceful degradation - Partial success is better than total failure
- Comprehensive monitoring - Health checks, error tracking, SLA monitoring
Production reliability is not optional for agentic systems!