Blueprint/Chapter 18
Chapter 18

Testing & Observability

By Zavier Sanders · September 21, 2025

Essential testing strategies and observability patterns for production agents.

Prefer something you can ship today? Start with the Quickstart: Ship One Agent with Mastra — then come back here to deepen the concepts.

Why This Matters

You can't improve what you can't measure, and you can't ship what you can't trust.

The Production Reality

// ❌ Flying blind: the result is discarded and nothing about the call is recorded
await agent.generate(input);
// Did it work? How long did it take? What did it cost?

// ✅ Observable and tested
// (`trackedAgent` is not defined in this snippet; presumably an instrumented wrapper around the agent)
const result = await trackedAgent.generate(input);
// Know: success rate, latency, tokens used, cost

Testing Strategies

Unit Tests for Tools

// src/mastra/tools/__tests__/search-web.test.ts
import { searchWeb } from '../search-web';

describe('searchWeb', () => {
  // Shared invocation helper so each test only states the query it cares about.
  // NOTE(review): `mockContext` is not defined in this snippet — presumably a shared
  // test fixture; confirm where it comes from.
  const runSearch = (query: string) =>
    searchWeb.execute({ context: mockContext }, { query, limit: 5 });

  it('returns search results', async () => {
    const outcome = await runSearch('Next.js');

    expect(outcome.success).toBe(true);
    expect(outcome.results).toHaveLength(5);

    // Spot-check the shape of the first hit.
    const firstHit = outcome.results[0];
    expect(firstHit).toHaveProperty('title');
    expect(firstHit).toHaveProperty('url');
  });

  it('handles errors gracefully', async () => {
    // An empty query is expected to produce a failure result rather than a throw.
    const outcome = await runSearch('');

    expect(outcome.success).toBe(false);
    expect(outcome.error).toBeDefined();
  });
});

Integration Tests for Agents

// tests/agents/summarizer.test.ts
import { summarizerAgent } from '@/mastra/agents/summarizer';

describe('Summarizer Agent', () => {
  it('summarizes articles correctly', async () => {
    const article = `
      Long article content here...
      Multiple paragraphs...
    `;

    // generate() resolves to a result object (the token-tracking example in this
    // chapter reads `result.usage`), so the generated text is taken from `.text`
    // instead of treating the whole result as a string.
    const { text: summary } = await summarizerAgent.generate(
      `Summarize this:\n\n${article}`
    );

    // Check summary is shorter than original
    expect(summary.length).toBeLessThan(article.length);

    // Check it contains key information
    expect(summary).toContain('key topic');
  });

  it('handles empty input', async () => {
    // NOTE(review): assumes the agent rejects on an empty prompt — confirm against
    // the agent's actual input validation.
    await expect(
      summarizerAgent.generate('')
    ).rejects.toThrow();
  });
});

Evaluation Tests

// tests/evals/agent.eval.ts
/** One evaluation scenario: a prompt plus a predicate that judges the agent's output. */
interface EvalCase {
  /** Prompt sent to the agent. */
  input: string;
  /** Human-readable description of the behavior being checked; reported when the case fails. */
  expectedBehavior: string;
  /** Returns true when the agent's output is acceptable for this case. */
  criteria: (output: string) => boolean;
}

// Cases run by the 'Agent Evals' suite.
const evalCases: EvalCase[] = [
  {
    input: 'What is 2+2?',
    expectedBehavior: 'Should answer 4',
    // Substring match only: any output containing the character '4' passes.
    criteria: (output) => output.includes('4'),
  },
  {
    input: 'Search for Next.js documentation',
    expectedBehavior: 'Should use search tool',
    // Proxy check: looks for 'nextjs.org' in the output text rather than inspecting tool calls.
    criteria: (output) => output.includes('nextjs.org'),
  },
];

describe('Agent Evals', () => {
  it('passes all eval cases', async () => {
    // All cases are sent concurrently; a single rejected generate() call fails the whole test.
    // NOTE(review): `agent` is not imported in this snippet — confirm which agent is under evaluation.
    const results = await Promise.all(
      evalCases.map(async (testCase) => {
        // generate() resolves to a result object; the criteria functions expect the
        // generated text, so pass `.text` rather than the whole result.
        const { text } = await agent.generate(testCase.input);
        return {
          passed: testCase.criteria(text),
          case: testCase.expectedBehavior,
        };
      })
    );

    const failures = results.filter((r) => !r.passed);

    // Print the failing cases so the test output names them, not just a count.
    if (failures.length > 0) {
      console.log('Failed cases:', failures);
    }

    expect(failures.length).toBe(0);
  });
});

Basic Observability

Simple Logging

// lib/logger.ts
/**
 * Minimal structured logger: every entry is written as a single JSON line.
 *
 * Caller-supplied `meta` is merged first, so it can add fields but can never
 * overwrite the core fields (`level`, `message`, `timestamp`, and for errors
 * `error` / `stack`).
 */
export const logger = {
  /**
   * Writes an info-level entry to stdout via console.log.
   * @param message Human-readable event description.
   * @param meta Optional extra fields to include in the entry.
   */
  info: (message: string, meta?: Record<string, unknown>): void => {
    console.log(JSON.stringify({
      ...meta,
      level: 'info',
      message,
      timestamp: new Date().toISOString(),
    }));
  },

  /**
   * Writes an error-level entry to stderr via console.error.
   * @param message Human-readable event description.
   * @param error The caught value. Error instances contribute `message` and `stack`;
   *   any other non-nullish value (JavaScript allows throwing anything) is stringified.
   * @param meta Optional extra fields to include in the entry.
   */
  error: (message: string, error?: unknown, meta?: Record<string, unknown>): void => {
    let details: { error?: string; stack?: string } = {};
    if (error instanceof Error) {
      details = { error: error.message, stack: error.stack };
    } else if (error != null) {
      details = { error: String(error) };
    }

    console.error(JSON.stringify({
      ...meta,
      level: 'error',
      message,
      ...details,
      timestamp: new Date().toISOString(),
    }));
  },
};

// Usage in agents
import { logger } from '@/lib/logger';

/**
 * Runs the agent on `input` and logs start, completion, and failure events with latency.
 *
 * @param input Prompt passed to the agent. Only its first 100 characters are logged.
 * @returns Whatever `agent.generate` resolves to, unchanged.
 * @throws Rethrows any error raised by `agent.generate` after logging it.
 */
export async function generateWithAgent(input: string) {
  const start = Date.now();

  logger.info('Agent execution started', {
    // Truncated so large prompts do not bloat the log line.
    input: input.substring(0, 100),
  });

  try {
    const result = await agent.generate(input);

    // generate() resolves to a result object whose text is on `.text` (the
    // token-tracking example reads `result.usage` from the same call), so
    // `result.length` would be undefined. A plain-string result is still handled.
    const outputText = typeof result === 'string' ? result : result?.text;

    logger.info('Agent execution completed', {
      latency: Date.now() - start,
      outputLength: typeof outputText === 'string' ? outputText.length : undefined,
    });

    return result;
  } catch (error) {
    // A thrown value is not guaranteed to be an Error; normalize it instead of casting.
    const failure = error instanceof Error ? error : new Error(String(error));

    logger.error('Agent execution failed', failure, {
      latency: Date.now() - start,
    });
    // Rethrow the original value so callers see exactly what was thrown.
    throw error;
  }
}

Error Tracking with Sentry

// lib/sentry.ts
import * as Sentry from '@sentry/nextjs';

/**
 * Initializes Sentry when a DSN is configured.
 *
 * Does nothing when the SENTRY_DSN environment variable is unset or empty.
 * Otherwise passes the DSN, NODE_ENV as the environment, and a traces sample
 * rate of 0.1 to Sentry.init.
 */
export function initSentry() {
  const dsn = process.env.SENTRY_DSN;

  // Guard clause: without a DSN there is nowhere to report to.
  if (!dsn) {
    return;
  }

  const options = {
    dsn,
    environment: process.env.NODE_ENV,
    tracesSampleRate: 0.1,
  };
  Sentry.init(options);
}

// app/layout.tsx
/**
 * Root layout: calls initSentry() from an effect with an empty dependency list
 * and renders `children` unchanged.
 *
 * NOTE(review): `useEffect` is not imported in this snippet, and React hooks only run
 * in Client Components. An App Router layout is a Server Component unless the file
 * starts with 'use client' — confirm how this is meant to be wired (e.g. a small
 * client component that performs the init).
 * NOTE(review): Next.js expects the root layout to render <html> and <body>; this
 * snippet returns `children` directly — confirm it is abbreviated for illustration.
 */
export default function RootLayout({ children }) {
  useEffect(() => {
    initSentry();
  }, []);

  return children;
}

// Usage: report a failed agent call to Sentry, then let the failure propagate
try {
  await agent.generate(input);
} catch (error) {
  // Tag the event with the agent name and attach the prompt as extra data.
  // NOTE(review): `extra` carries the full, untruncated input — confirm that is
  // acceptable to send to Sentry (payload size, sensitive data).
  Sentry.captureException(error, {
    tags: { agent: 'summarizer' },
    extra: { input },
  });
  // Rethrow so the caller still observes the failure.
  throw error;
}

Performance Tracking

// lib/metrics.ts
/**
 * In-memory collector for named numeric samples (for example, latencies).
 *
 * Samples are kept in arrays that grow for the lifetime of the instance and are
 * never evicted, so this suits short-lived processes or low-volume metrics.
 */
export class Metrics {
  private readonly metrics = new Map<string, number[]>();

  /**
   * Records one sample under `name`.
   * @param name Metric name, e.g. 'agent.latency'.
   * @param value Sample value.
   */
  track(name: string, value: number): void {
    const samples = this.metrics.get(name);
    if (samples) {
      samples.push(value);
    } else {
      this.metrics.set(name, [value]);
    }
  }

  /**
   * Summarizes the samples recorded under `name`.
   * @param name Metric name.
   * @returns Count, average, min, max and 95th percentile, or `null` when no
   *   samples have been recorded for `name`.
   */
  getStats(
    name: string
  ): { count: number; avg: number; min: number; max: number; p95: number } | null {
    const values = this.metrics.get(name);

    if (!values || values.length === 0) {
      return null;
    }

    // Single pass instead of Math.min(...values) / Math.max(...values): spreading a
    // very large array into call arguments can throw a RangeError.
    let sum = 0;
    let min = Infinity;
    let max = -Infinity;
    for (const value of values) {
      sum += value;
      if (value < min) min = value;
      if (value > max) max = value;
    }

    return {
      count: values.length,
      avg: sum / values.length,
      min,
      max,
      p95: this.percentile(values, 0.95),
    };
  }

  /**
   * Nearest-rank percentile of `arr`. `arr` must be non-empty and is not mutated.
   * @param arr Samples in any order.
   * @param p Fraction in [0, 1], e.g. 0.95.
   */
  private percentile(arr: number[], p: number): number {
    const sorted = [...arr].sort((a, b) => a - b);
    // Clamp so p = 0 (or a slightly out-of-range p) still maps to a valid index.
    const rank = Math.ceil(sorted.length * p) - 1;
    const index = Math.min(sorted.length - 1, Math.max(0, rank));
    return sorted[index];
  }
}

export const metrics = new Metrics();

// Usage: record how long one agent call took (Date.now() difference, in milliseconds).
// Only successful calls are recorded here: if generate() throws, track() is never reached.
const start = Date.now();
const result = await agent.generate(input);
metrics.track('agent.latency', Date.now() - start);

// View stats (getStats returns null until at least one sample has been tracked)
console.log(metrics.getStats('agent.latency'));
// { count: 100, avg: 1234, min: 500, max: 3000, p95: 2800 }

Cost Monitoring

Token Tracking

// lib/token-tracker.ts
/** Token counts reported for a single model call. */
interface TokenUsage {
  inputTokens: number;
  outputTokens: number;
  totalTokens: number;
}

/** One tracked model call together with its computed cost. */
interface CostData {
  timestamp: Date;
  agent: string;
  model: string;
  tokens: TokenUsage;
  /** Cost computed from the pricing table (presumably USD — confirm against the provider's price list). */
  cost: number;
}

/** Per-token price for one model. */
interface ModelRates {
  input: number;
  output: number;
}

/**
 * Row written to `db.tokenUsage`. Flat on purpose: the cost dashboard route filters
 * on `timestamp` and sums `cost` / `totalTokens` as top-level columns.
 */
interface TokenUsageRecord extends TokenUsage {
  timestamp: Date;
  agent: string;
  model: string;
  cost: number;
}

/**
 * Tracks token usage and cost per agent call, both in memory and in the database.
 *
 * The in-memory list grows for the lifetime of the instance and is never pruned.
 */
export class TokenTracker {
  /** Rates used when a model has no entry in the pricing table (the 'gpt-4o' rates). */
  private static readonly DEFAULT_RATES: ModelRates = {
    input: 2.5 / 1_000_000,
    output: 10 / 1_000_000,
  };

  /**
   * Price per token by model name, built once rather than on every call.
   * A Map avoids accidental hits on inherited object keys such as 'constructor'.
   */
  private static readonly PRICING: ReadonlyMap<string, ModelRates> = new Map([
    ['gpt-4o', TokenTracker.DEFAULT_RATES],
    ['gpt-4o-mini', { input: 0.15 / 1_000_000, output: 0.6 / 1_000_000 }],
  ]);

  private costs: CostData[] = [];

  /**
   * Records one model call: computes its cost, keeps it in memory, and persists it.
   *
   * Persistence is fire-and-forget: a failed database write is logged and never
   * propagates to the caller, so tracking cannot break the request being tracked.
   *
   * @param agent Name of the agent that made the call.
   * @param model Model name used to look up pricing; unknown models are priced as 'gpt-4o'.
   * @param usage Token counts for the call.
   */
  track(agent: string, model: string, usage: TokenUsage): void {
    const cost = this.calculateCost(model, usage);
    const timestamp = new Date();

    this.costs.push({
      timestamp,
      agent,
      model,
      tokens: usage,
      cost,
    });

    // Log to database. The promise is handled explicitly so a rejected write
    // cannot surface as an unhandled rejection.
    void this.logToDatabase({
      timestamp,
      agent,
      model,
      inputTokens: usage.inputTokens,
      outputTokens: usage.outputTokens,
      totalTokens: usage.totalTokens,
      cost,
    }).catch((error: unknown) => {
      console.error('Failed to persist token usage', error);
    });
  }

  /** Cost of one call: input and output tokens multiplied by the model's per-token rates. */
  private calculateCost(model: string, usage: TokenUsage): number {
    const rates = TokenTracker.PRICING.get(model) ?? TokenTracker.DEFAULT_RATES;

    return (
      usage.inputTokens * rates.input +
      usage.outputTokens * rates.output
    );
  }

  /**
   * Sum of the costs tracked by this instance since local midnight.
   * Only covers in-memory entries, i.e. calls made since this process started.
   */
  getTodayCost(): number {
    const today = new Date();
    today.setHours(0, 0, 0, 0);

    return this.costs
      .filter((c) => c.timestamp >= today)
      .reduce((sum, c) => sum + c.cost, 0);
  }

  /** Inserts one usage row into `db.tokenUsage`. */
  private async logToDatabase(data: TokenUsageRecord): Promise<void> {
    await db.tokenUsage.create({ data });
  }
}

export const tokenTracker = new TokenTracker();

// Usage: after each agent call, hand the reported token counts to the tracker
const result = await agent.generate(input);

// The model name passed here selects the pricing row; a model missing from the
// tracker's pricing table is priced at the 'gpt-4o' rates.
tokenTracker.track('summarizer', 'gpt-4o-mini', {
  inputTokens: result.usage.inputTokens,
  outputTokens: result.usage.outputTokens,
  totalTokens: result.usage.totalTokens,
});

Cost Dashboard

// app/api/admin/costs/route.ts
/**
 * Returns today's token spend: an overall total plus a per-agent breakdown.
 *
 * "Today" starts at midnight in the server's local time zone.
 *
 * NOTE(review): this is an admin route but no authentication or authorization check
 * is visible in this handler — confirm it is protected elsewhere (e.g. middleware).
 */
export async function GET() {
  const startOfToday = new Date();
  startOfToday.setHours(0, 0, 0, 0);

  const where = { timestamp: { gte: startOfToday } };

  // The two queries are independent, so run them concurrently.
  const [todayCosts, byAgent] = await Promise.all([
    db.tokenUsage.aggregate({
      where,
      _sum: { cost: true },
      _count: true,
    }),
    db.tokenUsage.groupBy({
      by: ['agent'],
      where,
      _sum: { cost: true, totalTokens: true },
    }),
  ]);

  return Response.json({
    today: {
      // A _sum field is null when there is nothing to sum; report 0 instead.
      total: todayCosts._sum.cost ?? 0,
      requests: todayCosts._count,
    },
    byAgent: byAgent.map((a) => ({
      agent: a.agent,
      // Same null handling as the overall total, so clients always receive numbers.
      cost: a._sum.cost ?? 0,
      tokens: a._sum.totalTokens ?? 0,
    })),
  });
}

Health Checks

// app/api/health/route.ts
/**
 * Health endpoint: probes the database and the OpenAI API and reports each result.
 *
 * Overall status:
 *   - 'unhealthy' when the database probe fails,
 *   - 'degraded'  when only the OpenAI probe fails,
 *   - 'healthy'   otherwise.
 * Responds with HTTP 200 when healthy and 503 for any other status.
 *
 * NOTE(review): every request issues a real `openai.models.list()` call — confirm
 * that is acceptable for how often this endpoint is polled.
 */
export async function GET() {
  const timestamp = new Date().toISOString();
  const checks: Record<string, 'up' | 'down'> = {};
  let status: 'healthy' | 'degraded' | 'unhealthy' = 'healthy';

  // Check database
  try {
    await db.$queryRaw`SELECT 1`;
    checks.database = 'up';
  } catch (error) {
    // Log the cause instead of discarding it; the response only carries up/down.
    console.error('Health check failed: database', error);
    checks.database = 'down';
    status = 'unhealthy';
  }

  // Check OpenAI
  try {
    await openai.models.list();
    checks.openai = 'up';
  } catch (error) {
    console.error('Health check failed: openai', error);
    checks.openai = 'down';
    // Only step down from 'healthy': a database outage must stay 'unhealthy'
    // rather than being overwritten with the milder 'degraded'.
    if (status === 'healthy') {
      status = 'degraded';
    }
  }

  const health = { status, timestamp, checks };
  const statusCode = status === 'healthy' ? 200 : 503;

  return Response.json(health, { status: statusCode });
}

Progressive Approach

Phase 1: Day 1 (Essential)

Basic logging (console.log with structure)
Error tracking (Sentry)
Health checks
Phase 2: Week 1 (Important)

Unit tests for tools
Integration tests for agents
Token/cost tracking
Performance metrics

Phase 3: Month 1 (Advanced)

Automated evals in CI/CD
Custom dashboards
Alerting on errors/costs
A/B testing

Key Takeaways

  1. Test your tools - Unit tests prevent regressions
  2. Test your agents - Integration and eval tests ensure quality
  3. Log everything - Structured logging enables debugging
  4. Track errors - Sentry catches production issues
  5. Monitor costs - Token tracking prevents budget surprises
  6. Health checks - Know when systems are down

Start simple, add sophistication as you need it!

Get chapter updates & code samples

We’ll email diagrams, code snippets, and additions.