Why This Matters
You can't improve what you can't measure, and you can't ship what you can't trust.
The Production Reality
// ❌ Flying blind
await agent.generate(input);
// Did it work? How long did it take? What did it cost?

// ✅ Observable and tested
const result = await trackedAgent.generate(input);
// Know: success rate, latency, tokens used, cost
Testing Strategies
Unit Tests for Tools
// src/mastra/tools/__tests__/search-web.test.ts
import { searchWeb } from '../search-web';

/**
 * Unit tests for the `searchWeb` tool: one successful search and one
 * invalid (empty) query that the tool is expected to report as a failure
 * rather than throw.
 *
 * NOTE(review): `mockContext` is neither defined nor imported in this
 * snippet; it has to be supplied by the surrounding test setup.
 */
describe('searchWeb', () => {
  // Every case calls the tool with the same context and a limit of 5;
  // only the query string varies.
  const runSearch = (query: string) =>
    searchWeb.execute({ context: mockContext }, { query, limit: 5 });

  it('returns search results', async () => {
    const outcome = await runSearch('Next.js');

    expect(outcome.success).toBe(true);
    expect(outcome.results).toHaveLength(5);

    const firstHit = outcome.results[0];
    expect(firstHit).toHaveProperty('title');
    expect(firstHit).toHaveProperty('url');
  });

  it('handles errors gracefully', async () => {
    // An empty query should produce a failed result object with an error set.
    const outcome = await runSearch('');

    expect(outcome.success).toBe(false);
    expect(outcome.error).toBeDefined();
  });
});
Integration Tests for Agents
// tests/agents/summarizer.test.ts
import { summarizerAgent } from '@/mastra/agents/summarizer';

/**
 * Integration tests for `summarizerAgent.generate`.
 *
 * `generate` resolves to a result object (the token-tracking example in this
 * chapter reads `result.usage` from it), so the assertions below work on the
 * generated string in `result.text` rather than on the result itself.
 */
describe('Summarizer Agent', () => {
  it('summarizes articles correctly', async () => {
    // Placeholder article: replace with real content that actually mentions
    // the phrase asserted below, otherwise the `toContain` check cannot pass.
    const article = `
      Long article content here...
      Multiple paragraphs...
    `;

    const { text: summary } = await summarizerAgent.generate(
      `Summarize this:\n\n${article}`
    );

    // Check summary is shorter than original
    expect(summary.length).toBeLessThan(article.length);

    // Check it contains key information
    expect(summary).toContain('key topic');
  });

  it('handles empty input', async () => {
    // The agent is expected to reject, not resolve with an empty summary.
    await expect(summarizerAgent.generate('')).rejects.toThrow();
  });
});
Evaluation Tests
// tests/evals/agent.eval.ts

/**
 * One evaluation scenario: a prompt, a human-readable description of the
 * expected behavior, and a predicate applied to the agent's text output.
 */
interface EvalCase {
  input: string;
  expectedBehavior: string;
  criteria: (output: string) => boolean;
}

const evalCases: EvalCase[] = [
  {
    input: 'What is 2+2?',
    expectedBehavior: 'Should answer 4',
    criteria: (output) => output.includes('4'),
  },
  {
    input: 'Search for Next.js documentation',
    expectedBehavior: 'Should use search tool',
    // Tool use is inferred indirectly from the official docs domain
    // appearing in the answer.
    criteria: (output) => output.includes('nextjs.org'),
  },
];

// NOTE(review): `agent` is not imported in this snippet; import the agent
// under evaluation before running.
describe('Agent Evals', () => {
  it('passes all eval cases', async () => {
    // All cases run concurrently; each resolves to a pass/fail record.
    const results = await Promise.all(
      evalCases.map(async (testCase) => {
        // `generate` resolves to a result object; the criteria are written
        // against the generated string, so pass `.text`, not the object.
        const { text } = await agent.generate(testCase.input);

        return {
          passed: testCase.criteria(text),
          case: testCase.expectedBehavior,
        };
      })
    );

    const failures = results.filter((r) => !r.passed);

    if (failures.length > 0) {
      console.log('Failed cases:', failures);
    }

    expect(failures.length).toBe(0);
  });
});
Basic Observability
Simple Logging
// lib/logger.ts

/**
 * Serializes one log entry as a single JSON line.
 *
 * The reserved fields are spread first so they lead the line, and again last
 * so caller-supplied metadata can add fields but never overwrite `level`,
 * `message`, `timestamp`, etc.
 */
function formatEntry(reserved: object, meta?: object): string {
  return JSON.stringify({ ...reserved, ...meta, ...reserved });
}

/**
 * Minimal structured logger: every call writes one JSON object to the
 * console (`console.log` for info, `console.error` for errors).
 */
export const logger = {
  /**
   * Logs an informational event.
   *
   * @param message Human-readable event description.
   * @param meta Optional extra fields merged into the entry.
   */
  info: (message: string, meta?: object) => {
    console.log(
      formatEntry(
        { level: 'info', message, timestamp: new Date().toISOString() },
        meta
      )
    );
  },

  /**
   * Logs a failure.
   *
   * @param message Human-readable event description.
   * @param error The caught value. `Error` instances contribute `message`
   *   and `stack`; any other thrown value is stringified so it is not lost.
   * @param meta Optional extra fields merged into the entry.
   */
  error: (message: string, error?: unknown, meta?: object) => {
    const err = error instanceof Error ? error : undefined;
    const description =
      err !== undefined
        ? err.message
        : error === undefined
          ? undefined
          : String(error);

    console.error(
      formatEntry(
        {
          level: 'error',
          message,
          error: description,
          stack: err?.stack,
          timestamp: new Date().toISOString(),
        },
        meta
      )
    );
  },
};

// Usage in agents
// (in the agent module, bring the logger into scope first:
//   import { logger } from '@/lib/logger';)

/**
 * Runs the agent on `input`, logging the start, the completion (latency and
 * output length) and any failure (latency) before rethrowing it.
 *
 * @param input Prompt passed to the agent; only its first 100 characters
 *   are written to the log.
 * @returns Whatever `agent.generate` resolves to.
 * @throws Rethrows any error raised by `agent.generate`.
 */
export async function generateWithAgent(input: string) {
  const start = Date.now();

  logger.info('Agent execution started', {
    input: input.substring(0, 100),
  });

  try {
    const result = await agent.generate(input);

    logger.info('Agent execution completed', {
      latency: Date.now() - start,
      // The result is an object; the generated string lives on `.text`.
      outputLength: result.text.length,
    });

    return result;
  } catch (error) {
    logger.error('Agent execution failed', error, {
      latency: Date.now() - start,
    });
    throw error;
  }
}
Error Tracking with Sentry
// lib/sentry.ts
import * as Sentry from '@sentry/nextjs';

/**
 * Initializes Sentry when `SENTRY_DSN` is set; does nothing otherwise, so
 * local and test environments without a DSN run untouched.
 *
 * `SENTRY_DSN` has no `NEXT_PUBLIC_` prefix, so it is only readable on the
 * server. Call this from server start-up, not from browser code.
 */
export function initSentry() {
  if (process.env.SENTRY_DSN) {
    Sentry.init({
      dsn: process.env.SENTRY_DSN,
      environment: process.env.NODE_ENV,
      // Sample 10% of transactions for performance tracing.
      tracesSampleRate: 0.1,
    });
  }
}

// instrumentation.ts
/**
 * Next.js server start-up hook. Initializing here replaces calling
 * `initSentry()` from a `useEffect` in the root layout: layouts are Server
 * Components (no hooks), and a browser-side call would never see
 * `SENTRY_DSN` anyway.
 */
export function register() {
  initSentry();
}

// app/layout.tsx
/** Root layout: renders its children unchanged. */
export default function RootLayout({
  children,
}: {
  children: React.ReactNode;
}) {
  return children;
}

// Usage
try {
  await agent.generate(input);
} catch (error) {
  Sentry.captureException(error, {
    tags: { agent: 'summarizer' },
    // NOTE(review): this attaches the raw input to the Sentry event; confirm
    // it cannot contain sensitive data, or truncate/redact it first.
    extra: { input },
  });
  throw error;
}
Performance Tracking
// lib/metrics.ts

/**
 * In-memory collector of numeric samples, grouped by metric name.
 *
 * Every tracked value is retained for the life of the instance (nothing here
 * evicts samples), so statistics are computed without spreading the sample
 * array into function arguments: `Math.min(...values)` throws a `RangeError`
 * once a series grows beyond the engine's argument limit.
 */
export class Metrics {
  private metrics: Map<string, number[]> = new Map();

  /**
   * Records one sample for a metric.
   *
   * @param name Metric name, e.g. `'agent.latency'`.
   * @param value Sample value.
   */
  track(name: string, value: number) {
    const values = this.metrics.get(name);
    if (values) {
      values.push(value);
    } else {
      this.metrics.set(name, [value]);
    }
  }

  /**
   * Summarizes the samples recorded for a metric.
   *
   * @param name Metric name.
   * @returns `null` when no samples exist for `name`; otherwise the sample
   *   count, mean, minimum, maximum and 95th percentile.
   */
  getStats(name: string) {
    const values = this.metrics.get(name);

    if (!values || values.length === 0) {
      return null;
    }

    return {
      count: values.length,
      avg: values.reduce((a, b) => a + b, 0) / values.length,
      // Pairwise folds instead of Math.min(...values): safe for any length.
      min: values.reduce((lo, v) => Math.min(lo, v), Infinity),
      max: values.reduce((hi, v) => Math.max(hi, v), -Infinity),
      p95: this.percentile(values, 0.95),
    };
  }

  /**
   * Nearest-rank percentile of a non-empty array.
   *
   * @param arr Samples (not mutated; a sorted copy is used).
   * @param p Fraction in [0, 1], e.g. `0.95`.
   */
  private percentile(arr: number[], p: number): number {
    const sorted = [...arr].sort((a, b) => a - b);
    const rank = Math.ceil(sorted.length * p) - 1;
    // Clamp so p = 0 (rank -1) or p > 1 still lands on a real element.
    const index = Math.min(sorted.length - 1, Math.max(0, rank));
    return sorted[index];
  }
}

/**
 * Shared module-level instance.
 *
 * @example
 * // Usage
 * const start = Date.now();
 * const result = await agent.generate(input);
 * metrics.track('agent.latency', Date.now() - start);
 *
 * // View stats
 * console.log(metrics.getStats('agent.latency'));
 * // { count: 100, avg: 1234, min: 500, max: 3000, p95: 2800 }
 */
export const metrics = new Metrics();
Cost Monitoring
Token Tracking
// lib/token-tracker.ts

/** Token counts reported for a single model call. */
interface TokenUsage {
  inputTokens: number;
  outputTokens: number;
  totalTokens: number;
}

/** One tracked call: who made it, with which model, and what it cost. */
interface CostData {
  timestamp: Date;
  agent: string;
  model: string;
  tokens: TokenUsage;
  cost: number;
}

/** Price per single token, split by direction. */
interface ModelRates {
  input: number;
  output: number;
}

/**
 * Records token usage per call, prices it, keeps the entries in memory and
 * writes each one to the database on a best-effort basis.
 */
export class TokenTracker {
  /** Rates applied when a model has no pricing entry (same as 'gpt-4o'). */
  private static readonly FALLBACK_RATES: ModelRates = {
    input: 2.5 / 1_000_000,
    output: 10 / 1_000_000,
  };

  /**
   * Per-token rates by model name, written as price-per-million / 1e6.
   * A Map is used so arbitrary model strings (e.g. 'constructor') can never
   * match an inherited object property.
   * NOTE(review): presumably USD; confirm against current provider pricing.
   */
  private static readonly PRICING: ReadonlyMap<string, ModelRates> = new Map([
    ['gpt-4o', TokenTracker.FALLBACK_RATES],
    ['gpt-4o-mini', { input: 0.15 / 1_000_000, output: 0.6 / 1_000_000 }],
  ]);

  private costs: CostData[] = [];

  /**
   * Prices one call, stores it in memory and persists it.
   *
   * Persistence is fire-and-forget: the write is not awaited, and a failed
   * write is logged instead of surfacing as an unhandled promise rejection.
   *
   * @param agent Name of the agent that made the call.
   * @param model Model name used to look up pricing.
   * @param usage Token counts for the call.
   */
  track(agent: string, model: string, usage: TokenUsage) {
    const cost = this.calculateCost(model, usage);

    this.costs.push({
      timestamp: new Date(),
      agent,
      model,
      tokens: usage,
      cost,
    });

    // Log to database
    this.logToDatabase({ agent, model, usage, cost }).catch(
      (error: unknown) => {
        console.error('Failed to persist token usage', error);
      }
    );
  }

  /**
   * Cost of one call: input and output tokens priced at the model's rates.
   * Unknown models fall back to the 'gpt-4o' rates.
   */
  private calculateCost(model: string, usage: TokenUsage): number {
    const rates = TokenTracker.PRICING.get(model) ?? TokenTracker.FALLBACK_RATES;

    return (
      usage.inputTokens * rates.input + usage.outputTokens * rates.output
    );
  }

  /**
   * Sum of the in-memory costs recorded since local midnight.
   * Only covers calls tracked by this instance, not rows already in the
   * database.
   */
  getTodayCost(): number {
    const today = new Date();
    today.setHours(0, 0, 0, 0);

    return this.costs
      .filter((c) => c.timestamp >= today)
      .reduce((sum, c) => sum + c.cost, 0);
  }

  /**
   * Writes one usage record to `db.tokenUsage`.
   *
   * NOTE(review): the cost dashboard route aggregates `totalTokens` and
   * filters on `timestamp`, while this record nests the counts under
   * `usage`; confirm the table schema matches what is written here.
   */
  private async logToDatabase(data: {
    agent: string;
    model: string;
    usage: TokenUsage;
    cost: number;
  }) {
    await db.tokenUsage.create({ data });
  }
}

/**
 * Shared module-level instance.
 *
 * @example
 * // Usage
 * const result = await agent.generate(input);
 *
 * tokenTracker.track('summarizer', 'gpt-4o-mini', {
 *   inputTokens: result.usage.inputTokens,
 *   outputTokens: result.usage.outputTokens,
 *   totalTokens: result.usage.totalTokens,
 * });
 */
export const tokenTracker = new TokenTracker();
Cost Dashboard
// app/api/admin/costs/route.ts

/**
 * Cost dashboard endpoint: returns today's total cost and request count,
 * plus cost and token totals per agent, from `db.tokenUsage`.
 *
 * "Today" starts at midnight in the server's local time zone.
 *
 * NOTE(review): no authentication or authorization check is visible in this
 * handler even though it lives under an admin path; confirm it is protected
 * elsewhere (e.g. middleware).
 *
 * @returns JSON of the shape
 *   `{ today: { total, requests }, byAgent: [{ agent, cost, tokens }] }`.
 */
export async function GET() {
  const today = new Date();
  today.setHours(0, 0, 0, 0);

  const sinceMidnight = { timestamp: { gte: today } };

  // The two queries are independent, so run them concurrently.
  const [todayCosts, byAgent] = await Promise.all([
    db.tokenUsage.aggregate({
      where: sinceMidnight,
      _sum: { cost: true },
      _count: true,
    }),
    db.tokenUsage.groupBy({
      by: ['agent'],
      where: sinceMidnight,
      _sum: { cost: true, totalTokens: true },
    }),
  ]);

  return Response.json({
    today: {
      // A sum over zero rows comes back as null; report it as 0.
      total: todayCosts._sum.cost ?? 0,
      requests: todayCosts._count,
    },
    byAgent: byAgent.map((a) => ({
      agent: a.agent,
      // Default null sums the same way as the overall total above.
      cost: a._sum.cost ?? 0,
      tokens: a._sum.totalTokens ?? 0,
    })),
  });
}
Health Checks
// app/api/health/route.ts

/** Result of probing a single dependency. */
type CheckStatus = 'up' | 'down';

/** Body returned by the health endpoint. */
interface HealthReport {
  status: 'healthy' | 'degraded' | 'unhealthy';
  timestamp: string;
  checks: {
    database?: CheckStatus;
    openai?: CheckStatus;
  };
}

/**
 * Health endpoint: probes the database and the OpenAI API and reports the
 * combined status.
 *
 * - database down: `unhealthy`
 * - only OpenAI down: `degraded`
 * - both up: `healthy`
 *
 * @returns The report as JSON, with HTTP 200 when healthy and 503 otherwise.
 */
export async function GET() {
  const health: HealthReport = {
    status: 'healthy',
    timestamp: new Date().toISOString(),
    checks: {},
  };

  // Check database
  try {
    await db.$queryRaw`SELECT 1`;
    health.checks.database = 'up';
  } catch {
    health.checks.database = 'down';
    health.status = 'unhealthy';
  }

  // Check OpenAI
  try {
    await openai.models.list();
    health.checks.openai = 'up';
  } catch {
    health.checks.openai = 'down';
    // Only step down from 'healthy': a database outage already marked the
    // service 'unhealthy' and must not be softened to 'degraded'.
    if (health.status === 'healthy') {
      health.status = 'degraded';
    }
  }

  const statusCode = health.status === 'healthy' ? 200 : 503;

  return Response.json(health, { status: statusCode });
}
Progressive Approach
Phase 1: Day 1 (Essential)
✅ Basic logging (console.log with structure)
✅ Error tracking (Sentry)
✅ Health checks
Phase 2: Week 1 (Recommended)
✅ Unit tests for tools
✅ Integration tests for agents
✅ Token/cost tracking
✅ Performance metrics
Phase 3: Month 1 (Advanced)
✅ Automated evals in CI/CD
✅ Custom dashboards
✅ Alerting on errors/costs
✅ A/B testing
Key Takeaways
- Test your tools - Unit tests prevent regressions
- Test your agents - Integration and eval tests ensure quality
- Log everything - Structured logging enables debugging
- Track errors - Sentry catches production issues
- Monitor costs - Token tracking prevents budget surprises
- Health checks - Know when systems are down
Start simple, add sophistication as you need it!
Get chapter updates & code samples
We’ll email diagrams, code snippets, and additions.