By the end of this module, students will be able to:
Implement comprehensive logging strategies
Set up metrics and dashboards
Configure alerting for critical issues
Build observability into agents
Topics
1. Observability Principles (20 min)
The Three Pillars
┌─────────────────────────────────────────────────────────┐
│                      Observability                      │
├─────────────────────────────────────────────────────────┤
│                                                         │
│    ┌─────────┐        ┌─────────┐        ┌─────────┐    │
│    │  Logs   │        │ Metrics │        │ Traces  │    │
│    └────┬────┘        └────┬────┘        └────┬────┘    │
│         │                  │                  │         │
│       What             How many/          Where did     │
│     happened?          how fast?          time go?      │
│                                                         │
│       Debug              Alert           Bottleneck     │
│      issues            on trends          analysis      │
│                                                         │
└─────────────────────────────────────────────────────────┘
Voice AI Specific Metrics
| Category    | Metrics                               |
|-------------|---------------------------------------|
| Performance | Response time, function latency       |
| Quality     | Recognition accuracy, task completion |
| Usage       | Calls/hour, functions/call            |
| Errors      | Error rate, timeout rate              |
| Business    | Transfers, resolutions, CSAT          |
2. Structured Logging (30 min)
JSON Logging Setup
import json
import logging
from datetime import datetime, timezone


class JSONFormatter(logging.Formatter):
    """Format each log record as a single-line JSON object.

    Every record carries ``timestamp``, ``level``, ``logger``, ``message``,
    ``module``, ``function`` and ``line``. Any of ``EXTRA_FIELDS`` that the
    caller attached through ``extra={...}`` is copied into the object, and
    the formatted traceback is added under ``exception`` when the record
    has exception info.
    """

    # Optional context fields copied from the record when present.
    # ``order_id`` is included because the agent examples log it via ``extra``.
    EXTRA_FIELDS = ("call_id", "customer_id", "order_id", "duration_ms")

    def format(self, record):
        """Return ``record`` serialised as a JSON string."""
        # Use the record's own creation time (timezone-aware UTC) rather than
        # "now": handlers that format later (queues, buffers) would otherwise
        # report the formatting time instead of the event time.
        event_time = datetime.fromtimestamp(record.created, tz=timezone.utc)

        log_data = {
            "timestamp": event_time.isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
            "module": record.module,
            "function": record.funcName,
            "line": record.lineno,
        }

        # Add extra fields
        for field in self.EXTRA_FIELDS:
            if hasattr(record, field):
                log_data[field] = getattr(record, field)

        if record.exc_info:
            log_data["exception"] = self.formatException(record.exc_info)

        # default=str is deliberate: a log line must not be lost because one
        # extra value (e.g. a UUID or Decimal) is not JSON-serialisable.
        return json.dumps(log_data, default=str)


def setup_logging():
    """Install JSON logging on the root logger and return the "agent" logger.

    Safe to call repeatedly: the JSON stream handler is attached only if the
    root logger does not already have one, so constructing several agents
    does not duplicate every log line. The root level is set to INFO.
    """
    root_logger = logging.getLogger()

    has_json_handler = any(
        isinstance(existing.formatter, JSONFormatter)
        for existing in root_logger.handlers
    )
    if not has_json_handler:
        handler = logging.StreamHandler()
        handler.setFormatter(JSONFormatter())
        root_logger.addHandler(handler)

    root_logger.setLevel(logging.INFO)
    return logging.getLogger("agent")
Contextual Logging
import time


class ObservableAgent(AgentBase):
    """Agent whose tool handler logs each step with call-scoped context."""

    def __init__(self):
        super().__init__(name="observable-agent")
        self.logger = setup_logging()
        self._setup_functions()

    def _setup_functions(self):
        """Define the agent's tool handlers.

        ``self.tool`` comes from ``AgentBase`` (not shown here); presumably it
        registers the decorated function as a callable tool.
        """

        @self.tool(
            description="Process order",
            parameters={
                "type": "object",
                "properties": {
                    "order_id": {
                        "type": "string",
                        "description": "Order ID",
                    }
                },
                "required": ["order_id"],
            },
        )
        def process_order(args: dict, raw_data: dict = None) -> SwaigFunctionResult:
            """Process an order and log the start, outcome and duration.

            Args:
                args: Tool arguments; ``order_id`` is read from it.
                raw_data: Request payload; ``call_id`` and ``global_data``
                    are read from it when present.

            Returns:
                A spoken confirmation on success, or a spoken apology when
                ``process`` raises.
            """
            order_id = args.get("order_id", "")
            raw_data = raw_data or {}
            call_id = raw_data.get("call_id", "unknown")
            # ``or {}`` also covers a payload that carries "global_data": null.
            global_data = raw_data.get("global_data") or {}

            # Log with context
            self.logger.info(
                "Processing order",
                extra={
                    "call_id": call_id,
                    "customer_id": global_data.get("customer_id"),
                    "order_id": order_id,
                },
            )

            start = time.perf_counter()
            try:
                # ``process`` is defined elsewhere; its return value is not used here.
                process(order_id)
            except Exception as e:
                # Record how long the failed attempt took as well, so slow
                # failures (e.g. timeouts) are visible in the logs.
                duration = (time.perf_counter() - start) * 1000
                self.logger.exception(
                    "Order processing failed: %s",
                    e,
                    extra={
                        "call_id": call_id,
                        "order_id": order_id,
                        "duration_ms": duration,
                    },
                )
                return SwaigFunctionResult("I had trouble processing that order.")

            duration = (time.perf_counter() - start) * 1000
            self.logger.info(
                "Order processed successfully",
                extra={
                    "call_id": call_id,
                    "order_id": order_id,
                    "duration_ms": duration,
                },
            )
            return SwaigFunctionResult("Order processed.")
Log Levels for Voice AI
| Level    | Use For                                |
|----------|----------------------------------------|
| DEBUG    | Function inputs/outputs, state changes |
| INFO     | Call events, successful operations     |
| WARNING  | Slow responses, retries, fallbacks     |
| ERROR    | Failed operations, exceptions          |
| CRITICAL | System failures, security issues       |
3. Metrics Collection (35 min)
Prometheus Metrics
from prometheus_client import Counter, Gauge, Histogram, start_http_server

# Define metrics
CALLS_TOTAL = Counter(
    'voice_agent_calls_total',
    'Total number of calls',
    ['agent', 'status'],
)

CALL_DURATION = Histogram(
    'voice_agent_call_duration_seconds',
    'Call duration in seconds',
    ['agent'],
    buckets=[30, 60, 120, 300, 600],
)

FUNCTION_LATENCY = Histogram(
    'voice_agent_function_latency_seconds',
    'Function execution latency',
    ['agent', 'function'],
    buckets=[0.1, 0.25, 0.5, 1.0, 2.5, 5.0],
)

ACTIVE_CALLS = Gauge(
    'voice_agent_active_calls',
    'Number of active calls',
    ['agent'],
)

ERRORS_TOTAL = Counter(
    'voice_agent_errors_total',
    'Total errors',
    ['agent', 'function', 'error_type'],
)


class MetricsAgent(AgentBase):
    """Agent that exposes Prometheus metrics and instruments its tool handler.

    Only ``FUNCTION_LATENCY`` and ``ERRORS_TOTAL`` are updated in this class;
    the call-level metrics defined above are not touched here.
    """

    # Set once the exposition server has been started in this process.
    _metrics_server_started = False

    def __init__(self, metrics_port: int = 9090):
        """Create the agent and make sure the metrics endpoint is running.

        Args:
            metrics_port: Port for the Prometheus exposition endpoint.
                Defaults to 9090 as before; note that this is also the
                Prometheus server's own default port, so pick another value
                when both run on the same host.
        """
        super().__init__(name="metrics-agent")

        # Start metrics server
        self._ensure_metrics_server(metrics_port)
        self._setup_functions()

    @classmethod
    def _ensure_metrics_server(cls, port: int) -> None:
        """Start the exposition server at most once per process.

        Without this guard a second agent instance would try to bind the
        same port again and fail with ``OSError`` (address already in use).
        """
        if MetricsAgent._metrics_server_started:
            return
        start_http_server(port)
        MetricsAgent._metrics_server_started = True

    def _setup_functions(self):
        """Define the instrumented tool handlers."""

        @self.tool(
            description="Get account balance",
            parameters={
                "type": "object",
                "properties": {
                    "account_id": {
                        "type": "string",
                        "description": "Account ID",
                    }
                },
                "required": ["account_id"],
            },
        )
        def get_balance(args: dict, raw_data: dict = None) -> SwaigFunctionResult:
            """Return the balance for ``args["account_id"]``.

            The lookup is timed into ``FUNCTION_LATENCY``. Any exception is
            counted in ``ERRORS_TOTAL`` (labelled with the exception class
            name) and then re-raised to the caller.
            """
            account_id = args.get("account_id", "")

            latency = FUNCTION_LATENCY.labels(
                agent="metrics-agent",
                function="get_balance",
            )
            with latency.time():
                try:
                    # ``fetch_balance`` is defined elsewhere.
                    balance = fetch_balance(account_id)
                    return SwaigFunctionResult(f"Balance: ${balance}")
                except Exception as e:
                    ERRORS_TOTAL.labels(
                        agent="metrics-agent",
                        function="get_balance",
                        error_type=type(e).__name__,
                    ).inc()
                    raise
Custom Business Metrics
TRANSFERS_TOTAL = Counter(
    'voice_agent_transfers_total',
    'Call transfers',
    ['agent', 'department'],
)

RESOLUTIONS_TOTAL = Counter(
    'voice_agent_resolutions_total',
    'Issues resolved',
    ['agent', 'resolution_type'],
)

# Defined for completeness; nothing in this snippet observes a value into it.
CSAT_SCORES = Histogram(
    'voice_agent_csat_score',
    'Customer satisfaction scores',
    ['agent'],
    buckets=[1, 2, 3, 4, 5],
)


def _label_value(raw) -> str:
    """Return ``raw`` as a non-empty Prometheus label value.

    Prometheus treats an empty label value the same as a missing label, so a
    blank argument would produce a series with no department/resolution type
    at all. Blank or missing values are reported as ``"unknown"`` instead.
    """
    value = str(raw or "").strip()
    return value or "unknown"


class BusinessMetricsAgent(AgentBase):
    """Agent whose tools count business outcomes (transfers, resolutions)."""

    @AgentBase.tool(
        description="Transfer call",
        parameters={
            "type": "object",
            "properties": {
                "department": {
                    "type": "string",
                    "description": "Department name",
                }
            },
            "required": ["department"],
        },
    )
    def transfer(self, args: dict, raw_data: dict = None) -> SwaigFunctionResult:
        """Count the transfer and connect the call to the department's number.

        The number comes from ``get_dept_number`` (defined elsewhere), which
        receives the department exactly as supplied; only the metric label
        is normalised.
        """
        department = args.get("department", "")

        TRANSFERS_TOTAL.labels(
            agent="business-agent",
            department=_label_value(department),
        ).inc()

        return (
            SwaigFunctionResult(f"Transferring to {department}.")
            .connect(get_dept_number(department), final=True)
        )

    @AgentBase.tool(
        description="Record resolution",
        parameters={
            "type": "object",
            "properties": {
                "resolution_type": {
                    "type": "string",
                    "description": "Type of resolution",
                }
            },
            "required": ["resolution_type"],
        },
    )
    def resolve_issue(self, args: dict, raw_data: dict = None) -> SwaigFunctionResult:
        """Count a resolved issue under its resolution type."""
        resolution_type = args.get("resolution_type", "")

        RESOLUTIONS_TOTAL.labels(
            agent="business-agent",
            resolution_type=_label_value(resolution_type),
        ).inc()

        return SwaigFunctionResult("Issue marked as resolved.")
4. Distributed Tracing (25 min)
OpenTelemetry Setup
import os

from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor


def setup_tracing(service_name: str = None):
    """Install a global tracer provider that exports spans over OTLP/gRPC.

    Args:
        service_name: Value for the ``service.name`` resource attribute.
            When omitted, the ``OTEL_SERVICE_NAME`` environment variable is
            used if set, otherwise ``"voice-agent"``. Without this attribute
            backends list the spans under an "unknown_service" name.

    Returns:
        The tracer named ``"voice-agent"``.
    """
    resolved_name = (
        service_name
        or os.environ.get("OTEL_SERVICE_NAME")
        or "voice-agent"
    )
    resource = Resource.create({"service.name": resolved_name})

    provider = TracerProvider(resource=resource)
    provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
    trace.set_tracer_provider(provider)

    return trace.get_tracer("voice-agent")


tracer = setup_tracing()


class TracedAgent(AgentBase):
    """Agent whose tool handler is broken down into nested trace spans."""

    def __init__(self):
        super().__init__(name="traced-agent")
        self._setup_functions()

    def _setup_functions(self):
        """Define the traced tool handlers."""

        @self.tool(
            description="Complex operation",
            parameters={
                "type": "object",
                "properties": {
                    "data": {
                        "type": "string",
                        "description": "Data to process",
                    }
                },
                "required": ["data"],
            },
        )
        def complex_operation(args: dict, raw_data: dict = None) -> SwaigFunctionResult:
            """Validate, process and store ``args["data"]`` under one parent span.

            Each step runs in its own child span so the time spent per step
            shows up in the trace. ``validate``, ``process`` and ``store``
            are defined elsewhere; an exception from any of them propagates
            to the caller and ``success`` is then never set on the span.
            """
            data = args.get("data", "")
            raw_data = raw_data or {}
            call_id = raw_data.get("call_id", "unknown")

            with tracer.start_as_current_span("complex_operation") as span:
                span.set_attribute("call_id", call_id)
                span.set_attribute("input_length", len(data))

                # Step 1: Validate
                with tracer.start_as_current_span("validate"):
                    validated = validate(data)

                # Step 2: Process
                with tracer.start_as_current_span("process"):
                    result = process(validated)

                # Step 3: Store
                with tracer.start_as_current_span("store"):
                    store(result)

                span.set_attribute("success", True)
                return SwaigFunctionResult("Operation completed.")
Trace Correlation
class CorrelatedAgent(AgentBase):
    """Agent that forwards call and trace identifiers on outgoing HTTP calls."""

    def __init__(self):
        super().__init__(name="correlated-agent")
        self._setup_functions()

    def _setup_functions(self):
        """Define the tool handlers that make correlated API calls."""

        @self.tool(
            description="API call with tracing",
            parameters={
                "type": "object",
                "properties": {
                    "endpoint": {
                        "type": "string",
                        "description": "API endpoint",
                    }
                },
                "required": ["endpoint"],
            },
        )
        def call_api(args: dict, raw_data: dict = None) -> SwaigFunctionResult:
            """GET ``args["endpoint"]`` with correlation headers and report the body.

            The request runs inside an ``api_call`` span, uses a 5 second
            timeout, and records the HTTP status code on the span. Request
            and JSON-decoding errors propagate to the caller.
            """
            # NOTE(review): ``endpoint`` is a caller-supplied URL that is
            # fetched as-is, which allows requests to arbitrary hosts (SSRF).
            # Consider restricting it to an allow-list of known APIs.
            endpoint = args.get("endpoint", "")
            raw_data = raw_data or {}
            call_id = raw_data.get("call_id", "unknown")

            with tracer.start_as_current_span("api_call") as span:
                # Add trace context to outgoing request.
                # The span's trace_id is an int, and requests rejects header
                # values that are not str/bytes, so send it in the usual
                # 32-digit lowercase hex form.
                trace_id_hex = format(span.get_span_context().trace_id, "032x")
                headers = {
                    "X-Call-ID": call_id,
                    "X-Trace-ID": trace_id_hex,
                }

                response = requests.get(endpoint, headers=headers, timeout=5)
                span.set_attribute("http.status_code", response.status_code)

                return SwaigFunctionResult(f"API returned: {response.json()}")
5. Alerting and Dashboards (30 min)
Alert Rules (Prometheus)
# alerts.yml
# Alerting rules for the voice agent metrics (voice_agent_* series).
groups:
  - name: voice_agent_alerts
    rules:
      # High error rate: errors as a fraction of calls over the last 5 minutes.
      - alert: HighErrorRate
        expr: |
          sum(rate(voice_agent_errors_total[5m]))
            / sum(rate(voice_agent_calls_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate (>5%)"
          # $value is the ratio from the expression, rendered as a percentage.
          description: "Error rate is {{ $value | humanizePercentage }}"

      # Slow function latency
      - alert: SlowFunctionLatency
        expr: |
          histogram_quantile(0.95, rate(voice_agent_function_latency_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "P95 latency > 2s"

      # High transfer rate
      - alert: HighTransferRate
        expr: |
          sum(rate(voice_agent_transfers_total[1h]))
            / sum(rate(voice_agent_calls_total[1h])) > 0.3
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Transfer rate > 30%"
          description: "May indicate AI handling issues"

      # No calls (system down?)
      # absent() covers the case where the series does not exist at all
      # (exporter down or never scraped): sum(...) over no series returns no
      # result, so the "== 0" comparison alone would never fire.
      - alert: NoIncomingCalls
        expr: |
          sum(rate(voice_agent_calls_total[10m])) == 0
            or absent(voice_agent_calls_total)
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: "No incoming calls for 15 minutes"