Monitoring & Observability
Comprehensive monitoring and observability are essential for running AgentArea in production. This guide covers metrics collection, logging, alerting, and distributed tracing for optimal system visibility.
📊 Observability Stack
AgentArea implements a complete observability solution using industry-standard tools: Prometheus for metrics, structured JSON logging, OpenTelemetry with Jaeger for distributed tracing, Prometheus alerting rules routed to PagerDuty, Slack, and email, and Grafana dashboards.
📈 Metrics Collection
Prometheus Configuration
The server-side scrape configuration below collects metrics from the AgentArea API, the MCP Manager, and the infrastructure exporters. Application-level and custom metrics are exposed by the services themselves and collected through the same jobs; a sketch of that follows the configuration.
```yaml
# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'agentarea-prod'
    region: 'us-west-2'

rule_files:
  - "agentarea_rules.yml"
  - "infrastructure_rules.yml"

scrape_configs:
  # AgentArea API metrics
  - job_name: 'agentarea-api'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_label_app]
        action: keep
        regex: agentarea-api
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)

  # MCP Manager metrics
  - job_name: 'mcp-manager'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_label_app]
        action: keep
        regex: mcp-manager

  # Infrastructure metrics
  - job_name: 'node-exporter'
    kubernetes_sd_configs:
      - role: node
    relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)

  - job_name: 'postgres-exporter'
    static_configs:
      - targets: ['postgres-exporter:9187']

  - job_name: 'redis-exporter'
    static_configs:
      - targets: ['redis-exporter:9121']
```
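The scrape jobs above only collect what the services expose. Below is a minimal sketch of how the application-level metrics referenced by the alert rules and dashboards later in this guide (agentarea_http_requests_total, agentarea_http_request_duration_seconds, agentarea_active_agents) could be exported with prometheus_client from a FastAPI service; the middleware wiring and label set are illustrative assumptions, not the actual AgentArea implementation.

```python
# Sketch: exposing the application metrics used by the alert rules and
# dashboards below (metric names match those queries; labels are assumptions).
import time

from fastapi import FastAPI, Request
from prometheus_client import Counter, Gauge, Histogram, make_asgi_app

REQUESTS = Counter(
    "agentarea_http_requests",  # exposed as agentarea_http_requests_total
    "Total HTTP requests",
    ["service", "endpoint", "method", "status"],
)
REQUEST_DURATION = Histogram(
    "agentarea_http_request_duration_seconds",
    "HTTP request latency in seconds",
    ["service", "endpoint"],
)
ACTIVE_AGENTS = Gauge(
    "agentarea_active_agents",
    "Agents currently registered, by status",
    ["status"],
)

app = FastAPI()
app.mount("/metrics", make_asgi_app())  # Prometheus scrapes this endpoint


@app.middleware("http")
async def record_request_metrics(request: Request, call_next):
    # Time every request and record it under service/endpoint labels.
    # NOTE: prefer the matched route template over the raw path in production
    # to keep label cardinality bounded.
    start = time.perf_counter()
    response = await call_next(request)
    labels = {"service": "agentarea-api", "endpoint": request.url.path}
    REQUEST_DURATION.labels(**labels).observe(time.perf_counter() - start)
    REQUESTS.labels(
        method=request.method, status=str(response.status_code), **labels
    ).inc()
    return response
```

Custom business metrics such as the active-agent gauge are updated from application code, for example `ACTIVE_AGENTS.labels(status="running").inc()` when an agent starts.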
Key targets and metrics:

- Availability (99.9% uptime target): service availability, error rate < 0.1%, response time < 200 ms (p95)
- Performance (sub-second responses): API response time, agent creation time, database query performance
- Scalability (linear scaling): requests per second, concurrent users, resource utilization
- Business (usage metrics): active agents, conversations per hour, user engagement
📝 Logging Strategy
Structured Logging
Application logs are emitted as structured JSON so they can be aggregated and filtered by level and category downstream; an example log line follows the configuration below.
```python
# Structured logging with Python
import logging

import structlog
from pythonjsonlogger import jsonlogger

# Configure structured logging
structlog.configure(
    processors=[
        structlog.stdlib.filter_by_level,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        structlog.stdlib.PositionalArgumentsFormatter(),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.UnicodeDecoder(),
        structlog.processors.JSONRenderer(),
    ],
    context_class=dict,
    logger_factory=structlog.stdlib.LoggerFactory(),
    wrapper_class=structlog.stdlib.BoundLogger,
    cache_logger_on_first_use=True,
)

# Get logger
logger = structlog.get_logger("agentarea.api")


# Usage examples
async def create_agent(agent_config: AgentConfig):
    logger.info(
        "Creating agent",
        agent_name=agent_config.name,
        model=agent_config.model,
        template=agent_config.template,
        user_id=current_user.id,
    )
    try:
        agent = await agent_service.create(agent_config)
        logger.info(
            "Agent created successfully",
            agent_id=agent.id,
            creation_time=agent.created_at,
            user_id=current_user.id,
        )
        return agent
    except Exception as e:
        logger.error(
            "Failed to create agent",
            error=str(e),
            error_type=type(e).__name__,
            agent_config=agent_config.dict(),
            user_id=current_user.id,
            exc_info=True,
        )
        raise
```
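With this configuration each entry is rendered as a single JSON object. The line below shows roughly what the "Creating agent" event would produce; the field values are made-up placeholders, and no timestamp appears unless a TimeStamper processor is added:

```json
{"event": "Creating agent", "agent_name": "support-bot", "model": "gpt-4", "template": "customer-support", "user_id": "u-42", "logger": "agentarea.api", "level": "info"}
```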
🔍 Distributed Tracing
Jaeger Integration
The configuration below sets up OpenTelemetry with a Jaeger exporter and auto-instruments the frameworks the API uses, plus manual spans around business logic. A sketch of cross-service context propagation follows it; the resulting traces can then be analyzed in the Jaeger UI.
```python
# OpenTelemetry tracing setup
from opentelemetry import trace
from opentelemetry.exporter.jaeger.thrift import JaegerExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor

# Configure tracer
trace.set_tracer_provider(TracerProvider())
tracer = trace.get_tracer(__name__)

# Configure Jaeger exporter
jaeger_exporter = JaegerExporter(
    agent_host_name="jaeger-agent",
    agent_port=6831,
)

# Add span processor
span_processor = BatchSpanProcessor(jaeger_exporter)
trace.get_tracer_provider().add_span_processor(span_processor)

# Auto-instrument frameworks
FastAPIInstrumentor.instrument_app(app)
SQLAlchemyInstrumentor().instrument(engine=engine)
RedisInstrumentor().instrument()


# Manual tracing for business logic
async def create_agent_with_tracing(agent_config: AgentConfig):
    with tracer.start_as_current_span(
        "create_agent",
        attributes={
            "agent.name": agent_config.name,
            "agent.model": agent_config.model,
            "user.id": current_user.id,
        },
    ) as span:
        # Validate configuration
        with tracer.start_as_current_span("validate_config") as child_span:
            validation_result = await validate_agent_config(agent_config)
            child_span.set_attribute("validation.result", validation_result)

        # Create agent in database
        with tracer.start_as_current_span("database_create") as child_span:
            agent = await create_agent_in_db(agent_config)
            child_span.set_attribute("agent.id", agent.id)

        # Initialize agent runtime
        with tracer.start_as_current_span("initialize_runtime") as child_span:
            await initialize_agent_runtime(agent)
            child_span.set_attribute("runtime.status", "initialized")

        span.set_attribute("operation.status", "success")
        return agent
```
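The spans above stay within a single service. For cross-service tracing, the trace context has to travel with outgoing requests so Jaeger can stitch the spans together. The sketch below shows manual W3C context propagation with httpx; the MCP Manager URL and payload shape are placeholders, and if the httpx instrumentation package is installed this injection happens automatically:

```python
# Sketch: propagate the current trace context to a downstream service so its
# spans join the same trace. URL and payload shape are illustrative only.
import httpx
from opentelemetry.propagate import inject


async def call_mcp_manager(payload: dict) -> dict:
    headers: dict[str, str] = {}
    inject(headers)  # adds W3C traceparent/tracestate headers for the active span
    async with httpx.AsyncClient() as client:
        response = await client.post(
            "http://mcp-manager:8000/tools/execute",
            json=payload,
            headers=headers,
        )
        response.raise_for_status()
        return response.json()
```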
🚨 Alerting & Notifications
Alert Rules
The Prometheus rules below define the critical alerts; resource and business alerts follow the same pattern in their own rule groups.
```yaml
# prometheus_rules.yml
groups:
  - name: agentarea_critical
    rules:
      # Service availability
      - alert: ServiceDown
        expr: up{job=~"agentarea.*"} == 0
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "AgentArea service is down"
          description: "{{ $labels.job }} has been down for more than 1 minute"
          runbook_url: "https://docs.agentarea.com/runbooks/service-down"

      # High error rate
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(agentarea_http_requests_total{status=~"5.."}[5m])) by (service)
            /
            sum(rate(agentarea_http_requests_total[5m])) by (service)
          ) > 0.05
        for: 2m
        labels:
          severity: critical
          team: development
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} for {{ $labels.service }}"

      # Database connection issues
      - alert: DatabaseConnectionFailure
        expr: agentarea_db_connections{state="failed"} > 0
        for: 30s
        labels:
          severity: critical
          team: infrastructure
        annotations:
          summary: "Database connection failures"
          description: "{{ $value }} database connections have failed"

      # High response time
      - alert: HighResponseTime
        expr: |
          histogram_quantile(0.95,
            sum(rate(agentarea_http_request_duration_seconds_bucket[5m])) by (le, service)
          ) > 2
        for: 5m
        labels:
          severity: warning
          team: performance
        annotations:
          summary: "High response time"
          description: "95th percentile response time is {{ $value }}s for {{ $labels.service }}"
```
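Rule files can be validated before rollout; promtool, which ships with Prometheus, checks both syntax and expression validity:

```bash
# Validate alerting rules before deployment
promtool check rules prometheus_rules.yml
```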
Notification Channels
- PagerDuty (critical alerts only): service outages, security incidents, data-loss events; 24/7 on-call escalation
- Slack (team notifications): warning alerts, performance issues, deployment updates; routed to team-specific channels
- Email (summary reports): daily health reports, weekly performance summaries, monthly SLA reports, executive dashboards
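How alerts reach these channels is decided by Alertmanager routing. The sketch below routes critical alerts to PagerDuty and everything else to Slack; the receiver names, Slack channel, and credential placeholders are assumptions, not values from the AgentArea deployment:

```yaml
# Sketch: Alertmanager routing for the channels above (placeholders marked)
route:
  receiver: slack-notifications          # default receiver
  group_by: ['alertname', 'service']
  routes:
    - matchers:
        - severity="critical"
      receiver: pagerduty-oncall
    - matchers:
        - severity="warning"
      receiver: slack-notifications

receivers:
  - name: pagerduty-oncall
    pagerduty_configs:
      - routing_key: '<pagerduty-integration-key>'    # placeholder
  - name: slack-notifications
    slack_configs:
      - api_url: '<slack-webhook-url>'                # placeholder
        channel: '#agentarea-alerts'                  # placeholder
        send_resolved: true
```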
📊 Grafana Dashboards
Executive Dashboard
The dashboard definition below tracks the business KPIs; dashboards for technical operations follow the same structure.
```json
{
  "dashboard": {
    "title": "AgentArea - Executive Dashboard",
    "tags": ["agentarea", "executive"],
    "time": {
      "from": "now-24h",
      "to": "now"
    },
    "panels": [
      {
        "title": "Service Availability",
        "type": "stat",
        "targets": [
          {
            "expr": "avg(up{job=~\"agentarea.*\"})",
            "legendFormat": "Uptime"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percentunit",
            "min": 0.99,
            "max": 1,
            "thresholds": {
              "steps": [
                { "color": "red", "value": 0.99 },
                { "color": "yellow", "value": 0.995 },
                { "color": "green", "value": 0.999 }
              ]
            }
          }
        }
      },
      {
        "title": "Active Users",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(agentarea_active_users)",
            "legendFormat": "Active Users"
          }
        ]
      },
      {
        "title": "Agent Activity",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(agentarea_active_agents) by (status)",
            "legendFormat": "{{status}}"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(rate(agentarea_http_requests_total{status=~\"5..\"}[5m])) / sum(rate(agentarea_http_requests_total[5m]))",
            "legendFormat": "Error Rate"
          }
        ]
      }
    ]
  }
}
```
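How this JSON reaches Grafana depends on the deployment. One common pattern is the Grafana dashboard sidecar (used by kube-prometheus-stack, for example), which loads dashboards from labeled ConfigMaps; the namespace and label below are assumptions about that setup rather than AgentArea-specific values:

```bash
# Sketch: load the dashboard through a sidecar that watches ConfigMaps
# labeled grafana_dashboard=1 in the monitoring namespace (assumed setup)
kubectl create configmap agentarea-executive-dashboard \
  --namespace monitoring \
  --from-file=executive-dashboard.json
kubectl label configmap agentarea-executive-dashboard \
  --namespace monitoring grafana_dashboard=1
```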
🔧 Troubleshooting
Common Issues
Identify Bottlenecks
Use metrics and tracing to identify the slowest components.

```bash
# Query slow endpoints
curl -G 'http://prometheus:9090/api/v1/query' \
  --data-urlencode 'query=topk(10, histogram_quantile(0.95, sum(rate(agentarea_http_request_duration_seconds_bucket[5m])) by (le, endpoint)))'
```
Optimize Database
Review and optimize database queries and indexes.

```sql
-- Find slow queries (on PostgreSQL 13+ the column is mean_exec_time)
SELECT query, mean_time, calls
FROM pg_stat_statements
ORDER BY mean_time DESC
LIMIT 10;

-- Check column statistics that inform indexing decisions
SELECT schemaname, tablename, attname, n_distinct, correlation
FROM pg_stats
WHERE tablename = 'agents';
```
Scale Resources
Adjust resource allocations and replica counts.

```bash
# Scale API replicas
kubectl scale deployment agentarea-api --replicas=5

# Update resource limits
kubectl patch deployment agentarea-api -p '{"spec":{"template":{"spec":{"containers":[{"name":"api","resources":{"limits":{"cpu":"2","memory":"4Gi"}}}]}}}}'
```
Effective monitoring and observability are crucial for maintaining a reliable AgentArea deployment. Regular review of metrics, logs, and traces helps identify issues before they impact users and ensures optimal system performance.