An application running in production without observability is flying blind. When something breaks at 3am, you need to know: what error occurred, in which code path, for which user, with what request data, and how long it had been failing before the alert fired. Structured logging, metrics, and error tracking are the three pillars of observability. For the blog application, integrating these takes an afternoon and pays dividends for the lifetime of the product.
Structured JSON Logging
# app/logging_config.py
import logging, structlog, uuid
from starlette.middleware.base import BaseHTTPMiddleware
def configure_logging():
    """Configure structlog to emit one JSON object per log line.

    Call once at application startup, before any logger is used.
    """
    # structlog.stdlib.LoggerFactory routes events through the standard
    # `logging` module, so stdlib logging must be configured too: the default
    # last-resort handler only emits WARNING and above, which would silently
    # drop every log.info(...) call.
    logging.basicConfig(format="%(message)s", level=logging.INFO)
    structlog.configure(
        processors=[
            structlog.contextvars.merge_contextvars,  # request-scoped fields
            structlog.stdlib.add_logger_name,
            structlog.stdlib.add_log_level,
            structlog.processors.TimeStamper(fmt="iso"),
            structlog.processors.JSONRenderer(),  # output as JSON
        ],
        wrapper_class=structlog.stdlib.BoundLogger,
        context_class=dict,
        logger_factory=structlog.stdlib.LoggerFactory(),
    )
class RequestIdMiddleware(BaseHTTPMiddleware):
    """Adds a unique request_id to every log entry for this request."""

    async def dispatch(self, request, call_next):
        # Honour an upstream-supplied correlation id; otherwise mint one.
        incoming = request.headers.get("X-Request-Id")
        rid = incoming if incoming else str(uuid.uuid4())

        # Drop any context left over from a previous request on this task,
        # then bind the fields every log line of this request should carry.
        structlog.contextvars.clear_contextvars()
        structlog.contextvars.bind_contextvars(
            request_id=rid,
            path=request.url.path,
            method=request.method,
        )

        response = await call_next(request)
        # Echo the id back so clients can quote it in bug reports.
        response.headers["X-Request-Id"] = rid
        return response
# Usage in any module:
# NOTE(review): `post` and `user` below are illustrative objects from the
# surrounding handler's scope, not module-level names.
log = structlog.get_logger()
# Event name first, then structured key/value fields that become JSON keys.
log.info("post.created", post_id=post.id, user_id=user.id, title=post.title)
Structured JSON logs can be queried like a database — a log aggregation tool can answer queries such as SELECT * FROM logs WHERE user_id = 42 AND level = "error". Plain text logs require fragile regex parsing to extract fields. Always use structured logging for production applications — it costs no extra development time and makes debugging dramatically faster.
Prometheus Metrics
from prometheus_client import Counter, Histogram, Gauge, make_asgi_app
from starlette.routing import Mount
# Define metrics.

# Request counter partitioned by method / endpoint / status code.
# NOTE(review): the `endpoint` label must stay low-cardinality — label it
# with route templates, not raw paths containing ids.
REQUEST_COUNT = Counter(
    "http_requests_total",
    "Total HTTP requests",
    ["method", "endpoint", "status_code"],
)

# Latency histogram; buckets span 10ms–2.5s, suitable for a web API.
# prometheus_client appends a +Inf bucket automatically.
REQUEST_LATENCY = Histogram(
    "http_request_duration_seconds",
    "HTTP request latency",
    ["endpoint"],
    buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5],
)

# Gauge tracking currently-open WebSocket connections (inc on connect,
# dec on disconnect — done wherever connections are managed).
ACTIVE_WS_CONNECTIONS = Gauge(
    "websocket_active_connections",
    "Active WebSocket connections",
)
# Middleware to record metrics
class MetricsMiddleware(BaseHTTPMiddleware):
    """Records a request counter and latency histogram for every request.

    Labels the metrics with the matched route template (e.g. "/posts/{id}")
    instead of the raw URL path, keeping Prometheus label cardinality bounded.
    """

    async def dispatch(self, request, call_next):
        import time  # local import keeps the snippet self-contained

        start = time.perf_counter()
        response = await call_next(request)
        duration = time.perf_counter() - start

        # Using the raw path (e.g. /posts/123) would mint one label value per
        # resource id and blow up metric cardinality. The router stores the
        # matched route in the (shared) scope dict during call_next; fall back
        # to the raw path for unmatched requests (404s).
        route = request.scope.get("route")
        endpoint = route.path if route is not None else request.url.path

        REQUEST_COUNT.labels(
            method=request.method,
            endpoint=endpoint,
            status_code=response.status_code,
        ).inc()
        REQUEST_LATENCY.labels(endpoint=endpoint).observe(duration)
        return response
# Mount /metrics endpoint.
# NOTE(review): this endpoint must not be publicly reachable — restrict it
# to internal access at the proxy layer or require a bearer token.
metrics_app = make_asgi_app()
app.mount("/metrics", metrics_app)
Sentry Error Tracking
# Backend: app/main.py
import sentry_sdk
from sentry_sdk.integrations.fastapi import FastApiIntegration
from sentry_sdk.integrations.sqlalchemy import SqlalchemyIntegration
# Initialise Sentry before the app serves traffic so startup errors are
# captured as well.
sentry_sdk.init(
    dsn=settings.sentry_dsn,
    environment=settings.environment,
    integrations=[FastApiIntegration(), SqlalchemyIntegration()],
    # Sample 10% of transactions in production; trace everything elsewhere.
    traces_sample_rate=0.1 if settings.is_production else 1.0,
    # Do not send PII
    send_default_pii=False,
)
// Frontend: src/main.jsx
import * as Sentry from "@sentry/react";
// Initialise Sentry as early as possible in the bundle so errors thrown
// during app bootstrap are still reported.
Sentry.init({
  dsn: import.meta.env.VITE_SENTRY_DSN,
  environment: import.meta.env.MODE, // Vite mode, e.g. "development"/"production"
  integrations: [
    Sentry.browserTracingIntegration(),
    Sentry.replayIntegration({ maskAllText: true }), // mask user text for privacy
  ],
  tracesSampleRate: 0.1, // sample 10% of transactions
  replaysOnErrorSampleRate: 1.0, // replay 100% of error sessions
});
Health Check Endpoint
from fastapi import APIRouter
from fastapi import Depends
from fastapi.responses import JSONResponse
from sqlalchemy import text
from sqlalchemy.orm import Session
import redis.asyncio as aioredis
# Router exposing the health probe below; include it in the main app.
health_router = APIRouter()
@health_router.get("/api/health")
async def health_check(db: Session = Depends(get_db)):
    """Readiness probe: verifies database and Redis connectivity.

    Returns 200 with {"status": "ok"} when every dependency responds, and
    503 with {"status": "degraded"} otherwise; the per-dependency `checks`
    map shows which component failed.
    """
    checks: dict[str, str] = {}

    # Database connectivity — a trivial round-trip query.
    try:
        db.execute(text("SELECT 1"))
        checks["database"] = "ok"
    except Exception as e:  # broad on purpose: any failure means "not ready"
        checks["database"] = f"error: {e}"

    # Redis connectivity. Close the client afterwards: the previous version
    # created a new connection on every probe and never released it, leaking
    # one connection per health check.
    redis = None
    try:
        redis = aioredis.from_url(settings.redis_url)
        await redis.ping()
        checks["redis"] = "ok"
    except Exception as e:
        checks["redis"] = f"error: {e}"
    finally:
        if redis is not None:
            await redis.aclose()  # redis-py >= 5; use close() on older versions

    all_ok = all(v == "ok" for v in checks.values())
    return JSONResponse(
        {"status": "ok" if all_ok else "degraded", "checks": checks},
        status_code=200 if all_ok else 503,
    )
Common Mistakes
Mistake 1 — Logging sensitive data (PII, tokens)
❌ Wrong: log.info("login", email=user.email, token=access_token)
✅ Correct: log.info("login", user_id=user.id) — log IDs not PII, never tokens.
Mistake 2 — /metrics endpoint publicly accessible
❌ Wrong — Prometheus metrics expose internal application details (error rates, slow queries) to anyone on the internet.
✅ Correct — restrict /metrics to internal Nginx access only, or require a bearer token.