Developer Kit
Observability Bootstrapper
Adds structured logging with trace IDs, Prometheus metrics, OpenTelemetry tracing, and baseline alert rules to any service. Useful for making services observable by default instead of as an afterthought. Intended for engineers shipping new services, platform teams standardizing observability across a portfolio, and SREs enforcing observability acceptance criteria on new deploys. Without it, the consequence is predictable: the first production incident reveals that the service has no structured logs, no metrics, and no traces, so debugging requires a correlated investigation across three teams with partial information. Bootstrapping observability from the start costs a few hours; retrofitting it after an incident costs a week. A structured bootstrapper produces a working baseline — instrumented code, dashboards, alerts — that the team extends as their system evolves.
One-Time Purchase
$19.99
Observability Bootstrap — order-processing-service (Python/FastAPI, tier 1)
Exporters: stdout (logs/logfmt), OTLP → OpenTelemetry Collector (metrics + traces) → Grafana Cloud
observability/logger.py
import logging
import sys
from contextvars import ContextVar
from pythonjsonlogger import jsonlogger
# Request-scoped correlation IDs, set by ObservabilityMiddleware from the
# current span context; "unset" outside an active request.
_trace_id: ContextVar[str] = ContextVar("trace_id", default="unset")
_span_id: ContextVar[str] = ContextVar("span_id", default="unset")
class CorrelationFilter(logging.Filter):
    """Enrich every LogRecord with trace/span correlation IDs and the service name."""

    def filter(self, record):
        """Always return True — this filter enriches records, it never drops them."""
        for field, value in (
            ("trace_id", _trace_id.get()),
            ("span_id", _span_id.get()),
            ("service", "order-processing-service"),
        ):
            setattr(record, field, value)
        return True
def get_logger(name: str) -> logging.Logger:
    """Return a JSON stdout logger stamped with trace/span correlation fields.

    Idempotent: ``logging.getLogger(name)`` returns a shared singleton per name,
    so unconditionally attaching a handler on every call (as the original did)
    stacks handlers and emits each log line once per ``get_logger`` call. A
    guard on existing handlers makes repeat calls safe.
    """
    logger = logging.getLogger(name)
    if logger.handlers:
        # Already configured by an earlier call — adding another handler/filter
        # would duplicate every emitted line.
        return logger
    handler = logging.StreamHandler(sys.stdout)
    formatter = jsonlogger.JsonFormatter(
        fmt="%(asctime)s %(levelname)s %(name)s %(trace_id)s %(span_id)s %(message)s",
        rename_fields={"asctime": "ts", "levelname": "level", "name": "logger"},
    )
    handler.setFormatter(formatter)
    logger.addFilter(CorrelationFilter())
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    return logger
# ⚠️ Never log: customer_email, customer_name, card_last4, raw request/response bodies
observability/metrics.py
from prometheus_client import Counter, Histogram, CollectorRegistry, REGISTRY
# RED metrics — cardinality cap: status and endpoint only (never user_id, order_id)
REQUEST_COUNT = Counter(
    name="orders_http_requests_total",
    documentation="Total HTTP requests",
    # status_class is 2xx/4xx/5xx, not the raw code, to bound series count.
    labelnames=("method", "endpoint", "status_class"),
)
REQUEST_LATENCY = Histogram(
    name="orders_http_request_duration_seconds",
    documentation="HTTP request latency",
    labelnames=("method", "endpoint"),
    buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0),
)
ORDER_CREATED = Counter(
    name="orders_created_total",
    documentation="Orders successfully created",
    labelnames=("payment_method",),
)
ORDER_FAILED = Counter(
    name="orders_failed_total",
    documentation="Order creation failures",
    labelnames=("failure_reason",),
)
# failure_reason bounded to: validation_error | payment_declined | inventory_unavailable | timeout | unknown
DB_QUERY_DURATION = Histogram(
    name="orders_db_query_duration_seconds",
    documentation="Database query latency",
    labelnames=("operation", "table"),
    buckets=(0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0),
)
observability/tracing.py
import os

from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.trace.sampling import TraceIdRatioBased, ParentBased
def init_tracing(app, db_engine):
    """Configure OpenTelemetry tracing for the service.

    Installs a ParentBased(TraceIdRatioBased) sampler — 100% in development,
    TRACE_SAMPLE_RATE (default 0.1) elsewhere — exports spans over OTLP, and
    auto-instruments the FastAPI app and SQLAlchemy engine.

    Args:
        app: FastAPI application to instrument.
        db_engine: SQLAlchemy engine to instrument.
    """
    env = os.getenv("ENV", "development")
    # ParentBased keeps sampling decisions consistent end-to-end: if an
    # upstream service sampled the request, we follow its decision.
    sample_rate = 1.0 if env == "development" else float(os.getenv("TRACE_SAMPLE_RATE", "0.1"))
    provider = TracerProvider(
        sampler=ParentBased(root=TraceIdRatioBased(sample_rate)),
        # Resource.create merges these attributes with the SDK defaults
        # (telemetry.sdk.*, etc.).
        resource=Resource.create(
            {"service.name": "order-processing-service", "deployment.environment": env}
        ),
    )
    # Batch exporting avoids a network round-trip per span; a None endpoint
    # falls back to the exporter's default (localhost:4317) per OTel spec.
    provider.add_span_processor(
        BatchSpanProcessor(OTLPSpanExporter(endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")))
    )
    trace.set_tracer_provider(provider)
    FastAPIInstrumentor.instrument_app(app)
    SQLAlchemyInstrumentor().instrument(engine=db_engine)
    # Propagates W3C TraceContext across async task boundaries via contextvars automatically
observability/middleware.py
import time
from starlette.middleware.base import BaseHTTPMiddleware
from opentelemetry import trace
from observability.logger import _trace_id, _span_id
from observability.metrics import REQUEST_COUNT, REQUEST_LATENCY
class ObservabilityMiddleware(BaseHTTPMiddleware):
    """Bind trace/span IDs into logging contextvars and record RED metrics.

    Metrics are recorded in a ``finally`` block so that an unhandled exception
    from the downstream app is still counted (as a 5xx) — the original only
    recorded on the success path, which would hide crashes from the
    HighErrorRate alert entirely.
    """

    async def dispatch(self, request, call_next):
        start = time.perf_counter()
        ctx = trace.get_current_span().get_span_context()
        _trace_id.set(format(ctx.trace_id, "032x") if ctx.is_valid else "unset")
        _span_id.set(format(ctx.span_id, "016x") if ctx.is_valid else "unset")
        # If call_next raises, the response never materializes — treat as 500.
        status_code = 500
        try:
            response = await call_next(request)
            status_code = response.status_code
            return response
        finally:
            duration = time.perf_counter() - start
            # Use the route template (e.g. "/orders/{id}") to cap label
            # cardinality; fall back to "unknown" when no route matched.
            route = request.scope.get("route")
            endpoint = getattr(route, "path", "unknown")
            REQUEST_COUNT.labels(request.method, endpoint, f"{status_code // 100}xx").inc()
            REQUEST_LATENCY.labels(request.method, endpoint).observe(duration)
alerts.yaml
# ⚠️ Thresholds below are starting points — tune after observing your baseline p50/p99.
groups:
  - name: order-processing-service
    rules:
      - alert: HighErrorRate
        # sum() both sides is required: an unaggregated ratio does one-to-one
        # vector matching on identical label sets, so the 5xx-filtered numerator
        # only ever matches the 5xx denominator series and the ratio is always 1.
        expr: |
          sum(rate(orders_http_requests_total{status_class="5xx"}[5m]))
          / sum(rate(orders_http_requests_total[5m])) > 0.05
        for: 3m
        labels:
          severity: critical
          service: order-processing-service
        annotations:
          summary: "Error rate above 5% for 3 minutes"
          runbook: "https://wiki.internal/runbooks/order-processing/high-error-rate"
      - alert: P99LatencyHigh
        # histogram_quantile needs buckets aggregated by `le`; without
        # sum by (le) it computes a separate quantile per method/endpoint
        # series instead of the service-wide p99.
        expr: |
          histogram_quantile(0.99,
            sum by (le) (rate(orders_http_request_duration_seconds_bucket[5m]))) > 2.0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "p99 latency > 2s — investigate DB or downstream dependencies"
      - alert: OrderFailureSpike
        # sum() across failure_reason so the alert matches its summary (total
        # failure rate), not a per-reason rate.
        expr: sum(rate(orders_failed_total[5m])) > 0.5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Order failure rate > 0.5/sec — check payment gateway and inventory service"
      - alert: DBQueryLatencyHigh
        expr: |
          histogram_quantile(0.95,
            sum by (le) (rate(orders_db_query_duration_seconds_bucket[5m]))) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "DB p95 query time > 500ms — check for slow queries or connection pool exhaustion"
      - alert: ServiceDown
        expr: up{job="order-processing-service"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "order-processing-service scrape target is down"
handlers/create_order.py — Instrumented Example
import asyncio
from fastapi import APIRouter, HTTPException
from opentelemetry import trace
from observability.logger import get_logger
from observability.metrics import ORDER_CREATED, ORDER_FAILED, DB_QUERY_DURATION
router = APIRouter()
logger = get_logger(__name__)  # JSON logger with trace/span correlation fields
tracer = trace.get_tracer("order-processing-service")
@router.post("/orders")
async def create_order(payload: OrderRequest):
    """Create an order, emitting a correlated span, metrics, and structured logs.

    Returns:
        ``{"order_id": ...}`` on success.

    Raises:
        HTTPException: 402 when the payment is declined, 500 on any other failure.
    """
    with tracer.start_as_current_span("create_order") as span:
        span.set_attribute("order.payment_method", payload.payment_method)
        span.set_attribute("order.item_count", len(payload.items))
        # ✗ Do NOT set: span.set_attribute("order.customer_email", ...)
        logger.info("order_creation_started", extra={
            "payment_method": payload.payment_method,
            "item_count": len(payload.items),
        })
        try:
            with tracer.start_as_current_span("db.insert_order"):
                with DB_QUERY_DURATION.labels("insert", "orders").time():
                    order = await db.insert_order(payload)
            ORDER_CREATED.labels(payment_method=payload.payment_method).inc()
            # Only a prefix of the id is logged — full order ids stay out of logs.
            logger.info("order_creation_succeeded", extra={"order_id_prefix": str(order.id)[:8]})
            return {"order_id": order.id}
        except PaymentDeclinedError:
            ORDER_FAILED.labels(failure_reason="payment_declined").inc()
            span.set_status(trace.StatusCode.ERROR, "payment declined")
            logger.warning("order_creation_failed", extra={"reason": "payment_declined"})
            raise HTTPException(status_code=402, detail="Payment declined")
        except Exception as e:
            ORDER_FAILED.labels(failure_reason="unknown").inc()
            span.record_exception(e)
            # record_exception alone does not mark the span as failed — without
            # an explicit ERROR status the trace shows a "successful" span for a 500.
            span.set_status(trace.StatusCode.ERROR, type(e).__name__)
            logger.error("order_creation_error", extra={"error_type": type(e).__name__})
            # Chain the cause so the original traceback survives the re-raise.
            raise HTTPException(status_code=500, detail="Internal error") from e
dashboard.json (excerpt — full file: 847 lines)
{
"title": "order-processing-service",
"uid": "order-proc-v1",
"panels": [
{
"title": "Request Rate (RPS)",
"type": "timeseries",
"targets": [{"expr": "sum(rate(orders_http_requests_total[1m])) by (endpoint)"}]
},
{
"title": "Error Rate %",
"type": "timeseries",
"targets": [{"expr": "sum(rate(orders_http_requests_total{status_class='5xx'}[5m])) / sum(rate(orders_http_requests_total[5m])) * 100"}]
},
{
"title": "p50 / p95 / p99 Latency",
"type": "timeseries",
"targets": [
{"expr": "histogram_quantile(0.50, rate(orders_http_request_duration_seconds_bucket[5m]))", "legendFormat": "p50"},
{"expr": "histogram_quantile(0.95, rate(orders_http_request_duration_seconds_bucket[5m]))", "legendFormat": "p95"},
{"expr": "histogram_quantile(0.99, rate(orders_http_request_duration_seconds_bucket[5m]))", "legendFormat": "p99"}
]
},
{
"title": "Order Outcomes",
"type": "timeseries",
"targets": [
{"expr": "rate(orders_created_total[5m])", "legendFormat": "created"},
{"expr": "sum by (failure_reason) (rate(orders_failed_total[5m]))", "legendFormat": "failed — {{failure_reason}}"}
]
}
]
}
View full sample →
All sales final. No refunds on digital products.
Includes support for Claude Code, Codex, and OpenClaw in the same license.
What You Get With This Skill
Adds structured logging with trace IDs, Prometheus metrics, OpenTelemetry tracing, and baseline alert rules to any service. Useful for making services observable by default instead of as an afterthought.
All ClearPoint Nexus Skills Include
- Production-ready workflow packaging for three supported platforms.
- Reusable structure designed for repeatable operator tasks.
- Clear deliverable format, not just raw prompt output.
Related Skills
$19.99
One-time license
$19.99
One-time license
$19.99
One-time license