aster Server 内置 OpenTelemetry 支持，提供完整的分布式追踪能力。
// Server configuration with observability and tracing switched on. Traces
// are sent to the OTLP endpoint at localhost:4318 without TLS (OTLPInsecure).
config := &server.Config{
	Observability: server.ObservabilityConfig{
		Enabled: true,
		Tracing: server.TracingConfig{
			Enabled:        true,
			ServiceName:    "aster",
			ServiceVersion: "v0.11.0",
			Environment:    "production",
			OTLPEndpoint:   "localhost:4318",
			OTLPInsecure:   true,
			SamplingRate:   1.0, // sample 100% of traces
		},
	},
}

// Never discard the constructor error: if server.New fails, srv is not
// usable and calling Start on it would hide the real cause.
srv, err := server.New(config, deps)
if err != nil {
	panic(err)
}
srv.Start()
export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
export OTEL_SERVICE_NAME=aster
export OTEL_TRACES_SAMPLER=always_on
# Start Jaeger with Docker.
# Port 16686 is the web UI opened below; port 4318 is published so the
# OTLP endpoint configured for aster (localhost:4318) reaches the container.
docker run -d --name jaeger \
-p 16686:16686 \
-p 4318:4318 \
jaegertracing/all-in-one:latest
# Open the UI
open http://localhost:16686
# Start Zipkin
docker run -d --name zipkin \
-p 9411:9411 \
openzipkin/zipkin
# Open the UI
open http://localhost:9411
# otel-collector-config.yaml
#
# Receives traces over OTLP/HTTP on port 4318 and forwards them to Jaeger,
# while also printing them to the collector's own log for debugging.
receivers:
  otlp:
    protocols:
      http:
        endpoint: 0.0.0.0:4318
exporters:
  # The dedicated `jaeger` exporter has been removed from the Collector;
  # Jaeger accepts OTLP natively, so export OTLP/gRPC to its port 4317.
  otlp/jaeger:
    endpoint: jaeger:4317
    tls:
      insecure: true
  # `debug` replaces the removed `logging` exporter (and its `loglevel` key).
  debug:
    verbosity: detailed
service:
  pipelines:
    traces:
      receivers: [otlp]
      exporters: [otlp/jaeger, debug]
docker run -d --name otel-collector \
-p 4318:4318 \
-v $(pwd)/otel-collector-config.yaml:/etc/otel-collector-config.yaml \
otel/opentelemetry-collector:latest \
--config=/etc/otel-collector-config.yaml
所有 HTTP 请求自动被追踪:
Span: GET /v1/agents
├─ Duration: 45ms
├─ http.method: GET
├─ http.url: /v1/agents
├─ http.status_code: 200
└─ http.request_content_length: 0
Trace context 自动在服务间传播:
Service A → Service B → Service C
| | |
TraceID: abc123 (same across all services)
| | |
SpanID: 001 SpanID: 002 SpanID: 003
| | |
Parent: - Parent: 001 Parent: 002
import (
	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/codes"
)
// Create shows how to wrap a handler in a custom span named "agent.create",
// started from the incoming request's context using the "aster" tracer.
func (h *AgentHandler) Create(c *gin.Context) {
	ctx := c.Request.Context()

	// Start a new span; it is ended when the handler returns.
	ctx, span := otel.Tracer("aster").Start(ctx, "agent.create")
	defer span.End()

	// Attach attributes to the span.
	attrs := []attribute.KeyValue{
		attribute.String("agent.id", agentID),
		attribute.String("agent.type", agentType),
	}
	span.SetAttributes(attrs...)

	// Business logic...

	// Record an event on the span.
	span.AddEvent("Agent created successfully")
}
// On failure, record the error on the span and set the span status to Error
// (using the error text as the description) before returning the error.
if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
return err
}
// processAgent validates and then saves an agent inside a "process_agent"
// span.
//
// The context returned by tracer.Start is passed to each step. When a step
// fails, its error is recorded on the span, the span status is set to Error,
// and the error is returned unchanged; later steps are not run. It returns
// nil when both steps succeed.
func processAgent(ctx context.Context) error {
	ctx, span := tracer.Start(ctx, "process_agent")
	defer span.End()

	// Sub-operation 1: validation.
	if err := validateAgent(ctx); err != nil {
		// RecordError alone does not change the span status, so set it
		// explicitly to have the span reported as failed.
		span.RecordError(err)
		span.SetStatus(codes.Error, err.Error())
		return err
	}

	// Sub-operation 2: persistence.
	if err := saveAgent(ctx); err != nil {
		span.RecordError(err)
		span.SetStatus(codes.Error, err.Error())
		return err
	}

	return nil
}
// validateAgent is a template for a traced sub-operation: it opens a
// "validate_agent" span from the caller's context, ends it on return, and
// currently always returns nil.
func validateAgent(ctx context.Context) error {
// Automatically becomes a child span of process_agent.
ctx, span := tracer.Start(ctx, "validate_agent")
defer span.End()
// Validation logic...
return nil
}
TracingConfig{
SamplingRate: 1.0, // sample 100% of traces
}
TracingConfig{
SamplingRate: 0.1, // sample 10% of traces
}
基于 TraceID 的确定性采样:
// Already implemented in tracing.go: the configured SamplingRate is passed
// to a trace-ID-ratio sampler.
sdktrace.WithSampler(sdktrace.TraceIDRatioBased(config.SamplingRate))
在 Jaeger UI 中:
POST /v1/agents/chat
├─ 150ms total
├─ authentication (5ms)
├─ load_agent (10ms)
├─ process_message (120ms)
│ ├─ parse_input (5ms)
│ ├─ call_llm (100ms)
│ └─ format_response (15ms)
└─ save_session (15ms)
SamplingRate: 0.1 // sample only 10% of traces
// Export spans in batches: at most 512 spans per export batch, with a
// 5-second batch timeout.
sdktrace.WithBatcher(exporter,
sdktrace.WithMaxExportBatchSize(512),
sdktrace.WithBatchTimeout(5*time.Second),
)
// Add only the key attributes.
span.SetAttributes(
attribute.String("key_field", value),
)
将 traces 关联到 metrics:
# Prometheus query with exemplar
rate(http_request_duration_seconds_bucket{job="aster"}[5m])
在 Grafana 中点击 exemplar 可直接跳转到对应的 trace。
# Compose stack: the aster service plus an OpenTelemetry Collector, Jaeger,
# Prometheus and Grafana.
version: "3.8"
services:
# aster sends traces to the collector at otel-collector:4318.
aster:
image: aster:latest
ports:
- "8080:8080"
environment:
- OBSERVABILITY_TRACING_ENABLED=true
- OBSERVABILITY_TRACING_OTLP_ENDPOINT=otel-collector:4318
depends_on:
- otel-collector
# The collector loads the otel-collector-config.yaml mounted from this directory.
otel-collector:
image: otel/opentelemetry-collector:latest
command: ["--config=/etc/otel-collector-config.yaml"]
volumes:
- ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
ports:
- "4318:4318"
depends_on:
- jaeger
# Port 16686 publishes the Jaeger web UI.
jaeger:
image: jaegertracing/all-in-one:latest
ports:
- "16686:16686"
- "14250:14250"
# Prometheus reads its configuration from ./prometheus.yml.
prometheus:
image: prom/prometheus:latest
ports:
- "9090:9090"
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
# Grafana is started with anonymous authentication enabled.
grafana:
image: grafana/grafana:latest
ports:
- "3000:3000"
environment:
- GF_AUTH_ANONYMOUS_ENABLED=true
使用有意义的名称:
// ✅ Good
tracer.Start(ctx, "agent.create")
tracer.Start(ctx, "database.query")
// ❌ Bad
tracer.Start(ctx, "operation")
tracer.Start(ctx, "step1")
span.SetAttributes(
attribute.String("user.id", userID),
attribute.String("agent.type", agentType),
attribute.Int("message.length", len(message)),
)
if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, "Failed to process")
}
config.Observability.Tracing.Enabled = true
telnet localhost 4318
# Look for exporter errors: filter the aster container's logs
# (stdout and stderr) for lines containing "trace".
docker logs aster 2>&1 | grep "trace"