Add OpenTelemetry observability to Arrtrix

- Add OTLP/gRPC observability config and resource attributes
- Instrument webhook and onboarding handlers with tracing and metrics
- Add OpenTelemetry dependencies to go.mod/go.sum
- Update NixOS modules to configure observability settings
This commit is contained in:
Chris Kruining 2026-04-16 10:13:51 +02:00
parent bbfe6867c8
commit 81f34676c4
No known key found for this signature in database
GPG key ID: EB894A3560CCCAD2
11 changed files with 307 additions and 29 deletions

View file

@ -7,11 +7,17 @@ import (
"fmt"
"net/http"
"strings"
"time"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
"maunium.net/go/mautrix/bridgev2"
"maunium.net/go/mautrix/event"
"maunium.net/go/mautrix/format"
"maunium.net/go/mautrix/id"
"sneeuwvlok/packages/arrtrix/pkg/observability"
)
const ArrWebhookPath = "/_arrtrix/webhook"
@ -69,32 +75,65 @@ func MountArr(router *http.ServeMux, bridge *bridgev2.Bridge) error {
}
func (h *ArrHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
start := time.Now()
ctx, span := observability.StartSpan(r.Context(), "arrtrix.webhook.handle", trace.WithSpanKind(trace.SpanKindServer))
defer span.End()
statusCode := http.StatusAccepted
outcome := "ok"
eventType := ""
defer func() {
observability.RecordWebhook(ctx, eventType, outcome, statusCode, time.Since(start))
}()
var body payload
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
statusCode = http.StatusBadRequest
outcome = "invalid_payload"
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
http.Error(w, "invalid webhook payload", http.StatusBadRequest)
return
}
if strings.TrimSpace(body.EventType) == "" {
statusCode = http.StatusBadRequest
outcome = "missing_event_type"
span.SetStatus(codes.Error, "missing eventType")
http.Error(w, "missing eventType", http.StatusBadRequest)
return
}
eventType = body.EventType
span.SetAttributes(
attribute.String("arrtrix.webhook.event_type", body.EventType),
attribute.String("http.method", r.Method),
attribute.String("http.route", ArrWebhookPath),
)
roomID, err := h.resolver.ResolveManagementRoom(r.Context())
roomID, err := h.resolver.ResolveManagementRoom(ctx)
if err != nil {
status := http.StatusInternalServerError
statusCode = http.StatusInternalServerError
outcome = "resolve_failed"
if errors.Is(err, ErrNoManagementRoom) || errors.Is(err, ErrAmbiguousManagementRoom) {
status = http.StatusConflict
statusCode = http.StatusConflict
outcome = "routing_conflict"
}
http.Error(w, err.Error(), status)
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
http.Error(w, err.Error(), statusCode)
return
}
if err = h.sender.SendNotice(r.Context(), roomID, renderNotice(body)); err != nil {
if err = h.sender.SendNotice(ctx, roomID, renderNotice(body)); err != nil {
statusCode = http.StatusBadGateway
outcome = "delivery_failed"
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
http.Error(w, "failed to deliver webhook", http.StatusBadGateway)
return
}
w.WriteHeader(http.StatusAccepted)
span.SetStatus(codes.Ok, "")
w.WriteHeader(statusCode)
}
type bridgeRoomResolver struct {
@ -102,8 +141,13 @@ type bridgeRoomResolver struct {
}
func (r bridgeRoomResolver) ResolveManagementRoom(ctx context.Context) (id.RoomID, error) {
ctx, span := observability.StartSpan(ctx, "arrtrix.webhook.resolve_management_room")
defer span.End()
rows, err := r.bridge.DB.Query(ctx, `SELECT mxid, management_room FROM "user" WHERE bridge_id=$1 AND management_room IS NOT NULL AND management_room <> ''`, r.bridge.ID)
if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
return "", fmt.Errorf("failed to query management rooms: %w", err)
}
defer rows.Close()
@ -113,6 +157,8 @@ func (r bridgeRoomResolver) ResolveManagementRoom(ctx context.Context) (id.RoomI
for rows.Next() {
var mxid, managementRoom string
if err = rows.Scan(&mxid, &managementRoom); err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
return "", fmt.Errorf("failed to scan management room: %w", err)
}
owners = append(owners, id.UserID(mxid))
@ -121,15 +167,22 @@ func (r bridgeRoomResolver) ResolveManagementRoom(ctx context.Context) (id.RoomI
}
}
if err = rows.Err(); err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
return "", fmt.Errorf("failed to iterate management rooms: %w", err)
}
switch len(owners) {
case 0:
span.SetStatus(codes.Error, ErrNoManagementRoom.Error())
return "", ErrNoManagementRoom
case 1:
span.SetAttributes(attribute.Int("arrtrix.management_room.count", 1))
span.SetStatus(codes.Ok, "")
return roomID, nil
default:
span.SetAttributes(attribute.Int("arrtrix.management_room.count", len(owners)))
span.SetStatus(codes.Error, ErrAmbiguousManagementRoom.Error())
return "", fmt.Errorf("%w: %s", ErrAmbiguousManagementRoom, strings.Join(convertUserIDs(owners), ", "))
}
}
@ -139,11 +192,23 @@ type bridgeNoticeSender struct {
}
func (s bridgeNoticeSender) SendNotice(ctx context.Context, roomID id.RoomID, markdown string) error {
ctx, span := observability.StartSpan(ctx, "arrtrix.webhook.send_notice")
defer span.End()
span.SetAttributes(attribute.String("matrix.room_id", roomID.String()))
if err := s.bridge.Bot.EnsureJoined(ctx, roomID); err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
return err
}
content := format.RenderMarkdown(markdown, true, false)
_, err := s.bridge.Bot.SendMessage(ctx, roomID, event.EventMessage, &event.Content{Parsed: &content}, nil)
if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
return err
}
span.SetStatus(codes.Ok, "")
return err
}