Add OpenTelemetry observability to Arrtrix

- Add OTLP/gRPC observability config and resource attributes
- Instrument webhook and onboarding handlers with tracing and metrics
- Add OpenTelemetry dependencies to go.mod/go.sum
- Update NixOS modules to configure observability settings
This commit is contained in:
Chris Kruining 2026-04-16 10:13:51 +02:00
parent bbfe6867c8
commit 81f34676c4
No known key found for this signature in database
GPG key ID: EB894A3560CCCAD2
11 changed files with 307 additions and 29 deletions

View file

@ -6,6 +6,8 @@ import (
"gopkg.in/yaml.v3"
"maunium.net/go/mautrix/bridgev2/bridgeconfig"
"sneeuwvlok/packages/arrtrix/pkg/observability"
)
type Config struct {
@ -17,6 +19,7 @@ type Config struct {
AppService bridgeconfig.AppserviceConfig `yaml:"appservice"`
Logging zeroconfig.Config `yaml:"logging"`
Observability observability.Config `yaml:"observability"`
EnvConfigPrefix string `yaml:"env_config_prefix"`
ManagementTexts bridgeconfig.ManagementRoomTexts `yaml:"management_room_texts"`
}
@ -34,6 +37,7 @@ func (c *Config) applyDefaults() {
if c.Homeserver.Software == "" {
c.Homeserver.Software = bridgeconfig.SoftwareStandard
}
c.Observability.ApplyDefaults()
}
func (c *Config) Compile() bridgeconfig.Config {

View file

@ -8,6 +8,8 @@ import (
"strings"
"github.com/rs/zerolog"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"maunium.net/go/mautrix/bridgev2"
"maunium.net/go/mautrix/bridgev2/bridgeconfig"
@ -15,6 +17,8 @@ import (
"maunium.net/go/mautrix/event"
"maunium.net/go/mautrix/format"
"maunium.net/go/mautrix/id"
"sneeuwvlok/packages/arrtrix/pkg/observability"
)
type Handler interface {
@ -110,6 +114,9 @@ func (p *Processor) Handlers() []Handler {
}
func (p *Processor) Handle(ctx context.Context, roomID id.RoomID, eventID id.EventID, user *bridgev2.User, message string, replyTo id.EventID) {
ctx, span := observability.StartSpan(ctx, "arrtrix.matrix.command")
defer span.End()
ms := &bridgev2.MessageStatus{
Step: status.MsgStepCommand,
Status: event.MessageStatusSuccess,
@ -117,6 +124,8 @@ func (p *Processor) Handle(ctx context.Context, roomID id.RoomID, eventID id.Eve
logCopy := zerolog.Ctx(ctx).With().Logger()
log := &logCopy
outcome := "success"
commandName := "unknown-command"
defer func() {
statusInfo := &bridgev2.MessageStatusEventInfo{
@ -131,16 +140,21 @@ func (p *Processor) Handle(ctx context.Context, roomID id.RoomID, eventID id.Eve
if err, ok := recovered.(error); ok {
logEvt = logEvt.Err(err)
ms.InternalError = err
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
} else {
logEvt = logEvt.Any(zerolog.ErrorFieldName, recovered)
ms.InternalError = fmt.Errorf("%v", recovered)
span.SetStatus(codes.Error, "panic")
}
logEvt.Msg("Panic in arrtrix Matrix command handler")
ms.Status = event.MessageStatusFail
ms.IsCertain = true
ms.ErrorAsMessage = true
outcome = "panic"
}
observability.RecordCommand(ctx, commandName, outcome)
p.bridge.Matrix.SendMessageStatus(ctx, ms, statusInfo)
}()
@ -149,10 +163,14 @@ func (p *Processor) Handle(ctx context.Context, roomID id.RoomID, eventID id.Eve
args = []string{"unknown-command"}
}
commandName := strings.ToLower(args[0])
commandName = strings.ToLower(args[0])
if actual, ok := p.alias[commandName]; ok {
commandName = actual
}
span.SetAttributes(
attribute.String("arrtrix.matrix.command.name", commandName),
attribute.String("matrix.room_id", roomID.String()),
)
portal, err := p.bridge.GetPortalByMXID(ctx, roomID)
if err != nil {
@ -179,6 +197,8 @@ func (p *Processor) Handle(ctx context.Context, roomID id.RoomID, eventID id.Eve
handler, ok := p.command[commandName]
if !ok {
log.Debug().Str("mx_command", commandName).Msg("Received unknown Matrix room command")
span.SetStatus(codes.Error, "unknown command")
outcome = "unknown"
commandCtx.Reply("Unknown command, use the `help` command for help.")
return
}
@ -188,6 +208,7 @@ func (p *Processor) Handle(ctx context.Context, roomID id.RoomID, eventID id.Eve
})
log.Debug().Msg("Received Matrix room command")
handler.Run(commandCtx)
span.SetStatus(codes.Ok, "")
}
func (c *Context) Reply(message string, args ...any) {

View file

@ -6,12 +6,16 @@ import (
"strings"
"github.com/rs/zerolog"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"maunium.net/go/mautrix/bridgev2"
"maunium.net/go/mautrix/bridgev2/bridgeconfig"
"maunium.net/go/mautrix/event"
"maunium.net/go/mautrix/format"
"maunium.net/go/mautrix/id"
"sneeuwvlok/packages/arrtrix/pkg/observability"
)
const handledInviteEventType = "com.arrtrix.handled_invite"
@ -23,27 +27,49 @@ func HandleBotInvite(ctx context.Context, bridge *bridgev2.Bridge, texts bridgec
return
}
ctx, span := observability.StartSpan(ctx, "arrtrix.matrix.invite")
defer span.End()
span.SetAttributes(
attribute.String("matrix.room_id", evt.RoomID.String()),
attribute.String("matrix.sender", evt.Sender.String()),
)
outcome := "ignored"
defer observability.RecordInvite(ctx, outcome)
log := zerolog.Ctx(ctx)
sender, err := bridge.GetUserByMXID(ctx, evt.Sender)
if err != nil {
outcome = "user_lookup_failed"
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
log.Err(err).Msg("Failed to load sender for bot invite")
return
}
if !sender.Permissions.Commands {
outcome = "permission_denied"
span.SetStatus(codes.Error, "sender lacks command permission")
return
}
if err = bridge.Bot.EnsureJoined(ctx, evt.RoomID); err != nil {
outcome = "join_failed"
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
log.Err(err).Msg("Failed to accept invite to room")
return
}
members, err := bridge.Matrix.GetMembers(ctx, evt.RoomID)
if err != nil {
outcome = "member_lookup_failed"
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
log.Err(err).Msg("Failed to get members of room after accepting invite")
return
}
if len(members) != 2 {
outcome = "non_management_room"
span.SetStatus(codes.Error, "invite room is not a direct management room")
return
}
@ -51,6 +77,9 @@ func HandleBotInvite(ctx context.Context, bridge *bridgev2.Bridge, texts bridgec
if assignedManagementRoom {
sender.ManagementRoom = evt.RoomID
if err = sender.Save(ctx); err != nil {
outcome = "management_room_save_failed"
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
log.Err(err).Msg("Failed to update user's management room in database")
return
}
@ -59,10 +88,15 @@ func HandleBotInvite(ctx context.Context, bridge *bridgev2.Bridge, texts bridgec
message := buildWelcomeMessage(bridge, texts, sender, assignedManagementRoom)
content := format.RenderMarkdown(message, true, false)
if _, err = bridge.Bot.SendMessage(ctx, evt.RoomID, event.EventMessage, &event.Content{Parsed: &content}, nil); err != nil {
outcome = "welcome_send_failed"
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
log.Err(err).Msg("Failed to send welcome message to room")
return
}
outcome = "welcomed"
span.SetStatus(codes.Ok, "")
evt.Type = event.Type{Type: handledInviteEventType}
}

View file

@ -56,6 +56,13 @@ logging:
- type: stdout
format: pretty-colored
observability:
# OTLP/gRPC endpoint for logs, traces, and metrics.
# Set to e.g. http://127.0.0.1:4317 to enable export.
otlp_grpc_endpoint: ""
service_name: arrtrix
resource_attributes: {}
management_room_texts:
welcome: ""
welcome_connected: ""

View file

@ -18,6 +18,7 @@ import (
"go.mau.fi/util/exerrors"
"go.mau.fi/util/exzerolog"
"go.mau.fi/util/progver"
"go.opentelemetry.io/otel/codes"
"gopkg.in/yaml.v3"
flag "maunium.net/go/mauflag"
"maunium.net/go/mautrix/appservice"
@ -31,6 +32,7 @@ import (
arrconfig "sneeuwvlok/packages/arrtrix/pkg/config"
"sneeuwvlok/packages/arrtrix/pkg/matrixcmd"
"sneeuwvlok/packages/arrtrix/pkg/observability"
"sneeuwvlok/packages/arrtrix/pkg/onboarding"
)
@ -62,6 +64,7 @@ type Main struct {
Config *bridgeconfig.Config
Matrix *matrix.Connector
Bridge *bridgev2.Bridge
OTEL *observability.Runtime
ConfigPath string
RegistrationPath string
@ -251,6 +254,8 @@ func (m *Main) loadRegistrationTokens(cfg *bridgeconfig.Config) error {
}
func (m *Main) Init() {
start := time.Now()
ctx := context.Background()
var err error
m.Log, err = m.Config.Logging.Compile()
if err != nil {
@ -265,6 +270,33 @@ func (m *Main) Init() {
os.Exit(11)
}
otelCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
m.OTEL, err = observability.Setup(otelCtx, m.PublicConfig.Observability, m.Version)
cancel()
if err != nil {
m.Log.WithLevel(zerolog.FatalLevel).Err(err).Msg("Failed to initialize observability")
os.Exit(15)
}
if hook := m.OTEL.LoggerHook(); hook != nil {
logger := m.Log.Hook(hook)
m.Log = &logger
exzerolog.SetupDefaults(m.Log)
}
ctx = m.Log.WithContext(context.Background())
ctx, span := observability.StartSpan(ctx, "arrtrix.runtime.init")
defer func() {
if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
observability.RecordStartupPhase(ctx, "init", "error", time.Since(start))
return
}
span.SetStatus(codes.Ok, "")
observability.RecordStartupPhase(ctx, "init", "ok", time.Since(start))
}()
defer span.End()
m.Log.Info().
Str("name", m.Name).
Str("version", m.ver.FormattedVersion).
@ -306,17 +338,48 @@ func (m *Main) Init() {
}
func (m *Main) Start() {
start := time.Now()
ctx := m.Log.WithContext(context.Background())
ctx, span := observability.StartSpan(ctx, "arrtrix.runtime.start")
defer func() {
if r := recover(); r != nil {
span.SetStatus(codes.Error, "panic")
observability.RecordStartupPhase(ctx, "start", "panic", time.Since(start))
span.End()
panic(r)
}
span.End()
}()
if err := m.Bridge.Start(ctx); err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
observability.RecordStartupPhase(ctx, "start", "error", time.Since(start))
m.Log.Fatal().Err(err).Msg("Failed to start bridge")
}
span.SetStatus(codes.Ok, "")
observability.RecordStartupPhase(ctx, "start", "ok", time.Since(start))
if m.PostStart != nil {
m.PostStart()
}
}
func (m *Main) Stop() {
start := time.Now()
ctx := m.Log.WithContext(context.Background())
ctx, span := observability.StartSpan(ctx, "arrtrix.runtime.stop")
defer span.End()
m.Bridge.StopWithTimeout(5 * time.Second)
span.SetStatus(codes.Ok, "")
observability.RecordStartupPhase(ctx, "stop", "ok", time.Since(start))
if m.OTEL != nil {
shutdownCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
if err := m.OTEL.Shutdown(shutdownCtx); err != nil && m.Log != nil {
m.Log.Error().Err(err).Msg("Failed to shut down observability")
}
}
}
func (m *Main) WaitForInterrupt() int {

View file

@ -7,11 +7,17 @@ import (
"fmt"
"net/http"
"strings"
"time"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
"maunium.net/go/mautrix/bridgev2"
"maunium.net/go/mautrix/event"
"maunium.net/go/mautrix/format"
"maunium.net/go/mautrix/id"
"sneeuwvlok/packages/arrtrix/pkg/observability"
)
const ArrWebhookPath = "/_arrtrix/webhook"
@ -69,32 +75,65 @@ func MountArr(router *http.ServeMux, bridge *bridgev2.Bridge) error {
}
func (h *ArrHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
start := time.Now()
ctx, span := observability.StartSpan(r.Context(), "arrtrix.webhook.handle", trace.WithSpanKind(trace.SpanKindServer))
defer span.End()
statusCode := http.StatusAccepted
outcome := "ok"
eventType := ""
defer func() {
observability.RecordWebhook(ctx, eventType, outcome, statusCode, time.Since(start))
}()
var body payload
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
statusCode = http.StatusBadRequest
outcome = "invalid_payload"
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
http.Error(w, "invalid webhook payload", http.StatusBadRequest)
return
}
if strings.TrimSpace(body.EventType) == "" {
statusCode = http.StatusBadRequest
outcome = "missing_event_type"
span.SetStatus(codes.Error, "missing eventType")
http.Error(w, "missing eventType", http.StatusBadRequest)
return
}
eventType = body.EventType
span.SetAttributes(
attribute.String("arrtrix.webhook.event_type", body.EventType),
attribute.String("http.method", r.Method),
attribute.String("http.route", ArrWebhookPath),
)
roomID, err := h.resolver.ResolveManagementRoom(r.Context())
roomID, err := h.resolver.ResolveManagementRoom(ctx)
if err != nil {
status := http.StatusInternalServerError
statusCode = http.StatusInternalServerError
outcome = "resolve_failed"
if errors.Is(err, ErrNoManagementRoom) || errors.Is(err, ErrAmbiguousManagementRoom) {
status = http.StatusConflict
statusCode = http.StatusConflict
outcome = "routing_conflict"
}
http.Error(w, err.Error(), status)
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
http.Error(w, err.Error(), statusCode)
return
}
if err = h.sender.SendNotice(r.Context(), roomID, renderNotice(body)); err != nil {
if err = h.sender.SendNotice(ctx, roomID, renderNotice(body)); err != nil {
statusCode = http.StatusBadGateway
outcome = "delivery_failed"
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
http.Error(w, "failed to deliver webhook", http.StatusBadGateway)
return
}
w.WriteHeader(http.StatusAccepted)
span.SetStatus(codes.Ok, "")
w.WriteHeader(statusCode)
}
type bridgeRoomResolver struct {
@ -102,8 +141,13 @@ type bridgeRoomResolver struct {
}
func (r bridgeRoomResolver) ResolveManagementRoom(ctx context.Context) (id.RoomID, error) {
ctx, span := observability.StartSpan(ctx, "arrtrix.webhook.resolve_management_room")
defer span.End()
rows, err := r.bridge.DB.Query(ctx, `SELECT mxid, management_room FROM "user" WHERE bridge_id=$1 AND management_room IS NOT NULL AND management_room <> ''`, r.bridge.ID)
if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
return "", fmt.Errorf("failed to query management rooms: %w", err)
}
defer rows.Close()
@ -113,6 +157,8 @@ func (r bridgeRoomResolver) ResolveManagementRoom(ctx context.Context) (id.RoomI
for rows.Next() {
var mxid, managementRoom string
if err = rows.Scan(&mxid, &managementRoom); err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
return "", fmt.Errorf("failed to scan management room: %w", err)
}
owners = append(owners, id.UserID(mxid))
@ -121,15 +167,22 @@ func (r bridgeRoomResolver) ResolveManagementRoom(ctx context.Context) (id.RoomI
}
}
if err = rows.Err(); err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
return "", fmt.Errorf("failed to iterate management rooms: %w", err)
}
switch len(owners) {
case 0:
span.SetStatus(codes.Error, ErrNoManagementRoom.Error())
return "", ErrNoManagementRoom
case 1:
span.SetAttributes(attribute.Int("arrtrix.management_room.count", 1))
span.SetStatus(codes.Ok, "")
return roomID, nil
default:
span.SetAttributes(attribute.Int("arrtrix.management_room.count", len(owners)))
span.SetStatus(codes.Error, ErrAmbiguousManagementRoom.Error())
return "", fmt.Errorf("%w: %s", ErrAmbiguousManagementRoom, strings.Join(convertUserIDs(owners), ", "))
}
}
@ -139,11 +192,23 @@ type bridgeNoticeSender struct {
}
func (s bridgeNoticeSender) SendNotice(ctx context.Context, roomID id.RoomID, markdown string) error {
ctx, span := observability.StartSpan(ctx, "arrtrix.webhook.send_notice")
defer span.End()
span.SetAttributes(attribute.String("matrix.room_id", roomID.String()))
if err := s.bridge.Bot.EnsureJoined(ctx, roomID); err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
return err
}
content := format.RenderMarkdown(markdown, true, false)
_, err := s.bridge.Bot.SendMessage(ctx, roomID, event.EventMessage, &event.Content{Parsed: &content}, nil)
if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
return err
}
span.SetStatus(codes.Ok, "")
return err
}