Add OpenTelemetry observability to Arrtrix

- Add OTLP/gRPC observability config and resource attributes
- Instrument webhook and onboarding handlers with tracing and metrics
- Add OpenTelemetry dependencies to go.mod/go.sum
- Update NixOS modules to configure observability settings
This commit is contained in:
Chris Kruining 2026-04-16 10:13:51 +02:00
parent bbfe6867c8
commit 81f34676c4
No known key found for this signature in database
GPG key ID: EB894A3560CCCAD2
11 changed files with 307 additions and 29 deletions

View file

@ -18,6 +18,7 @@ import (
"go.mau.fi/util/exerrors"
"go.mau.fi/util/exzerolog"
"go.mau.fi/util/progver"
"go.opentelemetry.io/otel/codes"
"gopkg.in/yaml.v3"
flag "maunium.net/go/mauflag"
"maunium.net/go/mautrix/appservice"
@ -31,6 +32,7 @@ import (
arrconfig "sneeuwvlok/packages/arrtrix/pkg/config"
"sneeuwvlok/packages/arrtrix/pkg/matrixcmd"
"sneeuwvlok/packages/arrtrix/pkg/observability"
"sneeuwvlok/packages/arrtrix/pkg/onboarding"
)
@ -62,6 +64,7 @@ type Main struct {
Config *bridgeconfig.Config
Matrix *matrix.Connector
Bridge *bridgev2.Bridge
OTEL *observability.Runtime
ConfigPath string
RegistrationPath string
@ -251,6 +254,8 @@ func (m *Main) loadRegistrationTokens(cfg *bridgeconfig.Config) error {
}
func (m *Main) Init() {
start := time.Now()
ctx := context.Background()
var err error
m.Log, err = m.Config.Logging.Compile()
if err != nil {
@ -265,6 +270,33 @@ func (m *Main) Init() {
os.Exit(11)
}
otelCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
m.OTEL, err = observability.Setup(otelCtx, m.PublicConfig.Observability, m.Version)
cancel()
if err != nil {
m.Log.WithLevel(zerolog.FatalLevel).Err(err).Msg("Failed to initialize observability")
os.Exit(15)
}
if hook := m.OTEL.LoggerHook(); hook != nil {
logger := m.Log.Hook(hook)
m.Log = &logger
exzerolog.SetupDefaults(m.Log)
}
ctx = m.Log.WithContext(context.Background())
ctx, span := observability.StartSpan(ctx, "arrtrix.runtime.init")
defer func() {
if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
observability.RecordStartupPhase(ctx, "init", "error", time.Since(start))
return
}
span.SetStatus(codes.Ok, "")
observability.RecordStartupPhase(ctx, "init", "ok", time.Since(start))
}()
defer span.End()
m.Log.Info().
Str("name", m.Name).
Str("version", m.ver.FormattedVersion).
@ -306,17 +338,48 @@ func (m *Main) Init() {
}
// Start launches the bridge and reports the outcome of the "start" startup
// phase as both a span ("arrtrix.runtime.start") and a startup-phase metric.
//
// A failure from Bridge.Start is fatal: the error is logged at fatal level and
// the process exits with status 1. Because os.Exit skips deferred calls, the
// failure path ends the span and shuts the observability runtime down
// explicitly so the error span and metric get a chance to be exported before
// the process dies.
//
// A panic raised while starting is recorded on the span and in the metric and
// then re-raised unchanged.
func (m *Main) Start() {
	start := time.Now()
	ctx := m.Log.WithContext(context.Background())
	ctx, span := observability.StartSpan(ctx, "arrtrix.runtime.start")
	defer func() {
		if r := recover(); r != nil {
			span.SetStatus(codes.Error, "panic")
			observability.RecordStartupPhase(ctx, "start", "panic", time.Since(start))
			span.End()
			panic(r)
		}
		span.End()
	}()

	if err := m.Bridge.Start(ctx); err != nil {
		span.RecordError(err)
		span.SetStatus(codes.Error, err.Error())
		observability.RecordStartupPhase(ctx, "start", "error", time.Since(start))
		// The deferred End never runs once we call os.Exit, so close the span
		// here while the exporter is still alive.
		span.End()

		// WithLevel(FatalLevel) logs at fatal severity without exiting, which
		// leaves room to flush telemetry before terminating.
		m.Log.WithLevel(zerolog.FatalLevel).Err(err).Msg("Failed to start bridge")
		if m.OTEL != nil {
			flushCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
			if shutdownErr := m.OTEL.Shutdown(flushCtx); shutdownErr != nil {
				m.Log.Error().Err(shutdownErr).Msg("Failed to shut down observability")
			}
			cancel()
		}
		// Same exit status that zerolog's Fatal() would have used.
		os.Exit(1)
	}

	span.SetStatus(codes.Ok, "")
	observability.RecordStartupPhase(ctx, "start", "ok", time.Since(start))
	if m.PostStart != nil {
		m.PostStart()
	}
}
// Stop shuts the bridge down (allowing it up to 5 seconds) and then shuts
// down the observability runtime (allowing it up to 10 seconds).
//
// The "arrtrix.runtime.stop" span and its startup-phase metric cover only the
// bridge shutdown. The span is ended before the observability runtime is shut
// down; ending it afterwards would hand it to a pipeline that has already been
// closed, and the span would be lost.
func (m *Main) Stop() {
	// Scope the span to the bridge-stop phase so its deferred End runs before
	// the observability shutdown below.
	func() {
		start := time.Now()
		ctx := m.Log.WithContext(context.Background())
		ctx, span := observability.StartSpan(ctx, "arrtrix.runtime.stop")
		defer span.End()

		m.Bridge.StopWithTimeout(5 * time.Second)
		span.SetStatus(codes.Ok, "")
		observability.RecordStartupPhase(ctx, "stop", "ok", time.Since(start))
	}()

	if m.OTEL == nil {
		return
	}
	shutdownCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	if err := m.OTEL.Shutdown(shutdownCtx); err != nil && m.Log != nil {
		m.Log.Error().Err(err).Msg("Failed to shut down observability")
	}
}
func (m *Main) WaitForInterrupt() int {