From 6569468e45c63d5336491b2c0a01fd2569a46c81 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Thu, 15 Jun 2023 11:57:22 +0100 Subject: [PATCH 001/156] Add structures for unit testing handler2 --- sync2/handler2/handler.go | 11 +-- sync2/handler2/handler_test.go | 164 +++++++++++++++++++++++++++++++++ sync2/poller.go | 9 +- 3 files changed, 177 insertions(+), 7 deletions(-) create mode 100644 sync2/handler2/handler_test.go diff --git a/sync2/handler2/handler.go b/sync2/handler2/handler.go index c95a276c..60e8fece 100644 --- a/sync2/handler2/handler.go +++ b/sync2/handler2/handler.go @@ -4,12 +4,13 @@ import ( "context" "encoding/json" "fmt" - "github.com/jmoiron/sqlx" - "github.com/matrix-org/sliding-sync/sqlutil" "hash/fnv" "os" "sync" + "github.com/jmoiron/sqlx" + "github.com/matrix-org/sliding-sync/sqlutil" + "github.com/getsentry/sentry-go" "github.com/matrix-org/sliding-sync/internal" @@ -30,12 +31,11 @@ var logger = zerolog.New(os.Stdout).With().Timestamp().Logger().Output(zerolog.C // processing v2 data (as a sync2.V2DataReceiver) and publishing updates (pubsub.Payload to V2Listeners); // and receiving and processing EnsurePolling events. type Handler struct { - pMap *sync2.PollerMap + pMap sync2.IPollerMap v2Store *sync2.Storage Store *state.Storage v2Pub pubsub.Notifier v3Sub *pubsub.V3Sub - client sync2.Client unreadMap map[string]struct { Highlight int Notif int @@ -48,13 +48,12 @@ type Handler struct { } func NewHandler( - connStr string, pMap *sync2.PollerMap, v2Store *sync2.Storage, store *state.Storage, client sync2.Client, + pMap sync2.IPollerMap, v2Store *sync2.Storage, store *state.Storage, pub pubsub.Notifier, sub pubsub.Listener, enablePrometheus bool, ) (*Handler, error) { h := &Handler{ pMap: pMap, v2Store: v2Store, - client: client, Store: store, subSystem: "poller", unreadMap: make(map[string]struct { diff --git a/sync2/handler2/handler_test.go b/sync2/handler2/handler_test.go new file mode 100644 index 00000000..13a2597f --- /dev/null +++ b/sync2/handler2/handler_test.go @@ -0,0 +1,164 @@ +package handler2_test + +import ( + "os" + "reflect" + "sync" + "testing" + "time" + + "github.com/matrix-org/sliding-sync/pubsub" + "github.com/matrix-org/sliding-sync/state" + "github.com/matrix-org/sliding-sync/sync2" + "github.com/matrix-org/sliding-sync/sync2/handler2" + "github.com/matrix-org/sliding-sync/testutils" + "github.com/rs/zerolog" +) + +var postgresURI string + +func TestMain(m *testing.M) { + postgresURI = testutils.PrepareDBConnectionString() + exitCode := m.Run() + os.Exit(exitCode) +} + +type pollInfo struct { + pid sync2.PollerID + accessToken string + v2since string + isStartup bool +} + +type mockPollerMap struct { + calls []pollInfo +} + +func (p *mockPollerMap) NumPollers() int { + return 0 +} +func (p *mockPollerMap) Terminate() {} + +func (p *mockPollerMap) EnsurePolling(pid sync2.PollerID, accessToken, v2since string, isStartup bool, logger zerolog.Logger) { + p.calls = append(p.calls, pollInfo{ + pid: pid, + accessToken: accessToken, + v2since: v2since, + isStartup: isStartup, + }) +} +func (p *mockPollerMap) assertCallExists(t *testing.T, pi pollInfo) { + for _, c := range p.calls { + if reflect.DeepEqual(pi, c) { + return + } + } + t.Fatalf("assertCallExists: did not find %+v", pi) +} + +type mockPub struct { + calls []pubsub.Payload + mu *sync.Mutex + waiters map[string][]chan struct{} +} + +func newMockPub() *mockPub { + return &mockPub{ + mu: &sync.Mutex{}, + waiters: make(map[string][]chan struct{}), + } +} + +// Notify chanName that there is 
a new payload p. Return an error if we failed to send the notification. +func (p *mockPub) Notify(chanName string, payload pubsub.Payload) error { + p.calls = append(p.calls, payload) + p.mu.Lock() + for _, ch := range p.waiters[payload.Type()] { + close(ch) + } + p.waiters[payload.Type()] = nil // don't re-notify for 2nd+ payload + p.mu.Unlock() + return nil +} + +func (p *mockPub) WaitForPayloadType(t string) chan struct{} { + ch := make(chan struct{}) + p.mu.Lock() + p.waiters[t] = append(p.waiters[t], ch) + p.mu.Unlock() + return ch +} + +func (p *mockPub) DoWait(t *testing.T, errMsg string, ch chan struct{}) { + select { + case <-ch: + return + case <-time.After(time.Second): + t.Fatalf("DoWait: timed out waiting: %s", errMsg) + } +} + +// Close is called when we should stop listening. +func (p *mockPub) Close() error { return nil } + +type mockSub struct{} + +// Begin listening on this channel with this callback starting from this position. Blocks until Close() is called. +func (s *mockSub) Listen(chanName string, fn func(p pubsub.Payload)) error { return nil } + +// Close the listener. No more callbacks should fire. +func (s *mockSub) Close() error { return nil } + +func assertNoError(t *testing.T, err error) { + t.Helper() + if err == nil { + return + } + t.Fatalf("assertNoError: %v", err) +} + +// Test that if you call EnsurePolling you get back V2InitialSyncComplete down pubsub and the poller +// map is called correctly +func TestHandlerFreshEnsurePolling(t *testing.T) { + store := state.NewStorage(postgresURI) + v2Store := sync2.NewStore(postgresURI, "secret") + pMap := &mockPollerMap{} + pub := newMockPub() + sub := &mockSub{} + h, err := handler2.NewHandler(pMap, v2Store, store, pub, sub, false) + assertNoError(t, err) + alice := "@alice:localhost" + deviceID := "ALICE" + token := "aliceToken" + + // the device and token needs to already exist prior to EnsurePolling + err = v2Store.DevicesTable.InsertDevice(alice, deviceID) + assertNoError(t, err) + tok, err := v2Store.TokensTable.Insert(token, alice, deviceID, time.Now()) + assertNoError(t, err) + + payloadInitialSyncComplete := pubsub.V2InitialSyncComplete{ + UserID: alice, + DeviceID: deviceID, + } + ch := pub.WaitForPayloadType(payloadInitialSyncComplete.Type()) + // ask the handler to start polling + h.EnsurePolling(&pubsub.V3EnsurePolling{ + UserID: alice, + DeviceID: deviceID, + AccessTokenHash: tok.AccessTokenHash, + }) + pub.DoWait(t, "didn't see V2InitialSyncComplete", ch) + + // make sure we polled with the token i.e it did a db hit + pMap.assertCallExists(t, pollInfo{ + pid: sync2.PollerID{ + UserID: alice, + DeviceID: deviceID, + }, + accessToken: token, + v2since: "", + isStartup: false, + }) + +} diff --git a/sync2/poller.go b/sync2/poller.go index 0d8baf61..f478daba 100644 --- a/sync2/poller.go +++ b/sync2/poller.go @@ -4,12 +4,13 @@ import ( "context" "encoding/json" "fmt" - "github.com/getsentry/sentry-go" "runtime/debug" "sync" "sync/atomic" "time" + "github.com/getsentry/sentry-go" + "github.com/matrix-org/sliding-sync/internal" "github.com/prometheus/client_golang/prometheus" "github.com/rs/zerolog" @@ -55,6 +56,12 @@ type V2DataReceiver interface { OnExpiredToken(ctx context.Context, accessTokenHash, userID, deviceID string) } +type IPollerMap interface { + EnsurePolling(pid PollerID, accessToken, v2since string, isStartup bool, logger zerolog.Logger) + NumPollers() int + Terminate() +} + // PollerMap is a map of device ID to Poller type PollerMap struct { v2Client Client From 
eefbe334b40aaee4049b674b88c2c1b1bf29d2f8 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Thu, 15 Jun 2023 12:05:37 +0100 Subject: [PATCH 002/156] Unbreak build --- v3.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v3.go b/v3.go index a2860640..acb87575 100644 --- a/v3.go +++ b/v3.go @@ -86,7 +86,7 @@ func Setup(destHomeserver, postgresURI, secret string, opts Opts) (*handler2.Han pMap := sync2.NewPollerMap(v2Client, opts.AddPrometheusMetrics) // create v2 handler - h2, err := handler2.NewHandler(postgresURI, pMap, storev2, store, v2Client, pubSub, pubSub, opts.AddPrometheusMetrics) + h2, err := handler2.NewHandler(pMap, storev2, store, pubSub, pubSub, opts.AddPrometheusMetrics) if err != nil { panic(err) } From c28aaf51d7bdf8b5a2ea2431d2669b82c1293095 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Fri, 16 Jun 2023 14:33:55 +0100 Subject: [PATCH 003/156] onIncomingRequest: be more careful with ctxs not clear to me if this will explain the discrepancy between metrics and traces. Let's try it. --- sync3/handler/connstate.go | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sync3/handler/connstate.go b/sync3/handler/connstate.go index 7c92205c..6bac6f30 100644 --- a/sync3/handler/connstate.go +++ b/sync3/handler/connstate.go @@ -178,39 +178,39 @@ func (s *ConnState) OnIncomingRequest(ctx context.Context, cid sync3.ConnID, req // onIncomingRequest is a callback which fires when the client makes a request to the server. Whilst each request may // be on their own goroutine, the requests are linearised for us by Conn so it is safe to modify ConnState without // additional locking mechanisms. -func (s *ConnState) onIncomingRequest(ctx context.Context, req *sync3.Request, isInitial bool) (*sync3.Response, error) { +func (s *ConnState) onIncomingRequest(reqCtx context.Context, req *sync3.Request, isInitial bool) (*sync3.Response, error) { start := time.Now() // ApplyDelta works fine if s.muxedReq is nil var delta *sync3.RequestDelta s.muxedReq, delta = s.muxedReq.ApplyDelta(req) - internal.Logf(ctx, "connstate", "new subs=%v unsubs=%v num_lists=%v", len(delta.Subs), len(delta.Unsubs), len(delta.Lists)) + internal.Logf(reqCtx, "connstate", "new subs=%v unsubs=%v num_lists=%v", len(delta.Subs), len(delta.Unsubs), len(delta.Lists)) for key, l := range delta.Lists { listData := "" if l.Curr != nil { listDataBytes, _ := json.Marshal(l.Curr) listData = string(listDataBytes) } - internal.Logf(ctx, "connstate", "list[%v] prev_empty=%v curr=%v", key, l.Prev == nil, listData) + internal.Logf(reqCtx, "connstate", "list[%v] prev_empty=%v curr=%v", key, l.Prev == nil, listData) } // work out which rooms we'll return data for and add their relevant subscriptions to the builder // for it to mix together builder := NewRoomsBuilder() // works out which rooms are subscribed to but doesn't pull room data - s.buildRoomSubscriptions(ctx, builder, delta.Subs, delta.Unsubs) + s.buildRoomSubscriptions(reqCtx, builder, delta.Subs, delta.Unsubs) // works out how rooms get moved about but doesn't pull room data - respLists := s.buildListSubscriptions(ctx, builder, delta.Lists) + respLists := s.buildListSubscriptions(reqCtx, builder, delta.Lists) // pull room data and set changes on the response response := &sync3.Response{ - Rooms: s.buildRooms(ctx, builder.BuildSubscriptions()), // pull room data + Rooms: s.buildRooms(reqCtx, builder.BuildSubscriptions()), // pull room data Lists: respLists, } // Handle extensions AFTER processing lists as extensions may 
need to know which rooms the client // is being notified about (e.g. for room account data) - ctx, region := internal.StartSpan(ctx, "extensions") - response.Extensions = s.extensionsHandler.Handle(ctx, s.muxedReq.Extensions, extensions.Context{ + extCtx, region := internal.StartSpan(reqCtx, "extensions") + response.Extensions = s.extensionsHandler.Handle(extCtx, s.muxedReq.Extensions, extensions.Context{ UserID: s.userID, DeviceID: s.deviceID, RoomIDToTimeline: response.RoomIDsToTimelineEventIDs(), @@ -228,8 +228,8 @@ func (s *ConnState) onIncomingRequest(ctx context.Context, req *sync3.Request, i } // do live tracking if we have nothing to tell the client yet - ctx, region = internal.StartSpan(ctx, "liveUpdate") - s.live.liveUpdate(ctx, req, s.muxedReq.Extensions, isInitial, response) + updateCtx, region := internal.StartSpan(reqCtx, "liveUpdate") + s.live.liveUpdate(updateCtx, req, s.muxedReq.Extensions, isInitial, response) region.End() // counts are AFTER events are applied, hence after liveUpdate @@ -242,7 +242,7 @@ func (s *ConnState) onIncomingRequest(ctx context.Context, req *sync3.Request, i // Add membership events for users sending typing notifications. We do this after live update // and initial room loading code so we LL room members in all cases. if response.Extensions.Typing != nil && response.Extensions.Typing.HasData(isInitial) { - s.lazyLoadTypingMembers(ctx, response) + s.lazyLoadTypingMembers(reqCtx, response) } return response, nil } From 5f3401659fe82301a4afd0550a3127d85fbfe150 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 19 Jun 2023 11:23:31 +0100 Subject: [PATCH 004/156] Treat 403s on /sync as 401s --- sync2/poller.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sync2/poller.go b/sync2/poller.go index f478daba..04b8ca0c 100644 --- a/sync2/poller.go +++ b/sync2/poller.go @@ -450,7 +450,8 @@ func (p *poller) poll(ctx context.Context, s *pollLoopState) error { } if err != nil { // check if temporary - if statusCode != 401 { + isFatal := statusCode == 401 || statusCode == 403 + if !isFatal { p.logger.Warn().Int("code", statusCode).Err(err).Msg("Poller: sync v2 poll returned temporary error") s.failCount += 1 return nil From 69acacfb3a02476f7f5a9d174308cdacce2d63c4 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Mon, 19 Jun 2023 14:49:43 +0100 Subject: [PATCH 005/156] v0.99.3 --- cmd/syncv3/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/syncv3/main.go b/cmd/syncv3/main.go index 0331c9fd..274bd084 100644 --- a/cmd/syncv3/main.go +++ b/cmd/syncv3/main.go @@ -21,7 +21,7 @@ import ( var GitCommit string -const version = "0.99.2" +const version = "0.99.3" const ( // Required fields From 7b8b1aa614fb6401df2942c6488876c68ddffb70 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Mon, 19 Jun 2023 15:14:46 +0100 Subject: [PATCH 006/156] Update readme to link v0.99.3 to spec Closes #161. --- README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 15289a39..a7e3e95b 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,11 @@ Run a sliding sync proxy. An implementation of [MSC3575](https://github.com/matrix-org/matrix-doc/blob/kegan/sync-v3/proposals/3575-sync.md). -Proxy version to MSC API specification: +## Proxy version to MSC API specification + +This describes which proxy versions implement which version of the API drafted +in MSC3575. See https://github.com/matrix-org/sliding-sync/releases for the +changes in the proxy itself. 
- Version 0.1.x: [2022/04/01](https://github.com/matrix-org/matrix-spec-proposals/blob/615e8f5a7bfe4da813bc2db661ed0bd00bccac20/proposals/3575-sync.md) - First release @@ -21,10 +25,11 @@ Proxy version to MSC API specification: - Support for `errcode` when sessions expire. - Version 0.99.1 [2023/01/20](https://github.com/matrix-org/matrix-spec-proposals/blob/b4b4e7ff306920d2c862c6ff4d245110f6fa5bc7/proposals/3575-sync.md) - Preparing for major v1.x release: lists-as-keys support. -- Version 0.99.2 [2024/07/27](https://github.com/matrix-org/matrix-spec-proposals/blob/eab643cb3ca63b03537a260fa343e1fb2d1ee284/proposals/3575-sync.md) +- Version 0.99.2 [2023/03/31](https://github.com/matrix-org/matrix-spec-proposals/blob/eab643cb3ca63b03537a260fa343e1fb2d1ee284/proposals/3575-sync.md) - Experimental support for `bump_event_types` when ordering rooms by recency. - Support for opting in to extensions on a per-list and per-room basis. - - Sentry support. +- Version 0.99.3 [2023/05/23](https://github.com/matrix-org/matrix-spec-proposals/blob/4103ee768a4a3e1decee80c2987f50f4c6b3d539/proposals/3575-sync.md) + - Support for per-list `bump_event_types`, including support on new sliding sync connections. ## Usage From 47058883221177985019642e0300660392ad6e16 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Mon, 19 Jun 2023 15:37:03 +0100 Subject: [PATCH 007/156] Update the version-to-msc map --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a7e3e95b..a7c5c0b5 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,8 @@ changes in the proxy itself. - Experimental support for `bump_event_types` when ordering rooms by recency. - Support for opting in to extensions on a per-list and per-room basis. - Version 0.99.3 [2023/05/23](https://github.com/matrix-org/matrix-spec-proposals/blob/4103ee768a4a3e1decee80c2987f50f4c6b3d539/proposals/3575-sync.md) - - Support for per-list `bump_event_types`, including support on new sliding sync connections. + - Support for per-list `bump_event_types`. + - Support for [`conn_id`](https://github.com/matrix-org/matrix-spec-proposals/blob/4103ee768a4a3e1decee80c2987f50f4c6b3d539/proposals/3575-sync.md#concurrent-connections) for distinguishing multiple concurrent connections. 
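The `conn_id` mechanism above is what lets a single device hold several independent sliding sync connections, each with its own server-side position. A minimal sketch of such a request body, assuming only the field names from the MSC linked above; the `"encryption"` label and the client code around it are illustrative, not part of the proxy:

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Each concurrent connection from the same device sends a distinct
	// conn_id, so the server tracks their positions separately.
	body := map[string]interface{}{
		"conn_id": "encryption", // hypothetical label chosen by the client
		"lists": map[string]interface{}{
			"a": map[string]interface{}{
				"ranges":           [][]int64{{0, 10}},
				"bump_event_types": []string{"m.room.message", "m.room.encrypted"},
			},
		},
	}
	b, _ := json.MarshalIndent(body, "", "  ")
	fmt.Println(string(b))
}
```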
## Usage From 4c661fbdd1d573e7209f257525cab049361dc3b6 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 19 Jun 2023 15:56:22 +0100 Subject: [PATCH 008/156] Add db conns test; uncomment DBMaxConns to break the world --- cmd/syncv3/main.go | 17 ++++--- state/storage.go | 4 -- sync2/storage.go | 4 -- sync3/handler/handler.go | 2 +- tests-integration/db_test.go | 98 ++++++++++++++++++++++++++++++++++++ tests-integration/v3_test.go | 22 ++++---- v3.go | 15 +++++- 7 files changed, 135 insertions(+), 27 deletions(-) create mode 100644 tests-integration/db_test.go diff --git a/cmd/syncv3/main.go b/cmd/syncv3/main.go index 274bd084..9938ecc3 100644 --- a/cmd/syncv3/main.go +++ b/cmd/syncv3/main.go @@ -2,6 +2,14 @@ package main import ( "fmt" + "net/http" + _ "net/http/pprof" + "os" + "os/signal" + "strings" + "syscall" + "time" + "github.com/getsentry/sentry-go" sentryhttp "github.com/getsentry/sentry-go/http" syncv3 "github.com/matrix-org/sliding-sync" @@ -10,13 +18,6 @@ import ( "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/rs/zerolog" "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" - "net/http" - _ "net/http/pprof" - "os" - "os/signal" - "strings" - "syscall" - "time" ) var GitCommit string @@ -163,6 +164,8 @@ func main() { h2, h3 := syncv3.Setup(args[EnvServer], args[EnvDB], args[EnvSecret], syncv3.Opts{ AddPrometheusMetrics: args[EnvPrometheus] != "", + DBMaxConns: 100, + DBConnMaxIdleTime: time.Hour, }) go h2.StartV2Pollers() diff --git a/state/storage.go b/state/storage.go index 5c4dad14..4a9b9fc6 100644 --- a/state/storage.go +++ b/state/storage.go @@ -6,7 +6,6 @@ import ( "fmt" "os" "strings" - "time" "github.com/getsentry/sentry-go" @@ -58,9 +57,6 @@ func NewStorage(postgresURI string) *Storage { // TODO: if we panic(), will sentry have a chance to flush the event? logger.Panic().Err(err).Str("uri", postgresURI).Msg("failed to open SQL DB") } - db.SetMaxOpenConns(100) - db.SetMaxIdleConns(80) - db.SetConnMaxLifetime(time.Hour) acc := &Accumulator{ db: db, roomsTable: NewRoomsTable(db), diff --git a/sync2/storage.go b/sync2/storage.go index 88d3c759..5f484179 100644 --- a/sync2/storage.go +++ b/sync2/storage.go @@ -2,7 +2,6 @@ package sync2 import ( "os" - "time" "github.com/getsentry/sentry-go" "github.com/jmoiron/sqlx" @@ -27,9 +26,6 @@ func NewStore(postgresURI, secret string) *Storage { // TODO: if we panic(), will sentry have a chance to flush the event? 
logger.Panic().Err(err).Str("uri", postgresURI).Msg("failed to open SQL DB") } - db.SetMaxOpenConns(100) - db.SetMaxIdleConns(80) - db.SetConnMaxLifetime(time.Hour) return &Storage{ DevicesTable: NewDevicesTable(db), TokensTable: NewTokensTable(db, secret), diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index 0f2c1009..3a077498 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -65,7 +65,7 @@ type SyncLiveHandler struct { } func NewSync3Handler( - store *state.Storage, storev2 *sync2.Storage, v2Client sync2.Client, postgresDBURI, secret string, + store *state.Storage, storev2 *sync2.Storage, v2Client sync2.Client, secret string, pub pubsub.Notifier, sub pubsub.Listener, enablePrometheus bool, maxPendingEventUpdates int, ) (*SyncLiveHandler, error) { logger.Info().Msg("creating handler") diff --git a/tests-integration/db_test.go b/tests-integration/db_test.go new file mode 100644 index 00000000..b721102e --- /dev/null +++ b/tests-integration/db_test.go @@ -0,0 +1,98 @@ +package syncv3 + +import ( + "encoding/json" + "fmt" + "sync" + "testing" + "time" + + syncv3 "github.com/matrix-org/sliding-sync" + "github.com/matrix-org/sliding-sync/sync2" + "github.com/matrix-org/sliding-sync/sync3" + "github.com/matrix-org/sliding-sync/testutils" + "github.com/matrix-org/sliding-sync/testutils/m" +) + +// Test that the proxy works fine with low max conns. Low max conns can be a problem +// if a request A needs 2 conns to respond and that blocks forward progress on the server, +// and the request can only obtain 1 conn. +func TestMaxDBConns(t *testing.T) { + pqString := testutils.PrepareDBConnectionString() + // setup code + v2 := runTestV2Server(t) + v3 := runTestServer(t, v2, pqString, syncv3.Opts{ + //DBMaxConns: 3, + }) + defer v2.close() + defer v3.close() + + // make N users and drip feed some events, make sure they are all seen + numUsers := 5 + var wg sync.WaitGroup + wg.Add(numUsers) + for i := 0; i < numUsers; i++ { + go func(n int) { + defer wg.Done() + userID := fmt.Sprintf("@maxconns_%d:localhost", n) + token := fmt.Sprintf("maxconns_%d", n) + roomID := fmt.Sprintf("!maxconns_%d", n) + v2.addAccount(t, userID, token) + v2.queueResponse(userID, sync2.SyncResponse{ + Rooms: sync2.SyncRoomsResponse{ + Join: v2JoinTimeline(roomEvents{ + roomID: roomID, + state: createRoomState(t, userID, time.Now()), + }), + }, + }) + // initial sync + res := v3.mustDoV3Request(t, token, sync3.Request{ + Lists: map[string]sync3.RequestList{"a": { + Ranges: sync3.SliceRanges{ + [2]int64{0, 1}, + }, + RoomSubscription: sync3.RoomSubscription{ + TimelineLimit: 1, + }, + }}, + }) + t.Logf("user %s has done an initial /sync OK", userID) + m.MatchResponse(t, res, m.MatchList("a", m.MatchV3Count(1), m.MatchV3Ops( + m.MatchV3SyncOp(0, 0, []string{roomID}), + )), m.MatchRoomSubscriptionsStrict(map[string][]m.RoomMatcher{ + roomID: { + m.MatchJoinCount(1), + }, + })) + // drip feed and get update + dripMsg := testutils.NewEvent(t, "m.room.message", userID, map[string]interface{}{ + "msgtype": "m.text", + "body": "drip drip", + }) + v2.queueResponse(userID, sync2.SyncResponse{ + Rooms: sync2.SyncRoomsResponse{ + Join: v2JoinTimeline(roomEvents{ + roomID: roomID, + events: []json.RawMessage{ + dripMsg, + }, + }), + }, + }) + t.Logf("user %s has queued the drip", userID) + v2.waitUntilEmpty(t, userID) + t.Logf("user %s poller has received the drip", userID) + res = v3.mustDoV3RequestWithPos(t, token, res.Pos, sync3.Request{}) + m.MatchResponse(t, res, 
m.MatchRoomSubscriptionsStrict(map[string][]m.RoomMatcher{ + roomID: { + m.MatchRoomTimelineMostRecent(1, []json.RawMessage{dripMsg}), + }, + })) + t.Logf("user %s has received the drip", userID) + }(i) + } + + wg.Wait() + +} diff --git a/tests-integration/v3_test.go b/tests-integration/v3_test.go index cd8bd2c1..658eefd2 100644 --- a/tests-integration/v3_test.go +++ b/tests-integration/v3_test.go @@ -366,20 +366,22 @@ func runTestServer(t testutils.TestBenchInterface, v2Server *testV2Server, postg //tests often repeat requests. To ensure tests remain fast, reduce the spam protection limits. sync3.SpamProtectionInterval = time.Millisecond - metricsEnabled := false - maxPendingEventUpdates := 200 + combinedOpts := syncv3.Opts{ + TestingSynchronousPubsub: true, // critical to avoid flakey tests + AddPrometheusMetrics: false, + MaxPendingEventUpdates: 200, + } if len(opts) > 0 { - metricsEnabled = opts[0].AddPrometheusMetrics - if opts[0].MaxPendingEventUpdates > 0 { - maxPendingEventUpdates = opts[0].MaxPendingEventUpdates + opt := opts[0] + combinedOpts.AddPrometheusMetrics = opt.AddPrometheusMetrics + combinedOpts.DBConnMaxIdleTime = opt.DBConnMaxIdleTime + combinedOpts.DBMaxConns = opt.DBMaxConns + if opt.MaxPendingEventUpdates > 0 { + combinedOpts.MaxPendingEventUpdates = opt.MaxPendingEventUpdates handler.BufferWaitTime = 5 * time.Millisecond } } - h2, h3 := syncv3.Setup(v2Server.url(), postgresConnectionString, os.Getenv("SYNCV3_SECRET"), syncv3.Opts{ - TestingSynchronousPubsub: true, // critical to avoid flakey tests - MaxPendingEventUpdates: maxPendingEventUpdates, - AddPrometheusMetrics: metricsEnabled, - }) + h2, h3 := syncv3.Setup(v2Server.url(), postgresConnectionString, os.Getenv("SYNCV3_SECRET"), combinedOpts) // for ease of use we don't start v2 pollers at startup in tests r := mux.NewRouter() r.Use(hlog.NewHandler(logger)) diff --git a/v3.go b/v3.go index acb87575..a9e1c557 100644 --- a/v3.go +++ b/v3.go @@ -9,6 +9,7 @@ import ( "time" "github.com/getsentry/sentry-go" + "github.com/jmoiron/sqlx" "github.com/gorilla/mux" "github.com/matrix-org/sliding-sync/internal" @@ -36,6 +37,9 @@ type Opts struct { // if true, publishing messages will block until the consumer has consumed it. // Assumes a single producer and a single consumer. 
TestingSynchronousPubsub bool + + DBMaxConns int + DBConnMaxIdleTime time.Duration } type server struct { @@ -75,6 +79,15 @@ func Setup(destHomeserver, postgresURI, secret string, opts Opts) (*handler2.Han } store := state.NewStorage(postgresURI) storev2 := sync2.NewStore(postgresURI, secret) + for _, db := range []*sqlx.DB{store.DB, storev2.DB} { + if opts.DBMaxConns > 0 { + db.SetMaxOpenConns(opts.DBMaxConns) + db.SetMaxIdleConns(opts.DBMaxConns) + } + if opts.DBConnMaxIdleTime > 0 { + db.SetConnMaxIdleTime(opts.DBConnMaxIdleTime) + } + } bufferSize := 50 if opts.TestingSynchronousPubsub { bufferSize = 0 @@ -93,7 +106,7 @@ func Setup(destHomeserver, postgresURI, secret string, opts Opts) (*handler2.Han pMap.SetCallbacks(h2) // create v3 handler - h3, err := handler.NewSync3Handler(store, storev2, v2Client, postgresURI, secret, pubSub, pubSub, opts.AddPrometheusMetrics, opts.MaxPendingEventUpdates) + h3, err := handler.NewSync3Handler(store, storev2, v2Client, secret, pubSub, pubSub, opts.AddPrometheusMetrics, opts.MaxPendingEventUpdates) if err != nil { panic(err) } From 9c1362fc8e85b2a0cce444af5e618e709d4e96e9 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Mon, 19 Jun 2023 16:17:57 +0100 Subject: [PATCH 009/156] Actually use txn when inserting tokens & devices --- sync2/devices_table.go | 4 ++-- sync2/tokens_table.go | 4 ++-- sync3/handler/handler.go | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sync2/devices_table.go b/sync2/devices_table.go index ab6ddb9f..30e68df6 100644 --- a/sync2/devices_table.go +++ b/sync2/devices_table.go @@ -32,8 +32,8 @@ func NewDevicesTable(db *sqlx.DB) *DevicesTable { // InsertDevice creates a new devices row with a blank since token if no such row // exists. Otherwise, it does nothing. -func (t *DevicesTable) InsertDevice(userID, deviceID string) error { - _, err := t.db.Exec( +func (t *DevicesTable) InsertDevice(txn *sqlx.Tx, userID, deviceID string) error { + _, err := txn.Exec( ` INSERT INTO syncv3_sync2_devices(user_id, device_id, since) VALUES($1,$2,$3) ON CONFLICT (user_id, device_id) DO NOTHING`, userID, deviceID, "", diff --git a/sync2/tokens_table.go b/sync2/tokens_table.go index 066c6508..961e7bdd 100644 --- a/sync2/tokens_table.go +++ b/sync2/tokens_table.go @@ -171,10 +171,10 @@ func (t *TokensTable) TokenForEachDevice(txn *sqlx.Tx) (tokens []TokenForPoller, } // Insert a new token into the table. -func (t *TokensTable) Insert(plaintextToken, userID, deviceID string, lastSeen time.Time) (*Token, error) { +func (t *TokensTable) Insert(txn *sqlx.Tx, plaintextToken, userID, deviceID string, lastSeen time.Time) (*Token, error) { hashedToken := hashToken(plaintextToken) encToken := t.encrypt(plaintextToken) - _, err := t.db.Exec( + _, err := txn.Exec( `INSERT INTO syncv3_sync2_tokens(token_hash, token_encrypted, user_id, device_id, last_seen) VALUES ($1, $2, $3, $4, $5) ON CONFLICT (token_hash) DO NOTHING;`, diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index 0f2c1009..98d4e610 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -420,14 +420,14 @@ func (h *SyncLiveHandler) identifyUnknownAccessToken(accessToken string, logger var token *sync2.Token err = sqlutil.WithTransaction(h.V2Store.DB, func(txn *sqlx.Tx) error { // Create a brand-new row for this token. 
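		// Threading txn through Insert and InsertDevice below makes the pair of
		// writes atomic: the token row and its device row land together or not
		// at all. The old calls went through the tables' own *sqlx.DB handles,
		// so the wrapping WithTransaction gave no such guarantee.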
- token, err = h.V2Store.TokensTable.Insert(accessToken, userID, deviceID, time.Now()) + token, err = h.V2Store.TokensTable.Insert(txn, accessToken, userID, deviceID, time.Now()) if err != nil { logger.Warn().Err(err).Str("user", userID).Str("device", deviceID).Msg("failed to insert v2 token") return err } // Ensure we have a device row for this token. - err = h.V2Store.DevicesTable.InsertDevice(userID, deviceID) + err = h.V2Store.DevicesTable.InsertDevice(txn, userID, deviceID) if err != nil { log.Warn().Err(err).Str("user", userID).Str("device", deviceID).Msg("failed to insert v2 device") return err From 0c4dd11bba4ed2ce2e855df7592ce78b7b1c7665 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Mon, 19 Jun 2023 16:42:05 +0100 Subject: [PATCH 010/156] Update tests --- sync2/devices_table_test.go | 96 ++++++++++++++++++++-------------- sync2/handler2/handler_test.go | 16 ++++-- sync2/tokens_table_test.go | 75 +++++++++++++++----------- 3 files changed, 112 insertions(+), 75 deletions(-) diff --git a/sync2/devices_table_test.go b/sync2/devices_table_test.go index 5f70846a..1db3564d 100644 --- a/sync2/devices_table_test.go +++ b/sync2/devices_table_test.go @@ -2,6 +2,7 @@ package sync2 import ( "github.com/jmoiron/sqlx" + "github.com/matrix-org/sliding-sync/sqlutil" "os" "sort" "testing" @@ -41,18 +42,25 @@ func TestDevicesTableSinceColumn(t *testing.T) { aliceSecret1 := "mysecret1" aliceSecret2 := "mysecret2" - t.Log("Insert two tokens for Alice.") - aliceToken, err := tokens.Insert(aliceSecret1, alice, aliceDevice, time.Now()) - if err != nil { - t.Fatalf("Failed to Insert token: %s", err) - } - aliceToken2, err := tokens.Insert(aliceSecret2, alice, aliceDevice, time.Now()) - if err != nil { - t.Fatalf("Failed to Insert token: %s", err) - } + var aliceToken, aliceToken2 *Token + _ = sqlutil.WithTransaction(db, func(txn *sqlx.Tx) (err error) { + t.Log("Insert two tokens for Alice.") + aliceToken, err = tokens.Insert(txn, aliceSecret1, alice, aliceDevice, time.Now()) + if err != nil { + t.Fatalf("Failed to Insert token: %s", err) + } + aliceToken2, err = tokens.Insert(txn, aliceSecret2, alice, aliceDevice, time.Now()) + if err != nil { + t.Fatalf("Failed to Insert token: %s", err) + } - t.Log("Add a devices row for Alice") - err = devices.InsertDevice(alice, aliceDevice) + t.Log("Add a devices row for Alice") + err = devices.InsertDevice(txn, alice, aliceDevice) + if err != nil { + t.Fatalf("Failed to Insert device: %s", err) + } + return nil + }) t.Log("Pretend we're about to start a poller. 
Fetch Alice's token along with the since value tracked by the devices table.") accessToken, since, err := tokens.GetTokenAndSince(alice, aliceDevice, aliceToken.AccessTokenHash) @@ -104,40 +112,50 @@ func TestTokenForEachDevice(t *testing.T) { chris := "chris" chrisDevice := "chris_desktop" - t.Log("Add a device for Alice, Bob and Chris.") - err := devices.InsertDevice(alice, aliceDevice) - if err != nil { - t.Fatalf("InsertDevice returned error: %s", err) - } - err = devices.InsertDevice(bob, bobDevice) - if err != nil { - t.Fatalf("InsertDevice returned error: %s", err) - } - err = devices.InsertDevice(chris, chrisDevice) - if err != nil { - t.Fatalf("InsertDevice returned error: %s", err) - } + _ = sqlutil.WithTransaction(db, func(txn *sqlx.Tx) error { + t.Log("Add a device for Alice, Bob and Chris.") + err := devices.InsertDevice(txn, alice, aliceDevice) + if err != nil { + t.Fatalf("InsertDevice returned error: %s", err) + } + err = devices.InsertDevice(txn, bob, bobDevice) + if err != nil { + t.Fatalf("InsertDevice returned error: %s", err) + } + err = devices.InsertDevice(txn, chris, chrisDevice) + if err != nil { + t.Fatalf("InsertDevice returned error: %s", err) + } + return nil + }) t.Log("Mark Alice's device with a since token.") sinceValue := "s-1-2-3-4" - devices.UpdateDeviceSince(alice, aliceDevice, sinceValue) - - t.Log("Insert 2 tokens for Alice, one for Bob and none for Chris.") - aliceLastSeen1 := time.Now() - _, err = tokens.Insert("alice_secret", alice, aliceDevice, aliceLastSeen1) - if err != nil { - t.Fatalf("Failed to Insert token: %s", err) - } - aliceLastSeen2 := aliceLastSeen1.Add(1 * time.Minute) - aliceToken2, err := tokens.Insert("alice_secret2", alice, aliceDevice, aliceLastSeen2) + err := devices.UpdateDeviceSince(alice, aliceDevice, sinceValue) if err != nil { - t.Fatalf("Failed to Insert token: %s", err) - } - bobToken, err := tokens.Insert("bob_secret", bob, bobDevice, time.Time{}) - if err != nil { - t.Fatalf("Failed to Insert token: %s", err) + t.Fatalf("UpdateDeviceSince returned error: %s", err) } + var aliceToken2, bobToken *Token + _ = sqlutil.WithTransaction(db, func(txn *sqlx.Tx) error { + t.Log("Insert 2 tokens for Alice, one for Bob and none for Chris.") + aliceLastSeen1 := time.Now() + _, err = tokens.Insert(txn, "alice_secret", alice, aliceDevice, aliceLastSeen1) + if err != nil { + t.Fatalf("Failed to Insert token: %s", err) + } + aliceLastSeen2 := aliceLastSeen1.Add(1 * time.Minute) + aliceToken2, err = tokens.Insert(txn, "alice_secret2", alice, aliceDevice, aliceLastSeen2) + if err != nil { + t.Fatalf("Failed to Insert token: %s", err) + } + bobToken, err = tokens.Insert(txn, "bob_secret", bob, bobDevice, time.Time{}) + if err != nil { + t.Fatalf("Failed to Insert token: %s", err) + } + return nil + }) + t.Log("Fetch a token for every device") gotTokens, err := tokens.TokenForEachDevice(nil) if err != nil { diff --git a/sync2/handler2/handler_test.go b/sync2/handler2/handler_test.go index 13a2597f..20f064ab 100644 --- a/sync2/handler2/handler_test.go +++ b/sync2/handler2/handler_test.go @@ -1,6 +1,8 @@ package handler2_test import ( + "github.com/jmoiron/sqlx" + "github.com/matrix-org/sliding-sync/sqlutil" "os" "reflect" "sync" @@ -131,11 +133,15 @@ func TestHandlerFreshEnsurePolling(t *testing.T) { deviceID := "ALICE" token := "aliceToken" - // the device and token needs to already exist prior to EnsurePolling - err = v2Store.DevicesTable.InsertDevice(alice, deviceID) - assertNoError(t, err) - tok, err := v2Store.TokensTable.Insert(token, 
alice, deviceID, time.Now()) - assertNoError(t, err) + var tok *sync2.Token + sqlutil.WithTransaction(v2Store.DB, func(txn *sqlx.Tx) error { + // the device and token needs to already exist prior to EnsurePolling + err = v2Store.DevicesTable.InsertDevice(txn, alice, deviceID) + assertNoError(t, err) + tok, err = v2Store.TokensTable.Insert(txn, token, alice, deviceID, time.Now()) + assertNoError(t, err) + return nil + }) payloadInitialSyncComplete := pubsub.V2InitialSyncComplete{ UserID: alice, diff --git a/sync2/tokens_table_test.go b/sync2/tokens_table_test.go index 9249077e..26e5c823 100644 --- a/sync2/tokens_table_test.go +++ b/sync2/tokens_table_test.go @@ -1,6 +1,8 @@ package sync2 import ( + "github.com/jmoiron/sqlx" + "github.com/matrix-org/sliding-sync/sqlutil" "testing" "time" ) @@ -26,27 +28,31 @@ func TestTokensTable(t *testing.T) { aliceSecret1 := "mysecret1" aliceToken1FirstSeen := time.Now() - // Test a single token - t.Log("Insert a new token from Alice.") - aliceToken, err := tokens.Insert(aliceSecret1, alice, aliceDevice, aliceToken1FirstSeen) - if err != nil { - t.Fatalf("Failed to Insert token: %s", err) - } - - t.Log("The returned Token struct should have been populated correctly.") - assertEqualTokens(t, tokens, aliceToken, aliceSecret1, alice, aliceDevice, aliceToken1FirstSeen) - - t.Log("Reinsert the same token.") - reinsertedToken, err := tokens.Insert(aliceSecret1, alice, aliceDevice, aliceToken1FirstSeen) - if err != nil { - t.Fatalf("Failed to Insert token: %s", err) - } + var aliceToken, reinsertedToken *Token + _ = sqlutil.WithTransaction(db, func(txn *sqlx.Tx) error { + // Test a single token + t.Log("Insert a new token from Alice.") + aliceToken, err := tokens.Insert(txn, aliceSecret1, alice, aliceDevice, aliceToken1FirstSeen) + if err != nil { + t.Fatalf("Failed to Insert token: %s", err) + } + + t.Log("The returned Token struct should have been populated correctly.") + assertEqualTokens(t, tokens, aliceToken, aliceSecret1, alice, aliceDevice, aliceToken1FirstSeen) + + t.Log("Reinsert the same token.") + reinsertedToken, err = tokens.Insert(txn, aliceSecret1, alice, aliceDevice, aliceToken1FirstSeen) + if err != nil { + t.Fatalf("Failed to Insert token: %s", err) + } + return nil + }) t.Log("This should yield an equal Token struct.") assertEqualTokens(t, tokens, reinsertedToken, aliceSecret1, alice, aliceDevice, aliceToken1FirstSeen) t.Log("Try to mark Alice's token as being used after an hour.") - err = tokens.MaybeUpdateLastSeen(aliceToken, aliceToken1FirstSeen.Add(time.Hour)) + err := tokens.MaybeUpdateLastSeen(aliceToken, aliceToken1FirstSeen.Add(time.Hour)) if err != nil { t.Fatalf("Failed to update last seen: %s", err) } @@ -74,17 +80,20 @@ func TestTokensTable(t *testing.T) { } assertEqualTokens(t, tokens, fetchedToken, aliceSecret1, alice, aliceDevice, aliceToken1LastSeen) - // Test a second token for Alice - t.Log("Insert a second token for Alice.") - aliceSecret2 := "mysecret2" - aliceToken2FirstSeen := aliceToken1LastSeen.Add(time.Minute) - aliceToken2, err := tokens.Insert(aliceSecret2, alice, aliceDevice, aliceToken2FirstSeen) - if err != nil { - t.Fatalf("Failed to Insert token: %s", err) - } - - t.Log("The returned Token struct should have been populated correctly.") - assertEqualTokens(t, tokens, aliceToken2, aliceSecret2, alice, aliceDevice, aliceToken2FirstSeen) + _ = sqlutil.WithTransaction(db, func(txn *sqlx.Tx) error { + // Test a second token for Alice + t.Log("Insert a second token for Alice.") + aliceSecret2 := "mysecret2" + 
aliceToken2FirstSeen := aliceToken1LastSeen.Add(time.Minute) + aliceToken2, err := tokens.Insert(txn, aliceSecret2, alice, aliceDevice, aliceToken2FirstSeen) + if err != nil { + t.Fatalf("Failed to Insert token: %s", err) + } + + t.Log("The returned Token struct should have been populated correctly.") + assertEqualTokens(t, tokens, aliceToken2, aliceSecret2, alice, aliceDevice, aliceToken2FirstSeen) + return nil + }) } func TestDeletingTokens(t *testing.T) { @@ -94,11 +103,15 @@ func TestDeletingTokens(t *testing.T) { t.Log("Insert a new token from Alice.") accessToken := "mytoken" - token, err := tokens.Insert(accessToken, "@bob:builders.com", "device", time.Time{}) - if err != nil { - t.Fatalf("Failed to Insert token: %s", err) - } + var token *Token + err := sqlutil.WithTransaction(db, func(txn *sqlx.Tx) (err error) { + token, err = tokens.Insert(txn, accessToken, "@bob:builders.com", "device", time.Time{}) + if err != nil { + t.Fatalf("Failed to Insert token: %s", err) + } + return nil + }) t.Log("We should be able to fetch this token without error.") _, err = tokens.Token(accessToken) if err != nil { From af852c22d1460c7436f8bd5195f52dec31573b1c Mon Sep 17 00:00:00 2001 From: David Robertson Date: Mon, 19 Jun 2023 16:49:37 +0100 Subject: [PATCH 011/156] Another test fixup --- sync2/tokens_table_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sync2/tokens_table_test.go b/sync2/tokens_table_test.go index 26e5c823..c787b2a0 100644 --- a/sync2/tokens_table_test.go +++ b/sync2/tokens_table_test.go @@ -29,10 +29,10 @@ func TestTokensTable(t *testing.T) { aliceToken1FirstSeen := time.Now() var aliceToken, reinsertedToken *Token - _ = sqlutil.WithTransaction(db, func(txn *sqlx.Tx) error { + _ = sqlutil.WithTransaction(db, func(txn *sqlx.Tx) (err error) { // Test a single token t.Log("Insert a new token from Alice.") - aliceToken, err := tokens.Insert(txn, aliceSecret1, alice, aliceDevice, aliceToken1FirstSeen) + aliceToken, err = tokens.Insert(txn, aliceSecret1, alice, aliceDevice, aliceToken1FirstSeen) if err != nil { t.Fatalf("Failed to Insert token: %s", err) } From a59c6635a2e3eec05d68a1f74e10135cc0a44585 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 19 Jun 2023 17:36:41 +0100 Subject: [PATCH 012/156] Restart the server, try again --- tests-integration/db_test.go | 139 ++++++++++++++++++----------------- tests-integration/v3_test.go | 4 +- 2 files changed, 74 insertions(+), 69 deletions(-) diff --git a/tests-integration/db_test.go b/tests-integration/db_test.go index b721102e..ff8a703a 100644 --- a/tests-integration/db_test.go +++ b/tests-integration/db_test.go @@ -21,78 +21,83 @@ func TestMaxDBConns(t *testing.T) { pqString := testutils.PrepareDBConnectionString() // setup code v2 := runTestV2Server(t) - v3 := runTestServer(t, v2, pqString, syncv3.Opts{ - //DBMaxConns: 3, - }) + opts := syncv3.Opts{ + DBMaxConns: 3, + } + v3 := runTestServer(t, v2, pqString, opts) defer v2.close() defer v3.close() - // make N users and drip feed some events, make sure they are all seen - numUsers := 5 - var wg sync.WaitGroup - wg.Add(numUsers) - for i := 0; i < numUsers; i++ { - go func(n int) { - defer wg.Done() - userID := fmt.Sprintf("@maxconns_%d:localhost", n) - token := fmt.Sprintf("maxconns_%d", n) - roomID := fmt.Sprintf("!maxconns_%d", n) - v2.addAccount(t, userID, token) - v2.queueResponse(userID, sync2.SyncResponse{ - Rooms: sync2.SyncRoomsResponse{ - Join: v2JoinTimeline(roomEvents{ - roomID: roomID, - state: createRoomState(t, userID, time.Now()), - }), 
- }, - }) - // initial sync - res := v3.mustDoV3Request(t, token, sync3.Request{ - Lists: map[string]sync3.RequestList{"a": { - Ranges: sync3.SliceRanges{ - [2]int64{0, 1}, - }, - RoomSubscription: sync3.RoomSubscription{ - TimelineLimit: 1, + testMaxDBConns := func() { + // make N users and drip feed some events, make sure they are all seen + numUsers := 5 + var wg sync.WaitGroup + wg.Add(numUsers) + for i := 0; i < numUsers; i++ { + go func(n int) { + defer wg.Done() + userID := fmt.Sprintf("@maxconns_%d:localhost", n) + token := fmt.Sprintf("maxconns_%d", n) + roomID := fmt.Sprintf("!maxconns_%d", n) + v2.addAccount(t, userID, token) + v2.queueResponse(userID, sync2.SyncResponse{ + Rooms: sync2.SyncRoomsResponse{ + Join: v2JoinTimeline(roomEvents{ + roomID: roomID, + state: createRoomState(t, userID, time.Now()), + }), }, - }}, - }) - t.Logf("user %s has done an initial /sync OK", userID) - m.MatchResponse(t, res, m.MatchList("a", m.MatchV3Count(1), m.MatchV3Ops( - m.MatchV3SyncOp(0, 0, []string{roomID}), - )), m.MatchRoomSubscriptionsStrict(map[string][]m.RoomMatcher{ - roomID: { - m.MatchJoinCount(1), - }, - })) - // drip feed and get update - dripMsg := testutils.NewEvent(t, "m.room.message", userID, map[string]interface{}{ - "msgtype": "m.text", - "body": "drip drip", - }) - v2.queueResponse(userID, sync2.SyncResponse{ - Rooms: sync2.SyncRoomsResponse{ - Join: v2JoinTimeline(roomEvents{ - roomID: roomID, - events: []json.RawMessage{ - dripMsg, + }) + // initial sync + res := v3.mustDoV3Request(t, token, sync3.Request{ + Lists: map[string]sync3.RequestList{"a": { + Ranges: sync3.SliceRanges{ + [2]int64{0, 1}, }, - }), - }, - }) - t.Logf("user %s has queued the drip", userID) - v2.waitUntilEmpty(t, userID) - t.Logf("user %s poller has received the drip", userID) - res = v3.mustDoV3RequestWithPos(t, token, res.Pos, sync3.Request{}) - m.MatchResponse(t, res, m.MatchRoomSubscriptionsStrict(map[string][]m.RoomMatcher{ - roomID: { - m.MatchRoomTimelineMostRecent(1, []json.RawMessage{dripMsg}), - }, - })) - t.Logf("user %s has received the drip", userID) - }(i) + RoomSubscription: sync3.RoomSubscription{ + TimelineLimit: 1, + }, + }}, + }) + t.Logf("user %s has done an initial /sync OK", userID) + m.MatchResponse(t, res, m.MatchList("a", m.MatchV3Count(1), m.MatchV3Ops( + m.MatchV3SyncOp(0, 0, []string{roomID}), + )), m.MatchRoomSubscriptionsStrict(map[string][]m.RoomMatcher{ + roomID: { + m.MatchJoinCount(1), + }, + })) + // drip feed and get update + dripMsg := testutils.NewEvent(t, "m.room.message", userID, map[string]interface{}{ + "msgtype": "m.text", + "body": "drip drip", + }) + v2.queueResponse(userID, sync2.SyncResponse{ + Rooms: sync2.SyncRoomsResponse{ + Join: v2JoinTimeline(roomEvents{ + roomID: roomID, + events: []json.RawMessage{ + dripMsg, + }, + }), + }, + }) + t.Logf("user %s has queued the drip", userID) + v2.waitUntilEmpty(t, userID) + t.Logf("user %s poller has received the drip", userID) + res = v3.mustDoV3RequestWithPos(t, token, res.Pos, sync3.Request{}) + m.MatchResponse(t, res, m.MatchRoomSubscriptionsStrict(map[string][]m.RoomMatcher{ + roomID: { + m.MatchRoomTimelineMostRecent(1, []json.RawMessage{dripMsg}), + }, + })) + t.Logf("user %s has received the drip", userID) + }(i) + } + wg.Wait() } - wg.Wait() - + testMaxDBConns() + v3.restart(t, v2, pqString, opts) + testMaxDBConns() } diff --git a/tests-integration/v3_test.go b/tests-integration/v3_test.go index 658eefd2..de72d954 100644 --- a/tests-integration/v3_test.go +++ b/tests-integration/v3_test.go @@ -291,11 
+291,11 @@ func (s *testV3Server) close() { s.h2.Teardown() } -func (s *testV3Server) restart(t *testing.T, v2 *testV2Server, pq string) { +func (s *testV3Server) restart(t *testing.T, v2 *testV2Server, pq string, opts ...syncv3.Opts) { t.Helper() log.Printf("restarting server") s.close() - ss := runTestServer(t, v2, pq) + ss := runTestServer(t, v2, pq, opts...) // replace all the fields which will be close()d to ensure we don't leak s.srv = ss.srv s.h2 = ss.h2 From 5178d71f9861b804b334de45e60787bf3c364d21 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 19 Jun 2023 17:52:10 +0100 Subject: [PATCH 013/156] Don't take out another txn when LLing timelines; allow max_conns=1 to work --- state/event_table.go | 4 ++-- state/event_table_test.go | 6 +++++- state/storage.go | 2 +- state/storage_test.go | 6 +++++- tests-integration/db_test.go | 2 +- 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/state/event_table.go b/state/event_table.go index 6bea7114..925844c3 100644 --- a/state/event_table.go +++ b/state/event_table.go @@ -438,8 +438,8 @@ func (t *EventTable) SelectClosestPrevBatchByID(roomID string, eventID string) ( // Select the closest prev batch token for the provided event NID. Returns the empty string if there // is no closest. -func (t *EventTable) SelectClosestPrevBatch(roomID string, eventNID int64) (prevBatch string, err error) { - err = t.db.QueryRow( +func (t *EventTable) SelectClosestPrevBatch(txn *sqlx.Tx, roomID string, eventNID int64) (prevBatch string, err error) { + err = txn.QueryRow( `SELECT prev_batch FROM syncv3_events WHERE prev_batch IS NOT NULL AND room_id=$1 AND event_nid >= $2 LIMIT 1`, roomID, eventNID, ).Scan(&prevBatch) if err == sql.ErrNoRows { diff --git a/state/event_table_test.go b/state/event_table_test.go index 13eb1c7e..e5d20f5d 100644 --- a/state/event_table_test.go +++ b/state/event_table_test.go @@ -778,7 +778,11 @@ func TestEventTablePrevBatch(t *testing.T) { } assertPrevBatch := func(roomID string, index int, wantPrevBatch string) { - gotPrevBatch, err := table.SelectClosestPrevBatch(roomID, int64(idToNID[events[index].ID])) + var gotPrevBatch string + err := sqlutil.WithTransaction(table.db, func(txn *sqlx.Tx) error { + gotPrevBatch, err = table.SelectClosestPrevBatch(txn, roomID, int64(idToNID[events[index].ID])) + return err + }) if err != nil { t.Fatalf("failed to SelectClosestPrevBatch: %s", err) } diff --git a/state/storage.go b/state/storage.go index 4a9b9fc6..b9783931 100644 --- a/state/storage.go +++ b/state/storage.go @@ -626,7 +626,7 @@ func (s *Storage) LatestEventsInRooms(userID string, roomIDs []string, to int64, } if earliestEventNID != 0 { // the oldest event needs a prev batch token, so find one now - prevBatch, err := s.EventsTable.SelectClosestPrevBatch(roomID, earliestEventNID) + prevBatch, err := s.EventsTable.SelectClosestPrevBatch(txn, roomID, earliestEventNID) if err != nil { return fmt.Errorf("failed to select prev_batch for room %s : %s", roomID, err) } diff --git a/state/storage_test.go b/state/storage_test.go index a5067418..86af2b02 100644 --- a/state/storage_test.go +++ b/state/storage_test.go @@ -566,7 +566,11 @@ func TestStorageLatestEventsInRoomsPrevBatch(t *testing.T) { wantPrevBatch := wantPrevBatches[i] eventNID := idsToNIDs[eventIDs[i]] // closest batch to the last event in the chunk (latest nid) is always the next prev batch token - pb, err := store.EventsTable.SelectClosestPrevBatch(roomID, eventNID) + var pb string + err = sqlutil.WithTransaction(store.DB, func(txn *sqlx.Tx) error { + pb, err 
= store.EventsTable.SelectClosestPrevBatch(txn, roomID, eventNID) + return err + }) if err != nil { t.Fatalf("failed to SelectClosestPrevBatch: %s", err) } diff --git a/tests-integration/db_test.go b/tests-integration/db_test.go index ff8a703a..3897009e 100644 --- a/tests-integration/db_test.go +++ b/tests-integration/db_test.go @@ -22,7 +22,7 @@ func TestMaxDBConns(t *testing.T) { // setup code v2 := runTestV2Server(t) opts := syncv3.Opts{ - DBMaxConns: 3, + DBMaxConns: 1, } v3 := runTestServer(t, v2, pqString, opts) defer v2.close() From 1717408dc3d45f3bbd9bffd4d43dcca96660f086 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Mon, 19 Jun 2023 17:58:56 +0100 Subject: [PATCH 014/156] Use fewer DB conns when events into the UserCache --- state/event_table.go | 4 ++-- state/event_table_test.go | 12 ++++++++---- state/storage.go | 2 +- state/storage_test.go | 13 +++++++++---- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/state/event_table.go b/state/event_table.go index 6bea7114..925844c3 100644 --- a/state/event_table.go +++ b/state/event_table.go @@ -438,8 +438,8 @@ func (t *EventTable) SelectClosestPrevBatchByID(roomID string, eventID string) ( // Select the closest prev batch token for the provided event NID. Returns the empty string if there // is no closest. -func (t *EventTable) SelectClosestPrevBatch(roomID string, eventNID int64) (prevBatch string, err error) { - err = t.db.QueryRow( +func (t *EventTable) SelectClosestPrevBatch(txn *sqlx.Tx, roomID string, eventNID int64) (prevBatch string, err error) { + err = txn.QueryRow( `SELECT prev_batch FROM syncv3_events WHERE prev_batch IS NOT NULL AND room_id=$1 AND event_nid >= $2 LIMIT 1`, roomID, eventNID, ).Scan(&prevBatch) if err == sql.ErrNoRows { diff --git a/state/event_table_test.go b/state/event_table_test.go index 13eb1c7e..76104a65 100644 --- a/state/event_table_test.go +++ b/state/event_table_test.go @@ -778,10 +778,14 @@ func TestEventTablePrevBatch(t *testing.T) { } assertPrevBatch := func(roomID string, index int, wantPrevBatch string) { - gotPrevBatch, err := table.SelectClosestPrevBatch(roomID, int64(idToNID[events[index].ID])) - if err != nil { - t.Fatalf("failed to SelectClosestPrevBatch: %s", err) - } + var gotPrevBatch string + _ = sqlutil.WithTransaction(db, func(txn *sqlx.Tx) error { + gotPrevBatch, err = table.SelectClosestPrevBatch(txn, roomID, int64(idToNID[events[index].ID])) + if err != nil { + t.Fatalf("failed to SelectClosestPrevBatch: %s", err) + } + return nil + }) if wantPrevBatch != "" { if gotPrevBatch == "" || gotPrevBatch != wantPrevBatch { t.Fatalf("SelectClosestPrevBatch: got %v want %v", gotPrevBatch, wantPrevBatch) diff --git a/state/storage.go b/state/storage.go index 5c4dad14..1415d3c0 100644 --- a/state/storage.go +++ b/state/storage.go @@ -630,7 +630,7 @@ func (s *Storage) LatestEventsInRooms(userID string, roomIDs []string, to int64, } if earliestEventNID != 0 { // the oldest event needs a prev batch token, so find one now - prevBatch, err := s.EventsTable.SelectClosestPrevBatch(roomID, earliestEventNID) + prevBatch, err := s.EventsTable.SelectClosestPrevBatch(txn, roomID, earliestEventNID) if err != nil { return fmt.Errorf("failed to select prev_batch for room %s : %s", roomID, err) } diff --git a/state/storage_test.go b/state/storage_test.go index a5067418..e4b053b5 100644 --- a/state/storage_test.go +++ b/state/storage_test.go @@ -566,10 +566,15 @@ func TestStorageLatestEventsInRoomsPrevBatch(t *testing.T) { wantPrevBatch := wantPrevBatches[i] eventNID := 
idsToNIDs[eventIDs[i]]
 		// closest batch to the last event in the chunk (latest nid) is always the next prev batch token
-		pb, err := store.EventsTable.SelectClosestPrevBatch(roomID, eventNID)
-		if err != nil {
-			t.Fatalf("failed to SelectClosestPrevBatch: %s", err)
-		}
+		var pb string
+		_ = sqlutil.WithTransaction(store.DB, func(txn *sqlx.Tx) (err error) {
+			pb, err = store.EventsTable.SelectClosestPrevBatch(txn, roomID, eventNID)
+			if err != nil {
+				t.Fatalf("failed to SelectClosestPrevBatch: %s", err)
+			}
+			return nil
+		})
+
 		if pb != wantPrevBatch {
 			t.Fatalf("SelectClosestPrevBatch: got %v want %v", pb, wantPrevBatch)
 		}

From a6bbf484489d291c096769bc2fb4d1fdfdec2651 Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Mon, 19 Jun 2023 18:09:14 +0100
Subject: [PATCH 015/156] Reduce DB conn usage when fetching device data

slightly terrifyingly this suggests that the impl as written was not
atomically doing a get-and-update?
---
 state/device_data_table.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/state/device_data_table.go b/state/device_data_table.go
index 61853f75..1a671cf6 100644
--- a/state/device_data_table.go
+++ b/state/device_data_table.go
@@ -44,7 +44,7 @@ func NewDeviceDataTable(db *sqlx.DB) *DeviceDataTable {
 func (t *DeviceDataTable) Select(userID, deviceID string, swap bool) (result *internal.DeviceData, err error) {
 	err = sqlutil.WithTransaction(t.db, func(txn *sqlx.Tx) error {
 		var row DeviceDataRow
-		err = t.db.Get(&row, `SELECT data FROM syncv3_device_data WHERE user_id=$1 AND device_id=$2`, userID, deviceID)
+		err = txn.Get(&row, `SELECT data FROM syncv3_device_data WHERE user_id=$1 AND device_id=$2`, userID, deviceID)
 		if err != nil {
 			if err == sql.ErrNoRows {
 				// if there is no device data for this user, it's not an error.
@@ -78,7 +78,7 @@ func (t *DeviceDataTable) Select(userID, deviceID string, swap bool) (result *in
 		// the device_data table.
return nil } - _, err = t.db.Exec(`UPDATE syncv3_device_data SET data=$1 WHERE user_id=$2 AND device_id=$3`, data, userID, deviceID) + _, err = txn.Exec(`UPDATE syncv3_device_data SET data=$1 WHERE user_id=$2 AND device_id=$3`, data, userID, deviceID) return err }) return From 5e8a6912c885a0c65af7e19714f88d002a898729 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Mon, 19 Jun 2023 18:11:16 +0100 Subject: [PATCH 016/156] Another not-using-txn --- state/device_data_table.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/state/device_data_table.go b/state/device_data_table.go index 1a671cf6..dc8f2488 100644 --- a/state/device_data_table.go +++ b/state/device_data_table.go @@ -94,7 +94,7 @@ func (t *DeviceDataTable) Upsert(dd *internal.DeviceData) (pos int64, err error) err = sqlutil.WithTransaction(t.db, func(txn *sqlx.Tx) error { // select what already exists var row DeviceDataRow - err = t.db.Get(&row, `SELECT data FROM syncv3_device_data WHERE user_id=$1 AND device_id=$2`, dd.UserID, dd.DeviceID) + err = txn.Get(&row, `SELECT data FROM syncv3_device_data WHERE user_id=$1 AND device_id=$2`, dd.UserID, dd.DeviceID) if err != nil && err != sql.ErrNoRows { return err } @@ -119,7 +119,7 @@ func (t *DeviceDataTable) Upsert(dd *internal.DeviceData) (pos int64, err error) if err != nil { return err } - err = t.db.QueryRow( + err = txn.QueryRow( `INSERT INTO syncv3_device_data(user_id, device_id, data) VALUES($1,$2,$3) ON CONFLICT (user_id, device_id) DO UPDATE SET data=$3, id=nextval('syncv3_device_data_seq') RETURNING id`, dd.UserID, dd.DeviceID, data, From 4156b085f64148fe557e2ac779d6755ab05b492f Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 19 Jun 2023 18:23:00 +0100 Subject: [PATCH 017/156] comments --- v3.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/v3.go b/v3.go index a9e1c557..74ae2be2 100644 --- a/v3.go +++ b/v3.go @@ -81,6 +81,9 @@ func Setup(destHomeserver, postgresURI, secret string, opts Opts) (*handler2.Han storev2 := sync2.NewStore(postgresURI, secret) for _, db := range []*sqlx.DB{store.DB, storev2.DB} { if opts.DBMaxConns > 0 { + // https://github.com/go-sql-driver/mysql#important-settings + // "db.SetMaxIdleConns() is recommended to be set same to db.SetMaxOpenConns(). When it is smaller + // than SetMaxOpenConns(), connections can be opened and closed much more frequently than you expect." db.SetMaxOpenConns(opts.DBMaxConns) db.SetMaxIdleConns(opts.DBMaxConns) } From 5081642612061791306a778b6612b8127ed96851 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Mon, 19 Jun 2023 18:30:31 +0100 Subject: [PATCH 018/156] Reduce DB Conn usage when fetching room state --- state/storage.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/state/storage.go b/state/storage.go index 1415d3c0..a6af1c71 100644 --- a/state/storage.go +++ b/state/storage.go @@ -545,7 +545,7 @@ func (s *Storage) RoomStateAfterEventPosition(ctx context.Context, roomIDs []str if err != nil { return fmt.Errorf("failed to form sql query: %s", err) } - rows, err := s.Accumulator.db.Query(s.Accumulator.db.Rebind(query), args...) + rows, err := txn.Query(txn.Rebind(query), args...) 
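		// Querying via txn keeps this read on the connection the transaction
		// already holds. The old call via s.Accumulator.db would check a second
		// connection out of the pool mid-transaction; with SetMaxOpenConns(1),
		// as exercised by TestMaxDBConns, that second checkout blocks forever.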
if err != nil { return fmt.Errorf("failed to execute query: %s", err) } From f699eabeedea2b7e00ffd5dd6a918f88ec2f90d9 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 20 Jun 2023 16:53:04 +0100 Subject: [PATCH 019/156] Debug missing roomIDs seen at startup Sentry: https://sentry.tools.element.io/organizations/element/issues/70026/events/7080aeab1d0b402e987a18f26c1ac329/?project=56&query=is%3Aunresolved --- internal/errors.go | 12 ++++++++++-- sync3/caches/global.go | 9 +++++++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/internal/errors.go b/internal/errors.go index 38fce79f..4017e13e 100644 --- a/internal/errors.go +++ b/internal/errors.go @@ -71,10 +71,18 @@ func ExpiredSessionError() *HandlerError { // Which then produces: // // assertion failed: list is not empty -func Assert(msg string, expr bool) { +// +// An optional debugContext map can be provided. If it is present and sentry is configured, +// it is added as context to the sentry events generated for failed assertions. +func Assert(msg string, expr bool, debugContext ...map[string]interface{}) { assert(msg, expr) if !expr { - sentry.CaptureException(fmt.Errorf("assertion failed: %s", msg)) + sentry.WithScope(func(scope *sentry.Scope) { + if len(debugContext) > 0 { + scope.SetContext(SentryCtxKey, debugContext[0]) + } + sentry.CaptureException(fmt.Errorf("assertion failed: %s", msg)) + }) } } diff --git a/sync3/caches/global.go b/sync3/caches/global.go index 36c2867a..65499577 100644 --- a/sync3/caches/global.go +++ b/sync3/caches/global.go @@ -239,8 +239,13 @@ func (c *GlobalCache) Startup(roomIDToMetadata map[string]internal.RoomMetadata) sort.Strings(roomIDs) for _, roomID := range roomIDs { metadata := roomIDToMetadata[roomID] - internal.Assert("room ID is set", metadata.RoomID != "") - internal.Assert("last message timestamp exists", metadata.LastMessageTimestamp > 1) + debugContext := map[string]interface{}{ + "room_id": roomID, + "metadata.RoomID": metadata.RoomID, + "metadata.LastMessageTimeStamp": metadata.LastMessageTimestamp, + } + internal.Assert("room ID is set", metadata.RoomID != "", debugContext) + internal.Assert("last message timestamp exists", metadata.LastMessageTimestamp > 1, debugContext) c.roomIDToMetadata[roomID] = &metadata } return nil From 8b938b8f2cb3de93d4f6cbdf52c1f972f730011e Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 21 Jun 2023 11:57:22 +0100 Subject: [PATCH 020/156] make setupConnection a task to debug reports where the EnsurePolling blocks and the client times out. (Looks like the SyncLiveHandler never responds to the InitialSyncComplete pubsub msg?) --- sync3/handler/handler.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index 31870c5d..6d8cdb7b 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -302,6 +302,8 @@ func (h *SyncLiveHandler) serve(w http.ResponseWriter, req *http.Request) error // When this function returns, the connection is alive and active. 
func (h *SyncLiveHandler) setupConnection(req *http.Request, syncReq *sync3.Request, containsPos bool) (*sync3.Conn, *internal.HandlerError) { + taskCtx, task := internal.StartTask(req.Context(), "setupConnection") + defer task.End() var conn *sync3.Conn // Extract an access token accessToken, err := internal.ExtractAccessToken(req) @@ -333,6 +335,7 @@ func (h *SyncLiveHandler) setupConnection(req *http.Request, syncReq *sync3.Requ } } log := hlog.FromRequest(req).With().Str("user", token.UserID).Str("device", token.DeviceID).Logger() + internal.Logf(taskCtx, "setupConnection", "identified access token as user=%s device=%s", token.UserID, token.DeviceID) // Record the fact that we've recieved a request from this token err = h.V2Store.TokensTable.MaybeUpdateLastSeen(token, time.Now()) From f209337725e209407f1a04f33054b95b934a0550 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 21 Jun 2023 14:40:12 +0100 Subject: [PATCH 021/156] Additional debugging --- sync3/handler/ensure_polling.go | 18 +++++++++++++++++- sync3/handler/handler.go | 2 +- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/sync3/handler/ensure_polling.go b/sync3/handler/ensure_polling.go index a97f7e27..76d7cfcd 100644 --- a/sync3/handler/ensure_polling.go +++ b/sync3/handler/ensure_polling.go @@ -1,6 +1,8 @@ package handler import ( + "context" + "github.com/matrix-org/sliding-sync/internal" "github.com/matrix-org/sliding-sync/sync2" "sync" @@ -41,10 +43,13 @@ func NewEnsurePoller(notifier pubsub.Notifier) *EnsurePoller { // EnsurePolling blocks until the V2InitialSyncComplete response is received for this device. It is // the caller's responsibility to call OnInitialSyncComplete when new events arrive. -func (p *EnsurePoller) EnsurePolling(pid sync2.PollerID, tokenHash string) { +func (p *EnsurePoller) EnsurePolling(ctx context.Context, pid sync2.PollerID, tokenHash string) { + ctx, region := internal.StartSpan(ctx, "EnsurePolling") + defer region.End() p.mu.Lock() // do we need to wait? if p.pendingPolls[pid].done { + internal.Logf(ctx, "EnsurePolling", "user %s device %s already done", pid.UserID, pid.DeviceID) p.mu.Unlock() return } @@ -56,7 +61,10 @@ func (p *EnsurePoller) EnsurePolling(pid sync2.PollerID, tokenHash string) { // TODO: several times there have been problems getting the response back from the poller // we should time out here after 100s and return an error or something to kick conns into // trying again + internal.Logf(ctx, "EnsurePolling", "user %s device %s channel exits, listening for channel close", pid.UserID, pid.DeviceID) + _, r2 := internal.StartSpan(ctx, "waitForExistingChannelClose") <-ch + r2.End() return } // Make a channel to wait until we have done an initial sync @@ -74,10 +82,15 @@ func (p *EnsurePoller) EnsurePolling(pid sync2.PollerID, tokenHash string) { }) // if by some miracle the notify AND sync completes before we receive on ch then this is // still fine as recv on a closed channel will return immediately. 
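The "recv on a closed channel" remark above leans on a Go guarantee worth spelling out: a receive from a closed channel completes immediately with the zero value, so it is safe for OnInitialSyncComplete to close the channel before EnsurePolling reaches the receive. A self-contained illustration (not part of the patch):

	package main

	import "fmt"

	func main() {
		ch := make(chan struct{})
		close(ch) // close first, receive afterwards
		<-ch      // does not block: receiving from a closed channel returns immediately
		fmt.Println("no deadlock")
	}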
+ internal.Logf(ctx, "EnsurePolling", "user %s device %s just made channel, listening for channel close", pid.UserID, pid.DeviceID) + _, r2 := internal.StartSpan(ctx, "waitForNewChannelClose") <-ch + r2.End() } func (p *EnsurePoller) OnInitialSyncComplete(payload *pubsub.V2InitialSyncComplete) { + log := logger.With().Str("user", payload.UserID).Str("device", payload.DeviceID).Logger() + log.Trace().Msg("OnInitialSyncComplete: got payload") pid := sync2.PollerID{UserID: payload.UserID, DeviceID: payload.DeviceID} p.mu.Lock() defer p.mu.Unlock() @@ -86,12 +99,14 @@ func (p *EnsurePoller) OnInitialSyncComplete(payload *pubsub.V2InitialSyncComple if !ok { // This can happen when the v2 poller spontaneously starts polling even without us asking it to // e.g from the database + log.Trace().Msg("OnInitialSyncComplete: we weren't waiting for this") p.pendingPolls[pid] = pendingInfo{ done: true, } return } if pending.done { + log.Trace().Msg("OnInitialSyncComplete: already done") // nothing to do, we just got OnInitialSyncComplete called twice return } @@ -101,6 +116,7 @@ func (p *EnsurePoller) OnInitialSyncComplete(payload *pubsub.V2InitialSyncComple pending.done = true pending.ch = nil p.pendingPolls[pid] = pending + log.Trace().Msg("OnInitialSyncComplete: closing channel") close(ch) } diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index 6d8cdb7b..a42425b9 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -363,7 +363,7 @@ func (h *SyncLiveHandler) setupConnection(req *http.Request, syncReq *sync3.Requ log.Trace().Msg("checking poller exists and is running") pid := sync2.PollerID{UserID: token.UserID, DeviceID: token.DeviceID} - h.EnsurePoller.EnsurePolling(pid, token.AccessTokenHash) + h.EnsurePoller.EnsurePolling(taskCtx, pid, token.AccessTokenHash) log.Trace().Msg("poller exists and is running") // this may take a while so if the client has given up (e.g timed out) by this point, just stop. // We'll be quicker next time as the poller will already exist. From e715de183833d3e2402d0b2da2790950fd2823af Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 21 Jun 2023 15:05:59 +0100 Subject: [PATCH 022/156] Remove unused import This was presumably sausage fingers on my part --- sync3/handler/handler.go | 1 - 1 file changed, 1 deletion(-) diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index a42425b9..a810e0f1 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -1,6 +1,5 @@ package handler -import "C" import ( "context" "database/sql" From fa7cc34ad64c41ae254cdfe8ba5661f5d5bc4cee Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 21 Jun 2023 15:09:50 +0100 Subject: [PATCH 023/156] Explicitly log the `pid` --- sync3/handler/handler.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index a810e0f1..fe135560 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -360,8 +360,8 @@ func (h *SyncLiveHandler) setupConnection(req *http.Request, syncReq *sync3.Requ return nil, internal.ExpiredSessionError() } - log.Trace().Msg("checking poller exists and is running") pid := sync2.PollerID{UserID: token.UserID, DeviceID: token.DeviceID} + log.Trace().Any("pid", pid).Msg("checking poller exists and is running") h.EnsurePoller.EnsurePolling(taskCtx, pid, token.AccessTokenHash) log.Trace().Msg("poller exists and is running") // this may take a while so if the client has given up (e.g timed out) by this point, just stop. 
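Patches 020-023 share one instrumentation idea: wrap each potentially slow step in its own task or span, and log enough identifiers (user, device, pid) to correlate a stuck request with its poller. A condensed sketch of that pattern, using only the StartSpan/Logf shapes visible in the diffs above (waitForClose is a hypothetical name, not part of the patches):

	// Hypothetical distillation of the tracing pattern: give each blocking
	// channel wait its own span so slow EnsurePolling calls show up in traces.
	func waitForClose(ctx context.Context, name string, ch <-chan struct{}) {
		ctx, region := internal.StartSpan(ctx, name)
		defer region.End()
		internal.Logf(ctx, "EnsurePolling", "waiting for %s", name)
		<-ch // closed by OnInitialSyncComplete
	}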
From cc8e6d9fb04df96b2a60971e2c4abf7c5267407b Mon Sep 17 00:00:00 2001 From: David Robertson Date: Thu, 22 Jun 2023 17:36:07 +0100 Subject: [PATCH 024/156] Track the time before processing a request in particular load() and setupConnection() --- sync3/conn.go | 10 +++++----- sync3/conn_test.go | 2 +- sync3/handler/connstate.go | 19 +++++++++++++++++-- sync3/handler/handler.go | 23 ++++++++++++++++++----- 4 files changed, 41 insertions(+), 13 deletions(-) diff --git a/sync3/conn.go b/sync3/conn.go index 14ff1689..8010447b 100644 --- a/sync3/conn.go +++ b/sync3/conn.go @@ -30,7 +30,7 @@ type ConnHandler interface { // Callback which is allowed to block as long as the context is active. Return the response // to send back or an error. Errors of type *internal.HandlerError are inspected for the correct // status code to send back. - OnIncomingRequest(ctx context.Context, cid ConnID, req *Request, isInitial bool) (*Response, error) + OnIncomingRequest(ctx context.Context, cid ConnID, req *Request, isInitial bool, start time.Time) (*Response, error) OnUpdate(ctx context.Context, update caches.Update) Destroy() Alive() bool @@ -88,7 +88,7 @@ func (c *Conn) OnUpdate(ctx context.Context, update caches.Update) { // upwards but will NOT be logged to Sentry (neither here nor by the caller). Errors // should be reported to Sentry as close as possible to the point of creating the error, // to provide the best possible Sentry traceback. -func (c *Conn) tryRequest(ctx context.Context, req *Request) (res *Response, err error) { +func (c *Conn) tryRequest(ctx context.Context, req *Request, start time.Time) (res *Response, err error) { // TODO: include useful information from the request in the sentry hub/context // Might be better done in the caller though? defer func() { @@ -116,7 +116,7 @@ func (c *Conn) tryRequest(ctx context.Context, req *Request) (res *Response, err ctx, task := internal.StartTask(ctx, taskType) defer task.End() internal.Logf(ctx, "connstate", "starting user=%v device=%v pos=%v", c.UserID, c.ConnID.DeviceID, req.pos) - return c.handler.OnIncomingRequest(ctx, c.ConnID, req, req.pos == 0) + return c.handler.OnIncomingRequest(ctx, c.ConnID, req, req.pos == 0, start) } func (c *Conn) isOutstanding(pos int64) bool { @@ -132,7 +132,7 @@ func (c *Conn) isOutstanding(pos int64) bool { // If an error is returned, it will be logged by the caller and transmitted to the // client. It will NOT be reported to Sentry---this should happen as close as possible // to the creation of the error (or else Sentry cannot provide a meaningful traceback.) 
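The comment above encodes a convention the sketch below makes explicit: report an error to Sentry once, where it is created, and let callers wrap and log it without re-capturing. doWork and step are hypothetical; only GetSentryHubFromContextOrDefault appears in this series.

	// Hedged sketch of "capture close to the point of creation".
	func doWork(ctx context.Context, step func() error) error {
		if err := step(); err != nil {
			internal.GetSentryHubFromContextOrDefault(ctx).CaptureException(err)
			return fmt.Errorf("doWork: %w", err) // callers log this, but do not re-report it
		}
		return nil
	}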
-func (c *Conn) OnIncomingRequest(ctx context.Context, req *Request) (resp *Response, herr *internal.HandlerError) { +func (c *Conn) OnIncomingRequest(ctx context.Context, req *Request, start time.Time) (resp *Response, herr *internal.HandlerError) { c.cancelOutstandingRequestMu.Lock() if c.cancelOutstandingRequest != nil { c.cancelOutstandingRequest() @@ -217,7 +217,7 @@ func (c *Conn) OnIncomingRequest(ctx context.Context, req *Request) (resp *Respo req.SetTimeoutMSecs(1) } - resp, err := c.tryRequest(ctx, req) + resp, err := c.tryRequest(ctx, req, start) if err != nil { herr, ok := err.(*internal.HandlerError) if !ok { diff --git a/sync3/conn_test.go b/sync3/conn_test.go index a0be14a9..ef12a214 100644 --- a/sync3/conn_test.go +++ b/sync3/conn_test.go @@ -16,7 +16,7 @@ type connHandlerMock struct { fn func(ctx context.Context, cid ConnID, req *Request, isInitial bool) (*Response, error) } -func (c *connHandlerMock) OnIncomingRequest(ctx context.Context, cid ConnID, req *Request, init bool) (*Response, error) { +func (c *connHandlerMock) OnIncomingRequest(ctx context.Context, cid ConnID, req *Request, init bool, start time.Time) (*Response, error) { return c.fn(ctx, cid, req, init) } func (c *connHandlerMock) UserID() string { diff --git a/sync3/handler/connstate.go b/sync3/handler/connstate.go index 6c8c1a2e..fa621494 100644 --- a/sync3/handler/connstate.go +++ b/sync3/handler/connstate.go @@ -45,12 +45,13 @@ type ConnState struct { joinChecker JoinChecker extensionsHandler extensions.HandlerInterface + setupHistogramVec *prometheus.HistogramVec processHistogramVec *prometheus.HistogramVec } func NewConnState( userID, deviceID string, userCache *caches.UserCache, globalCache *caches.GlobalCache, - ex extensions.HandlerInterface, joinChecker JoinChecker, histVec *prometheus.HistogramVec, + ex extensions.HandlerInterface, joinChecker JoinChecker, setupHistVec *prometheus.HistogramVec, histVec *prometheus.HistogramVec, maxPendingEventUpdates int, ) *ConnState { cs := &ConnState{ @@ -65,6 +66,7 @@ func NewConnState( extensionsHandler: ex, joinChecker: joinChecker, lazyCache: NewLazyCache(), + setupHistogramVec: setupHistVec, processHistogramVec: histVec, } cs.live = &connStateLive{ @@ -148,13 +150,15 @@ func (s *ConnState) load(ctx context.Context, req *sync3.Request) error { } // OnIncomingRequest is guaranteed to be called sequentially (it's protected by a mutex in conn.go) -func (s *ConnState) OnIncomingRequest(ctx context.Context, cid sync3.ConnID, req *sync3.Request, isInitial bool) (*sync3.Response, error) { +func (s *ConnState) OnIncomingRequest(ctx context.Context, cid sync3.ConnID, req *sync3.Request, isInitial bool, start time.Time) (*sync3.Response, error) { if s.loadPosition == -1 { // load() needs no ctx so drop it _, region := internal.StartSpan(ctx, "load") s.load(ctx, req) region.End() } + setupTime := time.Since(start) + s.trackSetupDuration(setupTime, isInitial) return s.onIncomingRequest(ctx, req, isInitial) } @@ -577,6 +581,17 @@ func (s *ConnState) getInitialRoomData(ctx context.Context, roomSub sync3.RoomSu return rooms } +func (s *ConnState) trackSetupDuration(dur time.Duration, isInitial bool) { + if s.setupHistogramVec == nil { + return + } + val := "0" + if isInitial { + val = "1" + } + s.setupHistogramVec.WithLabelValues(val).Observe(float64(dur.Seconds())) +} + func (s *ConnState) trackProcessDuration(dur time.Duration, isInitial bool) { if s.processHistogramVec == nil { return diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index 
c924a2d3..3db31daa 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -60,8 +60,9 @@ type SyncLiveHandler struct { GlobalCache *caches.GlobalCache maxPendingEventUpdates int - numConns prometheus.Gauge - histVec *prometheus.HistogramVec + numConns prometheus.Gauge + setupHistVec *prometheus.HistogramVec + histVec *prometheus.HistogramVec } func NewSync3Handler( @@ -130,6 +131,9 @@ func (h *SyncLiveHandler) Teardown() { if h.numConns != nil { prometheus.Unregister(h.numConns) } + if h.setupHistVec != nil { + prometheus.Unregister(h.setupHistVec) + } if h.histVec != nil { prometheus.Unregister(h.histVec) } @@ -149,14 +153,22 @@ func (h *SyncLiveHandler) addPrometheusMetrics() { Name: "num_active_conns", Help: "Number of active sliding sync connections.", }) + h.setupHistVec = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "sliding_sync", + Subsystem: "api", + Name: "setup_duration_secs", + Help: "Time taken in seconds after receiving a request before we start calculating a sliding sync response.", + Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10}, + }, []string{"initial"}) h.histVec = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "sliding_sync", Subsystem: "api", Name: "process_duration_secs", - Help: "Time taken in seconds for the sliding sync response to calculated, excludes long polling", + Help: "Time taken in seconds for the sliding sync response to be calculated, excludes long polling", Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10}, }, []string{"initial"}) prometheus.MustRegister(h.numConns) + prometheus.MustRegister(h.setupHistVec) prometheus.MustRegister(h.histVec) } @@ -184,6 +196,7 @@ func (h *SyncLiveHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) { // Entry point for sync v3 func (h *SyncLiveHandler) serve(w http.ResponseWriter, req *http.Request) error { + start := time.Now() var requestBody sync3.Request if req.ContentLength != 0 { defer req.Body.Close() @@ -251,7 +264,7 @@ func (h *SyncLiveHandler) serve(w http.ResponseWriter, req *http.Request) error requestBody.SetTimeoutMSecs(timeout) log.Trace().Int("timeout", timeout).Msg("recv") - resp, herr := conn.OnIncomingRequest(req.Context(), &requestBody) + resp, herr := conn.OnIncomingRequest(req.Context(), &requestBody, start) if herr != nil { logErrorOrWarning("failed to OnIncomingRequest", herr) return herr @@ -391,7 +404,7 @@ func (h *SyncLiveHandler) setupConnection(req *http.Request, syncReq *sync3.Requ // to check for an existing connection though, as it's possible for the client to call /sync // twice for a new connection. 
conn, created := h.ConnMap.CreateConn(connID, func() sync3.ConnHandler { - return NewConnState(token.UserID, token.DeviceID, userCache, h.GlobalCache, h.Extensions, h.Dispatcher, h.histVec, h.maxPendingEventUpdates) + return NewConnState(token.UserID, token.DeviceID, userCache, h.GlobalCache, h.Extensions, h.Dispatcher, h.setupHistVec, h.histVec, h.maxPendingEventUpdates) }) if created { log.Info().Msg("created new connection") From 048a5ac902b513d087a90b313635de315fc82127 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Thu, 22 Jun 2023 17:43:56 +0100 Subject: [PATCH 025/156] Track the number of slow requests --- sync3/handler/handler.go | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index 3db31daa..592c26a1 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -63,6 +63,7 @@ type SyncLiveHandler struct { numConns prometheus.Gauge setupHistVec *prometheus.HistogramVec histVec *prometheus.HistogramVec + slowReqs prometheus.Counter } func NewSync3Handler( @@ -137,6 +138,9 @@ func (h *SyncLiveHandler) Teardown() { if h.histVec != nil { prometheus.Unregister(h.histVec) } + if h.slowReqs != nil { + prometheus.Unregister(h.slowReqs) + } } func (h *SyncLiveHandler) updateMetrics() { @@ -167,9 +171,16 @@ func (h *SyncLiveHandler) addPrometheusMetrics() { Help: "Time taken in seconds for the sliding sync response to be calculated, excludes long polling", Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10}, }, []string{"initial"}) + h.slowReqs = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "sliding_sync", + Subsystem: "api", + Name: "slow_requests", + Help: "Counter of slow (>=50s) requests, initial or otherwise.", + }) prometheus.MustRegister(h.numConns) prometheus.MustRegister(h.setupHistVec) prometheus.MustRegister(h.histVec) + prometheus.MustRegister(h.slowReqs) } func (h *SyncLiveHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) { @@ -197,6 +208,11 @@ func (h *SyncLiveHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) { // Entry point for sync v3 func (h *SyncLiveHandler) serve(w http.ResponseWriter, req *http.Request) error { start := time.Now() + defer func() { + if time.Since(start) > 50*time.Second { + h.slowReqs.Add(1.0) + } + }() var requestBody sync3.Request if req.ContentLength != 0 { defer req.Body.Close() From 1d46a3037bdaacb71aa9e7597e9bd4bbb197df31 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Thu, 22 Jun 2023 17:49:15 +0100 Subject: [PATCH 026/156] Update tests --- sync3/conn_test.go | 38 ++++++++++++++++----------------- sync3/handler/connstate_test.go | 32 +++++++++++++-------------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/sync3/conn_test.go b/sync3/conn_test.go index ef12a214..c326938c 100644 --- a/sync3/conn_test.go +++ b/sync3/conn_test.go @@ -47,7 +47,7 @@ func TestConn(t *testing.T) { // initial request resp, err := c.OnIncomingRequest(ctx, &Request{ pos: 0, - }) + }, time.Now()) assertNoError(t, err) assertPos(t, resp.Pos, 1) assertInt(t, resp.Lists["a"].Count, 101) @@ -55,14 +55,14 @@ func TestConn(t *testing.T) { // happy case, pos=1 resp, err = c.OnIncomingRequest(ctx, &Request{ pos: 1, - }) + }, time.Now()) assertPos(t, resp.Pos, 2) assertInt(t, resp.Lists["a"].Count, 102) assertNoError(t, err) // bogus position returns a 400 _, err = c.OnIncomingRequest(ctx, &Request{ pos: 31415, - }) + }, time.Now()) if err == nil { t.Fatalf("expected error, got none") } @@ -106,7 +106,7 @@ func 
TestConnBlocking(t *testing.T) { Sort: []string{"hi"}, }, }, - }) + }, time.Now()) }() go func() { defer wg.Done() @@ -118,7 +118,7 @@ func TestConnBlocking(t *testing.T) { Sort: []string{"hi2"}, }, }, - }) + }, time.Now()) }() go func() { wg.Wait() @@ -148,18 +148,18 @@ func TestConnRetries(t *testing.T) { }, }}, nil }}) - resp, err := c.OnIncomingRequest(ctx, &Request{}) + resp, err := c.OnIncomingRequest(ctx, &Request{}, time.Now()) assertPos(t, resp.Pos, 1) assertInt(t, resp.Lists["a"].Count, 20) assertInt(t, callCount, 1) assertNoError(t, err) - resp, err = c.OnIncomingRequest(ctx, &Request{pos: 1}) + resp, err = c.OnIncomingRequest(ctx, &Request{pos: 1}, time.Now()) assertPos(t, resp.Pos, 2) assertInt(t, resp.Lists["a"].Count, 20) assertInt(t, callCount, 2) assertNoError(t, err) // retry! Shouldn't invoke handler again - resp, err = c.OnIncomingRequest(ctx, &Request{pos: 1}) + resp, err = c.OnIncomingRequest(ctx, &Request{pos: 1}, time.Now()) assertPos(t, resp.Pos, 2) assertInt(t, resp.Lists["a"].Count, 20) assertInt(t, callCount, 2) // this doesn't increment @@ -170,7 +170,7 @@ func TestConnRetries(t *testing.T) { "a": { Sort: []string{SortByName}, }, - }}) + }}, time.Now()) assertPos(t, resp.Pos, 2) assertInt(t, resp.Lists["a"].Count, 20) assertInt(t, callCount, 3) // this doesn't increment @@ -191,25 +191,25 @@ func TestConnBufferRes(t *testing.T) { }, }}, nil }}) - resp, err := c.OnIncomingRequest(ctx, &Request{}) + resp, err := c.OnIncomingRequest(ctx, &Request{}, time.Now()) assertNoError(t, err) assertPos(t, resp.Pos, 1) assertInt(t, resp.Lists["a"].Count, 1) assertInt(t, callCount, 1) - resp, err = c.OnIncomingRequest(ctx, &Request{pos: 1}) + resp, err = c.OnIncomingRequest(ctx, &Request{pos: 1}, time.Now()) assertNoError(t, err) assertPos(t, resp.Pos, 2) assertInt(t, resp.Lists["a"].Count, 2) assertInt(t, callCount, 2) // retry with modified request data that shouldn't prompt data to be returned. // should invoke handler again! - resp, err = c.OnIncomingRequest(ctx, &Request{pos: 1, UnsubscribeRooms: []string{"a"}}) + resp, err = c.OnIncomingRequest(ctx, &Request{pos: 1, UnsubscribeRooms: []string{"a"}}, time.Now()) assertNoError(t, err) assertPos(t, resp.Pos, 2) assertInt(t, resp.Lists["a"].Count, 2) assertInt(t, callCount, 3) // this DOES increment, the response is buffered and not returned yet. 
// retry with same request body, so should NOT invoke handler again and return buffered response - resp, err = c.OnIncomingRequest(ctx, &Request{pos: 2, UnsubscribeRooms: []string{"a"}}) + resp, err = c.OnIncomingRequest(ctx, &Request{pos: 2, UnsubscribeRooms: []string{"a"}}, time.Now()) assertNoError(t, err) assertPos(t, resp.Pos, 3) assertInt(t, resp.Lists["a"].Count, 3) @@ -228,7 +228,7 @@ func TestConnErrors(t *testing.T) { // random errors = 500 errCh <- errors.New("oops") - _, herr := c.OnIncomingRequest(ctx, &Request{}) + _, herr := c.OnIncomingRequest(ctx, &Request{}, time.Now()) if herr.StatusCode != 500 { t.Fatalf("random errors should be status 500, got %d", herr.StatusCode) } @@ -237,7 +237,7 @@ func TestConnErrors(t *testing.T) { StatusCode: 400, Err: errors.New("no way!"), } - _, herr = c.OnIncomingRequest(ctx, &Request{}) + _, herr = c.OnIncomingRequest(ctx, &Request{}, time.Now()) if herr.StatusCode != 400 { t.Fatalf("expected status 400, got %d", herr.StatusCode) } @@ -258,7 +258,7 @@ func TestConnErrorsNoCache(t *testing.T) { } }}) // errors should not be cached - resp, herr := c.OnIncomingRequest(ctx, &Request{}) + resp, herr := c.OnIncomingRequest(ctx, &Request{}, time.Now()) if herr != nil { t.Fatalf("expected no error, got %+v", herr) } @@ -267,12 +267,12 @@ func TestConnErrorsNoCache(t *testing.T) { StatusCode: 400, Err: errors.New("no way!"), } - _, herr = c.OnIncomingRequest(ctx, &Request{pos: resp.PosInt()}) + _, herr = c.OnIncomingRequest(ctx, &Request{pos: resp.PosInt()}, time.Now()) if herr.StatusCode != 400 { t.Fatalf("expected status 400, got %d", herr.StatusCode) } // but doing the exact same request should now work - _, herr = c.OnIncomingRequest(ctx, &Request{pos: resp.PosInt()}) + _, herr = c.OnIncomingRequest(ctx, &Request{pos: resp.PosInt()}, time.Now()) if herr != nil { t.Fatalf("expected no error, got %+v", herr) } @@ -361,7 +361,7 @@ func TestConnBufferRememberInflight(t *testing.T) { var err *internal.HandlerError for i, step := range steps { t.Logf("Executing step %d", i) - resp, err = c.OnIncomingRequest(ctx, step.req) + resp, err = c.OnIncomingRequest(ctx, step.req, time.Now()) if !step.wantErr { assertNoError(t, err) } diff --git a/sync3/handler/connstate_test.go b/sync3/handler/connstate_test.go index 9e8c2d72..2b06d0fe 100644 --- a/sync3/handler/connstate_test.go +++ b/sync3/handler/connstate_test.go @@ -107,7 +107,7 @@ func TestConnStateInitial(t *testing.T) { } return result } - cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, 1000) + cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000) if userID != cs.UserID() { t.Fatalf("UserID returned wrong value, got %v want %v", cs.UserID(), userID) } @@ -118,7 +118,7 @@ func TestConnStateInitial(t *testing.T) { {0, 9}, }), }}, - }, false) + }, false, time.Now()) if err != nil { t.Fatalf("OnIncomingRequest returned error : %s", err) } @@ -168,7 +168,7 @@ func TestConnStateInitial(t *testing.T) { {0, 9}, }), }}, - }, false) + }, false, time.Now()) if err != nil { t.Fatalf("OnIncomingRequest returned error : %s", err) } @@ -206,7 +206,7 @@ func TestConnStateInitial(t *testing.T) { {0, 9}, }), }}, - }, false) + }, false, time.Now()) if err != nil { t.Fatalf("OnIncomingRequest returned error : %s", err) } @@ -272,7 +272,7 @@ func TestConnStateMultipleRanges(t *testing.T) { userCache.LazyRoomDataOverride = mockLazyRoomOverride dispatcher.Register(context.Background(), 
userCache.UserID, userCache) dispatcher.Register(context.Background(), sync3.DispatcherAllUsers, globalCache) - cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, 1000) + cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000) // request first page res, err := cs.OnIncomingRequest(context.Background(), ConnID, &sync3.Request{ @@ -282,7 +282,7 @@ func TestConnStateMultipleRanges(t *testing.T) { {0, 2}, }), }}, - }, false) + }, false, time.Now()) if err != nil { t.Fatalf("OnIncomingRequest returned error : %s", err) } @@ -308,7 +308,7 @@ func TestConnStateMultipleRanges(t *testing.T) { {0, 2}, {4, 6}, }), }}, - }, false) + }, false, time.Now()) if err != nil { t.Fatalf("OnIncomingRequest returned error : %s", err) } @@ -343,7 +343,7 @@ func TestConnStateMultipleRanges(t *testing.T) { {0, 2}, {4, 6}, }), }}, - }, false) + }, false, time.Now()) if err != nil { t.Fatalf("OnIncomingRequest returned error : %s", err) } @@ -383,7 +383,7 @@ func TestConnStateMultipleRanges(t *testing.T) { {0, 2}, {4, 6}, }), }}, - }, false) + }, false, time.Now()) if err != nil { t.Fatalf("OnIncomingRequest returned error : %s", err) } @@ -451,7 +451,7 @@ func TestBumpToOutsideRange(t *testing.T) { userCache.LazyRoomDataOverride = mockLazyRoomOverride dispatcher.Register(context.Background(), userCache.UserID, userCache) dispatcher.Register(context.Background(), sync3.DispatcherAllUsers, globalCache) - cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, 1000) + cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000) // Ask for A,B res, err := cs.OnIncomingRequest(context.Background(), ConnID, &sync3.Request{ Lists: map[string]sync3.RequestList{"a": { @@ -460,7 +460,7 @@ func TestBumpToOutsideRange(t *testing.T) { {0, 1}, }), }}, - }, false) + }, false, time.Now()) if err != nil { t.Fatalf("OnIncomingRequest returned error : %s", err) } @@ -495,7 +495,7 @@ func TestBumpToOutsideRange(t *testing.T) { {0, 1}, }), }}, - }, false) + }, false, time.Now()) if err != nil { t.Fatalf("OnIncomingRequest returned error : %s", err) } @@ -562,7 +562,7 @@ func TestConnStateRoomSubscriptions(t *testing.T) { } dispatcher.Register(context.Background(), userCache.UserID, userCache) dispatcher.Register(context.Background(), sync3.DispatcherAllUsers, globalCache) - cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, 1000) + cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000) // subscribe to room D res, err := cs.OnIncomingRequest(context.Background(), ConnID, &sync3.Request{ RoomSubscriptions: map[string]sync3.RoomSubscription{ @@ -576,7 +576,7 @@ func TestConnStateRoomSubscriptions(t *testing.T) { {0, 1}, }), }}, - }, false) + }, false, time.Now()) if err != nil { t.Fatalf("OnIncomingRequest returned error : %s", err) } @@ -630,7 +630,7 @@ func TestConnStateRoomSubscriptions(t *testing.T) { {0, 1}, }), }}, - }, false) + }, false, time.Now()) if err != nil { t.Fatalf("OnIncomingRequest returned error : %s", err) } @@ -664,7 +664,7 @@ func TestConnStateRoomSubscriptions(t *testing.T) { {0, 1}, }), }}, - }, false) + }, false, time.Now()) if err != nil { t.Fatalf("OnIncomingRequest returned error : %s", err) } From 2829a7ae455f5f5400a83f0341f3fc59073b79c3 Mon Sep 17 00:00:00 
2001 From: David Robertson Date: Fri, 23 Jun 2023 17:35:39 +0100 Subject: [PATCH 027/156] Log device ID after requests --- internal/context.go | 7 ++++++- sync3/handler/handler.go | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/internal/context.go b/internal/context.go index de8cae9c..18480626 100644 --- a/internal/context.go +++ b/internal/context.go @@ -16,6 +16,7 @@ var ( // logging metadata for a single request type data struct { userID string + deviceID string since int64 next int64 numRooms int @@ -37,13 +38,14 @@ func RequestContext(ctx context.Context) context.Context { } // add the user ID to this request context. Need to have called RequestContext first. -func SetRequestContextUserID(ctx context.Context, userID string) { +func SetRequestContextUserID(ctx context.Context, userID, deviceID string) { d := ctx.Value(ctxData) if d == nil { return } da := d.(*data) da.userID = userID + da.deviceID = deviceID if hub := sentry.GetHubFromContext(ctx); hub != nil { sentry.ConfigureScope(func(scope *sentry.Scope) { scope.SetUser(sentry.User{Username: userID}) @@ -79,6 +81,9 @@ func DecorateLogger(ctx context.Context, l *zerolog.Event) *zerolog.Event { if da.userID != "" { l = l.Str("u", da.userID) } + if da.deviceID != "" { + l = l.Str("dev", da.deviceID) + } if da.since >= 0 { l = l.Int64("p", da.since) } diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index 592c26a1..4e57b528 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -263,7 +263,7 @@ func (h *SyncLiveHandler) serve(w http.ResponseWriter, req *http.Request) error return herr } requestBody.SetPos(cpos) - internal.SetRequestContextUserID(req.Context(), conn.UserID) + internal.SetRequestContextUserID(req.Context(), conn.UserID, conn.DeviceID) log := hlog.FromRequest(req).With().Str("user", conn.UserID).Int64("pos", cpos).Logger() var timeout int From f4e935c948d06a22a4e2df2457066e78764463a5 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Fri, 23 Jun 2023 17:51:31 +0100 Subject: [PATCH 028/156] Record user and device on context ASAP --- sync3/handler/handler.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index 4e57b528..52fa3aaf 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -263,7 +263,6 @@ func (h *SyncLiveHandler) serve(w http.ResponseWriter, req *http.Request) error return herr } requestBody.SetPos(cpos) - internal.SetRequestContextUserID(req.Context(), conn.UserID, conn.DeviceID) log := hlog.FromRequest(req).With().Str("user", conn.UserID).Int64("pos", cpos).Logger() var timeout int @@ -363,6 +362,7 @@ func (h *SyncLiveHandler) setupConnection(req *http.Request, syncReq *sync3.Requ } } log := hlog.FromRequest(req).With().Str("user", token.UserID).Str("device", token.DeviceID).Logger() + internal.SetRequestContextUserID(req.Context(), token.UserID, token.DeviceID) // Record the fact that we've recieved a request from this token err = h.V2Store.TokensTable.MaybeUpdateLastSeen(token, time.Now()) From b47ebaabe71e356900c51a0a5b7aae6787f93fc6 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Fri, 23 Jun 2023 17:55:07 +0100 Subject: [PATCH 029/156] Log warning for slow requests --- sync3/handler/handler.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index 52fa3aaf..50330fcc 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -209,8 +209,10 @@ func (h *SyncLiveHandler) 
ServeHTTP(w http.ResponseWriter, req *http.Request) { func (h *SyncLiveHandler) serve(w http.ResponseWriter, req *http.Request) error { start := time.Now() defer func() { - if time.Since(start) > 50*time.Second { + dur := time.Since(start) + if dur > 50*time.Second { h.slowReqs.Add(1.0) + internal.DecorateLogger(req.Context(), log.Warn()).Dur("duration", dur).Msg("slow request") } }() var requestBody sync3.Request From a8d4f7a35d66a0d6d000fca240cc9ff295b9f0f7 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 21 Jun 2023 14:40:12 +0100 Subject: [PATCH 030/156] Additional debugging --- sync3/handler/ensure_polling.go | 18 +++++++++++++++++- sync3/handler/handler.go | 2 +- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/sync3/handler/ensure_polling.go b/sync3/handler/ensure_polling.go index a97f7e27..76d7cfcd 100644 --- a/sync3/handler/ensure_polling.go +++ b/sync3/handler/ensure_polling.go @@ -1,6 +1,8 @@ package handler import ( + "context" + "github.com/matrix-org/sliding-sync/internal" "github.com/matrix-org/sliding-sync/sync2" "sync" @@ -41,10 +43,13 @@ func NewEnsurePoller(notifier pubsub.Notifier) *EnsurePoller { // EnsurePolling blocks until the V2InitialSyncComplete response is received for this device. It is // the caller's responsibility to call OnInitialSyncComplete when new events arrive. -func (p *EnsurePoller) EnsurePolling(pid sync2.PollerID, tokenHash string) { +func (p *EnsurePoller) EnsurePolling(ctx context.Context, pid sync2.PollerID, tokenHash string) { + ctx, region := internal.StartSpan(ctx, "EnsurePolling") + defer region.End() p.mu.Lock() // do we need to wait? if p.pendingPolls[pid].done { + internal.Logf(ctx, "EnsurePolling", "user %s device %s already done", pid.UserID, pid.DeviceID) p.mu.Unlock() return } @@ -56,7 +61,10 @@ func (p *EnsurePoller) EnsurePolling(pid sync2.PollerID, tokenHash string) { // TODO: several times there have been problems getting the response back from the poller // we should time out here after 100s and return an error or something to kick conns into // trying again + internal.Logf(ctx, "EnsurePolling", "user %s device %s channel exits, listening for channel close", pid.UserID, pid.DeviceID) + _, r2 := internal.StartSpan(ctx, "waitForExistingChannelClose") <-ch + r2.End() return } // Make a channel to wait until we have done an initial sync @@ -74,10 +82,15 @@ func (p *EnsurePoller) EnsurePolling(pid sync2.PollerID, tokenHash string) { }) // if by some miracle the notify AND sync completes before we receive on ch then this is // still fine as recv on a closed channel will return immediately. 
+ internal.Logf(ctx, "EnsurePolling", "user %s device %s just made channel, listening for channel close", pid.UserID, pid.DeviceID) + _, r2 := internal.StartSpan(ctx, "waitForNewChannelClose") <-ch + r2.End() } func (p *EnsurePoller) OnInitialSyncComplete(payload *pubsub.V2InitialSyncComplete) { + log := logger.With().Str("user", payload.UserID).Str("device", payload.DeviceID).Logger() + log.Trace().Msg("OnInitialSyncComplete: got payload") pid := sync2.PollerID{UserID: payload.UserID, DeviceID: payload.DeviceID} p.mu.Lock() defer p.mu.Unlock() @@ -86,12 +99,14 @@ func (p *EnsurePoller) OnInitialSyncComplete(payload *pubsub.V2InitialSyncComple if !ok { // This can happen when the v2 poller spontaneously starts polling even without us asking it to // e.g from the database + log.Trace().Msg("OnInitialSyncComplete: we weren't waiting for this") p.pendingPolls[pid] = pendingInfo{ done: true, } return } if pending.done { + log.Trace().Msg("OnInitialSyncComplete: already done") // nothing to do, we just got OnInitialSyncComplete called twice return } @@ -101,6 +116,7 @@ func (p *EnsurePoller) OnInitialSyncComplete(payload *pubsub.V2InitialSyncComple pending.done = true pending.ch = nil p.pendingPolls[pid] = pending + log.Trace().Msg("OnInitialSyncComplete: closing channel") close(ch) } diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index 50330fcc..2350d9a4 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -392,7 +392,7 @@ func (h *SyncLiveHandler) setupConnection(req *http.Request, syncReq *sync3.Requ log.Trace().Msg("checking poller exists and is running") pid := sync2.PollerID{UserID: token.UserID, DeviceID: token.DeviceID} - h.EnsurePoller.EnsurePolling(pid, token.AccessTokenHash) + h.EnsurePoller.EnsurePolling(taskCtx, pid, token.AccessTokenHash) log.Trace().Msg("poller exists and is running") // this may take a while so if the client has given up (e.g timed out) by this point, just stop. // We'll be quicker next time as the poller will already exist. From a78612e64acfdca0d84ebb34dec3b99d7fd666f0 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Fri, 23 Jun 2023 19:06:43 +0100 Subject: [PATCH 031/156] Fix bad backport --- sync3/handler/handler.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index 2350d9a4..6717e99a 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -392,7 +392,7 @@ func (h *SyncLiveHandler) setupConnection(req *http.Request, syncReq *sync3.Requ log.Trace().Msg("checking poller exists and is running") pid := sync2.PollerID{UserID: token.UserID, DeviceID: token.DeviceID} - h.EnsurePoller.EnsurePolling(taskCtx, pid, token.AccessTokenHash) + h.EnsurePoller.EnsurePolling(req.Context(), pid, token.AccessTokenHash) log.Trace().Msg("poller exists and is running") // this may take a while so if the client has given up (e.g timed out) by this point, just stop. // We'll be quicker next time as the poller will already exist. From f36c038cf87994fc53229c7e7fa2f4e2b09c8560 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 26 Jun 2023 21:04:02 -0700 Subject: [PATCH 032/156] Rate limit pubsub.V2DeviceData updates to be at most 1 per second The db writes are still instant, but the notifications are now delayed by up to 1 second, in order to not swamp the pubsub channels. 
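Before the diff, a distilled sketch of the batching idea just described: database writes mark a dirty set immediately, and a ticker flushes the set at most once per interval. The types below are illustrative only (assuming the standard sync and time packages); the real implementation is the DeviceDataTicker that follows.

	// Illustrative coalescer: writes record keys instantly, notifications
	// are rate limited to one per interval.
	type coalescer struct {
		mu    sync.Mutex
		dirty map[string]struct{}
	}

	func (c *coalescer) mark(key string) { // called on every DB write; cheap
		c.mu.Lock()
		defer c.mu.Unlock()
		c.dirty[key] = struct{}{}
	}

	func (c *coalescer) run(interval time.Duration, notify func([]string)) {
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for range ticker.C { // at most one notification per interval
			c.mu.Lock()
			keys := make([]string, 0, len(c.dirty))
			for k := range c.dirty {
				keys = append(keys, k)
			}
			c.dirty = make(map[string]struct{})
			c.mu.Unlock()
			if len(keys) > 0 {
				notify(keys)
			}
		}
	}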
--- pubsub/v2.go | 4 +- sync2/device_data_ticker.go | 90 ++++++++++++++++++++++ sync2/device_data_ticker_test.go | 124 +++++++++++++++++++++++++++++++ sync2/handler2/handler.go | 27 +++++-- sync3/handler/handler.go | 11 ++- v3.go | 4 +- 6 files changed, 246 insertions(+), 14 deletions(-) create mode 100644 sync2/device_data_ticker.go create mode 100644 sync2/device_data_ticker_test.go diff --git a/pubsub/v2.go b/pubsub/v2.go index 2ed379f4..7dfb01e0 100644 --- a/pubsub/v2.go +++ b/pubsub/v2.go @@ -91,9 +91,7 @@ type V2InitialSyncComplete struct { func (*V2InitialSyncComplete) Type() string { return "V2InitialSyncComplete" } type V2DeviceData struct { - UserID string - DeviceID string - Pos int64 + UserIDToDeviceIDs map[string][]string } func (*V2DeviceData) Type() string { return "V2DeviceData" } diff --git a/sync2/device_data_ticker.go b/sync2/device_data_ticker.go new file mode 100644 index 00000000..7d77aaea --- /dev/null +++ b/sync2/device_data_ticker.go @@ -0,0 +1,90 @@ +package sync2 + +import ( + "sync" + "time" + + "github.com/matrix-org/sliding-sync/pubsub" +) + +// This struct remembers user+device IDs to notify for then periodically +// emits them all to the caller. Use to rate limit the frequency of device list +// updates. +type DeviceDataTicker struct { + // data structures to periodically notify downstream about device data updates + // The ticker controls the frequency of updates. The done channel is used to stop ticking + // and clean up the goroutine. The notify map contains the values to notify for. + ticker *time.Ticker + done chan struct{} + notifyMap *sync.Map // map of PollerID to bools, unwrapped when notifying + fn func(payload *pubsub.V2DeviceData) +} + +// Create a new device data ticker, which batches calls to Remember and invokes a callback every +// d duration. If d is 0, no batching is performed and the callback is invoked synchronously, which +// is useful for testing. +func NewDeviceDataTicker(d time.Duration) *DeviceDataTicker { + ddt := &DeviceDataTicker{ + done: make(chan struct{}), + notifyMap: &sync.Map{}, + } + if d != 0 { + ddt.ticker = time.NewTicker(d) + } + return ddt +} + +// Stop ticking. +func (t *DeviceDataTicker) Stop() { + if t.ticker != nil { + t.ticker.Stop() + } + close(t.done) +} + +// Set the function which should be called when the tick happens. +func (t *DeviceDataTicker) SetCallback(fn func(payload *pubsub.V2DeviceData)) { + t.fn = fn +} + +// Remember this user/device ID, and emit it later on. +func (t *DeviceDataTicker) Remember(pid PollerID) { + t.notifyMap.Store(pid, true) + if t.ticker == nil { + t.emitUpdate() + } +} + +func (t *DeviceDataTicker) emitUpdate() { + var p pubsub.V2DeviceData + p.UserIDToDeviceIDs = make(map[string][]string) + // populate the pubsub payload + t.notifyMap.Range(func(key, value any) bool { + pid := key.(PollerID) + devices := p.UserIDToDeviceIDs[pid.UserID] + devices = append(devices, pid.DeviceID) + p.UserIDToDeviceIDs[pid.UserID] = devices + // clear the map of this value + t.notifyMap.Delete(key) + return true // keep enumerating + }) + // notify if we have entries + if len(p.UserIDToDeviceIDs) > 0 { + t.fn(&p) + } +} + +// Blocks forever, ticking until Stop() is called. 
+func (t *DeviceDataTicker) Run() { + if t.ticker == nil { + return + } + for { + select { + case <-t.done: + return + case <-t.ticker.C: + t.emitUpdate() + } + } +} diff --git a/sync2/device_data_ticker_test.go b/sync2/device_data_ticker_test.go new file mode 100644 index 00000000..470e5378 --- /dev/null +++ b/sync2/device_data_ticker_test.go @@ -0,0 +1,124 @@ +package sync2 + +import ( + "reflect" + "sort" + "sync" + "testing" + "time" + + "github.com/matrix-org/sliding-sync/pubsub" +) + +func TestDeviceTickerBasic(t *testing.T) { + duration := time.Millisecond + ticker := NewDeviceDataTicker(duration) + var payloads []*pubsub.V2DeviceData + ticker.SetCallback(func(payload *pubsub.V2DeviceData) { + payloads = append(payloads, payload) + }) + var wg sync.WaitGroup + wg.Add(1) + go func() { + t.Log("starting the ticker") + ticker.Run() + wg.Done() + }() + time.Sleep(duration * 2) // wait until the ticker is consuming + t.Log("remembering a poller") + ticker.Remember(PollerID{ + UserID: "a", + DeviceID: "b", + }) + time.Sleep(duration * 2) + if len(payloads) != 1 { + t.Fatalf("expected 1 callback, got %d", len(payloads)) + } + want := map[string][]string{ + "a": {"b"}, + } + assertPayloadEqual(t, payloads[0].UserIDToDeviceIDs, want) + // check stopping works + payloads = []*pubsub.V2DeviceData{} + ticker.Stop() + wg.Wait() + time.Sleep(duration * 2) + if len(payloads) != 0 { + t.Fatalf("got extra payloads: %+v", payloads) + } +} + +func TestDeviceTickerBatchesCorrectly(t *testing.T) { + duration := 100 * time.Millisecond + ticker := NewDeviceDataTicker(duration) + var payloads []*pubsub.V2DeviceData + ticker.SetCallback(func(payload *pubsub.V2DeviceData) { + payloads = append(payloads, payload) + }) + go ticker.Run() + defer ticker.Stop() + ticker.Remember(PollerID{ + UserID: "a", + DeviceID: "b", + }) + ticker.Remember(PollerID{ + UserID: "a", + DeviceID: "bb", // different device, same user + }) + ticker.Remember(PollerID{ + UserID: "a", + DeviceID: "b", // dupe poller ID + }) + ticker.Remember(PollerID{ + UserID: "x", + DeviceID: "y", // new device and user + }) + time.Sleep(duration * 2) + if len(payloads) != 1 { + t.Fatalf("expected 1 callback, got %d", len(payloads)) + } + want := map[string][]string{ + "a": {"b", "bb"}, + "x": {"y"}, + } + assertPayloadEqual(t, payloads[0].UserIDToDeviceIDs, want) +} + +func TestDeviceTickerForgetsAfterEmitting(t *testing.T) { + duration := time.Millisecond + ticker := NewDeviceDataTicker(duration) + var payloads []*pubsub.V2DeviceData + ticker.SetCallback(func(payload *pubsub.V2DeviceData) { + payloads = append(payloads, payload) + }) + ticker.Remember(PollerID{ + UserID: "a", + DeviceID: "b", + }) + + go ticker.Run() + defer ticker.Stop() + ticker.Remember(PollerID{ + UserID: "a", + DeviceID: "b", + }) + time.Sleep(10 * duration) + if len(payloads) != 1 { + t.Fatalf("got %d payloads, want 1", len(payloads)) + } +} + +func assertPayloadEqual(t *testing.T, got, want map[string][]string) { + t.Helper() + if len(got) != len(want) { + t.Fatalf("got %+v\nwant %+v\n", got, want) + } + for userID, wantDeviceIDs := range want { + gotDeviceIDs := got[userID] + sort.Strings(wantDeviceIDs) + sort.Strings(gotDeviceIDs) + if !reflect.DeepEqual(gotDeviceIDs, wantDeviceIDs) { + t.Errorf("user %v got devices %v want %v", userID, gotDeviceIDs, wantDeviceIDs) + } + } +} diff --git a/sync2/handler2/handler.go b/sync2/handler2/handler.go index c95a276c..f5543003 100644 --- a/sync2/handler2/handler.go +++ b/sync2/handler2/handler.go @@ -4,11 +4,13 @@ import ( "context" 
"encoding/json" "fmt" - "github.com/jmoiron/sqlx" - "github.com/matrix-org/sliding-sync/sqlutil" "hash/fnv" "os" "sync" + "time" + + "github.com/jmoiron/sqlx" + "github.com/matrix-org/sliding-sync/sqlutil" "github.com/getsentry/sentry-go" @@ -43,13 +45,15 @@ type Handler struct { // room_id => fnv_hash([typing user ids]) typingMap map[string]uint64 + deviceDataTicker *sync2.DeviceDataTicker + numPollers prometheus.Gauge subSystem string } func NewHandler( connStr string, pMap *sync2.PollerMap, v2Store *sync2.Storage, store *state.Storage, client sync2.Client, - pub pubsub.Notifier, sub pubsub.Listener, enablePrometheus bool, + pub pubsub.Notifier, sub pubsub.Listener, enablePrometheus bool, deviceDataUpdateDuration time.Duration, ) (*Handler, error) { h := &Handler{ pMap: pMap, @@ -61,7 +65,8 @@ func NewHandler( Highlight int Notif int }), - typingMap: make(map[string]uint64), + typingMap: make(map[string]uint64), + deviceDataTicker: sync2.NewDeviceDataTicker(deviceDataUpdateDuration), } if enablePrometheus { @@ -86,6 +91,8 @@ func (h *Handler) Listen() { sentry.CaptureException(err) } }() + h.deviceDataTicker.SetCallback(h.OnBulkDeviceDataUpdate) + go h.deviceDataTicker.Run() } func (h *Handler) Teardown() { @@ -95,6 +102,7 @@ func (h *Handler) Teardown() { h.Store.Teardown() h.v2Store.Teardown() h.pMap.Terminate() + h.deviceDataTicker.Stop() if h.numPollers != nil { prometheus.Unregister(h.numPollers) } @@ -203,19 +211,24 @@ func (h *Handler) OnE2EEData(ctx context.Context, userID, deviceID string, otkCo New: deviceListChanges, }, } - nextPos, err := h.Store.DeviceDataTable.Upsert(&partialDD) + _, err := h.Store.DeviceDataTable.Upsert(&partialDD) if err != nil { logger.Err(err).Str("user", userID).Msg("failed to upsert device data") internal.GetSentryHubFromContextOrDefault(ctx).CaptureException(err) return } - h.v2Pub.Notify(pubsub.ChanV2, &pubsub.V2DeviceData{ + // remember this to notify on pubsub later + h.deviceDataTicker.Remember(sync2.PollerID{ UserID: userID, DeviceID: deviceID, - Pos: nextPos, }) } +// Called periodically by deviceDataTicker, contains many updates +func (h *Handler) OnBulkDeviceDataUpdate(payload *pubsub.V2DeviceData) { + h.v2Pub.Notify(pubsub.ChanV2, payload) +} + func (h *Handler) Accumulate(ctx context.Context, userID, deviceID, roomID, prevBatch string, timeline []json.RawMessage) { // Remember any transaction IDs that may be unique to this user eventIDsWithTxns := make([]string, 0, len(timeline)) // in timeline order diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index 6717e99a..68305e87 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -664,9 +664,14 @@ func (h *SyncLiveHandler) OnUnreadCounts(p *pubsub.V2UnreadCounts) { func (h *SyncLiveHandler) OnDeviceData(p *pubsub.V2DeviceData) { ctx, task := internal.StartTask(context.Background(), "OnDeviceData") defer task.End() - conns := h.ConnMap.Conns(p.UserID, p.DeviceID) - for _, conn := range conns { - conn.OnUpdate(ctx, caches.DeviceDataUpdate{}) + internal.Logf(ctx, "device_data", fmt.Sprintf("%v users to notify", len(p.UserIDToDeviceIDs))) + for userID, deviceIDs := range p.UserIDToDeviceIDs { + for _, deviceID := range deviceIDs { + conns := h.ConnMap.Conns(userID, deviceID) + for _, conn := range conns { + conn.OnUpdate(ctx, caches.DeviceDataUpdate{}) + } + } } } diff --git a/v3.go b/v3.go index a2860640..e189e97a 100644 --- a/v3.go +++ b/v3.go @@ -76,8 +76,10 @@ func Setup(destHomeserver, postgresURI, secret string, opts Opts) (*handler2.Han store := 
state.NewStorage(postgresURI) storev2 := sync2.NewStore(postgresURI, secret) bufferSize := 50 + deviceDataUpdateFrequency := time.Second if opts.TestingSynchronousPubsub { bufferSize = 0 + deviceDataUpdateFrequency = 0 // don't batch } if opts.MaxPendingEventUpdates == 0 { opts.MaxPendingEventUpdates = 2000 @@ -86,7 +88,7 @@ func Setup(destHomeserver, postgresURI, secret string, opts Opts) (*handler2.Han pMap := sync2.NewPollerMap(v2Client, opts.AddPrometheusMetrics) // create v2 handler - h2, err := handler2.NewHandler(postgresURI, pMap, storev2, store, v2Client, pubSub, pubSub, opts.AddPrometheusMetrics) + h2, err := handler2.NewHandler(postgresURI, pMap, storev2, store, v2Client, pubSub, pubSub, opts.AddPrometheusMetrics, deviceDataUpdateFrequency) if err != nil { panic(err) } From 0caeb03f3811fd5ef9c6f059fc891350df26424e Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 26 Jun 2023 21:12:56 -0700 Subject: [PATCH 033/156] kick ci? --- sync2/device_data_ticker_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/sync2/device_data_ticker_test.go b/sync2/device_data_ticker_test.go index 470e5378..daa50819 100644 --- a/sync2/device_data_ticker_test.go +++ b/sync2/device_data_ticker_test.go @@ -88,6 +88,7 @@ func TestDeviceTickerForgetsAfterEmitting(t *testing.T) { duration := time.Millisecond ticker := NewDeviceDataTicker(duration) var payloads []*pubsub.V2DeviceData + ticker.SetCallback(func(payload *pubsub.V2DeviceData) { payloads = append(payloads, payload) }) From 82c21e6d5aeedfc69ea29cb0d2b96a35dc668d07 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 26 Jun 2023 21:21:40 -0700 Subject: [PATCH 034/156] Remove branches main for now to kick ci maybe --- .github/workflows/tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7d8df5ca..d0680cb9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -2,7 +2,6 @@ name: Tests on: push: - branches: ["main"] pull_request: permissions: From 3bf3f2305373d69d6346d38b70be8add2a53f029 Mon Sep 17 00:00:00 2001 From: Mathieu Velten Date: Tue, 27 Jun 2023 20:07:21 +0200 Subject: [PATCH 035/156] Increase timeout of init sync to 30min for small homeservers --- sync2/client.go | 11 ++++++++++- v3.go | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/sync2/client.go b/sync2/client.go index 7e9a8167..9a0a7b30 100644 --- a/sync2/client.go +++ b/sync2/client.go @@ -7,6 +7,7 @@ import ( "io/ioutil" "net/http" "net/url" + "time" "github.com/matrix-org/gomatrixserverlib" "github.com/tidwall/gjson" @@ -69,7 +70,15 @@ func (v *HTTPClient) DoSyncV2(ctx context.Context, accessToken, since string, is if err != nil { return nil, 0, fmt.Errorf("DoSyncV2: NewRequest failed: %w", err) } - res, err := v.Client.Do(req) + var res *http.Response + if isFirst { + longTimeoutClient := &http.Client{ + Timeout: 30 * time.Minute, + } + res, err = longTimeoutClient.Do(req) + } else { + res, err = v.Client.Do(req) + } if err != nil { return nil, 0, fmt.Errorf("DoSyncV2: request failed: %w", err) } diff --git a/v3.go b/v3.go index 74ae2be2..581eeaff 100644 --- a/v3.go +++ b/v3.go @@ -73,7 +73,7 @@ func Setup(destHomeserver, postgresURI, secret string, opts Opts) (*handler2.Han // Setup shared DB and HTTP client v2Client := &sync2.HTTPClient{ Client: &http.Client{ - Timeout: 5 * time.Minute, + Timeout: 45 * time.Second, }, DestinationServer: destHomeserver, } From b9bc83d93f1e834e7302efe6de1b769a8b69e680 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Wed,
28 Jun 2023 16:32:23 -0500 Subject: [PATCH 036/156] Add WorkerPool and use it for OnE2EEData - Allowing unlimited concurrency on OnE2EEData causes huge spikes in DB conns when device lists change. - Using a high, bounded amount of concurrency ensures we don't breach DB conn limits. With unit tests. --- internal/pool.go | 67 ++++++++++++++ internal/pool_test.go | 186 ++++++++++++++++++++++++++++++++++++++ sync2/handler2/handler.go | 49 ++++++---- 3 files changed, 282 insertions(+), 20 deletions(-) create mode 100644 internal/pool.go create mode 100644 internal/pool_test.go diff --git a/internal/pool.go b/internal/pool.go new file mode 100644 index 00000000..27c6bb0c --- /dev/null +++ b/internal/pool.go @@ -0,0 +1,67 @@ +package internal + +type WorkerPool struct { + N int + ch chan func() +} + +// Create a new worker pool of size N. Up to N work can be done concurrently. +// The size of N depends on the expected frequency of work and contention for +// shared resources. Large values of N allow more frequent work at the cost of +// more contention for shared resources like cpu, memory and fds. Small values +// of N allow less frequent work but control the amount of shared resource contention. +// Ideally this value will be derived from whatever shared resource constraints you +// are hitting up against, rather than set to a fixed value. For example, if you have +// a database connection limit of 100, then setting N to some fraction of the limit is +// preferred to setting this to an arbitrary number < 100. If more than N work is requested, +// eventually WorkerPool.Queue will block until some work is done. +// +// The larger N is, the larger the up front memory costs are due to the implementation of WorkerPool. +func NewWorkerPool(n int) *WorkerPool { + return &WorkerPool{ + N: n, + // If we have N workers, we can process N work concurrently. + // If we have >N work, we need to apply backpressure to stop us + // making more and more work which takes up more and more memory. + // By setting the channel size to N, we ensure that backpressure is + // being applied on the producer, stopping it from creating more work, + // and hence bounding memory consumption. Work is still being produced + // upstream on the homeserver, but we will consume it when we're ready + // rather than gobble it all at once. + // + // Note: we aren't forced to set this to N, it just serves as a useful + // metric which scales on the number of workers. The amount of in-flight + // work is N, so it makes sense to allow up to N work to be queued up before + // applying backpressure. If the channel buffer is < N then the channel can + // become the bottleneck in the case where we have lots of instantaneous work + // to do. If the channel buffer is too large, we needlessly consume memory as + // make() will allocate a backing array of whatever size you give it up front (sad face) + ch: make(chan func(), n), + } +} + +// Start the workers. Only call this once. +func (wp *WorkerPool) Start() { + for i := 0; i < wp.N; i++ { + go wp.worker() + } +} + +// Stop the worker pool. Only really useful for tests as a worker pool should be started once +// and persist for the lifetime of the process, else it causes needless goroutine churn. +// Only call this once. +func (wp *WorkerPool) Stop() { + close(wp.ch) +} + +// Queue some work on the pool. May or may not block until some work is processed.
+func (wp *WorkerPool) Queue(fn func()) { + wp.ch <- fn +} + +// worker impl +func (wp *WorkerPool) worker() { + for fn := range wp.ch { + fn() + } +} diff --git a/internal/pool_test.go b/internal/pool_test.go new file mode 100644 index 00000000..34222077 --- /dev/null +++ b/internal/pool_test.go @@ -0,0 +1,186 @@ +package internal + +import ( + "sync" + "testing" + "time" +) + +// Test basic functions of WorkerPool +func TestWorkerPool(t *testing.T) { + wp := NewWorkerPool(2) + wp.Start() + defer wp.Stop() + + // we should process this concurrently as N=2 so it should take 1s not 2s + var wg sync.WaitGroup + wg.Add(2) + start := time.Now() + wp.Queue(func() { + time.Sleep(time.Second) + wg.Done() + }) + wp.Queue(func() { + time.Sleep(time.Second) + wg.Done() + }) + wg.Wait() + took := time.Since(start) + if took > 2*time.Second { + t.Fatalf("took %v for queued work, it should have been faster than 2s", took) + } +} + +func TestWorkerPoolDoesWorkPriorToStart(t *testing.T) { + wp := NewWorkerPool(2) + + // return channel to use to see when work is done + ch := make(chan int, 2) + wp.Queue(func() { + ch <- 1 + }) + wp.Queue(func() { + ch <- 2 + }) + + // the work should not be done yet + time.Sleep(100 * time.Millisecond) + if len(ch) > 0 { + t.Fatalf("Queued work was done before Start()") + } + + // the work should be starting now + wp.Start() + defer wp.Stop() + + sum := 0 + for { + select { + case <-time.After(time.Second): + t.Fatalf("timed out waiting for work to be done") + case val := <-ch: + sum += val + } + if sum == 3 { // 2 + 1 + break + } + } +} + +type workerState struct { + id int + state int // not running, queued, running, finished + unblock *sync.WaitGroup // decrement to unblock this worker +} + +func TestWorkerPoolBackpressure(t *testing.T) { + // this test assumes backpressure starts at n*2+1 due to a chan buffer of size n, and n in-flight work. + n := 2 + wp := NewWorkerPool(n) + wp.Start() + defer wp.Stop() + + var mu sync.Mutex + stateNotRunning := 0 + stateQueued := 1 + stateRunning := 2 + stateFinished := 3 + size := (2 * n) + 1 + running := make([]*workerState, size) + + go func() { + // we test backpressure by scheduling (n*2)+1 work and ensuring that we see the following running states: + // [2,2,1,1,0] <-- 2 running, 2 queued, 1 blocked <-- THIS IS BACKPRESSURE + // [3,2,2,1,1] <-- 1 finished, 2 running, 2 queued + // [3,3,2,2,1] <-- 2 finished, 2 running , 1 queued + // [3,3,3,2,2] <-- 3 finished, 2 running + for i := 0; i < size; i++ { + // set initial state of this piece of work + wg := &sync.WaitGroup{} + wg.Add(1) + state := &workerState{ + id: i, + state: stateNotRunning, + unblock: wg, + } + mu.Lock() + running[i] = state + mu.Unlock() + + // queue the work on the pool. The final piece of work will block here and remain in + // stateNotRunning and not transition to stateQueued until the first piece of work is done. 
+ wp.Queue(func() { + mu.Lock() + if running[state.id].state != stateQueued { + // we ran work in the worker faster than the code underneath .Queue, so let it catch up + mu.Unlock() + time.Sleep(10 * time.Millisecond) + mu.Lock() + } + running[state.id].state = stateRunning + mu.Unlock() + + running[state.id].unblock.Wait() + mu.Lock() + running[state.id].state = stateFinished + mu.Unlock() + }) + + // mark this work as queued + mu.Lock() + running[i].state = stateQueued + mu.Unlock() + } + }() + + // wait for the workers to be doing work and assert the states of each task + time.Sleep(time.Second) + + assertStates(t, &mu, running, []int{ + stateRunning, stateRunning, stateQueued, stateQueued, stateNotRunning, + }) + + // now let the first task complete + running[0].unblock.Done() + // wait for the pool to grab more work + time.Sleep(100 * time.Millisecond) + // assert new states + assertStates(t, &mu, running, []int{ + stateFinished, stateRunning, stateRunning, stateQueued, stateQueued, + }) + + // now let the second task complete + running[1].unblock.Done() + // wait for the pool to grab more work + time.Sleep(100 * time.Millisecond) + // assert new states + assertStates(t, &mu, running, []int{ + stateFinished, stateFinished, stateRunning, stateRunning, stateQueued, + }) + + // now let the third task complete + running[2].unblock.Done() + // wait for the pool to grab more work + time.Sleep(100 * time.Millisecond) + // assert new states + assertStates(t, &mu, running, []int{ + stateFinished, stateFinished, stateFinished, stateRunning, stateRunning, + }) + +} + +func assertStates(t *testing.T, mu *sync.Mutex, running []*workerState, wantStates []int) { + t.Helper() + mu.Lock() + defer mu.Unlock() + if len(running) != len(wantStates) { + t.Fatalf("assertStates: bad wantStates length, got %d want %d", len(wantStates), len(running)) + } + for i := range running { + state := running[i] + wantVal := wantStates[i] + if state.state != wantVal { + t.Errorf("work[%d] got state %d want %d", i, state.state, wantVal) + } + } +} diff --git a/sync2/handler2/handler.go b/sync2/handler2/handler.go index f5543003..37b068ce 100644 --- a/sync2/handler2/handler.go +++ b/sync2/handler2/handler.go @@ -46,6 +46,7 @@ type Handler struct { typingMap map[string]uint64 deviceDataTicker *sync2.DeviceDataTicker + e2eeWorkerPool *internal.WorkerPool numPollers prometheus.Gauge subSystem string @@ -67,6 +68,7 @@ func NewHandler( }), typingMap: make(map[string]uint64), deviceDataTicker: sync2.NewDeviceDataTicker(deviceDataUpdateDuration), + e2eeWorkerPool: internal.NewWorkerPool(500), // TODO: assign as fraction of db max conns, not hardcoded } if enablePrometheus { @@ -91,6 +93,7 @@ func (h *Handler) Listen() { sentry.CaptureException(err) } }() + h.e2eeWorkerPool.Start() h.deviceDataTicker.SetCallback(h.OnBulkDeviceDataUpdate) go h.deviceDataTicker.Run() } @@ -201,27 +204,33 @@ func (h *Handler) UpdateDeviceSince(ctx context.Context, userID, deviceID, since } func (h *Handler) OnE2EEData(ctx context.Context, userID, deviceID string, otkCounts map[string]int, fallbackKeyTypes []string, deviceListChanges map[string]int) { - // some of these fields may be set - partialDD := internal.DeviceData{ - UserID: userID, - DeviceID: deviceID, - OTKCounts: otkCounts, - FallbackKeyTypes: fallbackKeyTypes, - DeviceLists: internal.DeviceLists{ - New: deviceListChanges, - }, - } - _, err := h.Store.DeviceDataTable.Upsert(&partialDD) - if err != nil { - logger.Err(err).Str("user", userID).Msg("failed to upsert device data") - 
internal.GetSentryHubFromContextOrDefault(ctx).CaptureException(err)
-		return
-	}
-	// remember this to notify on pubsub later
-	h.deviceDataTicker.Remember(sync2.PollerID{
-		UserID:   userID,
-		DeviceID: deviceID,
+	var wg sync.WaitGroup
+	wg.Add(1)
+	h.e2eeWorkerPool.Queue(func() {
+		defer wg.Done()
+		// some of these fields may be set
+		partialDD := internal.DeviceData{
+			UserID:           userID,
+			DeviceID:         deviceID,
+			OTKCounts:        otkCounts,
+			FallbackKeyTypes: fallbackKeyTypes,
+			DeviceLists: internal.DeviceLists{
+				New: deviceListChanges,
+			},
+		}
+		_, err := h.Store.DeviceDataTable.Upsert(&partialDD)
+		if err != nil {
+			logger.Err(err).Str("user", userID).Msg("failed to upsert device data")
+			internal.GetSentryHubFromContextOrDefault(ctx).CaptureException(err)
+			return
+		}
+		// remember this to notify on pubsub later
+		h.deviceDataTicker.Remember(sync2.PollerID{
+			UserID:   userID,
+			DeviceID: deviceID,
+		})
 	})
+	wg.Wait()
 }
 
 // Called periodically by deviceDataTicker, contains many updates

From 0342a99524c414d20bba81b79df0bbedfcfe017d Mon Sep 17 00:00:00 2001
From: Kegan Dougal
Date: Tue, 4 Jul 2023 10:52:31 +0100
Subject: [PATCH 037/156] bugfix: prevent clients starving themselves by
 constantly changing req params

Because the proxy services changes to req params in preference to live
data, if the client constantly changes the window (e.g. due to
spidering) then it can accidentally stop the delivery of live events to
the client until the spidering process is complete.

To help address this, we now process live updates _even if_ we have
some data to send to the client. This is bounded in size to prevent the
inverse happening: constantly seeing new live events which starves
changes to req params. This should hopefully strike the right balance.

With regression test.
---
 sync3/handler/connstate_live.go              | 48 ++++++------
 tests-e2e/num_live_test.go                   | 70 ++++++++++++++++++++
 tests-integration/room_subscriptions_test.go |  9 +--
 3 files changed, 102 insertions(+), 25 deletions(-)

diff --git a/sync3/handler/connstate_live.go b/sync3/handler/connstate_live.go
index 648f650c..777289cc 100644
--- a/sync3/handler/connstate_live.go
+++ b/sync3/handler/connstate_live.go
@@ -81,35 +81,45 @@ func (s *connStateLive) liveUpdate(
 			return
 		case update := <-s.updates:
 			internal.Logf(ctx, "liveUpdate", "process live update")
-
-			s.processLiveUpdate(ctx, update, response)
-			// pass event to extensions AFTER processing
-			roomIDsToLists := s.lists.ListsByVisibleRoomIDs(s.muxedReq.Lists)
-			s.extensionsHandler.HandleLiveUpdate(ctx, update, ex, &response.Extensions, extensions.Context{
-				IsInitial:        false,
-				RoomIDToTimeline: response.RoomIDsToTimelineEventIDs(),
-				UserID:           s.userID,
-				DeviceID:         s.deviceID,
-				RoomIDsToLists:   roomIDsToLists,
-			})
+			s.processUpdate(ctx, update, response, ex)
 			// if there's more updates and we don't have lots stacked up already, go ahead and process another
 			for len(s.updates) > 0 && response.ListOps() < 50 {
 				update = <-s.updates
-				s.processLiveUpdate(ctx, update, response)
-				s.extensionsHandler.HandleLiveUpdate(ctx, update, ex, &response.Extensions, extensions.Context{
-					IsInitial:        false,
-					RoomIDToTimeline: response.RoomIDsToTimelineEventIDs(),
-					UserID:           s.userID,
-					DeviceID:         s.deviceID,
-					RoomIDsToLists:   roomIDsToLists,
-				})
+				s.processUpdate(ctx, update, response, ex)
 			}
 		}
 	}
+
+	// If a client constantly changes their request params in every request they make, we will never consume from
+	// the update channel as the response will always have data already.
In an effort to prevent starvation of new + // data, we will process some updates even though we have data already, but only if A) we didn't live stream + // due to natural circumstances, B) it isn't an initial request and C) there is in fact some data there. + numQueuedUpdates := len(s.updates) + if !hasLiveStreamed && !isInitial && numQueuedUpdates > 0 { + for i := 0; i < numQueuedUpdates; i++ { + update := <-s.updates + s.processUpdate(ctx, update, response, ex) + } + log.Debug().Int("num_queued", numQueuedUpdates).Msg("liveUpdate: caught up") + } + log.Trace().Bool("live_streamed", hasLiveStreamed).Msg("liveUpdate: returning") // TODO: op consolidation } +func (s *connStateLive) processUpdate(ctx context.Context, update caches.Update, response *sync3.Response, ex extensions.Request) { + s.processLiveUpdate(ctx, update, response) + // pass event to extensions AFTER processing + roomIDsToLists := s.lists.ListsByVisibleRoomIDs(s.muxedReq.Lists) + s.extensionsHandler.HandleLiveUpdate(ctx, update, ex, &response.Extensions, extensions.Context{ + IsInitial: false, + RoomIDToTimeline: response.RoomIDsToTimelineEventIDs(), + UserID: s.userID, + DeviceID: s.deviceID, + RoomIDsToLists: roomIDsToLists, + }) +} + func (s *connStateLive) processLiveUpdate(ctx context.Context, up caches.Update, response *sync3.Response) bool { internal.AssertWithContext(ctx, "processLiveUpdate: response list length != internal list length", s.lists.Len() == len(response.Lists)) internal.AssertWithContext(ctx, "processLiveUpdate: request list length != internal list length", s.lists.Len() == len(s.muxedReq.Lists)) diff --git a/tests-e2e/num_live_test.go b/tests-e2e/num_live_test.go index 5a47a02f..acc97261 100644 --- a/tests-e2e/num_live_test.go +++ b/tests-e2e/num_live_test.go @@ -1,10 +1,13 @@ package syncv3_test import ( + "fmt" "testing" + "time" "github.com/matrix-org/sliding-sync/sync3" "github.com/matrix-org/sliding-sync/testutils/m" + "github.com/tidwall/gjson" ) func TestNumLive(t *testing.T) { @@ -126,3 +129,70 @@ func TestNumLive(t *testing.T) { }, })) } + +// Test that if you constantly change req params, we still see live traffic. It does this by: +// - Creating 11 rooms. +// - Hitting /sync with a range [0,1] then [0,2] then [0,3]. Each time this causes a new room to be returned. +// - Interleaving each /sync request with genuine events sent into a room. +// - ensuring we see the genuine events by the time we finish. +func TestReqParamStarvation(t *testing.T) { + alice := registerNewUser(t) + bob := registerNewUser(t) + roomID := alice.CreateRoom(t, map[string]interface{}{ + "preset": "public_chat", + }) + numOtherRooms := 10 + for i := 0; i < numOtherRooms; i++ { + bob.CreateRoom(t, map[string]interface{}{ + "preset": "public_chat", + }) + } + bob.JoinRoom(t, roomID, nil) + res := bob.SlidingSyncUntilMembership(t, "", roomID, bob, "join") + + wantEventIDs := make(map[string]bool) + for i := 0; i < numOtherRooms; i++ { + res = bob.SlidingSync(t, sync3.Request{ + Lists: map[string]sync3.RequestList{ + "a": { + Ranges: sync3.SliceRanges{{0, int64(i)}}, // [0,0], [0,1], ... 
[0,9] + }, + }, + }, WithPos(res.Pos)) + + // mark off any event we see in wantEventIDs + for _, r := range res.Rooms { + for _, ev := range r.Timeline { + gotEventID := gjson.GetBytes(ev, "event_id").Str + wantEventIDs[gotEventID] = false + } + } + + // send an event in the first few syncs to add to wantEventIDs + // We do this for the first few /syncs and don't dictate which response they should arrive + // in, as we do not know and cannot force the proxy to deliver the event in a particular response. + if i < 3 { + eventID := alice.SendEventSynced(t, roomID, Event{ + Type: "m.room.message", + Content: map[string]interface{}{ + "msgtype": "m.text", + "body": fmt.Sprintf("msg %d", i), + }, + }) + wantEventIDs[eventID] = true + } + + // it's possible the proxy won't see this event before the next /sync + // and that is the reason why we don't send it, as opposed to starvation. + // To try to counter this, sleep a bit. This is why we sleep on every cycle and + // why we send the events early on. + time.Sleep(50 * time.Millisecond) + } + + // at this point wantEventIDs should all have false values if we got the events + for evID, unseen := range wantEventIDs { + if unseen { + t.Errorf("failed to see event %v", evID) + } + } +} diff --git a/tests-integration/room_subscriptions_test.go b/tests-integration/room_subscriptions_test.go index ba89ad90..3b4bbd5c 100644 --- a/tests-integration/room_subscriptions_test.go +++ b/tests-integration/room_subscriptions_test.go @@ -137,12 +137,9 @@ func TestRoomSubscriptionMisorderedTimeline(t *testing.T) { }) m.MatchResponse(t, res, m.MatchRoomSubscriptionsStrict(map[string][]m.RoomMatcher{ room.roomID: { - // TODO: this is the correct result, but due to how timeline loading works currently - // it will be returning the last 5 events BEFORE D,E, which isn't ideal but also isn't - // incorrect per se due to the fact that clients don't know when D,E have been processed - // on the server. - // m.MatchRoomTimeline(append(abcInitialEvents, deLiveEvents...)), - m.MatchRoomTimeline(append(roomState[len(roomState)-2:], abcInitialEvents...)), + // we append live events AFTER processing the new timeline limit, so 7 events not 5. + // TODO: ideally we'd just return abcde here. + m.MatchRoomTimeline(append(roomState[len(roomState)-2:], append(abcInitialEvents, deLiveEvents...)...)), }, }), m.LogResponse(t)) From 54cb2cbbb805adf7e882099349d8e148abce3a80 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Tue, 4 Jul 2023 13:04:27 +0100 Subject: [PATCH 038/156] Don't rate limit on M_UNKNOWN_POS to allow more rapid recovery after expired connections --- sync3/handler/handler.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index 68305e87..b9e57526 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -197,9 +197,13 @@ func (h *SyncLiveHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) { Err: err, } } - // artificially wait a bit before sending back the error - // this guards against tightlooping when the client hammers the server with invalid requests - time.Sleep(time.Second) + if herr.ErrCode != "M_UNKNOWN_POS" { + // artificially wait a bit before sending back the error + // this guards against tightlooping when the client hammers the server with invalid requests, + // but not for M_UNKNOWN_POS which we expect to send back after expiring a client's connection. + // We want to recover rapidly in that scenario, hence not sleeping. 
+ time.Sleep(time.Second) + } w.WriteHeader(herr.StatusCode) w.Write(herr.JSON()) } From aaea223eab847782ccbe47244003e8a9f65e5130 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Tue, 4 Jul 2023 14:20:25 +0100 Subject: [PATCH 039/156] Don't needlessly hit the db for txn IDs for live events which were not sent by that conn --- sync3/caches/user.go | 16 ++++++++++++++-- sync3/caches/user_test.go | 14 +++++++------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/sync3/caches/user.go b/sync3/caches/user.go index ee259e6c..8b96ddd4 100644 --- a/sync3/caches/user.go +++ b/sync3/caches/user.go @@ -395,8 +395,16 @@ func (c *UserCache) AnnotateWithTransactionIDs(ctx context.Context, userID strin i int }) for roomID, events := range roomIDToEvents { - for i, ev := range events { - evID := gjson.GetBytes(ev, "event_id").Str + for i, evJSON := range events { + ev := gjson.ParseBytes(evJSON) + evID := ev.Get("event_id").Str + sender := ev.Get("sender").Str + if sender != userID { + // don't ask for txn IDs for events which weren't sent by us. + // If we do, we'll needlessly hit the database, increasing latencies when + // catching up from the live buffer. + continue + } eventIDs = append(eventIDs, evID) eventIDToEvent[evID] = struct { roomID string @@ -407,6 +415,10 @@ func (c *UserCache) AnnotateWithTransactionIDs(ctx context.Context, userID strin } } } + if len(eventIDs) == 0 { + // don't do any work if we have no events + return roomIDToEvents + } eventIDToTxnID := c.txnIDs.TransactionIDForEvents(userID, deviceID, eventIDs) for eventID, txnID := range eventIDToTxnID { data, ok := eventIDToEvent[eventID] diff --git a/sync3/caches/user_test.go b/sync3/caches/user_test.go index a1e5ec8f..5809317b 100644 --- a/sync3/caches/user_test.go +++ b/sync3/caches/user_test.go @@ -83,8 +83,8 @@ func TestAnnotateWithTransactionIDs(t *testing.T) { data: tc.eventIDToTxnIDs, } uc := caches.NewUserCache(userID, nil, nil, fetcher) - got := uc.AnnotateWithTransactionIDs(context.Background(), userID, "DEVICE", convertIDToEventStub(tc.roomIDToEvents)) - want := convertIDTxnToEventStub(tc.wantRoomIDToEvents) + got := uc.AnnotateWithTransactionIDs(context.Background(), userID, "DEVICE", convertIDToEventStub(userID, tc.roomIDToEvents)) + want := convertIDTxnToEventStub(userID, tc.wantRoomIDToEvents) if !reflect.DeepEqual(got, want) { t.Errorf("%s : got %v want %v", tc.name, js(got), js(want)) } @@ -96,27 +96,27 @@ func js(in interface{}) string { return string(b) } -func convertIDToEventStub(roomToEventIDs map[string][]string) map[string][]json.RawMessage { +func convertIDToEventStub(sender string, roomToEventIDs map[string][]string) map[string][]json.RawMessage { result := make(map[string][]json.RawMessage) for roomID, eventIDs := range roomToEventIDs { events := make([]json.RawMessage, len(eventIDs)) for i := range eventIDs { - events[i] = json.RawMessage(fmt.Sprintf(`{"event_id":"%s","type":"x"}`, eventIDs[i])) + events[i] = json.RawMessage(fmt.Sprintf(`{"event_id":"%s","type":"x","sender":"%s"}`, eventIDs[i], sender)) } result[roomID] = events } return result } -func convertIDTxnToEventStub(roomToEventIDs map[string][][2]string) map[string][]json.RawMessage { +func convertIDTxnToEventStub(sender string, roomToEventIDs map[string][][2]string) map[string][]json.RawMessage { result := make(map[string][]json.RawMessage) for roomID, eventIDs := range roomToEventIDs { events := make([]json.RawMessage, len(eventIDs)) for i := range eventIDs { if eventIDs[i][1] == "" { - events[i] = 
json.RawMessage(fmt.Sprintf(`{"event_id":"%s","type":"x"}`, eventIDs[i][0])) + events[i] = json.RawMessage(fmt.Sprintf(`{"event_id":"%s","type":"x","sender":"%s"}`, eventIDs[i][0], sender)) } else { - events[i] = json.RawMessage(fmt.Sprintf(`{"event_id":"%s","type":"x","unsigned":{"transaction_id":"%s"}}`, eventIDs[i][0], eventIDs[i][1])) + events[i] = json.RawMessage(fmt.Sprintf(`{"event_id":"%s","type":"x","sender":"%s","unsigned":{"transaction_id":"%s"}}`, eventIDs[i][0], sender, eventIDs[i][1])) } } result[roomID] = events From 0c95c56f0192f8d5f0c48d72e206d3670cb870a1 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Tue, 4 Jul 2023 17:20:55 +0100 Subject: [PATCH 040/156] logging: log less, aggregate more. Fix npe on slowReq metric --- sync2/poller.go | 64 +++++++++++++++++++++++++++------ sync3/handler/connstate_live.go | 2 +- sync3/handler/handler.go | 4 ++- 3 files changed, 58 insertions(+), 12 deletions(-) diff --git a/sync2/poller.go b/sync2/poller.go index 0d8baf61..4b94bac9 100644 --- a/sync2/poller.go +++ b/sync2/poller.go @@ -4,12 +4,13 @@ import ( "context" "encoding/json" "fmt" - "github.com/getsentry/sentry-go" "runtime/debug" "sync" "sync/atomic" "time" + "github.com/getsentry/sentry-go" + "github.com/matrix-org/sliding-sync/internal" "github.com/prometheus/client_golang/prometheus" "github.com/rs/zerolog" @@ -24,6 +25,9 @@ type PollerID struct { // alias time.Sleep so tests can monkey patch it out var timeSleep = time.Sleep +// log at most once every duration. Always logs before terminating. +var logInterval = 30 * time.Second + // V2DataReceiver is the receiver for all the v2 sync data the poller gets type V2DataReceiver interface { // Update the since token for this device. Called AFTER all other data in this sync response has been processed. @@ -335,6 +339,18 @@ type poller struct { terminated *atomic.Bool wg *sync.WaitGroup + // stats about poll response data, for logging purposes + lastLogged time.Time + totalStateCalls int + totalTimelineCalls int + totalReceipts int + totalTyping int + totalInvites int + totalDeviceEvents int + totalAccountData int + totalChangedDeviceLists int + totalLeftDeviceLists int + pollHistogramVec *prometheus.HistogramVec processHistogramVec *prometheus.HistogramVec timelineSizeVec *prometheus.HistogramVec @@ -411,6 +427,7 @@ func (p *poller) Poll(since string) { break } } + p.maybeLogStats(true) // always unblock EnsurePolling else we can end up head-of-line blocking other pollers! 
if state.firstTime { state.firstTime = false @@ -480,6 +497,7 @@ func (p *poller) poll(ctx context.Context, s *pollLoopState) error { p.wg.Done() } p.trackProcessDuration(time.Since(start), wasInitial, wasFirst) + p.maybeLogStats(false) return nil } @@ -518,6 +536,7 @@ func (p *poller) parseToDeviceMessages(ctx context.Context, res *SyncResponse) { if len(res.ToDevice.Events) == 0 { return } + p.totalDeviceEvents += len(res.ToDevice.Events) p.receiver.AddToDeviceMessages(ctx, p.userID, p.deviceID, res.ToDevice.Events) } @@ -556,6 +575,8 @@ func (p *poller) parseE2EEData(ctx context.Context, res *SyncResponse) { deviceListChanges := internal.ToDeviceListChangesMap(res.DeviceLists.Changed, res.DeviceLists.Left) if deviceListChanges != nil || changedFallbackTypes != nil || changedOTKCounts != nil { + p.totalChangedDeviceLists += len(res.DeviceLists.Changed) + p.totalLeftDeviceLists += len(res.DeviceLists.Left) p.receiver.OnE2EEData(ctx, p.userID, p.deviceID, changedOTKCounts, changedFallbackTypes, deviceListChanges) } } @@ -566,6 +587,7 @@ func (p *poller) parseGlobalAccountData(ctx context.Context, res *SyncResponse) if len(res.AccountData.Events) == 0 { return } + p.totalAccountData += len(res.AccountData.Events) p.receiver.OnAccountData(ctx, p.userID, AccountDataGlobalRoom, res.AccountData.Events) } @@ -640,17 +662,39 @@ func (p *poller) parseRoomsResponse(ctx context.Context, res *SyncResponse) { for roomID, roomData := range res.Rooms.Invite { p.receiver.OnInvite(ctx, p.userID, roomID, roomData.InviteState.Events) } - var l *zerolog.Event - if len(res.Rooms.Invite) > 0 || len(res.Rooms.Join) > 0 { - l = p.logger.Info() - } else { - l = p.logger.Debug() + + p.totalReceipts += receiptCalls + p.totalStateCalls += stateCalls + p.totalTimelineCalls += timelineCalls + p.totalTyping += typingCalls + p.totalInvites += len(res.Rooms.Invite) +} + +func (p *poller) maybeLogStats(force bool) { + if !force && time.Since(p.lastLogged) < logInterval { + // only log at most once every logInterval + return } - l.Ints( - "rooms [invite,join,leave]", []int{len(res.Rooms.Invite), len(res.Rooms.Join), len(res.Rooms.Leave)}, + p.lastLogged = time.Now() + p.logger.Info().Ints( + "rooms [timeline,state,typing,receipts,invites]", []int{ + p.totalTimelineCalls, p.totalStateCalls, p.totalTyping, p.totalReceipts, p.totalInvites, + }, ).Ints( - "storage [states,timelines,typing,receipts]", []int{stateCalls, timelineCalls, typingCalls, receiptCalls}, - ).Int("to_device", len(res.ToDevice.Events)).Msg("Poller: accumulated data") + "device [events,changed,left,account]", []int{ + p.totalDeviceEvents, p.totalChangedDeviceLists, p.totalLeftDeviceLists, p.totalAccountData, + }, + ).Msg("Poller: accumulated data") + + p.totalAccountData = 0 + p.totalChangedDeviceLists = 0 + p.totalDeviceEvents = 0 + p.totalInvites = 0 + p.totalLeftDeviceLists = 0 + p.totalReceipts = 0 + p.totalStateCalls = 0 + p.totalTimelineCalls = 0 + p.totalTyping = 0 } func (p *poller) trackTimelineSize(size int, limited bool) { diff --git a/sync3/handler/connstate_live.go b/sync3/handler/connstate_live.go index 777289cc..adc2bdbc 100644 --- a/sync3/handler/connstate_live.go +++ b/sync3/handler/connstate_live.go @@ -37,7 +37,7 @@ func (s *connStateLive) onUpdate(up caches.Update) { select { case s.updates <- up: case <-time.After(BufferWaitTime): - logger.Warn().Interface("update", up).Str("user", s.userID).Msg( + logger.Warn().Interface("update", up).Str("user", s.userID).Str("device", s.deviceID).Msg( "cannot send update to connection, buffer 
exceeded. Destroying connection.", ) s.bufferFull = true diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index b9e57526..627a5527 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -215,7 +215,9 @@ func (h *SyncLiveHandler) serve(w http.ResponseWriter, req *http.Request) error defer func() { dur := time.Since(start) if dur > 50*time.Second { - h.slowReqs.Add(1.0) + if h.slowReqs != nil { + h.slowReqs.Add(1.0) + } internal.DecorateLogger(req.Context(), log.Warn()).Dur("duration", dur).Msg("slow request") } }() From 8336dd244617ce5a2ae4db0ae0d96a9b62b4bbfc Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Thu, 6 Jul 2023 15:12:25 +0100 Subject: [PATCH 041/156] More trace logging --- sync3/handler/connstate.go | 3 +++ sync3/handler/connstate_live.go | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/sync3/handler/connstate.go b/sync3/handler/connstate.go index fa621494..4184a970 100644 --- a/sync3/handler/connstate.go +++ b/sync3/handler/connstate.go @@ -179,6 +179,9 @@ func (s *ConnState) onIncomingRequest(ctx context.Context, req *sync3.Request, i } internal.Logf(ctx, "connstate", "list[%v] prev_empty=%v curr=%v", key, l.Prev == nil, listData) } + for roomID, sub := range s.muxedReq.RoomSubscriptions { + internal.Logf(ctx, "connstate", "room sub[%v] %v", roomID, sub) + } // work out which rooms we'll return data for and add their relevant subscriptions to the builder // for it to mix together diff --git a/sync3/handler/connstate_live.go b/sync3/handler/connstate_live.go index adc2bdbc..4f3579c8 100644 --- a/sync3/handler/connstate_live.go +++ b/sync3/handler/connstate_live.go @@ -80,7 +80,6 @@ func (s *connStateLive) liveUpdate( internal.Logf(ctx, "liveUpdate", "timed out after %v", timeLeftToWait) return case update := <-s.updates: - internal.Logf(ctx, "liveUpdate", "process live update") s.processUpdate(ctx, update, response, ex) // if there's more updates and we don't have lots stacked up already, go ahead and process another for len(s.updates) > 0 && response.ListOps() < 50 { @@ -101,6 +100,7 @@ func (s *connStateLive) liveUpdate( s.processUpdate(ctx, update, response, ex) } log.Debug().Int("num_queued", numQueuedUpdates).Msg("liveUpdate: caught up") + internal.Logf(ctx, "connstate", "liveUpdate caught up %d updates", numQueuedUpdates) } log.Trace().Bool("live_streamed", hasLiveStreamed).Msg("liveUpdate: returning") @@ -108,6 +108,7 @@ func (s *connStateLive) liveUpdate( } func (s *connStateLive) processUpdate(ctx context.Context, update caches.Update, response *sync3.Response, ex extensions.Request) { + internal.Logf(ctx, "liveUpdate", "process live update %s", update.Type()) s.processLiveUpdate(ctx, update, response) // pass event to extensions AFTER processing roomIDsToLists := s.lists.ListsByVisibleRoomIDs(s.muxedReq.Lists) @@ -128,6 +129,7 @@ func (s *connStateLive) processLiveUpdate(ctx context.Context, up caches.Update, // if this is a room event update we may not want to process this if the event nid is < loadPos, // as that means we have already taken it into account if roomEventUpdate != nil && !roomEventUpdate.EventData.AlwaysProcess && roomEventUpdate.EventData.NID < s.loadPosition { + internal.Logf(ctx, "liveUpdate", "not process update %v < %v", roomEventUpdate.EventData.NID, s.loadPosition) return false } From e22f30ef5d45e80d97ef542316fa26213f5409d8 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Thu, 6 Jul 2023 16:11:48 +0100 Subject: [PATCH 042/156] Add gauge for tracking pending EnsurePolling calls --- 
sync3/handler/handler.go | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index 627a5527..6b1e910c 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -60,10 +60,11 @@ type SyncLiveHandler struct { GlobalCache *caches.GlobalCache maxPendingEventUpdates int - numConns prometheus.Gauge - setupHistVec *prometheus.HistogramVec - histVec *prometheus.HistogramVec - slowReqs prometheus.Counter + numConns prometheus.Gauge + setupHistVec *prometheus.HistogramVec + histVec *prometheus.HistogramVec + slowReqs prometheus.Counter + numPendingEnsurePolling prometheus.Gauge } func NewSync3Handler( @@ -132,6 +133,9 @@ func (h *SyncLiveHandler) Teardown() { if h.numConns != nil { prometheus.Unregister(h.numConns) } + if h.numPendingEnsurePolling != nil { + prometheus.Unregister(h.numPendingEnsurePolling) + } if h.setupHistVec != nil { prometheus.Unregister(h.setupHistVec) } @@ -157,6 +161,12 @@ func (h *SyncLiveHandler) addPrometheusMetrics() { Name: "num_active_conns", Help: "Number of active sliding sync connections.", }) + h.numPendingEnsurePolling = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: "sliding_sync", + Subsystem: "api", + Name: "num_pending_ensure_polling", + Help: "Number of HTTP requests blocked on EnsurePolling returning.", + }) h.setupHistVec = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "sliding_sync", Subsystem: "api", @@ -178,6 +188,7 @@ func (h *SyncLiveHandler) addPrometheusMetrics() { Help: "Counter of slow (>=50s) requests, initial or otherwise.", }) prometheus.MustRegister(h.numConns) + prometheus.MustRegister(h.numPendingEnsurePolling) prometheus.MustRegister(h.setupHistVec) prometheus.MustRegister(h.histVec) prometheus.MustRegister(h.slowReqs) @@ -398,7 +409,9 @@ func (h *SyncLiveHandler) setupConnection(req *http.Request, syncReq *sync3.Requ log.Trace().Msg("checking poller exists and is running") pid := sync2.PollerID{UserID: token.UserID, DeviceID: token.DeviceID} + h.numPendingEnsurePolling.Inc() h.EnsurePoller.EnsurePolling(req.Context(), pid, token.AccessTokenHash) + h.numPendingEnsurePolling.Dec() log.Trace().Msg("poller exists and is running") // this may take a while so if the client has given up (e.g timed out) by this point, just stop. // We'll be quicker next time as the poller will already exist. From 365ed4c05d4914c0ff78d3aaa1dec621b52bdae3 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Thu, 6 Jul 2023 17:53:06 +0100 Subject: [PATCH 043/156] nil checks --- sync3/handler/handler.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index 6b1e910c..7caec0cd 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -409,9 +409,13 @@ func (h *SyncLiveHandler) setupConnection(req *http.Request, syncReq *sync3.Requ log.Trace().Msg("checking poller exists and is running") pid := sync2.PollerID{UserID: token.UserID, DeviceID: token.DeviceID} - h.numPendingEnsurePolling.Inc() + if h.numPendingEnsurePolling != nil { + h.numPendingEnsurePolling.Inc() + } h.EnsurePoller.EnsurePolling(req.Context(), pid, token.AccessTokenHash) - h.numPendingEnsurePolling.Dec() + if h.numPendingEnsurePolling != nil { + h.numPendingEnsurePolling.Dec() + } log.Trace().Msg("poller exists and is running") // this may take a while so if the client has given up (e.g timed out) by this point, just stop. // We'll be quicker next time as the poller will already exist. 
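
Patches 042 and 043 above follow a convention used throughout the proxy: Prometheus metrics are only constructed and registered when metrics are enabled, so the corresponding struct fields may be nil and every call site has to guard against that. As a minimal, self-contained sketch of the pattern (the `handler` and `num_pending` names below are illustrative, not taken from the codebase):

```go
package main

import "github.com/prometheus/client_golang/prometheus"

// handler mirrors the optional-metric pattern: the gauge field is nil
// unless Prometheus support was requested at construction time.
type handler struct {
	pending prometheus.Gauge // nil when metrics are disabled
}

func newHandler(enablePrometheus bool) *handler {
	h := &handler{}
	if enablePrometheus {
		h.pending = prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: "example",
			Subsystem: "api",
			Name:      "num_pending",
			Help:      "Number of requests blocked on a dependency.",
		})
		prometheus.MustRegister(h.pending)
	}
	return h
}

// doBlockingWork tracks in-flight calls on the gauge, but only if the
// gauge actually exists: the nil check is what patch 043 adds above.
func (h *handler) doBlockingWork(work func()) {
	if h.pending != nil {
		h.pending.Inc()
		defer h.pending.Dec()
	}
	work()
}

func main() {
	h := newHandler(true)
	h.doBlockingWork(func() { /* e.g. wait for a poller to start */ })
}
```

Patch 046 below goes one step further: it tracks devices rather than HTTP requests and recalculates the gauge from the pending map under its mutex, which keeps the value accurate even if the same device completes more than once.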
From e67ba9a30eed1a2fab22fa6e9dd5db056196586b Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Fri, 7 Jul 2023 12:41:39 +0100 Subject: [PATCH 044/156] Add more poller metrics --- sync2/poller.go | 52 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/sync2/poller.go b/sync2/poller.go index 4b94bac9..bf81b1a5 100644 --- a/sync2/poller.go +++ b/sync2/poller.go @@ -61,14 +61,16 @@ type V2DataReceiver interface { // PollerMap is a map of device ID to Poller type PollerMap struct { - v2Client Client - callbacks V2DataReceiver - pollerMu *sync.Mutex - Pollers map[PollerID]*poller - executor chan func() - executorRunning bool - processHistogramVec *prometheus.HistogramVec - timelineSizeHistogramVec *prometheus.HistogramVec + v2Client Client + callbacks V2DataReceiver + pollerMu *sync.Mutex + Pollers map[PollerID]*poller + executor chan func() + executorRunning bool + processHistogramVec *prometheus.HistogramVec + timelineSizeHistogramVec *prometheus.HistogramVec + numOutstandingSyncReqsGauge prometheus.Gauge + totalNumPollsCounter prometheus.Counter } // NewPollerMap makes a new PollerMap. Guarantees that the V2DataReceiver will be called on the same @@ -119,7 +121,20 @@ func NewPollerMap(v2Client Client, enablePrometheus bool) *PollerMap { Buckets: []float64{0.0, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0}, }, []string{"limited"}) prometheus.MustRegister(pm.timelineSizeHistogramVec) - + pm.totalNumPollsCounter = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "sliding_sync", + Subsystem: "poller", + Name: "total_num_polls", + Help: "Total number of poll loops iterated.", + }) + prometheus.MustRegister(pm.totalNumPollsCounter) + pm.numOutstandingSyncReqsGauge = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: "sliding_sync", + Subsystem: "poller", + Name: "num_outstanding_sync_v2_reqs", + Help: "Number of sync v2 requests that have yet to return a response.", + }) + prometheus.MustRegister(pm.numOutstandingSyncReqsGauge) } return pm } @@ -200,6 +215,8 @@ func (h *PollerMap) EnsurePolling(pid PollerID, accessToken, v2since string, isS poller = newPoller(pid, accessToken, h.v2Client, h, logger, !needToWait && !isStartup) poller.processHistogramVec = h.processHistogramVec poller.timelineSizeVec = h.timelineSizeHistogramVec + poller.numOutstandingSyncReqs = h.numOutstandingSyncReqsGauge + poller.totalNumPolls = h.totalNumPollsCounter go poller.Poll(v2since) h.Pollers[pid] = poller @@ -351,9 +368,11 @@ type poller struct { totalChangedDeviceLists int totalLeftDeviceLists int - pollHistogramVec *prometheus.HistogramVec - processHistogramVec *prometheus.HistogramVec - timelineSizeVec *prometheus.HistogramVec + pollHistogramVec *prometheus.HistogramVec + processHistogramVec *prometheus.HistogramVec + timelineSizeVec *prometheus.HistogramVec + numOutstandingSyncReqs prometheus.Gauge + totalNumPolls prometheus.Counter } func newPoller(pid PollerID, accessToken string, client Client, receiver V2DataReceiver, logger zerolog.Logger, initialToDeviceOnly bool) *poller { @@ -391,6 +410,9 @@ type pollLoopState struct { // Returns if the access token gets invalidated or if there was a fatal error processing v2 responses. // Use WaitUntilInitialSync() to wait until the first poll has been processed. func (p *poller) Poll(since string) { + if p.totalNumPolls != nil { + p.totalNumPolls.Inc() + } // Basing the sentry-wrangling on the sentry-go net/http integration, see e.g. 
// https://github.com/getsentry/sentry-go/blob/02e712a638c40cd9701ad52d5d1309d65d556ef9/http/sentryhttp.go#L84 // TODO is this the correct way to create hub? Should the cloning be done by the @@ -452,7 +474,13 @@ func (p *poller) poll(ctx context.Context, s *pollLoopState) error { } start := time.Now() spanCtx, region := internal.StartSpan(ctx, "DoSyncV2") + if p.numOutstandingSyncReqs != nil { + p.numOutstandingSyncReqs.Inc() + } resp, statusCode, err := p.client.DoSyncV2(spanCtx, p.accessToken, s.since, s.firstTime, p.initialToDeviceOnly) + if p.numOutstandingSyncReqs != nil { + p.numOutstandingSyncReqs.Dec() + } region.End() p.trackRequestDuration(time.Since(start), s.since == "", s.firstTime) if p.terminated.Load() { From 7621aa1ebbd42e45bcfd0400e240366067fb792d Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Fri, 7 Jul 2023 12:46:01 +0100 Subject: [PATCH 045/156] Track num polls in the right place --- sync2/poller.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sync2/poller.go b/sync2/poller.go index bf81b1a5..ac3678e9 100644 --- a/sync2/poller.go +++ b/sync2/poller.go @@ -410,9 +410,6 @@ type pollLoopState struct { // Returns if the access token gets invalidated or if there was a fatal error processing v2 responses. // Use WaitUntilInitialSync() to wait until the first poll has been processed. func (p *poller) Poll(since string) { - if p.totalNumPolls != nil { - p.totalNumPolls.Inc() - } // Basing the sentry-wrangling on the sentry-go net/http integration, see e.g. // https://github.com/getsentry/sentry-go/blob/02e712a638c40cd9701ad52d5d1309d65d556ef9/http/sentryhttp.go#L84 // TODO is this the correct way to create hub? Should the cloning be done by the @@ -461,6 +458,9 @@ func (p *poller) Poll(since string) { // s (which is assumed to be non-nil). Returns a non-nil error iff the poller loop // should halt. func (p *poller) poll(ctx context.Context, s *pollLoopState) error { + if p.totalNumPolls != nil { + p.totalNumPolls.Inc() + } if s.failCount > 0 { // don't backoff when doing v2 syncs because the response is only in the cache for a short // period of time (on massive accounts on matrix.org) such that if you wait 2,4,8min between From 4d8f3d5e5f36470bc27f6810902d8c7c294a9659 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Fri, 7 Jul 2023 13:13:56 +0100 Subject: [PATCH 046/156] Track num_devices_pending_ensure_polling per device not per http req as it's more useful this way --- sync3/handler/ensure_polling.go | 39 ++++++++++++++++++++++++++++++--- sync3/handler/handler.go | 27 +++++------------------ 2 files changed, 41 insertions(+), 25 deletions(-) diff --git a/sync3/handler/ensure_polling.go b/sync3/handler/ensure_polling.go index 76d7cfcd..430d378d 100644 --- a/sync3/handler/ensure_polling.go +++ b/sync3/handler/ensure_polling.go @@ -2,9 +2,11 @@ package handler import ( "context" + "sync" + "github.com/matrix-org/sliding-sync/internal" "github.com/matrix-org/sliding-sync/sync2" - "sync" + "github.com/prometheus/client_golang/prometheus" "github.com/matrix-org/sliding-sync/pubsub" ) @@ -30,15 +32,27 @@ type EnsurePoller struct { // pendingPolls tracks the status of pollers that we are waiting to start. pendingPolls map[sync2.PollerID]pendingInfo notifier pubsub.Notifier + // the total number of outstanding ensurepolling requests. 
+ numPendingEnsurePolling prometheus.Gauge } -func NewEnsurePoller(notifier pubsub.Notifier) *EnsurePoller { - return &EnsurePoller{ +func NewEnsurePoller(notifier pubsub.Notifier, enablePrometheus bool) *EnsurePoller { + p := &EnsurePoller{ chanName: pubsub.ChanV3, mu: &sync.Mutex{}, pendingPolls: make(map[sync2.PollerID]pendingInfo), notifier: notifier, } + if enablePrometheus { + p.numPendingEnsurePolling = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: "sliding_sync", + Subsystem: "api", + Name: "num_devices_pending_ensure_polling", + Help: "Number of devices blocked on EnsurePolling returning.", + }) + prometheus.MustRegister(p.numPendingEnsurePolling) + } + return p } // EnsurePolling blocks until the V2InitialSyncComplete response is received for this device. It is @@ -73,6 +87,7 @@ func (p *EnsurePoller) EnsurePolling(ctx context.Context, pid sync2.PollerID, to done: false, ch: ch, } + p.calculateNumOutstanding() // increment total p.mu.Unlock() // ask the pollers to poll for this device p.notifier.Notify(p.chanName, &pubsub.V3EnsurePolling{ @@ -116,6 +131,7 @@ func (p *EnsurePoller) OnInitialSyncComplete(payload *pubsub.V2InitialSyncComple pending.done = true pending.ch = nil p.pendingPolls[pid] = pending + p.calculateNumOutstanding() // decrement total log.Trace().Msg("OnInitialSyncComplete: closing channel") close(ch) } @@ -137,4 +153,21 @@ func (p *EnsurePoller) OnExpiredToken(payload *pubsub.V2ExpiredToken) { func (p *EnsurePoller) Teardown() { p.notifier.Close() + if p.numPendingEnsurePolling != nil { + prometheus.Unregister(p.numPendingEnsurePolling) + } +} + +// must hold p.mu +func (p *EnsurePoller) calculateNumOutstanding() { + if p.numPendingEnsurePolling == nil { + return + } + var total int + for _, pi := range p.pendingPolls { + if !pi.done { + total++ + } + } + p.numPendingEnsurePolling.Set(float64(total)) } diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index 7caec0cd..b9227cf3 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -60,11 +60,10 @@ type SyncLiveHandler struct { GlobalCache *caches.GlobalCache maxPendingEventUpdates int - numConns prometheus.Gauge - setupHistVec *prometheus.HistogramVec - histVec *prometheus.HistogramVec - slowReqs prometheus.Counter - numPendingEnsurePolling prometheus.Gauge + numConns prometheus.Gauge + setupHistVec *prometheus.HistogramVec + histVec *prometheus.HistogramVec + slowReqs prometheus.Counter } func NewSync3Handler( @@ -94,7 +93,7 @@ func NewSync3Handler( } // set up pubsub mechanism to start from this point - sh.EnsurePoller = NewEnsurePoller(pub) + sh.EnsurePoller = NewEnsurePoller(pub, enablePrometheus) sh.V2Sub = pubsub.NewV2Sub(sub, sh) return sh, nil @@ -133,9 +132,6 @@ func (h *SyncLiveHandler) Teardown() { if h.numConns != nil { prometheus.Unregister(h.numConns) } - if h.numPendingEnsurePolling != nil { - prometheus.Unregister(h.numPendingEnsurePolling) - } if h.setupHistVec != nil { prometheus.Unregister(h.setupHistVec) } @@ -161,12 +157,6 @@ func (h *SyncLiveHandler) addPrometheusMetrics() { Name: "num_active_conns", Help: "Number of active sliding sync connections.", }) - h.numPendingEnsurePolling = prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: "sliding_sync", - Subsystem: "api", - Name: "num_pending_ensure_polling", - Help: "Number of HTTP requests blocked on EnsurePolling returning.", - }) h.setupHistVec = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "sliding_sync", Subsystem: "api", @@ -188,7 +178,6 @@ func (h *SyncLiveHandler) 
addPrometheusMetrics() { Help: "Counter of slow (>=50s) requests, initial or otherwise.", }) prometheus.MustRegister(h.numConns) - prometheus.MustRegister(h.numPendingEnsurePolling) prometheus.MustRegister(h.setupHistVec) prometheus.MustRegister(h.histVec) prometheus.MustRegister(h.slowReqs) @@ -409,13 +398,7 @@ func (h *SyncLiveHandler) setupConnection(req *http.Request, syncReq *sync3.Requ log.Trace().Msg("checking poller exists and is running") pid := sync2.PollerID{UserID: token.UserID, DeviceID: token.DeviceID} - if h.numPendingEnsurePolling != nil { - h.numPendingEnsurePolling.Inc() - } h.EnsurePoller.EnsurePolling(req.Context(), pid, token.AccessTokenHash) - if h.numPendingEnsurePolling != nil { - h.numPendingEnsurePolling.Dec() - } log.Trace().Msg("poller exists and is running") // this may take a while so if the client has given up (e.g timed out) by this point, just stop. // We'll be quicker next time as the poller will already exist. From 150821f61ebd85c0988942f62af15cc88c88ac7a Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Fri, 7 Jul 2023 14:05:00 +0100 Subject: [PATCH 047/156] Add unregister hooks --- sync2/poller.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sync2/poller.go b/sync2/poller.go index ac3678e9..aa52baa7 100644 --- a/sync2/poller.go +++ b/sync2/poller.go @@ -156,6 +156,12 @@ func (h *PollerMap) Terminate() { if h.timelineSizeHistogramVec != nil { prometheus.Unregister(h.timelineSizeHistogramVec) } + if h.totalNumPollsCounter != nil { + prometheus.Unregister(h.totalNumPollsCounter) + } + if h.numOutstandingSyncReqsGauge != nil { + prometheus.Unregister(h.numOutstandingSyncReqsGauge) + } close(h.executor) } From f22ef91da614621bc4111fb63e3d5fbac464aa6f Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Fri, 7 Jul 2023 15:16:59 +0100 Subject: [PATCH 048/156] bugfix: distinguish between a 0 invited_count and a missing invited_count --- sync3/handler/connstate.go | 2 +- sync3/handler/connstate_live.go | 2 +- sync3/room.go | 3 ++- tests-e2e/main_test.go | 2 +- tests-e2e/membership_transitions_test.go | 23 ++++++++++++++++++++--- testutils/m/match.go | 16 ++++++++++++++-- 6 files changed, 39 insertions(+), 9 deletions(-) diff --git a/sync3/handler/connstate.go b/sync3/handler/connstate.go index 4184a970..04d5e237 100644 --- a/sync3/handler/connstate.go +++ b/sync3/handler/connstate.go @@ -571,7 +571,7 @@ func (s *ConnState) getInitialRoomData(ctx context.Context, roomSub sync3.RoomSu Initial: true, IsDM: userRoomData.IsDM, JoinedCount: metadata.JoinCount, - InvitedCount: metadata.InviteCount, + InvitedCount: &metadata.InviteCount, PrevBatch: userRoomData.RequestedPrevBatch, } } diff --git a/sync3/handler/connstate_live.go b/sync3/handler/connstate_live.go index 4f3579c8..8a2e2cdc 100644 --- a/sync3/handler/connstate_live.go +++ b/sync3/handler/connstate_live.go @@ -218,7 +218,7 @@ func (s *connStateLive) processLiveUpdate(ctx context.Context, up caches.Update, thisRoom.Name = internal.CalculateRoomName(metadata, 5) // TODO: customisable? 
}
 		if delta.InviteCountChanged {
-			thisRoom.InvitedCount = roomUpdate.GlobalRoomMetadata().InviteCount
+			thisRoom.InvitedCount = &roomUpdate.GlobalRoomMetadata().InviteCount
 		}
 		if delta.JoinCountChanged {
 			thisRoom.JoinedCount = roomUpdate.GlobalRoomMetadata().JoinCount
diff --git a/sync3/room.go b/sync3/room.go
index 388b06b7..6bf7c35d 100644
--- a/sync3/room.go
+++ b/sync3/room.go
@@ -2,6 +2,7 @@ package sync3
 
 import (
 	"encoding/json"
+
 	"github.com/matrix-org/sliding-sync/internal"
 	"github.com/matrix-org/sliding-sync/sync3/caches"
@@ -17,7 +18,7 @@ type Room struct {
 	Initial           bool              `json:"initial,omitempty"`
 	IsDM              bool              `json:"is_dm,omitempty"`
 	JoinedCount       int               `json:"joined_count,omitempty"`
-	InvitedCount      int               `json:"invited_count,omitempty"`
+	InvitedCount      *int              `json:"invited_count,omitempty"`
 	PrevBatch         string            `json:"prev_batch,omitempty"`
 	NumLive           int               `json:"num_live,omitempty"`
 }
diff --git a/tests-e2e/main_test.go b/tests-e2e/main_test.go
index 6d5805fb..52476773 100644
--- a/tests-e2e/main_test.go
+++ b/tests-e2e/main_test.go
@@ -159,7 +159,7 @@ func MatchRoomInviteState(events []Event, partial bool) m.RoomMatcher {
 			}
 		}
 		if !found {
-			return fmt.Errorf("MatchRoomInviteState: want event %+v but it does not exist", want)
+			return fmt.Errorf("MatchRoomInviteState: want event %+v but it does not exist or failed to pass equality checks", want)
 		}
 	}
 	return nil
diff --git a/tests-e2e/membership_transitions_test.go b/tests-e2e/membership_transitions_test.go
index 9bfb7e1f..2271d78e 100644
--- a/tests-e2e/membership_transitions_test.go
+++ b/tests-e2e/membership_transitions_test.go
@@ -64,7 +64,22 @@ func TestRoomStateTransitions(t *testing.T) {
 			m.MatchRoomHighlightCount(1),
 			m.MatchRoomInitial(true),
 			m.MatchRoomRequiredState(nil),
-			// TODO m.MatchRoomInviteState(inviteStrippedState.InviteState.Events),
+			m.MatchInviteCount(1),
+			m.MatchJoinCount(1),
+			MatchRoomInviteState([]Event{
+				{
+					Type:     "m.room.create",
+					StateKey: ptr(""),
+					// no content as it includes the room version which we don't want to guess/hardcode
+				},
+				{
+					Type:     "m.room.join_rules",
+					StateKey: ptr(""),
+					Content: map[string]interface{}{
+						"join_rule": "public",
+					},
+				},
+			}, true),
 		},
 		joinRoomID: {},
 	}),
@@ -105,6 +120,8 @@
 			},
 		}),
 		m.MatchRoomInitial(true),
+		m.MatchJoinCount(2),
+		m.MatchInviteCount(0),
 		m.MatchRoomHighlightCount(0),
 	))
 }
@@ -467,7 +484,7 @@ func TestMemberCounts(t *testing.T) {
 	m.MatchResponse(t, res, m.MatchRoomSubscriptionsStrict(map[string][]m.RoomMatcher{
 		secondRoomID: {
 			m.MatchRoomInitial(false),
-			m.MatchInviteCount(0),
+			m.MatchNoInviteCount(),
 			m.MatchJoinCount(0), // omitempty
 		},
 	}))
@@ -486,7 +503,7 @@ func TestMemberCounts(t *testing.T) {
 	m.MatchResponse(t, res, m.MatchRoomSubscriptionsStrict(map[string][]m.RoomMatcher{
 		secondRoomID: {
 			m.MatchRoomInitial(false),
-			m.MatchInviteCount(0),
+			m.MatchNoInviteCount(),
 			m.MatchJoinCount(2),
 		},
 	}))
diff --git a/testutils/m/match.go b/testutils/m/match.go
index e44d0d82..a7b2f635 100644
--- a/testutils/m/match.go
+++ b/testutils/m/match.go
@@ -48,10 +48,22 @@ func MatchJoinCount(count int) RoomMatcher {
 	}
 }
 
+func MatchNoInviteCount() RoomMatcher {
+	return func(r sync3.Room) error {
+		if r.InvitedCount != nil {
+			return fmt.Errorf("MatchNoInviteCount: invited_count is present when it should be missing: val=%v", *r.InvitedCount)
+		}
+		return nil
+	}
+}
+
 func MatchInviteCount(count int) RoomMatcher {
 	return func(r sync3.Room) error {
-		if r.InvitedCount != count {
-			return fmt.Errorf("MatchInviteCount: got %v want %v",
r.InvitedCount, count) + if r.InvitedCount == nil { + return fmt.Errorf("MatchInviteCount: invited_count is missing") + } + if *r.InvitedCount != count { + return fmt.Errorf("MatchInviteCount: got %v want %v", *r.InvitedCount, count) } return nil } From 41a724038467c275129fd016bd9a541fe07883b6 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Mon, 10 Jul 2023 13:04:50 +0100 Subject: [PATCH 049/156] Report a metric for the size of gappy state blocks --- sync2/poller.go | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sync2/poller.go b/sync2/poller.go index aa52baa7..27a97bf1 100644 --- a/sync2/poller.go +++ b/sync2/poller.go @@ -69,6 +69,7 @@ type PollerMap struct { executorRunning bool processHistogramVec *prometheus.HistogramVec timelineSizeHistogramVec *prometheus.HistogramVec + gappyStateSizeVec *prometheus.HistogramVec numOutstandingSyncReqsGauge prometheus.Gauge totalNumPollsCounter prometheus.Counter } @@ -121,6 +122,14 @@ func NewPollerMap(v2Client Client, enablePrometheus bool) *PollerMap { Buckets: []float64{0.0, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0}, }, []string{"limited"}) prometheus.MustRegister(pm.timelineSizeHistogramVec) + pm.gappyStateSizeVec = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "sliding_sync", + Subsystem: "poller", + Name: "gappy_state_size", + Help: "Number of events in a state block during a sync v2 gappy sync", + Buckets: []float64{1.0, 10.0, 100.0, 1000.0, 10000.0}, + }, nil) + prometheus.MustRegister(pm.gappyStateSizeVec) pm.totalNumPollsCounter = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: "sliding_sync", Subsystem: "poller", @@ -156,6 +165,9 @@ func (h *PollerMap) Terminate() { if h.timelineSizeHistogramVec != nil { prometheus.Unregister(h.timelineSizeHistogramVec) } + if h.gappyStateSizeVec != nil { + prometheus.Unregister(h.gappyStateSizeVec) + } if h.totalNumPollsCounter != nil { prometheus.Unregister(h.totalNumPollsCounter) } @@ -221,6 +233,7 @@ func (h *PollerMap) EnsurePolling(pid PollerID, accessToken, v2since string, isS poller = newPoller(pid, accessToken, h.v2Client, h, logger, !needToWait && !isStartup) poller.processHistogramVec = h.processHistogramVec poller.timelineSizeVec = h.timelineSizeHistogramVec + poller.gappyStateSizeVec = h.gappyStateSizeVec poller.numOutstandingSyncReqs = h.numOutstandingSyncReqsGauge poller.totalNumPolls = h.totalNumPollsCounter go poller.Poll(v2since) @@ -377,6 +390,7 @@ type poller struct { pollHistogramVec *prometheus.HistogramVec processHistogramVec *prometheus.HistogramVec timelineSizeVec *prometheus.HistogramVec + gappyStateSizeVec *prometheus.HistogramVec numOutstandingSyncReqs prometheus.Gauge totalNumPolls prometheus.Counter } @@ -741,3 +755,10 @@ func (p *poller) trackTimelineSize(size int, limited bool) { } p.timelineSizeVec.WithLabelValues(label).Observe(float64(size)) } + +func (p *poller) trackGappyStateSize(size int) { + if p.gappyStateSizeVec == nil { + return + } + p.gappyStateSizeVec.WithLabelValues().Observe(float64(size)) +} From 5064f64b351e60bba1f31ce8f584a84e53a1d988 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Mon, 10 Jul 2023 14:23:48 +0100 Subject: [PATCH 050/156] Log error message to stdout if poller panics otherwise we only see the error message if we're using sentry. 
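
As a minimal, self-contained illustration of the pattern this patch applies (the `run` helper below is hypothetical, not part of the diff):

```go
package main

import (
	"runtime/debug"

	"github.com/rs/zerolog/log"
)

// run invokes fn and, if it panics, logs the panic value together with
// the goroutine stack so the failure is visible without Sentry.
func run(fn func()) {
	defer func() {
		if r := recover(); r != nil {
			log.Error().Msgf("%v. Traceback:\n%s", r, debug.Stack())
		}
	}()
	fn()
}

func main() {
	run(func() { panic("poller exploded") })
}
```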
--- sync2/poller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sync2/poller.go b/sync2/poller.go index 27a97bf1..44a370f2 100644 --- a/sync2/poller.go +++ b/sync2/poller.go @@ -447,7 +447,7 @@ func (p *poller) Poll(since string) { defer func() { panicErr := recover() if panicErr != nil { - logger.Error().Str("user", p.userID).Str("device", p.deviceID).Msg(string(debug.Stack())) + logger.Error().Str("user", p.userID).Str("device", p.deviceID).Msgf("%s. Traceback:\n%s", panicErr, debug.Stack()) internal.GetSentryHubFromContextOrDefault(ctx).RecoverWithContext(ctx, panicErr) } p.receiver.OnTerminated(ctx, p.userID, p.deviceID) From dcf8db347236d8e61eacaf2aa3c571adb902ff67 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Mon, 10 Jul 2023 16:19:05 +0100 Subject: [PATCH 051/156] Actually observe the new metric --- sync2/poller.go | 1 + 1 file changed, 1 insertion(+) diff --git a/sync2/poller.go b/sync2/poller.go index 44a370f2..9da833d3 100644 --- a/sync2/poller.go +++ b/sync2/poller.go @@ -666,6 +666,7 @@ func (p *poller) parseRoomsResponse(ctx context.Context, res *SyncResponse) { }) hub.CaptureMessage(warnMsg) }) + p.trackGappyStateSize(len(prependStateEvents)) roomData.Timeline.Events = append(prependStateEvents, roomData.Timeline.Events...) } } From e19df00bc41e159385f8876dae13fd31b9700537 Mon Sep 17 00:00:00 2001 From: TheSunCat <44881120+TheSunCat@users.noreply.github.com> Date: Mon, 10 Jul 2023 17:39:53 +0200 Subject: [PATCH 052/156] Detail instructions for hosting --- README.md | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index a7c5c0b5..7a7d9d15 100644 --- a/README.md +++ b/README.md @@ -34,22 +34,85 @@ changes in the proxy itself. ## Usage +### Setup Requires Postgres 13+. +First, you must create a Postgres database and secret: ```bash $ createdb syncv3 $ echo -n "$(openssl rand -hex 32)" > .secret # this MUST remain the same throughout the lifetime of the database created above. ``` -Compiling from source and running: +The Sliding Sync proxy requires some environment variables set to function. They are described when the proxy is run with the `--help` switch. + +Here is a short description of each, as of writing: +``` +SYNCV3_SERVER Required. The destination homeserver to talk to (CS API HTTPS URL) e.g 'https://matrix-client.matrix.org' +SYNCV3_DB Required. The postgres connection string: https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING +SYNCV3_SECRET Required. A secret to use to encrypt access tokens. Must remain the same for the lifetime of the database. +SYNCV3_BINDADDR Default: 0.0.0.0:8008. The interface and port to listen on. +SYNCV3_TLS_CERT Default: unset. Path to a certificate file to serve to HTTPS clients. Specifying this enables TLS on the bound address. +SYNCV3_TLS_KEY Default: unset. Path to a key file for the certificate. Must be provided along with the certificate file. +SYNCV3_PPROF Default: unset. The bind addr for pprof debugging e.g ':6060'. If not set, does not listen. +SYNCV3_PROM Default: unset. The bind addr for Prometheus metrics, which will be accessible at /metrics at this address. +SYNCV3_JAEGER_URL Default: unset. The Jaeger URL to send spans to e.g http://localhost:14268/api/traces - if unset does not send OTLP traces. +SYNCV3_SENTRY_DSN Default: unset. The Sentry DSN to report events to e.g https://sliding-sync@sentry.example.com/123 - if unset does not send sentry events. 
+SYNCV3_LOG_LEVEL   Default: info. The level of verbosity for messages logged. Available values are trace, debug, info, warn, error and fatal
+```
+
+It is easiest to host the proxy on a separate hostname from the Matrix server, though it is possible to use the same hostname by forwarding the required endpoints.
+
+In both cases, the path `https://example.com/.well-known/matrix/client` must return a JSON document with at least the following contents:
+```json
+{
+    "m.server": {
+        "base_url": "https://example.com"
+    },
+    "m.homeserver": {
+        "base_url": "https://example.com"
+    },
+    "org.matrix.msc3575.proxy": {
+        "url": "https://syncv3.example.com"
+    }
+}
+```
+
+#### Same hostname
+The following nginx configuration can be used to pass the required endpoints to the sync proxy, running on local port 8009 (so as not to conflict with Synapse):
+```nginx
+location ~* ^/(client/|_matrix/client/unstable/org.matrix.msc3575/sync) {
+    proxy_pass http://localhost:8009;
+    proxy_set_header X-Forwarded-For $remote_addr;
+    proxy_set_header X-Forwarded-Proto $scheme;
+    proxy_set_header Host $host;
+}
+
+location ~* ^(\/_matrix|\/_synapse\/client) {
+    proxy_pass http://localhost:8008;
+    proxy_set_header X-Forwarded-For $remote_addr;
+    proxy_set_header X-Forwarded-Proto $scheme;
+    proxy_set_header Host $host;
+}
+
+location /.well-known/matrix/client {
+    add_header Access-Control-Allow-Origin *;
+}
+```
+
+### Running
+There are two ways to run the proxy:
+- Compiling from source:
 ```
 $ go build ./cmd/syncv3
-$ SYNCV3_SECRET=$(cat .secret) SYNCV3_SERVER="https://matrix-client.matrix.org" SYNCV3_DB="user=$(whoami) dbname=syncv3 sslmode=disable" SYNCV3_BINDADDR=0.0.0.0:8008 ./syncv3
+$ SYNCV3_SECRET=$(cat .secret) SYNCV3_SERVER="https://matrix-client.matrix.org" SYNCV3_DB="user=$(whoami) dbname=syncv3 sslmode=disable password='DATABASE_PASSWORD_HERE'" SYNCV3_BINDADDR=0.0.0.0:8008 ./syncv3
 ```
-Using a Docker image:
+
+- Using a Docker image:
 ```
-docker run --rm -e "SYNCV3_SERVER=https://matrix-client.matrix.org" -e "SYNCV3_SECRET=$(cat .secret)" -e "SYNCV3_BINDADDR=:8008" -e "SYNCV3_DB=user=$(whoami) dbname=syncv3 sslmode=disable host=host.docker.internal" -p 8008:8008 ghcr.io/matrix-org/sliding-sync:latest
+docker run --rm -e "SYNCV3_SERVER=https://matrix-client.matrix.org" -e "SYNCV3_SECRET=$(cat .secret)" -e "SYNCV3_BINDADDR=:8008" -e "SYNCV3_DB=user=$(whoami) dbname=syncv3 sslmode=disable host=host.docker.internal password='DATABASE_PASSWORD_HERE'" -p 8008:8008 ghcr.io/matrix-org/sliding-sync:latest
 ```
+
+
 Optionally also set `SYNCV3_TLS_CERT=path/to/cert.pem` and `SYNCV3_TLS_KEY=path/to/key.pem` to listen on HTTPS instead of HTTP.
 Make sure to tweak the `SYNCV3_DB` environment variable if the Postgres database isn't running on the host.
@@ -163,4 +226,4 @@ Run end-to-end tests:
 # to ghcr and pull the image.
 docker run --rm -e "SYNAPSE_COMPLEMENT_DATABASE=sqlite" -e "SERVER_NAME=synapse" -p 8888:8008 ghcr.io/matrix-org/synapse-service:v1.72.0
 (go build ./cmd/syncv3 && dropdb syncv3_test && createdb syncv3_test && cd tests-e2e && ./run-tests.sh -count=1 .)
-``` \ No newline at end of file +``` From d8ef7a604b6f159d6433caa60ef8bcb73c26a562 Mon Sep 17 00:00:00 2001 From: TheSunCat <44881120+TheSunCat@users.noreply.github.com> Date: Mon, 10 Jul 2023 17:47:32 +0200 Subject: [PATCH 053/156] Correct mention of --help flag, which does not exist --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7a7d9d15..5c0398c6 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ $ createdb syncv3 $ echo -n "$(openssl rand -hex 32)" > .secret # this MUST remain the same throughout the lifetime of the database created above. ``` -The Sliding Sync proxy requires some environment variables set to function. They are described when the proxy is run with the `--help` switch. +The Sliding Sync proxy requires some environment variables set to function. They are described when the proxy is run with missing variables. Here is a short description of each, as of writing: ``` From e947612ad959c93edf29fc27453202ab789a6f9c Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Tue, 11 Jul 2023 19:08:32 +0100 Subject: [PATCH 054/156] Fix #192: ignore unseen old events --- state/accumulator.go | 125 ++++++++++++++++++++------ state/accumulator_test.go | 81 ----------------- tests-integration/regressions_test.go | 112 +++++++++++++++++++++++ 3 files changed, 210 insertions(+), 108 deletions(-) create mode 100644 tests-integration/regressions_test.go diff --git a/state/accumulator.go b/state/accumulator.go index 0105786f..acab4555 100644 --- a/state/accumulator.go +++ b/state/accumulator.go @@ -293,34 +293,20 @@ func (a *Accumulator) Initialise(roomID string, state []json.RawMessage) (Initia // - Else it creates a new room state snapshot if the timeline contains state events (as this now represents the current state) // - It adds entries to the membership log for membership events. func (a *Accumulator) Accumulate(txn *sqlx.Tx, roomID string, prevBatch string, timeline []json.RawMessage) (numNew int, timelineNIDs []int64, err error) { - // Insert the events. Check for duplicates which can happen in the real world when joining - // Matrix HQ on Synapse. - dedupedEvents := make([]Event, 0, len(timeline)) - seenEvents := make(map[string]struct{}) - for i := range timeline { - e := Event{ - JSON: timeline[i], - RoomID: roomID, - } - if err := e.ensureFieldsSetOnEvent(); err != nil { - return 0, nil, fmt.Errorf("event malformed: %s", err) - } - if _, ok := seenEvents[e.ID]; ok { - logger.Warn().Str("event_id", e.ID).Str("room_id", roomID).Msg( - "Accumulator.Accumulate: seen the same event ID twice, ignoring", - ) - continue - } - if i == 0 && prevBatch != "" { - // tag the first timeline event with the prev batch token - e.PrevBatch = sql.NullString{ - String: prevBatch, - Valid: true, - } - } - dedupedEvents = append(dedupedEvents, e) - seenEvents[e.ID] = struct{}{} + // The first stage of accumulating events is mostly around validation around what the upstream HS sends us. For accumulation to work correctly + // we expect: + // - there to be no duplicate events + // - if there are new events, they are always new. 
+	// Both of these assumptions can be false for different reasons.
+	dedupedEvents, err := a.filterAndParseTimelineEvents(txn, roomID, timeline, prevBatch)
+	if err != nil {
+		err = fmt.Errorf("filterAndParseTimelineEvents: %w", err)
+		return
 	}
+	if len(dedupedEvents) == 0 {
+		return 0, nil, err // nothing to do
+	}
+
 	eventIDToNID, err := a.eventsTable.Insert(txn, dedupedEvents, false)
 	if err != nil {
 		return 0, nil, err
@@ -413,6 +399,91 @@ func (a *Accumulator) Accumulate(txn *sqlx.Tx, roomID string, prevBatch string,
 	return numNew, timelineNIDs, nil
 }
 
+// filterAndParseTimelineEvents takes a raw timeline array from sync v2 and applies sanity checks to it:
+// - removes duplicate events: this is just a bug which has been seen on Synapse on matrix.org
+// - removes old events: this is an edge case when joining rooms over federation, see https://github.com/matrix-org/sliding-sync/issues/192
+// - parses it and returns Event structs.
+// - checks which events are unknown. If all events are known, filters them all out.
+func (a *Accumulator) filterAndParseTimelineEvents(txn *sqlx.Tx, roomID string, timeline []json.RawMessage, prevBatch string) ([]Event, error) {
+	// Check for duplicates, which can happen in the real world when joining
+	// Matrix HQ on Synapse, as well as when you join rooms for the first time over federation.
+	dedupedEvents := make([]Event, 0, len(timeline))
+	seenEvents := make(map[string]struct{})
+	for i := range timeline {
+		e := Event{
+			JSON:   timeline[i],
+			RoomID: roomID,
+		}
+		if err := e.ensureFieldsSetOnEvent(); err != nil {
+			return nil, fmt.Errorf("event malformed: %s", err)
+		}
+		if _, ok := seenEvents[e.ID]; ok {
+			logger.Warn().Str("event_id", e.ID).Str("room_id", roomID).Msg(
+				"Accumulator.filterAndParseTimelineEvents: seen the same event ID twice, ignoring",
+			)
+			continue
+		}
+		if i == 0 && prevBatch != "" {
+			// tag the first timeline event with the prev batch token
+			e.PrevBatch = sql.NullString{
+				String: prevBatch,
+				Valid:  true,
+			}
+		}
+		dedupedEvents = append(dedupedEvents, e)
+		seenEvents[e.ID] = struct{}{}
+	}
+
+	// if we only have a single timeline event we cannot determine if it is old or not, as we rely on already seen events
+	// appearing after it (i.e. at a higher index).
+	if len(dedupedEvents) <= 1 {
+		return dedupedEvents, nil
+	}
+
+	// Figure out which of these events are unseen and hence brand new live events.
+	// In some cases, we may have unseen OLD events - see https://github.com/matrix-org/sliding-sync/issues/192
+	// in which case we need to drop those events.
+	dedupedEventIDs := make([]string, 0, len(seenEvents))
+	for evID := range seenEvents {
+		dedupedEventIDs = append(dedupedEventIDs, evID)
+	}
+	unknownEventIDs, err := a.eventsTable.SelectUnknownEventIDs(txn, dedupedEventIDs)
+	if err != nil {
+		return nil, fmt.Errorf("filterAndParseTimelineEvents: failed to SelectUnknownEventIDs: %w", err)
+	}
+
+	if len(unknownEventIDs) == 0 {
+		// every event has been seen already, no work to do
+		return nil, nil
+	}
+
+	// In the happy case, we expect to see timeline arrays like this: (SEEN=S, UNSEEN=U)
+	// [S,S,U,U] -> want last 2
+	// [U,U,U] -> want all
+	// In the backfill edge case, we might see:
+	// [U,S,S,S] -> want none
+	// [U,S,S,U] -> want last 1
+	// We should never see scenarios like:
+	// [U,S,S,U,S,S] <- we should only see 1 contiguous block of seen events.
+	// If we do, we'll just ignore all unseen events less than the highest seen event.
+ + // The algorithm starts at the end and just looks for the first S event, returning the subslice after that S event (which may be []) + seenIndex := -1 + for i := len(dedupedEvents) - 1; i >= 0; i-- { + _, unseen := unknownEventIDs[dedupedEvents[i].ID] + if !unseen { + seenIndex = i + break + } + } + // seenIndex can be -1 if all are unseen, or len-1 if all are seen, either way if we +1 this slices correctly: + // no seen events s[A,B,C] => s[-1+1:] => [A,B,C] + // C is seen event s[A,B,C] => s[2+1:] => [] + // B is seen event s[A,B,C] => s[1+1:] => [C] + // A is seen event s[A,B,C] => s[0+1:] => [B,C] + return dedupedEvents[seenIndex+1:], nil +} + // Delta returns a list of events of at most `limit` for the room not including `lastEventNID`. // Returns the latest NID of the last event (most recent) func (a *Accumulator) Delta(roomID string, lastEventNID int64, limit int) (eventsJSON []json.RawMessage, latest int64, err error) { diff --git a/state/accumulator_test.go b/state/accumulator_test.go index 250854e8..64ee6c86 100644 --- a/state/accumulator_test.go +++ b/state/accumulator_test.go @@ -11,7 +11,6 @@ import ( "github.com/jmoiron/sqlx" "github.com/matrix-org/sliding-sync/sqlutil" "github.com/matrix-org/sliding-sync/sync2" - "github.com/matrix-org/sliding-sync/testutils" "github.com/tidwall/gjson" ) @@ -417,86 +416,6 @@ func TestAccumulatorDupeEvents(t *testing.T) { } } -// Regression test for corrupt state snapshots. -// This seems to have happened in the wild, whereby the snapshot exhibited 2 things: -// - A message event having a event_replaces_nid. This should be impossible as messages are not state. -// - Duplicate events in the state snapshot. -// -// We can reproduce a message event having a event_replaces_nid by doing the following: -// - Create a room with initial state A,C -// - Accumulate events D, A, B(msg). This should be impossible because we already got A initially but whatever, roll with it, blame state resets or something. -// - This leads to A,B being processed and D ignored if you just take the newest results. -// -// This can then be tested by: -// - Query the current room snapshot. This will include B(msg) when it shouldn't. -func TestAccumulatorMisorderedGraceful(t *testing.T) { - alice := "@alice:localhost" - bob := "@bob:localhost" - - eventA := testutils.NewStateEvent(t, "m.room.member", alice, alice, map[string]interface{}{"membership": "join"}) - eventC := testutils.NewStateEvent(t, "m.room.create", "", alice, map[string]interface{}{}) - eventD := testutils.NewStateEvent( - t, "m.room.member", bob, "join", map[string]interface{}{"membership": "join"}, - ) - eventBMsg := testutils.NewEvent( - t, "m.room.message", bob, map[string]interface{}{"body": "hello"}, - ) - t.Logf("A=member-alice, B=msg, C=create, D=member-bob") - - db, close := connectToDB(t) - defer close() - accumulator := NewAccumulator(db) - roomID := "!TestAccumulatorStateReset:localhost" - // Create a room with initial state A,C - _, err := accumulator.Initialise(roomID, []json.RawMessage{ - eventA, eventC, - }) - if err != nil { - t.Fatalf("failed to Initialise accumulator: %s", err) - } - - // Accumulate events D, A, B(msg). 
- err = sqlutil.WithTransaction(accumulator.db, func(txn *sqlx.Tx) error { - _, _, err = accumulator.Accumulate(txn, roomID, "", []json.RawMessage{eventD, eventA, eventBMsg}) - return err - }) - if err != nil { - t.Fatalf("failed to Accumulate: %s", err) - } - - eventIDs := []string{ - gjson.GetBytes(eventA, "event_id").Str, - gjson.GetBytes(eventBMsg, "event_id").Str, - gjson.GetBytes(eventC, "event_id").Str, - gjson.GetBytes(eventD, "event_id").Str, - } - t.Logf("Events A,B,C,D: %v", eventIDs) - txn := accumulator.db.MustBeginTx(context.Background(), nil) - idsToNIDs, err := accumulator.eventsTable.SelectNIDsByIDs(txn, eventIDs) - if err != nil { - t.Fatalf("Failed to SelectNIDsByIDs: %s", err) - } - if len(idsToNIDs) != len(eventIDs) { - t.Errorf("SelectNIDsByIDs: asked for %v got %v", eventIDs, idsToNIDs) - } - t.Logf("Events: %v", idsToNIDs) - - wantEventNIDs := []int64{ - idsToNIDs[eventIDs[0]], idsToNIDs[eventIDs[2]], idsToNIDs[eventIDs[3]], - } - sort.Slice(wantEventNIDs, func(i, j int) bool { - return wantEventNIDs[i] < wantEventNIDs[j] - }) - // Query the current room snapshot - gotSnapshotEvents := currentSnapshotNIDs(t, accumulator.snapshotTable, roomID) - if len(gotSnapshotEvents) != len(wantEventNIDs) { // events A,C,D - t.Errorf("corrupt snapshot, got %v want %v", gotSnapshotEvents, wantEventNIDs) - } - if !reflect.DeepEqual(wantEventNIDs, gotSnapshotEvents) { - t.Errorf("got %v want %v", gotSnapshotEvents, wantEventNIDs) - } -} - // Regression test for corrupt state snapshots. // This seems to have happened in the wild, whereby the snapshot exhibited 2 things: // - A message event having a event_replaces_nid. This should be impossible as messages are not state. diff --git a/tests-integration/regressions_test.go b/tests-integration/regressions_test.go new file mode 100644 index 00000000..d24869ce --- /dev/null +++ b/tests-integration/regressions_test.go @@ -0,0 +1,112 @@ +package syncv3 + +import ( + "encoding/json" + "testing" + "time" + + "github.com/matrix-org/sliding-sync/sync2" + "github.com/matrix-org/sliding-sync/sync3" + "github.com/matrix-org/sliding-sync/testutils" + "github.com/matrix-org/sliding-sync/testutils/m" +) + +// catch all file for any kind of regression test which doesn't fall into a unique category + +// Regression test for https://github.com/matrix-org/sliding-sync/issues/192 +// - Bob on his server invites Alice to a room. +// - Alice joins the room first over federation. Proxy does the right thing and sets her membership to join. There is no timeline though due to not having backfilled. +// - Alice's client backfills in the room which pulls in the invite event, but the SS proxy doesn't see it as it's backfill, not /sync. +// - Charlie joins the same room via SS, which makes the SS proxy see 50 timeline events, which includes the invite. +// As the proxy has never seen this invite event before, it assumes it is newer than the join event and inserts it, corrupting state. +// +// Manually confirmed this can happen with 3x Element clients. We need to make sure we drop those earlier events. +// The first join over federation presents itself as a single join event in the timeline, with the create event, etc in state. 
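+//
+// The fix (see filterAndParseTimelineEvents earlier in this patch): when a timeline
+// contains events the proxy has already stored, any unknown events which sort before
+// the newest known event are treated as stale backfill and dropped, rather than being
+// accumulated as new live events.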
+func TestBackfillInviteDoesntCorruptState(t *testing.T) { + pqString := testutils.PrepareDBConnectionString() + // setup code + v2 := runTestV2Server(t) + v3 := runTestServer(t, v2, pqString) + defer v2.close() + defer v3.close() + + fedBob := "@bob:over_federation" + charlie := "@charlie:localhost" + charlieToken := "CHARLIE_TOKEN" + joinEvent := testutils.NewJoinEvent(t, alice) + + room := roomEvents{ + roomID: "!TestBackfillInviteDoesntCorruptState:localhost", + events: []json.RawMessage{ + joinEvent, + }, + state: createRoomState(t, fedBob, time.Now()), + } + v2.addAccount(t, alice, aliceToken) + v2.queueResponse(alice, sync2.SyncResponse{ + Rooms: sync2.SyncRoomsResponse{ + Join: v2JoinTimeline(room), + }, + }) + + // alice syncs and should see the room. + aliceRes := v3.mustDoV3Request(t, aliceToken, sync3.Request{ + Lists: map[string]sync3.RequestList{ + "a": { + Ranges: sync3.SliceRanges{{0, 20}}, + RoomSubscription: sync3.RoomSubscription{ + TimelineLimit: 5, + }, + }, + }, + }) + m.MatchResponse(t, aliceRes, m.MatchList("a", m.MatchV3Count(1), m.MatchV3Ops(m.MatchV3SyncOp(0, 0, []string{room.roomID})))) + + // Alice's client "backfills" new data in, meaning the next user who joins is going to see a different set of timeline events + dummyMsg := testutils.NewMessageEvent(t, fedBob, "you didn't see this before joining") + charlieJoinEvent := testutils.NewJoinEvent(t, charlie) + backfilledTimelineEvents := append( + room.state, []json.RawMessage{ + dummyMsg, + testutils.NewStateEvent(t, "m.room.member", alice, fedBob, map[string]interface{}{ + "membership": "invite", + }), + joinEvent, + charlieJoinEvent, + }..., + ) + + // now charlie also joins the room, causing a different response from /sync v2 + v2.addAccount(t, charlie, charlieToken) + v2.queueResponse(charlie, sync2.SyncResponse{ + Rooms: sync2.SyncRoomsResponse{ + Join: v2JoinTimeline(roomEvents{ + roomID: room.roomID, + events: backfilledTimelineEvents, + }), + }, + }) + + // and now charlie hits SS, which might corrupt membership state for alice. 
+ charlieRes := v3.mustDoV3Request(t, charlieToken, sync3.Request{ + Lists: map[string]sync3.RequestList{ + "a": { + Ranges: sync3.SliceRanges{{0, 20}}, + }, + }, + }) + m.MatchResponse(t, charlieRes, m.MatchList("a", m.MatchV3Count(1), m.MatchV3Ops(m.MatchV3SyncOp(0, 0, []string{room.roomID})))) + + // alice should not see dummyMsg or the invite + aliceRes = v3.mustDoV3RequestWithPos(t, aliceToken, aliceRes.Pos, sync3.Request{}) + m.MatchResponse(t, aliceRes, m.MatchNoV3Ops(), m.LogResponse(t), m.MatchRoomSubscriptionsStrict( + map[string][]m.RoomMatcher{ + room.roomID: { + m.MatchJoinCount(3), // alice, bob, charlie, + m.MatchNoInviteCount(), + m.MatchNumLive(1), + m.MatchRoomTimeline([]json.RawMessage{charlieJoinEvent}), + }, + }, + )) +} From c1b0f0b93b41fcbab5f71b4d231425056d60a4e1 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Wed, 12 Jul 2023 12:24:02 +0100 Subject: [PATCH 055/156] Review comments --- .github/workflows/tests.yml | 1 + sync2/handler2/handler.go | 5 +---- sync2/handler2/handler_test.go | 7 ++++--- sync3/handler/connstate.go | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d0680cb9..7d8df5ca 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -2,6 +2,7 @@ name: Tests on: push: + branches: ["main"] pull_request: permissions: diff --git a/sync2/handler2/handler.go b/sync2/handler2/handler.go index 506a4813..15d8037e 100644 --- a/sync2/handler2/handler.go +++ b/sync2/handler2/handler.go @@ -12,9 +12,6 @@ import ( "github.com/jmoiron/sqlx" "github.com/matrix-org/sliding-sync/sqlutil" - "github.com/jmoiron/sqlx" - "github.com/matrix-org/sliding-sync/sqlutil" - "github.com/getsentry/sentry-go" "github.com/matrix-org/sliding-sync/internal" @@ -55,7 +52,7 @@ type Handler struct { } func NewHandler( - pMap *sync2.PollerMap, v2Store *sync2.Storage, store *state.Storage, + pMap sync2.IPollerMap, v2Store *sync2.Storage, store *state.Storage, pub pubsub.Notifier, sub pubsub.Listener, enablePrometheus bool, deviceDataUpdateDuration time.Duration, ) (*Handler, error) { h := &Handler{ diff --git a/sync2/handler2/handler_test.go b/sync2/handler2/handler_test.go index 20f064ab..fa315228 100644 --- a/sync2/handler2/handler_test.go +++ b/sync2/handler2/handler_test.go @@ -1,14 +1,15 @@ package handler2_test import ( - "github.com/jmoiron/sqlx" - "github.com/matrix-org/sliding-sync/sqlutil" "os" "reflect" "sync" "testing" "time" + "github.com/jmoiron/sqlx" + "github.com/matrix-org/sliding-sync/sqlutil" + "github.com/matrix-org/sliding-sync/pubsub" "github.com/matrix-org/sliding-sync/state" "github.com/matrix-org/sliding-sync/sync2" @@ -127,7 +128,7 @@ func TestHandlerFreshEnsurePolling(t *testing.T) { pMap := &mockPollerMap{} pub := newMockPub() sub := &mockSub{} - h, err := handler2.NewHandler(pMap, v2Store, store, pub, sub, false) + h, err := handler2.NewHandler(pMap, v2Store, store, pub, sub, false, time.Minute) assertNoError(t, err) alice := "@alice:localhost" deviceID := "ALICE" diff --git a/sync3/handler/connstate.go b/sync3/handler/connstate.go index a0aae60b..9f3f79ab 100644 --- a/sync3/handler/connstate.go +++ b/sync3/handler/connstate.go @@ -197,7 +197,7 @@ func (s *ConnState) onIncomingRequest(reqCtx context.Context, req *sync3.Request internal.Logf(reqCtx, "connstate", "list[%v] prev_empty=%v curr=%v", key, l.Prev == nil, listData) } for roomID, sub := range s.muxedReq.RoomSubscriptions { - internal.Logf(ctx, "connstate", "room sub[%v] %v", roomID, sub) + 
internal.Logf(reqCtx, "connstate", "room sub[%v] %v", roomID, sub) } // work out which rooms we'll return data for and add their relevant subscriptions to the builder From 8be09840d0b6b1f67b54cc542e1e26a9eb1f3217 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Wed, 12 Jul 2023 17:10:47 +0100 Subject: [PATCH 056/156] Add SYNCV3_MAX_DB_CONN: use it in e2e tests This is designed to catch a class of SQL transaction bugs where we BEGIN a transaction and then forget to use that `txn` var, and do other things on `sql.DB` which will use a different connection. By testing with max conns = 1 this will deadlock. We also test with max conns = 2 to try to catch more pernicious failure modes. Using max conns = 1 effectively serialises access to the database, but some bugs may only be apparent when there is some limited amount of concurrency available e.g mid-processing this event, do X. With max conns = 1 we cannot test this, which is why we also test with max conns = 2. --- .github/workflows/tests.yml | 5 +++++ cmd/syncv3/main.go | 12 ++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7d8df5ca..537d81fa 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -88,6 +88,10 @@ jobs: if-no-files-found: error end_to_end: runs-on: ubuntu-latest + strategy: + matrix: + # test with unlimited + 1 + 2 max db conns. If we end up double transacting in the tests anywhere, conn=1 tests will fail. + max_db_conns: [0,1,2] services: synapse: # Custom image built from https://github.com/matrix-org/synapse/tree/v1.72.0/docker/complement with a dummy /complement/ca set @@ -142,6 +146,7 @@ jobs: SYNCV3_DB: user=postgres dbname=syncv3 sslmode=disable password=postgres host=localhost SYNCV3_SERVER: http://localhost:8008 SYNCV3_SECRET: itsasecret + SYNCV3_MAX_DB_CONN: ${{ matrix.max_db_conns }} E2E_TEST_SERVER_STDOUT: test-e2e-server.log - name: Upload test log diff --git a/cmd/syncv3/main.go b/cmd/syncv3/main.go index 9938ecc3..04a0c334 100644 --- a/cmd/syncv3/main.go +++ b/cmd/syncv3/main.go @@ -6,6 +6,7 @@ import ( _ "net/http/pprof" "os" "os/signal" + "strconv" "strings" "syscall" "time" @@ -40,6 +41,7 @@ const ( EnvJaeger = "SYNCV3_JAEGER_URL" EnvSentryDsn = "SYNCV3_SENTRY_DSN" EnvLogLevel = "SYNCV3_LOG_LEVEL" + EnvMaxConns = "SYNCV3_MAX_DB_CONN" ) var helpMsg = fmt.Sprintf(` @@ -55,7 +57,8 @@ Environment var %s Default: unset. The Jaeger URL to send spans to e.g http://localhost:14268/api/traces - if unset does not send OTLP traces. %s Default: unset. The Sentry DSN to report events to e.g https://sliding-sync@sentry.example.com/123 - if unset does not send sentry events. %s Default: info. The level of verbosity for messages logged. Available values are trace, debug, info, warn, error and fatal -`, EnvServer, EnvDB, EnvSecret, EnvBindAddr, EnvTLSCert, EnvTLSKey, EnvPPROF, EnvPrometheus, EnvJaeger, EnvSentryDsn, EnvLogLevel) +%s Default: unset. Max database connections to use when communicating with postgres. Unset or 0 means no limit. 
+`, EnvServer, EnvDB, EnvSecret, EnvBindAddr, EnvTLSCert, EnvTLSKey, EnvPPROF, EnvPrometheus, EnvJaeger, EnvSentryDsn, EnvLogLevel, EnvMaxConns) func defaulting(in, dft string) string { if in == "" { @@ -81,6 +84,7 @@ func main() { EnvJaeger: os.Getenv(EnvJaeger), EnvSentryDsn: os.Getenv(EnvSentryDsn), EnvLogLevel: os.Getenv(EnvLogLevel), + EnvMaxConns: defaulting(os.Getenv(EnvMaxConns), "0"), } requiredEnvVars := []string{EnvServer, EnvDB, EnvSecret, EnvBindAddr} for _, requiredEnvVar := range requiredEnvVars { @@ -162,9 +166,13 @@ func main() { panic(err) } + maxConnsInt, err := strconv.Atoi(args[EnvMaxConns]) + if err != nil { + panic("invalid value for " + EnvMaxConns + ": " + args[EnvMaxConns]) + } h2, h3 := syncv3.Setup(args[EnvServer], args[EnvDB], args[EnvSecret], syncv3.Opts{ AddPrometheusMetrics: args[EnvPrometheus] != "", - DBMaxConns: 100, + DBMaxConns: maxConnsInt, DBConnMaxIdleTime: time.Hour, }) From 37dcd91e09432b9025281dc3050cceb6d7e91ef2 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Wed, 12 Jul 2023 17:26:43 +0100 Subject: [PATCH 057/156] Set healthcheck retries for synapse too now as we start lots of them --- .github/workflows/tests.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 537d81fa..f71ec252 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -101,6 +101,11 @@ jobs: SERVER_NAME: synapse ports: - 8008:8008 + # Set health checks to wait until synapse has started + options: >- + --health-interval 10s + --health-timeout 5s + --health-retries 5 # Label used to access the service container postgres: # Docker Hub image From c47665f1e82ac05a636307a74dacaa476643a6bd Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Wed, 12 Jul 2023 17:36:59 +0100 Subject: [PATCH 058/156] Actually honour max conns globally, not per storage struct --- state/storage.go | 4 ++++ sync2/storage.go | 4 ++++ v3.go | 30 +++++++++++++++++------------- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/state/storage.go b/state/storage.go index ff70d981..b51026cc 100644 --- a/state/storage.go +++ b/state/storage.go @@ -57,6 +57,10 @@ func NewStorage(postgresURI string) *Storage { // TODO: if we panic(), will sentry have a chance to flush the event? logger.Panic().Err(err).Str("uri", postgresURI).Msg("failed to open SQL DB") } + return NewStorageWithDB(db) +} + +func NewStorageWithDB(db *sqlx.DB) *Storage { acc := &Accumulator{ db: db, roomsTable: NewRoomsTable(db), diff --git a/sync2/storage.go b/sync2/storage.go index 5f484179..2cfa4c8a 100644 --- a/sync2/storage.go +++ b/sync2/storage.go @@ -26,6 +26,10 @@ func NewStore(postgresURI, secret string) *Storage { // TODO: if we panic(), will sentry have a chance to flush the event? 
logger.Panic().Err(err).Str("uri", postgresURI).Msg("failed to open SQL DB") } + return NewStoreWithDB(db, secret) +} + +func NewStoreWithDB(db *sqlx.DB, secret string) *Storage { return &Storage{ DevicesTable: NewDevicesTable(db), TokensTable: NewTokensTable(db, secret), diff --git a/v3.go b/v3.go index cc72d093..e8948e94 100644 --- a/v3.go +++ b/v3.go @@ -77,20 +77,24 @@ func Setup(destHomeserver, postgresURI, secret string, opts Opts) (*handler2.Han }, DestinationServer: destHomeserver, } - store := state.NewStorage(postgresURI) - storev2 := sync2.NewStore(postgresURI, secret) - for _, db := range []*sqlx.DB{store.DB, storev2.DB} { - if opts.DBMaxConns > 0 { - // https://github.com/go-sql-driver/mysql#important-settings - // "db.SetMaxIdleConns() is recommended to be set same to db.SetMaxOpenConns(). When it is smaller - // than SetMaxOpenConns(), connections can be opened and closed much more frequently than you expect." - db.SetMaxOpenConns(opts.DBMaxConns) - db.SetMaxIdleConns(opts.DBMaxConns) - } - if opts.DBConnMaxIdleTime > 0 { - db.SetConnMaxIdleTime(opts.DBConnMaxIdleTime) - } + db, err := sqlx.Open("postgres", postgresURI) + if err != nil { + sentry.CaptureException(err) + // TODO: if we panic(), will sentry have a chance to flush the event? + logger.Panic().Err(err).Str("uri", postgresURI).Msg("failed to open SQL DB") + } + if opts.DBMaxConns > 0 { + // https://github.com/go-sql-driver/mysql#important-settings + // "db.SetMaxIdleConns() is recommended to be set same to db.SetMaxOpenConns(). When it is smaller + // than SetMaxOpenConns(), connections can be opened and closed much more frequently than you expect." + db.SetMaxOpenConns(opts.DBMaxConns) + db.SetMaxIdleConns(opts.DBMaxConns) + } + if opts.DBConnMaxIdleTime > 0 { + db.SetConnMaxIdleTime(opts.DBConnMaxIdleTime) } + store := state.NewStorageWithDB(db) + storev2 := sync2.NewStoreWithDB(db, secret) bufferSize := 50 deviceDataUpdateFrequency := time.Second if opts.TestingSynchronousPubsub { From ea805f7a1c293dce11807c610b12cfde6e72e226 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Wed, 12 Jul 2023 17:42:04 +0100 Subject: [PATCH 059/156] Log calculated values for sanity --- cmd/syncv3/main.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmd/syncv3/main.go b/cmd/syncv3/main.go index 04a0c334..654b1353 100644 --- a/cmd/syncv3/main.go +++ b/cmd/syncv3/main.go @@ -140,6 +140,8 @@ func main() { } } + fmt.Printf("Debug=%v LogLevel=%v MaxConns=%v\n", args[EnvDebug] == "1", args[EnvLogLevel], args[EnvMaxConns]) + if args[EnvDebug] == "1" { zerolog.SetGlobalLevel(zerolog.TraceLevel) } else { From 2e928284b3f931e3e293781878df594b40db711f Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Wed, 12 Jul 2023 17:53:11 +0100 Subject: [PATCH 060/156] Test double transact --- state/snapshot_table.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/state/snapshot_table.go b/state/snapshot_table.go index b2e75ad3..2636900b 100644 --- a/state/snapshot_table.go +++ b/state/snapshot_table.go @@ -75,7 +75,8 @@ func (s *SnapshotTable) Insert(txn *sqlx.Tx, row *SnapshotRow) error { if row.OtherEvents == nil { row.OtherEvents = []int64{} } - err := txn.QueryRow( + // OH NO, NOT USING THE TXN! 
+	err := s.db.QueryRow(
 		`INSERT INTO syncv3_snapshots(room_id, events, membership_events) VALUES($1, $2, $3) RETURNING snapshot_id`,
 		row.RoomID, row.OtherEvents, row.MembershipEvents,
 	).Scan(&id)

From 0fa136f32ddacc45579e1308c421405d2bc6ce15 Mon Sep 17 00:00:00 2001
From: Kegan Dougal
Date: Wed, 12 Jul 2023 17:58:11 +0100
Subject: [PATCH 061/156] Revert "Test double transact"

This reverts commit 2e928284b3f931e3e293781878df594b40db711f.

---
 state/snapshot_table.go | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/state/snapshot_table.go b/state/snapshot_table.go
index 2636900b..b2e75ad3 100644
--- a/state/snapshot_table.go
+++ b/state/snapshot_table.go
@@ -75,8 +75,7 @@ func (s *SnapshotTable) Insert(txn *sqlx.Tx, row *SnapshotRow) error {
 	if row.OtherEvents == nil {
 		row.OtherEvents = []int64{}
 	}
-	// OH NO, NOT USING THE TXN!
-	err := s.db.QueryRow(
+	err := txn.QueryRow(
 		`INSERT INTO syncv3_snapshots(room_id, events, membership_events) VALUES($1, $2, $3) RETURNING snapshot_id`,
 		row.RoomID, row.OtherEvents, row.MembershipEvents,
 	).Scan(&id)

From 4e387ca9f0689cf04843c3c57c3ce835a47b6831 Mon Sep 17 00:00:00 2001
From: Kegan Dougal
Date: Wed, 12 Jul 2023 18:20:09 +0100
Subject: [PATCH 062/156] v0.99.4

---
 README.md          | 8 ++++++++
 cmd/syncv3/main.go | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5c0398c6..26ea18d7 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,13 @@ changes in the proxy itself.
 - Version 0.99.3 [2023/05/23](https://github.com/matrix-org/matrix-spec-proposals/blob/4103ee768a4a3e1decee80c2987f50f4c6b3d539/proposals/3575-sync.md)
   - Support for per-list `bump_event_types`.
   - Support for [`conn_id`](https://github.com/matrix-org/matrix-spec-proposals/blob/4103ee768a4a3e1decee80c2987f50f4c6b3d539/proposals/3575-sync.md#concurrent-connections) for distinguishing multiple concurrent connections.
+- Version 0.99.4 [2023/07/12](https://github.com/matrix-org/matrix-spec-proposals/blob/4103ee768a4a3e1decee80c2987f50f4c6b3d539/proposals/3575-sync.md)
+  - Support for `SYNCV3_MAX_DB_CONN`, and reduce the number of concurrent connections required during normal operation.
+  - Add more metrics and logs. Reduce log spam.
+  - Improve performance when handling changed device lists.
+  - Responses will consume from the live buffer even when clients change their request parameters, so that new events are sent down more speedily.
+  - Bugfix: return `invited_count` correctly when it transitions to 0.
+  - Bugfix: fix a data corruption bug when 2 users join a federated room where the first user was invited to said room.
 
 ## Usage
 
@@ -58,6 +65,7 @@ SYNCV3_PROM        Default: unset. The bind addr for Prometheus metrics, which will be accessible at /metrics at this address.
 SYNCV3_JAEGER_URL  Default: unset. The Jaeger URL to send spans to e.g http://localhost:14268/api/traces - if unset does not send OTLP traces.
 SYNCV3_SENTRY_DSN  Default: unset. The Sentry DSN to report events to e.g https://sliding-sync@sentry.example.com/123 - if unset does not send sentry events.
 SYNCV3_LOG_LEVEL   Default: info. The level of verbosity for messages logged. Available values are trace, debug, info, warn, error and fatal
+SYNCV3_MAX_DB_CONN Default: unset. Max database connections to use when communicating with postgres. Unset or 0 means no limit.
 ```
 
 It is easiest to host the proxy on a separate hostname from the Matrix server, though it is possible to use the same hostname by forwarding the required endpoints.
diff --git a/cmd/syncv3/main.go b/cmd/syncv3/main.go index 654b1353..a43fa3a1 100644 --- a/cmd/syncv3/main.go +++ b/cmd/syncv3/main.go @@ -23,7 +23,7 @@ import ( var GitCommit string -const version = "0.99.3" +const version = "0.99.4" const ( // Required fields From baa5d05d3133a7f175b805767bcde170942cc830 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Thu, 13 Jul 2023 18:19:00 +0100 Subject: [PATCH 063/156] Use the rooms table initially when querying latest nids --- state/event_table.go | 4 ++-- state/event_table_test.go | 6 +++++- state/storage.go | 32 ++++++++++++++++++++++++++++++++ sync3/caches/global.go | 2 +- 4 files changed, 40 insertions(+), 4 deletions(-) diff --git a/state/event_table.go b/state/event_table.go index 925844c3..578afbf5 100644 --- a/state/event_table.go +++ b/state/event_table.go @@ -317,10 +317,10 @@ func (t *EventTable) LatestEventInRooms(txn *sqlx.Tx, roomIDs []string, highestN return } -func (t *EventTable) LatestEventNIDInRooms(roomIDs []string, highestNID int64) (roomToNID map[string]int64, err error) { +func (t *EventTable) LatestEventNIDInRooms(txn *sqlx.Tx, roomIDs []string, highestNID int64) (roomToNID map[string]int64, err error) { // the position (event nid) may be for a random different room, so we need to find the highest nid <= this position for this room var events []Event - err = t.db.Select( + err = txn.Select( &events, `SELECT event_nid, room_id FROM syncv3_events WHERE event_nid IN (SELECT max(event_nid) FROM syncv3_events WHERE event_nid <= $1 AND room_id = ANY($2) GROUP BY room_id)`, diff --git a/state/event_table_test.go b/state/event_table_test.go index 76104a65..c015b2b2 100644 --- a/state/event_table_test.go +++ b/state/event_table_test.go @@ -951,7 +951,11 @@ func TestLatestEventNIDInRooms(t *testing.T) { }, } for _, tc := range testCases { - gotRoomToNID, err := table.LatestEventNIDInRooms(tc.roomIDs, int64(tc.highestNID)) + var gotRoomToNID map[string]int64 + err = sqlutil.WithTransaction(table.db, func(txn *sqlx.Tx) error { + gotRoomToNID, err = table.LatestEventNIDInRooms(txn, tc.roomIDs, int64(tc.highestNID)) + return err + }) assertNoError(t, err) want := make(map[string]int64) // map event IDs to nids for roomID, eventID := range tc.wantMap { diff --git a/state/storage.go b/state/storage.go index b51026cc..656b4f4b 100644 --- a/state/storage.go +++ b/state/storage.go @@ -838,6 +838,38 @@ func (s *Storage) AllJoinedMembers(txn *sqlx.Tx, tempTableName string) (result m return result, metadata, nil } +func (s *Storage) LatestEventNIDInRooms(roomIDs []string, highestNID int64) (roomToNID map[string]int64, err error) { + roomToNID = make(map[string]int64) + err = sqlutil.WithTransaction(s.Accumulator.db, func(txn *sqlx.Tx) error { + // Pull out the latest nids for all the rooms. If they are < highestNID then use them, else we need to query the + // events table (slow) for the latest nid in this room which is < highestNID. 
+ fastRoomToLatestNIDs, err := s.Accumulator.roomsTable.LatestNIDs(txn, roomIDs) + if err != nil { + return err + } + var slowRooms []string + for _, roomID := range roomIDs { + nid := fastRoomToLatestNIDs[roomID] + if nid > 0 && nid <= highestNID { + roomToNID[roomID] = nid + } else { + // we need to do a slow query for this + slowRooms = append(slowRooms, roomID) + } + } + + slowRoomToLatestNIDs, err := s.EventsTable.LatestEventNIDInRooms(txn, slowRooms, highestNID) + if err != nil { + return err + } + for roomID, nid := range slowRoomToLatestNIDs { + roomToNID[roomID] = nid + } + return nil + }) + return roomToNID, err +} + // Returns a map from joined room IDs to EventMetadata, which is nil iff a non-nil error // is returned. func (s *Storage) JoinedRoomsAfterPosition(userID string, pos int64) ( diff --git a/sync3/caches/global.go b/sync3/caches/global.go index 36c2867a..bd41f532 100644 --- a/sync3/caches/global.go +++ b/sync3/caches/global.go @@ -158,7 +158,7 @@ func (c *GlobalCache) LoadJoinedRooms(ctx context.Context, userID string) ( i++ } - latestNIDs, err = c.store.EventsTable.LatestEventNIDInRooms(roomIDs, initialLoadPosition) + latestNIDs, err = c.store.LatestEventNIDInRooms(roomIDs, initialLoadPosition) if err != nil { return 0, nil, nil, nil, err } From 2d773ab7764c336a20daf16f68431c4c2b3d2fe1 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Thu, 13 Jul 2023 18:39:53 +0100 Subject: [PATCH 064/156] Try to bail early --- state/storage.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/state/storage.go b/state/storage.go index 656b4f4b..4e9cf1dd 100644 --- a/state/storage.go +++ b/state/storage.go @@ -858,6 +858,10 @@ func (s *Storage) LatestEventNIDInRooms(roomIDs []string, highestNID int64) (roo } } + if len(slowRooms) == 0 { + return nil // no work to do + } + slowRoomToLatestNIDs, err := s.EventsTable.LatestEventNIDInRooms(txn, slowRooms, highestNID) if err != nil { return err From de1cf98df36e3f39a2bd3b2172d788e1210fbed5 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Fri, 14 Jul 2023 10:29:21 +0100 Subject: [PATCH 065/156] Log when we go slow --- state/storage.go | 1 + 1 file changed, 1 insertion(+) diff --git a/state/storage.go b/state/storage.go index 4e9cf1dd..1cae99ac 100644 --- a/state/storage.go +++ b/state/storage.go @@ -861,6 +861,7 @@ func (s *Storage) LatestEventNIDInRooms(roomIDs []string, highestNID int64) (roo if len(slowRooms) == 0 { return nil // no work to do } + logger.Warn().Int("slow_rooms", len(slowRooms)).Msg("LatestEventNIDInRooms: pos value provided is far behind the database copy, performance degraded") slowRoomToLatestNIDs, err := s.EventsTable.LatestEventNIDInRooms(txn, slowRooms, highestNID) if err != nil { From 8bed1037ac6a087950096ea069712047fbc63dcc Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Fri, 14 Jul 2023 10:32:53 +0100 Subject: [PATCH 066/156] bugfix: ensure metadata maps are always init'd If they aren't, we can panic on nil map access. Some of this code failed to `, ok :=` the map access which would return a zero-initialised metadata which has nil maps. --- state/storage.go | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/state/storage.go b/state/storage.go index b51026cc..7bdee80d 100644 --- a/state/storage.go +++ b/state/storage.go @@ -178,6 +178,14 @@ func (s *Storage) GlobalSnapshot() (ss StartupSnapshot, err error) { // Extract hero info for all rooms. Requires a prepared snapshot in order to be called. 
func (s *Storage) MetadataForAllRooms(txn *sqlx.Tx, tempTableName string, result map[string]internal.RoomMetadata) error { + loadMetadata := func(roomID string) internal.RoomMetadata { + metadata, ok := result[roomID] + if !ok { + metadata = *internal.NewRoomMetadata(roomID) + } + return metadata + } + // Select the invited member counts rows, err := txn.Query(` SELECT room_id, count(state_key) FROM syncv3_events INNER JOIN ` + tempTableName + ` ON membership_nid=event_nid @@ -192,10 +200,7 @@ func (s *Storage) MetadataForAllRooms(txn *sqlx.Tx, tempTableName string, result if err := rows.Scan(&roomID, &inviteCount); err != nil { return err } - metadata, ok := result[roomID] - if !ok { - metadata = *internal.NewRoomMetadata(roomID) - } + metadata := loadMetadata(roomID) metadata.InviteCount = inviteCount result[roomID] = metadata } @@ -206,10 +211,8 @@ func (s *Storage) MetadataForAllRooms(txn *sqlx.Tx, tempTableName string, result return err } for _, ev := range events { - metadata, ok := result[ev.RoomID] - if !ok { - metadata = *internal.NewRoomMetadata(ev.RoomID) - } + metadata := loadMetadata(ev.RoomID) + // For a given room, we'll see many events (one for each event type in the // room's state). We need to pick the largest of these events' timestamps here. ts := gjson.ParseBytes(ev.JSON).Get("origin_server_ts").Uint() @@ -238,7 +241,7 @@ func (s *Storage) MetadataForAllRooms(txn *sqlx.Tx, tempTableName string, result return fmt.Errorf("failed to load state events for all rooms: %s", err) } for roomID, stateEvents := range roomIDToStateEvents { - metadata := result[roomID] + metadata := loadMetadata(roomID) for _, ev := range stateEvents { if ev.Type == "m.room.name" && ev.StateKey == "" { metadata.NameEvent = gjson.ParseBytes(ev.JSON).Get("content.name").Str @@ -280,7 +283,7 @@ func (s *Storage) MetadataForAllRooms(txn *sqlx.Tx, tempTableName string, result continue } seen[key] = true - metadata := result[roomID] + metadata := loadMetadata(roomID) metadata.Heroes = append(metadata.Heroes, internal.Hero{ ID: targetUser, Name: ev.Get("content.displayname").Str, @@ -293,7 +296,7 @@ func (s *Storage) MetadataForAllRooms(txn *sqlx.Tx, tempTableName string, result } var spaceRoomIDs []string for _, info := range roomInfos { - metadata := result[info.ID] + metadata := loadMetadata(info.ID) metadata.Encrypted = info.IsEncrypted metadata.UpgradedRoomID = info.UpgradedRoomID metadata.PredecessorRoomID = info.PredecessorRoomID @@ -310,7 +313,7 @@ func (s *Storage) MetadataForAllRooms(txn *sqlx.Tx, tempTableName string, result return fmt.Errorf("failed to select space children: %s", err) } for roomID, relations := range spaceRoomToRelations { - metadata := result[roomID] + metadata := loadMetadata(roomID) metadata.ChildSpaceRooms = make(map[string]struct{}, len(relations)) for _, r := range relations { // For now we only honour child state events, but we store all the mappings just in case. 
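A minimal illustration of the failure mode this bugfix guards against (simplified
types rather than the real structs; the point is that Go's zero value for a struct
leaves its map fields nil, and writing to a nil map panics):

    package main

    type RoomMetadata struct {
        ChildSpaceRooms map[string]struct{}
    }

    func main() {
        result := map[string]RoomMetadata{}
        // Indexing without the `, ok :=` form silently yields a zero-valued
        // RoomMetadata whose ChildSpaceRooms map is nil...
        metadata := result["!missing:room"]
        // ...so this write panics with "assignment to entry in nil map".
        metadata.ChildSpaceRooms["!child:room"] = struct{}{}
    }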
From 2c5df875af102822eb9be4dea1dc3dc25b63cd08 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Fri, 14 Jul 2023 10:37:25 +0100 Subject: [PATCH 067/156] Skip spaces which are no longer valid --- state/storage.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/state/storage.go b/state/storage.go index 7bdee80d..de9b0040 100644 --- a/state/storage.go +++ b/state/storage.go @@ -313,6 +313,12 @@ func (s *Storage) MetadataForAllRooms(txn *sqlx.Tx, tempTableName string, result return fmt.Errorf("failed to select space children: %s", err) } for roomID, relations := range spaceRoomToRelations { + if _, exists := result[roomID]; !exists { + // this can happen when you join a space (so it populates the spaces table) then leave the space, + // so there are no joined members in the space so result doesn't include the room. In this case, + // we don't want to have a stub metadata with just the space children, so skip it. + continue + } metadata := loadMetadata(roomID) metadata.ChildSpaceRooms = make(map[string]struct{}, len(relations)) for _, r := range relations { From 1e4b6374a0b559ca192c4e9be3ecd32dd62f16f4 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Fri, 14 Jul 2023 10:45:17 +0100 Subject: [PATCH 068/156] Use a pinned EW version for e2e tests to unbreak them --- .github/workflows/tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f71ec252..1c630ea2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -174,6 +174,7 @@ jobs: - uses: actions/checkout@v3 with: repository: matrix-org/matrix-react-sdk + ref: "v3.71.0" # later versions break the SS E2E tests which need to be fixed :( - uses: actions/setup-node@v3 with: cache: 'yarn' From e17f6c5fcbe22fa79386a6c2cc843160bba96f31 Mon Sep 17 00:00:00 2001 From: kegsay Date: Fri, 14 Jul 2023 11:39:08 +0100 Subject: [PATCH 069/156] Update v3.go --- v3.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v3.go b/v3.go index 581eeaff..74ae2be2 100644 --- a/v3.go +++ b/v3.go @@ -73,7 +73,7 @@ func Setup(destHomeserver, postgresURI, secret string, opts Opts) (*handler2.Han // Setup shared DB and HTTP client v2Client := &sync2.HTTPClient{ Client: &http.Client{ - Timeout: 45 * time.Second, + Timeout: 5 * time.Minute, }, DestinationServer: destHomeserver, } From fc04171c7cadaa4dc9e21bae568804b2716090ec Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 17 Jul 2023 10:48:21 +0100 Subject: [PATCH 070/156] Combine invite/join calcs into 1 query for speed --- state/storage.go | 56 +++++++++++++++++++------------------------ state/storage_test.go | 17 +++++++++---- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/state/storage.go b/state/storage.go index 3d1a0bdb..f6637a56 100644 --- a/state/storage.go +++ b/state/storage.go @@ -186,25 +186,6 @@ func (s *Storage) MetadataForAllRooms(txn *sqlx.Tx, tempTableName string, result return metadata } - // Select the invited member counts - rows, err := txn.Query(` - SELECT room_id, count(state_key) FROM syncv3_events INNER JOIN ` + tempTableName + ` ON membership_nid=event_nid - WHERE (membership='_invite' OR membership = 'invite') AND event_type='m.room.member' GROUP BY room_id`) - if err != nil { - return err - } - defer rows.Close() - for rows.Next() { - var roomID string - var inviteCount int - if err := rows.Scan(&roomID, &inviteCount); err != nil { - return err - } - metadata := loadMetadata(roomID) - metadata.InviteCount = inviteCount - result[roomID] = metadata - } - // work 
out latest timestamps events, err := s.Accumulator.eventsTable.selectLatestEventByTypeInAllRooms(txn) if err != nil { @@ -256,7 +237,7 @@ func (s *Storage) MetadataForAllRooms(txn *sqlx.Tx, tempTableName string, result // "This should be the first 5 members of the room, ordered by stream ordering, which are joined or invited." // Unclear if this is the first 5 *most recent* (backwards) or forwards. For now we'll use the most recent // ones, and select 6 of them so we can always use 5 no matter who is requesting the room name. - rows, err = txn.Query(` + rows, err := txn.Query(` SELECT rf.* FROM ( SELECT room_id, event, rank() OVER ( PARTITION BY room_id ORDER BY event_nid DESC @@ -819,32 +800,45 @@ func (s *Storage) RoomMembershipDelta(roomID string, from, to int64, limit int) } // Extract all rooms with joined members, and include the joined user list. Requires a prepared snapshot in order to be called. -func (s *Storage) AllJoinedMembers(txn *sqlx.Tx, tempTableName string) (result map[string][]string, metadata map[string]internal.RoomMetadata, err error) { +func (s *Storage) AllJoinedMembers(txn *sqlx.Tx, tempTableName string) (joinedMembers map[string][]string, metadata map[string]internal.RoomMetadata, err error) { rows, err := txn.Query( - `SELECT room_id, state_key from ` + tempTableName + ` INNER JOIN syncv3_events on membership_nid = event_nid WHERE membership='join' OR membership='_join' ORDER BY event_nid ASC`, + `SELECT room_id, state_key, membership from ` + tempTableName + ` INNER JOIN syncv3_events + on membership_nid = event_nid WHERE membership='join' OR membership='_join' OR membership='invite' OR membership='_invite' ORDER BY event_nid ASC`, ) if err != nil { return nil, nil, err } defer rows.Close() - result = make(map[string][]string) + joinedMembers = make(map[string][]string) var roomID string - var joinedUserID string + inviteCounts := make(map[string]int) + var stateKey string + var membership string for rows.Next() { - if err := rows.Scan(&roomID, &joinedUserID); err != nil { + if err := rows.Scan(&roomID, &stateKey, &membership); err != nil { return nil, nil, err } - users := result[roomID] - users = append(users, joinedUserID) - result[roomID] = users + switch membership { + case "join": + fallthrough + case "_join": + users := joinedMembers[roomID] + users = append(users, stateKey) + joinedMembers[roomID] = users + case "invite": + fallthrough + case "_invite": + inviteCounts[roomID] = inviteCounts[roomID] + 1 + } } metadata = make(map[string]internal.RoomMetadata) - for roomID, joinedMembers := range result { + for roomID, members := range joinedMembers { m := internal.NewRoomMetadata(roomID) - m.JoinCount = len(joinedMembers) + m.JoinCount = len(members) + m.InviteCount = inviteCounts[roomID] metadata[roomID] = *m } - return result, metadata, nil + return joinedMembers, metadata, nil } func (s *Storage) LatestEventNIDInRooms(roomIDs []string, highestNID int64) (roomToNID map[string]int64, err error) { diff --git a/state/storage_test.go b/state/storage_test.go index e4b053b5..a80e2e17 100644 --- a/state/storage_test.go +++ b/state/storage_test.go @@ -210,18 +210,19 @@ func TestStorageJoinedRoomsAfterPosition(t *testing.T) { } } - newMetadata := func(roomID string, joinCount int) internal.RoomMetadata { + newMetadata := func(roomID string, joinCount, inviteCount int) internal.RoomMetadata { m := internal.NewRoomMetadata(roomID) m.JoinCount = joinCount + m.InviteCount = inviteCount return *m } // also test MetadataForAllRooms roomIDToMetadata := 
map[string]internal.RoomMetadata{ - joinedRoomID: newMetadata(joinedRoomID, 1), - invitedRoomID: newMetadata(invitedRoomID, 1), - banRoomID: newMetadata(banRoomID, 1), - bobJoinedRoomID: newMetadata(bobJoinedRoomID, 2), + joinedRoomID: newMetadata(joinedRoomID, 1, 0), + invitedRoomID: newMetadata(invitedRoomID, 1, 1), + banRoomID: newMetadata(banRoomID, 1, 0), + bobJoinedRoomID: newMetadata(bobJoinedRoomID, 2, 0), } tempTableName, err := store.PrepareSnapshot(txn) @@ -687,6 +688,12 @@ func TestGlobalSnapshot(t *testing.T) { } } +/* +func TestAllJoinedMembers(t *testing.T) { + assertNoError(t, cleanDB(t)) + roomIDToEventMap := map[string][]json.RawMessage{} +} */ + func cleanDB(t *testing.T) error { // make a fresh DB which is unpolluted from other tests db, close := connectToDB(t) From 1dc228052fadfc25f79a9b842276cac6205e561f Mon Sep 17 00:00:00 2001 From: Till Faelligen <2353100+S7evinK@users.noreply.github.com> Date: Mon, 17 Jul 2023 12:24:10 +0200 Subject: [PATCH 071/156] Don't try to get data for empty room lists --- sync3/handler/connstate.go | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/sync3/handler/connstate.go b/sync3/handler/connstate.go index 9f3f79ab..c52939c6 100644 --- a/sync3/handler/connstate.go +++ b/sync3/handler/connstate.go @@ -481,13 +481,22 @@ func (s *ConnState) buildRooms(ctx context.Context, builtSubs []BuiltSubscriptio } } } - // old rooms use a different subscription - oldRooms := s.getInitialRoomData(ctx, *bs.RoomSubscription.IncludeOldRooms, oldRoomIDs...) - for oldRoomID, oldRoom := range oldRooms { - result[oldRoomID] = oldRoom + + // If we have old rooms to fetch, do so. + if len(oldRoomIDs) > 0 { + // old rooms use a different subscription + oldRooms := s.getInitialRoomData(ctx, *bs.RoomSubscription.IncludeOldRooms, oldRoomIDs...) + for oldRoomID, oldRoom := range oldRooms { + result[oldRoomID] = oldRoom + } } } + // There won't be anything to fetch, try the next subscription. + if len(roomIDs) == 0 { + continue + } + rooms := s.getInitialRoomData(ctx, bs.RoomSubscription, roomIDs...) 
for roomID, room := range rooms { result[roomID] = room From fbd865abbae38b269a014f49032e327062383458 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 17 Jul 2023 15:55:10 +0100 Subject: [PATCH 072/156] wip tests --- state/storage_test.go | 100 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 97 insertions(+), 3 deletions(-) diff --git a/state/storage_test.go b/state/storage_test.go index a80e2e17..b7e8d57c 100644 --- a/state/storage_test.go +++ b/state/storage_test.go @@ -688,11 +688,105 @@ func TestGlobalSnapshot(t *testing.T) { } } -/* func TestAllJoinedMembers(t *testing.T) { assertNoError(t, cleanDB(t)) - roomIDToEventMap := map[string][]json.RawMessage{} -} */ + store := NewStorage(postgresConnectionString) + defer store.Teardown() + + alice := "@alice:localhost" + bob := "@bob:localhost" + charlie := "@charlie:localhost" + doris := "@doris:localhost" + eve := "@eve:localhost" + frank := "@frank:localhost" + + testCases := []struct { + Name string + InitMemberships [][2]string + AccumulateMemberships [][2]string + WantJoined []string + WantInvited []string + }{ + { + Name: "basic joined users", + InitMemberships: [][2]string{{alice, "join"}}, + AccumulateMemberships: [][2]string{{bob, "join"}}, + WantJoined: []string{alice, bob}, + }, + { + Name: "basic invited users", + InitMemberships: [][2]string{{alice, "join"}}, + AccumulateMemberships: [][2]string{{bob, "invite"}}, + WantJoined: []string{alice}, + WantInvited: []string{bob}, + }, + { + Name: "many join/leaves, use latest", + InitMemberships: [][2]string{{alice, "join"}, {charlie, "join"}, {frank, "join"}}, + AccumulateMemberships: [][2]string{{bob, "join"}, {charlie, "leave"}, {frank, "leave"}, {charlie, "join"}, {eve, "join"}}, + WantJoined: []string{alice, bob, charlie, eve}, + }, + { + Name: "many invites, use latest", + InitMemberships: [][2]string{{alice, "join"}, {doris, "join"}}, + AccumulateMemberships: [][2]string{{doris, "leave"}, {charlie, "invite"}, {doris, "invite"}}, + WantJoined: []string{alice}, + WantInvited: []string{charlie, doris}, + }, + { + Name: "invite and rejection in accumulate", + InitMemberships: [][2]string{{alice, "join"}}, + AccumulateMemberships: [][2]string{{frank, "invite"}, {frank, "leave"}}, + WantJoined: []string{alice}, + }, + { + Name: "invite in initial, rejection in accumulate", + InitMemberships: [][2]string{{alice, "join"}, {frank, "invite"}}, + AccumulateMemberships: [][2]string{{frank, "leave"}}, + WantJoined: []string{alice}, + }, + } + + initialStates := map[string][]json.RawMessage{ + roomJoined: append(createRoomState(t, alice)), + } + + for roomID, init := range initialStates { + _, err := store.Initialise(roomID, init) + assertNoError(t, err) + } + + // should get all joined members correctly + var joinedMembers map[string][]string + // should set join/invite counts correctly + var roomMetadatas map[string]internal.RoomMetadata + err := sqlutil.WithTransaction(store.DB, func(txn *sqlx.Tx) error { + tableName, err := store.PrepareSnapshot(txn) + if err != nil { + return err + } + joinedMembers, roomMetadatas, err = store.AllJoinedMembers(txn, tableName) + return err + }) + assertNoError(t, err) + +} + +func newMembershipEvent(t *testing.T, sender, target, membership string) json.RawMessage { + return testutils.NewStateEvent(t, "m.room.member", target, sender, map[string]interface{}{ + "membership": membership, + }) +} + +func createRoomState(t *testing.T, sender string) []json.RawMessage { + return []json.RawMessage{ + testutils.NewStateEvent(t, "m.room.create", 
"", sender, map[string]interface{}{ + "creator": sender, + "room_version": "10", + }), + newMembershipEvent(t, sender, sender, "join"), + } +} func cleanDB(t *testing.T) error { // make a fresh DB which is unpolluted from other tests From 9ebe7634ec54c61ed6fcacb432e22ee4d5fbcd30 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 17 Jul 2023 16:25:28 +0100 Subject: [PATCH 073/156] Implement table tests --- state/storage_test.go | 63 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 6 deletions(-) diff --git a/state/storage_test.go b/state/storage_test.go index b7e8d57c..471dfe9d 100644 --- a/state/storage_test.go +++ b/state/storage_test.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "encoding/json" + "fmt" "reflect" "sort" "testing" @@ -700,10 +701,12 @@ func TestAllJoinedMembers(t *testing.T) { eve := "@eve:localhost" frank := "@frank:localhost" + // Alice is always the creator and the inviter for simplicity's sake testCases := []struct { Name string InitMemberships [][2]string AccumulateMemberships [][2]string + RoomID string // tests set this dynamically WantJoined []string WantInvited []string }{ @@ -715,10 +718,10 @@ func TestAllJoinedMembers(t *testing.T) { }, { Name: "basic invited users", - InitMemberships: [][2]string{{alice, "join"}}, + InitMemberships: [][2]string{{alice, "join"}, {charlie, "invite"}}, AccumulateMemberships: [][2]string{{bob, "invite"}}, WantJoined: []string{alice}, - WantInvited: []string{bob}, + WantInvited: []string{bob, charlie}, }, { Name: "many join/leaves, use latest", @@ -747,13 +750,35 @@ func TestAllJoinedMembers(t *testing.T) { }, } - initialStates := map[string][]json.RawMessage{ - roomJoined: append(createRoomState(t, alice)), + serialise := func(memberships [][2]string) []json.RawMessage { + var result []json.RawMessage + for _, userWithMembership := range memberships { + target := userWithMembership[0] + sender := userWithMembership[0] + membership := userWithMembership[1] + if membership == "invite" { + // Alice is always the inviter + sender = alice + } + result = append(result, testutils.NewStateEvent(t, "m.room.member", target, sender, map[string]interface{}{ + "membership": membership, + })) + } + return result } - for roomID, init := range initialStates { - _, err := store.Initialise(roomID, init) + for i, tc := range testCases { + roomID := fmt.Sprintf("!TestAllJoinedMembers_%d:localhost", i) + _, err := store.Initialise(roomID, append([]json.RawMessage{ + testutils.NewStateEvent(t, "m.room.create", "", alice, map[string]interface{}{ + "creator": alice, // alice is always the creator + }), + }, serialise(tc.InitMemberships)...)) assertNoError(t, err) + + _, _, err = store.Accumulate(roomID, "foo", serialise(tc.AccumulateMemberships)) + assertNoError(t, err) + testCases[i].RoomID = roomID // remember this for later } // should get all joined members correctly @@ -770,6 +795,32 @@ func TestAllJoinedMembers(t *testing.T) { }) assertNoError(t, err) + for _, tc := range testCases { + roomID := tc.RoomID + if roomID == "" { + t.Fatalf("test case has no room id set: %+v", tc) + } + // make sure joined members match + sort.Strings(joinedMembers[roomID]) + sort.Strings(tc.WantJoined) + if !reflect.DeepEqual(joinedMembers[roomID], tc.WantJoined) { + t.Errorf("%v: got joined members %v want %v", tc.Name, joinedMembers[roomID], tc.WantJoined) + } + // make sure join/invite counts match + wantJoined := len(tc.WantJoined) + wantInvited := len(tc.WantInvited) + metadata, ok := roomMetadatas[roomID] + if !ok { + t.Fatalf("no 
room metadata for room %v", roomID) + } + if metadata.InviteCount != wantInvited { + t.Errorf("%v: got invite count %d want %d", tc.Name, metadata.InviteCount, wantInvited) + } + if metadata.JoinCount != wantJoined { + t.Errorf("%v: got join count %d want %d", tc.Name, metadata.JoinCount, wantJoined) + } + } + } func newMembershipEvent(t *testing.T, sender, target, membership string) json.RawMessage { From 1895080e84b32e8225f94b925ced7a09be038f5c Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 17 Jul 2023 17:47:37 +0100 Subject: [PATCH 074/156] Remove unused functions --- state/storage_test.go | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/state/storage_test.go b/state/storage_test.go index 471dfe9d..c5eaa192 100644 --- a/state/storage_test.go +++ b/state/storage_test.go @@ -820,23 +820,6 @@ func TestAllJoinedMembers(t *testing.T) { t.Errorf("%v: got join count %d want %d", tc.Name, metadata.JoinCount, wantJoined) } } - -} - -func newMembershipEvent(t *testing.T, sender, target, membership string) json.RawMessage { - return testutils.NewStateEvent(t, "m.room.member", target, sender, map[string]interface{}{ - "membership": membership, - }) -} - -func createRoomState(t *testing.T, sender string) []json.RawMessage { - return []json.RawMessage{ - testutils.NewStateEvent(t, "m.room.create", "", sender, map[string]interface{}{ - "creator": sender, - "room_version": "10", - }), - newMembershipEvent(t, sender, sender, "join"), - } } func cleanDB(t *testing.T) error { From 790ae22726faae450ae0f3f44ae27c132582fcef Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 18 Jul 2023 10:36:20 +0100 Subject: [PATCH 075/156] MatchRoomSubscription: include room ID in err msg --- testutils/m/match.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/testutils/m/match.go b/testutils/m/match.go index a7b2f635..79021028 100644 --- a/testutils/m/match.go +++ b/testutils/m/match.go @@ -222,11 +222,11 @@ func MatchRoomSubscription(roomID string, matchers ...RoomMatcher) RespMatcher { return func(res *sync3.Response) error { room, ok := res.Rooms[roomID] if !ok { - return fmt.Errorf("MatchRoomSubscription: want sub for %s but it was missing", roomID) + return fmt.Errorf("MatchRoomSubscription[%s]: want sub but it was missing", roomID) } for _, m := range matchers { if err := m(room); err != nil { - return fmt.Errorf("MatchRoomSubscription: %s", err) + return fmt.Errorf("MatchRoomSubscription[%s]: %s", roomID, err) } } return nil From 4eb9bcc27f890a4d860564aa8773c9e7e8f8f94a Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 18 Jul 2023 10:38:35 +0100 Subject: [PATCH 076/156] Improve MatchResponse errmsg formatting - Print it out in red so you can spot it easily in a long test log - Pretty print the response json, because life's too short to train your brain to be a JSON parser. 
---
 testutils/m/match.go | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/testutils/m/match.go b/testutils/m/match.go
index 79021028..1b5d952d 100644
--- a/testutils/m/match.go
+++ b/testutils/m/match.go
@@ -686,13 +686,16 @@ func MatchLists(matchers map[string][]ListMatcher) RespMatcher {
 	}
 }
 
+const AnsiRedForeground = "\x1b[31m"
+const AnsiResetForeground = "\x1b[39m"
+
 func MatchResponse(t *testing.T, res *sync3.Response, matchers ...RespMatcher) {
 	t.Helper()
 	for _, m := range matchers {
 		err := m(res)
 		if err != nil {
-			b, _ := json.Marshal(res)
-			t.Errorf("MatchResponse: %s\n%+v", err, string(b))
+			b, _ := json.MarshalIndent(res, "", "    ")
+			t.Errorf("%vMatchResponse: %s\n%s%v", AnsiRedForeground, err, string(b), AnsiResetForeground)
 		}
 	}
 }

From 64183a3b99bca80b8dbca27e0d433f86c6639b22 Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Tue, 18 Jul 2023 10:39:34 +0100
Subject: [PATCH 077/156] Add helper Matcher for logging the rooms section

Useful for debugging a test.
---
 testutils/m/match.go | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/testutils/m/match.go b/testutils/m/match.go
index 1b5d952d..eb852f53 100644
--- a/testutils/m/match.go
+++ b/testutils/m/match.go
@@ -644,6 +644,15 @@ func LogResponse(t *testing.T) RespMatcher {
 	}
 }
 
+// LogRooms is like LogResponse, but only logs the rooms section of the response.
+func LogRooms(t *testing.T) RespMatcher {
+	return func(res *sync3.Response) error {
+		dump, _ := json.MarshalIndent(res.Rooms, "", "    ")
+		t.Logf("Response rooms were: %s", dump)
+		return nil
+	}
+}
+
 func CheckList(listKey string, res sync3.ResponseList, matchers ...ListMatcher) error {
 	for _, m := range matchers {
 		if err := m(res); err != nil {

From 59a655461506fd53ed16e3cef522f5fd5f691307 Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Tue, 18 Jul 2023 10:42:13 +0100
Subject: [PATCH 078/156] Define `avatar` field on sync3.Room

---
 sync3/avatar.go      | 42 +++++++++++++++++++++++++
 sync3/room.go        |  1 +
 sync3/room_test.go   | 74 ++++++++++++++++++++++++++++++++++++++++++++
 testutils/m/match.go | 33 ++++++++++++++++++++
 4 files changed, 150 insertions(+)
 create mode 100644 sync3/avatar.go
 create mode 100644 sync3/room_test.go

diff --git a/sync3/avatar.go b/sync3/avatar.go
new file mode 100644
index 00000000..63e494b8
--- /dev/null
+++ b/sync3/avatar.go
@@ -0,0 +1,42 @@
+package sync3
+
+import (
+	"bytes"
+	"encoding/json"
+)
+
+// An AvatarChange represents a change to a room's avatar. There are three cases:
+// - an empty string represents no change, and should be omitted when JSON-serialised;
+// - the sentinel `<no-avatar>` represents a room that has never had an avatar,
+//   or a room whose avatar has been removed. It is JSON-serialised as null.
+// - All other strings represent the current avatar of the room and JSON-serialise as
+//   normal.
+type AvatarChange string

+const DeletedAvatar = AvatarChange("<no-avatar>")
+const UnchangedAvatar AvatarChange = ""
+
+// NewAvatarChange interprets an optional avatar string as an AvatarChange.
+func NewAvatarChange(avatar string) AvatarChange {
+	if avatar == "" {
+		return DeletedAvatar
+	}
+	return AvatarChange(avatar)
+}
+
+func (a AvatarChange) MarshalJSON() ([]byte, error) {
+	if a == DeletedAvatar {
+		return []byte(`null`), nil
+	} else {
+		return json.Marshal(string(a))
+	}
+}
+
+// Note: the unmarshalling is only used in tests.
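+//
+// As a rough sketch, the mapping it has to invert is (cf. MarshalJSON above):
+//   "mxc://server/abc"  <-> AvatarChange("mxc://server/abc")
+//   null                <-> DeletedAvatar
+//   absent "avatar" key  -> UnchangedAvatar, in which case UnmarshalJSON is never called.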
+func (a *AvatarChange) UnmarshalJSON(data []byte) error { + if bytes.Equal(data, []byte("null")) { + *a = DeletedAvatar + return nil + } + return json.Unmarshal(data, (*string)(a)) +} diff --git a/sync3/room.go b/sync3/room.go index 6bf7c35d..3e400982 100644 --- a/sync3/room.go +++ b/sync3/room.go @@ -10,6 +10,7 @@ import ( type Room struct { Name string `json:"name,omitempty"` + AvatarChange AvatarChange `json:"avatar,omitempty"` RequiredState []json.RawMessage `json:"required_state,omitempty"` Timeline []json.RawMessage `json:"timeline,omitempty"` InviteState []json.RawMessage `json:"invite_state,omitempty"` diff --git a/sync3/room_test.go b/sync3/room_test.go new file mode 100644 index 00000000..09ec058a --- /dev/null +++ b/sync3/room_test.go @@ -0,0 +1,74 @@ +package sync3 + +import ( + "encoding/json" + "fmt" + "github.com/tidwall/gjson" + "reflect" + "testing" +) + +func TestAvatarChangeMarshalling(t *testing.T) { + var url = "mxc://..." + testCases := []struct { + Name string + AvatarChange AvatarChange + Check func(avatar gjson.Result) error + }{ + { + Name: "Avatar exists", + AvatarChange: NewAvatarChange(url), + Check: func(avatar gjson.Result) error { + if !(avatar.Exists() && avatar.Type == gjson.String && avatar.Str == url) { + return fmt.Errorf("unexpected marshalled avatar: got %#v want %s", avatar, url) + } + return nil + }, + }, + { + Name: "Avatar doesn't exist", + AvatarChange: DeletedAvatar, + Check: func(avatar gjson.Result) error { + if !(avatar.Exists() && avatar.Type == gjson.Null) { + return fmt.Errorf("unexpected marshalled Avatar: got %#v want null", avatar) + } + return nil + }, + }, + { + Name: "Avatar unchanged", + AvatarChange: UnchangedAvatar, + Check: func(avatar gjson.Result) error { + if avatar.Exists() { + return fmt.Errorf("unexpected marshalled Avatar: got %#v want omitted", avatar) + } + return nil + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.Name, func(t *testing.T) { + room := Room{AvatarChange: tc.AvatarChange} + marshalled, err := json.Marshal(room) + t.Logf("Marshalled to %s", string(marshalled)) + if err != nil { + t.Fatal(err) + } + avatar := gjson.GetBytes(marshalled, "avatar") + if err = tc.Check(avatar); err != nil { + t.Fatal(err) + } + + var unmarshalled Room + err = json.Unmarshal(marshalled, &unmarshalled) + if err != nil { + t.Fatal(err) + } + t.Logf("Unmarshalled to %#v", unmarshalled.AvatarChange) + if !reflect.DeepEqual(unmarshalled, room) { + t.Fatalf("Unmarshalled struct is different from original") + } + }) + } +} diff --git a/testutils/m/match.go b/testutils/m/match.go index eb852f53..f8f6c6a3 100644 --- a/testutils/m/match.go +++ b/testutils/m/match.go @@ -39,6 +39,39 @@ func MatchRoomName(name string) RoomMatcher { } } +// MatchRoomAvatar builds a RoomMatcher which checks that the given room response has +// set the room's avatar to the given value. +func MatchRoomAvatar(wantAvatar string) RoomMatcher { + return func(r sync3.Room) error { + if string(r.AvatarChange) != wantAvatar { + return fmt.Errorf("MatchRoomAvatar: got \"%s\" want \"%s\"", r.AvatarChange, wantAvatar) + } + return nil + } +} + +// MatchRoomUnsetAvatar builds a RoomMatcher which checks that the given room has no +// avatar, or has had its avatar deleted. 
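+// Usage sketch, e.g. combined with MatchRoomSubscription:
+//
+//	m.MatchResponse(t, res, m.MatchRoomSubscription(roomID, m.MatchRoomUnsetAvatar()))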
+func MatchRoomUnsetAvatar() RoomMatcher {
+	return func(r sync3.Room) error {
+		if r.AvatarChange != sync3.DeletedAvatar {
+			return fmt.Errorf("MatchRoomUnsetAvatar: got \"%s\" want \"%s\"", r.AvatarChange, sync3.DeletedAvatar)
+		}
+		return nil
+	}
+}
+
+// MatchRoomUnchangedAvatar builds a RoomMatcher which checks that the given room
+// reports no change to its avatar.
+func MatchRoomUnchangedAvatar() RoomMatcher {
+	return func(r sync3.Room) error {
+		if r.AvatarChange != sync3.UnchangedAvatar {
+			return fmt.Errorf("MatchRoomUnchangedAvatar: got \"%s\" want \"%s\"", r.AvatarChange, sync3.UnchangedAvatar)
+		}
+		return nil
+	}
+}
+
 func MatchJoinCount(count int) RoomMatcher {
 	return func(r sync3.Room) error {
 		if r.JoinedCount != count {

From 5ac3430486271f65ec94acd683c0bec37be62fd5 Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Tue, 18 Jul 2023 10:44:01 +0100
Subject: [PATCH 079/156] Test helper for setting an avatar

Expose this on CSAPI for tests to use too.

Also update SlidingSyncUntilMembership to check the membership only.
Otherwise it would try to match an avatar. In 99% of cases this would be
fine, but things like per-room avatars (or indeed per-room displaynames)
would screw this up.
---
 tests-e2e/client_test.go | 37 ++++++++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/tests-e2e/client_test.go b/tests-e2e/client_test.go
index 3ef9c817..a6982437 100644
--- a/tests-e2e/client_test.go
+++ b/tests-e2e/client_test.go
@@ -134,6 +134,7 @@ type CSAPI struct {
 	Localpart   string
 	AccessToken string
 	DeviceID    string
+	AvatarURL   string
 	BaseURL     string
 	Client      *http.Client
 	// how long are we willing to wait for MustSyncUntil.... calls
@@ -159,6 +160,16 @@ func (c *CSAPI) UploadContent(t *testing.T, fileBody []byte, fileName string, co
 	return GetJSONFieldStr(t, body, "content_uri")
 }
 
+// SetAvatar sets the user's avatar; use an empty string to remove it.
+func (c *CSAPI) SetAvatar(t *testing.T, avatarURL string) {
+	t.Helper()
+	reqBody := map[string]interface{}{
+		"avatar_url": avatarURL,
+	}
+	c.MustDoFunc(t, "PUT", []string{"_matrix", "client", "v3", "profile", c.UserID, "avatar_url"}, WithJSONBody(t, reqBody))
+	c.AvatarURL = avatarURL
+}
+
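+// A typical use from a test, as a rough sketch (someBytes being any valid PNG):
+//   mxcURI := alice.UploadContent(t, someBytes, "alice.png", "image/png")
+//   alice.SetAvatar(t, mxcURI)
+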
 // DownloadContent downloads media from the server, returning the raw bytes and the Content-Type. Fails the test on error.
 func (c *CSAPI) DownloadContent(t *testing.T, mxcUri string) ([]byte, string) {
 	t.Helper()
@@ -678,16 +689,32 @@ func (c *CSAPI) SlidingSyncUntilMembership(t *testing.T, pos string, roomID stri
 		})
 	}
 
-	return c.SlidingSyncUntilEvent(t, pos, sync3.Request{
+	return c.SlidingSyncUntil(t, pos, sync3.Request{
 		RoomSubscriptions: map[string]sync3.RoomSubscription{
 			roomID: {
 				TimelineLimit: 10,
 			},
 		},
-	}, roomID, Event{
-		Type:     "m.room.member",
-		StateKey: &target.UserID,
-		Content:  content,
+	}, func(r *sync3.Response) error {
+		room, ok := r.Rooms[roomID]
+		if !ok {
+			return fmt.Errorf("missing room %s", roomID)
+		}
+		for _, got := range room.Timeline {
+			wantEvent := Event{
+				Type:     "m.room.member",
+				StateKey: &target.UserID,
+			}
+			if err := eventsEqual([]Event{wantEvent}, []json.RawMessage{got}); err == nil {
+				gotMembership := gjson.GetBytes(got, "content.membership")
+				if gotMembership.Exists() && gotMembership.Type == gjson.String && gotMembership.Str == membership {
+					return nil
+				}
+			} else {
+				t.Log(err)
+			}
+		}
+		return fmt.Errorf("found room %s but missing event", roomID)
 	})
 }

From bccfa874537c59df112fd31066e39f6c8c3d997a Mon Sep 17 00:00:00 2001
From: Till Faelligen <2353100+S7evinK@users.noreply.github.com>
Date: Tue, 18 Jul 2023 13:39:18 +0200
Subject: [PATCH 080/156] Persist the since token only if the last time was
 over 1 minute ago OR there are toDevice events in the response

---
 sync2/poller.go | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/sync2/poller.go b/sync2/poller.go
index f6c99a18..dbd9a05a 100644
--- a/sync2/poller.go
+++ b/sync2/poller.go
@@ -427,9 +427,10 @@ func (p *poller) Terminate() {
 }
 
 type pollLoopState struct {
-	firstTime bool
-	failCount int
-	since     string
+	firstTime       bool
+	failCount       int
+	since           string
+	lastStoredSince time.Time // The time we last stored the since token in the database
 }
 
 // Poll will block forever, repeatedly calling v2 sync. Do this in a goroutine.
@@ -463,6 +464,8 @@ func (p *poller) Poll(since string) {
 		firstTime: true,
 		failCount: 0,
 		since:     since,
+		// Setting time.Time{} makes the first iteration of the poll loop store the since token immediately.
+		lastStoredSince: time.Time{},
 	}
 	for !p.terminated.Load() {
 		ctx, task := internal.StartTask(ctx, "Poll")
@@ -544,8 +547,12 @@ func (p *poller) poll(ctx context.Context, s *pollLoopState) error {
 	wasFirst := s.firstTime
 	s.since = resp.NextBatch
-	// persist the since token (TODO: this could get slow if we hammer the DB too much)
-	p.receiver.UpdateDeviceSince(ctx, p.userID, p.deviceID, s.since)
+	// Persist the since token if more than one minute has passed since we last
+	// stored it, OR if the response contains to-device messages
+	if time.Since(s.lastStoredSince) > time.Minute || len(resp.ToDevice.Events) > 0 {
+		p.receiver.UpdateDeviceSince(ctx, p.userID, p.deviceID, s.since)
+		s.lastStoredSince = time.Now()
+	}
 
 	if s.firstTime {
 		s.firstTime = false

From 3861bac4c6fd1ab90539d2eb5cf6d45c9f994ea1 Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Tue, 18 Jul 2023 10:54:31 +0100
Subject: [PATCH 081/156] Don't create two Heroes slices to the same array

Otherwise you end up with two slices that share ownership of the same
data but don't coordinate between one another, see e.g.
https://go.dev/play/p/uk3o652Tvye
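A tiny self-contained illustration of the pitfall (an assumed example,
not code from this repo):

	heroes := []Hero{{ID: "@a:x"}, {ID: "@b:x"}}
	alias := heroes            // alias shares heroes' backing array
	alias[0].ID = "@mutated:x" // ...so this write is visible via heroes too
	fmt.Println(heroes[0].ID)  // prints "@mutated:x"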
---
 internal/roomname.go            | 33 +++++++++++++++++++++++++-
 internal/roomname_test.go       | 42 +++++++++++++++++++++++++++++
 sync3/caches/global.go          |  8 +------
 sync3/caches/user.go            |  5 +++-
 sync3/handler/connstate_live.go |  4 +++-
 5 files changed, 82 insertions(+), 10 deletions(-)

diff --git a/internal/roomname.go b/internal/roomname.go
index 73eb0205..dd7bfaae 100644
--- a/internal/roomname.go
+++ b/internal/roomname.go
@@ -13,7 +13,12 @@ type EventMetadata struct {
 	Timestamp uint64
 }
 
-// RoomMetadata holds room-scoped data. It is primarily used in two places:
+// RoomMetadata holds room-scoped data.
+// TODO: This is a lie: we sometimes remove a user U from the list of heroes
+// when calculating the sync response for that user U. Grep for `RemoveHero`.
+//
+// It is primarily used in two places:
+//
 // - in the caches.GlobalCache, to hold the latest version of data that is consistent
 //   between all users in the room; and
 // - in the sync3.RoomConnMetadata struct, to hold the version of data last seen by
@@ -54,6 +59,32 @@ func NewRoomMetadata(roomID string) *RoomMetadata {
 	}
 }
 
+// CopyHeroes returns a version of the current RoomMetadata whose Heroes field is
+// a brand-new copy of the original Heroes. The return value's Heroes field can be
+// safely modified by the caller, but it is NOT safe for the caller to modify any other
+// fields.
+func (m *RoomMetadata) CopyHeroes() *RoomMetadata {
+	newMetadata := *m
+
+	// XXX: We're doing this because we end up calling RemoveHero() to omit the
+	// currently-syncing user in various places. But this seems smelly. The set of
+	// heroes in the room is a global, room-scoped fact: it is a property of the room
+	// state and nothing else, and all users see the same set of heroes.
+	//
+	// I think the data model would be cleaner if we made the hero-reading functions
+	// aware of the currently syncing user, in order to ignore them without having to
+	// change the underlying data.
+	//
+	// copy the heroes or else we may modify the same slice which would be bad :(
+	newMetadata.Heroes = make([]Hero, len(m.Heroes))
+	copy(newMetadata.Heroes, m.Heroes)
+
+	// ⚠️ NB: there are other pointer fields (e.g. PredecessorRoomID *string) and
+	// pointer-backed fields (e.g. LatestEventsByType map[string]EventMetadata)
+	// which are not deepcopied here.
+	return &newMetadata
+}
+
 // SameRoomName checks if the fields relevant for room names have changed between the two metadatas.
 // Returns true if there are no changes.
 func (m *RoomMetadata) SameRoomName(other *RoomMetadata) bool {
diff --git a/internal/roomname_test.go b/internal/roomname_test.go
index 4e942f43..01349eed 100644
--- a/internal/roomname_test.go
+++ b/internal/roomname_test.go
@@ -247,3 +247,45 @@ func TestCalculateRoomName(t *testing.T) {
 		}
 	}
 }
+
+func TestCopyHeroes(t *testing.T) {
+	const alice = "@alice:test"
+	const bob = "@bob:test"
+	const chris = "@chris:test"
+	m1 := RoomMetadata{Heroes: []Hero{
+		{ID: alice},
+		{ID: bob},
+		{ID: chris},
+	}}
+
+	m2 := m1.CopyHeroes()
+	// Uncomment this to see why CopyHeroes is necessary!
+ //m2 := m1 + + t.Logf("Compare heroes:\n\tm1=%v\n\tm2=%v", m1.Heroes, m2.Heroes) + + t.Log("Remove chris from m1") + m1.RemoveHero(chris) + t.Logf("Compare heroes:\n\tm1=%v\n\tm2=%v", m1.Heroes, m2.Heroes) + + assertSliceIDs(t, "m1.Heroes", m1.Heroes, []string{alice, bob}) + assertSliceIDs(t, "m2.Heroes", m2.Heroes, []string{alice, bob, chris}) + + t.Log("Remove alice from m1") + m1.RemoveHero(alice) + t.Logf("Compare heroes:\n\tm1=%v\n\tm2=%v", m1.Heroes, m2.Heroes) + + assertSliceIDs(t, "m1.Heroes", m1.Heroes, []string{bob}) + assertSliceIDs(t, "m2.Heroes", m2.Heroes, []string{alice, bob, chris}) +} + +func assertSliceIDs(t *testing.T, desc string, h []Hero, ids []string) { + if len(h) != len(ids) { + t.Errorf("%s has length %d, expected %d", desc, len(h), len(ids)) + } + for index, id := range ids { + if h[index].ID != id { + t.Errorf("%s[%d] ID is %s, expected %s", desc, index, h[index].ID, id) + } + } +} diff --git a/sync3/caches/global.go b/sync3/caches/global.go index 36c2867a..eccc5a31 100644 --- a/sync3/caches/global.go +++ b/sync3/caches/global.go @@ -118,13 +118,7 @@ func (c *GlobalCache) copyRoom(roomID string) *internal.RoomMetadata { logger.Warn().Str("room", roomID).Msg("GlobalCache.LoadRoom: no metadata for this room, returning stub") return internal.NewRoomMetadata(roomID) } - srCopy := *sr - // copy the heroes or else we may modify the same slice which would be bad :( - srCopy.Heroes = make([]internal.Hero, len(sr.Heroes)) - for i := range sr.Heroes { - srCopy.Heroes[i] = sr.Heroes[i] - } - return &srCopy + return sr.CopyHeroes() } // LoadJoinedRooms loads all current joined room metadata for the user given, together diff --git a/sync3/caches/user.go b/sync3/caches/user.go index ef160f8b..28ec08c2 100644 --- a/sync3/caches/user.go +++ b/sync3/caches/user.go @@ -328,7 +328,10 @@ func (c *UserCache) LoadRoomData(roomID string) UserRoomData { } type roomUpdateCache struct { - roomID string + roomID string + // globalRoomData is a snapshot of the global metadata for this room immediately + // after this update. It is a copy, specific to the given user whose Heroes + // field can be freely modified. 
globalRoomData *internal.RoomMetadata userRoomData *UserRoomData } diff --git a/sync3/handler/connstate_live.go b/sync3/handler/connstate_live.go index a01a7ef6..98994f2e 100644 --- a/sync3/handler/connstate_live.go +++ b/sync3/handler/connstate_live.go @@ -290,8 +290,10 @@ func (s *connStateLive) processGlobalUpdates(ctx context.Context, builder *Rooms } } + metadata := rup.GlobalRoomMetadata().CopyHeroes() + metadata.RemoveHero(s.userID) delta = s.lists.SetRoom(sync3.RoomConnMetadata{ - RoomMetadata: *rup.GlobalRoomMetadata(), + RoomMetadata: *metadata, UserRoomData: *rup.UserRoomMetadata(), LastInterestedEventTimestamps: bumpTimestampInList, }) From 5913fbb0f3c24c5fb469253f5a8701efe3f5bea7 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 18 Jul 2023 10:56:44 +0100 Subject: [PATCH 082/156] Parse m.room.avatar --- internal/roomname.go | 1 + state/storage.go | 4 +++- sync3/caches/global.go | 4 ++++ sync3/caches/user.go | 4 ++++ 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/internal/roomname.go b/internal/roomname.go index dd7bfaae..47c7e4d0 100644 --- a/internal/roomname.go +++ b/internal/roomname.go @@ -30,6 +30,7 @@ type RoomMetadata struct { RoomID string Heroes []Hero NameEvent string // the content of m.room.name, NOT the calculated name + AvatarEvent string // the content of m.room.avatar, NOT the resolved avatar CanonicalAlias string JoinCount int InviteCount int diff --git a/state/storage.go b/state/storage.go index b51026cc..fa6e261f 100644 --- a/state/storage.go +++ b/state/storage.go @@ -232,7 +232,7 @@ func (s *Storage) MetadataForAllRooms(txn *sqlx.Tx, tempTableName string, result // Select the name / canonical alias for all rooms roomIDToStateEvents, err := s.currentNotMembershipStateEventsInAllRooms(txn, []string{ - "m.room.name", "m.room.canonical_alias", + "m.room.name", "m.room.canonical_alias", "m.room.avatar", }) if err != nil { return fmt.Errorf("failed to load state events for all rooms: %s", err) @@ -244,6 +244,8 @@ func (s *Storage) MetadataForAllRooms(txn *sqlx.Tx, tempTableName string, result metadata.NameEvent = gjson.ParseBytes(ev.JSON).Get("content.name").Str } else if ev.Type == "m.room.canonical_alias" && ev.StateKey == "" { metadata.CanonicalAlias = gjson.ParseBytes(ev.JSON).Get("content.alias").Str + } else if ev.Type == "m.room.avatar" && ev.StateKey == "" { + metadata.AvatarEvent = gjson.ParseBytes(ev.JSON).Get("content.url").Str } } result[roomID] = metadata diff --git a/sync3/caches/global.go b/sync3/caches/global.go index eccc5a31..fe6ca5f2 100644 --- a/sync3/caches/global.go +++ b/sync3/caches/global.go @@ -279,6 +279,10 @@ func (c *GlobalCache) OnNewEvent( if ed.StateKey != nil && *ed.StateKey == "" { metadata.NameEvent = ed.Content.Get("name").Str } + case "m.room.avatar": + if ed.StateKey != nil && *ed.StateKey == "" { + metadata.AvatarEvent = ed.Content.Get("url").Str + } case "m.room.encryption": if ed.StateKey != nil && *ed.StateKey == "" { metadata.Encrypted = true diff --git a/sync3/caches/user.go b/sync3/caches/user.go index 28ec08c2..44e126ac 100644 --- a/sync3/caches/user.go +++ b/sync3/caches/user.go @@ -73,6 +73,7 @@ type InviteData struct { Heroes []internal.Hero InviteEvent *EventData NameEvent string // the content of m.room.name, NOT the calculated name + AvatarEvent string // the content of m.room.avatar, NOT the calculated avatar CanonicalAlias string LastMessageTimestamp uint64 Encrypted bool @@ -114,6 +115,8 @@ func NewInviteData(ctx context.Context, userID, roomID string, inviteState []jso } case 
"m.room.name": id.NameEvent = j.Get("content.name").Str + case "m.room.avatar": + id.AvatarEvent = j.Get("content.avatar_url").Str case "m.room.canonical_alias": id.CanonicalAlias = j.Get("content.alias").Str case "m.room.encryption": @@ -147,6 +150,7 @@ func (i *InviteData) RoomMetadata() *internal.RoomMetadata { metadata := internal.NewRoomMetadata(i.roomID) metadata.Heroes = i.Heroes metadata.NameEvent = i.NameEvent + metadata.AvatarEvent = i.AvatarEvent metadata.CanonicalAlias = i.CanonicalAlias metadata.InviteCount = 1 metadata.JoinCount = 1 From 7097c1d27902e669e92d4cc81ee521cdbb697353 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 18 Jul 2023 10:53:17 +0100 Subject: [PATCH 083/156] Track heroes' avatars --- internal/roomname.go | 30 ++++++++++++++++++++++++++---- state/storage.go | 5 +++-- sync3/caches/global.go | 6 ++++-- sync3/caches/user.go | 5 +++-- 4 files changed, 36 insertions(+), 10 deletions(-) diff --git a/internal/roomname.go b/internal/roomname.go index 47c7e4d0..46d520c4 100644 --- a/internal/roomname.go +++ b/internal/roomname.go @@ -94,7 +94,13 @@ func (m *RoomMetadata) SameRoomName(other *RoomMetadata) bool { m.CanonicalAlias == other.CanonicalAlias && m.JoinCount == other.JoinCount && m.InviteCount == other.InviteCount && - sameHeroes(m.Heroes, other.Heroes)) + sameHeroNames(m.Heroes, other.Heroes)) +} + +// SameRoomAvatar checks if the fields relevant for room avatars have changed between the two metadatas. +// Returns true if there are no changes. +func (m *RoomMetadata) SameRoomAvatar(other *RoomMetadata) bool { + return m.AvatarEvent == other.AvatarEvent && sameHeroAvatars(m.Heroes, other.Heroes) } func (m *RoomMetadata) SameJoinCount(other *RoomMetadata) bool { @@ -105,7 +111,7 @@ func (m *RoomMetadata) SameInviteCount(other *RoomMetadata) bool { return m.InviteCount == other.InviteCount } -func sameHeroes(a, b []Hero) bool { +func sameHeroNames(a, b []Hero) bool { if len(a) != len(b) { return false } @@ -120,6 +126,21 @@ func sameHeroes(a, b []Hero) bool { return true } +func sameHeroAvatars(a, b []Hero) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i].ID != b[i].ID { + return false + } + if a[i].Avatar != b[i].Avatar { + return false + } + } + return true +} + func (m *RoomMetadata) RemoveHero(userID string) { for i, h := range m.Heroes { if h.ID == userID { @@ -134,8 +155,9 @@ func (m *RoomMetadata) IsSpace() bool { } type Hero struct { - ID string - Name string + ID string + Name string + Avatar string } func CalculateRoomName(heroInfo *RoomMetadata, maxNumNamesPerRoom int) string { diff --git a/state/storage.go b/state/storage.go index fa6e261f..5e1e9338 100644 --- a/state/storage.go +++ b/state/storage.go @@ -284,8 +284,9 @@ func (s *Storage) MetadataForAllRooms(txn *sqlx.Tx, tempTableName string, result seen[key] = true metadata := result[roomID] metadata.Heroes = append(metadata.Heroes, internal.Hero{ - ID: targetUser, - Name: ev.Get("content.displayname").Str, + ID: targetUser, + Name: ev.Get("content.displayname").Str, + Avatar: ev.Get("content.avatar_url").Str, }) result[roomID] = metadata } diff --git a/sync3/caches/global.go b/sync3/caches/global.go index fe6ca5f2..537ce00b 100644 --- a/sync3/caches/global.go +++ b/sync3/caches/global.go @@ -347,14 +347,16 @@ func (c *GlobalCache) OnNewEvent( for i := range metadata.Heroes { if metadata.Heroes[i].ID == *ed.StateKey { metadata.Heroes[i].Name = ed.Content.Get("displayname").Str + metadata.Heroes[i].Avatar = ed.Content.Get("avatar_url").Str found = true break 
} } if !found { metadata.Heroes = append(metadata.Heroes, internal.Hero{ - ID: *ed.StateKey, - Name: ed.Content.Get("displayname").Str, + ID: *ed.StateKey, + Name: ed.Content.Get("displayname").Str, + Avatar: ed.Content.Get("avatar_url").Str, }) } } diff --git a/sync3/caches/user.go b/sync3/caches/user.go index 44e126ac..5dd37785 100644 --- a/sync3/caches/user.go +++ b/sync3/caches/user.go @@ -109,8 +109,9 @@ func NewInviteData(ctx context.Context, userID, roomID string, inviteState []jso id.IsDM = j.Get("is_direct").Bool() } else if target == j.Get("sender").Str { id.Heroes = append(id.Heroes, internal.Hero{ - ID: target, - Name: j.Get("content.displayname").Str, + ID: target, + Name: j.Get("content.displayname").Str, + Avatar: j.Get("content.avatar_url").Str, }) } case "m.room.name": From 6b9a2217f8dbb87939ab3375a58c88958c4aaf65 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 18 Jul 2023 10:58:33 +0100 Subject: [PATCH 084/156] Track changes to avatars when computing deltas --- internal/roomname.go | 15 +++++++++++++++ sync3/caches/user.go | 8 ++++++++ sync3/handler/connstate.go | 1 + sync3/handler/connstate_live.go | 5 +++++ sync3/lists.go | 6 ++++++ 5 files changed, 35 insertions(+) diff --git a/internal/roomname.go b/internal/roomname.go index 46d520c4..8e4ad465 100644 --- a/internal/roomname.go +++ b/internal/roomname.go @@ -244,3 +244,18 @@ func disambiguate(heroes []Hero) []string { } return disambiguatedNames } + +const noAvatar = "" + +// CalculateAvatar computes the avatar for the room, based on the global room metadata. +// Assumption: metadata.RemoveHero has been called to remove the user who is syncing +// from the list of heroes. +func CalculateAvatar(metadata *RoomMetadata) string { + if metadata.AvatarEvent != "" { + return metadata.AvatarEvent + } + if len(metadata.Heroes) == 1 { + return metadata.Heroes[0].Avatar + } + return noAvatar +} diff --git a/sync3/caches/user.go b/sync3/caches/user.go index 5dd37785..8b1a7a52 100644 --- a/sync3/caches/user.go +++ b/sync3/caches/user.go @@ -50,6 +50,14 @@ type UserRoomData struct { CanonicalisedName string // stripped leading symbols like #, all in lower case // Set of spaces this room is a part of, from the perspective of this user. This is NOT global room data // as the set of spaces may be different for different users. + + // ResolvedAvatarURL is the avatar that should be displayed to this user to + // represent this room. The empty string means that this room has no avatar. + // Avatars set in m.room.avatar take precedence; if this is missing and the room is + // a DM with one other user joined or invited, we fall back to that user's + // avatar (if any) as specified in their membership event in that room. + ResolvedAvatarURL string + Spaces map[string]struct{} // Map of tag to order float. // See https://spec.matrix.org/latest/client-server-api/#room-tagging diff --git a/sync3/handler/connstate.go b/sync3/handler/connstate.go index 9f3f79ab..838c427c 100644 --- a/sync3/handler/connstate.go +++ b/sync3/handler/connstate.go @@ -596,6 +596,7 @@ func (s *ConnState) getInitialRoomData(ctx context.Context, roomSub sync3.RoomSu } rooms[roomID] = sync3.Room{ Name: internal.CalculateRoomName(metadata, 5), // TODO: customisable? 
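+			// Avatar resolution, per internal.CalculateAvatar: an explicit
+			// m.room.avatar event wins; otherwise, if the room has exactly one
+			// hero, fall back to that hero's avatar; else there is no avatar.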
+ AvatarChange: sync3.NewAvatarChange(internal.CalculateAvatar(metadata)), NotificationCount: int64(userRoomData.NotificationCount), HighlightCount: int64(userRoomData.HighlightCount), Timeline: roomToTimeline[roomID], diff --git a/sync3/handler/connstate_live.go b/sync3/handler/connstate_live.go index 98994f2e..817560b3 100644 --- a/sync3/handler/connstate_live.go +++ b/sync3/handler/connstate_live.go @@ -218,6 +218,11 @@ func (s *connStateLive) processLiveUpdate(ctx context.Context, up caches.Update, metadata.RemoveHero(s.userID) thisRoom.Name = internal.CalculateRoomName(metadata, 5) // TODO: customisable? } + if delta.RoomAvatarChanged { + metadata := roomUpdate.GlobalRoomMetadata() + metadata.RemoveHero(s.userID) + thisRoom.AvatarChange = sync3.NewAvatarChange(internal.CalculateAvatar(metadata)) + } if delta.InviteCountChanged { thisRoom.InvitedCount = &roomUpdate.GlobalRoomMetadata().InviteCount } diff --git a/sync3/lists.go b/sync3/lists.go index f5e1dfbf..d8b404df 100644 --- a/sync3/lists.go +++ b/sync3/lists.go @@ -33,6 +33,7 @@ type RoomListDelta struct { type RoomDelta struct { RoomNameChanged bool + RoomAvatarChanged bool JoinCountChanged bool InviteCountChanged bool NotificationCountChanged bool @@ -73,6 +74,10 @@ func (s *InternalRequestLists) SetRoom(r RoomConnMetadata) (delta RoomDelta) { strings.Trim(internal.CalculateRoomName(&r.RoomMetadata, 5), "#!():_@"), ) } + delta.RoomAvatarChanged = !existing.SameRoomAvatar(&r.RoomMetadata) + if delta.RoomAvatarChanged { + r.ResolvedAvatarURL = internal.CalculateAvatar(&r.RoomMetadata) + } // Interpret the timestamp map on r as the changes we should apply atop the // existing timestamps. @@ -97,6 +102,7 @@ func (s *InternalRequestLists) SetRoom(r RoomConnMetadata) (delta RoomDelta) { r.CanonicalisedName = strings.ToLower( strings.Trim(internal.CalculateRoomName(&r.RoomMetadata, 5), "#!():_@"), ) + r.ResolvedAvatarURL = internal.CalculateAvatar(&r.RoomMetadata) // We'll automatically use the LastInterestedEventTimestamps provided by the // caller, so that recency sorts work. } From 065dcd4630245bfb6f2c5584bd6e58c41fd1770b Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 18 Jul 2023 10:59:48 +0100 Subject: [PATCH 085/156] Fix comment typo --- sync3/caches/user.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sync3/caches/user.go b/sync3/caches/user.go index 8b1a7a52..5f5e46e7 100644 --- a/sync3/caches/user.go +++ b/sync3/caches/user.go @@ -225,7 +225,7 @@ func (c *UserCache) Unsubscribe(id int) { // OnRegistered is called after the sync3.Dispatcher has successfully registered this // cache to receive updates. We use this to run some final initialisation logic that // is sensitive to race conditions; confusingly, most of the initialisation is driven -// externally by sync3.SyncLiveHandler.userCache. It's importatn that we don't spend too +// externally by sync3.SyncLiveHandler.userCaches. It's important that we don't spend too // long inside this function, because it is called within a global lock on the // sync3.Dispatcher (see sync3.Dispatcher.Register). 
 func (c *UserCache) OnRegistered(ctx context.Context) error {

From a02a559f7a7a5c7d3c1a7b33c2d9118202c6d8b2 Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Tue, 18 Jul 2023 11:00:10 +0100
Subject: [PATCH 086/156] Test cases

---
 sync3/handler/connstate_live.go |   7 +
 tests-e2e/lists_test.go         | 388 ++++++++++++++++++++++++++++++++
 2 files changed, 395 insertions(+)

diff --git a/sync3/handler/connstate_live.go b/sync3/handler/connstate_live.go
index 817560b3..d5b0975d 100644
--- a/sync3/handler/connstate_live.go
+++ b/sync3/handler/connstate_live.go
@@ -297,6 +297,13 @@ func (s *connStateLive) processGlobalUpdates(ctx context.Context, builder *Rooms
 			metadata := rup.GlobalRoomMetadata().CopyHeroes()
 			metadata.RemoveHero(s.userID)
+			// TODO: if we change a room from being a DM to not being a DM, we should call
+			// SetRoom and recalculate avatars. To do that we'd need to
+			// - listen to m.direct global account data events
+			// - compute the symmetric difference between old and new
+			// - call SetRooms for each room in the difference.
+			// I'm assuming this happens so rarely that we can ignore this for now. PRs
+			// welcome if you have a strong opinion to the contrary.
 			delta = s.lists.SetRoom(sync3.RoomConnMetadata{
 				RoomMetadata:                  *metadata,
 				UserRoomData:                  *rup.UserRoomMetadata(),
 				LastInterestedEventTimestamps: bumpTimestampInList,
 			})
diff --git a/tests-e2e/lists_test.go b/tests-e2e/lists_test.go
index 0d60b783..11dd97c3 100644
--- a/tests-e2e/lists_test.go
+++ b/tests-e2e/lists_test.go
@@ -1297,3 +1297,391 @@ func TestRangeOutsideTotalRooms(t *testing.T) {
 		),
 	)
 }
+
+// Nicked from Synapse's tests, see
+// https://github.com/matrix-org/synapse/blob/2cacd0849a02d43f88b6c15ee862398159ab827c/tests/test_utils/__init__.py#L154-L161
+// Resolution: 1×1, MIME type: image/png, Extension: png, Size: 67 B
+var smallPNG = []byte(
+	"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\nIDATx\x9cc\x00\x01\x00\x00\x05\x00\x01\r\n-\xb4\x00\x00\x00\x00IEND\xaeB`\x82",
+)
+
+func TestAvatarFieldInRoomResponse(t *testing.T) {
+	alice := registerNamedUser(t, "alice")
+	bob := registerNamedUser(t, "bob")
+	chris := registerNamedUser(t, "chris")
+
+	avatarURLs := map[string]struct{}{}
+	uploadAvatar := func(client *CSAPI, filename string) string {
+		avatar := client.UploadContent(t, smallPNG, filename, "image/png")
+		if _, exists := avatarURLs[avatar]; exists {
+			t.Fatalf("New avatar %s has already been uploaded", avatar)
+		}
+		t.Logf("%s is uploaded as %s", filename, avatar)
+		avatarURLs[avatar] = struct{}{}
+		return avatar
+	}
+
+	t.Log("Alice, Bob and Chris upload and set an avatar.")
+	aliceAvatar := uploadAvatar(alice, "alice.png")
+	bobAvatar := uploadAvatar(bob, "bob.png")
+	chrisAvatar := uploadAvatar(chris, "chris.png")
+
+	alice.SetAvatar(t, aliceAvatar)
+	bob.SetAvatar(t, bobAvatar)
+	chris.SetAvatar(t, chrisAvatar)
+
+	t.Log("Alice makes a public room, a DM with herself, a DM with Bob, a DM with Chris, and a group-DM with Bob and Chris.")
+	public := alice.CreateRoom(t, map[string]interface{}{"preset": "public_chat"})
+	// TODO: you can create a DM with yourself e.g. as below. It probably ought to have
+	// your own face as an avatar.
+ // dmAlice := alice.CreateRoom(t, map[string]interface{}{ + // "preset": "trusted_private_chat", + // "is_direct": true, + // }) + dmBob := alice.CreateRoom(t, map[string]interface{}{ + "preset": "trusted_private_chat", + "is_direct": true, + "invite": []string{bob.UserID}, + }) + dmChris := alice.CreateRoom(t, map[string]interface{}{ + "preset": "trusted_private_chat", + "is_direct": true, + "invite": []string{chris.UserID}, + }) + dmBobChris := alice.CreateRoom(t, map[string]interface{}{ + "preset": "trusted_private_chat", + "is_direct": true, + "invite": []string{bob.UserID, chris.UserID}, + }) + + t.Logf("Rooms:\npublic=%s\ndmBob=%s\ndmChris=%s\ndmBobChris=%s", public, dmBob, dmChris, dmBobChris) + t.Log("Bob accepts his invites. Chris accepts none.") + bob.JoinRoom(t, dmBob, nil) + bob.JoinRoom(t, dmBobChris, nil) + + t.Log("Alice makes an initial sliding sync.") + res := alice.SlidingSync(t, sync3.Request{ + Lists: map[string]sync3.RequestList{ + "rooms": { + Ranges: sync3.SliceRanges{{0, 4}}, + }, + }, + }) + + t.Log("Alice should see each room in the sync response with an appropriate avatar") + m.MatchResponse( + t, + res, + m.MatchRoomSubscription(public, m.MatchRoomUnsetAvatar()), + m.MatchRoomSubscription(dmBob, m.MatchRoomAvatar(bob.AvatarURL)), + m.MatchRoomSubscription(dmChris, m.MatchRoomAvatar(chris.AvatarURL)), + m.MatchRoomSubscription(dmBobChris, m.MatchRoomUnsetAvatar()), + ) + + t.Run("Avatar not resent on message", func(t *testing.T) { + t.Log("Bob sends a sentinel message.") + sentinel := bob.SendEventSynced(t, dmBob, Event{ + Type: "m.room.message", + Content: map[string]interface{}{ + "body": "Hello world", + "msgtype": "m.text", + }, + }) + + t.Log("Alice syncs until she sees the sentinel. She should not see the DM avatar change.") + res = alice.SlidingSyncUntil(t, res.Pos, sync3.Request{}, func(response *sync3.Response) error { + matchNoAvatarChange := m.MatchRoomSubscription(dmBob, m.MatchRoomUnchangedAvatar()) + if err := matchNoAvatarChange(response); err != nil { + t.Fatalf("Saw DM avatar change: %s", err) + } + matchSentinel := m.MatchRoomSubscription(dmBob, MatchRoomTimelineMostRecent(1, []Event{{ID: sentinel}})) + return matchSentinel(response) + }) + }) + + t.Run("DM declined", func(t *testing.T) { + t.Log("Chris leaves his DM with Alice.") + chris.LeaveRoom(t, dmChris) + + t.Log("Alice syncs until she sees Chris's leave.") + res = alice.SlidingSyncUntilMembership(t, res.Pos, dmChris, chris, "leave") + + t.Log("Alice sees Chris's avatar vanish.") + m.MatchResponse(t, res, m.MatchRoomSubscription(dmChris, m.MatchRoomUnsetAvatar())) + }) + + t.Run("Group DM declined", func(t *testing.T) { + t.Log("Chris leaves his group DM with Alice and Bob.") + chris.LeaveRoom(t, dmBobChris) + + t.Log("Alice syncs until she sees Chris's leave.") + res = alice.SlidingSyncUntilMembership(t, res.Pos, dmBobChris, chris, "leave") + + t.Log("Alice sees the room's avatar change to Bob's avatar.") + // Because this is now a DM room with exactly one other (joined|invited) member. 
+ m.MatchResponse(t, res, m.MatchRoomSubscription(dmBobChris, m.MatchRoomAvatar(bob.AvatarURL))) + }) + + t.Run("Bob's avatar change propagates", func(t *testing.T) { + t.Log("Bob changes his avatar.") + bobAvatar2 := uploadAvatar(bob, "bob2.png") + bob.SetAvatar(t, bobAvatar2) + + avatarChangeInDM := false + avatarChangeInGroupDM := false + t.Log("Alice syncs until she sees Bob's new avatar.") + res = alice.SlidingSyncUntil( + t, + res.Pos, + sync3.Request{}, + func(response *sync3.Response) error { + if !avatarChangeInDM { + err := m.MatchRoomSubscription(dmBob, m.MatchRoomAvatar(bob.AvatarURL))(response) + if err == nil { + avatarChangeInDM = true + } + } + + if !avatarChangeInGroupDM { + err := m.MatchRoomSubscription(dmBobChris, m.MatchRoomAvatar(bob.AvatarURL))(response) + if err == nil { + avatarChangeInGroupDM = true + } + } + + if avatarChangeInDM && avatarChangeInGroupDM { + return nil + } + return fmt.Errorf("still waiting: avatarChangeInDM=%t avatarChangeInGroupDM=%t", avatarChangeInDM, avatarChangeInGroupDM) + }, + ) + + t.Log("Bob removes his avatar.") + bob.SetAvatar(t, "") + + avatarChangeInDM = false + avatarChangeInGroupDM = false + t.Log("Alice syncs until she sees Bob's avatars vanish.") + res = alice.SlidingSyncUntil( + t, + res.Pos, + sync3.Request{}, + func(response *sync3.Response) error { + if !avatarChangeInDM { + err := m.MatchRoomSubscription(dmBob, m.MatchRoomUnsetAvatar())(response) + if err == nil { + avatarChangeInDM = true + } else { + t.Log(err) + } + } + + if !avatarChangeInGroupDM { + err := m.MatchRoomSubscription(dmBobChris, m.MatchRoomUnsetAvatar())(response) + if err == nil { + avatarChangeInGroupDM = true + } else { + t.Log(err) + } + } + + if avatarChangeInDM && avatarChangeInGroupDM { + return nil + } + return fmt.Errorf("still waiting: avatarChangeInDM=%t avatarChangeInGroupDM=%t", avatarChangeInDM, avatarChangeInGroupDM) + }, + ) + + }) + + t.Run("Explicit avatar propagates in non-DM room", func(t *testing.T) { + t.Log("Alice sets an avatar for the public room.") + publicAvatar := uploadAvatar(alice, "public.png") + alice.SetState(t, public, "m.room.avatar", "", map[string]interface{}{ + "url": publicAvatar, + }) + t.Log("Alice syncs until she sees that avatar.") + res = alice.SlidingSyncUntil( + t, + res.Pos, + sync3.Request{}, + m.MatchRoomSubscriptions(map[string][]m.RoomMatcher{ + public: {m.MatchRoomAvatar(publicAvatar)}, + }), + ) + + t.Log("Alice changes the avatar for the public room.") + publicAvatar2 := uploadAvatar(alice, "public2.png") + alice.SetState(t, public, "m.room.avatar", "", map[string]interface{}{ + "url": publicAvatar2, + }) + t.Log("Alice syncs until she sees that avatar.") + res = alice.SlidingSyncUntil( + t, + res.Pos, + sync3.Request{}, + m.MatchRoomSubscriptions(map[string][]m.RoomMatcher{ + public: {m.MatchRoomAvatar(publicAvatar2)}, + }), + ) + + t.Log("Alice removes the avatar for the public room.") + alice.SetState(t, public, "m.room.avatar", "", map[string]interface{}{}) + t.Log("Alice syncs until she sees that avatar vanish.") + res = alice.SlidingSyncUntil( + t, + res.Pos, + sync3.Request{}, + m.MatchRoomSubscriptions(map[string][]m.RoomMatcher{ + public: {m.MatchRoomUnsetAvatar()}, + }), + ) + }) + + t.Run("Explicit avatar propagates in DM room", func(t *testing.T) { + t.Log("Alice re-invites Chris to their DM.") + alice.InviteRoom(t, dmChris, chris.UserID) + + t.Log("Alice syncs until she sees her invitation to Chris.") + res = alice.SlidingSyncUntilMembership(t, res.Pos, dmChris, chris, "invite") + + 
t.Log("Alice should see the DM with Chris's avatar.") + m.MatchResponse(t, res, m.MatchRoomSubscription(dmChris, m.MatchRoomAvatar(chris.AvatarURL))) + + t.Log("Chris joins the room.") + chris.JoinRoom(t, dmChris, nil) + + t.Log("Alice syncs until she sees Chris's join.") + res = alice.SlidingSyncUntilMembership(t, res.Pos, dmChris, chris, "join") + + t.Log("Alice shouldn't see the DM's avatar change..") + m.MatchResponse(t, res, m.MatchRoomSubscription(dmChris, m.MatchRoomUnchangedAvatar())) + + t.Log("Chris gives their DM a bespoke avatar.") + dmAvatar := uploadAvatar(chris, "dm.png") + chris.SetState(t, dmChris, "m.room.avatar", "", map[string]interface{}{ + "url": dmAvatar, + }) + + t.Log("Alice syncs until she sees that avatar.") + alice.SlidingSyncUntil(t, res.Pos, sync3.Request{}, m.MatchRoomSubscription(dmChris, m.MatchRoomAvatar(dmAvatar))) + + t.Log("Chris changes his global avatar, which adds a join event to the room.") + chrisAvatar2 := uploadAvatar(chris, "chris2.png") + chris.SetAvatar(t, chrisAvatar2) + + t.Log("Alice syncs until she sees that join event.") + res = alice.SlidingSyncUntilMembership(t, res.Pos, dmChris, chris, "join") + + t.Log("Her response should have either no avatar change, or the same bespoke avatar.") + // No change, ideally, but repeating the same avatar isn't _wrong_ + m.MatchResponse(t, res, m.MatchRoomSubscription(dmChris, func(r sync3.Room) error { + noChangeErr := m.MatchRoomUnchangedAvatar()(r) + sameBespokeAvatarErr := m.MatchRoomAvatar(dmAvatar)(r) + if noChangeErr == nil || sameBespokeAvatarErr == nil { + return nil + } + return fmt.Errorf("expected no change or the same bespoke avatar (%s), got '%s'", dmAvatar, r.AvatarChange) + })) + + t.Log("Chris updates the DM's avatar.") + dmAvatar2 := uploadAvatar(chris, "dm2.png") + chris.SetState(t, dmChris, "m.room.avatar", "", map[string]interface{}{ + "url": dmAvatar2, + }) + + t.Log("Alice syncs until she sees that avatar.") + res = alice.SlidingSyncUntil(t, res.Pos, sync3.Request{}, m.MatchRoomSubscription(dmChris, m.MatchRoomAvatar(dmAvatar2))) + + t.Log("Chris removes the DM's avatar.") + chris.SetState(t, dmChris, "m.room.avatar", "", map[string]interface{}{}) + + t.Log("Alice syncs until the DM avatar returns to Chris's most recent avatar.") + res = alice.SlidingSyncUntil(t, res.Pos, sync3.Request{}, m.MatchRoomSubscription(dmChris, m.MatchRoomAvatar(chris.AvatarURL))) + }) + + t.Run("Changing DM flag", func(t *testing.T) { + t.Skip("TODO: unimplemented") + t.Log("Alice clears the DM flag on Bob's room.") + alice.SetGlobalAccountData(t, "m.direct", map[string]interface{}{ + "content": map[string][]string{ + bob.UserID: {}, // no dmBob here + chris.UserID: {dmChris, dmBobChris}, + }, + }) + + t.Log("Alice syncs until she sees a new set of account data.") + res = alice.SlidingSyncUntil(t, res.Pos, sync3.Request{ + Extensions: extensions.Request{ + AccountData: &extensions.AccountDataRequest{ + extensions.Core{Enabled: &boolTrue}, + }, + }, + }, func(response *sync3.Response) error { + if response.Extensions.AccountData == nil { + return fmt.Errorf("no account data yet") + } + if len(response.Extensions.AccountData.Global) == 0 { + return fmt.Errorf("no global account data yet") + } + return nil + }) + + t.Log("The DM with Bob should no longer be a DM and should no longer have an avatar.") + m.MatchResponse(t, res, m.MatchRoomSubscription(dmBob, func(r sync3.Room) error { + if r.IsDM { + return fmt.Errorf("dmBob is still a DM") + } + return m.MatchRoomUnsetAvatar()(r) + })) + + t.Log("Alice sets 
the DM flag on Bob's room.") + alice.SetGlobalAccountData(t, "m.direct", map[string]interface{}{ + "content": map[string][]string{ + bob.UserID: {dmBob}, // dmBob reinstated + chris.UserID: {dmChris, dmBobChris}, + }, + }) + + t.Log("Alice syncs until she sees a new set of account data.") + res = alice.SlidingSyncUntil(t, res.Pos, sync3.Request{ + Extensions: extensions.Request{ + AccountData: &extensions.AccountDataRequest{ + extensions.Core{Enabled: &boolTrue}, + }, + }, + }, func(response *sync3.Response) error { + if response.Extensions.AccountData == nil { + return fmt.Errorf("no account data yet") + } + if len(response.Extensions.AccountData.Global) == 0 { + return fmt.Errorf("no global account data yet") + } + return nil + }) + + t.Log("The room should have Bob's avatar again.") + m.MatchResponse(t, res, m.MatchRoomSubscription(dmBob, func(r sync3.Room) error { + if !r.IsDM { + return fmt.Errorf("dmBob is still not a DM") + } + return m.MatchRoomAvatar(bob.AvatarURL)(r) + })) + + }) + + t.Run("See avatar when invited", func(t *testing.T) { + t.Log("Chris invites Alice to a DM.") + dmInvited := chris.CreateRoom(t, map[string]interface{}{ + "preset": "trusted_private_chat", + "is_direct": true, + "invite": []string{alice.UserID}, + }) + + t.Log("Alice syncs until she sees the invite.") + res = alice.SlidingSyncUntilMembership(t, res.Pos, dmInvited, alice, "invite") + + t.Log("The new room should use Chris's avatar.") + m.MatchResponse(t, res, m.MatchRoomSubscription(dmInvited, m.MatchRoomAvatar(chris.AvatarURL))) + }) +} From f6f1106fc42fb820e9740d2be241cae6f4852b2c Mon Sep 17 00:00:00 2001 From: Till Faelligen <2353100+S7evinK@users.noreply.github.com> Date: Tue, 18 Jul 2023 14:37:33 +0200 Subject: [PATCH 087/156] Update test to include ToDevice messages --- sync2/poller_test.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sync2/poller_test.go b/sync2/poller_test.go index 9c229a56..10945809 100644 --- a/sync2/poller_test.go +++ b/sync2/poller_test.go @@ -277,6 +277,9 @@ func TestPollerPollFromExisting(t *testing.T) { json.RawMessage(`{"event":10}`), }, } + toDeviceResponses := [][]json.RawMessage{ + {}, {}, {}, {json.RawMessage(`{}`)}, + } hasPolledSuccessfully := make(chan struct{}) accumulator, client := newMocks(func(authHeader, since string) (*SyncResponse, int, error) { if since == "" { @@ -295,6 +298,10 @@ func TestPollerPollFromExisting(t *testing.T) { var joinResp SyncV2JoinResponse joinResp.Timeline.Events = roomTimelineResponses[sinceInt] return &SyncResponse{ + // Add in dummy toDevice messages, so the poller actually persists the since token. (Which + // it only does for the first poll, after 1min (this test doesn't run that long) OR there are + // ToDevice messages in the response) + ToDevice: EventsResponse{Events: toDeviceResponses[sinceInt]}, NextBatch: fmt.Sprintf("%d", sinceInt+1), Rooms: struct { Join map[string]SyncV2JoinResponse `json:"join"` From b4ba7707259e3a5de2f7130a9fcfc59efff9e65c Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 18 Jul 2023 13:40:12 +0100 Subject: [PATCH 088/156] Fix an integration test failure --- sync3/caches/user.go | 2 +- sync3/lists.go | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/sync3/caches/user.go b/sync3/caches/user.go index 5f5e46e7..cac117f0 100644 --- a/sync3/caches/user.go +++ b/sync3/caches/user.go @@ -46,7 +46,7 @@ type UserRoomData struct { // The zero value of this safe to use (0 latest nid, no prev batch, no timeline). 
 	RequestedLatestEvents state.LatestEvents
 
-	// TODO: should Canonicalised really be in RoomConMetadata? It's only set in SetRoom AFAICS
+	// TODO: should CanonicalisedName really be in RoomConnMetadata? It's only set in SetRoom AFAICS
 	CanonicalisedName string // stripped leading symbols like #, all in lower case
 	// Set of spaces this room is a part of, from the perspective of this user. This is NOT global room data
 	// as the set of spaces may be different for different users.
diff --git a/sync3/lists.go b/sync3/lists.go
index d8b404df..f8d3d551 100644
--- a/sync3/lists.go
+++ b/sync3/lists.go
@@ -73,6 +73,16 @@ func (s *InternalRequestLists) SetRoom(r RoomConnMetadata) (delta RoomDelta) {
 			r.CanonicalisedName = strings.ToLower(
 				strings.Trim(internal.CalculateRoomName(&r.RoomMetadata, 5), "#!():_@"),
 			)
+		} else {
+			// XXX: during TestConnectionTimeoutNotReset there is some situation where
+			// r.CanonicalisedName is the empty string. Looking at the SetRoom
+			// call in connstate_live.go, this is because the UserRoomMetadata on
+			// the RoomUpdate has an empty CanonicalisedName. Either
+			// a) that is expected, in which case we should _always_ write to
+			//    r.CanonicalisedName here; or
+			// b) that is not expected, in which case... erm, I don't know what
+			//    to conclude.
+			r.CanonicalisedName = existing.CanonicalisedName
 		}
 		delta.RoomAvatarChanged = !existing.SameRoomAvatar(&r.RoomMetadata)
 		if delta.RoomAvatarChanged {

From 9505e94f6cd78b3abfc07c48f71a35228c3d9811 Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Tue, 18 Jul 2023 15:07:34 +0100
Subject: [PATCH 089/156] Bump docker-setup-qemu-action to v2

Looking at https://github.com/docker/setup-qemu-action/releases I can
only see the node 16 runtime as a breaking change.

Fixes #170, hopefully.
---
 .github/workflows/docker.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 46009b60..c6df586a 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -26,7 +26,7 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v3
       - name: Set up QEMU
-        uses: docker/setup-qemu-action@v1
+        uses: docker/setup-qemu-action@v2
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2
       - name: Login to GitHub Containers

From 885b42d92dd040a14ec7a81f38e30d6032b63b77 Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Tue, 18 Jul 2023 19:19:34 +0100
Subject: [PATCH 090/156] Lookup correct m.room.avatar field for invites

---
 sync3/caches/user.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sync3/caches/user.go b/sync3/caches/user.go
index cac117f0..769cf4b2 100644
--- a/sync3/caches/user.go
+++ b/sync3/caches/user.go
@@ -125,7 +125,7 @@ func NewInviteData(ctx context.Context, userID, roomID string, inviteState []jso
 		case "m.room.name":
 			id.NameEvent = j.Get("content.name").Str
 		case "m.room.avatar":
-			id.AvatarEvent = j.Get("content.avatar_url").Str
+			id.AvatarEvent = j.Get("content.url").Str
 		case "m.room.canonical_alias":
 			id.CanonicalAlias = j.Get("content.alias").Str
 		case "m.room.encryption":

From 46d56b8433e2a6dcf104c6ce8f13d0679f37a28b Mon Sep 17 00:00:00 2001
From: Till Faelligen <2353100+S7evinK@users.noreply.github.com>
Date: Wed, 19 Jul 2023 12:17:47 +0200
Subject: [PATCH 091/156] Add test to check that the since token is only
 stored in the database periodically

---
 sync2/poller.go      |  11 +++--
 sync2/poller_test.go | 109 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 110 insertions(+), 10 deletions(-)

diff --git a/sync2/poller.go b/sync2/poller.go
index dbd9a05a..062366a6 100644
--- a/sync2/poller.go
+++ b/sync2/poller.go
@@ -22,8 +22,9 @@ type PollerID struct {
 	DeviceID string
 }
 
-// alias time.Sleep so tests can monkey patch it out
+// alias time.Sleep/time.Since so tests can monkey patch it out
 var timeSleep = time.Sleep
+var timeSince = time.Since
 
 // log at most once every duration. Always logs before terminating.
 var logInterval = 30 * time.Second
@@ -511,7 +512,7 @@ func (p *poller) poll(ctx context.Context, s *pollLoopState) error {
 		p.numOutstandingSyncReqs.Dec()
 	}
 	region.End()
-	p.trackRequestDuration(time.Since(start), s.since == "", s.firstTime)
+	p.trackRequestDuration(timeSince(start), s.since == "", s.firstTime)
 	if p.terminated.Load() {
 		return fmt.Errorf("poller terminated")
 	}
@@ -549,7 +550,7 @@ func (p *poller) poll(ctx context.Context, s *pollLoopState) error {
 	s.since = resp.NextBatch
 	// Persist the since token if more than one minute has passed since we last
 	// stored it, OR if the response contains to-device messages
-	if time.Since(s.lastStoredSince) > time.Minute || len(resp.ToDevice.Events) > 0 {
+	if timeSince(s.lastStoredSince) > time.Minute || len(resp.ToDevice.Events) > 0 {
 		p.receiver.UpdateDeviceSince(ctx, p.userID, p.deviceID, s.since)
 		s.lastStoredSince = time.Now()
 	}
@@ -558,7 +559,7 @@ func (p *poller) poll(ctx context.Context, s *pollLoopState) error {
 		s.firstTime = false
 		p.wg.Done()
 	}
-	p.trackProcessDuration(time.Since(start), wasInitial, wasFirst)
+	p.trackProcessDuration(timeSince(start), wasInitial, wasFirst)
 	p.maybeLogStats(false)
 	return nil
 }
@@ -734,7 +735,7 @@ func (p *poller) parseRoomsResponse(ctx context.Context, res *SyncResponse) {
 }
 
 func (p *poller) maybeLogStats(force bool) {
-	if !force && time.Since(p.lastLogged) < logInterval {
+	if !force && timeSince(p.lastLogged) < logInterval {
 		// only log at most once every logInterval
 		return
 	}
diff --git a/sync2/poller_test.go b/sync2/poller_test.go
index 10945809..13f8af8d 100644
--- a/sync2/poller_test.go
+++ b/sync2/poller_test.go
@@ -343,6 +343,101 @@ func TestPollerPollFromExisting(t *testing.T) {
 	}
 }
 
+// Check that the since token in the database
+// 1. is updated on the first iteration of the poll loop
+// 2. is NOT updated for random events
+// 3. is updated if the syncV2 response contains ToDevice messages
+// 4. is updated if at least 1min has passed since we last stored a token
+func TestPollerPollUpdateDeviceSincePeriodically(t *testing.T) {
+	pid := PollerID{UserID: "@alice:localhost", DeviceID: "FOOBAR"}
+
+	syncResponses := make(chan *SyncResponse, 1)
+	accumulator, client := newMocks(func(authHeader, since string) (*SyncResponse, int, error) {
+		return <-syncResponses, 200, nil
+	})
+	accumulator.updateSinceCalled = make(chan struct{}, 1)
+	poller := newPoller(pid, "Authorization: hello world", client, accumulator, zerolog.New(os.Stderr), false)
+	defer poller.Terminate()
+	go func() {
+		poller.Poll("")
+	}()
+
+	hasPolledSuccessfully := make(chan struct{})
+
+	go func() {
+		poller.WaitUntilInitialSync()
+		close(hasPolledSuccessfully)
+	}()
+
+	// 1. 
Initial poll updates the database + wantSince := "1" + syncResponses <- &SyncResponse{NextBatch: wantSince} + + select { + case <-hasPolledSuccessfully: + break + case <-time.After(time.Second): + t.Errorf("WaitUntilInitialSync failed to fire") + } + // Also check that UpdateDeviceSince was called + select { + case <-accumulator.updateSinceCalled: + case <-time.After(time.Millisecond * 100): // give the Poller some time to process the response + t.Fatalf("did not receive call to UpdateDeviceSince in time") + } + + if got := accumulator.pollerIDToSince[pid]; got != wantSince { + t.Fatalf("expected since to be updated to %s, but got %s", wantSince, got) + } + + // 2. Second request updates the state but NOT the database + next := "2" + syncResponses <- &SyncResponse{NextBatch: next} + if got := accumulator.pollerIDToSince[pid]; got != wantSince { + t.Fatalf("expected since to be updated to %s, but got %s", wantSince, got) + } + + select { + case <-accumulator.updateSinceCalled: + t.Fatalf("unexpected call to UpdateDeviceSince") + case <-time.After(time.Millisecond * 100): + } + + // 3. Sync response contains a toDevice message and should be stored in the database + next = "3" + wantSince = "3" + syncResponses <- &SyncResponse{ + NextBatch: next, + ToDevice: EventsResponse{Events: []json.RawMessage{{}}}, + } + select { + case <-accumulator.updateSinceCalled: + case <-time.After(time.Millisecond * 100): + t.Fatalf("did not receive call to UpdateDeviceSince in time") + } + + if got := accumulator.pollerIDToSince[pid]; got != wantSince { + t.Fatalf("expected since to be updated to %s, but got %s", wantSince, got) + } + + // 4. ... some time has passed, this triggers the 1min limit + timeSince = func(d time.Time) time.Duration { + return time.Minute * 2 + } + next = "10" + wantSince = "10" + syncResponses <- &SyncResponse{NextBatch: next} + select { + case <-accumulator.updateSinceCalled: + case <-time.After(time.Millisecond * 100): + t.Fatalf("did not receive call to UpdateDeviceSince in time") + } + + if got := accumulator.pollerIDToSince[pid]; got != wantSince { + t.Fatalf("expected since to be updated to %s, but got %s", wantSince, got) + } +} + // Tests that the poller backs off in 2,4,8,etc second increments to a variety of errors func TestPollerBackoff(t *testing.T) { deviceID := "FOOBAR" @@ -460,11 +555,12 @@ func (c *mockClient) WhoAmI(authHeader string) (string, string, error) { } type mockDataReceiver struct { - states map[string][]json.RawMessage - timelines map[string][]json.RawMessage - pollerIDToSince map[PollerID]string - incomingProcess chan struct{} - unblockProcess chan struct{} + states map[string][]json.RawMessage + timelines map[string][]json.RawMessage + pollerIDToSince map[PollerID]string + incomingProcess chan struct{} + unblockProcess chan struct{} + updateSinceCalled chan struct{} } func (a *mockDataReceiver) Accumulate(ctx context.Context, userID, deviceID, roomID, prevBatch string, timeline []json.RawMessage) { @@ -486,6 +582,9 @@ func (a *mockDataReceiver) SetTyping(ctx context.Context, roomID string, ephEven } func (s *mockDataReceiver) UpdateDeviceSince(ctx context.Context, userID, deviceID, since string) { s.pollerIDToSince[PollerID{UserID: userID, DeviceID: deviceID}] = since + if s.updateSinceCalled != nil { + s.updateSinceCalled <- struct{}{} + } } func (s *mockDataReceiver) AddToDeviceMessages(ctx context.Context, userID, deviceID string, msgs []json.RawMessage) { } From 22f640a3527844f6596ad094294355ce64fcb200 Mon Sep 17 00:00:00 2001 From: Till Faelligen 
<2353100+S7evinK@users.noreply.github.com> Date: Wed, 19 Jul 2023 14:56:44 +0200 Subject: [PATCH 092/156] Check that calls to /sync use the expected since token --- sync2/poller_test.go | 52 ++++++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/sync2/poller_test.go b/sync2/poller_test.go index 13f8af8d..04c1c0f8 100644 --- a/sync2/poller_test.go +++ b/sync2/poller_test.go @@ -352,14 +352,18 @@ func TestPollerPollUpdateDeviceSincePeriodically(t *testing.T) { pid := PollerID{UserID: "@alice:localhost", DeviceID: "FOOBAR"} syncResponses := make(chan *SyncResponse, 1) + syncCalledWithSince := make(chan string) accumulator, client := newMocks(func(authHeader, since string) (*SyncResponse, int, error) { + if since != "" { + syncCalledWithSince <- since + } return <-syncResponses, 200, nil }) accumulator.updateSinceCalled = make(chan struct{}, 1) poller := newPoller(pid, "Authorization: hello world", client, accumulator, zerolog.New(os.Stderr), false) defer poller.Terminate() go func() { - poller.Poll("") + poller.Poll("0") }() hasPolledSuccessfully := make(chan struct{}) @@ -370,8 +374,9 @@ func TestPollerPollUpdateDeviceSincePeriodically(t *testing.T) { }() // 1. Initial poll updates the database - wantSince := "1" - syncResponses <- &SyncResponse{NextBatch: wantSince} + next := "1" + syncResponses <- &SyncResponse{NextBatch: next} + mustEqualSince(t, <-syncCalledWithSince, "0") select { case <-hasPolledSuccessfully: @@ -386,16 +391,16 @@ func TestPollerPollUpdateDeviceSincePeriodically(t *testing.T) { t.Fatalf("did not receive call to UpdateDeviceSince in time") } - if got := accumulator.pollerIDToSince[pid]; got != wantSince { - t.Fatalf("expected since to be updated to %s, but got %s", wantSince, got) + if got := accumulator.pollerIDToSince[pid]; got != next { + t.Fatalf("expected since to be updated to %s, but got %s", next, got) } + // The since token used by calls to doSyncV2 + wantSinceFromSync := next + // 2. Second request updates the state but NOT the database - next := "2" - syncResponses <- &SyncResponse{NextBatch: next} - if got := accumulator.pollerIDToSince[pid]; got != wantSince { - t.Fatalf("expected since to be updated to %s, but got %s", wantSince, got) - } + syncResponses <- &SyncResponse{NextBatch: "2"} + mustEqualSince(t, <-syncCalledWithSince, wantSinceFromSync) select { case <-accumulator.updateSinceCalled: @@ -403,38 +408,53 @@ func TestPollerPollUpdateDeviceSincePeriodically(t *testing.T) { case <-time.After(time.Millisecond * 100): } + if got := accumulator.pollerIDToSince[pid]; got != next { + t.Fatalf("expected since to be updated to %s, but got %s", next, got) + } + // 3. Sync response contains a toDevice message and should be stored in the database + wantSinceFromSync = "2" next = "3" - wantSince = "3" syncResponses <- &SyncResponse{ NextBatch: next, ToDevice: EventsResponse{Events: []json.RawMessage{{}}}, } + mustEqualSince(t, <-syncCalledWithSince, wantSinceFromSync) + select { case <-accumulator.updateSinceCalled: case <-time.After(time.Millisecond * 100): t.Fatalf("did not receive call to UpdateDeviceSince in time") } - if got := accumulator.pollerIDToSince[pid]; got != wantSince { - t.Fatalf("expected since to be updated to %s, but got %s", wantSince, got) + if got := accumulator.pollerIDToSince[pid]; got != next { + t.Fatalf("expected since to be updated to %s, but got %s", wantSinceFromSync, got) } + wantSinceFromSync = next // 4. ... 
some time has passed, this triggers the 1min limit timeSince = func(d time.Time) time.Duration { return time.Minute * 2 } next = "10" - wantSince = "10" syncResponses <- &SyncResponse{NextBatch: next} + mustEqualSince(t, <-syncCalledWithSince, wantSinceFromSync) + select { case <-accumulator.updateSinceCalled: case <-time.After(time.Millisecond * 100): t.Fatalf("did not receive call to UpdateDeviceSince in time") } - if got := accumulator.pollerIDToSince[pid]; got != wantSince { - t.Fatalf("expected since to be updated to %s, but got %s", wantSince, got) + if got := accumulator.pollerIDToSince[pid]; got != next { + t.Fatalf("expected since to be updated to %s, but got %s", wantSinceFromSync, got) + } +} + +func mustEqualSince(t *testing.T, gotSince, expectedSince string) { + t.Helper() + if gotSince != expectedSince { + t.Fatalf("client.DoSyncV2 using unexpected since token: %s, want %s", gotSince, expectedSince) } } From ae29d14c6fdfba85f9163af811e1941be40747b0 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Wed, 19 Jul 2023 15:56:43 +0100 Subject: [PATCH 093/156] Remove unused code --- state/accumulator.go | 22 ------- state/accumulator_test.go | 53 ----------------- state/event_table.go | 8 --- state/event_table_test.go | 119 -------------------------------------- 4 files changed, 202 deletions(-) diff --git a/state/accumulator.go b/state/accumulator.go index acab4555..7e29badb 100644 --- a/state/accumulator.go +++ b/state/accumulator.go @@ -483,25 +483,3 @@ func (a *Accumulator) filterAndParseTimelineEvents(txn *sqlx.Tx, roomID string, // A is seen event s[A,B,C] => s[0+1:] => [B,C] return dedupedEvents[seenIndex+1:], nil } - -// Delta returns a list of events of at most `limit` for the room not including `lastEventNID`. -// Returns the latest NID of the last event (most recent) -func (a *Accumulator) Delta(roomID string, lastEventNID int64, limit int) (eventsJSON []json.RawMessage, latest int64, err error) { - txn, err := a.db.Beginx() - if err != nil { - return nil, 0, err - } - defer txn.Commit() - events, err := a.eventsTable.SelectEventsBetween(txn, roomID, lastEventNID, EventsEnd, limit) - if err != nil { - return nil, 0, err - } - if len(events) == 0 { - return nil, lastEventNID, nil - } - eventsJSON = make([]json.RawMessage, len(events)) - for i := range events { - eventsJSON[i] = events[i].JSON - } - return eventsJSON, int64(events[len(events)-1].NID), nil -} diff --git a/state/accumulator_test.go b/state/accumulator_test.go index 64ee6c86..0e546ae5 100644 --- a/state/accumulator_test.go +++ b/state/accumulator_test.go @@ -200,59 +200,6 @@ func TestAccumulatorAccumulate(t *testing.T) { } } -func TestAccumulatorDelta(t *testing.T) { - roomID := "!TestAccumulatorDelta:localhost" - db, close := connectToDB(t) - defer close() - accumulator := NewAccumulator(db) - _, err := accumulator.Initialise(roomID, nil) - if err != nil { - t.Fatalf("failed to Initialise accumulator: %s", err) - } - roomEvents := []json.RawMessage{ - []byte(`{"event_id":"aD", "type":"m.room.create", "state_key":"", "content":{"creator":"@TestAccumulatorDelta:localhost"}}`), - []byte(`{"event_id":"aE", "type":"m.room.member", "state_key":"@TestAccumulatorDelta:localhost", "content":{"membership":"join"}}`), - []byte(`{"event_id":"aF", "type":"m.room.join_rules", "state_key":"", "content":{"join_rule":"public"}}`), - []byte(`{"event_id":"aG", "type":"m.room.message","content":{"body":"Hello World","msgtype":"m.text"}}`), - []byte(`{"event_id":"aH", "type":"m.room.join_rules", "state_key":"", 
"content":{"join_rule":"public"}}`), - []byte(`{"event_id":"aI", "type":"m.room.history_visibility", "state_key":"", "content":{"visibility":"public"}}`), - } - err = sqlutil.WithTransaction(accumulator.db, func(txn *sqlx.Tx) error { - _, _, err = accumulator.Accumulate(txn, roomID, "", roomEvents) - return err - }) - if err != nil { - t.Fatalf("failed to Accumulate: %s", err) - } - - // Draw the create event, tests limits - events, position, err := accumulator.Delta(roomID, EventsStart, 1) - if err != nil { - t.Fatalf("failed to Delta: %s", err) - } - if len(events) != 1 { - t.Fatalf("failed to get events from Delta, got %d want 1", len(events)) - } - if gjson.GetBytes(events[0], "event_id").Str != gjson.GetBytes(roomEvents[0], "event_id").Str { - t.Fatalf("failed to draw first event, got %s want %s", string(events[0]), string(roomEvents[0])) - } - if position == 0 { - t.Errorf("Delta returned zero position") - } - - // Draw up to the end - events, position, err = accumulator.Delta(roomID, position, 1000) - if err != nil { - t.Fatalf("failed to Delta: %s", err) - } - if len(events) != len(roomEvents)-1 { - t.Fatalf("failed to get events from Delta, got %d want %d", len(events), len(roomEvents)-1) - } - if position == 0 { - t.Errorf("Delta returned zero position") - } -} - func TestAccumulatorMembershipLogs(t *testing.T) { roomID := "!TestAccumulatorMembershipLogs:localhost" db, close := connectToDB(t) diff --git a/state/event_table.go b/state/event_table.go index 578afbf5..46f43df4 100644 --- a/state/event_table.go +++ b/state/event_table.go @@ -336,14 +336,6 @@ func (t *EventTable) LatestEventNIDInRooms(txn *sqlx.Tx, roomIDs []string, highe return } -func (t *EventTable) SelectEventsBetween(txn *sqlx.Tx, roomID string, lowerExclusive, upperInclusive int64, limit int) ([]Event, error) { - var events []Event - err := txn.Select(&events, `SELECT event_nid, event FROM syncv3_events WHERE event_nid > $1 AND event_nid <= $2 AND room_id = $3 ORDER BY event_nid ASC LIMIT $4`, - lowerExclusive, upperInclusive, roomID, limit, - ) - return events, err -} - func (t *EventTable) SelectLatestEventsBetween(txn *sqlx.Tx, roomID string, lowerExclusive, upperInclusive int64, limit int) ([]Event, error) { var events []Event // do not pull in events which were in the v2 state block diff --git a/state/event_table_test.go b/state/event_table_test.go index c015b2b2..db4bab36 100644 --- a/state/event_table_test.go +++ b/state/event_table_test.go @@ -297,125 +297,6 @@ func TestEventTableDupeInsert(t *testing.T) { } } -func TestEventTableSelectEventsBetween(t *testing.T) { - db, close := connectToDB(t) - defer close() - txn, err := db.Beginx() - if err != nil { - t.Fatalf("failed to start txn: %s", err) - } - table := NewEventTable(db) - searchRoomID := "!0TestEventTableSelectEventsBetween:localhost" - eventIDs := []string{ - "100TestEventTableSelectEventsBetween", - "101TestEventTableSelectEventsBetween", - "102TestEventTableSelectEventsBetween", - "103TestEventTableSelectEventsBetween", - "104TestEventTableSelectEventsBetween", - } - events := []Event{ - { - JSON: []byte(`{"event_id":"` + eventIDs[0] + `","type": "T1", "state_key":"S1", "room_id":"` + searchRoomID + `"}`), - }, - { - JSON: []byte(`{"event_id":"` + eventIDs[1] + `","type": "T2", "state_key":"S2", "room_id":"` + searchRoomID + `"}`), - }, - { - JSON: []byte(`{"event_id":"` + eventIDs[2] + `","type": "T3", "state_key":"", "room_id":"` + searchRoomID + `"}`), - }, - { - // different room - JSON: []byte(`{"event_id":"` + eventIDs[3] + `","type": 
"T4", "state_key":"", "room_id":"!1TestEventTableSelectEventsBetween:localhost"}`), - }, - { - JSON: []byte(`{"event_id":"` + eventIDs[4] + `","type": "T5", "state_key":"", "room_id":"` + searchRoomID + `"}`), - }, - } - idToNID, err := table.Insert(txn, events, true) - if err != nil { - t.Fatalf("Insert failed: %s", err) - } - if len(idToNID) != len(events) { - t.Fatalf("failed to insert events: got %d want %d", len(idToNID), len(events)) - } - txn.Commit() - - t.Run("subgroup", func(t *testing.T) { - t.Run("selecting multiple events known lower bound", func(t *testing.T) { - t.Parallel() - txn2, err := db.Beginx() - if err != nil { - t.Fatalf("failed to start txn: %s", err) - } - defer txn2.Rollback() - events, err := table.SelectByIDs(txn2, true, []string{eventIDs[0]}) - if err != nil || len(events) == 0 { - t.Fatalf("failed to extract event for lower bound: %s", err) - } - events, err = table.SelectEventsBetween(txn2, searchRoomID, int64(events[0].NID), EventsEnd, 1000) - if err != nil { - t.Fatalf("failed to SelectEventsBetween: %s", err) - } - // 3 as 1 is from a different room - if len(events) != 3 { - t.Fatalf("wanted 3 events, got %d", len(events)) - } - }) - t.Run("selecting multiple events known lower and upper bound", func(t *testing.T) { - t.Parallel() - txn3, err := db.Beginx() - if err != nil { - t.Fatalf("failed to start txn: %s", err) - } - defer txn3.Rollback() - events, err := table.SelectByIDs(txn3, true, []string{eventIDs[0], eventIDs[2]}) - if err != nil || len(events) == 0 { - t.Fatalf("failed to extract event for lower/upper bound: %s", err) - } - events, err = table.SelectEventsBetween(txn3, searchRoomID, int64(events[0].NID), int64(events[1].NID), 1000) - if err != nil { - t.Fatalf("failed to SelectEventsBetween: %s", err) - } - // eventIDs[1] and eventIDs[2] - if len(events) != 2 { - t.Fatalf("wanted 2 events, got %d", len(events)) - } - }) - t.Run("selecting multiple events unknown bounds (all events)", func(t *testing.T) { - t.Parallel() - txn4, err := db.Beginx() - if err != nil { - t.Fatalf("failed to start txn: %s", err) - } - defer txn4.Rollback() - gotEvents, err := table.SelectEventsBetween(txn4, searchRoomID, EventsStart, EventsEnd, 1000) - if err != nil { - t.Fatalf("failed to SelectEventsBetween: %s", err) - } - // one less as one event is for a different room - if len(gotEvents) != (len(events) - 1) { - t.Fatalf("wanted %d events, got %d", len(events)-1, len(gotEvents)) - } - }) - t.Run("selecting multiple events hitting the limit", func(t *testing.T) { - t.Parallel() - txn5, err := db.Beginx() - if err != nil { - t.Fatalf("failed to start txn: %s", err) - } - defer txn5.Rollback() - limit := 2 - gotEvents, err := table.SelectEventsBetween(txn5, searchRoomID, EventsStart, EventsEnd, limit) - if err != nil { - t.Fatalf("failed to SelectEventsBetween: %s", err) - } - if len(gotEvents) != limit { - t.Fatalf("wanted %d events, got %d", limit, len(gotEvents)) - } - }) - }) -} - func TestEventTableMembershipDetection(t *testing.T) { db, close := connectToDB(t) defer close() From 019661eb7611a846c47a4b65805b461c069b05ae Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Wed, 19 Jul 2023 18:23:09 +0100 Subject: [PATCH 094/156] Calculate heroes from the returned joined/invited members --- state/storage.go | 106 +++++++++++++++++++++++++----------------- state/storage_test.go | 50 ++++++++++++++++++++ 2 files changed, 114 insertions(+), 42 deletions(-) diff --git a/state/storage.go b/state/storage.go index bdff086e..8248049b 100644 --- a/state/storage.go +++ 
b/state/storage.go @@ -235,45 +235,6 @@ func (s *Storage) MetadataForAllRooms(txn *sqlx.Tx, tempTableName string, result result[roomID] = metadata } - // Select the most recent members for each room to serve as Heroes. The spec is ambiguous here: - // "This should be the first 5 members of the room, ordered by stream ordering, which are joined or invited." - // Unclear if this is the first 5 *most recent* (backwards) or forwards. For now we'll use the most recent - // ones, and select 6 of them so we can always use 5 no matter who is requesting the room name. - rows, err := txn.Query(` - SELECT rf.* FROM ( - SELECT room_id, event, rank() OVER ( - PARTITION BY room_id ORDER BY event_nid DESC - ) FROM syncv3_events INNER JOIN ` + tempTableName + ` ON membership_nid=event_nid WHERE ( - membership='join' OR membership='invite' OR membership='_join' - ) AND event_type='m.room.member' - ) rf WHERE rank <= 6;`) - if err != nil { - return fmt.Errorf("failed to query heroes: %s", err) - } - defer rows.Close() - seen := map[string]bool{} - for rows.Next() { - var roomID string - var event json.RawMessage - var rank int - if err := rows.Scan(&roomID, &event, &rank); err != nil { - return err - } - ev := gjson.ParseBytes(event) - targetUser := ev.Get("state_key").Str - key := roomID + " " + targetUser - if seen[key] { - continue - } - seen[key] = true - metadata := loadMetadata(roomID) - metadata.Heroes = append(metadata.Heroes, internal.Hero{ - ID: targetUser, - Name: ev.Get("content.displayname").Str, - Avatar: ev.Get("content.avatar_url").Str, - }) - result[roomID] = metadata - } roomInfos, err := s.Accumulator.roomsTable.SelectRoomInfos(txn) if err != nil { return fmt.Errorf("failed to select room infos: %s", err) @@ -803,9 +764,14 @@ func (s *Storage) RoomMembershipDelta(roomID string, from, to int64, limit int) } // Extract all rooms with joined members, and include the joined user list. Requires a prepared snapshot in order to be called. +// Populates the join/invite count and heroes for the returned metadata. func (s *Storage) AllJoinedMembers(txn *sqlx.Tx, tempTableName string) (joinedMembers map[string][]string, metadata map[string]internal.RoomMetadata, err error) { + // Select the most recent members for each room to serve as Heroes. The spec is ambiguous here: + // "This should be the first 5 members of the room, ordered by stream ordering, which are joined or invited." + // Unclear if this is the first 5 *most recent* (backwards) or forwards. For now we'll use the most recent + // ones, and select 6 of them so we can always use 5 no matter who is requesting the room name. 
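+	// (This replaces the rank() window-function query deleted from
+	// MetadataForAllRooms above: hero candidates are now collected in Go, via
+	// a small ring buffer of membership event NIDs per room, as the
+	// memberships are scanned below.)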
rows, err := txn.Query( - `SELECT room_id, state_key, membership from ` + tempTableName + ` INNER JOIN syncv3_events + `SELECT membership_nid, room_id, state_key, membership from ` + tempTableName + ` INNER JOIN syncv3_events on membership_nid = event_nid WHERE membership='join' OR membership='_join' OR membership='invite' OR membership='_invite' ORDER BY event_nid ASC`, ) if err != nil { @@ -813,14 +779,21 @@ func (s *Storage) AllJoinedMembers(txn *sqlx.Tx, tempTableName string) (joinedMe } defer rows.Close() joinedMembers = make(map[string][]string) - var roomID string inviteCounts := make(map[string]int) + heroNIDs := make(map[string]*circularSlice) var stateKey string var membership string + var roomID string + var nid int64 for rows.Next() { - if err := rows.Scan(&roomID, &stateKey, &membership); err != nil { + if err := rows.Scan(&nid, &roomID, &stateKey, &membership); err != nil { return nil, nil, err } + heroes := heroNIDs[roomID] + if heroes == nil { + heroes = &circularSlice{max: 5} + heroNIDs[roomID] = heroes + } switch membership { case "join": fallthrough @@ -828,17 +801,42 @@ func (s *Storage) AllJoinedMembers(txn *sqlx.Tx, tempTableName string) (joinedMe users := joinedMembers[roomID] users = append(users, stateKey) joinedMembers[roomID] = users + heroes.append(nid) case "invite": fallthrough case "_invite": inviteCounts[roomID] = inviteCounts[roomID] + 1 + heroes.append(nid) } } + + // now select the membership events for the heroes + var allHeroNIDs []int64 + for _, nids := range heroNIDs { + allHeroNIDs = append(allHeroNIDs, nids.vals...) + } + heroEvents, err := s.EventsTable.SelectByNIDs(txn, true, allHeroNIDs) + if err != nil { + return nil, nil, err + } + heroes := make(map[string][]internal.Hero) + for _, ev := range heroEvents { + evJSON := gjson.ParseBytes(ev.JSON) + roomHeroes := heroes[ev.RoomID] + roomHeroes = append(roomHeroes, internal.Hero{ + ID: ev.StateKey, + Name: evJSON.Get("content.displayname").Str, + Avatar: evJSON.Get("content.avatar_url").Str, + }) + heroes[ev.RoomID] = roomHeroes + } + metadata = make(map[string]internal.RoomMetadata) for roomID, members := range joinedMembers { m := internal.NewRoomMetadata(roomID) m.JoinCount = len(members) m.InviteCount = inviteCounts[roomID] + m.Heroes = heroes[roomID] metadata[roomID] = *m } return joinedMembers, metadata, nil @@ -938,3 +936,27 @@ func (s *Storage) Teardown() { panic("Storage.Teardown: " + err.Error()) } } + +// circularSlice is a slice which can be appended to which will wraparound at `max`. +// Mostly useful for lazily calculating heroes. The values returned aren't sorted. 
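+// For example, with max=3, appending 1, 2, 3, 4, 5 one value at a time leaves
+// vals=[4, 5, 3]: once the slice is full, the oldest entry is overwritten in
+// place and the insertion index wraps back to the start.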
+type circularSlice struct { + i int + vals []int64 + max int +} + +func (s *circularSlice) append(val int64) { + if len(s.vals) < s.max { + // populate up to max + s.vals = append(s.vals, val) + s.i++ + return + } + // wraparound + if s.i == s.max { + s.i = 0 + } + // replace this entry + s.vals[s.i] = val + s.i++ +} diff --git a/state/storage_test.go b/state/storage_test.go index c5eaa192..f5ffeec6 100644 --- a/state/storage_test.go +++ b/state/storage_test.go @@ -822,6 +822,56 @@ func TestAllJoinedMembers(t *testing.T) { } } +func TestCircularSlice(t *testing.T) { + testCases := []struct { + name string + max int + appends []int64 + want []int64 // these get sorted in the test + }{ + { + name: "wraparound", + max: 5, + appends: []int64{9, 8, 7, 6, 5, 4, 3, 2}, + want: []int64{2, 3, 4, 5, 6}, + }, + { + name: "exact", + max: 5, + appends: []int64{9, 8, 7, 6, 5}, + want: []int64{5, 6, 7, 8, 9}, + }, + { + name: "unfilled", + max: 5, + appends: []int64{9, 8, 7}, + want: []int64{7, 8, 9}, + }, + { + name: "wraparound x2", + max: 5, + appends: []int64{9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 10}, + want: []int64{0, 1, 2, 3, 10}, + }, + } + for _, tc := range testCases { + cs := &circularSlice{ + max: tc.max, + } + for _, val := range tc.appends { + cs.append(val) + } + sort.Slice(cs.vals, func(i, j int) bool { + return cs.vals[i] < cs.vals[j] + }) + if !reflect.DeepEqual(cs.vals, tc.want) { + t.Errorf("%s: got %v want %v", tc.name, cs.vals, tc.want) + } + + } + +} + func cleanDB(t *testing.T) error { // make a fresh DB which is unpolluted from other tests db, close := connectToDB(t) From a5c11f33a22ce758820706fa081f862709d4006b Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Wed, 19 Jul 2023 18:29:02 +0100 Subject: [PATCH 095/156] Bugfixes to ensure identical hero slices to before --- state/storage.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/state/storage.go b/state/storage.go index 8248049b..d0cb63bd 100644 --- a/state/storage.go +++ b/state/storage.go @@ -791,7 +791,7 @@ func (s *Storage) AllJoinedMembers(txn *sqlx.Tx, tempTableName string) (joinedMe } heroes := heroNIDs[roomID] if heroes == nil { - heroes = &circularSlice{max: 5} + heroes = &circularSlice{max: 6} heroNIDs[roomID] = heroes } switch membership { @@ -820,7 +820,9 @@ func (s *Storage) AllJoinedMembers(txn *sqlx.Tx, tempTableName string) (joinedMe return nil, nil, err } heroes := make(map[string][]internal.Hero) - for _, ev := range heroEvents { + // loop backwards so the most recent hero is first in the hero list + for i := len(heroEvents) - 1; i >= 0; i-- { + ev := heroEvents[i] evJSON := gjson.ParseBytes(ev.JSON) roomHeroes := heroes[ev.RoomID] roomHeroes = append(roomHeroes, internal.Hero{ From 7dc999a44e6b2f38104c8af6e828368e547cdd60 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 24 Jul 2023 14:17:10 +0100 Subject: [PATCH 096/156] Add more metrics around connection buffers --- sync3/connmap.go | 64 ++++++++++++++++++++++++++++++++++++---- sync3/handler/handler.go | 24 ++------------- 2 files changed, 62 insertions(+), 26 deletions(-) diff --git a/sync3/connmap.go b/sync3/connmap.go index ebe24195..f37907b5 100644 --- a/sync3/connmap.go +++ b/sync3/connmap.go @@ -5,6 +5,7 @@ import ( "time" "github.com/ReneKroon/ttlcache/v2" + "github.com/prometheus/client_golang/prometheus" ) // ConnMap stores a collection of Conns. 
@@ -15,10 +16,15 @@ type ConnMap struct { userIDToConn map[string][]*Conn connIDToConn map[string]*Conn + numConns prometheus.Gauge + // counters for reasons why connections have expired + expiryTimedOutCounter prometheus.Counter + expiryBufferFullCounter prometheus.Counter + mu *sync.Mutex } -func NewConnMap() *ConnMap { +func NewConnMap(enablePrometheus bool) *ConnMap { cm := &ConnMap{ userIDToConn: make(map[string][]*Conn), connIDToConn: make(map[string]*Conn), @@ -27,17 +33,61 @@ func NewConnMap() *ConnMap { } cm.cache.SetTTL(30 * time.Minute) // TODO: customisable cm.cache.SetExpirationCallback(cm.closeConnExpires) + + if enablePrometheus { + cm.expiryTimedOutCounter = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "sliding_sync", + Subsystem: "api", + Name: "expiry_conn_timed_out", + Help: "Counter of expired API connections due to reaching TTL limit", + }) + prometheus.MustRegister(cm.expiryTimedOutCounter) + cm.expiryBufferFullCounter = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "sliding_sync", + Subsystem: "api", + Name: "expiry_conn_buffer_full", + Help: "Counter of expired API connections due to reaching buffer update limit", + }) + prometheus.MustRegister(cm.expiryBufferFullCounter) + cm.numConns = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: "sliding_sync", + Subsystem: "api", + Name: "num_active_conns", + Help: "Number of active sliding sync connections.", + }) + prometheus.MustRegister(cm.numConns) + } return cm } func (m *ConnMap) Teardown() { m.cache.Close() + + if m.numConns != nil { + prometheus.Unregister(m.numConns) + } + if m.expiryBufferFullCounter != nil { + prometheus.Unregister(m.expiryBufferFullCounter) + } + if m.expiryTimedOutCounter != nil { + prometheus.Unregister(m.expiryTimedOutCounter) + } } -func (m *ConnMap) Len() int { +// UpdateMetrics recalculates the number of active connections. Do this when you think there is a change. +func (m *ConnMap) UpdateMetrics() { m.mu.Lock() defer m.mu.Unlock() - return len(m.connIDToConn) + m.updateMetrics(len(m.connIDToConn)) +} + +// updateMetrics is like UpdateMetrics but doesn't touch connIDToConn and hence need a lock. We use this internally +// when we need to update the metric and already have the lock held, as calling UpdateMetrics would deadlock. 
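+// (Go's sync.Mutex is not re-entrant: a goroutine that holds mu and tries to
+// lock it again blocks forever, which is why the lock-free variant exists.)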
+func (m *ConnMap) updateMetrics(numConns int) { + if m.numConns == nil { + return + } + m.numConns.Set(float64(numConns)) } // Conns return all connections for this user|device @@ -64,8 +114,9 @@ func (m *ConnMap) Conn(cid ConnID) *Conn { return conn } // e.g buffer exceeded, close it and remove it from the cache - logger.Trace().Str("conn", cid.String()).Msg("closing connection due to dead connection (buffer full)") + logger.Info().Str("conn", cid.String()).Msg("closing connection due to dead connection (buffer full)") m.closeConn(conn) + m.expiryBufferFullCounter.Inc() return nil } @@ -92,6 +143,7 @@ func (m *ConnMap) CreateConn(cid ConnID, newConnHandler func() ConnHandler) (*Co m.cache.Set(cid.String(), conn) m.connIDToConn[cid.String()] = conn m.userIDToConn[cid.UserID] = append(m.userIDToConn[cid.UserID], conn) + m.updateMetrics(len(m.connIDToConn)) return conn, true } @@ -121,7 +173,8 @@ func (m *ConnMap) closeConnExpires(connID string, value interface{}) { m.mu.Lock() defer m.mu.Unlock() conn := value.(*Conn) - logger.Trace().Str("conn", connID).Msg("closing connection due to expired TTL in cache") + logger.Info().Str("conn", connID).Msg("closing connection due to expired TTL in cache") + m.expiryTimedOutCounter.Inc() m.closeConn(conn) } @@ -147,4 +200,5 @@ func (m *ConnMap) closeConn(conn *Conn) { m.userIDToConn[conn.UserID] = conns // remove user cache listeners etc h.Destroy() + m.updateMetrics(len(m.connIDToConn)) } diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index 84c461c3..c02c5a29 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -59,7 +59,6 @@ type SyncLiveHandler struct { GlobalCache *caches.GlobalCache maxPendingEventUpdates int - numConns prometheus.Gauge setupHistVec *prometheus.HistogramVec histVec *prometheus.HistogramVec slowReqs prometheus.Counter @@ -74,7 +73,7 @@ func NewSync3Handler( V2: v2Client, Storage: store, V2Store: storev2, - ConnMap: sync3.NewConnMap(), + ConnMap: sync3.NewConnMap(enablePrometheus), userCaches: &sync.Map{}, Dispatcher: sync3.NewDispatcher(), GlobalCache: caches.NewGlobalCache(store), @@ -128,9 +127,6 @@ func (h *SyncLiveHandler) Teardown() { h.V2Sub.Teardown() h.EnsurePoller.Teardown() h.ConnMap.Teardown() - if h.numConns != nil { - prometheus.Unregister(h.numConns) - } if h.setupHistVec != nil { prometheus.Unregister(h.setupHistVec) } @@ -142,20 +138,7 @@ func (h *SyncLiveHandler) Teardown() { } } -func (h *SyncLiveHandler) updateMetrics() { - if h.numConns == nil { - return - } - h.numConns.Set(float64(h.ConnMap.Len())) -} - func (h *SyncLiveHandler) addPrometheusMetrics() { - h.numConns = prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: "sliding_sync", - Subsystem: "api", - Name: "num_active_conns", - Help: "Number of active sliding sync connections.", - }) h.setupHistVec = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "sliding_sync", Subsystem: "api", @@ -176,7 +159,6 @@ func (h *SyncLiveHandler) addPrometheusMetrics() { Name: "slow_requests", Help: "Counter of slow (>=50s) requests, initial or otherwise.", }) - prometheus.MustRegister(h.numConns) prometheus.MustRegister(h.setupHistVec) prometheus.MustRegister(h.histVec) prometheus.MustRegister(h.slowReqs) @@ -398,7 +380,7 @@ func (h *SyncLiveHandler) setupConnection(req *http.Request, syncReq *sync3.Requ } pid := sync2.PollerID{UserID: token.UserID, DeviceID: token.DeviceID} - log.Trace().Any("pid", pid).Msg("checking poller exists and is running") + log.Trace().Any("pid", pid).Msg("checking poller exists and is 
running") h.EnsurePoller.EnsurePolling(req.Context(), pid, token.AccessTokenHash) log.Trace().Msg("poller exists and is running") // this may take a while so if the client has given up (e.g timed out) by this point, just stop. @@ -421,7 +403,7 @@ func (h *SyncLiveHandler) setupConnection(req *http.Request, syncReq *sync3.Requ } // once we have the conn, make sure our metrics are correct - defer h.updateMetrics() + defer h.ConnMap.UpdateMetrics() // Now the v2 side of things are running, we can make a v3 live sync conn // NB: this isn't inherently racey (we did the check for an existing conn before EnsurePolling) From 353630720405ffb6fb8afbf91137a027dd494a08 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 24 Jul 2023 14:42:02 +0100 Subject: [PATCH 097/156] Track buffer sizes in summary line --- internal/context.go | 15 +++++++++++++++ sync3/handler/connstate_live.go | 4 ++++ 2 files changed, 19 insertions(+) diff --git a/internal/context.go b/internal/context.go index 18480626..bfb87b6c 100644 --- a/internal/context.go +++ b/internal/context.go @@ -2,6 +2,8 @@ package internal import ( "context" + "fmt" + "github.com/getsentry/sentry-go" "github.com/rs/zerolog" @@ -17,6 +19,7 @@ var ( type data struct { userID string deviceID string + bufferSummary string since int64 next int64 numRooms int @@ -53,6 +56,15 @@ func SetRequestContextUserID(ctx context.Context, userID, deviceID string) { } } +func SetConnBufferInfo(ctx context.Context, bufferLen, nextLen, bufferCap int) { + d := ctx.Value(ctxData) + if d == nil { + return + } + da := d.(*data) + da.bufferSummary = fmt.Sprintf("%d/%d/%d", bufferLen, nextLen, bufferCap) +} + func SetRequestContextResponseInfo( ctx context.Context, since, next int64, numRooms int, txnID string, numToDeviceEvents, numGlobalAccountData int, numChangedDevices, numLeftDevices int, @@ -108,5 +120,8 @@ func DecorateLogger(ctx context.Context, l *zerolog.Event) *zerolog.Event { if da.numLeftDevices > 0 { l = l.Int("dl-l", da.numLeftDevices) } + if da.bufferSummary != "" { + l = l.Str("b", da.bufferSummary) + } return l } diff --git a/sync3/handler/connstate_live.go b/sync3/handler/connstate_live.go index d5b0975d..012fb670 100644 --- a/sync3/handler/connstate_live.go +++ b/sync3/handler/connstate_live.go @@ -57,6 +57,7 @@ func (s *connStateLive) liveUpdate( if req.TimeoutMSecs() < 100 { req.SetTimeoutMSecs(100) } + startBufferSize := len(s.updates) // block until we get a new event, with appropriate timeout startTime := time.Now() hasLiveStreamed := false @@ -104,6 +105,9 @@ func (s *connStateLive) liveUpdate( } log.Trace().Bool("live_streamed", hasLiveStreamed).Msg("liveUpdate: returning") + + internal.SetConnBufferInfo(ctx, startBufferSize, len(s.updates), cap(s.updates)) + // TODO: op consolidation } From f0ed4969a6a6458307bd250cfb8c63595b556099 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 24 Jul 2023 14:43:31 +0100 Subject: [PATCH 098/156] nil checks --- sync3/connmap.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sync3/connmap.go b/sync3/connmap.go index f37907b5..fccc867e 100644 --- a/sync3/connmap.go +++ b/sync3/connmap.go @@ -116,7 +116,9 @@ func (m *ConnMap) Conn(cid ConnID) *Conn { // e.g buffer exceeded, close it and remove it from the cache logger.Info().Str("conn", cid.String()).Msg("closing connection due to dead connection (buffer full)") m.closeConn(conn) - m.expiryBufferFullCounter.Inc() + if m.expiryBufferFullCounter != nil { + m.expiryBufferFullCounter.Inc() + } return nil } @@ -174,7 +176,9 @@ func (m 
*ConnMap) closeConnExpires(connID string, value interface{}) { defer m.mu.Unlock() conn := value.(*Conn) logger.Info().Str("conn", connID).Msg("closing connection due to expired TTL in cache") - m.expiryTimedOutCounter.Inc() + if m.expiryTimedOutCounter != nil { + m.expiryTimedOutCounter.Inc() + } m.closeConn(conn) } From a90a9584c9bfc0f5066a9f571c5c800c0be9dc65 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 24 Jul 2023 16:22:22 +0100 Subject: [PATCH 099/156] Comments --- sync3/connmap.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sync3/connmap.go b/sync3/connmap.go index fccc867e..02d2f3f9 100644 --- a/sync3/connmap.go +++ b/sync3/connmap.go @@ -81,7 +81,7 @@ func (m *ConnMap) UpdateMetrics() { m.updateMetrics(len(m.connIDToConn)) } -// updateMetrics is like UpdateMetrics but doesn't touch connIDToConn and hence need a lock. We use this internally +// updateMetrics is like UpdateMetrics but doesn't touch connIDToConn and hence doesn't need to lock. We use this internally // when we need to update the metric and already have the lock held, as calling UpdateMetrics would deadlock. func (m *ConnMap) updateMetrics(numConns int) { if m.numConns == nil { From d9d0609a51b620e3757204142ee4d4993d2d81d6 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 24 Jul 2023 17:46:30 +0100 Subject: [PATCH 100/156] Protect map against concurrent map r/w Comments did say to hold mu.. --- sync3/connmap.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sync3/connmap.go b/sync3/connmap.go index 02d2f3f9..0cae2748 100644 --- a/sync3/connmap.go +++ b/sync3/connmap.go @@ -115,7 +115,9 @@ func (m *ConnMap) Conn(cid ConnID) *Conn { } // e.g buffer exceeded, close it and remove it from the cache logger.Info().Str("conn", cid.String()).Msg("closing connection due to dead connection (buffer full)") + m.mu.Lock() m.closeConn(conn) + m.mu.Unlock() if m.expiryBufferFullCounter != nil { m.expiryBufferFullCounter.Inc() } From 6c83b3a75b98bb07b4d3d2ab1894cce2a88a9db5 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 24 Jul 2023 18:33:20 +0100 Subject: [PATCH 101/156] Adjust spam intervals --- sync3/conn.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sync3/conn.go b/sync3/conn.go index 8010447b..33fd87bb 100644 --- a/sync3/conn.go +++ b/sync3/conn.go @@ -14,7 +14,7 @@ import ( // The amount of time to artificially wait if the server detects spamming clients. This time will // be added to responses when the server detects the same request being sent over and over e.g // /sync?pos=5 then /sync?pos=5 over and over. Likewise /sync without a ?pos=. -var SpamProtectionInterval = time.Second +var SpamProtectionInterval = 10 * time.Millisecond type ConnID struct { UserID string From bfb980bad89acd1929a3e4c34f4fe8a3085b137a Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Tue, 25 Jul 2023 10:16:07 +0100 Subject: [PATCH 102/156] bugfix: fix deadlock when connections expire due to full buffers Caused by the fix in https://github.com/matrix-org/sliding-sync/pull/220 --- sync3/connmap.go | 12 +++++++++--- tests-integration/connection_test.go | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/sync3/connmap.go b/sync3/connmap.go index 0cae2748..df850a47 100644 --- a/sync3/connmap.go +++ b/sync3/connmap.go @@ -105,6 +105,14 @@ func (m *ConnMap) Conns(userID, deviceID string) []*Conn { // Conn returns a connection with this ConnID. Returns nil if no connection exists. 
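+// Conn acquires mu itself, so it must not be called by code that already holds
+// the lock; internal callers in that position should use getConn instead.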
func (m *ConnMap) Conn(cid ConnID) *Conn { + m.mu.Lock() + defer m.mu.Unlock() + return m.getConn(cid) +} + +// getConn returns a connection with this ConnID. Returns nil if no connection exists. Expires connections if the buffer is full. +// Must hold mu. +func (m *ConnMap) getConn(cid ConnID) *Conn { cint, _ := m.cache.Get(cid.String()) if cint == nil { return nil @@ -115,9 +123,7 @@ func (m *ConnMap) Conn(cid ConnID) *Conn { } // e.g buffer exceeded, close it and remove it from the cache logger.Info().Str("conn", cid.String()).Msg("closing connection due to dead connection (buffer full)") - m.mu.Lock() m.closeConn(conn) - m.mu.Unlock() if m.expiryBufferFullCounter != nil { m.expiryBufferFullCounter.Inc() } @@ -129,7 +135,7 @@ func (m *ConnMap) CreateConn(cid ConnID, newConnHandler func() ConnHandler) (*Co // atomically check if a conn exists already and nuke it if it exists m.mu.Lock() defer m.mu.Unlock() - conn := m.Conn(cid) + conn := m.getConn(cid) if conn != nil { // tear down this connection and fallthrough isSpamming := conn.lastPos <= 1 diff --git a/tests-integration/connection_test.go b/tests-integration/connection_test.go index 2bf3960f..a0ab7ae1 100644 --- a/tests-integration/connection_test.go +++ b/tests-integration/connection_test.go @@ -622,6 +622,20 @@ func TestSessionExpiryOnBufferFill(t *testing.T) { if gjson.ParseBytes(body).Get("errcode").Str != "M_UNKNOWN_POS" { t.Errorf("got %v want errcode=M_UNKNOWN_POS", string(body)) } + + // make sure we can sync from fresh (regression for when we deadlocked after this point) + res = v3.mustDoV3Request(t, aliceToken, sync3.Request{ + RoomSubscriptions: map[string]sync3.RoomSubscription{ + roomID: { + TimelineLimit: 1, + }, + }, + }) + m.MatchResponse(t, res, m.MatchRoomSubscriptionsStrict(map[string][]m.RoomMatcher{ + roomID: { + m.MatchJoinCount(1), + }, + })) } func TestExpiredAccessToken(t *testing.T) { From 6d49b6cabe331c4224dd7c5bf1456ff05eafc109 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Tue, 25 Jul 2023 14:18:58 +0100 Subject: [PATCH 103/156] Add malformed/unusual events test --- tests-integration/regressions_test.go | 52 +++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests-integration/regressions_test.go b/tests-integration/regressions_test.go index d24869ce..7ec68b72 100644 --- a/tests-integration/regressions_test.go +++ b/tests-integration/regressions_test.go @@ -110,3 +110,55 @@ func TestBackfillInviteDoesntCorruptState(t *testing.T) { }, )) } + +func TestMalformedEvents(t *testing.T) { + pqString := testutils.PrepareDBConnectionString() + // setup code + v2 := runTestV2Server(t) + v3 := runTestServer(t, v2, pqString) + defer v2.close() + defer v3.close() + + // unusual events ARE VALID EVENTS and should be sent to the client, but are unusual for some reason. + unusualEvents := []json.RawMessage{ + testutils.NewStateEvent(t, "", "", alice, map[string]interface{}{ + "empty string": "for event type", + }), + } + // malformed events are INVALID and should be ignored by the proxy. + malformedEvents := []json.RawMessage{ + testutils.NewStateEvent(t, "", "", alice, []string{"content", "as", "an", "array"}), + } + + room := roomEvents{ + roomID: "!TestMalformedEvents:localhost", + events: append(malformedEvents, unusualEvents...), + state: createRoomState(t, alice, time.Now()), + } + v2.addAccount(t, alice, aliceToken) + v2.queueResponse(alice, sync2.SyncResponse{ + Rooms: sync2.SyncRoomsResponse{ + Join: v2JoinTimeline(room), + }, + }) + + // alice syncs and should see the room. 
+ aliceRes := v3.mustDoV3Request(t, aliceToken, sync3.Request{ + Lists: map[string]sync3.RequestList{ + "a": { + Ranges: sync3.SliceRanges{{0, 20}}, + RoomSubscription: sync3.RoomSubscription{ + TimelineLimit: int64(len(unusualEvents)), + }, + }, + }, + }) + m.MatchResponse(t, aliceRes, m.MatchList("a", m.MatchV3Count(1), m.MatchV3Ops(m.MatchV3SyncOp(0, 0, []string{room.roomID}))), + m.MatchRoomSubscriptionsStrict(map[string][]m.RoomMatcher{ + room.roomID: { + m.MatchJoinCount(1), + m.MatchRoomTimeline(unusualEvents), + }, + })) + +} From d745c90d95cf9e79f2c02f860b13a6d2d58eccba Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Tue, 25 Jul 2023 14:25:30 +0100 Subject: [PATCH 104/156] Ignore malformed events But handle unusual events. With regression test. Fixes https://github.com/matrix-org/sliding-sync/issues/223 --- state/accumulator.go | 5 ++++- state/event_table.go | 2 +- tests-integration/regressions_test.go | 9 +++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/state/accumulator.go b/state/accumulator.go index 7e29badb..89f595c2 100644 --- a/state/accumulator.go +++ b/state/accumulator.go @@ -415,7 +415,10 @@ func (a *Accumulator) filterAndParseTimelineEvents(txn *sqlx.Tx, roomID string, RoomID: roomID, } if err := e.ensureFieldsSetOnEvent(); err != nil { - return nil, fmt.Errorf("event malformed: %s", err) + logger.Warn().Str("event_id", e.ID).Str("room_id", roomID).Err(err).Msg( + "Accumulator.filterAndParseTimelineEvents: failed to parse event, ignoring", + ) + continue } if _, ok := seenEvents[e.ID]; ok { logger.Warn().Str("event_id", e.ID).Str("room_id", roomID).Msg( diff --git a/state/event_table.go b/state/event_table.go index 46f43df4..d45bd3bc 100644 --- a/state/event_table.go +++ b/state/event_table.go @@ -57,7 +57,7 @@ func (ev *Event) ensureFieldsSetOnEvent() error { } if ev.Type == "" { typeResult := evJSON.Get("type") - if !typeResult.Exists() || typeResult.Str == "" { + if !typeResult.Exists() { // empty strings for 'type' are valid apparently return fmt.Errorf("event JSON missing type key") } ev.Type = typeResult.Str diff --git a/tests-integration/regressions_test.go b/tests-integration/regressions_test.go index 7ec68b72..f0afba5f 100644 --- a/tests-integration/regressions_test.go +++ b/tests-integration/regressions_test.go @@ -127,12 +127,17 @@ func TestMalformedEvents(t *testing.T) { } // malformed events are INVALID and should be ignored by the proxy. malformedEvents := []json.RawMessage{ - testutils.NewStateEvent(t, "", "", alice, []string{"content", "as", "an", "array"}), + json.RawMessage(`{}`), // empty object + json.RawMessage(`{"type":5}`), // type is an integer + json.RawMessage(`{"type":"foo","content":{},"event_id":""}`), // 0-length string as event ID + json.RawMessage(`{"type":"foo","content":{}}`), // missing event ID } room := roomEvents{ roomID: "!TestMalformedEvents:localhost", - events: append(malformedEvents, unusualEvents...), + // append malformed after unusual. All malformed events should be dropped, + // leaving only unusualEvents. 
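+	// (Judging by the malformed cases above, an event needs at least a
+	// non-empty string event_id and a string type to survive the filtering.)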
+ events: append(unusualEvents, malformedEvents...), state: createRoomState(t, alice, time.Now()), } v2.addAccount(t, alice, aliceToken) From 0fea507b65fec9c5af9f42f92344a32ef22d1af0 Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Tue, 25 Jul 2023 14:41:23 +0100 Subject: [PATCH 105/156] Add malformed tests for room state --- state/accumulator.go | 5 ++- state/event_table.go | 16 ++++--- tests-integration/regressions_test.go | 65 ++++++++++++++++++++++++++- 3 files changed, 76 insertions(+), 10 deletions(-) diff --git a/state/accumulator.go b/state/accumulator.go index 89f595c2..99471484 100644 --- a/state/accumulator.go +++ b/state/accumulator.go @@ -207,8 +207,9 @@ func (a *Accumulator) Initialise(roomID string, state []json.RawMessage) (Initia IsState: true, } } - if err := ensureFieldsSet(events); err != nil { - return fmt.Errorf("events malformed: %s", err) + events = filterAndEnsureFieldsSet(events) + if len(events) == 0 { + return fmt.Errorf("failed to insert events, all events were filtered out: %w", err) } eventIDToNID, err := a.eventsTable.Insert(txn, events, false) if err != nil { diff --git a/state/event_table.go b/state/event_table.go index d45bd3bc..1c360b1b 100644 --- a/state/event_table.go +++ b/state/event_table.go @@ -57,7 +57,7 @@ func (ev *Event) ensureFieldsSetOnEvent() error { } if ev.Type == "" { typeResult := evJSON.Get("type") - if !typeResult.Exists() { // empty strings for 'type' are valid apparently + if !typeResult.Exists() || typeResult.Type != gjson.String { // empty strings for 'type' are valid apparently return fmt.Errorf("event JSON missing type key") } ev.Type = typeResult.Str @@ -153,7 +153,7 @@ func (t *EventTable) SelectHighestNID() (highest int64, err error) { // we insert new events A and B in that order, then NID(A) < NID(B). func (t *EventTable) Insert(txn *sqlx.Tx, events []Event, checkFields bool) (map[string]int64, error) { if checkFields { - ensureFieldsSet(events) + events = filterAndEnsureFieldsSet(events) } result := make(map[string]int64) for i := range events { @@ -449,14 +449,18 @@ func (c EventChunker) Subslice(i, j int) sqlutil.Chunker { return c[i:j] } -func ensureFieldsSet(events []Event) error { +func filterAndEnsureFieldsSet(events []Event) []Event { + result := make([]Event, 0, len(events)) // ensure fields are set for i := range events { ev := events[i] if err := ev.ensureFieldsSetOnEvent(); err != nil { - return err + logger.Warn().Str("event_id", ev.ID).Err(err).Msg( + "filterAndEnsureFieldsSet: failed to parse event, ignoring", + ) + continue } - events[i] = ev + result = append(result, ev) } - return nil + return result } diff --git a/tests-integration/regressions_test.go b/tests-integration/regressions_test.go index f0afba5f..fc6e1984 100644 --- a/tests-integration/regressions_test.go +++ b/tests-integration/regressions_test.go @@ -111,7 +111,7 @@ func TestBackfillInviteDoesntCorruptState(t *testing.T) { )) } -func TestMalformedEvents(t *testing.T) { +func TestMalformedEventsTimeline(t *testing.T) { pqString := testutils.PrepareDBConnectionString() // setup code v2 := runTestV2Server(t) @@ -134,7 +134,7 @@ func TestMalformedEvents(t *testing.T) { } room := roomEvents{ - roomID: "!TestMalformedEvents:localhost", + roomID: "!TestMalformedEventsTimeline:localhost", // append malformed after unusual. All malformed events should be dropped, // leaving only unusualEvents. 
events: append(unusualEvents, malformedEvents...), @@ -165,5 +165,66 @@ func TestMalformedEvents(t *testing.T) { m.MatchRoomTimeline(unusualEvents), }, })) +} + +func TestMalformedEventsState(t *testing.T) { + pqString := testutils.PrepareDBConnectionString() + // setup code + v2 := runTestV2Server(t) + v3 := runTestServer(t, v2, pqString) + defer v2.close() + defer v3.close() + + // unusual events ARE VALID EVENTS and should be sent to the client, but are unusual for some reason. + unusualEvents := []json.RawMessage{ + testutils.NewStateEvent(t, "", "", alice, map[string]interface{}{ + "empty string": "for event type", + }), + } + // malformed events are INVALID and should be ignored by the proxy. + malformedEvents := []json.RawMessage{ + json.RawMessage(`{}`), // empty object + json.RawMessage(`{"type":5,"content":{},"event_id":"f","state_key":""}`), // type is an integer + json.RawMessage(`{"type":"foo","content":{},"event_id":"","state_key":""}`), // 0-length string as event ID + json.RawMessage(`{"type":"foo","content":{},"state_key":""}`), // missing event ID + } + latestEvent := testutils.NewEvent(t, "m.room.message", alice, map[string]interface{}{"body": "hi"}) + + room := roomEvents{ + roomID: "!TestMalformedEventsState:localhost", + events: []json.RawMessage{latestEvent}, + // append malformed after unusual. All malformed events should be dropped, + // leaving only unusualEvents. + state: append(createRoomState(t, alice, time.Now()), append(unusualEvents, malformedEvents...)...), + } + v2.addAccount(t, alice, aliceToken) + v2.queueResponse(alice, sync2.SyncResponse{ + Rooms: sync2.SyncRoomsResponse{ + Join: v2JoinTimeline(room), + }, + }) + + // alice syncs and should see the room. + aliceRes := v3.mustDoV3Request(t, aliceToken, sync3.Request{ + Lists: map[string]sync3.RequestList{ + "a": { + Ranges: sync3.SliceRanges{{0, 20}}, + RoomSubscription: sync3.RoomSubscription{ + TimelineLimit: int64(len(unusualEvents)), + RequiredState: [][2]string{{"", ""}}, + }, + }, + }, + }) + m.MatchResponse(t, aliceRes, m.MatchList("a", m.MatchV3Count(1), m.MatchV3Ops(m.MatchV3SyncOp(0, 0, []string{room.roomID}))), + m.MatchRoomSubscriptionsStrict(map[string][]m.RoomMatcher{ + room.roomID: { + m.MatchJoinCount(1), + m.MatchRoomTimeline([]json.RawMessage{latestEvent}), + m.MatchRoomRequiredState([]json.RawMessage{ + unusualEvents[0], + }), + }, + })) } From a8253759c71bbbe2bf4749b8d075987ee70d5265 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Thu, 8 Jun 2023 14:23:32 +0100 Subject: [PATCH 106/156] Reproduce the problem --- tests-integration/timeline_test.go | 110 +++++++++++++++++++++++++++++ testutils/m/match.go | 9 +++ 2 files changed, 119 insertions(+) diff --git a/tests-integration/timeline_test.go b/tests-integration/timeline_test.go index cb188d56..7b3c8a3c 100644 --- a/tests-integration/timeline_test.go +++ b/tests-integration/timeline_test.go @@ -689,6 +689,116 @@ func TestTimelineTxnID(t *testing.T) { )) } +// Like TestTimelineTxnID, but where ... 
+func TestTimelineTxnIDAfterInitialSync(t *testing.T) { + pqString := testutils.PrepareDBConnectionString() + // setup code + v2 := runTestV2Server(t) + v3 := runTestServer(t, v2, pqString) + defer v2.close() + defer v3.close() + roomID := "!a:localhost" + latestTimestamp := time.Now() + t.Log("Alice and Bob are in the same room") + room := roomEvents{ + roomID: roomID, + events: append( + createRoomState(t, alice, latestTimestamp), + testutils.NewJoinEvent(t, bob), + ), + } + v2.addAccount(t, alice, aliceToken) + v2.addAccount(t, bob, bobToken) + v2.queueResponse(alice, sync2.SyncResponse{ + Rooms: sync2.SyncRoomsResponse{ + Join: v2JoinTimeline(room), + }, + NextBatch: "alice_after_initial_poll", + }) + v2.queueResponse(bob, sync2.SyncResponse{ + Rooms: sync2.SyncRoomsResponse{ + Join: v2JoinTimeline(room), + }, + NextBatch: "bob_after_initial_poll", + }) + + t.Log("Alice and Bob make initial sliding syncs.") + aliceRes := v3.mustDoV3Request(t, aliceToken, sync3.Request{ + Lists: map[string]sync3.RequestList{"a": { + Ranges: sync3.SliceRanges{ + [2]int64{0, 10}, + }, + RoomSubscription: sync3.RoomSubscription{ + TimelineLimit: 2, + }, + }, + }, + }) + bobRes := v3.mustDoV3Request(t, bobToken, sync3.Request{ + Lists: map[string]sync3.RequestList{"a": { + Ranges: sync3.SliceRanges{ + [2]int64{0, 10}, + }, + RoomSubscription: sync3.RoomSubscription{ + TimelineLimit: 2, + }, + }, + }, + }) + + t.Log("Alice has sent a message... but it arrives down Bob's poller first, without a transaction_id") + txnID := "m1234567890" + newEvent := testutils.NewEvent(t, "m.room.message", alice, map[string]interface{}{"body": "hi"}, testutils.WithUnsigned(map[string]interface{}{ + "transaction_id": txnID, + })) + newEventNoUnsigned, err := sjson.DeleteBytes(newEvent, "unsigned") + if err != nil { + t.Fatalf("failed to delete bytes: %s", err) + } + + v2.queueResponse(bob, sync2.SyncResponse{ + Rooms: sync2.SyncRoomsResponse{ + Join: v2JoinTimeline(roomEvents{ + roomID: roomID, + events: []json.RawMessage{newEventNoUnsigned}, + }), + }, + }) + t.Log("Bob's poller sees the message.") + v2.waitUntilEmpty(t, bob) + + t.Log("Alice requests an incremental sliding sync with no request changes.") + aliceRes = v3.mustDoV3RequestWithPos(t, aliceToken, aliceRes.Pos, sync3.Request{}) + t.Log("Alice should see no messages.") + m.MatchResponse(t, aliceRes, m.MatchRoomSubscriptionsStrict(nil)) + + // Now the message arrives down Alice's poller. 
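+	// (The transaction_id lives in the event's unsigned section, which only
+	// the sender's own /sync streams include, hence Bob's poller received the
+	// event without it.)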
+ v2.queueResponse(alice, sync2.SyncResponse{ + Rooms: sync2.SyncRoomsResponse{ + Join: v2JoinTimeline(roomEvents{ + roomID: roomID, + events: []json.RawMessage{newEvent}, + }), + }, + }) + t.Log("Alice's poller sees the message with transaction_id.") + v2.waitUntilEmpty(t, alice) + + t.Log("Alice makes another incremental sync request.") + aliceRes = v3.mustDoV3RequestWithPos(t, aliceToken, aliceRes.Pos, sync3.Request{}) + t.Log("Alice's sync response includes the message with the txn ID.") + m.MatchResponse(t, aliceRes, m.MatchList("a", m.MatchV3Count(1)), m.MatchNoV3Ops(), m.MatchRoomSubscription( + roomID, m.MatchRoomTimelineMostRecent(1, []json.RawMessage{newEvent}), + )) + + t.Log("Bob makes an incremental sliding sync") + bobRes = v3.mustDoV3RequestWithPos(t, bobToken, bobRes.Pos, sync3.Request{}) + t.Log("Bob should see the message without a transaction_id") + m.MatchResponse(t, bobRes, m.MatchList("a", m.MatchV3Count(1)), m.MatchNoV3Ops(), m.MatchRoomSubscription( + roomID, m.MatchRoomTimelineMostRecent(1, []json.RawMessage{newEventNoUnsigned}), + )) +} + // Executes a sync v3 request without a ?pos and asserts that the count, rooms and timeline events m.Match the inputs given. func testTimelineLoadInitialEvents(v3 *testV3Server, token string, count int, wantRooms []roomEvents, numTimelineEventsPerRoom int) func(t *testing.T) { return func(t *testing.T) { diff --git a/testutils/m/match.go b/testutils/m/match.go index f8f6c6a3..18524619 100644 --- a/testutils/m/match.go +++ b/testutils/m/match.go @@ -283,6 +283,15 @@ func MatchRoomSubscriptions(wantSubs map[string][]RoomMatcher) RespMatcher { } } +func MatchNoLists() RespMatcher { + return func(res *sync3.Response) error { + if len(res.Lists) != 0 { + return fmt.Errorf("expected no lists in response, got %d lists: %v", len(res.Lists), res.Lists) + } + return nil + } +} + func MatchNoE2EEExtension() RespMatcher { return func(res *sync3.Response) error { if res.Extensions.E2EE != nil { From f9d49722f1a49df5f62b2496ebf6ac10f3468720 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 25 Jul 2023 14:37:53 +0100 Subject: [PATCH 107/156] Construct a gadget for tracking pending txn IDs --- sync2/txnid.go | 94 ++++++++++++++++++++++------ sync2/txnid_test.go | 146 ++++++++++++++++++++++++++++++-------------- 2 files changed, 176 insertions(+), 64 deletions(-) diff --git a/sync2/txnid.go b/sync2/txnid.go index 00e0ede9..642585ea 100644 --- a/sync2/txnid.go +++ b/sync2/txnid.go @@ -1,38 +1,98 @@ package sync2 import ( + "fmt" + "sync" "time" "github.com/ReneKroon/ttlcache/v2" ) -type TransactionIDCache struct { - cache *ttlcache.Cache +type loaderFunc func(userID string) (deviceIDs []string) + +// PendingTransactionIDs is (conceptually) a map from event IDs to a list of device IDs. +// Its keys are the IDs of events we've seen which a) lack a transaction ID, and b) were sent +// by one of the users we are polling for. The values are the list of the sender's +// devices whose pollers are yet to see a transaction ID. +// +// If another poller sees the same event +// - with a transaction ID, it emits a V2TransactionID payload with that ID and +// removes the event ID from this map. +// - without a transaction ID, it removes the polling device ID from the values +// list. If the device ID list is now empty, the poller emits an "all clear" +// V2TransactionID payload. +// +// This is a best-effort affair to ensure that the rest of the proxy can wait for +// transaction IDs to appear before transmitting an event down /sync to its sender. 
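+//
+// For example: Alice is polled on devices A1 and A2, and A1's poller sees
+// event E without a transaction ID; we then record pending[E] = {A2}. If A2's
+// poller later sees E with a transaction ID, that ID is emitted and E is
+// forgotten. If A2 also sees E without one, the device list empties and the
+// "all clear" payload is emitted instead.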
+// +// It's possible that we add an entry to this map and then the list of remaining +// device IDs becomes out of date, either due to a new device creation or an +// existing device expiring. We choose not to handle this case, because it is relatively +// rare. +// +// To avoid the map growing without bound, we use a ttlcache and drop entries +// after a short period of time. +type PendingTransactionIDs struct { + // mu guards the pending field. + mu sync.Mutex + pending *ttlcache.Cache + // loader should provide the list of device IDs + loader loaderFunc } -func NewTransactionIDCache() *TransactionIDCache { +func NewPendingTransactionIDs(loader loaderFunc) *PendingTransactionIDs { c := ttlcache.NewCache() c.SetTTL(5 * time.Minute) // keep transaction IDs for 5 minutes before forgetting about them c.SkipTTLExtensionOnHit(true) // we don't care how many times they ask for the item, 5min is the limit. - return &TransactionIDCache{ - cache: c, + return &PendingTransactionIDs{ + mu: sync.Mutex{}, + pending: c, + loader: loader, } } -// Store a new transaction ID received via v2 /sync -func (c *TransactionIDCache) Store(userID, eventID, txnID string) { - c.cache.Set(cacheKey(userID, eventID), txnID) -} +// MissingTxnID should be called to report that this device ID did not see a +// transaction ID for this event ID. Returns true if this is the first time we know +// for sure that we'll never see a txn ID for this event. +func (c *PendingTransactionIDs) MissingTxnID(eventID, userID, myDeviceID string) (bool, error) { + c.mu.Lock() + defer c.mu.Unlock() -// Get a transaction ID previously stored. -func (c *TransactionIDCache) Get(userID, eventID string) string { - val, _ := c.cache.Get(cacheKey(userID, eventID)) - if val != nil { - return val.(string) + data, err := c.pending.Get(eventID) + if err == ttlcache.ErrNotFound { + data = c.loader(userID) + } else if err != nil { + return false, fmt.Errorf("PendingTransactionIDs: failed to get device ids: %w", err) } - return "" + + deviceIDs, ok := data.([]string) + if !ok { + return false, fmt.Errorf("PendingTransactionIDs: failed to cast device IDs") + } + + deviceIDs, changed := removeDevice(myDeviceID, deviceIDs) + if changed { + err = c.pending.Set(eventID, deviceIDs) + if err != nil { + return false, fmt.Errorf("PendingTransactionIDs: failed to set device IDs: %w", err) + } + } + return changed && len(deviceIDs) == 0, nil +} + +// SeenTxnID should be called to report that this device saw a transaction ID +// for this event. 
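+// Doing so empties the pending device list for eventID, so subsequent
+// MissingTxnID calls for that event see no change and emit no spurious
+// all-clear.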
+func (c *PendingTransactionIDs) SeenTxnID(eventID string) error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	return c.pending.Set(eventID, []string{})
 }
 
-func cacheKey(userID, eventID string) string {
-	return userID + " " + eventID
+func removeDevice(device string, devices []string) ([]string, bool) {
+	for i, otherDevice := range devices {
+		if otherDevice == device {
+			return append(devices[:i], devices[i+1:]...), true
+		}
+	}
+	return devices, false
 }
diff --git a/sync2/txnid_test.go b/sync2/txnid_test.go
index 72d1967e..844957c3 100644
--- a/sync2/txnid_test.go
+++ b/sync2/txnid_test.go
@@ -2,54 +2,106 @@ package sync2
 
 import "testing"
 
-func TestTransactionIDCache(t *testing.T) {
-	alice := "@alice:localhost"
-	bob := "@bob:localhost"
-	eventA := "$a:localhost"
-	eventB := "$b:localhost"
-	eventC := "$c:localhost"
-	txn1 := "1"
-	txn2 := "2"
-	cache := NewTransactionIDCache()
-	cache.Store(alice, eventA, txn1)
-	cache.Store(bob, eventB, txn1) // different users can use same txn ID
-	cache.Store(alice, eventC, txn2)
-
-	testCases := []struct {
-		eventID string
-		userID  string
-		want    string
-	}{
-		{
-			eventID: eventA,
-			userID:  alice,
-			want:    txn1,
-		},
-		{
-			eventID: eventB,
-			userID:  bob,
-			want:    txn1,
-		},
-		{
-			eventID: eventC,
-			userID:  alice,
-			want:    txn2,
-		},
-		{
-			eventID: "$invalid",
-			userID:  alice,
-			want:    "",
-		},
-		{
-			eventID: eventA,
-			userID:  "@invalid",
-			want:    "",
-		},
+func TestPendingTransactionIDs(t *testing.T) {
+	pollingDevicesByUser := map[string][]string{
+		"alice": {"A1", "A2"},
+		"bob":   {"B1"},
+		"chris": {},
+		"delia": {"D1", "D2", "D3", "D4"},
+		"enid":  {"E1", "E2"},
 	}
-	for _, tc := range testCases {
-		txnID := cache.Get(tc.userID, tc.eventID)
-		if txnID != tc.want {
-			t.Errorf("%+v: got %v want %v", tc, txnID, tc.want)
+	mockLoad := func(userID string) (deviceIDs []string) {
+		devices, ok := pollingDevicesByUser[userID]
+		if !ok {
+			t.Fatalf("Mock didn't have devices for %s", userID)
 		}
+		newDevices := make([]string, len(devices))
+		copy(newDevices, devices)
+		return newDevices
+	}
+
+	pending := NewPendingTransactionIDs(mockLoad)
+
+	// Alice.
+	// We're tracking two of Alice's devices.
+	allClear, err := pending.MissingTxnID("event1", "alice", "A1")
+	assertNoError(t, err)
+	assertAllClear(t, allClear, false) // waiting on A2
+
+	// If for some reason the poller sees the same event for the same device, we should
+	// still be waiting for A2.
+	allClear, err = pending.MissingTxnID("event1", "alice", "A1")
+	assertNoError(t, err)
+	assertAllClear(t, allClear, false)
+
+	// If for some reason Alice spun up a new device, we are still going to be waiting
+	// for A2.
+	allClear, err = pending.MissingTxnID("event1", "alice", "A_unknown_device")
+	assertNoError(t, err)
+	assertAllClear(t, allClear, false)
+
+	// If A2 sees the event without a txnID, we should emit the all clear signal.
+	allClear, err = pending.MissingTxnID("event1", "alice", "A2")
+	assertNoError(t, err)
+	assertAllClear(t, allClear, true)
+
+	// If for some reason A2 sees the event a second time, we shouldn't re-emit the
+	// all clear signal.
+	allClear, err = pending.MissingTxnID("event1", "alice", "A2")
+	assertNoError(t, err)
+	assertAllClear(t, allClear, false)
+
+	// Bob.
+	// We're only tracking one device for Bob.
+	allClear, err = pending.MissingTxnID("event2", "bob", "B1")
+	assertNoError(t, err)
+	assertAllClear(t, allClear, true) // not waiting on any devices
+
+	// Chris.
+	// We're not tracking any devices for Chris. A MissingTxnID call for him shouldn't
+	// cause anything to explode.
+	allClear, err = pending.MissingTxnID("event3", "chris", "C_unknown_device")
+	assertNoError(t, err)
+	assertAllClear(t, allClear, false) // nothing was tracked, so nothing changed: no all-clear
+
+	// Delia.
+	// Delia is tracking four devices.
+	allClear, err = pending.MissingTxnID("event4", "delia", "D1")
+	assertNoError(t, err)
+	assertAllClear(t, allClear, false) // waiting on D2, D3 and D4
+
+	// One of Delia's devices, say D2, sees a txn ID for event4.
+	err = pending.SeenTxnID("event4")
+	assertNoError(t, err)
+
+	// The other devices see the event. Neither should emit all clear.
+	allClear, err = pending.MissingTxnID("event4", "delia", "D3")
+	assertNoError(t, err)
+	assertAllClear(t, allClear, false)
+
+	allClear, err = pending.MissingTxnID("event4", "delia", "D4")
+	assertNoError(t, err)
+	assertAllClear(t, allClear, false)
+
+	// Enid.
+	// Enid has two devices. Her first poller (E1) is lucky and sees the transaction ID.
+	err = pending.SeenTxnID("event5")
+	assertNoError(t, err)
+
+	// Her second poller misses the transaction ID, but this shouldn't cause an all clear.
+	allClear, err = pending.MissingTxnID("event5", "enid", "E2")
+	assertNoError(t, err)
+	assertAllClear(t, allClear, false)
+}
+
+func assertAllClear(t *testing.T, got bool, want bool) {
+	t.Helper()
+	if got != want {
+		t.Errorf("Expected allClear=%t, got %t", want, got)
+	}
+}
+
+func assertNoError(t *testing.T, err error) {
+	t.Helper()
+	if err != nil {
+		t.Fatalf("got error: %s", err)
 	}
 }
From 8592fe3af2b4dec081acb935becdd9fd9e4f2d50 Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Mon, 12 Jun 2023 18:55:02 +0100
Subject: [PATCH 108/156] TODO note

---
 sync2/handler2/handler.go | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sync2/handler2/handler.go b/sync2/handler2/handler.go
index 15d8037e..cfde423a 100644
--- a/sync2/handler2/handler.go
+++ b/sync2/handler2/handler.go
@@ -307,7 +307,11 @@ func (h *Handler) Accumulate(ctx context.Context, userID, deviceID, roomID, prev
 			internal.GetSentryHubFromContextOrDefault(ctx).CaptureException(fmt.Errorf("errMsg"))
 			continue
 		}
-
+		// TODO: if all pollers for this user's devices have seen this event, then
+		// we can send an "all clear" message. Maybe this is just a V2TransactionID
+		// with an empty string for the TransactionID. In order to do this we will
+		// need to keep track of which events have been seen by which devices. Maybe
+		// NIDs suffice?
 		h.v2Pub.Notify(pubsub.ChanV2, &pubsub.V2TransactionID{
 			EventID:       eventID,
 			UserID:        userID,
From c5d7570e0940db4d35b180039dcbe3a70dce0af5 Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Tue, 25 Jul 2023 15:02:33 +0100
Subject: [PATCH 109/156] poller: mark txn IDs as seen

---
 sync2/handler2/handler.go      |  1 +
 sync2/handler2/handler_test.go |  4 ++++
 sync2/poller.go                | 21 +++++++++++++++++++++
 3 files changed, 26 insertions(+)

diff --git a/sync2/handler2/handler.go b/sync2/handler2/handler.go
index cfde423a..699b90d2 100644
--- a/sync2/handler2/handler.go
+++ b/sync2/handler2/handler.go
@@ -312,6 +312,7 @@ func (h *Handler) Accumulate(ctx context.Context, userID, deviceID, roomID, prev
 		// with an empty string for the TransactionID. In order to do this we will
 		// need to keep track of which events have been seen by which devices. Maybe
 		// NIDs suffice?
+ h.pMap.SeenTxnID(eventID) h.v2Pub.Notify(pubsub.ChanV2, &pubsub.V2TransactionID{ EventID: eventID, UserID: userID, diff --git a/sync2/handler2/handler_test.go b/sync2/handler2/handler_test.go index fa315228..0bd61c21 100644 --- a/sync2/handler2/handler_test.go +++ b/sync2/handler2/handler_test.go @@ -42,6 +42,10 @@ func (p *mockPollerMap) NumPollers() int { } func (p *mockPollerMap) Terminate() {} +func (p *mockPollerMap) SeenTxnID(eventID string) error { + return nil +} + func (p *mockPollerMap) EnsurePolling(pid sync2.PollerID, accessToken, v2since string, isStartup bool, logger zerolog.Logger) { p.calls = append(p.calls, pollInfo{ pid: pid, diff --git a/sync2/poller.go b/sync2/poller.go index 062366a6..d98f5ad4 100644 --- a/sync2/poller.go +++ b/sync2/poller.go @@ -64,6 +64,7 @@ type IPollerMap interface { EnsurePolling(pid PollerID, accessToken, v2since string, isStartup bool, logger zerolog.Logger) NumPollers() int Terminate() + SeenTxnID(eventID string) error } // PollerMap is a map of device ID to Poller @@ -72,6 +73,7 @@ type PollerMap struct { callbacks V2DataReceiver pollerMu *sync.Mutex Pollers map[PollerID]*poller + pendingTxnIDs *PendingTransactionIDs executor chan func() executorRunning bool processHistogramVec *prometheus.HistogramVec @@ -112,6 +114,7 @@ func NewPollerMap(v2Client Client, enablePrometheus bool) *PollerMap { Pollers: make(map[PollerID]*poller), executor: make(chan func(), 0), } + pm.pendingTxnIDs = NewPendingTransactionIDs(pm.deviceIDs) if enablePrometheus { pm.processHistogramVec = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "sliding_sync", @@ -195,6 +198,24 @@ func (h *PollerMap) NumPollers() (count int) { return } +// deviceIDs returns the slice of all devices currently being polled for by this user. +// The return value is brand-new and is fully owned by the caller. +func (h *PollerMap) deviceIDs(userID string) []string { + h.pollerMu.Lock() + defer h.pollerMu.Unlock() + devices := make([]string, 0) + for _, p := range h.Pollers { + if !p.terminated.Load() && p.userID == userID { + devices = append(devices, p.deviceID) + } + } + return devices +} + +func (h *PollerMap) SeenTxnID(eventID string) error { + return h.pendingTxnIDs.SeenTxnID(eventID) +} + // EnsurePolling makes sure there is a poller for this device, making one if need be. // Blocks until at least 1 sync is done if and only if the poller was just created. // This ensures that calls to the database will return data. From 008157c146724ae2116b4b326caa423637e240ee Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 25 Jul 2023 15:34:27 +0100 Subject: [PATCH 110/156] poller: send all-clear --- pubsub/v2.go | 6 ++- sync2/handler2/handler.go | 74 +++++++++++++++++++--------------- sync2/handler2/handler_test.go | 4 ++ sync2/poller.go | 5 +++ 4 files changed, 55 insertions(+), 34 deletions(-) diff --git a/pubsub/v2.go b/pubsub/v2.go index 7dfb01e0..d44ad322 100644 --- a/pubsub/v2.go +++ b/pubsub/v2.go @@ -41,12 +41,14 @@ type V2Accumulate struct { func (*V2Accumulate) Type() string { return "V2Accumulate" } -// V2TransactionID is emitted by a poller when it sees an event with a transaction ID. +// V2TransactionID is emitted by a poller when it sees an event with a transaction ID, +// or when it is certain that no other poller will see a transaction ID for this event +// (the "all-clear"). type V2TransactionID struct { EventID string UserID string DeviceID string - TransactionID string + TransactionID string // Note: an empty transaction ID represents the all-clear. 
NID int64 } diff --git a/sync2/handler2/handler.go b/sync2/handler2/handler.go index 699b90d2..96e9522e 100644 --- a/sync2/handler2/handler.go +++ b/sync2/handler2/handler.go @@ -3,7 +3,6 @@ package handler2 import ( "context" "encoding/json" - "fmt" "hash/fnv" "os" "sync" @@ -240,15 +239,24 @@ func (h *Handler) Accumulate(ctx context.Context, userID, deviceID, roomID, prev // Remember any transaction IDs that may be unique to this user eventIDsWithTxns := make([]string, 0, len(timeline)) // in timeline order eventIDToTxnID := make(map[string]string, len(timeline)) // event_id -> txn_id + // Also remember events which were sent by this user but lack a transaction ID. + eventIDsLackingTxns := make([]string, 0, len(timeline)) + for _, e := range timeline { - txnID := gjson.GetBytes(e, "unsigned.transaction_id") - if !txnID.Exists() { + parsed := gjson.ParseBytes(e) + eventID := parsed.Get("event_id").Str + + if txnID := parsed.Get("unsigned.transaction_id"); txnID.Exists() { + eventIDsWithTxns = append(eventIDsWithTxns, eventID) + eventIDToTxnID[eventID] = txnID.Str continue } - eventID := gjson.GetBytes(e, "event_id").Str - eventIDsWithTxns = append(eventIDsWithTxns, eventID) - eventIDToTxnID[eventID] = txnID.Str + + if sender := parsed.Get("sender"); sender.Str == userID { + eventIDsLackingTxns = append(eventIDsLackingTxns, eventID) + } } + if len(eventIDToTxnID) > 0 { // persist the txn IDs err := h.Store.TransactionsTable.Insert(userID, deviceID, eventIDToTxnID) @@ -269,57 +277,59 @@ func (h *Handler) Accumulate(ctx context.Context, userID, deviceID, roomID, prev // no new events return } + + // We've updated the database. Now tell any pubsub listeners what we learned. h.v2Pub.Notify(pubsub.ChanV2, &pubsub.V2Accumulate{ RoomID: roomID, PrevBatch: prevBatch, EventNIDs: latestNIDs, }) - if len(eventIDToTxnID) > 0 { + if len(eventIDToTxnID) > 0 || len(eventIDsLackingTxns) > 0 { // The call to h.Store.Accumulate above only tells us about new events' NIDS; // for existing events we need to requery the database to fetch them. // Rather than try to reuse work, keep things simple and just fetch NIDs for // all events with txnIDs. var nidsByIDs map[string]int64 + eventIDsToFetch := append(eventIDsWithTxns, eventIDsLackingTxns...) err = sqlutil.WithTransaction(h.Store.DB, func(txn *sqlx.Tx) error { - nidsByIDs, err = h.Store.EventsTable.SelectNIDsByIDs(txn, eventIDsWithTxns) + nidsByIDs, err = h.Store.EventsTable.SelectNIDsByIDs(txn, eventIDsToFetch) return err }) if err != nil { logger.Err(err). Int("timeline", len(timeline)). Int("num_transaction_ids", len(eventIDsWithTxns)). + Int("num_missing_transaction_ids", len(eventIDsLackingTxns)). Str("room", roomID). 
- Msg("V2: failed to fetch nids for events with transaction_ids") + Msg("V2: failed to fetch nids for event transaction_id handling") internal.GetSentryHubFromContextOrDefault(ctx).CaptureException(err) return } - for _, eventID := range eventIDsWithTxns { + for eventID, nid := range nidsByIDs { txnID, ok := eventIDToTxnID[eventID] - if !ok { - continue - } - nid, ok := nidsByIDs[eventID] - if !ok { - errMsg := "V2: failed to fetch NID for txnID" - logger.Error().Str("user", userID).Str("device", deviceID).Msg(errMsg) - internal.GetSentryHubFromContextOrDefault(ctx).CaptureException(fmt.Errorf("errMsg")) - continue + if ok { + h.pMap.SeenTxnID(eventID) + h.v2Pub.Notify(pubsub.ChanV2, &pubsub.V2TransactionID{ + EventID: eventID, + UserID: userID, + DeviceID: deviceID, + TransactionID: txnID, + NID: nid, + }) + } else { + allClear, _ := h.pMap.MissingTxnID(eventID, userID, deviceID) + if allClear { + h.v2Pub.Notify(pubsub.ChanV2, &pubsub.V2TransactionID{ + EventID: eventID, + UserID: userID, + DeviceID: deviceID, + TransactionID: "", + NID: nid, + }) + } } - // TODO: if all pollers for this user's devices have seen this event, then - // we can send an "all clear" message. Maybe this is just a V2TransactionID - // with an empty string for the TransactionID. In order to do this we will - // need to keep track of which events have been seen by which devices. Maybe - // NIDs suffice? - h.pMap.SeenTxnID(eventID) - h.v2Pub.Notify(pubsub.ChanV2, &pubsub.V2TransactionID{ - EventID: eventID, - UserID: userID, - DeviceID: deviceID, - TransactionID: txnID, - NID: nid, - }) } } } diff --git a/sync2/handler2/handler_test.go b/sync2/handler2/handler_test.go index 0bd61c21..3b69f07b 100644 --- a/sync2/handler2/handler_test.go +++ b/sync2/handler2/handler_test.go @@ -42,6 +42,10 @@ func (p *mockPollerMap) NumPollers() int { } func (p *mockPollerMap) Terminate() {} +func (p *mockPollerMap) MissingTxnID(eventID, userID, deviceID string) (bool, error) { + return false, nil +} + func (p *mockPollerMap) SeenTxnID(eventID string) error { return nil } diff --git a/sync2/poller.go b/sync2/poller.go index d98f5ad4..95931d60 100644 --- a/sync2/poller.go +++ b/sync2/poller.go @@ -64,6 +64,7 @@ type IPollerMap interface { EnsurePolling(pid PollerID, accessToken, v2since string, isStartup bool, logger zerolog.Logger) NumPollers() int Terminate() + MissingTxnID(eventID, userID, deviceID string) (bool, error) SeenTxnID(eventID string) error } @@ -212,6 +213,10 @@ func (h *PollerMap) deviceIDs(userID string) []string { return devices } +func (h *PollerMap) MissingTxnID(eventID, userID, deviceID string) (bool, error) { + return h.pendingTxnIDs.MissingTxnID(eventID, userID, deviceID) +} + func (h *PollerMap) SeenTxnID(eventID string) error { return h.pendingTxnIDs.SeenTxnID(eventID) } From 59d547d79dc6921545556609937d12c722cd3ad9 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 25 Jul 2023 16:48:05 +0100 Subject: [PATCH 111/156] Stub txn id waiter --- sync3/handler/txn_id_waiter.go | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 sync3/handler/txn_id_waiter.go diff --git a/sync3/handler/txn_id_waiter.go b/sync3/handler/txn_id_waiter.go new file mode 100644 index 00000000..c6ac84a5 --- /dev/null +++ b/sync3/handler/txn_id_waiter.go @@ -0,0 +1,25 @@ +package handler + +import ( + "github.com/matrix-org/sliding-sync/sync3/caches" +) + +type TxnIDWaiter struct { + userID string + publish func(update caches.Update) + queued map[string][]caches.Update +} + +func NewTxnIDWaiter(userID 
string, publish func(caches.Update)) *TxnIDWaiter { + return &TxnIDWaiter{ + userID: userID, + publish: publish, + queued: make(map[string][]caches.Update), + } +} + +func (t *TxnIDWaiter) Ingest(up caches.Update) { + // TODO: investigate whether this update needs to be queued. + // TODO: bound the queue size? + t.publish(up) +} From db69afc57a28f0eca57d2f49fe4006d955b11a5c Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 25 Jul 2023 16:56:17 +0100 Subject: [PATCH 112/156] Never call s.live.onUpdate outside of s.OnUpdate --- sync3/handler/connstate.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sync3/handler/connstate.go b/sync3/handler/connstate.go index 1611e2a7..3a9f60c1 100644 --- a/sync3/handler/connstate.go +++ b/sync3/handler/connstate.go @@ -679,10 +679,10 @@ func (s *ConnState) OnRoomUpdate(ctx context.Context, up caches.RoomUpdate) { } internal.AssertWithContext(ctx, "missing global room metadata", update.GlobalRoomMetadata() != nil) internal.Logf(ctx, "connstate", "queued update %d", update.EventData.NID) - s.live.onUpdate(update) + s.OnUpdate(ctx, update) case caches.RoomUpdate: internal.AssertWithContext(ctx, "missing global room metadata", update.GlobalRoomMetadata() != nil) - s.live.onUpdate(update) + s.OnUpdate(ctx, update) default: logger.Warn().Str("room_id", up.RoomID()).Msg("OnRoomUpdate unknown update type") } From 980451821feb6d023f4cc4abbf35d608e4921f43 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 25 Jul 2023 16:56:44 +0100 Subject: [PATCH 113/156] Plumb in txn id waiter --- sync3/handler/connstate.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sync3/handler/connstate.go b/sync3/handler/connstate.go index 3a9f60c1..f77166f2 100644 --- a/sync3/handler/connstate.go +++ b/sync3/handler/connstate.go @@ -42,7 +42,8 @@ type ConnState struct { // roomID -> latest load pos loadPositions map[string]int64 - live *connStateLive + txnIDWaiter *TxnIDWaiter + live *connStateLive globalCache *caches.GlobalCache userCache *caches.UserCache @@ -80,6 +81,7 @@ func NewConnState( ConnState: cs, updates: make(chan caches.Update, maxPendingEventUpdates), } + cs.txnIDWaiter = NewTxnIDWaiter(userID, cs.live.onUpdate) // subscribe for updates before loading. We risk seeing dupes but that's fine as load positions // will stop us double-processing. 
cs.userCacheID = cs.userCache.Subsribe(cs) @@ -663,7 +665,8 @@ func (s *ConnState) UserID() string { } func (s *ConnState) OnUpdate(ctx context.Context, up caches.Update) { - s.live.onUpdate(up) + // will eventually call s.live.onUpdate + s.txnIDWaiter.Ingest(up) } // Called by the user cache when updates arrive From a59e8d5d4068aeec8a5787faec113a1d8cdb9444 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 25 Jul 2023 17:33:54 +0100 Subject: [PATCH 114/156] Ask InternalRequestLists if room is Visible --- sync3/handler/connstate.go | 9 +++++++++ sync3/lists.go | 28 ++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/sync3/handler/connstate.go b/sync3/handler/connstate.go index f77166f2..dcad6ba7 100644 --- a/sync3/handler/connstate.go +++ b/sync3/handler/connstate.go @@ -691,6 +691,15 @@ func (s *ConnState) OnRoomUpdate(ctx context.Context, up caches.RoomUpdate) { } } +func (s *ConnState) subscribedOrVisible(roomID string) bool { + _, subscribed := s.roomSubscriptions[roomID] + if subscribed { + return true + } + + return s.lists.Visible(roomID, s.muxedReq.Lists) +} + // clampSliceRangeToListSize helps us to send client-friendly SYNC and INVALIDATE ranges. // // Suppose the client asks for a window on positions [10, 19]. If the list diff --git a/sync3/lists.go b/sync3/lists.go index f8d3d551..8eb02036 100644 --- a/sync3/lists.go +++ b/sync3/lists.go @@ -222,6 +222,34 @@ func (s *InternalRequestLists) ListsByVisibleRoomIDs(muxedReqLists map[string]Re return listsByRoomIDs } +// Visible determines if a single room is currently visible in the given set of lists. +func (s *InternalRequestLists) Visible(roomID string, muxedReqLists map[string]RequestList) bool { + for listKey, reqList := range muxedReqLists { + sortedRooms := s.lists[listKey].SortableRooms + if sortedRooms == nil { + continue + } + + var ranges SliceRanges + if reqList.SlowGetAllRooms != nil && *reqList.SlowGetAllRooms { + ranges = SliceRanges{{0, sortedRooms.Len() - 1}} + } else { + ranges = reqList.Ranges + } + + subslices := ranges.SliceInto(sortedRooms) + for _, subslice := range subslices { + sortedSubslice := subslice.(*SortableRooms) + for _, otherRoomID := range sortedSubslice.RoomIDs() { + if roomID == otherRoomID { + return true + } + } + } + } + return false +} + // Assign a new list at the given key. If Overwrite, any existing list is replaced. If DoNotOverwrite, the existing // list is returned if one exists, else a new list is created. Returns the list and true if the list was overwritten. func (s *InternalRequestLists) AssignList(ctx context.Context, listKey string, filters *RequestFilters, sort []string, shouldOverwrite OverwriteVal) (*FilteredSortableRooms, bool) { From 99d042e6cbd53e7c6d53eed1113b8a0818366336 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 25 Jul 2023 17:39:18 +0100 Subject: [PATCH 115/156] Decide whether we should queue the update --- sync3/handler/connstate.go | 2 +- sync3/handler/txn_id_waiter.go | 35 +++++++++++++++++++++++++--------- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/sync3/handler/connstate.go b/sync3/handler/connstate.go index dcad6ba7..f9a3a212 100644 --- a/sync3/handler/connstate.go +++ b/sync3/handler/connstate.go @@ -81,7 +81,7 @@ func NewConnState( ConnState: cs, updates: make(chan caches.Update, maxPendingEventUpdates), } - cs.txnIDWaiter = NewTxnIDWaiter(userID, cs.live.onUpdate) + cs.txnIDWaiter = NewTxnIDWaiter(userID, cs.live.onUpdate, cs.subscribedOrVisible) // subscribe for updates before loading. 
We risk seeing dupes but that's fine as load positions // will stop us double-processing. cs.userCacheID = cs.userCache.Subsribe(cs) diff --git a/sync3/handler/txn_id_waiter.go b/sync3/handler/txn_id_waiter.go index c6ac84a5..e101572a 100644 --- a/sync3/handler/txn_id_waiter.go +++ b/sync3/handler/txn_id_waiter.go @@ -5,21 +5,38 @@ import ( ) type TxnIDWaiter struct { - userID string - publish func(update caches.Update) - queued map[string][]caches.Update + userID string + publish func(update caches.Update) + subscribedOrVisible func(roomID string) bool + queued map[string][]caches.Update } -func NewTxnIDWaiter(userID string, publish func(caches.Update)) *TxnIDWaiter { +func NewTxnIDWaiter(userID string, publish func(caches.Update), subscribedOrVisible func(string) bool) *TxnIDWaiter { return &TxnIDWaiter{ - userID: userID, - publish: publish, - queued: make(map[string][]caches.Update), + userID: userID, + publish: publish, + subscribedOrVisible: subscribedOrVisible, + queued: make(map[string][]caches.Update), } } func (t *TxnIDWaiter) Ingest(up caches.Update) { - // TODO: investigate whether this update needs to be queued. + if !t.shouldQueue(up) { + t.publish(up) + } + // TODO: bound the queue size? - t.publish(up) + // TODO: enqueue and timeout +} + +func (t *TxnIDWaiter) shouldQueue(up caches.Update) bool { + e, isEventUpdate := up.(*caches.RoomEventUpdate) + if isEventUpdate { + // TODO: ensure we don't keep length-0 or nil slices in the queued map so this works correctly. + _, roomQueued := t.queued[e.EventData.RoomID] + if (e.EventData.Sender == t.userID || roomQueued) && t.subscribedOrVisible(e.EventData.RoomID) { + return true + } + } + return false } From 4a6623ff7730dde1c91403ca6874361b71845f87 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 25 Jul 2023 18:44:48 +0100 Subject: [PATCH 116/156] Include room ID in the txnid payload --- pubsub/v2.go | 3 ++- sync2/handler2/handler.go | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pubsub/v2.go b/pubsub/v2.go index d44ad322..6b85d202 100644 --- a/pubsub/v2.go +++ b/pubsub/v2.go @@ -46,7 +46,8 @@ func (*V2Accumulate) Type() string { return "V2Accumulate" } // (the "all-clear"). type V2TransactionID struct { EventID string - UserID string + RoomID string + UserID string // of the sender DeviceID string TransactionID string // Note: an empty transaction ID represents the all-clear. 
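+	// NID is the event's numeric ID (nid) in the proxy's events table.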
NID int64 diff --git a/sync2/handler2/handler.go b/sync2/handler2/handler.go index 96e9522e..602c1694 100644 --- a/sync2/handler2/handler.go +++ b/sync2/handler2/handler.go @@ -313,6 +313,7 @@ func (h *Handler) Accumulate(ctx context.Context, userID, deviceID, roomID, prev h.pMap.SeenTxnID(eventID) h.v2Pub.Notify(pubsub.ChanV2, &pubsub.V2TransactionID{ EventID: eventID, + RoomID: roomID, UserID: userID, DeviceID: deviceID, TransactionID: txnID, @@ -323,6 +324,7 @@ func (h *Handler) Accumulate(ctx context.Context, userID, deviceID, roomID, prev if allClear { h.v2Pub.Notify(pubsub.ChanV2, &pubsub.V2TransactionID{ EventID: eventID, + RoomID: roomID, UserID: userID, DeviceID: deviceID, TransactionID: "", From c15429305ce2eea6d13934e432cbe4879486e5dc Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 25 Jul 2023 18:46:15 +0100 Subject: [PATCH 117/156] Queue events for up to 1s --- sync3/handler/txn_id_waiter.go | 61 +++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/sync3/handler/txn_id_waiter.go b/sync3/handler/txn_id_waiter.go index e101572a..4942ef35 100644 --- a/sync3/handler/txn_id_waiter.go +++ b/sync3/handler/txn_id_waiter.go @@ -2,13 +2,17 @@ package handler import ( "github.com/matrix-org/sliding-sync/sync3/caches" + "time" ) +var maxDelay = 1 * time.Second + type TxnIDWaiter struct { userID string publish func(update caches.Update) subscribedOrVisible func(roomID string) bool - queued map[string][]caches.Update + // TODO: probably need a mutex around t.queues so the expiry won't race with enqueuing + queues map[string][]*caches.RoomEventUpdate } func NewTxnIDWaiter(userID string, publish func(caches.Update), subscribedOrVisible func(string) bool) *TxnIDWaiter { @@ -16,27 +20,60 @@ func NewTxnIDWaiter(userID string, publish func(caches.Update), subscribedOrVisi userID: userID, publish: publish, subscribedOrVisible: subscribedOrVisible, - queued: make(map[string][]caches.Update), + queues: make(map[string][]*caches.RoomEventUpdate), + // TODO: metric that tracks how long events were queued for. } } func (t *TxnIDWaiter) Ingest(up caches.Update) { - if !t.shouldQueue(up) { + eventUpdate, isEventUpdate := up.(*caches.RoomEventUpdate) + if !isEventUpdate { + t.publish(up) + return + } + + roomID := eventUpdate.EventData.RoomID + _, roomQueued := t.queues[roomID] + // We only want to queue this event if our user sent it, or if the room already has queued events. + if eventUpdate.EventData.Sender != t.userID && roomQueued { + t.publish(up) + return + } + + // Don't bother queuing the event if the room isn't visible to the user. + if !t.subscribedOrVisible(roomID) { t.publish(up) + return } + // We've decided to queue the event. + queue, exists := t.queues[roomID] + if !exists { + queue = make([]*caches.RoomEventUpdate, 0, 10) + } // TODO: bound the queue size? - // TODO: enqueue and timeout + t.queues[roomID] = append(queue, eventUpdate) + + time.AfterFunc(maxDelay, func() { t.publishUpToNID(roomID, eventUpdate.EventData.NID) }) } -func (t *TxnIDWaiter) shouldQueue(up caches.Update) bool { - e, isEventUpdate := up.(*caches.RoomEventUpdate) - if isEventUpdate { - // TODO: ensure we don't keep length-0 or nil slices in the queued map so this works correctly. 
-		_, roomQueued := t.queued[e.EventData.RoomID]
-		if (e.EventData.Sender == t.userID || roomQueued) && t.subscribedOrVisible(e.EventData.RoomID) {
-			return true
+func (t *TxnIDWaiter) publishUpToNID(roomID string, publishNID int64) {
+	queue, exists := t.queues[roomID]
+	if !exists {
+		return
+	}
+
+	var i int
+	for i = 0; i < len(queue); i++ {
+		// Scan forwards through the queue until we find an event with nid > publishNID.
+		if queue[i].EventData.NID > publishNID {
+			break
 		}
 	}
-	return false
+	// Now queue[:i] has events with nid <= publishNID, and queue[i:] has nids > publishNID.
+	// strip off the first i events from the slice and publish them.
+	toPublish, queue := queue[:i], queue[i:]
+	for _, eventUpdate := range toPublish {
+		t.publish(eventUpdate)
+	}
 }
From c897887137fafd41bd8808b8e29c3dd04c340239 Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Tue, 25 Jul 2023 18:50:09 +0100
Subject: [PATCH 118/156] todo marker

---
 sync3/handler/handler.go | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go
index c02c5a29..c0cb8d2f 100644
--- a/sync3/handler/handler.go
+++ b/sync3/handler/handler.go
@@ -622,7 +622,13 @@ func (h *SyncLiveHandler) Accumulate(p *pubsub.V2Accumulate) {
 func (h *SyncLiveHandler) OnTransactionID(p *pubsub.V2TransactionID) {
 	_, task := internal.StartTask(context.Background(), "TransactionID")
 	defer task.End()
-	// TODO implement me
+
+	// There is some event E for which we now have a transaction ID, or else now know
+	// that we will never get a transaction ID. In either case, tell the sender's
+	// connections to unblock that event in the transaction ID waiter.
+
+	// TODO implement me. Something like
+	// h.ConnMap.ClearUpdateQueues(p.UserID, p.RoomID, p.NID)
 }
 
 // Called from the v2 poller, implements V2DataReceiver
From 142290fa0ef1aec2a92b606581143bfd0fbdb171 Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Tue, 25 Jul 2023 19:32:33 +0100
Subject: [PATCH 119/156] WIP make test pass??

---
 tests-integration/timeline_test.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests-integration/timeline_test.go b/tests-integration/timeline_test.go
index 7b3c8a3c..aa3c8ed2 100644
--- a/tests-integration/timeline_test.go
+++ b/tests-integration/timeline_test.go
@@ -785,6 +785,10 @@ func TestTimelineTxnIDAfterInitialSync(t *testing.T) {
 	v2.waitUntilEmpty(t, alice)
 
 	t.Log("Alice makes another incremental sync request.")
+	// TODO: this is a hack to make the test pass by ensuring the API has enough time for the queue timer to expire.
+	// Need to add early expiry in response to the txn id payload.
+	// Will also need to ensure this request blocks long enough for that to happen before timing out.
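+	// (The TxnIDWaiter's maxDelay is currently hardcoded to 1s, so a 2s sleep
+	// comfortably outlives the queue timer.)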
+ time.Sleep(2 * time.Second) aliceRes = v3.mustDoV3RequestWithPos(t, aliceToken, aliceRes.Pos, sync3.Request{}) t.Log("Alice's sync response includes the message with the txn ID.") m.MatchResponse(t, aliceRes, m.MatchList("a", m.MatchV3Count(1)), m.MatchNoV3Ops(), m.MatchRoomSubscription( From 56411d671e0b3bee3e0456fb28c50f24b2db2fe5 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 26 Jul 2023 11:44:59 +0100 Subject: [PATCH 120/156] gotestfmt: hide successful jobs, packages, tests and downloads --- .github/workflows/tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1c630ea2..481feb8d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -59,7 +59,7 @@ jobs: - name: Test run: | set -euo pipefail - go test -count=1 -covermode=atomic -coverpkg ./... -p 1 -v -json $(go list ./... | grep -v tests-e2e) -coverprofile synccoverage.out 2>&1 | tee ./test-integration.log | gotestfmt + go test -count=1 -covermode=atomic -coverpkg ./... -p 1 -v -json $(go list ./... | grep -v tests-e2e) -hide all -coverprofile synccoverage.out 2>&1 | tee ./test-integration.log | gotestfmt shell: bash env: POSTGRES_HOST: localhost @@ -144,7 +144,7 @@ jobs: - name: Run end-to-end tests run: | set -euo pipefail - ./run-tests.sh -count=1 -v -json . 2>&1 | tee test-e2e-runner.log | gotestfmt + ./run-tests.sh -count=1 -v -json . 2>&1 -hide all | tee test-e2e-runner.log | gotestfmt working-directory: tests-e2e shell: bash env: From f74794bcb4d10b6efb5210805a4fa9e48ebc72a3 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 26 Jul 2023 11:52:47 +0100 Subject: [PATCH 121/156] Include txn_id (if present) in room event updates --- sync3/caches/global.go | 9 +++++++++ sync3/dispatcher.go | 17 +++++++++-------- sync3/handler/txn_id_waiter.go | 8 ++++++-- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/sync3/caches/global.go b/sync3/caches/global.go index 10d5c342..fd6d80f6 100644 --- a/sync3/caches/global.go +++ b/sync3/caches/global.go @@ -21,6 +21,15 @@ type EventData struct { Content gjson.Result Timestamp uint64 Sender string + // TransactionID is the unsigned.transaction_id field in the event as stored in the + // syncv3_events table, or the empty string if there is no such field. + // + // We may see the event on poller A without a transaction_id, and then later on + // poller B with a transaction_id. If this happens, we make a temporary note of the + // transaction_id in the syncv3_txns table, but do not edit the persisted event. + // This means that this field is not authoritative; we only include it here as a + // hint to avoid unnecessary waits for V2TransactionID payloads. + TransactionID string // the number of joined users in this room. Use this value and don't try to work it out as you // may get it wrong due to Synapse sending duplicate join events(!) 
This value has them de-duped diff --git a/sync3/dispatcher.go b/sync3/dispatcher.go index 3d80ff9a..12ddfeb9 100644 --- a/sync3/dispatcher.go +++ b/sync3/dispatcher.go @@ -87,14 +87,15 @@ func (d *Dispatcher) newEventData(event json.RawMessage, roomID string, latestPo eventType := ev.Get("type").Str return &caches.EventData{ - Event: event, - RoomID: roomID, - EventType: eventType, - StateKey: stateKey, - Content: ev.Get("content"), - NID: latestPos, - Timestamp: ev.Get("origin_server_ts").Uint(), - Sender: ev.Get("sender").Str, + Event: event, + RoomID: roomID, + EventType: eventType, + StateKey: stateKey, + Content: ev.Get("content"), + NID: latestPos, + Timestamp: ev.Get("origin_server_ts").Uint(), + Sender: ev.Get("sender").Str, + TransactionID: ev.Get("unsigned.transaction_id").Str, } } diff --git a/sync3/handler/txn_id_waiter.go b/sync3/handler/txn_id_waiter.go index 4942ef35..d4369d9c 100644 --- a/sync3/handler/txn_id_waiter.go +++ b/sync3/handler/txn_id_waiter.go @@ -33,9 +33,13 @@ func (t *TxnIDWaiter) Ingest(up caches.Update) { } roomID := eventUpdate.EventData.RoomID + + // We only want to queue this event if + // - our user sent it AND it lacks a txn_id; OR + // - the room already has queued events. _, roomQueued := t.queues[roomID] - // We only want to queue this event if our user sent it, or if the room already has queued events. - if eventUpdate.EventData.Sender != t.userID && roomQueued { + missingTxnID := eventUpdate.EventData.Sender == t.userID && eventUpdate.EventData.TransactionID == "" + if !(missingTxnID || roomQueued) { t.publish(up) return } From 5904e5a3c77c0b8d37df1aa73ba47968d97c217e Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 26 Jul 2023 12:23:38 +0100 Subject: [PATCH 122/156] Make transaction id delay time configurable --- cmd/syncv3/main.go | 7 ++++--- sync3/handler/connstate.go | 4 ++-- sync3/handler/handler.go | 5 ++++- sync3/handler/txn_id_waiter.go | 10 +++++----- tests-integration/v3_test.go | 4 ++++ v3.go | 5 ++++- 6 files changed, 23 insertions(+), 12 deletions(-) diff --git a/cmd/syncv3/main.go b/cmd/syncv3/main.go index a43fa3a1..66b19b3d 100644 --- a/cmd/syncv3/main.go +++ b/cmd/syncv3/main.go @@ -173,9 +173,10 @@ func main() { panic("invalid value for " + EnvMaxConns + ": " + args[EnvMaxConns]) } h2, h3 := syncv3.Setup(args[EnvServer], args[EnvDB], args[EnvSecret], syncv3.Opts{ - AddPrometheusMetrics: args[EnvPrometheus] != "", - DBMaxConns: maxConnsInt, - DBConnMaxIdleTime: time.Hour, + AddPrometheusMetrics: args[EnvPrometheus] != "", + DBMaxConns: maxConnsInt, + DBConnMaxIdleTime: time.Hour, + MaxTransactionIDDelay: time.Second, }) go h2.StartV2Pollers() diff --git a/sync3/handler/connstate.go b/sync3/handler/connstate.go index f9a3a212..ee65781c 100644 --- a/sync3/handler/connstate.go +++ b/sync3/handler/connstate.go @@ -60,7 +60,7 @@ type ConnState struct { func NewConnState( userID, deviceID string, userCache *caches.UserCache, globalCache *caches.GlobalCache, ex extensions.HandlerInterface, joinChecker JoinChecker, setupHistVec *prometheus.HistogramVec, histVec *prometheus.HistogramVec, - maxPendingEventUpdates int, + maxPendingEventUpdates int, maxTransactionIDDelay time.Duration, ) *ConnState { cs := &ConnState{ globalCache: globalCache, @@ -81,7 +81,7 @@ func NewConnState( ConnState: cs, updates: make(chan caches.Update, maxPendingEventUpdates), } - cs.txnIDWaiter = NewTxnIDWaiter(userID, cs.live.onUpdate, cs.subscribedOrVisible) + cs.txnIDWaiter = NewTxnIDWaiter(userID, maxTransactionIDDelay, cs.live.onUpdate, 
cs.subscribedOrVisible) // subscribe for updates before loading. We risk seeing dupes but that's fine as load positions // will stop us double-processing. cs.userCacheID = cs.userCache.Subsribe(cs) diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index c0cb8d2f..117b456c 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -58,6 +58,7 @@ type SyncLiveHandler struct { GlobalCache *caches.GlobalCache maxPendingEventUpdates int + maxTransactionIDDelay time.Duration setupHistVec *prometheus.HistogramVec histVec *prometheus.HistogramVec @@ -67,6 +68,7 @@ type SyncLiveHandler struct { func NewSync3Handler( store *state.Storage, storev2 *sync2.Storage, v2Client sync2.Client, secret string, pub pubsub.Notifier, sub pubsub.Listener, enablePrometheus bool, maxPendingEventUpdates int, + maxTransactionIDDelay time.Duration, ) (*SyncLiveHandler, error) { logger.Info().Msg("creating handler") sh := &SyncLiveHandler{ @@ -78,6 +80,7 @@ func NewSync3Handler( Dispatcher: sync3.NewDispatcher(), GlobalCache: caches.NewGlobalCache(store), maxPendingEventUpdates: maxPendingEventUpdates, + maxTransactionIDDelay: maxTransactionIDDelay, } sh.Extensions = &extensions.Handler{ Store: store, @@ -411,7 +414,7 @@ func (h *SyncLiveHandler) setupConnection(req *http.Request, syncReq *sync3.Requ // to check for an existing connection though, as it's possible for the client to call /sync // twice for a new connection. conn, created := h.ConnMap.CreateConn(connID, func() sync3.ConnHandler { - return NewConnState(token.UserID, token.DeviceID, userCache, h.GlobalCache, h.Extensions, h.Dispatcher, h.setupHistVec, h.histVec, h.maxPendingEventUpdates) + return NewConnState(token.UserID, token.DeviceID, userCache, h.GlobalCache, h.Extensions, h.Dispatcher, h.setupHistVec, h.histVec, h.maxPendingEventUpdates, h.maxTransactionIDDelay) }) if created { log.Info().Msg("created new connection") diff --git a/sync3/handler/txn_id_waiter.go b/sync3/handler/txn_id_waiter.go index d4369d9c..3ad4434a 100644 --- a/sync3/handler/txn_id_waiter.go +++ b/sync3/handler/txn_id_waiter.go @@ -5,22 +5,22 @@ import ( "time" ) -var maxDelay = 1 * time.Second - type TxnIDWaiter struct { userID string publish func(update caches.Update) subscribedOrVisible func(roomID string) bool // TODO: probably need a mutex around t.queues so the expiry won't race with enqueuing - queues map[string][]*caches.RoomEventUpdate + queues map[string][]*caches.RoomEventUpdate + maxDelay time.Duration } -func NewTxnIDWaiter(userID string, publish func(caches.Update), subscribedOrVisible func(string) bool) *TxnIDWaiter { +func NewTxnIDWaiter(userID string, maxDelay time.Duration, publish func(caches.Update), subscribedOrVisible func(string) bool) *TxnIDWaiter { return &TxnIDWaiter{ userID: userID, publish: publish, subscribedOrVisible: subscribedOrVisible, queues: make(map[string][]*caches.RoomEventUpdate), + maxDelay: maxDelay, // TODO: metric that tracks how long events were queued for. } } @@ -58,7 +58,7 @@ func (t *TxnIDWaiter) Ingest(up caches.Update) { // TODO: bound the queue size? 
t.queues[roomID] = append(queue, eventUpdate) - time.AfterFunc(maxDelay, func() { t.publishUpToNID(roomID, eventUpdate.EventData.NID) }) + time.AfterFunc(t.maxDelay, func() { t.publishUpToNID(roomID, eventUpdate.EventData.NID) }) } func (t *TxnIDWaiter) publishUpToNID(roomID string, publishNID int64) { diff --git a/tests-integration/v3_test.go b/tests-integration/v3_test.go index de72d954..ea1e4300 100644 --- a/tests-integration/v3_test.go +++ b/tests-integration/v3_test.go @@ -370,6 +370,7 @@ func runTestServer(t testutils.TestBenchInterface, v2Server *testV2Server, postg TestingSynchronousPubsub: true, // critical to avoid flakey tests AddPrometheusMetrics: false, MaxPendingEventUpdates: 200, + MaxTransactionIDDelay: 1 * time.Millisecond, } if len(opts) > 0 { opt := opts[0] @@ -380,6 +381,9 @@ func runTestServer(t testutils.TestBenchInterface, v2Server *testV2Server, postg combinedOpts.MaxPendingEventUpdates = opt.MaxPendingEventUpdates handler.BufferWaitTime = 5 * time.Millisecond } + if opt.MaxTransactionIDDelay > 0 { + combinedOpts.MaxTransactionIDDelay = opt.MaxTransactionIDDelay + } } h2, h3 := syncv3.Setup(v2Server.url(), postgresConnectionString, os.Getenv("SYNCV3_SECRET"), combinedOpts) // for ease of use we don't start v2 pollers at startup in tests diff --git a/v3.go b/v3.go index e8948e94..0d6debe4 100644 --- a/v3.go +++ b/v3.go @@ -37,6 +37,9 @@ type Opts struct { // if true, publishing messages will block until the consumer has consumed it. // Assumes a single producer and a single consumer. TestingSynchronousPubsub bool + // MaxTransactionIDDelay is the longest amount of time that we will wait for + // confirmation of an event's transaction_id before sending it to its sender. + MaxTransactionIDDelay time.Duration DBMaxConns int DBConnMaxIdleTime time.Duration @@ -115,7 +118,7 @@ func Setup(destHomeserver, postgresURI, secret string, opts Opts) (*handler2.Han pMap.SetCallbacks(h2) // create v3 handler - h3, err := handler.NewSync3Handler(store, storev2, v2Client, secret, pubSub, pubSub, opts.AddPrometheusMetrics, opts.MaxPendingEventUpdates) + h3, err := handler.NewSync3Handler(store, storev2, v2Client, secret, pubSub, pubSub, opts.AddPrometheusMetrics, opts.MaxPendingEventUpdates, opts.MaxTransactionIDDelay) if err != nil { panic(err) } From d7440ab5a13990fd4652a88adba451e26c491e8a Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 26 Jul 2023 12:43:50 +0100 Subject: [PATCH 123/156] correctly pass arg to gotestfmt --- .github/workflows/tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 481feb8d..09e2526b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -59,7 +59,7 @@ jobs: - name: Test run: | set -euo pipefail - go test -count=1 -covermode=atomic -coverpkg ./... -p 1 -v -json $(go list ./... | grep -v tests-e2e) -hide all -coverprofile synccoverage.out 2>&1 | tee ./test-integration.log | gotestfmt + go test -count=1 -covermode=atomic -coverpkg ./... -p 1 -v -json $(go list ./... | grep -v tests-e2e) -coverprofile synccoverage.out 2>&1 | tee ./test-integration.log | gotestfmt -hide all shell: bash env: POSTGRES_HOST: localhost @@ -144,7 +144,7 @@ jobs: - name: Run end-to-end tests run: | set -euo pipefail - ./run-tests.sh -count=1 -v -json . 2>&1 -hide all | tee test-e2e-runner.log | gotestfmt + ./run-tests.sh -count=1 -v -json . 
2>&1 | tee test-e2e-runner.log | gotestfmt -hide all working-directory: tests-e2e shell: bash env: From 1bb082d9fb9319b82642660f08f75315e05a606b Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 26 Jul 2023 12:51:16 +0100 Subject: [PATCH 124/156] fixup NewConnState args --- sync3/handler/connstate_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sync3/handler/connstate_test.go b/sync3/handler/connstate_test.go index 17700ea8..a69ac8ce 100644 --- a/sync3/handler/connstate_test.go +++ b/sync3/handler/connstate_test.go @@ -107,7 +107,7 @@ func TestConnStateInitial(t *testing.T) { } return result } - cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000) + cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000, time.Millisecond) if userID != cs.UserID() { t.Fatalf("UserID returned wrong value, got %v want %v", cs.UserID(), userID) } @@ -272,7 +272,7 @@ func TestConnStateMultipleRanges(t *testing.T) { userCache.LazyRoomDataOverride = mockLazyRoomOverride dispatcher.Register(context.Background(), userCache.UserID, userCache) dispatcher.Register(context.Background(), sync3.DispatcherAllUsers, globalCache) - cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000) + cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000, time.Millisecond) // request first page res, err := cs.OnIncomingRequest(context.Background(), ConnID, &sync3.Request{ @@ -451,7 +451,7 @@ func TestBumpToOutsideRange(t *testing.T) { userCache.LazyRoomDataOverride = mockLazyRoomOverride dispatcher.Register(context.Background(), userCache.UserID, userCache) dispatcher.Register(context.Background(), sync3.DispatcherAllUsers, globalCache) - cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000) + cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000, time.Millisecond) // Ask for A,B res, err := cs.OnIncomingRequest(context.Background(), ConnID, &sync3.Request{ Lists: map[string]sync3.RequestList{"a": { @@ -562,7 +562,7 @@ func TestConnStateRoomSubscriptions(t *testing.T) { } dispatcher.Register(context.Background(), userCache.UserID, userCache) dispatcher.Register(context.Background(), sync3.DispatcherAllUsers, globalCache) - cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000) + cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000, time.Millisecond) // subscribe to room D res, err := cs.OnIncomingRequest(context.Background(), ConnID, &sync3.Request{ RoomSubscriptions: map[string]sync3.RoomSubscription{ From d67445b6af16710f57ea469a9cfc42785b1cedec Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 26 Jul 2023 13:07:06 +0100 Subject: [PATCH 125/156] Correctly pop from left of queue --- sync3/handler/txn_id_waiter.go | 1 + 1 file changed, 1 insertion(+) diff --git a/sync3/handler/txn_id_waiter.go b/sync3/handler/txn_id_waiter.go index 3ad4434a..289fc52e 100644 --- a/sync3/handler/txn_id_waiter.go +++ b/sync3/handler/txn_id_waiter.go @@ -77,6 +77,7 @@ func (t *TxnIDWaiter) publishUpToNID(roomID string, publishNID int64) { // Now queue[:i] has events with nid <= publishNID, and 
queue[i:] has nids > publishNID. // strip off the first i events from the slice and publish them. toPublish, queue := queue[:i], queue[i:] + t.queues[roomID] = queue for _, eventUpdate := range toPublish { t.publish(eventUpdate) } From 3af56247560be5ad7b02682c644aa6ffdd3fdc66 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 26 Jul 2023 13:25:09 +0100 Subject: [PATCH 126/156] runTestServer: if opts are given, specify MaxDelay explicitly --- tests-integration/v3_test.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests-integration/v3_test.go b/tests-integration/v3_test.go index ea1e4300..c2435b97 100644 --- a/tests-integration/v3_test.go +++ b/tests-integration/v3_test.go @@ -377,13 +377,11 @@ func runTestServer(t testutils.TestBenchInterface, v2Server *testV2Server, postg combinedOpts.AddPrometheusMetrics = opt.AddPrometheusMetrics combinedOpts.DBConnMaxIdleTime = opt.DBConnMaxIdleTime combinedOpts.DBMaxConns = opt.DBMaxConns + combinedOpts.MaxTransactionIDDelay = opt.MaxTransactionIDDelay if opt.MaxPendingEventUpdates > 0 { combinedOpts.MaxPendingEventUpdates = opt.MaxPendingEventUpdates handler.BufferWaitTime = 5 * time.Millisecond } - if opt.MaxTransactionIDDelay > 0 { - combinedOpts.MaxTransactionIDDelay = opt.MaxTransactionIDDelay - } } h2, h3 := syncv3.Setup(v2Server.url(), postgresConnectionString, os.Getenv("SYNCV3_SECRET"), combinedOpts) // for ease of use we don't start v2 pollers at startup in tests From 726868c78b842f243b9239b27e1be6315beb6fc6 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 26 Jul 2023 13:25:44 +0100 Subject: [PATCH 127/156] Bypass queue logic if delay is turned off --- sync3/handler/txn_id_waiter.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sync3/handler/txn_id_waiter.go b/sync3/handler/txn_id_waiter.go index 289fc52e..28d5d594 100644 --- a/sync3/handler/txn_id_waiter.go +++ b/sync3/handler/txn_id_waiter.go @@ -26,6 +26,11 @@ func NewTxnIDWaiter(userID string, maxDelay time.Duration, publish func(caches.U } func (t *TxnIDWaiter) Ingest(up caches.Update) { + if t.maxDelay <= 0 { + t.publish(up) + return + } + eventUpdate, isEventUpdate := up.(*caches.RoomEventUpdate) if !isEventUpdate { t.publish(up) From 9d864067e7e26b208fb396f0b8bbb6edfefc85a5 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 26 Jul 2023 13:45:45 +0100 Subject: [PATCH 128/156] make Publish func public --- sync3/handler/txn_id_waiter.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sync3/handler/txn_id_waiter.go b/sync3/handler/txn_id_waiter.go index 28d5d594..5c2e00ff 100644 --- a/sync3/handler/txn_id_waiter.go +++ b/sync3/handler/txn_id_waiter.go @@ -63,10 +63,10 @@ func (t *TxnIDWaiter) Ingest(up caches.Update) { // TODO: bound the queue size? 
t.queues[roomID] = append(queue, eventUpdate) - time.AfterFunc(t.maxDelay, func() { t.publishUpToNID(roomID, eventUpdate.EventData.NID) }) + time.AfterFunc(t.maxDelay, func() { t.PublishUpToNID(roomID, eventUpdate.EventData.NID) }) } -func (t *TxnIDWaiter) publishUpToNID(roomID string, publishNID int64) { +func (t *TxnIDWaiter) PublishUpToNID(roomID string, publishNID int64) { queue, exists := t.queues[roomID] if !exists { return From fe74488a58985254900ff9d20d545dfba7822727 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 26 Jul 2023 13:47:33 +0100 Subject: [PATCH 129/156] Clear queues on receipt of txn payload --- sync3/conn.go | 1 + sync3/connmap.go | 9 +++++++++ sync3/handler/connstate.go | 4 ++++ sync3/handler/handler.go | 4 +--- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/sync3/conn.go b/sync3/conn.go index 33fd87bb..d9d3682a 100644 --- a/sync3/conn.go +++ b/sync3/conn.go @@ -32,6 +32,7 @@ type ConnHandler interface { // status code to send back. OnIncomingRequest(ctx context.Context, cid ConnID, req *Request, isInitial bool, start time.Time) (*Response, error) OnUpdate(ctx context.Context, update caches.Update) + PublishEventsUpTo(roomID string, nid int64) Destroy() Alive() bool } diff --git a/sync3/connmap.go b/sync3/connmap.go index 0cae2748..8d08dd27 100644 --- a/sync3/connmap.go +++ b/sync3/connmap.go @@ -208,3 +208,12 @@ func (m *ConnMap) closeConn(conn *Conn) { h.Destroy() m.updateMetrics(len(m.connIDToConn)) } + +func (m *ConnMap) ClearUpdateQueues(userID, roomID string, nid int64) { + m.mu.Lock() + defer m.mu.Unlock() + + for _, conn := range m.userIDToConn[userID] { + conn.handler.PublishEventsUpTo(roomID, nid) + } +} diff --git a/sync3/handler/connstate.go b/sync3/handler/connstate.go index ee65781c..4be31caa 100644 --- a/sync3/handler/connstate.go +++ b/sync3/handler/connstate.go @@ -700,6 +700,10 @@ func (s *ConnState) subscribedOrVisible(roomID string) bool { return s.lists.Visible(roomID, s.muxedReq.Lists) } +func (s *ConnState) PublishEventsUpTo(roomID string, nid int64) { + s.txnIDWaiter.PublishUpToNID(roomID, nid) +} + // clampSliceRangeToListSize helps us to send client-friendly SYNC and INVALIDATE ranges. // // Suppose the client asks for a window on positions [10, 19]. If the list diff --git a/sync3/handler/handler.go b/sync3/handler/handler.go index 117b456c..a8a56c0a 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -629,9 +629,7 @@ func (h *SyncLiveHandler) OnTransactionID(p *pubsub.V2TransactionID) { // There is some event E for which we now have a transaction ID, or else now know // that we will never get a transaction ID. In either case, tell the sender's // connections to unblock that event in the transaction ID waiter. - - // TODO implement me. Something like - // h.ConnMap.ClearUpdateQueues(p.UserID, p.RoomID, p.NID) + h.ConnMap.ClearUpdateQueues(p.UserID, p.RoomID, p.NID) } // Called from the v2 poller, implements V2DataReceiver From f3750d219a35609f7e7fd57ba80ef57eca0312eb Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 26 Jul 2023 14:06:05 +0100 Subject: [PATCH 130/156] Tweak comments --- sync2/txnid.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sync2/txnid.go b/sync2/txnid.go index 642585ea..b5c2b21d 100644 --- a/sync2/txnid.go +++ b/sync2/txnid.go @@ -11,13 +11,15 @@ import ( type loaderFunc func(userID string) (deviceIDs []string) // PendingTransactionIDs is (conceptually) a map from event IDs to a list of device IDs. 
-// Its keys the IDs of event we've seen which a) lack a transaction ID, and b) were sent
-// by one of the users we are polling for. The values are the list of the sender's
+// Its keys are the IDs of events we've seen which a) lack a transaction ID, and b) were
+// sent by one of the users we are polling for. The values are the list of the sender's
 // devices whose pollers are yet to see a transaction ID.
 //
 // If another poller sees the same event
+//
 //  - with a transaction ID, it emits a V2TransactionID payload with that ID and
 //    removes the event ID from this map.
+//
 //  - without a transaction ID, it removes the polling device ID from the values
 //    list. If the device ID list is now empty, the poller emits an "all clear"
 //    V2TransactionID payload.
@@ -88,6 +90,9 @@ func (c *PendingTransactionIDs) SeenTxnID(eventID string) error {
 	return c.pending.Set(eventID, []string{})
 }
 
+// removeDevice takes a device ID slice and returns a device ID slice with one
+// particular string removed. Assumes that the given slice has no duplicates.
+// Does not modify the given slice in situ.
 func removeDevice(device string, devices []string) ([]string, bool) {
From ad309b5a2f13063a0acc4b830391300389ecfd1d Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Wed, 26 Jul 2023 14:06:12 +0100
Subject: [PATCH 131/156] Remove unused matcher

---
 testutils/m/match.go | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/testutils/m/match.go b/testutils/m/match.go
index 18524619..f8f6c6a3 100644
--- a/testutils/m/match.go
+++ b/testutils/m/match.go
@@ -283,15 +283,6 @@ func MatchRoomSubscriptions(wantSubs map[string][]RoomMatcher) RespMatcher {
 	}
 }
 
-func MatchNoLists() RespMatcher {
-	return func(res *sync3.Response) error {
-		if len(res.Lists) != 0 {
-			return fmt.Errorf("expected no lists in response, got %d lists: %v", len(res.Lists), res.Lists)
-		}
-		return nil
-	}
-}
-
 func MatchNoE2EEExtension() RespMatcher {
 	return func(res *sync3.Response) error {
 		if res.Extensions.E2EE != nil {
From 14113e215fdc7cdd5aa7bf48d1b2d6627f5bdce6 Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Wed, 26 Jul 2023 16:29:08 +0100
Subject: [PATCH 132/156] WIP test cases for the TxnIDWaiter

---
 sync3/handler/connstate.go     |  9 ++++++-
 sync3/handler/txn_id_waiter.go | 47 ++++++++++++++++++++++------------
 2 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/sync3/handler/connstate.go b/sync3/handler/connstate.go
index 4be31caa..2518ea74 100644
--- a/sync3/handler/connstate.go
+++ b/sync3/handler/connstate.go
@@ -81,7 +81,14 @@ func NewConnState(
 		ConnState: cs,
 		updates:   make(chan caches.Update, maxPendingEventUpdates),
 	}
-	cs.txnIDWaiter = NewTxnIDWaiter(userID, maxTransactionIDDelay, cs.live.onUpdate, cs.subscribedOrVisible)
+	cs.txnIDWaiter = NewTxnIDWaiter(
+		userID,
+		maxTransactionIDDelay,
+		func(delayed bool, update caches.Update) {
+			cs.live.onUpdate(update)
+		},
+		cs.subscribedOrVisible,
+	)
 	// subscribe for updates before loading. We risk seeing dupes but that's fine as load positions
 	// will stop us double-processing.
cs.userCacheID = cs.userCache.Subsribe(cs) diff --git a/sync3/handler/txn_id_waiter.go b/sync3/handler/txn_id_waiter.go index 5c2e00ff..94954970 100644 --- a/sync3/handler/txn_id_waiter.go +++ b/sync3/handler/txn_id_waiter.go @@ -7,14 +7,14 @@ import ( type TxnIDWaiter struct { userID string - publish func(update caches.Update) + publish func(delayed bool, update caches.Update) subscribedOrVisible func(roomID string) bool // TODO: probably need a mutex around t.queues so the expiry won't race with enqueuing queues map[string][]*caches.RoomEventUpdate maxDelay time.Duration } -func NewTxnIDWaiter(userID string, maxDelay time.Duration, publish func(caches.Update), subscribedOrVisible func(string) bool) *TxnIDWaiter { +func NewTxnIDWaiter(userID string, maxDelay time.Duration, publish func(bool, caches.Update), subscribedOrVisible func(string) bool) *TxnIDWaiter { return &TxnIDWaiter{ userID: userID, publish: publish, @@ -27,43 +27,44 @@ func NewTxnIDWaiter(userID string, maxDelay time.Duration, publish func(caches.U func (t *TxnIDWaiter) Ingest(up caches.Update) { if t.maxDelay <= 0 { - t.publish(up) + t.publish(false, up) return } eventUpdate, isEventUpdate := up.(*caches.RoomEventUpdate) if !isEventUpdate { - t.publish(up) + t.publish(false, up) return } - roomID := eventUpdate.EventData.RoomID - + ed := eventUpdate.EventData // We only want to queue this event if // - our user sent it AND it lacks a txn_id; OR // - the room already has queued events. - _, roomQueued := t.queues[roomID] - missingTxnID := eventUpdate.EventData.Sender == t.userID && eventUpdate.EventData.TransactionID == "" + _, roomQueued := t.queues[ed.RoomID] + missingTxnID := ed.Sender == t.userID && ed.TransactionID == "" if !(missingTxnID || roomQueued) { - t.publish(up) + t.publish(false, up) return } // Don't bother queuing the event if the room isn't visible to the user. - if !t.subscribedOrVisible(roomID) { - t.publish(up) + if !t.subscribedOrVisible(ed.RoomID) { + t.publish(false, up) return } // We've decided to queue the event. - queue, exists := t.queues[roomID] + queue, exists := t.queues[ed.RoomID] if !exists { queue = make([]*caches.RoomEventUpdate, 0, 10) } // TODO: bound the queue size? - t.queues[roomID] = append(queue, eventUpdate) + t.queues[ed.RoomID] = append(queue, eventUpdate) + logger.Trace().Str("room_id", ed.RoomID).Ints64("q", nids(t.queues[ed.RoomID])).Msgf("enqueue event NID %d", ed.NID) - time.AfterFunc(t.maxDelay, func() { t.PublishUpToNID(roomID, eventUpdate.EventData.NID) }) + // TODO: if t gets gced, will this function still run? If so, will things explode? + time.AfterFunc(t.maxDelay, func() { t.PublishUpToNID(ed.RoomID, ed.NID) }) } func (t *TxnIDWaiter) PublishUpToNID(roomID string, publishNID int64) { @@ -82,8 +83,22 @@ func (t *TxnIDWaiter) PublishUpToNID(roomID string, publishNID int64) { // Now queue[:i] has events with nid <= publishNID, and queue[i:] has nids > publishNID. // strip off the first i events from the slice and publish them. 
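 	// For example: if the queue holds NIDs [2, 3, 5, 8] and publishNID is 5,
 	// then i ends up as 3, so we publish [2, 3, 5] in order and keep [8] queued.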
toPublish, queue := queue[:i], queue[i:]
-	t.queues[roomID] = queue
+	if len(queue) == 0 {
+		delete(t.queues, roomID)
+	} else {
+		t.queues[roomID] = queue
+	}
+
+	logger.Trace().Str("room_id", roomID).Ints64("q", nids(queue)).Msgf("publish event up to NID %d", publishNID)
 	for _, eventUpdate := range toPublish {
-		t.publish(eventUpdate)
+		t.publish(true, eventUpdate)
+	}
+}
+
+func nids(updates []*caches.RoomEventUpdate) []int64 {
+	rv := make([]int64, len(updates))
+	for i, up := range updates {
+		rv[i] = up.EventData.NID
 	}
+	return rv
 }

From 86d156c5113a39c500622d86612bb7f4dddbdadb Mon Sep 17 00:00:00 2001
From: Kegan Dougal
Date: Wed, 26 Jul 2023 17:12:42 +0100
Subject: [PATCH 133/156] Actually use the modified Event in result

---
 state/event_table.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/state/event_table.go b/state/event_table.go
index 1c360b1b..c01b454e 100644
--- a/state/event_table.go
+++ b/state/event_table.go
@@ -453,14 +453,14 @@ func filterAndEnsureFieldsSet(events []Event) []Event {
 	result := make([]Event, 0, len(events))
 	// ensure fields are set
 	for i := range events {
-		ev := events[i]
+		ev := &events[i]
 		if err := ev.ensureFieldsSetOnEvent(); err != nil {
 			logger.Warn().Str("event_id", ev.ID).Err(err).Msg(
 				"filterAndEnsureFieldsSet: failed to parse event, ignoring",
 			)
 			continue
 		}
-		result = append(result, ev)
+		result = append(result, *ev)
 	}
 	return result
 }

From eb401bcb770d2a16248d2847c6e22adc7580810e Mon Sep 17 00:00:00 2001
From: Sami Olmari
Date: Thu, 27 Jul 2023 00:17:21 +0300
Subject: [PATCH 134/156] Fix Nginx-example regex case-sensitivity.

Matrix endpoints are specified to always be case-sensitive, so fix the
Nginx example to match case-sensitively instead of case-insensitively.

Signed-off-by: Sami Olmari

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 26ea18d7..963f81f5 100644
--- a/README.md
+++ b/README.md
@@ -88,14 +88,14 @@ In both cases, the path `https://example.com/.well-known/matrix/client` must ret
 
 #### Same hostname
 The following nginx configuration can be used to pass the required endpoints to the sync proxy, running on local port 8009 (so as to not conflict with Synapse):
 ```nginx
-location ~* ^/(client/|_matrix/client/unstable/org.matrix.msc3575/sync) {
+location ~ ^/(client/|_matrix/client/unstable/org.matrix.msc3575/sync) {
     proxy_pass http://localhost:8009;
     proxy_set_header X-Forwarded-For $remote_addr;
     proxy_set_header X-Forwarded-Proto $scheme;
     proxy_set_header Host $host;
 }
 
-location ~* ^(\/_matrix|\/_synapse\/client) {
+location ~ ^(\/_matrix|\/_synapse\/client) {
    proxy_pass http://localhost:8008;
    proxy_set_header X-Forwarded-For $remote_addr;
    proxy_set_header X-Forwarded-Proto $scheme;

From 6c81134056dfec8c51db3f3849ec5030499b88e6 Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Thu, 27 Jul 2023 12:01:06 +0100
Subject: [PATCH 135/156] PollerMap.deviceIDs: use zero-inited string slice

---
 sync2/poller.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sync2/poller.go b/sync2/poller.go
index 95931d60..77e7767f 100644
--- a/sync2/poller.go
+++ b/sync2/poller.go
@@ -204,7 +204,7 @@ func (h *PollerMap) NumPollers() (count int) {
 func (h *PollerMap) deviceIDs(userID string) []string {
 	h.pollerMu.Lock()
 	defer h.pollerMu.Unlock()
-	devices := make([]string, 0)
+	var devices []string
 	for _, p := range h.Pollers {
 		if !p.terminated.Load() && p.userID == userID {
 			devices = append(devices, p.deviceID)

From 210e95b0d89c27dfe08bd220b2b7923fc0b6c45d Mon Sep 17
00:00:00 2001 From: David Robertson Date: Thu, 27 Jul 2023 12:01:37 +0100 Subject: [PATCH 136/156] PendingTransactionIDs: defer mutex unlock --- sync2/txnid.go | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/sync2/txnid.go b/sync2/txnid.go index b5c2b21d..79e7c006 100644 --- a/sync2/txnid.go +++ b/sync2/txnid.go @@ -35,7 +35,7 @@ type loaderFunc func(userID string) (deviceIDs []string) // To avoid the map growing without bound, we use a ttlcache and drop entries // after a short period of time. type PendingTransactionIDs struct { - // mu guards the pending field. + // mu guards the pending field. See MissingTxnID for rationale. mu sync.Mutex pending *ttlcache.Cache // loader should provide the list of device IDs @@ -57,6 +57,24 @@ func NewPendingTransactionIDs(loader loaderFunc) *PendingTransactionIDs { // transaction ID for this event ID. Returns true if this is the first time we know // for sure that we'll never see a txn ID for this event. func (c *PendingTransactionIDs) MissingTxnID(eventID, userID, myDeviceID string) (bool, error) { + // While ttlcache is threadsafe, it does not provide a way to atomically update + // (get+set) a value, which means we are still open to races. For example: + // + // - We have three pollers A, B, C. + // - Poller A sees an event without txn id and calls MissingTxnID. + // - `c.pending.Get()` fails, so we load up all device IDs: [A, B, C]. + // - Then `c.pending.Set()` with [B, C]. + // - Poller B sees the same event, also missing txn ID and calls MissingTxnID. + // - Poller C does the same concurrently. + // + // If the Get+Set isn't atomic, then we might do e.g. + // - B gets [B, C] and prepares to write [C]. + // - C gets [B, C] and prepares to write [B]. + // - Last writer wins. Either way, we never write [] and so never return true + // (the all-clear signal.) + // + // This wouldn't be the end of the world (the API process has a maximum delay, and + // the ttlcache will expire the entry), but it would still be nice to avoid it. c.mu.Lock() defer c.mu.Unlock() @@ -86,7 +104,7 @@ func (c *PendingTransactionIDs) MissingTxnID(eventID, userID, myDeviceID string) // for this event. func (c *PendingTransactionIDs) SeenTxnID(eventID string) error { c.mu.Lock() - c.mu.Unlock() + defer c.mu.Unlock() return c.pending.Set(eventID, []string{}) } From 5d986294ba000bd2741b5c04eb40e772e5df2e3a Mon Sep 17 00:00:00 2001 From: David Robertson Date: Thu, 27 Jul 2023 12:10:46 +0100 Subject: [PATCH 137/156] Fixup tests --- sync2/txnid_test.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sync2/txnid_test.go b/sync2/txnid_test.go index 844957c3..f9b15990 100644 --- a/sync2/txnid_test.go +++ b/sync2/txnid_test.go @@ -67,9 +67,9 @@ func TestPendingTransactionIDs(t *testing.T) { // Delia is tracking four devices. allClear, err = pending.MissingTxnID("event4", "delia", "D1") assertNoError(t, err) - assertAllClear(t, allClear, false) // waiting on E2, E3 and E4 + assertAllClear(t, allClear, false) // waiting on D2, D3 and D4 - // One of Delia's devices, say D2, sees a txn ID for E4. + // One of Delia's devices, say D2, sees a txn ID for event 4. 
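+	// (Per SeenTxnID's implementation above, this stores an empty device list for
+	// event4, so later MissingTxnID calls from D3 or D4 cannot trigger a spurious
+	// all-clear.)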
err = pending.SeenTxnID("event4") assertNoError(t, err) @@ -94,6 +94,7 @@ func TestPendingTransactionIDs(t *testing.T) { } func assertAllClear(t *testing.T, got bool, want bool) { + t.Helper() if got != want { t.Errorf("Expected allClear=%t, got %t", want, got) } From 6e8bbcc052947e1b97dbe127f62b4c6433ce78fa Mon Sep 17 00:00:00 2001 From: David Robertson Date: Thu, 27 Jul 2023 12:14:10 +0100 Subject: [PATCH 138/156] Don't use txn ID buffering in integration tests --- sync3/handler/connstate_test.go | 8 ++++---- tests-integration/v3_test.go | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sync3/handler/connstate_test.go b/sync3/handler/connstate_test.go index a69ac8ce..46c00d6c 100644 --- a/sync3/handler/connstate_test.go +++ b/sync3/handler/connstate_test.go @@ -107,7 +107,7 @@ func TestConnStateInitial(t *testing.T) { } return result } - cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000, time.Millisecond) + cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000, 0) if userID != cs.UserID() { t.Fatalf("UserID returned wrong value, got %v want %v", cs.UserID(), userID) } @@ -272,7 +272,7 @@ func TestConnStateMultipleRanges(t *testing.T) { userCache.LazyRoomDataOverride = mockLazyRoomOverride dispatcher.Register(context.Background(), userCache.UserID, userCache) dispatcher.Register(context.Background(), sync3.DispatcherAllUsers, globalCache) - cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000, time.Millisecond) + cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000, 0) // request first page res, err := cs.OnIncomingRequest(context.Background(), ConnID, &sync3.Request{ @@ -451,7 +451,7 @@ func TestBumpToOutsideRange(t *testing.T) { userCache.LazyRoomDataOverride = mockLazyRoomOverride dispatcher.Register(context.Background(), userCache.UserID, userCache) dispatcher.Register(context.Background(), sync3.DispatcherAllUsers, globalCache) - cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000, time.Millisecond) + cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000, 0) // Ask for A,B res, err := cs.OnIncomingRequest(context.Background(), ConnID, &sync3.Request{ Lists: map[string]sync3.RequestList{"a": { @@ -562,7 +562,7 @@ func TestConnStateRoomSubscriptions(t *testing.T) { } dispatcher.Register(context.Background(), userCache.UserID, userCache) dispatcher.Register(context.Background(), sync3.DispatcherAllUsers, globalCache) - cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000, time.Millisecond) + cs := NewConnState(userID, deviceID, userCache, globalCache, &NopExtensionHandler{}, &NopJoinTracker{}, nil, nil, 1000, 0) // subscribe to room D res, err := cs.OnIncomingRequest(context.Background(), ConnID, &sync3.Request{ RoomSubscriptions: map[string]sync3.RoomSubscription{ diff --git a/tests-integration/v3_test.go b/tests-integration/v3_test.go index c2435b97..c29b2126 100644 --- a/tests-integration/v3_test.go +++ b/tests-integration/v3_test.go @@ -370,7 +370,7 @@ func runTestServer(t testutils.TestBenchInterface, v2Server *testV2Server, postg TestingSynchronousPubsub: true, // critical to avoid flakey tests 
AddPrometheusMetrics: false, MaxPendingEventUpdates: 200, - MaxTransactionIDDelay: 1 * time.Millisecond, + MaxTransactionIDDelay: 0, // disable the txnID buffering to avoid flakey tests } if len(opts) > 0 { opt := opts[0] From d010ff41efa56eb9ff289c95b7aedf5e7886c234 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Thu, 27 Jul 2023 12:15:24 +0100 Subject: [PATCH 139/156] Add missing interface method on connHandlerMock --- sync3/conn_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/sync3/conn_test.go b/sync3/conn_test.go index c326938c..8b148ea6 100644 --- a/sync3/conn_test.go +++ b/sync3/conn_test.go @@ -25,6 +25,7 @@ func (c *connHandlerMock) UserID() string { func (c *connHandlerMock) Destroy() {} func (c *connHandlerMock) Alive() bool { return true } func (c *connHandlerMock) OnUpdate(ctx context.Context, update caches.Update) {} +func (c *connHandlerMock) PublishEventsUpTo(roomID string, nid int64) {} // Test that Conn can send and receive requests based on positions func TestConn(t *testing.T) { From d0067008e1081f64ea9704c16242e6b3624dbe80 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Thu, 27 Jul 2023 13:03:14 +0100 Subject: [PATCH 140/156] Fix new integration test timing --- tests-integration/timeline_test.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests-integration/timeline_test.go b/tests-integration/timeline_test.go index aa3c8ed2..481edea9 100644 --- a/tests-integration/timeline_test.go +++ b/tests-integration/timeline_test.go @@ -3,6 +3,7 @@ package syncv3 import ( "encoding/json" "fmt" + slidingsync "github.com/matrix-org/sliding-sync" "testing" "time" @@ -694,7 +695,11 @@ func TestTimelineTxnIDAfterInitialSync(t *testing.T) { pqString := testutils.PrepareDBConnectionString() // setup code v2 := runTestV2Server(t) - v3 := runTestServer(t, v2, pqString) + v3 := runTestServer(t, v2, pqString, slidingsync.Opts{ + // This needs to be greater than the request timeout, which is hardcoded to a + // minimum of 100ms in connStateLive.liveUpdate. + MaxTransactionIDDelay: 200 * time.Millisecond, + }) defer v2.close() defer v3.close() roomID := "!a:localhost" @@ -785,10 +790,6 @@ func TestTimelineTxnIDAfterInitialSync(t *testing.T) { v2.waitUntilEmpty(t, alice) t.Log("Alice makes another incremental sync request.") - // TODO: this is a hack to make the test pass by ensureing the API has enough time for the queue timer to expire. - // Need to add early expiry in response to the txn id payload. - // Will also need to ensure this request blocks long enough for that to happen before timing out. 
- time.Sleep(2 * time.Second) aliceRes = v3.mustDoV3RequestWithPos(t, aliceToken, aliceRes.Pos, sync3.Request{}) t.Log("Alice's sync response includes the message with the txn ID.") m.MatchResponse(t, aliceRes, m.MatchList("a", m.MatchV3Count(1)), m.MatchNoV3Ops(), m.MatchRoomSubscription( From e35f6b80cbcc2d4ae2dde0893c99a034f8483112 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Thu, 27 Jul 2023 13:17:36 +0100 Subject: [PATCH 141/156] Remove visibility nonsense --- sync3/handler/connstate.go | 10 ---------- sync3/handler/txn_id_waiter.go | 22 +++++++--------------- sync3/lists.go | 28 ---------------------------- 3 files changed, 7 insertions(+), 53 deletions(-) diff --git a/sync3/handler/connstate.go b/sync3/handler/connstate.go index 2518ea74..c6a39378 100644 --- a/sync3/handler/connstate.go +++ b/sync3/handler/connstate.go @@ -87,7 +87,6 @@ func NewConnState( func(delayed bool, update caches.Update) { cs.live.onUpdate(update) }, - cs.subscribedOrVisible, ) // subscribe for updates before loading. We risk seeing dupes but that's fine as load positions // will stop us double-processing. @@ -698,15 +697,6 @@ func (s *ConnState) OnRoomUpdate(ctx context.Context, up caches.RoomUpdate) { } } -func (s *ConnState) subscribedOrVisible(roomID string) bool { - _, subscribed := s.roomSubscriptions[roomID] - if subscribed { - return true - } - - return s.lists.Visible(roomID, s.muxedReq.Lists) -} - func (s *ConnState) PublishEventsUpTo(roomID string, nid int64) { s.txnIDWaiter.PublishUpToNID(roomID, nid) } diff --git a/sync3/handler/txn_id_waiter.go b/sync3/handler/txn_id_waiter.go index 94954970..75c1285a 100644 --- a/sync3/handler/txn_id_waiter.go +++ b/sync3/handler/txn_id_waiter.go @@ -6,21 +6,19 @@ import ( ) type TxnIDWaiter struct { - userID string - publish func(delayed bool, update caches.Update) - subscribedOrVisible func(roomID string) bool + userID string + publish func(delayed bool, update caches.Update) // TODO: probably need a mutex around t.queues so the expiry won't race with enqueuing queues map[string][]*caches.RoomEventUpdate maxDelay time.Duration } -func NewTxnIDWaiter(userID string, maxDelay time.Duration, publish func(bool, caches.Update), subscribedOrVisible func(string) bool) *TxnIDWaiter { +func NewTxnIDWaiter(userID string, maxDelay time.Duration, publish func(bool, caches.Update)) *TxnIDWaiter { return &TxnIDWaiter{ - userID: userID, - publish: publish, - subscribedOrVisible: subscribedOrVisible, - queues: make(map[string][]*caches.RoomEventUpdate), - maxDelay: maxDelay, + userID: userID, + publish: publish, + queues: make(map[string][]*caches.RoomEventUpdate), + maxDelay: maxDelay, // TODO: metric that tracks how long events were queued for. } } @@ -48,12 +46,6 @@ func (t *TxnIDWaiter) Ingest(up caches.Update) { return } - // Don't bother queuing the event if the room isn't visible to the user. - if !t.subscribedOrVisible(ed.RoomID) { - t.publish(false, up) - return - } - // We've decided to queue the event. queue, exists := t.queues[ed.RoomID] if !exists { diff --git a/sync3/lists.go b/sync3/lists.go index 8eb02036..f8d3d551 100644 --- a/sync3/lists.go +++ b/sync3/lists.go @@ -222,34 +222,6 @@ func (s *InternalRequestLists) ListsByVisibleRoomIDs(muxedReqLists map[string]Re return listsByRoomIDs } -// Visible determines if a single room is currently visible in the given set of lists. 
-func (s *InternalRequestLists) Visible(roomID string, muxedReqLists map[string]RequestList) bool { - for listKey, reqList := range muxedReqLists { - sortedRooms := s.lists[listKey].SortableRooms - if sortedRooms == nil { - continue - } - - var ranges SliceRanges - if reqList.SlowGetAllRooms != nil && *reqList.SlowGetAllRooms { - ranges = SliceRanges{{0, sortedRooms.Len() - 1}} - } else { - ranges = reqList.Ranges - } - - subslices := ranges.SliceInto(sortedRooms) - for _, subslice := range subslices { - sortedSubslice := subslice.(*SortableRooms) - for _, otherRoomID := range sortedSubslice.RoomIDs() { - if roomID == otherRoomID { - return true - } - } - } - } - return false -} - // Assign a new list at the given key. If Overwrite, any existing list is replaced. If DoNotOverwrite, the existing // list is returned if one exists, else a new list is created. Returns the list and true if the list was overwritten. func (s *InternalRequestLists) AssignList(ctx context.Context, listKey string, filters *RequestFilters, sort []string, shouldOverwrite OverwriteVal) (*FilteredSortableRooms, bool) { From b6dbe918ab3a9ebaebead2d7f30b9b0a124b9d7e Mon Sep 17 00:00:00 2001 From: David Robertson Date: Thu, 27 Jul 2023 13:17:51 +0100 Subject: [PATCH 142/156] WIP test cases for TxnIDWaiter --- sync3/handler/txn_id_waiter_test.go | 195 ++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 sync3/handler/txn_id_waiter_test.go diff --git a/sync3/handler/txn_id_waiter_test.go b/sync3/handler/txn_id_waiter_test.go new file mode 100644 index 00000000..627b42ac --- /dev/null +++ b/sync3/handler/txn_id_waiter_test.go @@ -0,0 +1,195 @@ +package handler + +import ( + "github.com/matrix-org/sliding-sync/sync3/caches" + "testing" + "time" +) + +func TestTxnIDWaiterQueuingLogic(t *testing.T) { + const alice = "alice" + const bob = "bob" + const room1 = "!theroom" + const room2 = "!daszimmer" + + testCases := []struct { + Name string + Ingest []caches.Update + WaitForUpdate int + ExpectDelayed bool + }{ + { + Name: "empty queue, non-event update", + Ingest: []caches.Update{&caches.AccountDataUpdate{}}, + WaitForUpdate: 0, + ExpectDelayed: false, + }, + { + Name: "empty queue, event update, another sender", + Ingest: []caches.Update{ + &caches.RoomEventUpdate{ + EventData: &caches.EventData{ + RoomID: room1, + Sender: bob, + }, + }}, + WaitForUpdate: 0, + ExpectDelayed: false, + }, + { + Name: "empty queue, event update, has txn_id", + Ingest: []caches.Update{ + &caches.RoomEventUpdate{ + EventData: &caches.EventData{ + RoomID: room1, + Sender: alice, + TransactionID: "txntxntxn", + }, + }}, + WaitForUpdate: 0, + ExpectDelayed: false, + }, + { + Name: "empty queue, event update, no txn_id", + Ingest: []caches.Update{ + &caches.RoomEventUpdate{ + EventData: &caches.EventData{ + RoomID: room1, + Sender: alice, + TransactionID: "", + }, + }}, + WaitForUpdate: 0, + ExpectDelayed: true, + }, + { + Name: "nonempty queue, non-event update", + Ingest: []caches.Update{ + &caches.RoomEventUpdate{ + EventData: &caches.EventData{ + RoomID: room1, + Sender: alice, + TransactionID: "", + NID: 1, + }, + }, + &caches.AccountDataUpdate{}, + }, + WaitForUpdate: 1, + ExpectDelayed: false, // not a room event, no need to queued behind alice's event + }, + { + Name: "nonempty queue, event update, different sender", + Ingest: []caches.Update{ + &caches.RoomEventUpdate{ + EventData: &caches.EventData{ + RoomID: room1, + Sender: alice, + TransactionID: "", + NID: 1, + }, + }, + &caches.RoomEventUpdate{ + EventData: 
&caches.EventData{ + RoomID: room1, + Sender: bob, + NID: 2, + }, + }, + }, + WaitForUpdate: 1, + ExpectDelayed: true, // should be queued behind alice's event + }, + { + Name: "nonempty queue, event update, has txn_id", + Ingest: []caches.Update{ + &caches.RoomEventUpdate{ + EventData: &caches.EventData{ + RoomID: room1, + Sender: alice, + TransactionID: "", + NID: 1, + }, + }, + &caches.RoomEventUpdate{ + EventData: &caches.EventData{ + RoomID: room1, + Sender: alice, + NID: 2, + TransactionID: "I have a txn", + }, + }, + }, + WaitForUpdate: 1, + ExpectDelayed: true, // should still be queued behind alice's first event + }, + { + Name: "existence of queue only matters per-room", + Ingest: []caches.Update{ + &caches.RoomEventUpdate{ + EventData: &caches.EventData{ + RoomID: room1, + Sender: alice, + TransactionID: "", + NID: 1, + }, + }, + &caches.RoomEventUpdate{ + EventData: &caches.EventData{ + RoomID: room2, + Sender: alice, + NID: 2, + TransactionID: "I have a txn", + }, + }, + }, + WaitForUpdate: 1, + ExpectDelayed: false, // queue only tracks room1 + }, + } + + type publishArg struct { + delayed bool + update caches.Update + } + + for _, tc := range testCases { + t.Run(tc.Name, func(t *testing.T) { + updates := make(chan publishArg, 100) + publish := func(delayed bool, update caches.Update) { + updates <- publishArg{delayed, update} + } + + w := NewTxnIDWaiter(alice, time.Millisecond, publish) + + for _, up := range tc.Ingest { + w.Ingest(up) + } + + wantedUpdate := tc.Ingest[tc.WaitForUpdate] + var got publishArg + WaitForSelectedUpdate: + for { + select { + case got = <-updates: + t.Logf("Got update %v", got.update) + if got.update == wantedUpdate { + break WaitForSelectedUpdate + } + case <-time.After(5 * time.Millisecond): + t.Fatalf("Did not see update %v published", wantedUpdate) + } + } + + if got.delayed != tc.ExpectDelayed { + t.Errorf("Got delayed=%t want delayed=%t", got.delayed, tc.ExpectDelayed) + } + }) + } +} + +// TODO: tests which demonstrate that PublishEventsUpTo() +// - correctly pops off the start of the queue +// - is idempotent +// - only affects the given room ID +// - deletes map entry if queue is empty (so that roomQueued is set correctly) From 347eb45b42de9d5af6b7a61fd942bc1aa07f97f2 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Thu, 27 Jul 2023 13:29:58 +0100 Subject: [PATCH 143/156] Remove debug --- sync3/handler/txn_id_waiter.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/sync3/handler/txn_id_waiter.go b/sync3/handler/txn_id_waiter.go index 75c1285a..864707b6 100644 --- a/sync3/handler/txn_id_waiter.go +++ b/sync3/handler/txn_id_waiter.go @@ -53,7 +53,6 @@ func (t *TxnIDWaiter) Ingest(up caches.Update) { } // TODO: bound the queue size? t.queues[ed.RoomID] = append(queue, eventUpdate) - logger.Trace().Str("room_id", ed.RoomID).Ints64("q", nids(t.queues[ed.RoomID])).Msgf("enqueue event NID %d", ed.NID) // TODO: if t gets gced, will this function still run? If so, will things explode? 
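 	// (The timer's closure keeps t reachable, so the callback will still run;
 	// and PublishUpToNID is a no-op once a room's queue entry is gone, so a
 	// late firing is harmless.)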
time.AfterFunc(t.maxDelay, func() { t.PublishUpToNID(ed.RoomID, ed.NID) }) @@ -81,7 +80,6 @@ func (t *TxnIDWaiter) PublishUpToNID(roomID string, publishNID int64) { t.queues[roomID] = queue } - logger.Trace().Str("room_id", roomID).Ints64("q", nids(queue)).Msgf("publish event up to NID %d", publishNID) for _, eventUpdate := range toPublish { t.publish(true, eventUpdate) } From 7bb244d0e396b544151752c8d81d840e130f06a7 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Thu, 27 Jul 2023 13:30:08 +0100 Subject: [PATCH 144/156] TxnIDWaiter: Use mutex to guard queues --- sync3/handler/txn_id_waiter.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/sync3/handler/txn_id_waiter.go b/sync3/handler/txn_id_waiter.go index 864707b6..798de3fc 100644 --- a/sync3/handler/txn_id_waiter.go +++ b/sync3/handler/txn_id_waiter.go @@ -2,13 +2,15 @@ package handler import ( "github.com/matrix-org/sliding-sync/sync3/caches" + "sync" "time" ) type TxnIDWaiter struct { userID string publish func(delayed bool, update caches.Update) - // TODO: probably need a mutex around t.queues so the expiry won't race with enqueuing + // mu guards the queues map. + mu sync.Mutex queues map[string][]*caches.RoomEventUpdate maxDelay time.Duration } @@ -17,6 +19,7 @@ func NewTxnIDWaiter(userID string, maxDelay time.Duration, publish func(bool, ca return &TxnIDWaiter{ userID: userID, publish: publish, + mu: sync.Mutex{}, queues: make(map[string][]*caches.RoomEventUpdate), maxDelay: maxDelay, // TODO: metric that tracks how long events were queued for. @@ -39,6 +42,10 @@ func (t *TxnIDWaiter) Ingest(up caches.Update) { // We only want to queue this event if // - our user sent it AND it lacks a txn_id; OR // - the room already has queued events. + + t.mu.Lock() + defer t.mu.Unlock() + _, roomQueued := t.queues[ed.RoomID] missingTxnID := ed.Sender == t.userID && ed.TransactionID == "" if !(missingTxnID || roomQueued) { @@ -59,6 +66,9 @@ func (t *TxnIDWaiter) Ingest(up caches.Update) { } func (t *TxnIDWaiter) PublishUpToNID(roomID string, publishNID int64) { + t.mu.Lock() + defer t.mu.Unlock() + queue, exists := t.queues[roomID] if !exists { return From 504e02410ea95889240fd732e134834d449c9624 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Thu, 27 Jul 2023 13:35:00 +0100 Subject: [PATCH 145/156] Remove more debug --- sync3/handler/txn_id_waiter.go | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sync3/handler/txn_id_waiter.go b/sync3/handler/txn_id_waiter.go index 798de3fc..ed3a14a6 100644 --- a/sync3/handler/txn_id_waiter.go +++ b/sync3/handler/txn_id_waiter.go @@ -94,11 +94,3 @@ func (t *TxnIDWaiter) PublishUpToNID(roomID string, publishNID int64) { t.publish(true, eventUpdate) } } - -func nids(updates []*caches.RoomEventUpdate) []int64 { - rv := make([]int64, len(updates)) - for i, up := range updates { - rv[i] = up.EventData.NID - } - return rv -} From 1cdb4dcdddd44f1fe3c432d8ce09ce6e21ffb57f Mon Sep 17 00:00:00 2001 From: David Robertson Date: Thu, 27 Jul 2023 15:05:34 +0100 Subject: [PATCH 146/156] TestyMcTestFace --- sync3/handler/txn_id_waiter_test.go | 159 ++++++++++++++++++++++++++-- 1 file changed, 151 insertions(+), 8 deletions(-) diff --git a/sync3/handler/txn_id_waiter_test.go b/sync3/handler/txn_id_waiter_test.go index 627b42ac..db1f94c5 100644 --- a/sync3/handler/txn_id_waiter_test.go +++ b/sync3/handler/txn_id_waiter_test.go @@ -6,7 +6,15 @@ import ( "time" ) -func TestTxnIDWaiterQueuingLogic(t *testing.T) { +type publishArg struct { + delayed bool + update caches.Update +} 
+
+// Test that
+//   - events are (reported as being) delayed when we expect them to be
+//   - delayed events are automatically published after the maximum delay period
+func TestTxnIDWaiter_QueuingLogic(t *testing.T) {
 	const alice = "alice"
 	const bob = "bob"
 	const room1 = "!theroom"
 	const room2 = "!daszimmer"
@@ -148,11 +156,6 @@ func TestTxnIDWaiterQueuingLogic(t *testing.T) {
 		},
 	}
 
-	type publishArg struct {
-		delayed bool
-		update  caches.Update
-	}
-
 	for _, tc := range testCases {
 		t.Run(tc.Name, func(t *testing.T) {
 			updates := make(chan publishArg, 100)
@@ -188,8 +191,148 @@ func TestTxnIDWaiterQueuingLogic(t *testing.T) {
 	}
 }
 
-// TODO: tests which demonstrate that PublishEventsUpTo()
+// Test that PublishUpToNID
 //   - correctly pops off the start of the queue
 //   - is idempotent
-//   - only affects the given room ID
 //   - deletes map entry if queue is empty (so that roomQueued is set correctly)
+func TestTxnIDWaiter_PublishUpToNID(t *testing.T) {
+	const alice = "@alice:example.com"
+	const room = "!unimportant"
+	var published []publishArg
+	publish := func(delayed bool, update caches.Update) {
+		published = append(published, publishArg{delayed, update})
+	}
+	// Use an hour's expiry to effectively disable expiry.
+	w := NewTxnIDWaiter(alice, time.Hour, publish)
+	// Ingest 5 events, each of which would be queued by itself.
+	for i := int64(2); i <= 6; i++ {
+		w.Ingest(&caches.RoomEventUpdate{
+			EventData: &caches.EventData{
+				RoomID:        room,
+				Sender:        alice,
+				TransactionID: "",
+				NID:           i,
+			},
+		})
+	}
+
+	t.Log("Queue has nids [2,3,4,5,6]")
+	t.Log("Publishing up to 1 should do nothing")
+	w.PublishUpToNID(room, 1)
+	assertNIDs(t, published, nil)
+
+	t.Log("Publishing up to 3 should yield nids [2, 3] in that order")
+	w.PublishUpToNID(room, 3)
+	assertNIDs(t, published, []int64{2, 3})
+	assertDelayed(t, published[:2])
+
+	t.Log("Publishing up to 3 a second time should do nothing")
+	w.PublishUpToNID(room, 3)
+	assertNIDs(t, published, []int64{2, 3})
+
+	t.Log("Publishing up to 2 at this point should do nothing.")
+	w.PublishUpToNID(room, 2)
+	assertNIDs(t, published, []int64{2, 3})
+
+	t.Log("Publishing up to 6 should yield nids [4, 5, 6] in that order")
+	w.PublishUpToNID(room, 6)
+	assertNIDs(t, published, []int64{2, 3, 4, 5, 6})
+	assertDelayed(t, published[2:5])
+
+	t.Log("Publishing up to 6 a second time should do nothing")
+	w.PublishUpToNID(room, 6)
+	assertNIDs(t, published, []int64{2, 3, 4, 5, 6})
+
+	t.Log("Ingesting another event that doesn't need to be queued should be published immediately")
+	w.Ingest(&caches.RoomEventUpdate{
+		EventData: &caches.EventData{
+			RoomID:        room,
+			Sender:        "@notalice:example.com",
+			TransactionID: "",
+			NID:           7,
+		},
+	})
+	assertNIDs(t, published, []int64{2, 3, 4, 5, 6, 7})
+	if published[len(published)-1].delayed {
+		t.Errorf("Final event was delayed, but should have been published immediately")
+	}
+}
+
+// Test that PublishUpToNID only publishes in the given room
+func TestTxnIDWaiter_PublishUpToNID_MultipleRooms(t *testing.T) {
+	const alice = "@alice:example.com"
+	var published []publishArg
+	publish := func(delayed bool, update caches.Update) {
+		published = append(published, publishArg{delayed, update})
+	}
+	// Use an hour's expiry to effectively disable expiry.
+	w := NewTxnIDWaiter(alice, time.Hour, publish)
+	// Ingest four queueable events across two rooms.
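+	// (They interleave: room1 gets NIDs 1 and 4, room2 gets NIDs 2 and 3, so we
+	// can check that publishing in one room leaves the other room's queue alone.)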
+	w.Ingest(&caches.RoomEventUpdate{
+		EventData: &caches.EventData{
+			RoomID:        "!room1",
+			Sender:        alice,
+			TransactionID: "",
+			NID:           1,
+		},
+	})
+	w.Ingest(&caches.RoomEventUpdate{
+		EventData: &caches.EventData{
+			RoomID:        "!room2",
+			Sender:        alice,
+			TransactionID: "",
+			NID:           2,
+		},
+	})
+	w.Ingest(&caches.RoomEventUpdate{
+		EventData: &caches.EventData{
+			RoomID:        "!room2",
+			Sender:        alice,
+			TransactionID: "",
+			NID:           3,
+		},
+	})
+	w.Ingest(&caches.RoomEventUpdate{
+		EventData: &caches.EventData{
+			RoomID:        "!room1",
+			Sender:        alice,
+			TransactionID: "",
+			NID:           4,
+		},
+	})
+
+	t.Log("Queues are [1, 4] and [2, 3]")
+	t.Log("Publish up to NID 4 in room 1 should yield nids [1, 4]")
+	w.PublishUpToNID("!room1", 4)
+	assertNIDs(t, published, []int64{1, 4})
+	assertDelayed(t, published)
+
+	t.Log("Queues are [1, 4] and [2, 3]")
+	t.Log("Publish up to NID 3 in room 2 should yield nids [2, 3]")
+	w.PublishUpToNID("!room2", 3)
+	assertNIDs(t, published, []int64{1, 4, 2, 3})
+	assertDelayed(t, published)
+}
+
+func assertDelayed(t *testing.T, published []publishArg) {
+	for _, p := range published {
+		if !p.delayed {
+			t.Errorf("published arg with NID %d was not delayed, but we expected it to be", p.update.(*caches.RoomEventUpdate).EventData.NID)
+		}
+	}
+}
+
+func assertNIDs(t *testing.T, published []publishArg, expectedNIDs []int64) {
+	if len(published) != len(expectedNIDs) {
+		t.Errorf("Got %d nids, but expected %d", len(published), len(expectedNIDs))
+	}
+	for i := range published {
+		rup, ok := published[i].update.(*caches.RoomEventUpdate)
+		if !ok {
+			t.Errorf("Update %d (%v) was not a RoomEventUpdate", i, published[i].update)
+		}
+		if rup.EventData.NID != expectedNIDs[i] {
+			t.Errorf("Update %d (%v) got nid %d, expected %d", i, *rup, rup.EventData.NID, expectedNIDs[i])
		}
	}
}

From 70243a4f7e0a911b8e711c4839924f77c361a91d Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Thu, 27 Jul 2023 18:12:45 +0100
Subject: [PATCH 147/156] Don't delay state events

---
 sync3/handler/txn_id_waiter.go      |  9 +++---
 sync3/handler/txn_id_waiter_test.go | 50 +++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/sync3/handler/txn_id_waiter.go b/sync3/handler/txn_id_waiter.go
index ed3a14a6..6ba663e1 100644
--- a/sync3/handler/txn_id_waiter.go
+++ b/sync3/handler/txn_id_waiter.go
@@ -39,15 +39,14 @@ func (t *TxnIDWaiter) Ingest(up caches.Update) {
 	}
 
 	ed := eventUpdate.EventData
-	// We only want to queue this event if
-	//  - our user sent it AND it lacks a txn_id; OR
-	//  - the room already has queued events.
+	// An event should be queued if
+	//  - it's a non-state event that our user sent, lacking a txn_id; OR
+	//  - the room already has queued events.
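+	//    (For example: a message our user just sent with no txn_id starts a queue;
+	//    a later state event, such as their own m.room.member join, is then held
+	//    behind it because roomQueued is set; see the join-event test cases below.)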
t.mu.Lock() defer t.mu.Unlock() - _, roomQueued := t.queues[ed.RoomID] - missingTxnID := ed.Sender == t.userID && ed.TransactionID == "" + missingTxnID := ed.StateKey == nil && ed.Sender == t.userID && ed.TransactionID == "" if !(missingTxnID || roomQueued) { t.publish(false, up) return diff --git a/sync3/handler/txn_id_waiter_test.go b/sync3/handler/txn_id_waiter_test.go index db1f94c5..27b4890c 100644 --- a/sync3/handler/txn_id_waiter_test.go +++ b/sync3/handler/txn_id_waiter_test.go @@ -2,6 +2,7 @@ package handler import ( "github.com/matrix-org/sliding-sync/sync3/caches" + "github.com/tidwall/gjson" "testing" "time" ) @@ -86,6 +87,51 @@ func TestTxnIDWaiter_QueuingLogic(t *testing.T) { WaitForUpdate: 1, ExpectDelayed: false, // not a room event, no need to queued behind alice's event }, + { + Name: "empty queue, join event for sender", + Ingest: []caches.Update{ + &caches.RoomEventUpdate{ + EventData: &caches.EventData{ + RoomID: room1, + Sender: alice, + TransactionID: "", + NID: 1, + EventType: "m.room.member", + StateKey: ptr(alice), + Content: gjson.Parse(`{"membership": "join"}`), + }, + }, + }, + WaitForUpdate: 0, + ExpectDelayed: false, + }, + { + Name: "nonempty queue, join event for sender", + Ingest: []caches.Update{ + &caches.RoomEventUpdate{ + EventData: &caches.EventData{ + RoomID: room1, + Sender: alice, + TransactionID: "", + NID: 1, + }, + }, + &caches.RoomEventUpdate{ + EventData: &caches.EventData{ + RoomID: room1, + Sender: alice, + TransactionID: "", + NID: 2, + EventType: "m.room.member", + StateKey: ptr(alice), + Content: gjson.Parse(`{"membership": "join"}`), + }, + }, + }, + WaitForUpdate: 1, + ExpectDelayed: true, + }, + { Name: "nonempty queue, event update, different sender", Ingest: []caches.Update{ @@ -336,3 +382,7 @@ func assertNIDs(t *testing.T, published []publishArg, expectedNIDs []int64) { } } } + +func ptr(s string) *string { + return &s +} From e64de5c77162085c180f2264a71cdbcbc1bed55b Mon Sep 17 00:00:00 2001 From: David Robertson Date: Thu, 27 Jul 2023 18:16:13 +0100 Subject: [PATCH 148/156] Make it easier to debug TestInviteAcceptance This was the best way for me to make sense of the test log. --- tests-e2e/membership_transitions_test.go | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tests-e2e/membership_transitions_test.go b/tests-e2e/membership_transitions_test.go index 2271d78e..bb721f3c 100644 --- a/tests-e2e/membership_transitions_test.go +++ b/tests-e2e/membership_transitions_test.go @@ -233,17 +233,18 @@ func TestInviteRejection(t *testing.T) { } func TestInviteAcceptance(t *testing.T) { - alice := registerNewUser(t) - bob := registerNewUser(t) + alice := registerNamedUser(t, "alice") + bob := registerNamedUser(t, "bob") // ensure that invite state correctly propagates. One room will already be in 'invite' state // prior to the first proxy sync, whereas the 2nd will transition. + t.Logf("Alice creates two rooms and invites Bob to the first.") firstInviteRoomID := alice.CreateRoom(t, map[string]interface{}{"preset": "private_chat", "name": "First"}) alice.InviteRoom(t, firstInviteRoomID, bob.UserID) secondInviteRoomID := alice.CreateRoom(t, map[string]interface{}{"preset": "private_chat", "name": "Second"}) t.Logf("first %s second %s", firstInviteRoomID, secondInviteRoomID) - // sync as bob, we should see 1 invite + t.Log("Sync as Bob, requesting invites only. 
He should see 1 invite")
 	res := bob.SlidingSync(t, sync3.Request{
 		Lists: map[string]sync3.RequestList{
 			"a": {
@@ -273,10 +274,12 @@ func TestInviteAcceptance(t *testing.T) {
 		},
 	}))
 
-	// now invite bob
+	t.Log("Alice invites Bob to room 2.")
 	alice.InviteRoom(t, secondInviteRoomID, bob.UserID)
+	t.Log("Alice syncs until she sees Bob's invite.")
 	alice.SlidingSyncUntilMembership(t, "", secondInviteRoomID, bob, "invite")
 
+	t.Log("Bob syncs. He should see the invite to room 2 as well.")
 	res = bob.SlidingSync(t, sync3.Request{
 		Lists: map[string]sync3.RequestList{
 			"a": {
@@ -304,13 +307,16 @@ func TestInviteAcceptance(t *testing.T) {
 		},
 	}))
 
-	// now accept the invites
+	t.Log("Bob accepts the invites.")
 	bob.JoinRoom(t, firstInviteRoomID, nil)
 	bob.JoinRoom(t, secondInviteRoomID, nil)
+
+	t.Log("Alice syncs until she sees Bob join room 1.")
 	alice.SlidingSyncUntilMembership(t, "", firstInviteRoomID, bob, "join")
+	t.Log("Alice syncs until she sees Bob join room 2.")
 	alice.SlidingSyncUntilMembership(t, "", secondInviteRoomID, bob, "join")
 
-	// the list should be purged
+	t.Log("Bob does an incremental sync")
 	res = bob.SlidingSync(t, sync3.Request{
 		Lists: map[string]sync3.RequestList{
 			"a": {
@@ -318,12 +324,13 @@ func TestInviteAcceptance(t *testing.T) {
 			},
 		},
 	}, WithPos(res.Pos))
+	t.Log("Both of his invites should be purged.")
 	m.MatchResponse(t, res, m.MatchList("a", m.MatchV3Count(0), m.MatchV3Ops(
 		m.MatchV3DeleteOp(1),
 		m.MatchV3DeleteOp(0),
 	)))
 
-	// fresh sync -> no invites
+	t.Log("Bob makes a fresh sliding sync request.")
 	res = bob.SlidingSync(t, sync3.Request{
 		Lists: map[string]sync3.RequestList{
 			"a": {
@@ -334,6 +341,7 @@ func TestInviteAcceptance(t *testing.T) {
 		},
 	})
+	t.Log("He should see no invites.")
 	m.MatchResponse(t, res, m.MatchNoV3Ops(), m.MatchRoomSubscriptionsStrict(nil),
 		m.MatchList("a", m.MatchV3Count(0)))
 }

From 58788287d4bd65eacb26e20f4d5f38f3334394ea Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Fri, 28 Jul 2023 18:37:52 +0100
Subject: [PATCH 149/156] Add t.Helper calls

Co-authored-by: kegsay

---
 sync3/handler/txn_id_waiter_test.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sync3/handler/txn_id_waiter_test.go b/sync3/handler/txn_id_waiter_test.go
index 27b4890c..6f4e53c1 100644
--- a/sync3/handler/txn_id_waiter_test.go
+++ b/sync3/handler/txn_id_waiter_test.go
@@ -361,6 +361,7 @@ func TestTxnIDWaiter_PublishUpToNID_MultipleRooms(t *testing.T) {
 }
 
 func assertDelayed(t *testing.T, published []publishArg) {
+	t.Helper()
 	for _, p := range published {
 		if !p.delayed {
 			t.Errorf("published arg with NID %d was not delayed, but we expected it to be", p.update.(*caches.RoomEventUpdate).EventData.NID)
@@ -369,6 +370,7 @@ func assertDelayed(t *testing.T, published []publishArg) {
 }
 
 func assertNIDs(t *testing.T, published []publishArg, expectedNIDs []int64) {
+	t.Helper()
 	if len(published) != len(expectedNIDs) {
 		t.Errorf("Got %d nids, but expected %d", len(published), len(expectedNIDs))
 	}

From 9248d4e512728c7a6dbc4bca6b1451c4f220894c Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Fri, 28 Jul 2023 18:38:05 +0100
Subject: [PATCH 150/156] Review comments

---
 sync3/handler/txn_id_waiter.go | 1 -
 v3.go                          | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/sync3/handler/txn_id_waiter.go b/sync3/handler/txn_id_waiter.go
index 6ba663e1..36986344 100644
--- a/sync3/handler/txn_id_waiter.go
+++ b/sync3/handler/txn_id_waiter.go
@@ -60,7 +60,6 @@ func (t *TxnIDWaiter) Ingest(up caches.Update) {
 	// TODO: bound the queue size?
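 	// (One option, not implemented in this series: cap each room's queue and, on
 	// overflow, flush the oldest updates through publish rather than dropping
 	// them, so no update is ever lost.)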
t.queues[ed.RoomID] = append(queue, eventUpdate) - // TODO: if t gets gced, will this function still run? If so, will things explode? time.AfterFunc(t.maxDelay, func() { t.PublishUpToNID(ed.RoomID, ed.NID) }) } diff --git a/v3.go b/v3.go index 0d6debe4..72cf37c8 100644 --- a/v3.go +++ b/v3.go @@ -39,6 +39,7 @@ type Opts struct { TestingSynchronousPubsub bool // MaxTransactionIDDelay is the longest amount of time that we will wait for // confirmation of an event's transaction_id before sending it to its sender. + // Set to 0 to disable this delay mechanism entirely. MaxTransactionIDDelay time.Duration DBMaxConns int From 7c5442d7e852568a57efeae20f724dc194069de3 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Fri, 28 Jul 2023 18:55:18 +0100 Subject: [PATCH 151/156] Integration test review comments --- tests-integration/timeline_test.go | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/tests-integration/timeline_test.go b/tests-integration/timeline_test.go index 481edea9..62ba9230 100644 --- a/tests-integration/timeline_test.go +++ b/tests-integration/timeline_test.go @@ -690,14 +690,25 @@ func TestTimelineTxnID(t *testing.T) { )) } -// Like TestTimelineTxnID, but where ... -func TestTimelineTxnIDAfterInitialSync(t *testing.T) { +// TestTimelineTxnID checks that Alice sees her transaction_id if +// - Bob's poller sees Alice's event, +// - Alice's poller sees Alice's event with txn_id, and +// - Alice syncs. +// +// This test is similar but not identical. It checks that Alice sees her transaction_id if +// - Bob's poller sees Alice's event, +// - Alice does an incremental sync, which should omit her event, +// - Alice's poller sees Alice's event with txn_id, and +// - Alice syncs, seeing her event with txn_id. +func TestTimelineTxnIDBuffersForTxnID(t *testing.T) { pqString := testutils.PrepareDBConnectionString() // setup code v2 := runTestV2Server(t) v3 := runTestServer(t, v2, pqString, slidingsync.Opts{ // This needs to be greater than the request timeout, which is hardcoded to a - // minimum of 100ms in connStateLive.liveUpdate. + // minimum of 100ms in connStateLive.liveUpdate. This ensures that the + // liveUpdate call finishes before the TxnIDWaiter publishes the update, + // meaning that Alice doesn't see her event before the txn ID is known. 
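+		// (Concretely: the request blocks for up to ~100ms and returns without the
+		// event; the waiter only releases the event at ~200ms, so it can only show
+		// up in a later sync, by which point the txn ID should be known.)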
MaxTransactionIDDelay: 200 * time.Millisecond, }) defer v2.close() @@ -772,6 +783,13 @@ func TestTimelineTxnIDAfterInitialSync(t *testing.T) { t.Log("Bob's poller sees the message.") v2.waitUntilEmpty(t, bob) + t.Log("Bob makes an incremental sliding sync") + bobRes = v3.mustDoV3RequestWithPos(t, bobToken, bobRes.Pos, sync3.Request{}) + t.Log("Bob should see the message without a transaction_id") + m.MatchResponse(t, bobRes, m.MatchList("a", m.MatchV3Count(1)), m.MatchNoV3Ops(), m.MatchRoomSubscription( + roomID, m.MatchRoomTimelineMostRecent(1, []json.RawMessage{newEventNoUnsigned}), + )) + t.Log("Alice requests an incremental sliding sync with no request changes.") aliceRes = v3.mustDoV3RequestWithPos(t, aliceToken, aliceRes.Pos, sync3.Request{}) t.Log("Alice should see no messages.") @@ -796,12 +814,6 @@ func TestTimelineTxnIDAfterInitialSync(t *testing.T) { roomID, m.MatchRoomTimelineMostRecent(1, []json.RawMessage{newEvent}), )) - t.Log("Bob makes an incremental sliding sync") - bobRes = v3.mustDoV3RequestWithPos(t, bobToken, bobRes.Pos, sync3.Request{}) - t.Log("Bob should see the message without a transaction_id") - m.MatchResponse(t, bobRes, m.MatchList("a", m.MatchV3Count(1)), m.MatchNoV3Ops(), m.MatchRoomSubscription( - roomID, m.MatchRoomTimelineMostRecent(1, []json.RawMessage{newEventNoUnsigned}), - )) } // Executes a sync v3 request without a ?pos and asserts that the count, rooms and timeline events m.Match the inputs given. From 8b6f48a24b79b3afaab8cc312d1dd914ead0807f Mon Sep 17 00:00:00 2001 From: Kegan Dougal Date: Mon, 31 Jul 2023 10:24:36 +0100 Subject: [PATCH 152/156] Add more logging to response lines --- internal/context.go | 21 ++++++++++++++++++++- sync3/handler/handler.go | 2 +- sync3/response.go | 11 ++++------- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/internal/context.go b/internal/context.go index bfb87b6c..f3a48b49 100644 --- a/internal/context.go +++ b/internal/context.go @@ -20,6 +20,7 @@ type data struct { userID string deviceID string bufferSummary string + connID string since int64 next int64 numRooms int @@ -28,6 +29,9 @@ type data struct { numGlobalAccountData int numChangedDevices int numLeftDevices int + numLists int + roomSubs int + roomUnsubs int } // prepare a request context so it can contain syncv3 info @@ -67,7 +71,7 @@ func SetConnBufferInfo(ctx context.Context, bufferLen, nextLen, bufferCap int) { func SetRequestContextResponseInfo( ctx context.Context, since, next int64, numRooms int, txnID string, numToDeviceEvents, numGlobalAccountData int, - numChangedDevices, numLeftDevices int, + numChangedDevices, numLeftDevices int, connID string, numLists int, roomSubs, roomUnsubs int, ) { d := ctx.Value(ctxData) if d == nil { @@ -82,6 +86,10 @@ func SetRequestContextResponseInfo( da.numGlobalAccountData = numGlobalAccountData da.numChangedDevices = numChangedDevices da.numLeftDevices = numLeftDevices + da.connID = connID + da.numLists = numLists + da.roomSubs = roomSubs + da.roomUnsubs = roomUnsubs } func DecorateLogger(ctx context.Context, l *zerolog.Event) *zerolog.Event { @@ -123,5 +131,16 @@ func DecorateLogger(ctx context.Context, l *zerolog.Event) *zerolog.Event { if da.bufferSummary != "" { l = l.Str("b", da.bufferSummary) } + if da.roomSubs > 0 { + l = l.Int("sub", da.roomSubs) + } + if da.roomUnsubs > 0 { + l = l.Int("usub", da.roomUnsubs) + } + if da.numLists > 0 { + l = l.Int("l", da.numLists) + } + // always log the connection ID so we know when it isn't set + l = l.Str("c", da.connID) return l } diff --git 
a/sync3/handler/handler.go b/sync3/handler/handler.go index c02c5a29..dce1f2a2 100644 --- a/sync3/handler/handler.go +++ b/sync3/handler/handler.go @@ -288,7 +288,7 @@ func (h *SyncLiveHandler) serve(w http.ResponseWriter, req *http.Request) error } internal.SetRequestContextResponseInfo( req.Context(), cpos, resp.PosInt(), len(resp.Rooms), requestBody.TxnID, numToDeviceEvents, numGlobalAccountData, - numChangedDevices, numLeftDevices, + numChangedDevices, numLeftDevices, requestBody.ConnID, len(requestBody.Lists), len(requestBody.RoomSubscriptions), len(requestBody.UnsubscribeRooms), ) w.Header().Set("Content-Type", "application/json") diff --git a/sync3/response.go b/sync3/response.go index fa0df500..8cf8a1d1 100644 --- a/sync3/response.go +++ b/sync3/response.go @@ -21,9 +21,8 @@ type Response struct { Rooms map[string]Room `json:"rooms"` Extensions extensions.Response `json:"extensions"` - Pos string `json:"pos"` - TxnID string `json:"txn_id,omitempty"` - Session string `json:"session_id,omitempty"` + Pos string `json:"pos"` + TxnID string `json:"txn_id,omitempty"` } type ResponseList struct { @@ -68,9 +67,8 @@ func (r *Response) UnmarshalJSON(b []byte) error { } `json:"lists"` Extensions extensions.Response `json:"extensions"` - Pos string `json:"pos"` - TxnID string `json:"txn_id,omitempty"` - Session string `json:"session_id,omitempty"` + Pos string `json:"pos"` + TxnID string `json:"txn_id,omitempty"` }{} if err := json.Unmarshal(b, &temporary); err != nil { return err @@ -78,7 +76,6 @@ func (r *Response) UnmarshalJSON(b []byte) error { r.Rooms = temporary.Rooms r.Pos = temporary.Pos r.TxnID = temporary.TxnID - r.Session = temporary.Session r.Extensions = temporary.Extensions r.Lists = make(map[string]ResponseList, len(temporary.Lists)) From baa35961a96e6342211617d729a3a124e5946601 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Mon, 31 Jul 2023 13:07:54 +0100 Subject: [PATCH 153/156] E2E test --- tests-e2e/transaction_id_test.go | 106 +++++++++++++++++++++++-------- 1 file changed, 80 insertions(+), 26 deletions(-) diff --git a/tests-e2e/transaction_id_test.go b/tests-e2e/transaction_id_test.go index c49cb5a1..c3a53ee4 100644 --- a/tests-e2e/transaction_id_test.go +++ b/tests-e2e/transaction_id_test.go @@ -32,33 +32,10 @@ func TestTransactionIDsAppear(t *testing.T) { // we cannot use MatchTimeline here because the Unsigned section contains 'age' which is not // deterministic and MatchTimeline does not do partial matches. 
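 	// ('age' is the number of milliseconds since the event was sent, recomputed by
 	// the server for each response, so it can never be matched byte-for-byte.)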
-	matchTransactionID := func(eventID, txnID string) m.RoomMatcher {
-		return func(r sync3.Room) error {
-			for _, ev := range r.Timeline {
-				var got Event
-				if err := json.Unmarshal(ev, &got); err != nil {
-					return fmt.Errorf("failed to unmarshal event: %s", err)
-				}
-				if got.ID != eventID {
-					continue
-				}
-				tx, ok := got.Unsigned["transaction_id"]
-				if !ok {
-					return fmt.Errorf("unsigned block for %s has no transaction_id", eventID)
-				}
-				gotTxnID := tx.(string)
-				if gotTxnID != txnID {
-					return fmt.Errorf("wrong transaction_id, got %s want %s", gotTxnID, txnID)
-				}
-				t.Logf("%s has txn ID %s", eventID, gotTxnID)
-				return nil
-			}
-			return fmt.Errorf("not found event %s", eventID)
-		}
-	}
+
 	m.MatchResponse(t, res, m.MatchRoomSubscriptionsStrict(map[string][]m.RoomMatcher{
 		roomID: {
-			matchTransactionID(eventID, "foobar"),
+			matchTransactionID(t, eventID, "foobar"),
 		},
 	}))
 
@@ -74,8 +51,85 @@ func TestTransactionIDsAppear(t *testing.T) {
 	res = client.SlidingSyncUntilEvent(t, res.Pos, sync3.Request{}, roomID, Event{ID: eventID})
 	m.MatchResponse(t, res, m.MatchRoomSubscriptionsStrict(map[string][]m.RoomMatcher{
 		roomID: {
-			matchTransactionID(eventID, "foobar2"),
+			matchTransactionID(t, eventID, "foobar2"),
 		},
 	}))
 }
+
+// This test has 1 poller expecting a txn ID and 10 others that won't see one.
+// We test that the sending device sees a txn ID. Without the TxnIDWaiter logic
+// in place, this test is likely (but not guaranteed) to fail.
+func TestTransactionIDsAppearWithMultiplePollers(t *testing.T) {
+	alice := registerNamedUser(t, "alice")
+
+	t.Log("Alice creates a room and syncs until she sees it.")
+	roomID := alice.CreateRoom(t, map[string]interface{}{})
+	res := alice.SlidingSync(t, sync3.Request{
+		Lists: map[string]sync3.RequestList{
+			"a": {
+				RoomSubscription: sync3.RoomSubscription{
+					TimelineLimit: 10,
+				},
+				Ranges: sync3.SliceRanges{{0, 20}},
+			},
+		},
+	})
+	m.MatchResponse(t, res, m.MatchRoomSubscription(roomID))
+
+	t.Log("Alice makes other devices and starts them syncing.")
+	for i := 0; i < 10; i++ {
+		device := *alice
+		device.Login(t, "password", fmt.Sprintf("device_%d", i))
+		device.SlidingSync(t, sync3.Request{
+			Lists: map[string]sync3.RequestList{
+				"a": {
+					RoomSubscription: sync3.RoomSubscription{
+						TimelineLimit: 10,
+					},
+					Ranges: sync3.SliceRanges{{0, 20}},
+				},
+			},
+		})
+	}
+
+	t.Log("Alice sends a message with a transaction ID.")
+	const txnID = "foobar"
+	sendRes := alice.MustDoFunc(t, "PUT", []string{"_matrix", "client", "v3", "rooms", roomID, "send", "m.room.message", txnID},
+		WithJSONBody(t, map[string]interface{}{
+			"msgtype": "m.text",
+			"body":    "Hello, world!",
+		}))
+	body := ParseJSON(t, sendRes)
+	eventID := GetJSONFieldStr(t, body, "event_id")
+
+	t.Log("Alice syncs on her main device until she sees her message.")
+	res = alice.SlidingSyncUntilEventID(t, res.Pos, roomID, eventID)
+
+	m.MatchResponse(t, res, m.MatchRoomSubscription(roomID, matchTransactionID(t, eventID, txnID)))
+}
+
+func matchTransactionID(t *testing.T, eventID, txnID string) m.RoomMatcher {
+	return func(r sync3.Room) error {
+		for _, ev := range r.Timeline {
+			var got Event
+			if err := json.Unmarshal(ev, &got); err != nil {
+				return fmt.Errorf("failed to unmarshal event: %s", err)
+			}
+			if got.ID != eventID {
+				continue
+			}
+			tx, ok := got.Unsigned["transaction_id"]
+			if !ok {
+				return fmt.Errorf("unsigned block for %s has no transaction_id", eventID)
+			}
+			gotTxnID := tx.(string)
+			if gotTxnID != txnID {
+				return fmt.Errorf("wrong transaction_id, got %s want %s", gotTxnID, txnID)
+
} + t.Logf("%s has txn ID %s", eventID, gotTxnID) + return nil + } + return fmt.Errorf("not found event %s", eventID) + } +} From 28b11f86a89b3076d2cd65f3b692a04f8d1aabb8 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Mon, 31 Jul 2023 13:29:34 +0100 Subject: [PATCH 154/156] Emit txn id payloads when there are no new events --- sync2/handler2/handler.go | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/sync2/handler2/handler.go b/sync2/handler2/handler.go index 602c1694..6a9b866a 100644 --- a/sync2/handler2/handler.go +++ b/sync2/handler2/handler.go @@ -273,17 +273,15 @@ func (h *Handler) Accumulate(ctx context.Context, userID, deviceID, roomID, prev internal.GetSentryHubFromContextOrDefault(ctx).CaptureException(err) return } - if numNew == 0 { - // no new events - return - } // We've updated the database. Now tell any pubsub listeners what we learned. - h.v2Pub.Notify(pubsub.ChanV2, &pubsub.V2Accumulate{ - RoomID: roomID, - PrevBatch: prevBatch, - EventNIDs: latestNIDs, - }) + if numNew != 0 { + h.v2Pub.Notify(pubsub.ChanV2, &pubsub.V2Accumulate{ + RoomID: roomID, + PrevBatch: prevBatch, + EventNIDs: latestNIDs, + }) + } if len(eventIDToTxnID) > 0 || len(eventIDsLackingTxns) > 0 { // The call to h.Store.Accumulate above only tells us about new events' NIDS; From 139068d3056c4823b0704b8dc2e2e79c75353c06 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Tue, 1 Aug 2023 11:27:31 +0100 Subject: [PATCH 155/156] Move PendingTxnIDs to the handler2.Handler More sense conceptually---pollermap doesn't track any cross-poller state---and removes some indirection. Followup to #146. No functional changes. --- sync2/handler2/handler.go | 8 +++++--- sync2/handler2/handler_test.go | 7 ++----- sync2/poller.go | 17 +++-------------- 3 files changed, 10 insertions(+), 22 deletions(-) diff --git a/sync2/handler2/handler.go b/sync2/handler2/handler.go index 6a9b866a..760230bf 100644 --- a/sync2/handler2/handler.go +++ b/sync2/handler2/handler.go @@ -41,7 +41,8 @@ type Handler struct { Notif int } // room_id => fnv_hash([typing user ids]) - typingMap map[string]uint64 + typingMap map[string]uint64 + PendingTxnIDs *sync2.PendingTransactionIDs deviceDataTicker *sync2.DeviceDataTicker e2eeWorkerPool *internal.WorkerPool @@ -64,6 +65,7 @@ func NewHandler( Notif int }), typingMap: make(map[string]uint64), + PendingTxnIDs: sync2.NewPendingTransactionIDs(pMap.DeviceIDs), deviceDataTicker: sync2.NewDeviceDataTicker(deviceDataUpdateDuration), e2eeWorkerPool: internal.NewWorkerPool(500), // TODO: assign as fraction of db max conns, not hardcoded } @@ -308,7 +310,7 @@ func (h *Handler) Accumulate(ctx context.Context, userID, deviceID, roomID, prev for eventID, nid := range nidsByIDs { txnID, ok := eventIDToTxnID[eventID] if ok { - h.pMap.SeenTxnID(eventID) + h.PendingTxnIDs.SeenTxnID(eventID) h.v2Pub.Notify(pubsub.ChanV2, &pubsub.V2TransactionID{ EventID: eventID, RoomID: roomID, @@ -318,7 +320,7 @@ func (h *Handler) Accumulate(ctx context.Context, userID, deviceID, roomID, prev NID: nid, }) } else { - allClear, _ := h.pMap.MissingTxnID(eventID, userID, deviceID) + allClear, _ := h.PendingTxnIDs.MissingTxnID(eventID, userID, deviceID) if allClear { h.v2Pub.Notify(pubsub.ChanV2, &pubsub.V2TransactionID{ EventID: eventID, diff --git a/sync2/handler2/handler_test.go b/sync2/handler2/handler_test.go index 3b69f07b..b123292b 100644 --- a/sync2/handler2/handler_test.go +++ b/sync2/handler2/handler_test.go @@ -42,11 +42,7 @@ func (p *mockPollerMap) NumPollers() int { } func (p 
From 139068d3056c4823b0704b8dc2e2e79c75353c06 Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Tue, 1 Aug 2023 11:27:31 +0100
Subject: [PATCH 155/156] Move PendingTxnIDs to the handler2.Handler

This makes more sense conceptually (the PollerMap no longer tracks any
cross-poller state) and removes some indirection. Followup to #146.

No functional changes.
---
 sync2/handler2/handler.go      |  8 +++++---
 sync2/handler2/handler_test.go |  7 ++-----
 sync2/poller.go                | 17 +++--------------
 3 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/sync2/handler2/handler.go b/sync2/handler2/handler.go
index 6a9b866a..760230bf 100644
--- a/sync2/handler2/handler.go
+++ b/sync2/handler2/handler.go
@@ -41,7 +41,8 @@ type Handler struct {
 		Notif     int
 	}
 	// room_id => fnv_hash([typing user ids])
-	typingMap map[string]uint64
+	typingMap     map[string]uint64
+	PendingTxnIDs *sync2.PendingTransactionIDs
 
 	deviceDataTicker *sync2.DeviceDataTicker
 	e2eeWorkerPool   *internal.WorkerPool
@@ -64,6 +65,7 @@ func NewHandler(
 			Notif     int
 		}),
 		typingMap:        make(map[string]uint64),
+		PendingTxnIDs:    sync2.NewPendingTransactionIDs(pMap.DeviceIDs),
 		deviceDataTicker: sync2.NewDeviceDataTicker(deviceDataUpdateDuration),
 		e2eeWorkerPool:   internal.NewWorkerPool(500), // TODO: assign as fraction of db max conns, not hardcoded
 	}
@@ -308,7 +310,7 @@ func (h *Handler) Accumulate(ctx context.Context, userID, deviceID, roomID, prev
 	for eventID, nid := range nidsByIDs {
 		txnID, ok := eventIDToTxnID[eventID]
 		if ok {
-			h.pMap.SeenTxnID(eventID)
+			h.PendingTxnIDs.SeenTxnID(eventID)
 			h.v2Pub.Notify(pubsub.ChanV2, &pubsub.V2TransactionID{
 				EventID: eventID,
 				RoomID:  roomID,
@@ -318,7 +320,7 @@ func (h *Handler) Accumulate(ctx context.Context, userID, deviceID, roomID, prev
 				NID: nid,
 			})
 		} else {
-			allClear, _ := h.pMap.MissingTxnID(eventID, userID, deviceID)
+			allClear, _ := h.PendingTxnIDs.MissingTxnID(eventID, userID, deviceID)
 			if allClear {
 				h.v2Pub.Notify(pubsub.ChanV2, &pubsub.V2TransactionID{
 					EventID: eventID,
diff --git a/sync2/handler2/handler_test.go b/sync2/handler2/handler_test.go
index 3b69f07b..b123292b 100644
--- a/sync2/handler2/handler_test.go
+++ b/sync2/handler2/handler_test.go
@@ -42,11 +42,7 @@ func (p *mockPollerMap) NumPollers() int {
 }
 func (p *mockPollerMap) Terminate() {}
-func (p *mockPollerMap) MissingTxnID(eventID, userID, deviceID string) (bool, error) {
-	return false, nil
-}
-
-func (p *mockPollerMap) SeenTxnID(eventID string) error {
+func (p *mockPollerMap) DeviceIDs(userID string) []string {
 	return nil
 }
 
@@ -58,6 +54,7 @@ func (p *mockPollerMap) EnsurePolling(pid sync2.PollerID, accessToken, v2since s
 		isStartup:   isStartup,
 	})
 }
+
 func (p *mockPollerMap) assertCallExists(t *testing.T, pi pollInfo) {
 	for _, c := range p.calls {
 		if reflect.DeepEqual(pi, c) {
diff --git a/sync2/poller.go b/sync2/poller.go
index 77e7767f..fbe2da5c 100644
--- a/sync2/poller.go
+++ b/sync2/poller.go
@@ -64,8 +64,7 @@ type IPollerMap interface {
 	EnsurePolling(pid PollerID, accessToken, v2since string, isStartup bool, logger zerolog.Logger)
 	NumPollers() int
 	Terminate()
-	MissingTxnID(eventID, userID, deviceID string) (bool, error)
-	SeenTxnID(eventID string) error
+	DeviceIDs(userID string) []string
 }
 
 // PollerMap is a map of device ID to Poller
@@ -74,7 +73,6 @@ type PollerMap struct {
 	callbacks           V2DataReceiver
 	pollerMu            *sync.Mutex
 	Pollers             map[PollerID]*poller
-	pendingTxnIDs       *PendingTransactionIDs
 	executor            chan func()
 	executorRunning     bool
 	processHistogramVec *prometheus.HistogramVec
@@ -115,7 +113,6 @@ func NewPollerMap(v2Client Client, enablePrometheus bool) *PollerMap {
 		Pollers:  make(map[PollerID]*poller),
 		executor: make(chan func(), 0),
 	}
-	pm.pendingTxnIDs = NewPendingTransactionIDs(pm.deviceIDs)
 	if enablePrometheus {
 		pm.processHistogramVec = prometheus.NewHistogramVec(prometheus.HistogramOpts{
 			Namespace: "sliding_sync",
@@ -199,9 +196,9 @@ func (h *PollerMap) NumPollers() (count int) {
 	return
 }
 
-// deviceIDs returns the slice of all devices currently being polled for by this user.
+// DeviceIDs returns the slice of all devices currently being polled for by this user.
 // The return value is brand-new and is fully owned by the caller.
-func (h *PollerMap) deviceIDs(userID string) []string {
+func (h *PollerMap) DeviceIDs(userID string) []string {
 	h.pollerMu.Lock()
 	defer h.pollerMu.Unlock()
 	var devices []string
@@ -213,14 +210,6 @@
 	return devices
 }
 
-func (h *PollerMap) MissingTxnID(eventID, userID, deviceID string) (bool, error) {
-	return h.pendingTxnIDs.MissingTxnID(eventID, userID, deviceID)
-}
-
-func (h *PollerMap) SeenTxnID(eventID string) error {
-	return h.pendingTxnIDs.SeenTxnID(eventID)
-}
-
 // EnsurePolling makes sure there is a poller for this device, making one if need be.
 // Blocks until at least 1 sync is done if and only if the poller was just created.
 // This ensures that calls to the database will return data.
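For reference, sync2.PendingTransactionIDs, which the handler now owns, must have roughly the following shape to satisfy the call sites in the diff above: SeenTxnID, MissingTxnID, and a constructor taking a device-lister such as PollerMap.DeviceIDs. This is a sketch reconstructed from those call sites only; the internals are assumptions, not the repository's code.

package sketch

import "sync"

// pendingTransactionIDs tracks, per event, which of the sender's devices
// could still deliver that event with a transaction_id attached.
type pendingTransactionIDs struct {
	mu        sync.Mutex
	deviceIDs func(userID string) []string // e.g. PollerMap.DeviceIDs
	// eventID => device IDs that have not yet reported this event.
	pending map[string][]string
}

func newPendingTransactionIDs(deviceIDs func(userID string) []string) *pendingTransactionIDs {
	return &pendingTransactionIDs{
		deviceIDs: deviceIDs,
		pending:   make(map[string][]string),
	}
}

// missingTxnID records that deviceID saw eventID without a txn ID. It
// returns true ("all clear") once no remaining device could supply one,
// at which point the event can be released to the sender's connections.
func (p *pendingTransactionIDs) missingTxnID(eventID, userID, deviceID string) (bool, error) {
	p.mu.Lock()
	defer p.mu.Unlock()
	remaining, ok := p.pending[eventID]
	if !ok {
		// First sighting: any other device of this user might still
		// deliver the event with a txn ID attached.
		remaining = p.deviceIDs(userID)
	}
	filtered := make([]string, 0, len(remaining))
	for _, d := range remaining {
		if d != deviceID {
			filtered = append(filtered, d)
		}
	}
	p.pending[eventID] = filtered
	return len(filtered) == 0, nil
}

// seenTxnID records that some poller saw eventID with a txn ID, so there
// is nothing left to wait for.
func (p *pendingTransactionIDs) seenTxnID(eventID string) error {
	p.mu.Lock()
	defer p.mu.Unlock()
	delete(p.pending, eventID)
	return nil
}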
From 9a787d08ab0175c7c4330bf6ce7a76de0e7ee1c2 Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Tue, 1 Aug 2023 17:02:21 +0100
Subject: [PATCH 156/156] Add extra integration test

---
 tests-integration/timeline_test.go | 115 +++++++++++++++++++++++++++++
 1 file changed, 115 insertions(+)

diff --git a/tests-integration/timeline_test.go b/tests-integration/timeline_test.go
index 62ba9230..3a7f6237 100644
--- a/tests-integration/timeline_test.go
+++ b/tests-integration/timeline_test.go
@@ -816,6 +816,121 @@ func TestTimelineTxnIDBuffersForTxnID(t *testing.T) {
 
 }
 
+// Similar to TestTimelineTxnIDBuffersForTxnID, this test checks:
+//   - Bob's poller sees Alice's event,
+//   - Alice does an incremental sync, which should omit her event,
+//   - Alice's poller sees Alice's event without a txn_id, and
+//   - Alice syncs, seeing her event without txn_id.
+// I.e. we're checking that the "all clear" empties out the buffer of events.
+func TestTimelineTxnIDRespectsAllClear(t *testing.T) {
+	pqString := testutils.PrepareDBConnectionString()
+	// setup code
+	v2 := runTestV2Server(t)
+	v3 := runTestServer(t, v2, pqString, slidingsync.Opts{
+		// This needs to be greater than the request timeout, which is hardcoded to a
+		// minimum of 100ms in connStateLive.liveUpdate. This ensures that the
+		// liveUpdate call finishes before the TxnIDWaiter publishes the update,
+		// meaning that Alice doesn't see her event before the txn ID is known.
+		MaxTransactionIDDelay: 200 * time.Millisecond,
+	})
+	defer v2.close()
+	defer v3.close()
+	roomID := "!a:localhost"
+	latestTimestamp := time.Now()
+	t.Log("Alice and Bob are in the same room.")
+	room := roomEvents{
+		roomID: roomID,
+		events: append(
+			createRoomState(t, alice, latestTimestamp),
+			testutils.NewJoinEvent(t, bob),
+		),
+	}
+	v2.addAccount(t, alice, aliceToken)
+	v2.addAccount(t, bob, bobToken)
+	v2.queueResponse(alice, sync2.SyncResponse{
+		Rooms: sync2.SyncRoomsResponse{
+			Join: v2JoinTimeline(room),
+		},
+		NextBatch: "alice_after_initial_poll",
+	})
+	v2.queueResponse(bob, sync2.SyncResponse{
+		Rooms: sync2.SyncRoomsResponse{
+			Join: v2JoinTimeline(room),
+		},
+		NextBatch: "bob_after_initial_poll",
+	})
+
+	t.Log("Alice and Bob make initial sliding syncs.")
+	aliceRes := v3.mustDoV3Request(t, aliceToken, sync3.Request{
+		Lists: map[string]sync3.RequestList{"a": {
+			Ranges: sync3.SliceRanges{
+				[2]int64{0, 10},
+			},
+			RoomSubscription: sync3.RoomSubscription{
+				TimelineLimit: 2,
+			},
+		},
+		},
+	})
+	bobRes := v3.mustDoV3Request(t, bobToken, sync3.Request{
+		Lists: map[string]sync3.RequestList{"a": {
+			Ranges: sync3.SliceRanges{
+				[2]int64{0, 10},
+			},
+			RoomSubscription: sync3.RoomSubscription{
+				TimelineLimit: 2,
+			},
+		},
+		},
+	})
+
+	t.Log("Alice has sent a message... but it arrives down Bob's poller first, without a transaction_id.")
+	newEventNoTxn := testutils.NewEvent(t, "m.room.message", alice, map[string]interface{}{"body": "hi"})
+
+	v2.queueResponse(bob, sync2.SyncResponse{
+		Rooms: sync2.SyncRoomsResponse{
+			Join: v2JoinTimeline(roomEvents{
+				roomID: roomID,
+				events: []json.RawMessage{newEventNoTxn},
+			}),
+		},
+	})
+	t.Log("Bob's poller sees the message.")
+	v2.waitUntilEmpty(t, bob)
+
+	t.Log("Bob makes an incremental sliding sync.")
+	bobRes = v3.mustDoV3RequestWithPos(t, bobToken, bobRes.Pos, sync3.Request{})
+	t.Log("Bob should see the message without a transaction_id.")
+	m.MatchResponse(t, bobRes, m.MatchList("a", m.MatchV3Count(1)), m.MatchNoV3Ops(), m.MatchRoomSubscription(
+		roomID, m.MatchRoomTimelineMostRecent(1, []json.RawMessage{newEventNoTxn}),
+	))
+
+	t.Log("Alice requests an incremental sliding sync with no request changes.")
+	aliceRes = v3.mustDoV3RequestWithPos(t, aliceToken, aliceRes.Pos, sync3.Request{})
+	t.Log("Alice should see no messages.")
+	m.MatchResponse(t, aliceRes, m.MatchRoomSubscriptionsStrict(nil))
+
+	// Now the message arrives down Alice's poller.
+ v2.queueResponse(alice, sync2.SyncResponse{ + Rooms: sync2.SyncRoomsResponse{ + Join: v2JoinTimeline(roomEvents{ + roomID: roomID, + events: []json.RawMessage{newEventNoTxn}, + }), + }, + }) + t.Log("Alice's poller sees the message without transaction_id.") + v2.waitUntilEmpty(t, alice) + + t.Log("Alice makes another incremental sync request.") + aliceRes = v3.mustDoV3RequestWithPos(t, aliceToken, aliceRes.Pos, sync3.Request{}) + t.Log("Alice's sync response includes the event without a txn ID.") + m.MatchResponse(t, aliceRes, m.MatchList("a", m.MatchV3Count(1)), m.MatchNoV3Ops(), m.MatchRoomSubscription( + roomID, m.MatchRoomTimelineMostRecent(1, []json.RawMessage{newEventNoTxn}), + )) + +} + // Executes a sync v3 request without a ?pos and asserts that the count, rooms and timeline events m.Match the inputs given. func testTimelineLoadInitialEvents(v3 *testV3Server, token string, count int, wantRooms []roomEvents, numTimelineEventsPerRoom int) func(t *testing.T) { return func(t *testing.T) {
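To make the test's final step concrete: the "all clear" is a V2TransactionID-style payload carrying no txn ID, published once every device that could have supplied one has reported the event without it. Below is a standalone sketch of a consumer reacting to that payload; the names are assumptions for illustration, not the repository's sync3 code.

package sketch

import "sync"

// event is a stand-in for a buffered timeline event.
type event struct {
	id    string
	txnID string // filled in if the sender's poller supplies one
}

// allClearBuffer sketches the consumer side exercised by the test above.
type allClearBuffer struct {
	mu      sync.Mutex
	queued  map[string][]event // room ID => events held back for the sender
	publish func(event)
}

// onTransactionID handles a V2TransactionID-style payload. An empty txnID
// is the "all clear": no device will ever supply one. Either way, the
// room's queue is drained in order, which is what lets Alice's final
// incremental sync in the test above return her event.
func (b *allClearBuffer) onTransactionID(roomID, eventID, txnID string) {
	b.mu.Lock()
	queued := b.queued[roomID]
	delete(b.queued, roomID)
	b.mu.Unlock()
	for _, ev := range queued {
		if ev.id == eventID && txnID != "" {
			ev.txnID = txnID // decorate before delivery
		}
		b.publish(ev)
	}
}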