From 7a64877463facb716b62b08ce1da9ba0f4dd099c Mon Sep 17 00:00:00 2001 From: Chris Danis Date: Mon, 23 Sep 2024 16:15:40 -0400 Subject: [PATCH] group and dedupe by hashcode Signed-off-by: Chris Danis --- cmd/query/app/querysvc/adjusters.go | 2 +- model/adjuster/span_hash_deduper.go | 32 ++++++++++++++++++------ model/adjuster/span_hash_deduper_test.go | 24 +++++++++--------- 3 files changed, 38 insertions(+), 20 deletions(-) diff --git a/cmd/query/app/querysvc/adjusters.go b/cmd/query/app/querysvc/adjusters.go index be2469f4ab0..24e84a161df 100644 --- a/cmd/query/app/querysvc/adjusters.go +++ b/cmd/query/app/querysvc/adjusters.go @@ -15,7 +15,7 @@ import ( func StandardAdjusters(maxClockSkewAdjust time.Duration) []adjuster.Adjuster { return []adjuster.Adjuster{ adjuster.ZipkinSpanIDUniquifier(), - adjuster.DedupeBySpanID(), + adjuster.DedupeBySpanHash(), adjuster.ClockSkew(maxClockSkewAdjust), adjuster.IPTagAdjuster(), adjuster.OTelTagAdjuster(), diff --git a/model/adjuster/span_hash_deduper.go b/model/adjuster/span_hash_deduper.go index 6886c464993..44318246348 100644 --- a/model/adjuster/span_hash_deduper.go +++ b/model/adjuster/span_hash_deduper.go @@ -8,21 +8,39 @@ import ( "github.com/jaegertracing/jaeger/model" ) -// DedupeBySpanID returns an adjuster that removes all but one span with the same SpanID. +// DedupeBySpanHash returns an adjuster that removes all but one span with the same hashcode. // This is useful for when spans are duplicated in archival storage, as happens with // ElasticSearch archival. -func DedupeBySpanID() Adjuster { +func DedupeBySpanHash() Adjuster { return Func(func(trace *model.Trace) (*model.Trace, error) { - deduper := &spanIDDeduper{trace: trace} - deduper.groupSpansByID() - deduper.dedupeSpansByID() + deduper := &spanHashDeduper{trace: trace} + deduper.groupSpansByHash() + deduper.dedupeSpansByHash() return deduper.trace, nil }) } -func (d *spanIDDeduper) dedupeSpansByID() { +type spanHashDeduper struct { + trace *model.Trace + spansByHash map[uint64][]*model.Span +} + +func (d *spanHashDeduper) groupSpansByHash() { + spansByHash := make(map[uint64][]*model.Span) + for _, span := range d.trace.Spans { + hash, _ := model.HashCode(span) + if spans, ok := spansByHash[hash]; ok { + spansByHash[hash] = append(spans, span) + } else { + spansByHash[hash] = []*model.Span{span} + } + } + d.spansByHash = spansByHash +} + +func (d *spanHashDeduper) dedupeSpansByHash() { d.trace.Spans = nil - for _, spans := range d.spansByID { + for _, spans := range d.spansByHash { d.trace.Spans = append(d.trace.Spans, spans[0]) } } diff --git a/model/adjuster/span_hash_deduper_test.go b/model/adjuster/span_hash_deduper_test.go index dc3cefcea3c..ba7625a68b4 100644 --- a/model/adjuster/span_hash_deduper_test.go +++ b/model/adjuster/span_hash_deduper_test.go @@ -50,16 +50,15 @@ func newUniqueSpansTrace() *model.Trace { Spans: []*model.Span{ { TraceID: traceID, - SpanID: clientSpanID, + SpanID: anotherSpanID, Tags: model.KeyValues{ // span.kind = server model.String(keySpanKind, trace.SpanKindServer.String()), }, }, { - // some other span, child of server span TraceID: traceID, - SpanID: anotherSpanID, + SpanID: anotherSpanID, // same ID as before, but different metadata References: []model.SpanRef{model.NewChildOfRef(traceID, clientSpanID)}, }, }, @@ -74,9 +73,9 @@ func getSpanIDs(spans []*model.Span) []int { return ids } -func TestDedupeBySpanIDTriggers(t *testing.T) { +func TestDedupeBySpanHashTriggers(t *testing.T) { trace := newDuplicatedSpansTrace() - deduper := DedupeBySpanID() + deduper := DedupeBySpanHash() trace, err := deduper.Adjust(trace) require.NoError(t, err) @@ -86,28 +85,29 @@ func TestDedupeBySpanIDTriggers(t *testing.T) { assert.ElementsMatch(t, []int{int(clientSpanID), int(anotherSpanID)}, ids, "should keep unique span IDs") } -func TestDedupeBySpanIDNotTriggered(t *testing.T) { +func TestDedupeBySpanHashNotTriggered(t *testing.T) { trace := newUniqueSpansTrace() - deduper := DedupeBySpanID() + deduper := DedupeBySpanHash() trace, err := deduper.Adjust(trace) require.NoError(t, err) assert.Len(t, trace.Spans, 2, "should not dedupe spans") ids := getSpanIDs(trace.Spans) - assert.ElementsMatch(t, []int{int(clientSpanID), int(anotherSpanID)}, ids, "should keep unique span IDs") + assert.ElementsMatch(t, []int{int(anotherSpanID), int(anotherSpanID)}, ids, "should keep unique span IDs") + assert.NotEqual(t, trace.Spans[0], trace.Spans[1], "should keep unique hashcodes") } -func TestDedupeBySpanIDEmpty(t *testing.T) { +func TestDedupeBySpanHashEmpty(t *testing.T) { trace := &model.Trace{} - deduper := DedupeBySpanID() + deduper := DedupeBySpanHash() trace, err := deduper.Adjust(trace) require.NoError(t, err) assert.Empty(t, trace.Spans, "should be empty") } -func TestDedupeBySpanIDManyManySpans(t *testing.T) { +func TestDedupeBySpanHashManyManySpans(t *testing.T) { traceID := model.NewTraceID(0, 42) spans := make([]*model.Span, 0, 100) const distinctSpanIDs = 10 @@ -118,7 +118,7 @@ func TestDedupeBySpanIDManyManySpans(t *testing.T) { }) } trace := &model.Trace{Spans: spans} - deduper := DedupeBySpanID() + deduper := DedupeBySpanHash() trace, err := deduper.Adjust(trace) require.NoError(t, err)