Skip to content

Commit 84b600a

Browse files
Bug 1990006 - Create an initial ETL workflow for gecko-trace component
This patch adds an initial ETL workflow for processing traces collected by the [gecko-trace component][1] from various Gecko-based Firefox products. [1]: https://searchfox.org/firefox-main/source/toolkit/components/gecko-trace
1 parent 6665ce3 commit 84b600a

File tree

25 files changed

+990
-0
lines changed

25 files changed

+990
-0
lines changed

dags.yaml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2505,3 +2505,25 @@ bqetl_market_intel_bot:
25052505
tags:
25062506
- impact/tier_3
25072507
- repo/bigquery-etl
2508+
2509+
bqetl_gecko_trace:
2510+
catchup: false
2511+
default_args:
2512+
depends_on_past: false
2513+
email:
2514+
2515+
2516+
email_on_failure: true
2517+
email_on_retry: true
2518+
end_date: null
2519+
max_active_tis_per_dag: null
2520+
2521+
retries: 2
2522+
retry_delay: 30m
2523+
start_date: "2025-09-26"
2524+
description: |
2525+
Processes gecko trace data across multiple Firefox applications.
2526+
repo: bigquery-etl
2527+
schedule_interval: 0 9 * * *
2528+
tags:
2529+
- impact/tier_3
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# gecko_trace.build_root_span
2+
3+
Builds a root span tree structure from an array of span objects.
4+
5+
## Signature
6+
7+
```sql
8+
gecko_trace.build_root_span(spans ARRAY<JSON>) RETURNS JSON
9+
```
10+
11+
## Arguments
12+
13+
- `spans`: Array of JSON objects representing individual spans. Each span should
14+
contain at minimum:
15+
- `span_id`: Unique identifier for the span
16+
- `parent_span_id`: ID of the parent span (null for root spans)
17+
18+
## Description
19+
20+
Takes an array of JSON span objects and constructs a hierarchical tree structure
21+
by linking spans with their parent-child relationships.
22+
23+
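If no explicit root span is found (no span has an empty `parent_span_id`), the
function looks for a "missing" root: a parent that is referenced through
`parent_span_id` but is not itself present in `spans`. If exactly one such
parent exists, a placeholder for it (marked with `type: "missing"`) is returned
as the root. If there are multiple or no missing roots, an error is thrown.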
26+
27+
## Returns
28+
29+
Returns a JSON object representing the root span with all child spans nested in
30+
`childSpans` arrays throughout the tree structure.
31+
32+
## Example
33+
34+
```sql
35+
SELECT gecko_trace.build_root_span([
36+
JSON '{"span_id": "root", "parent_span_id": null, "name": "main_process"}',
37+
JSON '{"span_id": "child1", "parent_span_id": "root", "name": "network_request"}',
38+
JSON '{"span_id": "child2", "parent_span_id": "root", "name": "dom_parse"}',
39+
JSON '{"span_id": "grandchild", "parent_span_id": "child1", "name": "dns_lookup"}'
40+
])
41+
```
42+
43+
This would return a tree structure where the root span contains two child spans
44+
in its `childSpans` array, and one of those children has its own child span.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
---
2+
friendly_name: Gecko Trace Build Root Span
3+
description: |-
4+
Builds a root span tree structure from an array of span objects.
5+
6+
Takes an array of JSON span objects and constructs a hierarchical tree structure
7+
by linking spans with their parent-child relationships. Returns the root span
8+
with all child spans nested in a `childSpans` array property.
9+
10+
If no root span is found, the function will attempt to find a single "missing"
11+
root span. If there are multiple or no missing roots, an error is thrown.
12+
13+
This function is used for processing Gecko trace data to reconstruct the
14+
hierarchical structure of spans within a trace.
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
-- Reconstructs the span tree of a single trace from a flat array of spans.
--
-- Arguments:
--   spans: JSON span objects. Only `span_id` and `parent_span_id` are read
--          here; every other field is passed through untouched.
-- Returns:
--   The root span with its descendants nested under `childSpans`, or NULL when
--   `spans` is NULL or empty.
-- Raises:
--   An error when no span lacks a parent and the number of parents that are
--   referenced but absent from `spans` is not exactly one.
CREATE OR REPLACE FUNCTION gecko_trace.build_root_span(spans ARRAY<JSON>)
RETURNS JSON
LANGUAGE js AS r"""
  // An empty trace has no tree to build. Without this guard the lookup for a
  // "missing" root below finds zero candidates and throws.
  if (spans == null || spans.length === 0) {
    return null;
  }

  const spansById = new Map();
  let rootSpanId;

  for (const span of spans) {
    const spanId = span.span_id;

    // Children may have been seen before their parent; in that case an entry
    // (usually a "missing" placeholder) already collects them under this id.
    // Adopt those children and replace the entry with the real span.
    const earlierEntry = spansById.get(spanId);
    span.childSpans = earlierEntry?.childSpans ?? [];
    spansById.set(spanId, span);

    if (!span.parent_span_id) {
      // No parent: this is the root. If several spans qualify, the last wins.
      rootSpanId = spanId;
      continue;
    }

    // Attach to the parent, creating a placeholder when the parent has not
    // been seen (yet). The placeholder is replaced above if the parent shows up.
    let parent = spansById.get(span.parent_span_id);
    if (parent === undefined) {
      parent = {
        span_id: span.parent_span_id,
        childSpans: [],
        type: "missing",
      };
      spansById.set(span.parent_span_id, parent);
    }
    parent.childSpans.push(span);
  }

  if (!rootSpanId) {
    // No explicit root: fall back to the single parent that was referenced but
    // never delivered. Anything else is ambiguous, so refuse to guess.
    const missingRoots = Array.from(spansById.values()).filter(
      (span) => span.type === "missing",
    );
    if (missingRoots.length !== 1) {
      throw new Error(
        `Unable to construct span tree: expected exactly one missing root span, but found ${missingRoots.length}`,
      );
    }

    rootSpanId = missingRoots[0].span_id;
  }

  return spansById.get(rootSpanId);
""";

-- Tests
SELECT
  -- Test with simple parent-child relationship
  assert.not_null(
    gecko_trace.build_root_span(
      [
        JSON '{"span_id": "root", "parent_span_id": null, "name": "root_span"}',
        JSON '{"span_id": "child1", "parent_span_id": "root", "name": "child_span"}'
      ]
    )
  ),
  -- Test with empty array
  assert.null(gecko_trace.build_root_span([])),
  -- Test single span (should be root)
  assert.equals(
    "root",
    JSON_VALUE(
      gecko_trace.build_root_span(
        [JSON '{"span_id": "root", "parent_span_id": null, "name": "root_span"}']
      ),
      "$.span_id"
    )
  ),
  -- Test that a child listed before its parent is still attached to it
  assert.equals(
    "child1",
    JSON_VALUE(
      gecko_trace.build_root_span(
        [
          JSON '{"span_id": "child1", "parent_span_id": "root", "name": "child_span"}',
          JSON '{"span_id": "root", "parent_span_id": null, "name": "root_span"}'
        ]
      ),
      "$.childSpans[0].span_id"
    )
  ),
  -- Test that a single absent parent becomes a "missing" placeholder root
  assert.equals(
    "missing",
    JSON_VALUE(
      gecko_trace.build_root_span(
        [JSON '{"span_id": "child1", "parent_span_id": "gone", "name": "child_span"}']
      ),
      "$.type"
    )
  );
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# gecko_trace.calculate_signature
2+
3+
Calculates a signature hash for a trace based on its root span structure.
4+
5+
## Signature
6+
7+
```sql
8+
gecko_trace.calculate_signature(rootSpan JSON) RETURNS STRING
9+
```
10+
11+
## Arguments
12+
13+
- `rootSpan`: JSON object representing the root span of a trace tree, typically
14+
generated by `gecko_trace.build_root_span()`. Should contain:
15+
- `name`: Span name
16+
- `scope`: Object with `name` property
17+
- `resource`: Object with `attributes` property
18+
- `events`: Optional array of event objects with `name` and `attributes`
19+
- `childSpans`: Array of child span objects with the same structure
20+
21+
## Description
22+
23+
Uses a fast hash function (cyrb64) to generate a deterministic signature based
24+
on the hierarchical structure and attributes of spans in a trace. The signature
25+
is calculated by traversing the span tree depth-first and hashing:
26+
27+
- Resource attributes (excluding certain internal IDs like
28+
`gecko_process_internal_id`)
29+
- Scope names
30+
- Span names
31+
- Event names and attributes
32+
33+
## Returns
34+
35+
Returns a string hash that serves as a deterministic signature for the trace
36+
structure. Traces with identical signatures have the same execution pattern and
37+
can be grouped together for analysis.
38+
39+
## Example
40+
41+
```sql
42+
WITH root_span AS (
43+
SELECT gecko_trace.build_root_span(spans_array) as root
44+
FROM traces_table
45+
WHERE trace_id = 'some_trace_id'
46+
)
47+
SELECT gecko_trace.calculate_signature(root) as signature
48+
FROM root_span
49+
```
50+
51+
## Notes
52+
53+
- Internal process IDs and other volatile attributes are excluded from hashing
54+
to focus on logical execution patterns
55+
- Used in conjunction with `gecko_trace.build_root_span()` for complete trace
56+
analysis workflows
57+
- Returns empty string for NULL input
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
---
2+
friendly_name: Gecko Trace Calculate Signature
3+
description: |-
4+
Calculates a signature hash for a trace based on its root span structure.
5+
6+
Uses a fast hash function (cyrb64) to generate a deterministic signature
7+
based on the hierarchical structure and attributes of spans in a trace.
8+
The signature is calculated by traversing the span tree and hashing:
9+
- Resource attributes (excluding certain internal IDs like gecko_process_internal_id)
10+
- Scope names
11+
- Span names
12+
- Event names and attributes
13+
14+
The function returns a string hash that can be used to identify traces with
15+
similar execution patterns.
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
-- Computes a deterministic signature for a trace from its span tree.
--
-- Arguments:
--   rootSpan: root of a span tree, typically the output of
--             gecko_trace.build_root_span(). Each span is read for
--             `resource.attributes`, `scope.name`, `name`, optional `events`
--             (each with `name` and optional `attributes`) and `childSpans`.
-- Returns:
--   A 14-character base-36 digest, or the empty string when rootSpan is NULL.
CREATE OR REPLACE FUNCTION gecko_trace.calculate_signature(rootSpan JSON)
RETURNS STRING
LANGUAGE js AS r"""
  // No trace, no signature. Without this guard the traversal below would
  // dereference null and fail the whole query.
  if (rootSpan == null) {
    return "";
  }

  // cyrb53 (c) 2018 bryc (github.com/bryc). License: Public domain. Attribution appreciated.
  // A fast and simple 64-bit (or 53-bit) string hash function with decent collision resistance.
  // Largely inspired by MurmurHash2/3, but with a focus on speed/simplicity.
  // See https://stackoverflow.com/questions/7616461/generate-a-hash-from-string-in-javascript/52171480#52171480
  // https://github.com/bryc/code/blob/master/jshash/experimental/cyrb53.js
  //
  // Named cyrb64 here because both 32-bit halves are returned instead of the
  // 53-bit number produced by the upstream variant.
  const cyrb64 = (str, seed = 0) => {
    let h1 = 0xdeadbeef ^ seed;
    let h2 = 0x41c6ce57 ^ seed;
    for (let i = 0; i < str.length; i++) {
      const ch = str.charCodeAt(i);
      h1 = Math.imul(h1 ^ ch, 2654435761);
      h2 = Math.imul(h2 ^ ch, 1597334677);
    }
    h1 = Math.imul(h1 ^ (h1 >>> 16), 2246822507);
    h1 ^= Math.imul(h2 ^ (h2 >>> 13), 3266489909);
    h2 = Math.imul(h2 ^ (h2 >>> 16), 2246822507);
    h2 ^= Math.imul(h1 ^ (h1 >>> 13), 3266489909);
    // For a single 53-bit numeric return value we could return
    // 4294967296 * (2097151 & h2) + (h1 >>> 0);
    // but we instead return the full 64-bit value:
    return [h2 >>> 0, h1 >>> 0];
  };

  const seed = 0;
  // Running digest: every hashed value is mixed into the previous digest, so
  // the final value depends on both the content and the order of everything fed in.
  let digest = "";
  const hash = (str) => {
    // Non-string values are coerced to strings by the concatenation.
    const [h2, h1] = cyrb64(digest + str, seed);
    // Each 32-bit half needs at most 7 base-36 digits; pad to a fixed width.
    digest =
      h2.toString(36).padStart(7, "0") + h1.toString(36).padStart(7, "0");
  };

  // Attributes that vary between otherwise identical traces and are therefore
  // left out of the signature. A Set is used because `key in object` also
  // matches inherited names such as "constructor" or "toString", which would
  // silently drop real attributes with those names from the hash.
  const ATTRS_TO_SKIP = new Set(["gecko_process_internal_id"]);
  const hashAttrs = (attrs) => {
    // Absent attributes contribute nothing, exactly like an empty object.
    for (const [key, value] of Object.entries(attrs ?? {})) {
      if (ATTRS_TO_SKIP.has(key)) continue;
      hash(key);
      hash(value);
    }
  };

  const hashEvents = (events) => {
    for (const event of events) {
      hash(event.name);
      hashAttrs(event.attributes);
    }
  };

  // Iterative depth-first traversal. pop() takes the most recently pushed
  // span, so siblings are visited last-to-first; the order is fixed for a
  // given tree, which keeps the signature deterministic.
  const stack = [rootSpan];
  while (stack.length > 0) {
    const span = stack.pop();
    hashAttrs(span.resource.attributes);
    hash(span.scope.name);
    hash(span.name);
    if (span.events) {
      hashEvents(span.events);
    }
    // Spans without a childSpans array are treated as leaves.
    stack.push(...(span.childSpans ?? []));
  }

  return digest;
""";

-- Tests
SELECT
  -- Test with simple root span
  assert.not_null(
    gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test", "scope": {"name": "test_scope"}, "resource": {"attributes": {}}, "childSpans": []}'
    )
  ),
  -- Test that same input produces same signature
  assert.equals(
    gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test", "scope": {"name": "test_scope"}, "resource": {"attributes": {}}, "childSpans": []}'
    ),
    gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test", "scope": {"name": "test_scope"}, "resource": {"attributes": {}}, "childSpans": []}'
    )
  ),
  -- Test that null input returns empty string
  assert.equals("", gecko_trace.calculate_signature(NULL)),
  -- Test that skipped attributes do not influence the signature
  assert.equals(
    gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test", "scope": {"name": "test_scope"}, "resource": {"attributes": {"gecko_process_internal_id": "1"}}, "childSpans": []}'
    ),
    gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test", "scope": {"name": "test_scope"}, "resource": {"attributes": {"gecko_process_internal_id": "2"}}, "childSpans": []}'
    )
  ),
  -- Test that attributes named like Object.prototype members are still hashed
  assert.true(
    gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test", "scope": {"name": "test_scope"}, "resource": {"attributes": {"constructor": "a"}}, "childSpans": []}'
    ) != gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test", "scope": {"name": "test_scope"}, "resource": {"attributes": {"constructor": "b"}}, "childSpans": []}'
    )
  );

0 commit comments

Comments
 (0)