Skip to content

Commit 58aafbc

Browse files
wwyc and BenWu authored
DENG-9849: Update copy dedup code to null out city and subdivision fields for browser apps (#8362)
* Null geo in copy_dedup * fixed ci failures * Fixed param type * Used channel to app name mapping * Update bigquery_etl/copy_deduplicate.py Co-authored-by: Ben Wu <[email protected]> * Use app id * Added client id check * Update bigquery_etl/copy_deduplicate.py Co-authored-by: Ben Wu <[email protected]> * Added regex table id --------- Co-authored-by: Ben Wu <[email protected]>
1 parent d7a18b7 commit 58aafbc

File tree

2 files changed

+70
-2
lines changed

2 files changed

+70
-2
lines changed

bigquery_etl/copy_deduplicate.py

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,19 @@
99

1010
import json
1111
import logging
12+
import re
1213
from datetime import datetime, timedelta
1314
from functools import partial
1415
from itertools import groupby
1516
from multiprocessing.pool import ThreadPool
17+
from typing import List
1618

1719
import click
1820
from google.api_core.exceptions import BadRequest
1921
from google.cloud import bigquery
2022

2123
from bigquery_etl.cli.utils import table_matches_patterns
24+
from bigquery_etl.config import ConfigLoader
2225
from bigquery_etl.util.bigquery_id import sql_table_id
2326
from bigquery_etl.util.client_queue import ClientQueue
2427
from bigquery_etl.util.common import TempDatasetReference
@@ -73,14 +76,72 @@
7376
--
7477
-- Retain only one document for each ID.
7578
SELECT
76-
* EXCEPT(_n)
79+
* EXCEPT(_n){replace_geo}
7780
FROM
7881
numbered_duplicates
7982
WHERE
8083
_n = 1
8184
"""
8285

8386

87+
def _has_field_path(schema: List[bigquery.SchemaField], path: List[str]) -> bool:
    """Report whether a nested field path is present in a schema.

    Walks ``path`` one name at a time (e.g. ``['metadata', 'geo', 'city']``),
    matching each name exactly against the fields at the current nesting
    level and then descending into the matched field's ``fields``.

    Returns False as soon as a name cannot be found; returns True when every
    name was matched (which includes the case of an empty ``path``).
    """
    level = schema
    for segment in path:
        match = None
        for candidate in level:
            if candidate.name == segment:
                match = candidate
                break
        if not match:
            return False
        # Leaf fields may expose no (or empty/None) sub-fields; treat that as
        # an empty level so any further segment simply fails to match.
        level = getattr(match, "fields", []) or []
    return True
95+
96+
97+
def _select_geo(live_table: str, client: bigquery.Client) -> str:
    """Return the ``REPLACE (...)`` clause that NULLs precise geo fields.

    The result is appended to the ``SELECT * EXCEPT(_n)`` of the dedup query.
    An empty string (no replacement) is returned when any of these holds:

    * the table name, with a trailing ``_v<N>`` removed, is listed under
      ``geo_deprecation.skip_tables`` in the project config;
    * the dataset name, with a trailing ``_live`` removed, is not listed under
      ``geo_deprecation.include_app_ids``;
    * the live table's ``include_client_id`` label is not ``"true"``;
    * ``metadata.geo.city``, ``metadata.geo.subdivision1`` or
      ``metadata.geo.subdivision2`` is missing from the live table's schema.

    Otherwise the returned clause rewrites ``metadata.geo`` so that those
    three fields are ``CAST(NULL AS STRING)``.

    ``live_table`` must be a three-part ``project.dataset.table`` id.
    """
    _project, dataset_name, table_name = live_table.split(".")

    skip_tables = set(ConfigLoader.get("geo_deprecation", "skip_tables", fallback=[]))
    unversioned_name = re.sub(r"_v\d+$", "", table_name)
    if unversioned_name in skip_tables:
        return ""

    target_apps = set(
        ConfigLoader.get("geo_deprecation", "include_app_ids", fallback=[])
    )
    if dataset_name.removesuffix("_live") not in target_apps:
        return ""

    # Table metadata is only fetched once the config-based checks have passed.
    live = client.get_table(live_table)

    if live.labels.get("include_client_id") != "true":
        return ""

    # Check the schema to ensure all of the geo fields exist.
    live_schema = live.schema
    for leaf in ("city", "subdivision1", "subdivision2"):
        if not _has_field_path(live_schema, ["metadata", "geo", leaf]):
            return ""

    return """
        REPLACE (
            (SELECT AS STRUCT
                metadata.* REPLACE (
                    (SELECT AS STRUCT
                        metadata.geo.* REPLACE (
                            CAST(NULL AS STRING) AS city,
                            CAST(NULL AS STRING) AS subdivision1,
                            CAST(NULL AS STRING) AS subdivision2
                        )
                    ) AS geo
                )
            ) AS metadata)
    """
143+
144+
84145
def _get_query_job_configs(
85146
client,
86147
live_table,
@@ -92,7 +153,8 @@ def _get_query_job_configs(
92153
num_retries,
93154
temp_dataset,
94155
):
95-
sql = QUERY_TEMPLATE.format(live_table=live_table)
156+
replace_geo = _select_geo(live_table, client)
157+
sql = QUERY_TEMPLATE.format(live_table=live_table, replace_geo=replace_geo)
96158
stable_table = f"{live_table.replace('_live.', '_stable.', 1)}${date:%Y%m%d}"
97159
kwargs = dict(use_legacy_sql=False, dry_run=dry_run, priority=priority)
98160
start_time = datetime(*date.timetuple()[:6])

bqetl_project.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,6 +558,12 @@ generate:
558558
- firefox_desktop_background_defaultagent
559559
- firefox_desktop_background_tasks
560560

561+
geo_deprecation:
562+
include_app_ids:
563+
- org_mozilla_ios_klar
564+
skip_tables:
565+
- newtab
566+
561567
retention_exclusion_list:
562568
- sql/moz-fx-data-shared-prod/search_derived/acer_cohort_v1
563569
- sql/moz-fx-data-shared-prod/telemetry_derived/clients_first_seen_v3

0 commit comments

Comments
 (0)