
Commit 2137c89

Merge remote-tracking branch 'origin/main'
2 parents: 886aff8 + b79a7b9

13 files changed: +338 -30 lines changed


bigquery_etl/backfill/shredder_mitigation.py

Lines changed: 21 additions & 7 deletions
@@ -30,6 +30,7 @@
 DEFAULT_PROJECT_ID = "moz-fx-data-shared-prod"
 SHREDDER_MITIGATION_QUERY_NAME = "shredder_mitigation_query"
 SHREDDER_MITIGATION_CHECKS_NAME = "shredder_mitigation_checks"
+DEFAULT_FOR_NULLS = "??"
 WILDCARD_STRING = "???????"
 WILDCARD_NUMBER = -9999999
 QUERY_FILE_RE = re.compile(
@@ -202,7 +203,7 @@ def generate_query(
         if not select_list or not from_clause:
             raise click.ClickException(
                 f"Missing required clause to generate query.\n"
-                f"Actuals: SELECT: {select_list}, FROM: {self.full_table_id}"
+                f"Actual: SELECT: {select_list}, FROM: {self.full_table_id}"
             )
         query = f"SELECT {', '.join(map(str, select_list))}"
         query += f" FROM {from_clause}" if from_clause is not None else ""
@@ -575,7 +576,8 @@ def generate_query_with_shredder_mitigation(
     common_select = (
         [previous.partitioning["field"]]
         + [
-            f"COALESCE({dim.name}, '{WILDCARD_STRING}') AS {dim.name}"
+            f"IF({dim.name} IS NULL OR {dim.name} = '{DEFAULT_FOR_NULLS}', '{WILDCARD_STRING}',"
+            f" {dim.name}) AS {dim.name}"
             for dim in common_dimensions
             if (
                 dim.name != previous.partitioning["field"]
@@ -688,7 +690,7 @@ def generate_query_with_shredder_mitigation(
             if metric.data_type != DataTypeGroup.FLOAT
         ]
         + [
-            f"ROUND({previous_agg.query_cte}.{metric.name}, 10) - "  # Round FLOAT to avoid exponentials.
+            f"ROUND({previous_agg.query_cte}.{metric.name}, 10) - "  # Round FLOAT to avoid exponential numbers.
             f"ROUND(COALESCE({new_agg.query_cte}.{metric.name}, 0), 10) AS {metric.name}"
             for metric in metrics
             if metric.data_type == DataTypeGroup.FLOAT
@@ -758,13 +760,13 @@ def generate_query_with_shredder_mitigation(
     final_select = f"{', '.join(combined_list)}"

     # Generate formatted output strings to display generated-query information in console.
-    common_ouput = "".join(
+    common_output = "".join(
         [
             f"{dim.column_type.name} > {dim.name}:{dim.data_type.name}\n"
             for dim in common_dimensions
         ]
     )
-    metrics_ouput = "".join(
+    metrics_output = "".join(
         [
             f"{dim.column_type.name} > {dim.name}:{dim.data_type.name}\n"
             for dim in metrics
@@ -778,7 +780,7 @@ def generate_query_with_shredder_mitigation(
     )
     click.echo(
         click.style(
-            f"Query columns:\n" f"{common_ouput + metrics_ouput + changed_output}",
+            f"Query columns:\n" f"{common_output + metrics_output + changed_output}",
             fg="yellow",
         )
     )
@@ -816,10 +818,22 @@ def generate_query_with_shredder_mitigation(
     # Generate checks to compare versions after each partition backfill.
     checks_select = (
         [new.partitioning["field"]]
+        + [
+            f"IF({dim.name} IS NULL OR {dim.name} = '{DEFAULT_FOR_NULLS}', '{WILDCARD_STRING}',"
+            f" {dim.name}) AS {dim.name}"
+            for dim in common_dimensions
+            if (
+                dim.name != previous.partitioning["field"]
+                and dim.data_type == DataTypeGroup.STRING
+            )
+        ]
         + [
             dim.name
             for dim in common_dimensions
-            if (dim.name != new.partitioning["field"])
+            if (
+                dim.name != new.partitioning["field"]
+                and dim.data_type != DataTypeGroup.STRING
+            )
         ]
         + [f"SUM({metric.name})" f" AS {metric.name}" for metric in metrics]
     )
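
Note on the shredder_mitigation.py change above: STRING dimensions that previously fell back to the wildcard only when NULL (via COALESCE) now also collapse the "??" default value to the wildcard, and the auto-generated checks apply the same normalization. A minimal sketch of the SQL fragment the updated code generates, using a hypothetical dimension name "segment" that is not part of the diff:

# Sketch only: constants copied from shredder_mitigation.py, dimension name invented.
DEFAULT_FOR_NULLS = "??"
WILDCARD_STRING = "???????"

dim_name = "segment"  # hypothetical STRING dimension
expression = (
    f"IF({dim_name} IS NULL OR {dim_name} = '{DEFAULT_FOR_NULLS}', '{WILDCARD_STRING}',"
    f" {dim_name}) AS {dim_name}"
)
print(expression)
# IF(segment IS NULL OR segment = '??', '???????', segment) AS segment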

bigquery_etl/backfill/shredder_mitigation_checks_template.sql

Lines changed: 4 additions & 4 deletions
@@ -27,8 +27,8 @@ SELECT
   CONCAT(
     ((SELECT COUNT(*) FROM previous_not_matching)),
     " rows in the previous data don't match backfilled data! Run auto-generated checks for ",
-    "all mismatches & search for rows missing or with differences in metrics. 5 sample rows: ",
-    (SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM previous_not_matching LIMIT 5)))
+    "all mismatches & search for rows missing or with differences in metrics. Sample row in previous version: ",
+    (SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM previous_not_matching LIMIT 1)))
   )
 ),
 NULL
@@ -61,8 +61,8 @@ SELECT
   CONCAT(
     ((SELECT COUNT(*) FROM backfilled_not_matching)),
     " rows in backfill don't match previous version of data! Run auto-generated checks for ",
-    "all mismatches & search for rows added or with differences in metrics. 5 sample rows: ",
-    (SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM backfilled_not_matching LIMIT 5)))
+    "all mismatches & search for rows added or with differences in metrics. Sample row in new_version: ",
+    (SELECT TO_JSON_STRING(ARRAY(SELECT AS STRUCT * FROM backfilled_not_matching LIMIT 1)))
   )
 ),
 NULL
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
friendly_name: Cinder Decisions Raw
description: |-
  Download of decisions regarding addon moderations
owners:

labels:
  incremental: true
  owner1: example
scheduling:
  dag_name: bqetl_addons
bigquery:
  time_partitioning:
    type: day
    field: 'date'
    require_partition_filter: false
    expiration_days: null
  range_partitioning: null
references: {}
Lines changed: 185 additions & 0 deletions
@@ -0,0 +1,185 @@
"""Cinder API Addon Moderations Decisions - download decisions, clean and upload to BigQuery."""

import csv
import json
import os
import tempfile
from argparse import ArgumentParser
from time import sleep

import requests
from google.cloud import bigquery

CSV_FIELDS = [
    "user",
    "queue_slug",
    "job_id",
    "uuid",
    "applied_policies",
    "entity",
    "entity_slug",
    "entity_id",
    "created_at",
    "decision_type",
    "job_assigned_at",
    "typed_metadata",
]

CINDER_BEARER_TOKEN = os.environ.get("CINDER_TOKEN")


def post_response(url, headers, data):
    """POST response function."""
    response = requests.post(url, headers=headers, data=data)
    if (response.status_code == 401) or (response.status_code == 400):
        print(f"***Error: {response.status_code}***")
        print(response.text)
    return response


def get_response(url, headers):
    """GET response function."""
    response = requests.get(url, headers=headers)
    if (response.status_code == 401) or (response.status_code == 400):
        print(f"***Error: {response.status_code}***")
        print(response.text)
    return response


def read_json(filename: str) -> dict:
    """Read JSON file."""
    with open(filename, "r") as f:
        data = json.loads(f.read())
    return data


def write_dict_to_csv(json_data, filename):
    """Write a dictionary to a csv."""
    with open(filename, "w") as out_file:
        dict_writer = csv.DictWriter(out_file, CSV_FIELDS)
        dict_writer.writeheader()
        dict_writer.writerows(json_data)


def cinder_addon_decisions_download(date, bearer_token):
    """Download data from Cinder - bearer_token are called here."""
    # getting overview metrics for different kpis / Deliverables
    url = "https://stage.cinder.nonprod.webservices.mozgcp.net/api/v1/decisions/"
    headers = {"accept": "application/json", "authorization": f"Bearer {bearer_token}"}
    print(url)
    response = get_response(url, headers)
    return response


def check_json(cinder_addon_decisions_response_text):
    """Script will return an empty dictionary for apps on days when there is no data. Check for that here."""
    with tempfile.NamedTemporaryFile() as tmp_json:
        with open(tmp_json.name, "w") as f_json:
            f_json.write(cinder_addon_decisions_response_text)
            try:
                query_export = read_json(f_json.name)
            except (
                ValueError
            ):  # ex. json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
                return None
    return query_export


def clean_json(query_export, date):
    """Turn the json file into a list to be input into a CSV for bq upload."""
    fields_list = []
    for item in query_export["items"]:
        field_dict = {
            "user": item["user"],
            "queue_slug": item["queue_slug"],
            "job_id": item["job_id"],
            "uuid": item["uuid"],
            "applied_policies": item["applied_policies"],
            "entity": item["entity"],
            "entity_slug": item["entity_slug"],
            "entity_id": item["entity_id"],
            "created_at": item["created_at"],
            "decision_type": item["decision_type"],
            "job_assigned_at": item["job_assigned_at"],
            "typed_metadata": item["typed_metadata"],
        }
        fields_list.append(field_dict)
    return fields_list


def upload_to_bigquery(csv_data, project, dataset, table_name, date):
    """Upload the data to bigquery."""
    date = date
    print("writing json to csv")
    partition = f"{date}".replace("-", "")
    print(partition)
    with tempfile.NamedTemporaryFile() as tmp_csv:
        with open(tmp_csv.name, "w+b") as f_csv:
            write_dict_to_csv(csv_data, f_csv.name)
            client = bigquery.Client(project)
            job_config = bigquery.LoadJobConfig(
                create_disposition="CREATE_IF_NEEDED",
                write_disposition="WRITE_TRUNCATE",
                time_partitioning=bigquery.TimePartitioning(
                    type_=bigquery.TimePartitioningType.DAY,
                    field="date",
                ),
                skip_leading_rows=1,
                schema=[
                    bigquery.SchemaField("date", "DATE"),
                    bigquery.SchemaField("user", "STRING"),
                    bigquery.SchemaField("queue_slug", "STRING"),
                    bigquery.SchemaField("job_id", "STRING"),
                    bigquery.SchemaField("uuid", "STRING"),
                    bigquery.SchemaField("applied_policies", "STRING"),
                    bigquery.SchemaField("entity", "STRING"),
                    bigquery.SchemaField("entity_slug", "STRING"),
                    bigquery.SchemaField("entity_id", "STRING"),
                    bigquery.SchemaField("created_at", "DATE"),
                    bigquery.SchemaField("decision_type", "STRING"),
                    bigquery.SchemaField("job_assigned_at", "STRING"),
                    bigquery.SchemaField("typed_metadata", "STRING"),
                ],
            )
            destination = f"{project}.{dataset}.{table_name}${partition}"
            job = client.load_table_from_file(f_csv, destination, job_config=job_config)
            print(
                f"Writing Decisions data to {destination}. BigQuery job ID: {job.job_id}"
            )
            job.result()


def main():
    """Input data, call functions, get stuff done."""
    parser = ArgumentParser(description=__doc__)
    parser.add_argument("--date", required=True)
    parser.add_argument("--project", default="moz-fx-data-shared-prod")
    parser.add_argument("--dataset", default="addon_moderations_derived")

    args = parser.parse_args()

    project = args.project
    dataset = args.dataset
    table_name = "cinder_decisions_raw_v1"

    date = args.date
    bearer_token = CINDER_BEARER_TOKEN

    data = []

    json_file = cinder_addon_decisions_download(date, bearer_token)
    query_export = check_json(json_file.text)

    if query_export is not None:
        # This section writes the tmp json data into a temp CSV file which will then be put into a BigQuery table
        cinder_addon_decisions_data = clean_json(query_export, date)
        data.extend(cinder_addon_decisions_data)
    else:
        print("no data for today")
        sleep(5)

    upload_to_bigquery(data, project, dataset, table_name, date)


if __name__ == "__main__":
    main()
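
For orientation, a rough, self-contained sketch of the transform the new script performs: the Cinder /decisions/ response carries an "items" list, clean_json() reduces each item to the CSV_FIELDS columns, and write_dict_to_csv() produces the load file for BigQuery. The sample payload below is invented; only the column names are taken from the script.

import csv
import io

# Copied from the script above.
CSV_FIELDS = [
    "user", "queue_slug", "job_id", "uuid", "applied_policies", "entity",
    "entity_slug", "entity_id", "created_at", "decision_type",
    "job_assigned_at", "typed_metadata",
]

# Invented example of one decision item from the API response.
sample_export = {
    "items": [
        {
            "user": "moderator@example.com",
            "queue_slug": "amo-addons",
            "job_id": "123",
            "uuid": "abc-def-123",
            "applied_policies": ["policy-1"],
            "entity": {"slug": "some-addon"},
            "entity_slug": "some-addon",
            "entity_id": "456",
            "created_at": "2024-10-01",
            "decision_type": "approve",
            "job_assigned_at": "2024-10-01",
            "typed_metadata": {},
        }
    ]
}

# Same reduction clean_json() performs: one dict per decision, CSV_FIELDS keys only.
rows = [{field: item[field] for field in CSV_FIELDS} for item in sample_export["items"]]

# Same CSV shape write_dict_to_csv() produces, written to memory here instead of a temp file.
buffer = io.StringIO()
writer = csv.DictWriter(buffer, CSV_FIELDS)
writer.writeheader()
writer.writerows(rows)
print(buffer.getvalue().splitlines()[0])  # header row: user,queue_slug,...,typed_metadata

The job itself is invoked with --date (required) plus optional --project and --dataset, and reads the Cinder bearer token from the CINDER_TOKEN environment variable.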
Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
fields:

- mode: NULLABLE
  name: date
  type: DATE
  description: date when job run and field that the table is partitioned by

- mode: NULLABLE
  name: user
  type: STRING
  description: User who submitted the report

- mode: NULLABLE
  name: queue_slug
  type: STRING
  description: Queue_slug

- mode: NULLABLE
  name: job_id
  type: STRING
  description: Job_id of Decision

- mode: NULLABLE
  name: uuid
  type: STRING
  description: ID of UU

- mode: REPEATED
  name: applied_policies
  type: STRING
  description: Policies applied to moderate addon

- mode: REPEATED
  name: entity
  type: STRING
  description: Information about the entity

- mode: NULLABLE
  name: entity_slug
  type: STRING
  description: Entity Slug

- mode: NULLABLE
  name: entity_id
  type: STRING
  description: Add on ID

- mode: NULLABLE
  name: created_at
  type: STRING
  description: Date decision made

- mode: NULLABLE
  name: decision_type
  type: STRING
  description: type of decision

- mode: NULLABLE
  name: job_assigned_at
  type: STRING
  description: Date addon report was assigned to a moderator

- mode: NULLABLE
  name: typed_metadata
  type: STRING
  description: Contains more data
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
friendly_name: Addon Moderations
description: |-
  Dataset for anything to do with Addon Moderations
dataset_base_acl: derived
user_facing: false
labels: {}
default_table_workgroup_access:
- role: roles/bigquery.dataViewer
  members:
  - workgroup:mozilla-confidential
workgroup_access:
- role: roles/bigquery.dataViewer
  members:
  - workgroup:mozilla-confidential
syndication: {}
