Commit b79a7b9
initial add of scripts for addon_moderations (#6484)
* initial add of scripts for addon_moderations
* move files over to addon_moderations_derived, update metadata.yaml to partition by date
* update metadata.yaml to type:day and field:date
* change DAG from bqetl_cinder to bqetl_addons
* remove query file, update query.py file
* take out explicit bearer token
* take out clustering
1 parent 47559f8 commit b79a7b9
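
For reference, the "type:day and field:date" change in the notes above targets the time_partitioning stanza of the table's metadata.yaml. A sketch of that stanza as it presumably stands after this commit, following standard bigquery-etl metadata.yaml conventions (only the keys visible in the hunk below are confirmed by the diff):

bigquery:
  time_partitioning:
    type: day
    field: date
    require_partition_filter: false
    expiration_days: null
  range_partitioning: null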

4 files changed: +266 additions, −2 deletions

sql/moz-fx-data-shared-prod/addon_moderations_derived/cinder_decisions_raw_v1/metadata.yaml

Lines changed: 0 additions & 2 deletions

@@ -15,6 +15,4 @@ bigquery:
     require_partition_filter: false
     expiration_days: null
   range_partitioning: null
-  clustering:
-    fields: []
 references: {}

Lines changed: 185 additions & 0 deletions

"""Cinder API Addon Moderations Decisions - download decisions, clean and upload to BigQuery."""

import csv
import json
import os
import tempfile
from argparse import ArgumentParser
from time import sleep

import requests
from google.cloud import bigquery

# Column order for the CSV handed to the BigQuery load job. "date" leads so
# rows line up positionally with the 13-column load schema in upload_to_bigquery.
CSV_FIELDS = [
    "date",
    "user",
    "queue_slug",
    "job_id",
    "uuid",
    "applied_policies",
    "entity",
    "entity_slug",
    "entity_id",
    "created_at",
    "decision_type",
    "job_assigned_at",
    "typed_metadata",
]

CINDER_BEARER_TOKEN = os.environ.get("CINDER_TOKEN")


def post_response(url, headers, data):
    """POST helper; log 400/401 errors and return the response."""
    response = requests.post(url, headers=headers, data=data)
    if response.status_code in (400, 401):
        print(f"***Error: {response.status_code}***")
        print(response.text)
    return response


def get_response(url, headers):
    """GET helper; log 400/401 errors and return the response."""
    response = requests.get(url, headers=headers)
    if response.status_code in (400, 401):
        print(f"***Error: {response.status_code}***")
        print(response.text)
    return response
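
For comparison, requests also ships a built-in check that raises on any 4xx/5xx status instead of only printing 400/401 responses. A hypothetical fail-fast variant (not what this script does):

import requests

def get_response_strict(url, headers):
    """Hypothetical stricter variant of get_response."""
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx
    return response
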
def read_json(filename: str) -> dict:
    """Read a JSON file into a dict."""
    with open(filename, "r") as f:
        data = json.loads(f.read())
    return data


def write_dict_to_csv(json_data, filename):
    """Write a list of row dicts to a CSV file, preceded by a header row."""
    with open(filename, "w") as out_file:
        dict_writer = csv.DictWriter(out_file, CSV_FIELDS)
        dict_writer.writeheader()
        dict_writer.writerows(json_data)
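
A quick illustration of the CSV shape this produces (abbreviated, hypothetical field list and row; the header line from writeheader() is why the load job below sets skip_leading_rows=1):

import csv
import io

buf = io.StringIO()
writer = csv.DictWriter(buf, ["date", "user", "queue_slug"])
writer.writeheader()
writer.writerows([{"date": "2024-01-01", "user": "someone", "queue_slug": "example-queue"}])
print(buf.getvalue())
# date,user,queue_slug
# 2024-01-01,someone,example-queue
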
def cinder_addon_decisions_download(date, bearer_token):
    """Download decision data from the Cinder API, authenticating with the bearer token."""
    # Stage (nonprod) decisions endpoint. Note that `date` is unused here:
    # the endpoint is queried without a date filter.
    url = "https://stage.cinder.nonprod.webservices.mozgcp.net/api/v1/decisions/"
    headers = {"accept": "application/json", "authorization": f"Bearer {bearer_token}"}
    print(url)
    response = get_response(url, headers)
    return response


def check_json(cinder_addon_decisions_response_text):
    """Return the parsed API response, or None when the body is not valid JSON (e.g. days with no data)."""
    with tempfile.NamedTemporaryFile() as tmp_json:
        # Close the write handle before parsing so the content is flushed to disk.
        with open(tmp_json.name, "w") as f_json:
            f_json.write(cinder_addon_decisions_response_text)
        try:
            query_export = read_json(tmp_json.name)
        except ValueError:
            # e.g. json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
            return None
    return query_export
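
Catching ValueError above is sufficient because json.JSONDecodeError is a subclass of ValueError, so an empty or non-JSON response body lands in that handler. A quick standalone illustration:

import json

try:
    json.loads("")  # an empty response body, as on days with no decisions
except ValueError as err:
    print(type(err).__name__)  # -> JSONDecodeError, a ValueError subclass
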
def clean_json(query_export, date):
    """Flatten the JSON export into a list of row dicts for the CSV / BigQuery upload."""
    fields_list = []
    for item in query_export["items"]:
        field_dict = {
            # The run date doubles as the partition column in BigQuery.
            "date": date,
            "user": item["user"],
            "queue_slug": item["queue_slug"],
            "job_id": item["job_id"],
            "uuid": item["uuid"],
            "applied_policies": item["applied_policies"],
            "entity": item["entity"],
            "entity_slug": item["entity_slug"],
            "entity_id": item["entity_id"],
            "created_at": item["created_at"],
            "decision_type": item["decision_type"],
            "job_assigned_at": item["job_assigned_at"],
            "typed_metadata": item["typed_metadata"],
        }
        fields_list.append(field_dict)
    return fields_list


def upload_to_bigquery(csv_data, project, dataset, table_name, date):
    """Load the cleaned rows into a single date partition of the BigQuery table."""
    print("writing json to csv")
    # Partition decorator format: "2024-01-01" -> "20240101".
    partition = f"{date}".replace("-", "")
    print(partition)
    with tempfile.NamedTemporaryFile() as tmp_csv:
        write_dict_to_csv(csv_data, tmp_csv.name)
        client = bigquery.Client(project)
        job_config = bigquery.LoadJobConfig(
            create_disposition="CREATE_IF_NEEDED",
            write_disposition="WRITE_TRUNCATE",
            time_partitioning=bigquery.TimePartitioning(
                type_=bigquery.TimePartitioningType.DAY,
                field="date",
            ),
            skip_leading_rows=1,  # skip the CSV header row
            schema=[
                bigquery.SchemaField("date", "DATE"),
                bigquery.SchemaField("user", "STRING"),
                bigquery.SchemaField("queue_slug", "STRING"),
                bigquery.SchemaField("job_id", "STRING"),
                bigquery.SchemaField("uuid", "STRING"),
                bigquery.SchemaField("applied_policies", "STRING"),
                bigquery.SchemaField("entity", "STRING"),
                bigquery.SchemaField("entity_slug", "STRING"),
                bigquery.SchemaField("entity_id", "STRING"),
                bigquery.SchemaField("created_at", "DATE"),
                bigquery.SchemaField("decision_type", "STRING"),
                bigquery.SchemaField("job_assigned_at", "STRING"),
                bigquery.SchemaField("typed_metadata", "STRING"),
            ],
        )
        # "$YYYYMMDD" targets one day's partition; WRITE_TRUNCATE replaces only that partition.
        destination = f"{project}.{dataset}.{table_name}${partition}"
        with open(tmp_csv.name, "rb") as f_csv:
            job = client.load_table_from_file(f_csv, destination, job_config=job_config)
        print(f"Writing Decisions data to {destination}. BigQuery job ID: {job.job_id}")
        job.result()
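
A worked example of how the partition decorator is assembled, using the argparse defaults from main() below and a hypothetical run date:

project, dataset, table_name = "moz-fx-data-shared-prod", "addon_moderations_derived", "cinder_decisions_raw_v1"
date = "2024-01-01"
partition = date.replace("-", "")  # -> "20240101"
print(f"{project}.{dataset}.{table_name}${partition}")
# moz-fx-data-shared-prod.addon_moderations_derived.cinder_decisions_raw_v1$20240101
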
def main():
    """Parse arguments, download and clean the day's decisions, and upload them to BigQuery."""
    parser = ArgumentParser(description=__doc__)
    parser.add_argument("--date", required=True)
    parser.add_argument("--project", default="moz-fx-data-shared-prod")
    parser.add_argument("--dataset", default="addon_moderations_derived")

    args = parser.parse_args()

    project = args.project
    dataset = args.dataset
    table_name = "cinder_decisions_raw_v1"

    date = args.date
    bearer_token = CINDER_BEARER_TOKEN

    data = []

    json_file = cinder_addon_decisions_download(date, bearer_token)
    query_export = check_json(json_file.text)

    if query_export is not None:
        # Flatten the JSON payload into rows; these are written to a temp CSV
        # and then loaded into a BigQuery partition in upload_to_bigquery.
        cinder_addon_decisions_data = clean_json(query_export, date)
        data.extend(cinder_addon_decisions_data)
    else:
        print("no data for today")
        sleep(5)

    upload_to_bigquery(data, project, dataset, table_name, date)


if __name__ == "__main__":
    main()
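
A minimal invocation sketch (assuming the script is saved as query.py, per the commit message; the token value and date are placeholders):

export CINDER_TOKEN="<token>"   # read by the script via os.environ
python3 query.py --date 2024-01-01 \
    --project moz-fx-data-shared-prod \
    --dataset addon_moderations_derived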

Lines changed: 66 additions & 0 deletions

fields:

- mode: NULLABLE
  name: date
  type: DATE
  description: Date the job ran; the table is partitioned by this field.

- mode: NULLABLE
  name: user
  type: STRING
  description: User who submitted the report.

- mode: NULLABLE
  name: queue_slug
  type: STRING
  description: Slug of the moderation queue the decision came from.

- mode: NULLABLE
  name: job_id
  type: STRING
  description: Job ID of the decision.

- mode: NULLABLE
  name: uuid
  type: STRING
  description: UUID of the decision.

- mode: REPEATED
  name: applied_policies
  type: STRING
  description: Policies applied to moderate the add-on.

- mode: REPEATED
  name: entity
  type: STRING
  description: Information about the entity.

- mode: NULLABLE
  name: entity_slug
  type: STRING
  description: Entity slug.

- mode: NULLABLE
  name: entity_id
  type: STRING
  description: Add-on ID.

- mode: NULLABLE
  name: created_at
  type: STRING
  description: Date the decision was made.

- mode: NULLABLE
  name: decision_type
  type: STRING
  description: Type of decision.

- mode: NULLABLE
  name: job_assigned_at
  type: STRING
  description: Date the add-on report was assigned to a moderator.

- mode: NULLABLE
  name: typed_metadata
  type: STRING
  description: Additional typed metadata attached to the decision.

Lines changed: 15 additions & 0 deletions

friendly_name: Addon Moderations
description: |-
  Dataset for anything to do with Addon Moderations
dataset_base_acl: derived
user_facing: false
labels: {}
default_table_workgroup_access:
- role: roles/bigquery.dataViewer
  members:
  - workgroup:mozilla-confidential
workgroup_access:
- role: roles/bigquery.dataViewer
  members:
  - workgroup:mozilla-confidential
syndication: {}
