diff --git a/jobs/eam-integrations/scripts/test.py b/jobs/eam-integrations/scripts/test.py
index dfe43181..d3f5a12f 100644
--- a/jobs/eam-integrations/scripts/test.py
+++ b/jobs/eam-integrations/scripts/test.py
@@ -1,30 +1 @@
-import requests
-from requests.auth import HTTPBasicAuth
-base_URL = "https://mozilla-np.xmatters.com/api/xm/1"
-
-person_name = "342d509e-6ae3-4c0a-bd59-b07cdc7c6eb3"
-endpoint_URL = "/people/" + person_name + "/supervisors"
-
-url = base_URL + endpoint_URL
-
-print("Sending request to url: " + url)
-auth = HTTPBasicAuth("serviceuser", "welcome1")
-
-response = requests.get(url, auth=auth)
-
-responseCode = response.status_code
-if responseCode == 200:
-    rjson = response.json()
-    for d in rjson.get("data"):
-        print(
-            'User "'
-            + person_name
-            + '" has supervisor "'
-            + d["targetName"]
-            + '" with first name "'
-            + d["firstName"]
-            + '" and last name "'
-            + d["lastName"]
-            + '"'
-        )
diff --git a/jobs/search-term-data-validation-v2/.dockerignore b/jobs/search-term-data-validation-v2/.dockerignore
deleted file mode 100644
index cff5d6ab..00000000
--- a/jobs/search-term-data-validation-v2/.dockerignore
+++ /dev/null
@@ -1,7 +0,0 @@
-.ci_job.yaml
-.ci_workflow.yaml
-.DS_Store
-*.pyc
-.pytest_cache/
-__pycache__/
-venv/
diff --git a/jobs/search-term-data-validation-v2/.gitignore b/jobs/search-term-data-validation-v2/.gitignore
deleted file mode 100644
index 2e9942c0..00000000
--- a/jobs/search-term-data-validation-v2/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-.DS_Store
-*.pyc
-__pycache__/
-venv/
diff --git a/jobs/search-term-data-validation-v2/Dockerfile b/jobs/search-term-data-validation-v2/Dockerfile
deleted file mode 100644
index 62c0512f..00000000
--- a/jobs/search-term-data-validation-v2/Dockerfile
+++ /dev/null
@@ -1,26 +0,0 @@
-FROM python:3.10
-MAINTAINER REPLACE ME
-
-# https://github.com/mozilla-services/Dockerflow/blob/master/docs/building-container.md
-ARG USER_ID="10001"
-ARG GROUP_ID="app"
-ARG HOME="/app"
-
-ENV HOME=${HOME}
-RUN groupadd --gid ${USER_ID} ${GROUP_ID} && \
-    useradd --create-home --uid ${USER_ID} --gid ${GROUP_ID} --home-dir ${HOME} ${GROUP_ID}
-
-WORKDIR ${HOME}
-
-RUN pip install --upgrade pip
-
-COPY requirements.txt requirements.txt
-RUN pip install -r requirements.txt
-
-COPY . .
-
-RUN pip install .
-
-# Drop root and change ownership of the application folder to the user
-RUN chown -R ${USER_ID}:${GROUP_ID} ${HOME}
-USER ${USER_ID}
diff --git a/jobs/search-term-data-validation-v2/README.md b/jobs/search-term-data-validation-v2/README.md
deleted file mode 100644
index b6bf9c0c..00000000
--- a/jobs/search-term-data-validation-v2/README.md
+++ /dev/null
@@ -1,45 +0,0 @@
-# Search Volume Data Validation
-
-This job contains scripts for evaluating whether our recorded search terms
-(candidate search volume for being sanitized and stored) are changing in ways
-that might invalidate assumptions on which we've built our sanitization model.
-
-**Why 'v2' in the name?**
-The original job broke after a few refactors to the repo's template code.
-Four separate engineers over two months were unable to identify the problem.
-So we recreated the job from scratch, which also let us exercise the template changes.
-Bumping the version on the original one wouldn't have accomplished this.
-
-## Usage
-
-This script is intended to be run in a Docker container.
-Build the Docker image with:
-
-```sh
-docker build -t search-term-data-validation-v2 .
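-# (The image tag is arbitrary; it just needs to match whatever tag you use with `docker run`.)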
-```
-
-To run locally, install dependencies with:
-
-```sh
-pip install -r requirements.txt
-```
-
-Run the scripts with (the two flags take fully qualified table names; placeholders shown here):
-
-```sh
-python search_term_data_validation_v2/main.py --data_validation_origin <origin_table> --data_validation_reporting_destination <destination_table>
-```
-
-The origin table in mozdata (which we treat as staging) is: `mozdata.search_terms_unsanitized_analysis.prototype_data_validation_metrics`
-The origin table in prod is: `moz-fx-data-shared-prod.search_terms.sanitization_job_data_validation_metrics`
-
-The destination table in mozdata (which we treat as staging) is: `mozdata.search_terms_unsanitized_analysis.prototype_data_validation_reports_v1`
-The destination table in prod is: `moz-fx-data-shared-prod.search_terms_derived.search_term_data_validation_reports_v1`
-
-## Development
-
-`search_term_data_validation_v2/main.py` is the main control script.
-`search_term_data_validation_v2/data_validation.py` is the module containing the Python code the script calls.
-`tests` contains unit tests for the functions in `search_term_data_validation_v2/data_validation.py`.
-
diff --git a/jobs/search-term-data-validation-v2/ci_job.yaml b/jobs/search-term-data-validation-v2/ci_job.yaml
deleted file mode 100644
index 229da6bc..00000000
--- a/jobs/search-term-data-validation-v2/ci_job.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-build-job-search-term-data-validation-v2:
-  docker:
-    - image: << pipeline.parameters.git-image >>
-  steps:
-    - checkout
-    - compare-branch:
-        pattern: ^jobs/search-term-data-validation-v2/
-    - setup_remote_docker:
-        version: << pipeline.parameters.docker-version >>
-    - run:
-        name: Build Docker image
-        command: docker build -t app:build jobs/search-term-data-validation-v2/
-    - run:
-        name: Test Code
-        command: docker run app:build pytest
diff --git a/jobs/search-term-data-validation-v2/ci_workflow.yaml b/jobs/search-term-data-validation-v2/ci_workflow.yaml
deleted file mode 100644
index 5d992ec9..00000000
--- a/jobs/search-term-data-validation-v2/ci_workflow.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-job-search-term-data-validation-v2:
-  jobs:
-    - build-job-search-term-data-validation-v2
-    - gcp-gcr/build-and-push-image:
-        context: data-eng-airflow-gcr
-        docker-context: jobs/search-term-data-validation-v2/
-        path: jobs/search-term-data-validation-v2/
-        image: search-term-data-validation-v2_docker_etl
-        requires:
-          - build-job-search-term-data-validation-v2
-        filters:
-          branches:
-            only: main
\ No newline at end of file
diff --git a/jobs/search-term-data-validation-v2/pytest.ini b/jobs/search-term-data-validation-v2/pytest.ini
deleted file mode 100644
index e618d7a5..00000000
--- a/jobs/search-term-data-validation-v2/pytest.ini
+++ /dev/null
@@ -1,3 +0,0 @@
-[pytest]
-testpaths =
-    tests
diff --git a/jobs/search-term-data-validation-v2/requirements.txt b/jobs/search-term-data-validation-v2/requirements.txt
deleted file mode 100644
index 4a80e19d..00000000
--- a/jobs/search-term-data-validation-v2/requirements.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-pandas==1.3.5
-numpy==1.21.0
-google-cloud-bigquery==3.0.1
-spacy>=3.0.0,<4.0.0
-spacy-fastlang==1.0.1
-db-dtypes==1.0.0
-
-black==20.8b1
-pytest==7.1.2
-pytest-asyncio==0.18.3
-pytest-black==0.3.11
diff --git a/jobs/search-term-data-validation-v2/search_term_data_validation_v2/data_validation.py b/jobs/search-term-data-validation-v2/search_term_data_validation_v2/data_validation.py
deleted file mode 100644
index aea66426..00000000
--- a/jobs/search-term-data-validation-v2/search_term_data_validation_v2/data_validation.py
+++ /dev/null
@@ -1,499 +0,0 @@
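-# Computes data-validation metrics for the search-term sanitization job in BigQuery
-# and flags recent values that fall outside their historical range or moving average.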
-from google.cloud import bigquery
-from datetime import date, timedelta, datetime
-from collections import namedtuple
-
-import numpy as np
-import pandas as pd
-import asyncio
-import re
-import json
-import string
-
-project = "mozdata"
-
-
-def calculate_data_validation_metrics(metadata_source, languages_source):
-    """
-    Calculate metrics for determining whether our search volume is changing in ways that might invalidate our current sanitization model.
-
-    Arguments:
-
-    - metadata_source: a string. The name of the table containing the metadata to be fetched.
-    - languages_source: a string. The name of the table containing language distributions for search term jobs.
-
-    Returns: A dataframe of the data validation metrics for the sanitization jobs.
-    """
-    if re.fullmatch(r"[A-Za-z0-9\.\-\_]+", metadata_source):
-        metadata_source_no_injection = metadata_source
-    else:
-        raise Exception(
-            "metadata_source in incorrect format. This should be a fully qualified table name like myproject.mydataset.my_table"
-        )
-
-    if re.fullmatch(r"[A-Za-z0-9\.\-\_]+", languages_source):
-        languages_source_no_injection = languages_source
-    else:
-        raise Exception(
-            "languages_source in incorrect format. This should be a fully qualified table name like myproject.mydataset.my_table"
-        )
-
-    # We are using f-strings here because BQ does not allow table names to be parametrized
-    # and we need to be able to run the same script in the staging and prod db environments for reliable testing outcomes.
-    SUCCESSFUL_SANITIZATION_JOB_RUN_METADATA = f"""
-    SELECT
-        finished_at,
-        SAFE_DIVIDE(total_search_terms_removed_by_sanitization_job, total_search_terms_analyzed) AS pct_sanitized_search_terms,
-        SAFE_DIVIDE(contained_at, total_search_terms_analyzed) AS pct_sanitized_contained_at,
-        SAFE_DIVIDE(contained_numbers, total_search_terms_analyzed) AS pct_sanitized_contained_numbers,
-        SAFE_DIVIDE(contained_name, total_search_terms_analyzed) AS pct_sanitized_contained_name,
-        SAFE_DIVIDE(sum_terms_containing_us_census_surname, total_search_terms_analyzed) AS pct_terms_containing_us_census_surname,
-        SAFE_DIVIDE(sum_uppercase_chars_all_search_terms, sum_chars_all_search_terms) AS pct_uppercase_chars_all_search_terms,
-        SAFE_DIVIDE(sum_words_all_search_terms, total_search_terms_analyzed) AS avg_words_all_search_terms,
-        1 - SAFE_DIVIDE(languages.english_count, languages.all_languages_count) AS pct_terms_non_english
-    FROM `{metadata_source_no_injection}` AS metadata
-    JOIN
-    (
-        SELECT
-            job_start_time,
-            max(case when language_code = 'en' then search_term_count end) english_count,
-            sum(search_term_count) as all_languages_count,
-        FROM `{languages_source_no_injection}`
-        GROUP BY job_start_time
-    ) AS languages
-    ON metadata.started_at = languages.job_start_time
-    WHERE status = 'SUCCESS'
-    ORDER BY finished_at ASC;
-    """
-    client = bigquery.Client(project=project)
-    query_job = client.query(SUCCESSFUL_SANITIZATION_JOB_RUN_METADATA)
-    results_as_dataframe = query_job.result().to_dataframe()
-
-    return results_as_dataframe
-
-
-def export_data_validation_metrics_to_bigquery(dataframe, destination_table_id):
-    """
-    Append data validation metrics to the BigQuery table tracking these metrics from job metadata.
-
-    Arguments:
-    - dataframe: A dataframe of validation metrics to be added.
-    - destination_table_id: the fully qualified name of the table for the data to be exported into.
-
-    Returns: Nothing.
-    It does print a result value as a cursory logging mechanism.
-    That result object can be parsed and logged to wherever we like.
-    """
-    client = bigquery.Client(project=project)
-
-    schema = [
-        bigquery.SchemaField("finished_at", bigquery.enums.SqlTypeNames.STRING),
-        bigquery.SchemaField(
-            "pct_sanitized_search_terms", bigquery.enums.SqlTypeNames.FLOAT64
-        ),
-        bigquery.SchemaField(
-            "pct_sanitized_contained_at", bigquery.enums.SqlTypeNames.FLOAT64
-        ),
-        bigquery.SchemaField(
-            "pct_sanitized_contained_numbers", bigquery.enums.SqlTypeNames.FLOAT64
-        ),
-        bigquery.SchemaField(
-            "pct_sanitized_contained_name", bigquery.enums.SqlTypeNames.FLOAT64
-        ),
-        bigquery.SchemaField(
-            "pct_terms_containing_us_census_surname",
-            bigquery.enums.SqlTypeNames.FLOAT64,
-        ),
-        bigquery.SchemaField(
-            "pct_uppercase_chars_all_search_terms", bigquery.enums.SqlTypeNames.FLOAT64
-        ),
-        bigquery.SchemaField(
-            "avg_words_all_search_terms", bigquery.enums.SqlTypeNames.FLOAT64
-        ),
-        bigquery.SchemaField(
-            "pct_terms_non_english", bigquery.enums.SqlTypeNames.FLOAT64
-        ),
-    ]
-
-    destination_table = bigquery.Table(destination_table_id)
-    job = client.insert_rows_from_dataframe(
-        table=destination_table, dataframe=dataframe, selected_fields=schema
-    )
-
-    print(job)
-
-
-def retrieve_data_validation_metrics(metrics_source):
-    """
-    Pull all the sanitization job data validation metrics.
-
-    Arguments:
-
-    - metrics_source: a string. The name of the table containing the data validation metrics to be fetched.
-
-    Returns: A dataframe of the data validation metrics.
-    """
-    if re.fullmatch(r"[A-Za-z0-9\.\-\_]+", metrics_source):
-        metrics_source_no_injection = metrics_source
-    else:
-        raise Exception(
-            "metrics_source in incorrect format. This should be a fully qualified table name like myproject.mydataset.my_table"
-        )
-
-    # We are using f-strings here because BQ does not allow table names to be parametrized
-    # and we need to be able to run the same script in the staging and prod db environments for reliable testing outcomes.
-    DATA_VALIDATION_METRICS_QUERY = f"""
-    SELECT
-        *
-    FROM `{metrics_source_no_injection}` AS metadata
-    ORDER BY finished_at ASC;
-    """
-    client = bigquery.Client(project=project)
-    query_job = client.query(DATA_VALIDATION_METRICS_QUERY)
-    results_as_dataframe = query_job.result().to_dataframe()
-
-    return results_as_dataframe
-
-
-def range_check(
-    validation_data: pd.DataFrame,
-    metric: str,
-    full_lookback_window: int,
-    test_window: int,
-    range_lower_bound: float,
-    range_upper_bound: float,
-):
-    """
-    Determines if all the values in a test window of days fall inside some percentile of the normal range for a set of comparison values in a comparison window of days.
-
-    Inputs:
-
-    - validation_data: the dataframe with the data in it to be checked.
-        ASSUMES the presence of a 'finished_at' column, whose date is used to calculate lookback and test windows.
-    - metric: the name of the column in the input dataframe on which to perform the check.
-    - full_lookback_window: an integer number of days that the comparison set should cover.
-    - test_window: an integer number of days that the test set should cover.
-        ASSUMES that the test window immediately succeeds the full_lookback_window.
-    - range_lower_bound: a float between 0 and 1 indicating the lower edge of the window of normal values from the comparison set
-        inside which at least one of the values in the test set should fall.
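-        (For example, range_lower_bound=0.125 treats the 12.5th percentile of the comparison values as the lower edge.)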
-    - range_upper_bound: a float between 0 and 1 indicating the upper edge of the window of normal values from the comparison set
-        inside which at least one of the values in the test set should fall.
-
-    Outputs:
-    - finished_at: the finished_at timestamp of the job run to which this check applies.
-    - num_values_compared: an integer representing the total number of range values included in this comparison.
-    - should_trigger: a bool indicating whether the values in the test window are all falling OUTSIDE the expected range.
-    - range_lower: a float. The lower bound of the expected range calculated from comparison values.
-    - range_upper: a float. The upper bound of the expected range calculated from comparison values.
-    - test_range: a list. The entirety of the test values.
-    """
-    print(f"Performing range check for metric: {metric}")
-
-    if not (0 < range_lower_bound < 1 and 0 < range_upper_bound < 1):
-        raise Exception(
-            "range_lower_bound and range_upper_bound should both be between zero (0) and one (1)."
-        )
-
-    if "finished_at" not in validation_data.columns.values:
-        raise Exception("dataframe must include a finished_at column.")
-
-    if metric not in validation_data.columns.values:
-        raise Exception(f'dataframe does not include target metric "{metric}"')
-
-    today = date.today()
-    latest_finished_at = max(validation_data["finished_at"])
-
-    test_earliest_date = today - timedelta(days=test_window)
-
-    comparison_earliest_date = test_earliest_date - timedelta(days=full_lookback_window)
-
-    comparison_values = validation_data["finished_at"].apply(
-        lambda m: comparison_earliest_date < m.date() <= test_earliest_date
-    )
-    test_values = validation_data["finished_at"].apply(
-        lambda m: test_earliest_date < m.date() <= today
-    )
-
-    comparison_range = validation_data.loc[comparison_values]
-    test_range = validation_data.loc[test_values]
-
-    range_lower, range_upper = comparison_range[metric].quantile(
-        q=[range_lower_bound, range_upper_bound]
-    )
-
-    should_trigger = len(test_range[metric]) != 0 and (
-        all(test_range[metric] > range_upper) or all(test_range[metric] < range_lower)
-    )
-
-    print(f"Completed range check for metric: {metric}")
-    return (
-        latest_finished_at,
-        len(comparison_range),
-        should_trigger,
-        range_lower,
-        range_upper,
-        list(test_range[metric]),
-    )
-
-
-def mean_check(
-    validation_data: pd.DataFrame,
-    metric: str,
-    full_lookback_window: int,
-    test_window: int,
-    moving_average_window: int,
-    mean_lower_bound: float,
-    mean_upper_bound: float,
-):
-    """
-    Determines if all the moving averages in a test window of days fall inside some percentile of the moving average for a set of comparison values in a comparison window of days.
-
-    Inputs:
-
-    - validation_data: the dataframe with the data in it to be checked.
-        ASSUMES the presence of a 'finished_at' column, whose date is used to calculate lookback and test windows.
-    - metric: the name of the column in the input dataframe on which to perform the check.
-    - full_lookback_window: an integer number of days that the comparison set should cover.
-    - test_window: an integer number of days that the test set should cover.
-        ASSUMES that the test window immediately succeeds the full_lookback_window.
-    - moving_average_window: an integer. Number of prior days over which to calculate an average for a given day.
-    - mean_lower_bound: a float between 0 and 1 indicating the lower edge of the window of normal values from the comparison set
-        inside which at least one of the values in the test set should fall.
-    - mean_upper_bound: a float between 0 and 1 indicating the upper edge of the window of normal values from the comparison set
-        inside which at least one of the values in the test set should fall.
-
-    Outputs:
-    - finished_at: the finished_at timestamp of the job run to which this check applies.
-    - num_moving_averages_compared: an integer representing the total number of moving average values included in this comparison.
-    - should_trigger: a bool indicating whether the values in the test window are all falling OUTSIDE the expected range.
-    - mean_lower: a float. The lower bound of the expected range of moving averages calculated from comparison values.
-    - mean_upper: a float. The upper bound of the expected range of moving averages calculated from comparison values.
-    - moving_average_window: an integer. The moving average window passed into the function.
-    - test_moving_averages: a list. The entirety of the test values.
-    """
-    print(f"Performing mean check for metric: {metric}")
-
-    if not (0 < mean_lower_bound < 1 and 0 < mean_upper_bound < 1):
-        raise Exception(
-            "mean_lower_bound and mean_upper_bound should both be between zero (0) and one (1)."
-        )
-
-    if "finished_at" not in validation_data.columns.values:
-        raise Exception("dataframe must include a finished_at column.")
-
-    if metric not in validation_data.columns.values:
-        raise Exception(f'dataframe does not include target metric "{metric}"')
-
-    today = date.today()
-    latest_finished_at = max(validation_data["finished_at"])
-
-    test_earliest_date = today - timedelta(days=test_window)
-    comparison_earliest_date = test_earliest_date - timedelta(days=full_lookback_window)
-
-    x_day_moving_average = f"{moving_average_window}_day_{metric}_moving_avg"
-    validation_data[x_day_moving_average] = (
-        validation_data[metric]
-        .rolling(window=moving_average_window, min_periods=0)
-        .mean()
-    )
-
-    comparison_values = validation_data["finished_at"].apply(
-        lambda m: comparison_earliest_date < m.date() <= test_earliest_date
-    )
-    test_values = validation_data["finished_at"].apply(
-        lambda m: test_earliest_date < m.date() <= today
-    )
-
-    comparison_range = validation_data.loc[comparison_values]
-    test_range = validation_data.loc[test_values]
-
-    mean_lower, mean_upper = comparison_range[x_day_moving_average].quantile(
-        q=[mean_lower_bound, mean_upper_bound]
-    )
-
-    test_moving_averages = test_range[x_day_moving_average]
-    should_trigger = len(test_moving_averages) != 0 and (
-        all(test_moving_averages > mean_upper) or all(test_moving_averages < mean_lower)
-    )
-    num_moving_averages_compared = int(
-        comparison_range[x_day_moving_average].notna().sum()
-    )
-
-    print(f"Completed mean check for metric: {metric}")
-    return (
-        latest_finished_at,
-        num_moving_averages_compared,
-        should_trigger,
-        mean_lower,
-        mean_upper,
-        moving_average_window,
-        list(test_moving_averages),
-    )
-
-
-def record_validation_results(val_df, destination_table):
-    print(f"Recording validation results to destination table: {destination_table}")
-
-    InputSet = namedtuple(
-        "InputSet",
-        "name full_lookback_window range_test_window range_lower_bound range_upper_bound mean_test_window mean_lower_bound mean_upper_bound moving_average_window",
-    )
-    client = bigquery.Client(project=project)
-    started_at = datetime.utcnow()
-
-    for metric in [
-        InputSet(
-            name="pct_sanitized_search_terms",
-            full_lookback_window=90,
-            range_test_window=4,
-            range_lower_bound=0.125,
-            range_upper_bound=0.875,
-            mean_test_window=8,
-            mean_lower_bound=0.01,
-            mean_upper_bound=0.99,
-            moving_average_window=7,
-        ),
-        InputSet(
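-            # Same check structure as above; only the windows and percentile bounds vary by metric.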
name="pct_sanitized_contained_at", - full_lookback_window=90, - range_test_window=4, - range_lower_bound=0.125, - range_upper_bound=0.875, - mean_test_window=8, - mean_lower_bound=0.025, - mean_upper_bound=0.975, - moving_average_window=7, - ), - InputSet( - name="pct_sanitized_contained_numbers", - full_lookback_window=90, - range_test_window=3, - range_lower_bound=0.075, - range_upper_bound=0.925, - mean_test_window=8, - mean_lower_bound=0.01, - mean_upper_bound=0.99, - moving_average_window=7, - ), - InputSet( - name="pct_sanitized_contained_name", - full_lookback_window=90, - range_test_window=5, - range_lower_bound=0.025, - range_upper_bound=0.975, - mean_test_window=7, - mean_lower_bound=0.01, - mean_upper_bound=0.99, - moving_average_window=7, - ), - InputSet( - name="pct_terms_containing_us_census_surname", - full_lookback_window=90, - range_test_window=3, - range_lower_bound=0.1, - range_upper_bound=0.9, - mean_test_window=8, - mean_lower_bound=0.01, - mean_upper_bound=0.99, - moving_average_window=9, - ), - InputSet( - name="pct_uppercase_chars_all_search_terms", - full_lookback_window=90, - range_test_window=4, - range_lower_bound=0.075, - range_upper_bound=0.925, - mean_test_window=8, - mean_lower_bound=0.01, - mean_upper_bound=0.99, - moving_average_window=7, - ), - InputSet( - name="avg_words_all_search_terms", - full_lookback_window=90, - range_test_window=4, - range_lower_bound=0.125, - range_upper_bound=0.875, - mean_test_window=8, - mean_lower_bound=0.025, - mean_upper_bound=0.975, - moving_average_window=7, - ), - InputSet( - name="pct_terms_non_english", - full_lookback_window=90, - range_test_window=4, - range_lower_bound=0.125, - range_upper_bound=0.875, - mean_test_window=8, - mean_lower_bound=0.01, - mean_upper_bound=0.99, - moving_average_window=5, - ), - ]: - ( - finished_at, - num_ranges_compared, - range_alarm, - range_low, - range_high, - range_test_vals, - ) = range_check( - val_df, - metric.name, - metric.full_lookback_window, - metric.range_test_window, - metric.range_lower_bound, - metric.range_upper_bound, - ) - ( - finished_at, - num_moving_averages_compared, - mean_alarm, - mean_low, - mean_high, - mean_window, - mean_test_vals, - ) = mean_check( - val_df, - metric.name, - metric.full_lookback_window, - metric.mean_test_window, - metric.moving_average_window, - metric.mean_lower_bound, - metric.mean_upper_bound, - ) - - rows_to_insert = [ - { - "from_sanitization_job_finished_at": finished_at.strftime( - "%Y-%m-%d %H:%M:%S" - ), - "started_at": started_at.strftime("%Y-%m-%d %H:%M:%S"), - "range_alarm": range_alarm, - "range_low": range_low, - "range_high": range_high, - "num_ranges_compared": num_ranges_compared, - "range_test_vals": str(range_test_vals), - "mean_alarm": mean_alarm, - "mean_low": mean_low, - "mean_high": mean_high, - "num_moving_averages_compared": num_moving_averages_compared, - "mean_test_vals": str(mean_test_vals), - "metric": metric.name, - "full_lookback_window_num_days": metric.full_lookback_window, - "range_test_window_num_days": metric.range_test_window, - "mean_test_window_num_days": metric.mean_test_window, - "moving_average_window_num_days": metric.moving_average_window, - "range_percentile_lower_bound": metric.range_lower_bound, - "range_percentile_upper_bound": metric.range_upper_bound, - "mean_percentile_lower_bound": metric.range_lower_bound, - "mean_percentile_upper_bound": metric.range_upper_bound, - }, - ] - errors = client.insert_rows_json(destination_table, rows_to_insert) - if errors: - print(f"Problem recording data 
validation results: {errors}") - else: - print("Data validation results recorded successfully!") diff --git a/jobs/search-term-data-validation-v2/search_term_data_validation_v2/main.py b/jobs/search-term-data-validation-v2/search_term_data_validation_v2/main.py deleted file mode 100644 index 533800b1..00000000 --- a/jobs/search-term-data-validation-v2/search_term_data_validation_v2/main.py +++ /dev/null @@ -1,37 +0,0 @@ -print("Look at me! I'm running a Python command! WOW!") - -import argparse -import pandas as pd - -from datetime import date, timedelta -from collections import namedtuple - -from data_validation import retrieve_data_validation_metrics, record_validation_results - -print("Dependencies successfully imported!") - -parser = argparse.ArgumentParser( - description="Validate Recent Search Input Against Historical Norms", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, -) -parser.add_argument( - "--data_validation_origin", help="Origin table for data validation metrics" -) -parser.add_argument( - "--data_validation_reporting_destination", - help="Table to store data validation metric test results", -) -print("Parser successfully instantiated!") - -args = parser.parse_args() -print("Args successfully parsed!") -print(f"Data Validation Origin: {args.data_validation_origin}") -print(f"Data Validation Reporting Destination: {args.data_validation_reporting_destination}") - -print("Retrieving Data Validation Metrics Now...") -validation_df = retrieve_data_validation_metrics(args.data_validation_origin) -print(f"Input Dataframe Shape: {validation_df.shape}") - -print("Recording validation results...") -record_validation_results(validation_df, args.data_validation_reporting_destination) - diff --git a/jobs/search-term-data-validation-v2/setup.py b/jobs/search-term-data-validation-v2/setup.py deleted file mode 100644 index 40b73e1c..00000000 --- a/jobs/search-term-data-validation-v2/setup.py +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env python - -from setuptools import setup, find_packages - -readme = open("README.md").read() - -setup( - name="search-term-data-validation-v2", - version="0.1.0", - author="Mozilla Corporation", - packages=find_packages(include=["search-term-data-validation-v2"]), - long_description=readme, - include_package_data=True, - license="MPL 2.0", -) diff --git a/jobs/search-term-data-validation-v2/tests/__init__.py b/jobs/search-term-data-validation-v2/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/jobs/search-term-data-validation-v2/tests/test_data_validation.py b/jobs/search-term-data-validation-v2/tests/test_data_validation.py deleted file mode 100644 index b44de753..00000000 --- a/jobs/search-term-data-validation-v2/tests/test_data_validation.py +++ /dev/null @@ -1,342 +0,0 @@ -import pytest -from search_term_data_validation_v2.data_validation import range_check, mean_check -import pandas as pd -import numpy as np - - -def test_range_check__wrong_format_lower_bound(): - example_df = pd.DataFrame({}) - - try: - result = range_check( - validation_data=example_df, - metric="column_name", - full_lookback_window=3, - test_window=1, - range_lower_bound=25, - range_upper_bound=0.75, - ) - except Exception as e: - assert ( - str(e) - == "range_lower_bound and range_upper_bound should both be between zero (0) and one (1)." 
- ) - - -def test_range_check__wrong_format_upper_bound(): - example_df = pd.DataFrame({}) - - try: - result = range_check( - validation_data=example_df, - metric="column_name", - full_lookback_window=3, - test_window=1, - range_lower_bound=0.25, - range_upper_bound=75, - ) - except Exception as e: - assert ( - str(e) - == "range_lower_bound and range_upper_bound should both be between zero (0) and one (1)." - ) - - -def test_range_check__no_finished_at_column(): - example_df = pd.DataFrame({}) - - try: - result = range_check( - validation_data=example_df, - metric="column_name", - full_lookback_window=3, - test_window=1, - range_lower_bound=0.25, - range_upper_bound=0.75, - ) - except Exception as e: - assert str(e) == "dataframe must include a finished_at column." - - -def test_range_check__target_metric_not_present(): - example_df = pd.DataFrame({"finished_at": []}) - - try: - result = range_check( - validation_data=example_df, - metric="column_that_is_not_in_df", - full_lookback_window=3, - test_window=1, - range_lower_bound=0.25, - range_upper_bound=0.75, - ) - except Exception as e: - assert ( - str(e) - == 'dataframe does not include target metric "column_that_is_not_in_df"' - ) - - -def test_range_check__happy_path__test_metric_in_range(): - example_df = pd.DataFrame( - { - "finished_at": [ - np.datetime64("today", "D") - np.timedelta64(12, "D"), - np.datetime64("today", "D") - np.timedelta64(11, "D"), - np.datetime64("today", "D") - np.timedelta64(10, "D"), - np.datetime64("today", "D") - np.timedelta64(9, "D"), - np.datetime64("today", "D") - np.timedelta64(8, "D"), - np.datetime64("today", "D") - np.timedelta64(7, "D"), - np.datetime64("today", "D") - np.timedelta64(6, "D"), - np.datetime64("today", "D") - np.timedelta64(5, "D"), - np.datetime64("today", "D") - np.timedelta64(4, "D"), - np.datetime64("today", "D") - np.timedelta64(3, "D"), - np.datetime64("today", "D") - np.timedelta64(2, "D"), - np.datetime64("today", "D") - np.timedelta64(1, "D"), - np.datetime64("today", "D") - np.timedelta64(0, "D"), - ], - "pct_something_consistent": [10, 9, 9, 8, 7, 6, 5, 6, 7, 8, 6, 9, 7], - } - ) - - result = range_check( - validation_data=example_df, - metric="pct_something_consistent", - full_lookback_window=12, - test_window=1, - range_lower_bound=0.2, - range_upper_bound=0.8, - ) - ( - latest_timestamp, - num_compared, - should_alarm, - lower_bound, - upper_bound, - test_values, - ) = result - - assert num_compared == 12 - assert should_alarm == False - assert lower_bound == 6.0 - assert upper_bound == 9.0 - assert test_values == [7] - - -def test_range_check__happy_path__test_metric_out_of_range(): - example_df = pd.DataFrame( - { - "finished_at": [ - np.datetime64("today", "D") - np.timedelta64(12, "D"), - np.datetime64("today", "D") - np.timedelta64(11, "D"), - np.datetime64("today", "D") - np.timedelta64(10, "D"), - np.datetime64("today", "D") - np.timedelta64(9, "D"), - np.datetime64("today", "D") - np.timedelta64(8, "D"), - np.datetime64("today", "D") - np.timedelta64(7, "D"), - np.datetime64("today", "D") - np.timedelta64(6, "D"), - np.datetime64("today", "D") - np.timedelta64(5, "D"), - np.datetime64("today", "D") - np.timedelta64(4, "D"), - np.datetime64("today", "D") - np.timedelta64(3, "D"), - np.datetime64("today", "D") - np.timedelta64(2, "D"), - np.datetime64("today", "D") - np.timedelta64(1, "D"), - np.datetime64("today", "D") - np.timedelta64(0, "D"), - ], - "pct_something_consistent": [10, 9, 9, 8, 7, 6, 5, 6, 7, 8, 6, 9, 3], - } - ) - - result = range_check( - 
-        validation_data=example_df,
-        metric="pct_something_consistent",
-        full_lookback_window=12,
-        test_window=1,
-        range_lower_bound=0.2,
-        range_upper_bound=0.8,
-    )
-    (
-        latest_timestamp,
-        num_compared,
-        should_alarm,
-        lower_bound,
-        upper_bound,
-        test_values,
-    ) = result
-
-    assert num_compared == 12
-    assert should_alarm == True
-    assert lower_bound == 6.0
-    assert upper_bound == 9.0
-    assert test_values == [3]
-
-
-def test_mean_check__wrong_format_lower_bound():
-    example_df = pd.DataFrame({})
-
-    try:
-        result = mean_check(
-            validation_data=example_df,
-            metric="column_name",
-            full_lookback_window=3,
-            test_window=1,
-            moving_average_window=1,
-            mean_lower_bound=25,
-            mean_upper_bound=0.75,
-        )
-    except Exception as e:
-        assert (
-            str(e)
-            == "mean_lower_bound and mean_upper_bound should both be between zero (0) and one (1)."
-        )
-
-
-def test_mean_check__wrong_format_upper_bound():
-    example_df = pd.DataFrame({})
-
-    try:
-        result = mean_check(
-            validation_data=example_df,
-            metric="column_name",
-            full_lookback_window=3,
-            test_window=1,
-            moving_average_window=1,
-            mean_lower_bound=0.25,
-            mean_upper_bound=75,
-        )
-    except Exception as e:
-        assert (
-            str(e)
-            == "mean_lower_bound and mean_upper_bound should both be between zero (0) and one (1)."
-        )
-
-
-def test_mean_check__no_finished_at_column():
-    example_df = pd.DataFrame({})
-
-    try:
-        result = mean_check(
-            validation_data=example_df,
-            metric="column_name",
-            full_lookback_window=3,
-            test_window=1,
-            moving_average_window=1,
-            mean_lower_bound=0.25,
-            mean_upper_bound=0.75,
-        )
-    except Exception as e:
-        assert str(e) == "dataframe must include a finished_at column."
-
-
-def test_mean_check__target_metric_not_present():
-    example_df = pd.DataFrame({"finished_at": []})
-
-    try:
-        result = mean_check(
-            validation_data=example_df,
-            metric="column_that_is_not_in_df",
-            full_lookback_window=3,
-            test_window=1,
-            moving_average_window=1,
-            mean_lower_bound=0.25,
-            mean_upper_bound=0.75,
-        )
-    except Exception as e:
-        assert (
-            str(e)
-            == 'dataframe does not include target metric "column_that_is_not_in_df"'
-        )
-
-
-def test_mean_check__happy_path__test_metric_in_moving_average_range():
-    example_df = pd.DataFrame(
-        {
-            "finished_at": [
-                np.datetime64("today", "D") - np.timedelta64(12, "D"),
-                np.datetime64("today", "D") - np.timedelta64(11, "D"),
-                np.datetime64("today", "D") - np.timedelta64(10, "D"),
-                np.datetime64("today", "D") - np.timedelta64(9, "D"),
-                np.datetime64("today", "D") - np.timedelta64(8, "D"),
-                np.datetime64("today", "D") - np.timedelta64(7, "D"),
-                np.datetime64("today", "D") - np.timedelta64(6, "D"),
-                np.datetime64("today", "D") - np.timedelta64(5, "D"),
-                np.datetime64("today", "D") - np.timedelta64(4, "D"),
-                np.datetime64("today", "D") - np.timedelta64(3, "D"),
-                np.datetime64("today", "D") - np.timedelta64(2, "D"),
-                np.datetime64("today", "D") - np.timedelta64(1, "D"),
-                np.datetime64("today", "D") - np.timedelta64(0, "D"),
-            ],
-            "pct_something_consistent": [10, 9, 9, 8, 7, 6, 5, 6, 7, 8, 6, 9, 7],
-        }
-    )
-
-    result = mean_check(
-        validation_data=example_df,
-        metric="pct_something_consistent",
-        full_lookback_window=12,
-        test_window=1,
-        moving_average_window=3,
-        mean_lower_bound=0.2,
-        mean_upper_bound=0.8,
-    )
-    (
-        latest_timestamp,
-        num_compared,
-        should_alarm,
-        lower_bound,
-        upper_bound,
-        moving_average_window,
-        test_values,
-    ) = result
-
-    assert num_compared == 12
-    assert should_alarm == False
-    assert lower_bound == 6.2
-    assert upper_bound == 9.200000000000001
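-    # 7.333... is the 3-day moving average of the final values (6, 9, 7).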
-    assert test_values == [7.333333333333333]
-
-
-def test_mean_check__happy_path__test_metric_out_of_moving_average_range():
-    example_df = pd.DataFrame(
-        {
-            "finished_at": [
-                np.datetime64("today", "D") - np.timedelta64(12, "D"),
-                np.datetime64("today", "D") - np.timedelta64(11, "D"),
-                np.datetime64("today", "D") - np.timedelta64(10, "D"),
-                np.datetime64("today", "D") - np.timedelta64(9, "D"),
-                np.datetime64("today", "D") - np.timedelta64(8, "D"),
-                np.datetime64("today", "D") - np.timedelta64(7, "D"),
-                np.datetime64("today", "D") - np.timedelta64(6, "D"),
-                np.datetime64("today", "D") - np.timedelta64(5, "D"),
-                np.datetime64("today", "D") - np.timedelta64(4, "D"),
-                np.datetime64("today", "D") - np.timedelta64(3, "D"),
-                np.datetime64("today", "D") - np.timedelta64(2, "D"),
-                np.datetime64("today", "D") - np.timedelta64(1, "D"),
-                np.datetime64("today", "D") - np.timedelta64(0, "D"),
-            ],
-            "pct_something_consistent": [10, 9, 9, 8, 7, 6, 5, 6, 7, 8, 6, 9, 3],
-        }
-    )
-
-    result = mean_check(
-        validation_data=example_df,
-        metric="pct_something_consistent",
-        full_lookback_window=12,
-        test_window=1,
-        moving_average_window=3,
-        mean_lower_bound=0.2,
-        mean_upper_bound=0.8,
-    )
-    (
-        latest_timestamp,
-        num_compared,
-        should_alarm,
-        lower_bound,
-        upper_bound,
-        moving_average_window,
-        test_values,
-    ) = result
-
-    assert num_compared == 12
-    assert should_alarm == True
-    assert lower_bound == 6.2
-    assert upper_bound == 9.200000000000001
-    assert test_values == [6.0]