From 07243ab48e903fbe3d3d8678627c1d2f68bdfd8f Mon Sep 17 00:00:00 2001 From: Zach Thompson Date: Tue, 19 Nov 2019 12:35:31 -0700 Subject: [PATCH 1/3] Store timestamp in aggregate table instead of crawl id --- https_crawl.pl | 15 ++++++++------- sql/https_crawl_aggregate.sql | 4 ++-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/https_crawl.pl b/https_crawl.pl index f134662..42aade1 100755 --- a/https_crawl.pl +++ b/https_crawl.pl @@ -333,7 +333,6 @@ sub crawl_sites{ mixed_requests max_ss_diff redirects - max_id requests is_redirect redirect_hosts' @@ -412,18 +411,18 @@ sub prep_db { domain, https, http_and_https, - https_errs, http, + https_errs, + http, unknown, autoupgrade, mixed_requests, max_screenshot_diff, redirects, - max_https_crawl_id, requests, is_redirect, redirect_hosts, session_request_limit) - values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,$CC{URLS_PER_SITE}) + values (?,?,?,?,?,?,?,?,?,?,?,?,?,$CC{URLS_PER_SITE}) on conflict (domain) do update set ( https, http_and_https, @@ -434,11 +433,11 @@ sub prep_db { mixed_requests, max_screenshot_diff, redirects, - max_https_crawl_id, requests, is_redirect, redirect_hosts, - session_request_limit + session_request_limit, + updated ) = ( EXCLUDED.https, EXCLUDED.http_and_https, @@ -453,7 +452,9 @@ sub prep_db { EXCLUDED.requests, EXCLUDED.is_redirect, EXCLUDED.redirect_hosts, - EXCLUDED.session_request_limit) + EXCLUDED.session_request_limit, + now() + ) where EXCLUDED.is_redirect = false or https_crawl_aggregate.is_redirect = true diff --git a/sql/https_crawl_aggregate.sql b/sql/https_crawl_aggregate.sql index 354edf3..5157a42 100644 --- a/sql/https_crawl_aggregate.sql +++ b/sql/https_crawl_aggregate.sql @@ -37,8 +37,8 @@ CREATE TABLE https_crawl_aggregate ( requests integer NOT NULL, session_request_limit integer NOT NULL, is_redirect boolean DEFAULT false NOT NULL, - max_https_crawl_id bigint NOT NULL, - redirect_hosts jsonb + redirect_hosts jsonb, + updated timestamp with time zone DEFAULT now() NOT NULL ); From 37a371f9294db35a82d19d75bbdfc3391b660511 Mon Sep 17 00:00:00 2001 From: Zach Thompson Date: Tue, 19 Nov 2019 13:03:19 -0700 Subject: [PATCH 2/3] Remove crawl id from conflict values --- https_crawl.pl | 1 - 1 file changed, 1 deletion(-) diff --git a/https_crawl.pl b/https_crawl.pl index 42aade1..4d43638 100755 --- a/https_crawl.pl +++ b/https_crawl.pl @@ -448,7 +448,6 @@ sub prep_db { EXCLUDED.mixed_requests, EXCLUDED.max_screenshot_diff, EXCLUDED.redirects, - EXCLUDED.max_https_crawl_id, EXCLUDED.requests, EXCLUDED.is_redirect, EXCLUDED.redirect_hosts, From f2f04cfecbb74cabab3a5c5d7d5b43b860ae139b Mon Sep 17 00:00:00 2001 From: Zach Thompson Date: Wed, 20 Nov 2019 16:42:50 -0700 Subject: [PATCH 3/3] Update aggregate description with updated column --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5253163..724560c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -210,8 +210,8 @@ Aggregate of [https_crawl](#https_crawl) that creates latest crawl sessions base |requests|Number of comparison requests actually made during the crawl session|integer|| |session_request_limit|The number of comparisons wanted for the session|integer|| |is_redirect|Whether the domain was actually crawled or is a redirect from another host in the table that was crawled|boolean|| -|max_https_crawl_id|https_crawl.id of last comparison made during crawl session|bigint|| |redirect_hosts|key/value pairs of hosts and the number of redirects to it|jsonb|| +|updated|When last updated|timestamp with time zone|| #### https_upgrade_metrics