From 3d55bff2d370d3f1070d165f81c7c4149b0e625d Mon Sep 17 00:00:00 2001
From: mirceachira
Date: Mon, 16 Sep 2019 16:03:40 +0300
Subject: [PATCH 1/2] Added search command: find job IDs by part of a URL

---
 shub/search.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++
 shub/tool.py   |  1 +
 2 files changed, 59 insertions(+)
 create mode 100755 shub/search.py

diff --git a/shub/search.py b/shub/search.py
new file mode 100755
index 00000000..09d3814c
--- /dev/null
+++ b/shub/search.py
@@ -0,0 +1,58 @@
+from datetime import datetime
+
+import click
+from dateparser import parse
+from scrapinghub import ScrapinghubClient
+
+
+HELP = """
+Given a project key and part of a URL, fetch job IDs from Scrapy Cloud.
+
+This is useful when you want to find the job that crawled a given URL
+without scanning every job by hand.
+
+Pass the project key and a URL (or any part of one). The matching is
+case-sensitive!
+
+    shub search 123456 "B07F3NG1234"
+
+You can narrow down the search significantly by also passing the spider
+name, the date interval to search within, or both. By default only the
+last 6 months are searched.
+
+    shub search 123456 "B07F3NG1234" --spider="amazon"
+
+    shub search 123456 "B07F3NG1234" --start_date="last week" --end_date="2 days ago"
+"""
+
+SHORT_HELP = "Fetch job IDs from Scrapy Cloud based on URLs"
+
+
+@click.command(help=HELP, short_help=SHORT_HELP)
+@click.argument('project_key')
+@click.argument('url_content')
+@click.option(
+    '--start_date',
+    default='6 months ago',
+    help='date to start searching from, defaults to 6 months ago'
+)
+@click.option('--end_date', default='now', help='date to end the search at')
+@click.option('-s', '--spider', help='only search jobs of this spider')
+def cli(project_key, url_content, start_date, end_date, spider):
+    def date_string_to_millis(date):
+        # Scrapy Cloud expects UNIX epoch timestamps in milliseconds.
+        return int((parse(date) - datetime(1970, 1, 1)).total_seconds() * 1000)
+
+    start_time = date_string_to_millis(start_date)
+    end_time = date_string_to_millis(end_date)
+
+    project = ScrapinghubClient().get_project(project_key)
+
+    jobs = project.jobs.iter(startts=start_time, endts=end_time, spider=spider)
+    for job_dict in jobs:
+        job = project.jobs.get(job_dict['key'])
+        # Echo each matching job key once: stop at the first matching request.
+        for req in job.requests.iter(filter=[('url', 'contains', [url_content])]):
+            click.echo(job_dict['key'])
+            break
+
diff --git a/shub/tool.py b/shub/tool.py
index 14b2f80e..28db7c20 100644
--- a/shub/tool.py
+++ b/shub/tool.py
@@ -51,6 +51,7 @@ def cli():
     "migrate_eggs",
     "image",
     "cancel",
+    "search",
 ]
 
 for command in commands:

From bcc772a14b5f9c2f6ca86e73bbe5f9817d3e5f66 Mon Sep 17 00:00:00 2001
From: mirceachira
Date: Mon, 16 Sep 2019 16:04:02 +0300
Subject: [PATCH 2/2] Added dateparser dependency

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 922d9f03..91cf666a 100644
--- a/setup.py
+++ b/setup.py
@@ -39,6 +39,7 @@
         'six>=1.7.0',
         'tqdm',
         'toml',
+        'dateparser',
     ],
     classifiers=[
         'Development Status :: 5 - Production/Stable',
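
A note on the timestamp helper: the `startts`/`endts` filters take UNIX epoch
timestamps in milliseconds, which is why `date_string_to_millis` scales
`total_seconds()` by 1000. A minimal standalone sketch of the same conversion,
using only the imports the patch already depends on:

    from datetime import datetime

    from dateparser import parse

    def date_string_to_millis(date):
        # dateparser turns human-readable strings ('6 months ago', 'now')
        # into datetimes; subtracting the epoch gives a naive UTC delta,
        # scaled here to milliseconds.
        return int((parse(date) - datetime(1970, 1, 1)).total_seconds() * 1000)

    assert date_string_to_millis('6 months ago') < date_string_to_millis('now')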
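
The same lookup can also be done from a script with python-scrapinghub
directly; the command boils down to this loop. A sketch reusing the
placeholder project key, spider, and URL fragment from the help text, and
assuming an API key is already configured (via `shub login` or `SH_APIKEY`);
the date window is omitted here to keep it short:

    from scrapinghub import ScrapinghubClient

    project = ScrapinghubClient().get_project('123456')

    for job_dict in project.jobs.iter(spider='amazon'):
        job = project.jobs.get(job_dict['key'])
        # 'contains' matches any request whose URL includes the fragment.
        matches = job.requests.iter(filter=[('url', 'contains', ['B07F3NG1234'])])
        if any(True for _ in matches):
            print(job_dict['key'])

Using `any()` over the iterator stops at the first matching request, the same
early-exit the command gets from its `break`.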
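
On the shub/tool.py change: the `commands` list feeds the `for command in
commands:` loop visible in the hunk's trailing context, which is why
shub/search.py only needs to expose a click command named `cli`. A rough
sketch of that registration pattern (an assumption about tool.py's internals,
not code from this patch; the real list and loop body may differ):

    import importlib

    import click

    @click.group()
    def cli():
        pass

    for command in ['search']:  # shub's real list names every subcommand
        # Import shub.<command> and register its `cli` entry point on the
        # top-level group under the same name.
        module = importlib.import_module('shub.' + command)
        cli.add_command(module.cli, command)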