diff --git a/README.md b/README.md index 2bb9ea6..302677b 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ It will download the last version of every file present on Wayback Machine to `. -p, --maximum-snapshot NUMBER Maximum snapshot pages to consider (Default is 100) Count an average of 150,000 snapshots per page -l, --list Only list file urls in a JSON format with the archived timestamps, won't download anything + -u, --user-agent STRING UserAgent for connection (Default is Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0) ## Specify directory to save files to @@ -175,6 +176,16 @@ Example: wayback_machine_downloader http://example.com --concurrency 20 +## Specify UserAgent for connection + + -u, --user-agent STRING + +UserAgent for connection (Default is Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0) + +Example: + + wayback_machine_downloader http://example.com --user-agent "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36" + ## Using the Docker image As an alternative installation way, we have a Docker image! 
Retrieve the wayback-machine-downloader Docker image this way: diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader index 8b9f2fd..e1e53a4 100755 --- a/bin/wayback_machine_downloader +++ b/bin/wayback_machine_downloader @@ -58,6 +58,10 @@ option_parser = OptionParser.new do |opts| options[:list] = true end + opts.on("-u", "--user-agent STRING", String, "UserAgent for connection (Default is Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0)") do |t| + options[:user_agent] = t + end + opts.on("-v", "--version", "Display version") do |t| options[:version] = t end diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 04005c8..226a90f 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -18,7 +18,7 @@ class WaybackMachineDownloader attr_accessor :base_url, :exact_url, :directory, :all_timestamps, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, - :all, :maximum_pages, :threads_count + :all, :maximum_pages, :threads_count, :user_agent def initialize params @base_url = params[:base_url] @@ -32,6 +32,7 @@ def initialize params @all = params[:all] @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100 @threads_count = params[:threads_count].to_i + @user_agent = params[:user_agent] ? 
params[:user_agent] : "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0" end def backup_name @@ -268,7 +269,7 @@ def download_file file_remote_info structure_dir_path dir_path open(file_path, "wb") do |file| begin - URI.open("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri| + URI.open("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain", "User-Agent" => @user_agent) do |uri| file.write(uri.read) end rescue OpenURI::HTTPError => e diff --git a/lib/wayback_machine_downloader/archive_api.rb b/lib/wayback_machine_downloader/archive_api.rb index 903f42b..e641d44 100644 --- a/lib/wayback_machine_downloader/archive_api.rb +++ b/lib/wayback_machine_downloader/archive_api.rb @@ -5,7 +5,7 @@ def get_raw_list_from_api url, page_index request_url += url request_url += parameters_for_api page_index - URI.open(request_url).read + URI.open(request_url, "User-Agent" => @user_agent).read end def parameters_for_api page_index