diff --git a/pdf_converter/app/url_validator.rb b/pdf_converter/app/url_validator.rb index 23315f0..b650186 100644 --- a/pdf_converter/app/url_validator.rb +++ b/pdf_converter/app/url_validator.rb @@ -9,10 +9,6 @@ class UrlValidator REQUIRED_S3_SIGNATURE_PARAMS = ['X-Amz-Algorithm'].freeze PDF_EXTENSIONS = ['.pdf'].freeze - def initialize - @logger = Logger.new($stdout) if defined?(Logger) - end - # Validates if a URL is a properly signed S3 URL for destination uploads # @param url [String] The URL to validate # @return [Boolean] true if URL is a valid signed S3 URL for uploads @@ -31,14 +27,12 @@ def valid_s3_signed_url?(url) # @param url [String] The URL to validate # @return [Boolean] true if URL is valid HTTP/HTTPS def valid_url?(url) - return false if url.nil? || url.empty? - - begin - uri = URI.parse(url) - uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS) - rescue URI::InvalidURIError - false - end + return false unless valid_basic_url?(url) + + uri = URI.parse(url) + uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS) + rescue URI::InvalidURIError + false end # Extracts S3 bucket, key, and region information from S3 URL @@ -57,34 +51,38 @@ def extract_s3_info(url) # @param require_pdf [Boolean] Whether to check for PDF extension # @return [Boolean] true if URL passes all validation checks def validate_s3_url(url, require_pdf:) - return false if url.nil? || url.empty? + return false unless valid_basic_url?(url) uri = URI.parse(url) - - # Validate scheme (HTTP only allowed for LocalStack) - return false unless valid_scheme?(uri) - - # Must be S3 hostname or LocalStack - return false unless valid_s3_host?(uri.host) - - # Check PDF extension if required - return false if require_pdf && !pdf_file?(uri.path) - - # Must have required signature parameters - s3_signature_params?(uri.query) + valid_scheme?(uri) && + valid_s3_host?(uri.host) && + valid_pdf_requirement?(uri.path, require_pdf) && + s3_signature_params?(uri.query) rescue URI::InvalidURIError false end + # Validates that URL is not nil or empty + # @param url [String] The URL to check + # @return [Boolean] true if URL has content + def valid_basic_url?(url) + !url.nil? && !url.empty? + end + + # Validates PDF requirement is met + # @param path [String] The URL path + # @param require_pdf [Boolean] Whether PDF is required + # @return [Boolean] true if PDF requirement is satisfied + def valid_pdf_requirement?(path, require_pdf) + !require_pdf || pdf_file?(path) + end + # Validates URL scheme (HTTPS required, except HTTP for LocalStack) # @param uri [URI] Parsed URI object # @return [Boolean] true if scheme is valid def valid_scheme?(uri) - if uri.scheme == 'http' - S3UrlParser.localstack_hostname?(uri.host) - else - uri.scheme == 'https' - end + scheme = uri.scheme + scheme == 'https' || (scheme == 'http' && S3UrlParser.localstack_hostname?(uri.host)) end # Validates that hostname is either S3 or LocalStack @@ -95,7 +93,7 @@ def valid_s3_host?(hostname) end def pdf_file?(path) - return false if path.nil? || path.empty? + return false unless path && !path.empty? # Extract filename from path filename = File.basename(path) @@ -106,13 +104,9 @@ def pdf_file?(path) # @param query_string [String] The URL query string # @return [Boolean] true if all required signature params present def s3_signature_params?(query_string) - return false if query_string.nil? || query_string.empty? + return false unless query_string && !query_string.empty? query_params = URI.decode_www_form(query_string).to_h REQUIRED_S3_SIGNATURE_PARAMS.all? { |param| query_params.key?(param) } end - - def log_debug(message) - @logger&.debug(message) if defined?(@logger) - end end diff --git a/pdf_converter/lib/s3_url_parser.rb b/pdf_converter/lib/s3_url_parser.rb index c1a5cfa..4ec3a29 100644 --- a/pdf_converter/lib/s3_url_parser.rb +++ b/pdf_converter/lib/s3_url_parser.rb @@ -16,7 +16,7 @@ class S3UrlParser # @param hostname [String] The hostname to check # @return [Boolean] true if hostname matches S3 patterns def self.s3_hostname?(hostname) - return false if hostname.nil? + return false unless hostname S3_HOSTNAME_PATTERNS.any? { |pattern| hostname.match?(pattern) } end @@ -25,7 +25,7 @@ def self.s3_hostname?(hostname) # @param hostname [String] The hostname to check # @return [Boolean] true if hostname is LocalStack def self.localstack_hostname?(hostname) - return false if hostname.nil? + return false unless hostname # Allow localhost and 127.0.0.1 for LocalStack testing hostname == 'localhost' || hostname == '127.0.0.1' || hostname.start_with?('localstack') @@ -42,7 +42,7 @@ def self.path_style_s3?(hostname) # @param hostname [String] The hostname to check # @return [Boolean] true if virtual-hosted-style S3 def self.virtual_hosted_style_s3?(hostname) - return false if hostname.nil? + return false unless hostname hostname.match?(/\.s3\./) end @@ -51,19 +51,26 @@ def self.virtual_hosted_style_s3?(hostname) # @param url [String] The S3 URL to parse # @return [Hash, nil] Hash with :bucket, :key, :region or nil if invalid def self.extract_s3_info(url) - return nil if url.nil? || url.empty? + return nil unless url && !url.empty? uri = URI.parse(url) - - if path_style_s3?(uri.host) - extract_path_style_info(uri) - elsif virtual_hosted_style_s3?(uri.host) - extract_virtual_hosted_info(uri) - end + extract_by_style(uri) rescue StandardError nil end + # Determines URL style and extracts info accordingly + # @param uri [URI] Parsed URI object + # @return [Hash, nil] Extracted S3 info or nil + def self.extract_by_style(uri) + host = uri.host + return extract_path_style_info(uri) if path_style_s3?(host) + return extract_virtual_hosted_info(uri) if virtual_hosted_style_s3?(host) + + nil + end + private_class_method :extract_by_style + # Extracts region from S3 hostname # @param hostname [String] The S3 hostname # @return [String, nil] The region or nil if not found @@ -74,22 +81,26 @@ def self.extract_region_from_hostname(hostname) match ? match[1] : nil end + # Validates path parts have required structure + # @param path_parts [Array] Split path components + # @return [Boolean] true if path parts are valid + def self.valid_path_parts?(path_parts) + path_parts.length >= 3 && !path_parts[1].empty? + end + private_class_method :valid_path_parts? + # Extracts info from path-style S3 URL # @param uri [URI] Parsed URI object # @return [Hash, nil] Hash with :bucket, :key, :region or nil def self.extract_path_style_info(uri) # For path-style: https://s3.region.amazonaws.com/bucket/key path_parts = uri.path.split('/', 3) - return nil if path_parts.length < 3 || path_parts[1].empty? - - bucket = path_parts[1] - key = path_parts[2] - region = extract_region_from_hostname(uri.host) || 'us-east-1' + return nil unless valid_path_parts?(path_parts) { - bucket: bucket, - key: key, - region: region + bucket: path_parts[1], + key: path_parts[2], + region: extract_region_from_hostname(uri.host) || 'us-east-1' } end private_class_method :extract_path_style_info @@ -99,18 +110,23 @@ def self.extract_path_style_info(uri) # @return [Hash, nil] Hash with :bucket, :key, :region or nil def self.extract_virtual_hosted_info(uri) # For virtual-hosted-style: https://bucket.s3.region.amazonaws.com/key - hostname_parts = uri.host.split('.') + host = uri.host + hostname_parts = host.split('.') return nil if hostname_parts.length < 4 - bucket = hostname_parts[0] - key = uri.path.start_with?('/') ? uri.path[1..] : uri.path - region = extract_region_from_hostname(uri.host) || 'us-east-1' - { - bucket: bucket, - key: key, - region: region + bucket: hostname_parts[0], + key: normalize_path(uri.path), + region: extract_region_from_hostname(host) || 'us-east-1' } end + + # Removes leading slash from path + # @param path [String] The URL path + # @return [String] Path without leading slash + def self.normalize_path(path) + path.start_with?('/') ? path[1..] : path + end + private_class_method :normalize_path private_class_method :extract_virtual_hosted_info end