Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 30 additions & 36 deletions pdf_converter/app/url_validator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@ class UrlValidator
REQUIRED_S3_SIGNATURE_PARAMS = ['X-Amz-Algorithm'].freeze
PDF_EXTENSIONS = ['.pdf'].freeze

def initialize
@logger = Logger.new($stdout) if defined?(Logger)
end

# Validates if a URL is a properly signed S3 URL for destination uploads
# @param url [String] The URL to validate
# @return [Boolean] true if URL is a valid signed S3 URL for uploads
Expand All @@ -31,14 +27,12 @@ def valid_s3_signed_url?(url)
# @param url [String] The URL to validate
# @return [Boolean] true if URL is valid HTTP/HTTPS
def valid_url?(url)
return false if url.nil? || url.empty?

begin
uri = URI.parse(url)
uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
rescue URI::InvalidURIError
false
end
return false unless valid_basic_url?(url)

uri = URI.parse(url)
uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
rescue URI::InvalidURIError
false
end

# Extracts S3 bucket, key, and region information from S3 URL
Expand All @@ -57,34 +51,38 @@ def extract_s3_info(url)
# @param require_pdf [Boolean] Whether to check for PDF extension
# @return [Boolean] true if URL passes all validation checks
def validate_s3_url(url, require_pdf:)
return false if url.nil? || url.empty?
return false unless valid_basic_url?(url)

uri = URI.parse(url)

# Validate scheme (HTTP only allowed for LocalStack)
return false unless valid_scheme?(uri)

# Must be S3 hostname or LocalStack
return false unless valid_s3_host?(uri.host)

# Check PDF extension if required
return false if require_pdf && !pdf_file?(uri.path)

# Must have required signature parameters
s3_signature_params?(uri.query)
valid_scheme?(uri) &&
valid_s3_host?(uri.host) &&
valid_pdf_requirement?(uri.path, require_pdf) &&
s3_signature_params?(uri.query)
rescue URI::InvalidURIError
false
end

# Validates that URL is not nil or empty
# @param url [String] The URL to check
# @return [Boolean] true if URL has content
def valid_basic_url?(url)
!url.nil? && !url.empty?
end

# Validates PDF requirement is met
# @param path [String] The URL path
# @param require_pdf [Boolean] Whether PDF is required
# @return [Boolean] true if PDF requirement is satisfied
def valid_pdf_requirement?(path, require_pdf)
!require_pdf || pdf_file?(path)
end

# Validates URL scheme (HTTPS required, except HTTP for LocalStack)
# @param uri [URI] Parsed URI object
# @return [Boolean] true if scheme is valid
def valid_scheme?(uri)
if uri.scheme == 'http'
S3UrlParser.localstack_hostname?(uri.host)
else
uri.scheme == 'https'
end
scheme = uri.scheme
scheme == 'https' || (scheme == 'http' && S3UrlParser.localstack_hostname?(uri.host))
end

# Validates that hostname is either S3 or LocalStack
Expand All @@ -95,7 +93,7 @@ def valid_s3_host?(hostname)
end

def pdf_file?(path)
return false if path.nil? || path.empty?
return false unless path && !path.empty?

# Extract filename from path
filename = File.basename(path)
Expand All @@ -106,13 +104,9 @@ def pdf_file?(path)
# @param query_string [String] The URL query string
# @return [Boolean] true if all required signature params present
def s3_signature_params?(query_string)
return false if query_string.nil? || query_string.empty?
return false unless query_string && !query_string.empty?

query_params = URI.decode_www_form(query_string).to_h
REQUIRED_S3_SIGNATURE_PARAMS.all? { |param| query_params.key?(param) }
end

def log_debug(message)
@logger&.debug(message) if defined?(@logger)
end
end
68 changes: 42 additions & 26 deletions pdf_converter/lib/s3_url_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class S3UrlParser
# @param hostname [String] The hostname to check
# @return [Boolean] true if hostname matches S3 patterns
def self.s3_hostname?(hostname)
return false if hostname.nil?
return false unless hostname

S3_HOSTNAME_PATTERNS.any? { |pattern| hostname.match?(pattern) }
end
Expand All @@ -25,7 +25,7 @@ def self.s3_hostname?(hostname)
# @param hostname [String] The hostname to check
# @return [Boolean] true if hostname is LocalStack
def self.localstack_hostname?(hostname)
return false if hostname.nil?
return false unless hostname

# Allow localhost and 127.0.0.1 for LocalStack testing
hostname == 'localhost' || hostname == '127.0.0.1' || hostname.start_with?('localstack')
Expand All @@ -42,7 +42,7 @@ def self.path_style_s3?(hostname)
# @param hostname [String] The hostname to check
# @return [Boolean] true if virtual-hosted-style S3
def self.virtual_hosted_style_s3?(hostname)
return false if hostname.nil?
return false unless hostname

hostname.match?(/\.s3\./)
end
Expand All @@ -51,19 +51,26 @@ def self.virtual_hosted_style_s3?(hostname)
# @param url [String] The S3 URL to parse
# @return [Hash, nil] Hash with :bucket, :key, :region or nil if invalid
def self.extract_s3_info(url)
return nil if url.nil? || url.empty?
return nil unless url && !url.empty?

uri = URI.parse(url)

if path_style_s3?(uri.host)
extract_path_style_info(uri)
elsif virtual_hosted_style_s3?(uri.host)
extract_virtual_hosted_info(uri)
end
extract_by_style(uri)
rescue StandardError
nil
end

# Determines URL style and extracts info accordingly
# @param uri [URI] Parsed URI object
# @return [Hash, nil] Extracted S3 info or nil
def self.extract_by_style(uri)
host = uri.host
return extract_path_style_info(uri) if path_style_s3?(host)
return extract_virtual_hosted_info(uri) if virtual_hosted_style_s3?(host)

nil
end
private_class_method :extract_by_style

# Extracts region from S3 hostname
# @param hostname [String] The S3 hostname
# @return [String, nil] The region or nil if not found
Expand All @@ -74,22 +81,26 @@ def self.extract_region_from_hostname(hostname)
match ? match[1] : nil
end

# Validates path parts have required structure
# @param path_parts [Array] Split path components
# @return [Boolean] true if path parts are valid
def self.valid_path_parts?(path_parts)
path_parts.length >= 3 && !path_parts[1].empty?
end
private_class_method :valid_path_parts?

# Extracts info from path-style S3 URL
# @param uri [URI] Parsed URI object
# @return [Hash, nil] Hash with :bucket, :key, :region or nil
def self.extract_path_style_info(uri)
# For path-style: https://s3.region.amazonaws.com/bucket/key
path_parts = uri.path.split('/', 3)
return nil if path_parts.length < 3 || path_parts[1].empty?

bucket = path_parts[1]
key = path_parts[2]
region = extract_region_from_hostname(uri.host) || 'us-east-1'
return nil unless valid_path_parts?(path_parts)

{
bucket: bucket,
key: key,
region: region
bucket: path_parts[1],
key: path_parts[2],
region: extract_region_from_hostname(uri.host) || 'us-east-1'
}
end
private_class_method :extract_path_style_info
Expand All @@ -99,18 +110,23 @@ def self.extract_path_style_info(uri)
# @return [Hash, nil] Hash with :bucket, :key, :region or nil
def self.extract_virtual_hosted_info(uri)
# For virtual-hosted-style: https://bucket.s3.region.amazonaws.com/key
hostname_parts = uri.host.split('.')
host = uri.host
hostname_parts = host.split('.')
return nil if hostname_parts.length < 4

bucket = hostname_parts[0]
key = uri.path.start_with?('/') ? uri.path[1..] : uri.path
region = extract_region_from_hostname(uri.host) || 'us-east-1'

{
bucket: bucket,
key: key,
region: region
bucket: hostname_parts[0],
key: normalize_path(uri.path),
region: extract_region_from_hostname(host) || 'us-east-1'
}
end

# Removes leading slash from path
# @param path [String] The URL path
# @return [String] Path without leading slash
def self.normalize_path(path)
path.start_with?('/') ? path[1..] : path
end
private_class_method :normalize_path
private_class_method :extract_virtual_hosted_info
end