Commit 2ec1037

Add Scrapy ApifyHttpProxyMiddleware for managing proxies (#158)
1 parent 9580522 commit 2ec1037

File tree

9 files changed: +335 -5 lines

src/apify/scrapy/__init__.py

Lines changed: 1 addition & 2 deletions

@@ -1,4 +1,3 @@
-from .middlewares import ApifyRetryMiddleware
 from .pipelines import ActorDatasetPushPipeline
 from .scheduler import ApifyScheduler
-from .utils import get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request
+from .utils import get_basic_auth_header, get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request

src/apify/scrapy/middlewares/__init__.py (new file)

Lines changed: 2 additions & 0 deletions

from .apify_proxy import ApifyHttpProxyMiddleware
from .apify_retry import ApifyRetryMiddleware
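
Since the package `__init__` re-exports both middlewares, downstream code can import them from a single path. A quick illustrative check (not part of this diff):

    # Illustrative usage: both middlewares are importable from the package root.
    from apify.scrapy.middlewares import ApifyHttpProxyMiddleware, ApifyRetryMiddleware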

src/apify/scrapy/middlewares/apify_proxy.py (new file)

Lines changed: 145 additions & 0 deletions

from __future__ import annotations

from typing import TYPE_CHECKING
from urllib.parse import ParseResult, urlparse

from scrapy.core.downloader.handlers.http11 import TunnelError
from scrapy.exceptions import NotConfigured

from ...actor import Actor
from ...proxy_configuration import ProxyConfiguration
from ..utils import get_basic_auth_header

if TYPE_CHECKING:
    from scrapy import Request, Spider
    from scrapy.crawler import Crawler


class ApifyHttpProxyMiddleware:
    """Apify HTTP proxy middleware for Scrapy.

    This middleware enhances request processing by adding a 'proxy' field to the request's meta and an authentication
    header. It draws inspiration from the `HttpProxyMiddleware` included by default in Scrapy projects. The proxy URL
    is sourced from the settings under the `APIFY_PROXY_SETTINGS` key. The value of this key, a dictionary, should be
    provided by the Actor input. An example of the proxy settings:

        proxy_settings = {'useApifyProxy': True, 'apifyProxyGroups': []}
    """

    def __init__(self: ApifyHttpProxyMiddleware, proxy_settings: dict) -> None:
        """Create a new instance.

        Args:
            proxy_settings: Dictionary containing proxy settings, provided by the Actor input.
        """
        self._proxy_settings = proxy_settings
        self._proxy_cfg_internal: ProxyConfiguration | None = None

    @classmethod
    def from_crawler(cls: type[ApifyHttpProxyMiddleware], crawler: Crawler) -> ApifyHttpProxyMiddleware:
        """Create an instance of ApifyHttpProxyMiddleware from a Scrapy Crawler.

        Args:
            crawler: Scrapy Crawler object.

        Returns:
            ApifyHttpProxyMiddleware: Instance of the class.
        """
        proxy_settings: dict | None = crawler.settings.get('APIFY_PROXY_SETTINGS')

        if proxy_settings is None:
            Actor.log.warning('ApifyHttpProxyMiddleware is not going to be used. Object "proxyConfiguration" is probably missing in the Actor input.')
            raise NotConfigured

        use_apify_proxy = proxy_settings.get('useApifyProxy', False)

        if use_apify_proxy is not True:
            Actor.log.warning(
                'ApifyHttpProxyMiddleware is not going to be used. Actor input field "proxyConfiguration.useApifyProxy" is probably set to False.'
            )
            raise NotConfigured

        return cls(proxy_settings)

    async def process_request(self: ApifyHttpProxyMiddleware, request: Request, spider: Spider) -> None:
        """Process a Scrapy request by assigning a new proxy.

        Args:
            request: Scrapy Request object.
            spider: Scrapy Spider object.

        Raises:
            ValueError: If username and password are not provided in the proxy URL.

        Returns:
            None: The request is processed and the middleware pipeline can continue.
        """
        Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: request={request}, spider={spider}')
        url = await self._get_new_proxy_url()

        if not (url.username and url.password):
            raise ValueError('Username and password must be provided in the proxy URL.')

        request.meta['proxy'] = url.geturl()
        basic_auth_header = get_basic_auth_header(url.username, url.password)
        request.headers[b'Proxy-Authorization'] = basic_auth_header

        Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: updated request.meta={request.meta}')

    def process_exception(
        self: ApifyHttpProxyMiddleware,
        request: Request,
        exception: Exception,
        spider: Spider,
    ) -> None | Request:
        """Process an exception that occurs during request processing.

        Args:
            request: Scrapy Request object.
            exception: Exception object.
            spider: Scrapy Spider object.

        Returns:
            If a TunnelError occurs, return the request object to halt its processing in the middleware pipeline.
            Return None otherwise to allow the continuation of request processing.
        """
        Actor.log.debug(
            f'ApifyHttpProxyMiddleware.process_exception: request={request}, exception={exception}, spider={spider}',
        )

        if isinstance(exception, TunnelError):
            Actor.log.warning(f'ApifyHttpProxyMiddleware: TunnelError occurred for request="{request}", reason="{exception}", skipping...')
            return request

        return None

    async def _get_new_proxy_url(self: ApifyHttpProxyMiddleware) -> ParseResult:
        """Get a new proxy URL.

        Raises:
            NotConfigured: If creation of the proxy configuration fails.

        Returns:
            ParseResult: New proxy URL.
        """
        # Get the proxy configuration, creating it if necessary.
        proxy_cfg = (
            self._proxy_cfg_internal
            if isinstance(self._proxy_cfg_internal, ProxyConfiguration)
            else await Actor.create_proxy_configuration(actor_proxy_input=self._proxy_settings)
        )

        # If the proxy configuration is still not available, raise an error. However, this should not happen
        # due to the checks in the `from_crawler` method.
        if proxy_cfg is None:
            Actor.log.error('Creation of proxy configuration failed. Check the field "proxyConfiguration" in the Actor input.')
            raise NotConfigured

        # Store the proxy configuration for future use.
        self._proxy_cfg_internal = proxy_cfg

        # Get a new proxy URL and return it.
        new_url = await proxy_cfg.new_url()
        return urlparse(new_url)
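
For orientation, a minimal sketch of how a Scrapy project running as an Apify Actor might enable this middleware. The priority value 950, the disabling of Scrapy's stock HttpProxyMiddleware, and the way APIFY_PROXY_SETTINGS gets populated are assumptions for illustration, not prescribed by this commit:

    # settings.py of a Scrapy-based Actor (illustrative sketch)
    DOWNLOADER_MIDDLEWARES = {
        # Assumed: turn off Scrapy's built-in proxy middleware so the two do not
        # both manipulate request.meta['proxy'].
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
        'apify.scrapy.middlewares.ApifyHttpProxyMiddleware': 950,
    }

    # Read by ApifyHttpProxyMiddleware.from_crawler; typically copied from the
    # "proxyConfiguration" field of the Actor input.
    APIFY_PROXY_SETTINGS = {'useApifyProxy': True, 'apifyProxyGroups': []}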

src/apify/scrapy/middlewares.py renamed to src/apify/scrapy/middlewares/apify_retry.py

Lines changed: 3 additions & 3 deletions
@@ -11,14 +11,14 @@
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
     ) from exc

-from ..actor import Actor
-from .utils import nested_event_loop, open_queue_with_custom_client, to_apify_request
+from ...actor import Actor
+from ..utils import nested_event_loop, open_queue_with_custom_client, to_apify_request

 if TYPE_CHECKING:
     from scrapy import Spider
     from scrapy.http import Request, Response

-    from ..storages import RequestQueue
+    from ...storages import RequestQueue


 class ApifyRetryMiddleware(RetryMiddleware):

src/apify/scrapy/utils.py

Lines changed: 11 additions & 0 deletions
@@ -3,6 +3,10 @@
 import asyncio
 import codecs
 import pickle
+from base64 import b64encode
+from urllib.parse import unquote
+
+from scrapy.utils.python import to_bytes

 try:
     from scrapy import Request, Spider
@@ -19,6 +23,13 @@
 nested_event_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop()


+def get_basic_auth_header(username: str, password: str, auth_encoding: str = 'latin-1') -> bytes:
+    """Generate a basic authentication header for the given username and password."""
+    string = f'{unquote(username)}:{unquote(password)}'
+    user_pass = to_bytes(string, encoding=auth_encoding)
+    return b'Basic ' + b64encode(user_pass)
+
+
 def get_running_event_loop_id() -> int:
     """Get the ID of the currently running event loop.
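
The header built by get_basic_auth_header is standard RFC 7617 Basic auth: the Base64 of "username:password" with a "Basic " prefix. The committed helper additionally URL-unquotes its inputs and encodes them via Scrapy's to_bytes; a stdlib-only sketch of the same computation (assumed equivalent for plain ASCII credentials):

    from base64 import b64encode

    def basic_auth(username: str, password: str) -> bytes:
        # Base64-encode 'username:password' and prefix it with b'Basic '.
        return b'Basic ' + b64encode(f'{username}:{password}'.encode('latin-1'))

    # Matches the expected value asserted in this commit's unit tests.
    assert basic_auth('username', 'password') == b'Basic dXNlcm5hbWU6cGFzc3dvcmQ='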

tests/unit/scrapy/__init__.py

Whitespace-only changes.

tests/unit/scrapy/middlewares/__init__.py

Whitespace-only changes.

tests/unit/scrapy/middlewares/test_apify_proxy.py (new file)

Lines changed: 152 additions & 0 deletions

from __future__ import annotations

from urllib.parse import ParseResult, urlparse

import pytest
from scrapy import Request, Spider
from scrapy.core.downloader.handlers.http11 import TunnelError
from scrapy.crawler import Crawler
from scrapy.exceptions import NotConfigured

from apify import ProxyConfiguration
from apify.scrapy.middlewares import ApifyHttpProxyMiddleware


class DummySpider(Spider):
    name = 'dummy_spider'


@pytest.fixture()
def middleware() -> ApifyHttpProxyMiddleware:
    """Fixture to create an Apify HTTP proxy middleware."""
    proxy_settings = {'useApifyProxy': True}
    return ApifyHttpProxyMiddleware(proxy_settings)


@pytest.fixture()
def crawler(monkeypatch: pytest.MonkeyPatch) -> Crawler:
    """Fixture to create a Scrapy crawler."""
    crawler = Crawler(DummySpider)
    monkeypatch.setattr(crawler, 'settings', {})
    return crawler


@pytest.fixture()
def spider() -> DummySpider:
    """Fixture to create a "dummy" Scrapy spider."""
    return DummySpider()


@pytest.fixture()
def dummy_request() -> Request:
    """Fixture to create a "dummy" Scrapy request."""
    return Request('https://example.com')


@pytest.fixture()
def proxy_configuration() -> ProxyConfiguration:
    """Fixture to create an Apify ProxyConfiguration object."""
    return ProxyConfiguration()


@pytest.mark.parametrize(
    ('settings', 'expected_exception'),
    [
        ({'APIFY_PROXY_SETTINGS': {'useApifyProxy': True}}, None),
        ({'APIFY_PROXY_SETTINGS': {'useApifyProxy': True, 'apifyProxyGroups': []}}, None),
        ({}, NotConfigured),
        ({'a': 1}, NotConfigured),
        ({'APIFY_PROXY_SETTINGS': {}}, NotConfigured),
        ({'APIFY_PROXY_SETTINGS': {'useApifyProxy': None}}, NotConfigured),
        ({'APIFY_PROXY_SETTINGS': {'useApifyProxy': False}}, NotConfigured),
    ],
)
def test__from_crawler(
    crawler: Crawler,
    monkeypatch: pytest.MonkeyPatch,
    settings: dict,
    expected_exception: type[Exception] | None,
) -> None:
    monkeypatch.setattr(crawler, 'settings', settings)

    if expected_exception is None:
        middleware = ApifyHttpProxyMiddleware.from_crawler(crawler)
        assert middleware._proxy_settings == settings['APIFY_PROXY_SETTINGS']

    else:
        with pytest.raises(expected_exception):
            ApifyHttpProxyMiddleware.from_crawler(crawler)


@pytest.mark.parametrize(
    'expected_proxy_url',
    ['http://username:password@proxy.example.com:8080', 'http://hsdfgds:password@proxy.example.com:5748'],
)
async def test__get_new_proxy_url(
    monkeypatch: pytest.MonkeyPatch,
    middleware: ApifyHttpProxyMiddleware,
    proxy_configuration: ProxyConfiguration,
    expected_proxy_url: str,
) -> None:
    async def mock_new_url() -> str:
        return expected_proxy_url

    monkeypatch.setattr(proxy_configuration, 'new_url', mock_new_url)
    middleware._proxy_cfg_internal = proxy_configuration
    proxy_url = await middleware._get_new_proxy_url()
    assert proxy_url == urlparse(expected_proxy_url)


@pytest.mark.parametrize(
    ('proxy_url', 'expected_exception', 'expected_request_header'),
    [
        ('http://username:password@proxy.example.com:8080', None, b'Basic dXNlcm5hbWU6cGFzc3dvcmQ='),
        ('http://user123:pass456@proxy.example.com:5748', None, b'Basic dXNlcjEyMzpwYXNzNDU2'),
        ('http://@proxy.example.com:2943', ValueError, b''),
    ],
)
async def test__process_request(
    monkeypatch: pytest.MonkeyPatch,
    middleware: ApifyHttpProxyMiddleware,
    spider: DummySpider,
    dummy_request: Request,
    proxy_url: str,
    expected_exception: type[Exception] | None,
    expected_request_header: bytes,
) -> None:
    async def mock_get_new_proxy_url() -> ParseResult:
        return urlparse(proxy_url)

    monkeypatch.setattr(middleware, '_get_new_proxy_url', mock_get_new_proxy_url)

    if expected_exception is None:
        await middleware.process_request(dummy_request, spider)
        assert dummy_request.meta['proxy'] == proxy_url
        assert dummy_request.headers[b'Proxy-Authorization'] == expected_request_header
    else:
        with pytest.raises(expected_exception):
            await middleware.process_request(dummy_request, spider)


@pytest.mark.parametrize(
    ('exception', 'none_returned_values_is_expected'),
    [
        (TunnelError(), False),
        (ValueError(), True),
    ],
)
def test__process_exception(
    middleware: ApifyHttpProxyMiddleware,
    spider: DummySpider,
    dummy_request: Request,
    exception: Exception,
    *,
    none_returned_values_is_expected: bool,
) -> None:
    returned_value = middleware.process_exception(dummy_request, exception, spider)

    if none_returned_values_is_expected:
        assert returned_value is None

    else:
        assert returned_value == dummy_request

tests/unit/scrapy/test_utils.py

Lines changed: 21 additions & 0 deletions
from __future__ import annotations

import pytest

from apify.scrapy import get_basic_auth_header


@pytest.mark.parametrize(
    ('username', 'password', 'expected_auth_header'),
    [
        ('username', 'password', b'Basic dXNlcm5hbWU6cGFzc3dvcmQ='),
        ('john_smith', 'secret_password_123', b'Basic am9obl9zbWl0aDpzZWNyZXRfcGFzc3dvcmRfMTIz'),
    ],
)
def test__get_basic_auth_header(
    username: str,
    password: str,
    expected_auth_header: bytes,
) -> None:
    auth_header = get_basic_auth_header(username, password)
    assert auth_header == expected_auth_header
