Skip to content

Commit 32f5d0d

Browse files
authored
Fix Scrapy Apify retry middleware (#157)
1 parent 353a579 commit 32f5d0d

File tree

2 files changed

+31
-20
lines changed

2 files changed

+31
-20
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
## [1.4.1](../../releases/tag/v1.4.1) - Unreleased
44

5+
### Fixed
6+
7+
- Resolved issue in `ApifyRetryMiddleware.process_exception()` where requests were getting stuck in the request queue.
8+
59
### Internal changes
610

711
- Fix type hint problems for resource clients

src/apify/scrapy/middlewares.py

Lines changed: 27 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,7 @@
44
from typing import TYPE_CHECKING, Any
55

66
try:
7-
from scrapy import Spider # noqa: TCH002
87
from scrapy.downloadermiddlewares.retry import RetryMiddleware
9-
from scrapy.exceptions import IgnoreRequest
10-
from scrapy.http import Request, Response # noqa: TCH002
118
from scrapy.utils.response import response_status_message
129
except ImportError as exc:
1310
raise ImportError(
@@ -18,6 +15,9 @@
1815
from .utils import nested_event_loop, open_queue_with_custom_client, to_apify_request
1916

2017
if TYPE_CHECKING:
18+
from scrapy import Spider
19+
from scrapy.http import Request, Response
20+
2121
from ..storages import RequestQueue
2222

2323

@@ -33,11 +33,6 @@ def __init__(self: ApifyRetryMiddleware, *args: Any, **kwargs: Any) -> None:
3333
traceback.print_exc()
3434
raise
3535

36-
def __del__(self: ApifyRetryMiddleware) -> None:
37-
"""Before deleting the instance, close the nested event loop."""
38-
nested_event_loop.stop()
39-
nested_event_loop.close()
40-
4136
def process_response(
4237
self: ApifyRetryMiddleware,
4338
request: Request,
@@ -54,9 +49,11 @@ def process_response(
5449
Returns:
5550
The response, or a new request if the request should be retried.
5651
"""
52+
if not isinstance(request.url, str):
53+
raise TypeError(f'Expected request.url to be a string, got {type(request.url)} instead.')
54+
5755
# Robots requests are bypassed directly, they don't go through a Scrapy Scheduler, and also through our
5856
# Request Queue. Check the scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware for details.
59-
assert isinstance(request.url, str) # noqa: S101
6057
if request.url.endswith('robots.txt'):
6158
return response
6259

@@ -72,20 +69,30 @@ def process_exception(
7269
exception: BaseException,
7370
spider: Spider,
7471
) -> Request | Response | None:
75-
"""Handle the exception and decide whether the request should be retried."""
76-
Actor.log.debug(f'ApifyRetryMiddleware.process_exception was called (scrapy_request={request})...')
72+
"""Handle the exception and decide whether the request should be retried.
73+
74+
Args:
75+
request: The request that encountered an exception.
76+
exception: The exception that occurred.
77+
spider: The Spider that sent the request.
78+
79+
Returns:
80+
None: The request will not be retried.
81+
"""
82+
Actor.log.debug(f'ApifyRetryMiddleware.process_exception was called (request={request}, exception={exception})...')
7783
apify_request = to_apify_request(request, spider=spider)
7884

79-
if isinstance(exception, IgnoreRequest):
80-
try:
81-
nested_event_loop.run_until_complete(self._rq.mark_request_as_handled(apify_request))
82-
except BaseException:
83-
traceback.print_exc()
84-
raise
85-
else:
86-
nested_event_loop.run_until_complete(self._rq.reclaim_request(apify_request))
85+
# Unlike the default Scrapy RetryMiddleware, we do not attempt to retry requests on exception.
86+
# It was causing issues with the Apify request queue, because the request was not marked as handled and was
87+
# stucked in the request queue forever - Scrapy crawling never finished. The solution would be to completely
88+
# rewrite the retry logic from default RetryMiddleware.
89+
try:
90+
nested_event_loop.run_until_complete(self._rq.mark_request_as_handled(apify_request))
91+
except BaseException:
92+
traceback.print_exc()
93+
raise
8794

88-
return super().process_exception(request, exception, spider)
95+
return None
8996

9097
async def _handle_retry_logic(
9198
self: ApifyRetryMiddleware,

0 commit comments

Comments
 (0)