@@ -4,10 +4,7 @@
from typing import TYPE_CHECKING, Any

try:
-    from scrapy import Spider  # noqa: TCH002
    from scrapy.downloadermiddlewares.retry import RetryMiddleware
-    from scrapy.exceptions import IgnoreRequest
-    from scrapy.http import Request, Response  # noqa: TCH002
    from scrapy.utils.response import response_status_message
except ImportError as exc:
    raise ImportError(
@@ -18,6 +15,9 @@
from .utils import nested_event_loop, open_queue_with_custom_client, to_apify_request

if TYPE_CHECKING:
+    from scrapy import Spider
+    from scrapy.http import Request, Response
+
    from ..storages import RequestQueue


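The hunk above moves the Scrapy imports that are only used in annotations under `if TYPE_CHECKING:`, so they no longer need runtime imports (or `# noqa: TCH002` waivers) in the `try:` block. A minimal, self-contained sketch of that pattern, assuming the file uses `from __future__ import annotations` (the file header is not shown in this diff):

```python
# Sketch of the TYPE_CHECKING pattern adopted above (assumed context,
# not the middleware file itself). With postponed evaluation of
# annotations, names imported only under TYPE_CHECKING never need to
# exist at runtime.
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Resolved by static type checkers (mypy, pyright), skipped at runtime.
    from scrapy import Spider
    from scrapy.http import Request, Response


def describe(request: Request, spider: Spider) -> str:
    # Annotations stay strings here, so this runs even without Scrapy installed.
    return f'{spider.name} -> {request.url}'
```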
@@ -33,11 +33,6 @@ def __init__(self: ApifyRetryMiddleware, *args: Any, **kwargs: Any) -> None:
            traceback.print_exc()
            raise

-    def __del__(self: ApifyRetryMiddleware) -> None:
-        """Before deleting the instance, close the nested event loop."""
-        nested_event_loop.stop()
-        nested_event_loop.close()
-
    def process_response(
        self: ApifyRetryMiddleware,
        request: Request,
@@ -54,9 +49,11 @@ def process_response(
        Returns:
            The response, or a new request if the request should be retried.
        """
+        if not isinstance(request.url, str):
+            raise TypeError(f'Expected request.url to be a string, got {type(request.url)} instead.')
+
        # Robots requests are bypassed directly; they go through neither the Scrapy scheduler nor our
        # request queue. See scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware for details.
-        assert isinstance(request.url, str)  # noqa: S101
        if request.url.endswith('robots.txt'):
            return response

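The hunk above also replaces an `assert` with an explicit type check. A hedged sketch of why that matters, using a hypothetical helper name for illustration:

```python
# Sketch only, not code from the middleware. `assert` statements are
# stripped when Python runs with -O/-OO, so an explicit isinstance check
# is the only way to guarantee a loud TypeError in every environment.
def ensure_str_url(url: object) -> str:  # hypothetical helper for illustration
    if not isinstance(url, str):
        raise TypeError(f'Expected request.url to be a string, got {type(url)} instead.')
    return url


print(ensure_str_url('https://example.com/robots.txt'))  # prints the URL back
```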
@@ -72,20 +69,30 @@ def process_exception(
        exception: BaseException,
        spider: Spider,
    ) -> Request | Response | None:
-        """Handle the exception and decide whether the request should be retried."""
-        Actor.log.debug(f'ApifyRetryMiddleware.process_exception was called (scrapy_request={request})...')
+        """Handle the exception and decide whether the request should be retried.
+
+        Args:
+            request: The request that encountered an exception.
+            exception: The exception that occurred.
+            spider: The Spider that sent the request.
+
+        Returns:
+            None: The request will not be retried.
+        """
+        Actor.log.debug(f'ApifyRetryMiddleware.process_exception was called (request={request}, exception={exception})...')
        apify_request = to_apify_request(request, spider=spider)

-        if isinstance(exception, IgnoreRequest):
-            try:
-                nested_event_loop.run_until_complete(self._rq.mark_request_as_handled(apify_request))
-            except BaseException:
-                traceback.print_exc()
-                raise
-        else:
-            nested_event_loop.run_until_complete(self._rq.reclaim_request(apify_request))
+        # Unlike the default Scrapy RetryMiddleware, we do not attempt to retry requests on exception.
+        # Doing so caused issues with the Apify request queue: the request was never marked as handled and
+        # sat in the queue forever, so the Scrapy crawl never finished. A proper fix would require completely
+        # rewriting the retry logic of the default RetryMiddleware.
+        try:
+            nested_event_loop.run_until_complete(self._rq.mark_request_as_handled(apify_request))
+        except BaseException:
+            traceback.print_exc()
+            raise

-        return super().process_exception(request, exception, spider)
+        return None

    async def _handle_retry_logic(
        self: ApifyRetryMiddleware,
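Throughout the diff, the middleware drives async request-queue coroutines from synchronous Scrapy callbacks via `nested_event_loop.run_until_complete(...)`. The helper's implementation lives in `.utils` and is not shown here; a minimal sketch of what such a helper could look like (an assumption, not the actual apify.scrapy.utils source):

```python
# Sketch only: a module-level asyncio loop that synchronous Scrapy
# middleware code can reuse to run async storage calls such as
# mark_request_as_handled().
import asyncio

nested_event_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop()


async def mark_request_as_handled(request_id: str) -> None:
    # Stand-in for the async Apify request-queue call used by the middleware.
    await asyncio.sleep(0)
    print(f'marked {request_id} as handled')


# Synchronous call site, mirroring the middleware's usage.
nested_event_loop.run_until_complete(mark_request_as_handled('abc123'))
```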