Commit 17b99c4

Add test server and some top level Crawler tests
1 parent 20ecb24 commit 17b99c4

6 files changed: +317 -96 lines changed

pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -36,14 +36,15 @@ keywords = [
 dependencies = [
     "apify-client>=1.11.0",
     "apify-shared>=1.3.0",
-    "crawlee~=0.6.0",
+    "crawlee[parsel]~=0.6.0",
     "cryptography>=42.0.0",
     "httpx>=0.27.0",
     # TODO: ensure compatibility with the latest version of lazy-object-proxy
     # https://github.com/apify/apify-sdk-python/issues/460
     "lazy-object-proxy<1.11.0",
     "more_itertools>=10.2.0",
     "typing-extensions>=4.1.0",
+    "uvicorn",
     "websockets>=14.0",
 ]
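Switching the dependency from crawlee~=0.6.0 to crawlee[parsel]~=0.6.0 pulls in the parsel-based extras that crawlee.crawlers.ParselCrawler needs, and uvicorn is added to run the new local test server. For orientation, here is a minimal standalone ParselCrawler sketch (not part of this commit; it assumes the test server introduced below is already listening on port 8080):

import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    # Sketch only: crawl the local test server and log each page title.
    crawler = ParselCrawler(max_requests_per_crawl=5)

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        # The parsel extra provides context.selector, a parsel.Selector over the response body.
        title = context.selector.css('title::text').get()
        context.log.info(f'{context.request.url}: {title}')
        await context.enqueue_links()

    await crawler.run(['http://localhost:8080/'])


if __name__ == '__main__':
    asyncio.run(main())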

tests/integration/actor_source_base/Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -12,4 +12,4 @@ RUN echo "Python version:" \
     && echo "All installed Python packages:" \
     && pip freeze

-CMD ["python3", "-m", "src"]
+CMD ["sh", "-c", "python test_server.py & python -m src"]
Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
+# The purpose of this server is to run benchmarks independently of network issues.
+import asyncio
+import logging
+from collections.abc import Awaitable, Callable, Coroutine
+from typing import Any
+
+from uvicorn import Config
+from uvicorn.server import Server
+from yarl import URL
+
+Receive = Callable[[], Awaitable[dict[str, Any]]]
+Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]
+
+
+async def send_html_response(send: Send, html_content: bytes, status: int = 200) -> None:
+    """Send an HTML response to the client."""
+    await send(
+        {
+            'type': 'http.response.start',
+            'status': status,
+            'headers': [[b'content-type', b'text/html; charset=utf-8']],
+        }
+    )
+    await send({'type': 'http.response.body', 'body': html_content})
+
+
+async def app(scope: dict[str, Any], _: Receive, send: Send) -> None:
+    """Main ASGI application handler that serves an HTML page with links for any path.
+
+    Args:
+        scope: The ASGI connection scope.
+        _: The ASGI receive function.
+        send: The ASGI send function.
+    """
+    assert scope['type'] == 'http'
+    path = scope['path']
+
+    links = '\n'.join(f'<a href="{path}{i}">{path}{i}</a>' for i in range(10))
+    await send_html_response(
+        send,
+        f"""\
+<html><head>
+<title>Title for {path} </title>
+</head>
+<body>
+{links}
+</body></html>""".encode(),
+    )
+
+
+class TestServer(Server):
+    """A test HTTP server implementation based on Uvicorn Server."""
+
+    @property
+    def url(self) -> URL:
+        """Get the base URL of the server.
+
+        Returns:
+            A URL instance with the server's base URL.
+        """
+        protocol = 'https' if self.config.is_ssl else 'http'
+        return URL(f'{protocol}://{self.config.host}:{self.config.port}/')
+
+    async def serve(self) -> None:
+        """Run the server and set up restart capability."""
+        self.restart_requested = asyncio.Event()
+
+        loop = asyncio.get_event_loop()
+        tasks = {
+            loop.create_task(super().serve()),
+        }
+        await asyncio.wait(tasks)
+
+
+if __name__ == '__main__':
+    asyncio.run(
+        TestServer(
+            config=Config(
+                app=app,
+                lifespan='off',
+                loop='asyncio',
+                port=8080,
+                log_config=None,
+                log_level=logging.CRITICAL,
+            )
+        ).serve()
+    )
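Run directly, as the Dockerfile's CMD does, this file serves an endless tree of pages on port 8080: every path returns an HTML page linking to ten child paths, e.g. / links to /0 through /9 and /3 links to /30 through /39. A quick smoke test of the server from Python could look like the sketch below; it assumes the module is importable as test_server, matching the python test_server.py invocation in the Dockerfile:

import asyncio

import httpx
from uvicorn import Config

from test_server import TestServer, app  # assumed module name, per the Dockerfile CMD


async def check_server() -> None:
    # Start the server in the background on the same port the integration tests use.
    server = TestServer(config=Config(app=app, lifespan='off', loop='asyncio', port=8080, log_config=None))
    serve_task = asyncio.create_task(server.serve())
    await asyncio.sleep(0.5)  # crude startup wait; a readiness poll would be more robust

    async with httpx.AsyncClient() as client:
        response = await client.get(str(server.url))
        assert response.status_code == 200
        assert response.text.count('<a href=') == 10  # each page links to ten child pages

    server.should_exit = True  # uvicorn's graceful-shutdown flag
    await serve_task


if __name__ == '__main__':
    asyncio.run(check_server())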

tests/integration/test_actor_api_helpers.py

Lines changed: 2 additions & 2 deletions
@@ -400,12 +400,12 @@ async def main_server() -> None:
     async with Actor:

         class WebhookHandler(BaseHTTPRequestHandler):
-            def do_GET(self) -> None:  # noqa: N802
+            def do_GET(self) -> None:
                 self.send_response(200)
                 self.end_headers()
                 self.wfile.write(bytes('Hello, world!', encoding='utf-8'))

-            def do_POST(self) -> None:  # noqa: N802
+            def do_POST(self) -> None:
                 nonlocal webhook_body
                 content_length = self.headers.get('content-length')
                 length = int(content_length) if content_length else 0
Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
+from tests.integration.conftest import MakeActorFunction, RunActorFunction
+
+
+async def test_actor_on_platform_max_crawl_depth(
+    make_actor: MakeActorFunction,
+    run_actor: RunActorFunction,
+) -> None:
+    """
+    Test that the actor respects max_crawl_depth.
+
+    The test server serves an infinite tree of pages at http://localhost:8080/{any_number}, and each page links
+    to the next 10 pages. For example, http://localhost:8080/ contains links:
+    http://localhost:8080/0, http://localhost:8080/1, ..., http://localhost:8080/9
+    """
+
+    async def main() -> None:
+        """The crawler entry point."""
+        import re
+
+        from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
+
+        from apify import Actor
+
+        async with Actor:
+            crawler = ParselCrawler(max_crawl_depth=2)
+            finished = []
+            enqueue_pattern = re.compile(r'http://localhost:8080/2+$')
+
+            @crawler.router.default_handler
+            async def default_handler(context: ParselCrawlingContext) -> None:
+                """Default request handler."""
+                context.log.info(f'Processing {context.request.url} ...')
+                await context.enqueue_links(include=[enqueue_pattern])
+                await context.push_data({'Url': context.request.url})
+                finished.append(context.request.url)
+
+            await crawler.run(['http://localhost:8080/'])
+            assert finished == ['http://localhost:8080/', 'http://localhost:8080/2', 'http://localhost:8080/22']
+            # assert some dataset
+
+    actor = await make_actor(label='parsel-crawler', main_func=main)
+    run_result = await run_actor(actor)
+
+    assert run_result.status == 'SUCCEEDED'
+
+
+async def test_actor_on_platform_max_requests_per_crawl(
+    make_actor: MakeActorFunction,
+    run_actor: RunActorFunction,
+) -> None:
+    """
+    Test that the actor respects max_requests_per_crawl.
+
+    The test server serves an infinite tree of pages at http://localhost:8080/{any_number}, and each page links
+    to the next 10 pages. For example, http://localhost:8080/ contains links:
+    http://localhost:8080/0, http://localhost:8080/1, ..., http://localhost:8080/9
+    """
+
+    async def main() -> None:
+        """The crawler entry point."""
+        from crawlee import ConcurrencySettings
+        from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
+
+        from apify import Actor
+
+        async with Actor:
+            crawler = ParselCrawler(
+                max_requests_per_crawl=3, concurrency_settings=ConcurrencySettings(max_concurrency=1)
+            )
+            finished = []
+
+            @crawler.router.default_handler
+            async def default_handler(context: ParselCrawlingContext) -> None:
+                """Default request handler."""
+                context.log.info(f'Processing {context.request.url} ...')
+                await context.enqueue_links()
+                await context.push_data({'Url': context.request.url})
+                finished.append(context.request.url)
+
+            await crawler.run(['http://localhost:8080/'])
+            assert len(finished) == 3
+            # assert some dataset
+
+    actor = await make_actor(label='parsel-crawler', main_func=main)
+    run_result = await run_actor(actor)
+
+    assert run_result.status == 'SUCCEEDED'
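Why the max_crawl_depth test expects exactly three URLs: the enqueue pattern only matches paths made up of the digit 2, so each page contributes a single matching link, and max_crawl_depth=2 stops the crawl after /22. The pattern's behaviour can be checked with plain re (illustration only, not part of the commit):

import re

enqueue_pattern = re.compile(r'http://localhost:8080/2+$')

# From '/' only the '/2' link matches; from '/2' only '/22' matches, and so on.
assert enqueue_pattern.fullmatch('http://localhost:8080/2')
assert enqueue_pattern.fullmatch('http://localhost:8080/22')
assert not enqueue_pattern.fullmatch('http://localhost:8080/21')
assert not enqueue_pattern.fullmatch('http://localhost:8080/3')

In the max_requests_per_crawl test, ConcurrencySettings(max_concurrency=1) presumably keeps the crawl sequential, so exactly three requests finish before the limit takes effect.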
