chore: Add test server and some top level crawler tests #517

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -63,6 +63,7 @@ scrapy = ["scrapy>=2.11.0"]
[dependency-groups]
dev = [
"build~=1.2.0",
"crawlee[parsel]~=0.6.0",
"dycw-pytest-only>=2.1.1",
"griffe~=1.9.0",
"mypy~=1.17.0",
Expand All @@ -76,6 +77,7 @@ dev = [
"respx~=0.22.0",
"ruff~=0.12.0",
"setuptools", # setuptools are used by pytest but not explicitly required
"uvicorn[standard]~=0.35.0",
]

[tool.hatch.build.targets.wheel]
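Both additions support the new integration tests: uvicorn[standard] runs the local test server introduced below, and crawlee[parsel] provides the ParselCrawler that the new crawler tests use.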
2 changes: 1 addition & 1 deletion tests/integration/actor_source_base/Dockerfile
@@ -12,4 +12,4 @@ RUN echo "Python version:" \
&& echo "All installed Python packages:" \
&& pip freeze

CMD ["python3", "-m", "src"]
CMD ["sh", "-c", "python server.py & python -m src"]
2 changes: 2 additions & 0 deletions tests/integration/actor_source_base/requirements.txt
@@ -1,2 +1,4 @@
# The test fixture will put the Apify SDK wheel path on the next line
APIFY_SDK_WHEEL_PLACEHOLDER
uvicorn[standard]~=0.35.0
crawlee[parsel]~=0.6.0
101 changes: 101 additions & 0 deletions tests/integration/actor_source_base/server.py
@@ -0,0 +1,101 @@
"""
Test server is infinite server http://localhost:8080/{any_number} and each page has links to the next 10 pages.
For example:
http://localhost:8080/ contains links:
http://localhost:8080/0, http://localhost:8080/1, ..., http://localhost:8080/9

http://localhost:8080/1 contains links:
http://localhost:8080/10, http://localhost:8080/11, ..., http://localhost:8080/19

... and so on.
"""

import asyncio
import logging
from collections.abc import Awaitable, Callable, Coroutine
from socket import socket
from typing import Any

from uvicorn import Config
from uvicorn.server import Server
from yarl import URL

Receive = Callable[[], Awaitable[dict[str, Any]]]
Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]


async def send_html_response(send: Send, html_content: bytes, status: int = 200) -> None:
    """Send an HTML response to the client."""
    await send(
        {
            'type': 'http.response.start',
            'status': status,
            'headers': [[b'content-type', b'text/html; charset=utf-8']],
        }
    )
    await send({'type': 'http.response.body', 'body': html_content})


async def app(scope: dict[str, Any], _: Receive, send: Send) -> None:
    """Minimal ASGI application that serves, for any requested path, a page linking to ten child pages.

    Args:
        scope: The ASGI connection scope.
        _: The ASGI receive function.
        send: The ASGI send function.
    """
    assert scope['type'] == 'http'
    path = scope['path']

    links = '\n'.join(f'<a href="{path}{i}">{path}{i}</a>' for i in range(10))
    await send_html_response(
        send,
        f"""\
<html><head>
<title>Title for {path} </title>
</head>
<body>
{links}
</body></html>""".encode(),
    )


class TestServer(Server):
    """A test HTTP server implementation based on Uvicorn Server."""

    @property
    def url(self) -> URL:
        """Get the base URL of the server.

        Returns:
            A URL instance with the server's base URL.
        """
        protocol = 'https' if self.config.is_ssl else 'http'
        return URL(f'{protocol}://{self.config.host}:{self.config.port}/')

    async def serve(self, sockets: list[socket] | None = None) -> None:
        """Run the server."""
        if sockets:
            raise RuntimeError('Simple TestServer does not support custom sockets')
        self.restart_requested = asyncio.Event()

        # Run the wrapped Uvicorn server as a single task and wait for it to finish.
        loop = asyncio.get_running_loop()
        tasks = {
            loop.create_task(super().serve()),
        }
        await asyncio.wait(tasks)


if __name__ == '__main__':
    asyncio.run(
        TestServer(
            config=Config(
                app=app,
                lifespan='off',
                loop='asyncio',
                port=8080,
                log_config=None,
                log_level=logging.CRITICAL,
            )
        ).serve()
    )
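As a quick local sanity check (not part of the PR; a sketch that assumes the server above is already running on port 8080), the link structure described in the module docstring can be verified with the standard library alone:

import re
from urllib.request import urlopen

# Fetch one page from the locally running test server and list the links it exposes.
with urlopen('http://localhost:8080/1') as response:
    html = response.read().decode('utf-8')

links = re.findall(r'href="([^"]+)"', html)
print(links)  # expected: ['/10', '/11', ..., '/19']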
4 changes: 2 additions & 2 deletions tests/integration/test_actor_api_helpers.py
@@ -400,12 +400,12 @@ async def main_server() -> None:
        async with Actor:

            class WebhookHandler(BaseHTTPRequestHandler):
-                def do_GET(self) -> None:  # noqa: N802
+                def do_GET(self) -> None:
                    self.send_response(200)
                    self.end_headers()
                    self.wfile.write(bytes('Hello, world!', encoding='utf-8'))

-                def do_POST(self) -> None:  # noqa: N802
+                def do_POST(self) -> None:
                    nonlocal webhook_body
                    content_length = self.headers.get('content-length')
                    length = int(content_length) if content_length else 0
Expand Down
111 changes: 111 additions & 0 deletions tests/integration/test_crawlers_with_storages.py
@@ -0,0 +1,111 @@
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from .conftest import MakeActorFunction, RunActorFunction


async def test_actor_on_platform_max_crawl_depth(
    make_actor: MakeActorFunction,
    run_actor: RunActorFunction,
) -> None:
    """Test that the actor respects max_crawl_depth."""

    async def main() -> None:
        """The crawler entry point."""
        import re

        from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

        from apify import Actor

        async with Actor:
            crawler = ParselCrawler(max_crawl_depth=2)
            finished = []
            # Only follow links whose path is a run of '2' characters (/2, /22, /222, ...),
            # so each crawl depth yields exactly one new request.
            enqueue_pattern = re.compile(r'http://localhost:8080/2+$')

            @crawler.router.default_handler
            async def default_handler(context: ParselCrawlingContext) -> None:
                """Default request handler."""
                context.log.info(f'Processing {context.request.url} ...')
                await context.enqueue_links(include=[enqueue_pattern])
                finished.append(context.request.url)

            await crawler.run(['http://localhost:8080/'])
            # Depth 0 is '/', depth 1 is '/2', depth 2 is '/22'; '/222' would exceed max_crawl_depth=2.
            assert finished == ['http://localhost:8080/', 'http://localhost:8080/2', 'http://localhost:8080/22']

    actor = await make_actor(label='crawler-max-depth', main_func=main)
    run_result = await run_actor(actor)

    assert run_result.status == 'SUCCEEDED'


async def test_actor_on_platform_max_requests_per_crawl(
    make_actor: MakeActorFunction,
    run_actor: RunActorFunction,
) -> None:
    """Test that the actor respects max_requests_per_crawl."""

    async def main() -> None:
        """The crawler entry point."""
        from crawlee import ConcurrencySettings
        from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

        from apify import Actor

        async with Actor:
            # max_concurrency=1 keeps the run deterministic, so exactly 3 requests are processed.
            crawler = ParselCrawler(
                max_requests_per_crawl=3, concurrency_settings=ConcurrencySettings(max_concurrency=1)
            )
            finished = []

            @crawler.router.default_handler
            async def default_handler(context: ParselCrawlingContext) -> None:
                """Default request handler."""
                context.log.info(f'Processing {context.request.url} ...')
                await context.enqueue_links()
                finished.append(context.request.url)

            await crawler.run(['http://localhost:8080/'])
            assert len(finished) == 3

    actor = await make_actor(label='crawler-max-requests', main_func=main)
    run_result = await run_actor(actor)

    assert run_result.status == 'SUCCEEDED'


async def test_actor_on_platform_max_request_retries(
    make_actor: MakeActorFunction,
    run_actor: RunActorFunction,
) -> None:
    """Test that the actor respects max_request_retries."""

    async def main() -> None:
        """The crawler entry point."""
        from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext

        from apify import Actor

        async with Actor:
            max_retries = 3
            crawler = ParselCrawler(max_request_retries=max_retries)
            failed_counter = 0

            @crawler.error_handler
            async def error_handler(_: BasicCrawlingContext, __: Exception) -> None:
                nonlocal failed_counter
                failed_counter += 1

            @crawler.router.default_handler
            async def default_handler(_: ParselCrawlingContext) -> None:
                raise RuntimeError('Some error')

            await crawler.run(['http://localhost:8080/'])
            assert failed_counter == max_retries, f'{failed_counter=}'

    actor = await make_actor(label='crawler-max-retries', main_func=main)
    run_result = await run_actor(actor)

    assert run_result.status == 'SUCCEEDED'
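For local debugging outside the platform, roughly the same behaviour can be reproduced by starting the test server in-process and pointing a ParselCrawler at it. This is a sketch under the assumption that server.py from this PR is on the import path and that crawlee[parsel] and uvicorn are installed; it is not code the PR adds:

import asyncio
import logging

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from uvicorn import Config

from server import TestServer, app  # the test server added in this PR


async def main() -> None:
    server = TestServer(
        config=Config(app=app, lifespan='off', loop='asyncio', port=8080, log_config=None, log_level=logging.CRITICAL)
    )
    server_task = asyncio.create_task(server.serve())
    while not server.started:  # wait until Uvicorn is accepting connections
        await asyncio.sleep(0.1)

    crawler = ParselCrawler(max_crawl_depth=1)
    visited: list[str] = []

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        visited.append(context.request.url)
        await context.enqueue_links()

    await crawler.run(['http://localhost:8080/'])
    print(f'Visited {len(visited)} pages')  # '/' plus its ten depth-1 children

    server.should_exit = True  # ask Uvicorn to shut down gracefully
    await server_task


asyncio.run(main())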