refactor!: Introduce new storage client system #1194

Merged
merged 45 commits from new-storage-clients into master on Jul 1, 2025
Commits (45)
f285707
refactor!: Introduce new storage client system
vdusek May 10, 2025
dd9be6e
Cleanup
vdusek May 10, 2025
89bfa5b
Address feedback
vdusek May 15, 2025
4050c75
Add purge_if_needed method and improve some typing based on Pylance
vdusek May 16, 2025
26f46e2
Address more feedback
vdusek May 20, 2025
c83a36a
RQ FS client improvements
vdusek Jun 4, 2025
c967fe5
Add caching to RQ FS client
vdusek Jun 5, 2025
7df046f
RQ FS performance optimization in add_requests
vdusek Jun 5, 2025
3555565
RQ FS performance issues in fetch_next_request
vdusek Jun 6, 2025
946d1e2
RQ FS fetch performance for is_empty
vdusek Jun 6, 2025
9f10b95
rm code duplication for open methods
vdusek Jun 6, 2025
0864ff8
Request loaders use async getters for handled/total req cnt
vdusek Jun 9, 2025
af0d129
Add missing_ok when removing files
vdusek Jun 9, 2025
9998a58
Improve is_empty
vdusek Jun 10, 2025
fdee111
Optimize RQ memory storage client
vdusek Jun 10, 2025
79cdfc0
Add upgrading guide and skip problematic test
vdusek Jun 11, 2025
3d2fd73
Merge branch 'master' into new-storage-clients
vdusek Jun 11, 2025
e818585
chore: update `docusaurus-plugin-typedoc-api`, fix failing docs build
barjin Jun 11, 2025
65db9ac
fix docs
vdusek Jun 11, 2025
2b786f7
add retries to atomic write
vdusek Jun 12, 2025
2cb04c5
chore(deps): update dependency pytest-cov to ~=6.2.0 (#1244)
renovate[bot] Jun 12, 2025
0c8c4ec
Fix atomic write on Windows
vdusek Jun 12, 2025
ce1eeb1
resolve write function during import time
vdusek Jun 14, 2025
4c05cee
Merge branch 'master' into new-storage-clients
vdusek Jun 14, 2025
8c80513
Update file utils
vdusek Jun 16, 2025
70bc071
revert unintentional makefile changes
vdusek Jun 16, 2025
78efb4d
Address Honza's comments (p1)
vdusek Jun 18, 2025
fa18d19
Introduce storage instance manager
vdusek Jun 19, 2025
c783dac
Utilize recoverable state for the FS RQ state
vdusek Jun 20, 2025
437071e
Details
vdusek Jun 20, 2025
df4bfa7
Rm default_"storage"_id options (were not used at all)
vdusek Jun 23, 2025
e133fcd
Update storages guide and add storage clients guide
vdusek Jun 23, 2025
76f1ffb
Docs guides - code examples
vdusek Jun 24, 2025
fa48644
Docs guides polishment
vdusek Jun 24, 2025
5c935af
docs fix lint & type checks for py 3.9
vdusek Jun 24, 2025
ac259ce
Address Honza's feedback
vdusek Jun 24, 2025
1cbf15e
SDK fixes
vdusek Jun 25, 2025
bc50990
Add KVS record_exists method
vdusek Jun 26, 2025
d1cf967
reduce test duplicities for storages & storage clients
vdusek Jun 26, 2025
aa9bfd3
Create locks in async context only
vdusek Jun 27, 2025
d6c9877
rm open methods from base storage clients
vdusek Jun 27, 2025
3b133ce
update storage clients inits
vdusek Jun 30, 2025
43b9fe9
async metadata getter
vdusek Jul 1, 2025
b628fbb
better typing in storage instance manager
vdusek Jul 1, 2025
9dfac4b
update upgrading guide
vdusek Jul 1, 2025
15 changes: 7 additions & 8 deletions docs/deployment/code_examples/google/cloud_run_example.py
@@ -5,24 +5,23 @@
import uvicorn
from litestar import Litestar, get

from crawlee import service_locator
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

# highlight-start
# Disable writing storage data to the file system
configuration = service_locator.get_configuration()
configuration.persist_storage = False
configuration.write_metadata = False
# highlight-end
from crawlee.storage_clients import MemoryStorageClient


@get('/')
async def main() -> str:
"""The crawler entry point that will be called when the HTTP endpoint is accessed."""
# highlight-start
# Disable writing storage data to the file system
storage_client = MemoryStorageClient()
# highlight-end

crawler = PlaywrightCrawler(
headless=True,
max_requests_per_crawl=10,
browser_type='firefox',
storage_client=storage_client,
)

@crawler.router.default_handler
15 changes: 7 additions & 8 deletions docs/deployment/code_examples/google/google_example.py
@@ -6,22 +6,21 @@
import functions_framework
from flask import Request, Response

from crawlee import service_locator
from crawlee.crawlers import (
BeautifulSoupCrawler,
BeautifulSoupCrawlingContext,
)

# highlight-start
# Disable writing storage data to the file system
configuration = service_locator.get_configuration()
configuration.persist_storage = False
configuration.write_metadata = False
# highlight-end
from crawlee.storage_clients import MemoryStorageClient


async def main() -> str:
# highlight-start
# Disable writing storage data to the file system
storage_client = MemoryStorageClient()
# highlight-end

crawler = BeautifulSoupCrawler(
storage_client=storage_client,
max_request_retries=1,
request_handler_timeout=timedelta(seconds=30),
max_requests_per_crawl=10,
@@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await crawler.run(['https://crawlee.dev'])

# Export the entire dataset to a CSV file.
await crawler.export_data_csv(path='results.csv')
await crawler.export_data(path='results.csv')


if __name__ == '__main__':
@@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await crawler.run(['https://crawlee.dev'])

# Export the entire dataset to a JSON file.
await crawler.export_data_json(path='results.json')
await crawler.export_data(path='results.json')


if __name__ == '__main__':
2 changes: 1 addition & 1 deletion docs/examples/code_examples/parsel_crawler.py
@@ -40,7 +40,7 @@ async def some_hook(context: BasicCrawlingContext) -> None:
await crawler.run(['https://github.com'])

# Export the entire dataset to a JSON file.
await crawler.export_data_json(path='results.json')
await crawler.export_data(path='results.json')


if __name__ == '__main__':
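Note: all three export examples above swap the format-specific export_data_csv / export_data_json calls for a single export_data call, which suggests the new method picks the output format from the file extension. A minimal end-to-end sketch under that assumption (the handler and crawler setup here are illustrative, not part of this diff):

import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    crawler = ParselCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])

    # Assuming export_data infers the format from the path suffix,
    # one method covers both of the replaced format-specific calls.
    await crawler.export_data(path='results.csv')
    await crawler.export_data(path='results.json')


if __name__ == '__main__':
    asyncio.run(main())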
@@ -0,0 +1,65 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from crawlee.storage_clients import StorageClient
from crawlee.storage_clients._base import (
    DatasetClient,
    KeyValueStoreClient,
    RequestQueueClient,
)

if TYPE_CHECKING:
    from crawlee.configuration import Configuration

# Implement the storage type clients with your backend logic.


class CustomDatasetClient(DatasetClient):
    # Implement methods like push_data, get_data, iterate_items, etc.
    pass


class CustomKeyValueStoreClient(KeyValueStoreClient):
    # Implement methods like get_value, set_value, delete, etc.
    pass


class CustomRequestQueueClient(RequestQueueClient):
    # Implement methods like add_request, fetch_next_request, etc.
    pass


# Implement the storage client factory.


class CustomStorageClient(StorageClient):
    async def create_dataset_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        configuration: Configuration | None = None,
    ) -> CustomDatasetClient:
        # Create and return your custom dataset client.
        pass

    async def create_kvs_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        configuration: Configuration | None = None,
    ) -> CustomKeyValueStoreClient:
        # Create and return your custom key-value store client.
        pass

    async def create_rq_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        configuration: Configuration | None = None,
    ) -> CustomRequestQueueClient:
        # Create and return your custom request queue client.
        pass
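The snippet above only defines the custom clients and their factory; it does not show them in use. A minimal sketch of how such a client could be plugged in, assuming CustomStorageClient from above is importable and following the same storage_client= pattern as the built-in client examples below:

from crawlee.crawlers import ParselCrawler

# Instantiate the custom storage client defined above (hypothetical usage,
# mirroring the FileSystemStorageClient and MemoryStorageClient examples below).
storage_client = CustomStorageClient()

# And pass it to the crawler.
crawler = ParselCrawler(storage_client=storage_client)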
@@ -0,0 +1,8 @@
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import FileSystemStorageClient

# Create a new instance of storage client.
storage_client = FileSystemStorageClient()

# And pass it to the crawler.
crawler = ParselCrawler(storage_client=storage_client)
@@ -0,0 +1,18 @@
from crawlee.configuration import Configuration
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import FileSystemStorageClient

# Create a new instance of storage client.
storage_client = FileSystemStorageClient()

# Create a configuration with custom settings.
configuration = Configuration(
    storage_dir='./my_storage',
    purge_on_start=False,
)

# And pass them to the crawler.
crawler = ParselCrawler(
    storage_client=storage_client,
    configuration=configuration,
)
@@ -0,0 +1,8 @@
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import MemoryStorageClient

# Create a new instance of storage client.
storage_client = MemoryStorageClient()

# And pass it to the crawler.
crawler = ParselCrawler(storage_client=storage_client)
@@ -0,0 +1,29 @@
import asyncio

from crawlee import service_locator
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import Dataset


async def main() -> None:
    # Create custom storage client, MemoryStorageClient for example.
    storage_client = MemoryStorageClient()

    # Register it globally via the service locator.
    service_locator.set_storage_client(storage_client)

    # Or pass it directly to the crawler, it will be registered globally
    # to the service locator under the hood.
    crawler = ParselCrawler(storage_client=storage_client)

    # Or just provide it when opening a storage (e.g. dataset), it will be used
    # for this storage only, not globally.
    dataset = await Dataset.open(
        name='my_dataset',
        storage_client=storage_client,
    )


if __name__ == '__main__':
    asyncio.run(main())
@@ -1,20 +1,19 @@
import asyncio

from crawlee.crawlers import HttpCrawler
from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import Dataset


async def main() -> None:
storage_client = MemoryStorageClient.from_config()
# Create storage client with configuration
dataset = await Dataset.open(name='my-dataset')

# Call the purge_on_start method to explicitly purge the storage.
# highlight-next-line
await storage_client.purge_on_start()
# Purge the dataset explicitly - purging will remove all items from the dataset.
# But keeps the dataset itself and its metadata.
await dataset.purge()

# Pass the storage client to the crawler.
crawler = HttpCrawler(storage_client=storage_client)

# ...
# Or you can drop the dataset completely, which will remove the dataset
# and all its items.
await dataset.drop()


if __name__ == '__main__':
@@ -6,7 +6,7 @@
async def main() -> None:
# Open the dataset, if it does not exist, it will be created.
# Leave name empty to use the default dataset.
dataset = await Dataset.open()
dataset = await Dataset.open(name='my-dataset')

# Push a single row of data.
await dataset.push_data({'foo': 'bar'})
@@ -7,7 +7,7 @@
async def main() -> None:
# Open the dataset, if it does not exist, it will be created.
# Leave name empty to use the default dataset.
dataset = await Dataset.open()
dataset = await Dataset.open(name='my-dataset')

# Create a new crawler (it can be any subclass of BasicCrawler).
crawler = BeautifulSoupCrawler()
2 changes: 1 addition & 1 deletion docs/guides/code_examples/storages/kvs_basic_example.py
@@ -6,7 +6,7 @@
async def main() -> None:
# Open the key-value store, if it does not exist, it will be created.
# Leave name empty to use the default KVS.
kvs = await KeyValueStore.open()
kvs = await KeyValueStore.open(name='my-key-value-store')

# Set a value associated with 'some-key'.
await kvs.set_value(key='some-key', value={'foo': 'bar'})
@@ -7,7 +7,7 @@
async def main() -> None:
# Open the key-value store, if it does not exist, it will be created.
# Leave name empty to use the default KVS.
kvs = await KeyValueStore.open()
kvs = await KeyValueStore.open(name='my-key-value-store')

# Create a new Playwright crawler.
crawler = PlaywrightCrawler()
2 changes: 1 addition & 1 deletion docs/guides/code_examples/storages/rq_basic_example.py
@@ -12,7 +12,7 @@ async def main() -> None:
await request_queue.add_request('https://apify.com/')

# Add multiple requests as a batch.
await request_queue.add_requests_batched(
await request_queue.add_requests(
['https://crawlee.dev/', 'https://crawlee.dev/python/']
)

@@ -10,12 +10,10 @@ async def main() -> None:
request_queue = await RequestQueue.open(name='my-request-queue')

# Interact with the request queue directly, e.g. add a batch of requests.
await request_queue.add_requests_batched(
['https://apify.com/', 'https://crawlee.dev/']
)
await request_queue.add_requests(['https://apify.com/', 'https://crawlee.dev/'])

# Create a new crawler (it can be any subclass of BasicCrawler) and pass the request
# list as request manager to it. It will be managed by the crawler.
# queue as request manager to it. It will be managed by the crawler.
crawler = HttpCrawler(request_manager=request_queue)

# Define the default request handler, which will be called for every request.
8 changes: 4 additions & 4 deletions docs/guides/request_loaders.mdx
@@ -42,7 +42,7 @@ classDiagram
%% Abstract classes
%% ========================

class BaseStorage {
class Storage {
<<abstract>>
+ id
+ name
@@ -52,12 +52,12 @@ class BaseStorage {

class RequestLoader {
<<abstract>>
+ handled_count
+ total_count
+ fetch_next_request()
+ mark_request_as_handled()
+ is_empty()
+ is_finished()
+ get_handled_count()
+ get_total_count()
+ to_tandem()
}

@@ -92,7 +92,7 @@ class RequestManagerTandem {
%% Inheritance arrows
%% ========================

BaseStorage <|-- RequestQueue
Storage <|-- RequestQueue
RequestManager <|-- RequestQueue

RequestLoader <|-- RequestManager
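The diagram change above pairs with the commit "Request loaders use async getters for handled/total req cnt": the handled_count / total_count properties become awaitable get_handled_count() / get_total_count() methods. A minimal sketch of what calling code might look like under that assumption, using RequestList as one concrete RequestLoader (its import path and constructor signature are assumed here, not taken from this diff):

import asyncio

from crawlee.request_loaders import RequestList


async def main() -> None:
    # Any RequestLoader implementation exposes the interface from the diagram above.
    request_list = RequestList(['https://crawlee.dev/', 'https://apify.com/'])

    # The counters are now async getters rather than plain properties,
    # so they have to be awaited.
    total = await request_list.get_total_count()
    handled = await request_list.get_handled_count()
    print(f'{handled}/{total} requests handled')


if __name__ == '__main__':
    asyncio.run(main())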