diff --git a/docs/examples/code_examples/using_browser_profiles_chrome.py b/docs/examples/code_examples/using_browser_profiles_chrome.py
new file mode 100644
index 0000000000..55cd4d685b
--- /dev/null
+++ b/docs/examples/code_examples/using_browser_profiles_chrome.py
@@ -0,0 +1,55 @@
+import asyncio
+import shutil
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
+
+# Name of the Chrome profile to reuse ('Default' when only one profile exists)
+PROFILE_NAME = 'Default'
+
+# Chrome "User Data" directory holding the profiles (Windows layout shown).
+# Open `chrome://version/` to look up the location on your machine.
+PROFILE_PATH = Path.home() / 'AppData' / 'Local' / 'Google' / 'Chrome' / 'User Data'
+
+
+async def main() -> None:
+    """Crawl with a temporary copy of the local Chrome profile."""
+    # Work on a throwaway copy that is removed when the block exits
+    with TemporaryDirectory(prefix='crawlee-') as scratch_dir:
+        user_data_root = Path(scratch_dir)
+        profile_source = PROFILE_PATH / PROFILE_NAME
+        profile_copy = user_data_root / PROFILE_NAME
+
+        # Duplicate the selected profile into the temporary user data directory
+        shutil.copytree(profile_source, profile_copy, dirs_exist_ok=True)
+
+        launch_options = {
+            # Drive the Chrome build installed on the system
+            'channel': 'chrome',
+            # Slow every action down to look more like a human user
+            'slow_mo': 200,
+            # Select which profile inside the user data directory to open
+            'args': [f'--profile-directory={PROFILE_NAME}'],
+        }
+
+        crawler = PlaywrightCrawler(
+            # Chrome is launched through the chromium browser type
+            browser_type='chromium',
+            headless=False,
+            # No generated fingerprint, so the profile's identity is kept
+            fingerprint_generator=None,
+            # Point the browser at the temporary user data directory
+            user_data_dir=user_data_root,
+            browser_launch_options=launch_options,
+        )
+
+        @crawler.router.default_handler
+        async def default_handler(context: PlaywrightCrawlingContext) -> None:
+            context.log.info(f'Visiting {context.request.url}')
+
+        await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/examples/code_examples/using_browser_profiles_firefox.py
b/docs/examples/code_examples/using_browser_profiles_firefox.py
new file mode 100644
index 0000000000..8510269efc
--- /dev/null
+++ b/docs/examples/code_examples/using_browser_profiles_firefox.py
@@ -0,0 +1,56 @@
+import asyncio
+from pathlib import Path
+
+from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
+
+# Replace this with your actual Firefox profile name
+# Find it at about:profiles in Firefox
+PROFILE_NAME = 'your-profile-name-here'
+
+# Path to your Firefox profile directory (example for Windows)
+# Use `about:profiles` to find your profile path
+PROFILE_PATH = Path(
+    Path.home(), 'AppData', 'Roaming', 'Mozilla', 'Firefox', 'Profiles', PROFILE_NAME
+)
+
+
+async def main() -> None:
+    """Crawl using the local Firefox profile at `PROFILE_PATH`.
+
+    Raises:
+        FileNotFoundError: If `PROFILE_PATH` is not an existing directory.
+    """
+    # Stop early when the profile directory is missing (for example, when the
+    # placeholder `PROFILE_NAME` was not replaced) instead of starting the
+    # browser with a path that does not contain your profile.
+    if not PROFILE_PATH.is_dir():
+        raise FileNotFoundError(
+            f'Firefox profile not found: {PROFILE_PATH}. '
+            'Set PROFILE_NAME to a profile listed at about:profiles.'
+        )
+
+    crawler = PlaywrightCrawler(
+        # Use Firefox browser type
+        browser_type='firefox',
+        # Disable fingerprints to use the profile as is
+        fingerprint_generator=None,
+        headless=False,
+        # Path to your Firefox profile
+        user_data_dir=PROFILE_PATH,
+        browser_launch_options={
+            'args': [
+                # Required to avoid version conflicts
+                '--allow-downgrade',
+            ],
+        },
+    )
+
+    @crawler.router.default_handler
+    async def default_handler(context: PlaywrightCrawlingContext) -> None:
+        context.log.info(f'Visiting {context.request.url}')
+
+    await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/examples/using_browser_profile.mdx b/docs/examples/using_browser_profile.mdx
new file mode 100644
index 0000000000..a991a8012f
--- /dev/null
+++ b/docs/examples/using_browser_profile.mdx
@@ -0,0 +1,41 @@
+---
+id: using_browser_profile
+title: Using browser profile
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+
+import CodeBlock from '@theme/CodeBlock';
+
+import ChromeProfileExample from '!!raw-loader!./code_examples/using_browser_profiles_chrome.py';
+import FirefoxProfileExample from '!!raw-loader!./code_examples/using_browser_profiles_firefox.py';
+
+This example demonstrates how to run
<ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> using your local browser profile from [Chrome](https://www.google.com/intl/us/chrome/) or [Firefox](https://www.firefox.com/).
+
+Using browser profiles allows you to leverage existing login sessions, saved passwords, bookmarks, and other personalized browser data during crawling. This can be particularly useful for testing scenarios or when you need to access content that requires authentication.
+
+## Chrome browser
+
+To run <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> with your Chrome profile, you need to know the path to your profile files. You can find this information by entering `chrome://version/` as a URL in your Chrome browser. If you have multiple profiles, pay attention to the profile name - if you only have one profile, it's always `Default`.
+
+You also need to use the [`channel`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-option-channel) parameter in `browser_launch_options` to use the Chrome browser installed on your system instead of Playwright's Chromium.
+
+:::warning Profile access limitation
+Due to [Chrome's security policies](https://developer.chrome.com/blog/remote-debugging-port), automation cannot use your main browsing profile directly. The example copies your profile to a temporary location as a workaround.
+:::
+
+Make sure you don't have any running Chrome browser processes before running this code:
+
+<CodeBlock className="language-python">
+    {ChromeProfileExample}
+</CodeBlock>
+
+## Firefox browser
+
+To find the path to your Firefox profile, enter `about:profiles` as a URL in your Firefox browser. Unlike Chrome, you can use your standard profile path directly without copying it first.
+
+Make sure you don't have any running Firefox browser processes before running this code:
+
+<CodeBlock className="language-python">
+    {FirefoxProfileExample}
+</CodeBlock>