From acc293fd9b8121159cbcf44752523216f4635fc0 Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Thu, 9 Jan 2025 13:22:45 +0100 Subject: [PATCH 01/13] Experimental transfer authentication to AzureAD --- .../v2/processes/connectors/sharepoint.py | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/sharepoint.py b/unstructured_ingest/v2/processes/connectors/sharepoint.py index 6376b7a34..e6538806b 100644 --- a/unstructured_ingest/v2/processes/connectors/sharepoint.py +++ b/unstructured_ingest/v2/processes/connectors/sharepoint.py @@ -91,16 +91,33 @@ class SharepointConnectionConfig(ConnectionConfig): access_config: Secret[SharepointAccessConfig] permissions_config: Optional[SharepointPermissionsConfig] = None - @requires_dependencies(["office365"], extras="sharepoint") + @requires_dependencies(["msal"], extras="sharepoint") def get_client(self) -> "ClientContext": - from office365.runtime.auth.client_credential import ClientCredential + from msal import ConfidentialClientApplication from office365.sharepoint.client_context import ClientContext try: - credentials = ClientCredential( - self.client_id, self.access_config.get_secret_value().client_cred + # Acquire the token using MSAL, similar to get_permissions_token + app = ConfidentialClientApplication( + authority=f"{self.permissions_config.authority_url.get_secret_value()}/" + f"{self.permissions_config.permissions_tenant}", + client_id=self.permissions_config.permissions_application_id, + client_credential=self.permissions_config.permissions_client_cred.get_secret_value(), + ) + token_result = app.acquire_token_for_client( + scopes=[ + f"https://{self.permissions_config.permissions_tenant}.sharepoint.com/.default" + ] ) - site_client = ClientContext(self.site).with_credentials(credentials) + if "access_token" not in token_result: + raise SourceConnectionNetworkError( + f"Failed to obtain token for SharePoint: \ + 
{token_result.get('error_description', '')}" + ) + access_token = token_result["access_token"] + + # Then set up the SharePoint client context with that token + site_client = ClientContext(self.site).with_access_token(access_token) except Exception as e: logger.error(f"Couldn't set Sharepoint client: {e}") raise e From 252ccc0bc7018701d7dd84553d32995fd4a26ae8 Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Thu, 9 Jan 2025 13:54:33 +0100 Subject: [PATCH 02/13] Added flake8 bypass for one line --- unstructured_ingest/v2/processes/connectors/sharepoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_ingest/v2/processes/connectors/sharepoint.py b/unstructured_ingest/v2/processes/connectors/sharepoint.py index e6538806b..e8b53a80b 100644 --- a/unstructured_ingest/v2/processes/connectors/sharepoint.py +++ b/unstructured_ingest/v2/processes/connectors/sharepoint.py @@ -102,7 +102,7 @@ def get_client(self) -> "ClientContext": authority=f"{self.permissions_config.authority_url.get_secret_value()}/" f"{self.permissions_config.permissions_tenant}", client_id=self.permissions_config.permissions_application_id, - client_credential=self.permissions_config.permissions_client_cred.get_secret_value(), + client_credential=self.permissions_config.permissions_client_cred.get_secret_value(), # noqa: E501 ) token_result = app.acquire_token_for_client( scopes=[ From 9a3ab3641cce143126b541af71acd007e6286d23 Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Thu, 9 Jan 2025 13:56:21 +0100 Subject: [PATCH 03/13] Version bump --- CHANGELOG.md | 6 ++++++ unstructured_ingest/__version__.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ed5234c3e..71ed5c11e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ ## 0.3.13-dev1 +### Enchancements + +* **Sharepoint connector no longer uses deprecated authorisation method. 
Uses EntraID instead via msal.** + +## 0.3.13-dev1 + ### Fixes * **Fix Snowflake Uploader error** diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py index 07fd6c88a..dff5c63d1 100644 --- a/unstructured_ingest/__version__.py +++ b/unstructured_ingest/__version__.py @@ -1 +1 @@ -__version__ = "0.3.13-dev1" # pragma: no cover +__version__ = "0.3.13-dev2" # pragma: no cover From 1ce2811a16ea920396978d726040249dd5635954 Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Thu, 9 Jan 2025 14:49:00 +0100 Subject: [PATCH 04/13] Added integration test for sharepoint --- .../integration/connectors/test_sharepoint.py | 86 +++++++++++++++++++ .../v2/processes/connectors/sharepoint.py | 2 +- 2 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 test/integration/connectors/test_sharepoint.py diff --git a/test/integration/connectors/test_sharepoint.py b/test/integration/connectors/test_sharepoint.py new file mode 100644 index 000000000..8b8fac415 --- /dev/null +++ b/test/integration/connectors/test_sharepoint.py @@ -0,0 +1,86 @@ +# test_sharepoint_integration.py +import os + +import pytest + +from test.integration.connectors.utils.validation import ( + SourceValidationConfigs, + source_connector_validation, +) +from test.integration.utils import requires_env +from unstructured_ingest.v2.processes.connectors.sharepoint import ( + CONNECTOR_TYPE, + SharepointAccessConfig, + SharepointConnectionConfig, + SharepointDownloader, + SharepointDownloaderConfig, + SharepointIndexer, + SharepointIndexerConfig, + SharepointPermissionsConfig, +) + +SOURCE_TAG = "source" + + +@pytest.mark.asyncio +@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG) +@requires_env("SP_CLIENT_ID", "SP_CLIENT_CRED", "SP_SITE_URL", "SP_TENANT") +async def test_sharepoint_source(temp_dir): + """ + Integration test that: + 1) Creates a SharepointIndexer to list/enumerate items in a given site + 2) Creates a SharepointDownloader to fetch each enumerated item + 3) Runs a 
validation helper to confirm the end-to-end pipeline + """ + client_id = os.getenv("SHAREPOINT_CLIENT_ID") + client_cred = os.getenv("SHAREPOINT_CRED") + tenant = os.getenv("SHAREPOINT_PERMISSIONS_TENANT") + site_url = os.getenv("SHAREPOINT_SITE") + + access_config = SharepointAccessConfig(client_cred=client_cred) + + permissions_config = SharepointPermissionsConfig( + permissions_application_id=None, + permissions_tenant=tenant, + permissions_client_cred=None, # or SecretStr(...) + ) + + connection_config = SharepointConnectionConfig( + client_id=client_id, + site=site_url, + access_config=access_config, + permissions_config=permissions_config, + ) + + index_config = SharepointIndexerConfig( + path=None, + recursive=True, + omit_files=False, + omit_pages=False, + omit_lists=True, + ) + + download_config = SharepointDownloaderConfig( + download_dir=temp_dir # Directory where the files get saved + ) + + indexer = SharepointIndexer( + connection_config=connection_config, + index_config=index_config, + ) + downloader = SharepointDownloader( + connection_config=connection_config, + download_config=download_config, + ) + + expected_files = 7 + + await source_connector_validation( + indexer=indexer, + downloader=downloader, + configs=SourceValidationConfigs( + test_id="sharepoint", + expected_num_files=expected_files, + validate_downloaded_files=True, + ), + ) diff --git a/unstructured_ingest/v2/processes/connectors/sharepoint.py b/unstructured_ingest/v2/processes/connectors/sharepoint.py index e8b53a80b..220c1ff0c 100644 --- a/unstructured_ingest/v2/processes/connectors/sharepoint.py +++ b/unstructured_ingest/v2/processes/connectors/sharepoint.py @@ -102,7 +102,7 @@ def get_client(self) -> "ClientContext": authority=f"{self.permissions_config.authority_url.get_secret_value()}/" f"{self.permissions_config.permissions_tenant}", client_id=self.permissions_config.permissions_application_id, - 
client_credential=self.permissions_config.permissions_client_cred.get_secret_value(), # noqa: E501 + client_credential=self.permissions_config.permissions_client_cred.get_secret_value(), # noqa: E501 ) token_result = app.acquire_token_for_client( scopes=[ From b1a5aa0007d74a014922c343f99404f46abf9e8a Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Thu, 9 Jan 2025 15:10:04 +0100 Subject: [PATCH 05/13] Fixed import --- test/integration/connectors/test_sharepoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/integration/connectors/test_sharepoint.py b/test/integration/connectors/test_sharepoint.py index 8b8fac415..ae2a9a3a5 100644 --- a/test/integration/connectors/test_sharepoint.py +++ b/test/integration/connectors/test_sharepoint.py @@ -3,7 +3,7 @@ import pytest -from test.integration.connectors.utils.validation import ( +from test.integration.connectors.utils.validation.source import ( SourceValidationConfigs, source_connector_validation, ) From ea937fc2fc8a6e593e9d8b7f09b31798439a3127 Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Thu, 9 Jan 2025 15:10:54 +0100 Subject: [PATCH 06/13] Changelog bump --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 71ed5c11e..fca971788 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.3.13-dev1 +## 0.3.13-dev2 ### Enchancements From 9e397616cd6d3e34ab583a88e19e33dd589a29cd Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Thu, 9 Jan 2025 16:13:53 +0100 Subject: [PATCH 07/13] Fixed env vars requirement --- test/integration/connectors/test_sharepoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/integration/connectors/test_sharepoint.py b/test/integration/connectors/test_sharepoint.py index ae2a9a3a5..fcd2c2265 100644 --- a/test/integration/connectors/test_sharepoint.py +++ b/test/integration/connectors/test_sharepoint.py @@ -24,7 +24,7 @@ @pytest.mark.asyncio 
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG) -@requires_env("SP_CLIENT_ID", "SP_CLIENT_CRED", "SP_SITE_URL", "SP_TENANT") +@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "SHAREPOINT_PERMISSIONS_TENANT", "SHAREPOINT_SITE") async def test_sharepoint_source(temp_dir): """ Integration test that: From cf9211c05126447a4f421c6a9f2a5b2af1998aa8 Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Thu, 9 Jan 2025 16:20:10 +0100 Subject: [PATCH 08/13] Lint update --- test/integration/connectors/test_sharepoint.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/integration/connectors/test_sharepoint.py b/test/integration/connectors/test_sharepoint.py index fcd2c2265..bd1de0d70 100644 --- a/test/integration/connectors/test_sharepoint.py +++ b/test/integration/connectors/test_sharepoint.py @@ -24,7 +24,9 @@ @pytest.mark.asyncio @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG) -@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "SHAREPOINT_PERMISSIONS_TENANT", "SHAREPOINT_SITE") +@requires_env( + "SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "SHAREPOINT_PERMISSIONS_TENANT", "SHAREPOINT_SITE" +) async def test_sharepoint_source(temp_dir): """ Integration test that: From c84d20cb74b732c3fa54ad90f0b86dfcebd9e5f1 Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Fri, 10 Jan 2025 14:32:55 +0100 Subject: [PATCH 09/13] Small code review correction --- unstructured_ingest/v2/processes/connectors/sharepoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_ingest/v2/processes/connectors/sharepoint.py b/unstructured_ingest/v2/processes/connectors/sharepoint.py index 220c1ff0c..db5ba182b 100644 --- a/unstructured_ingest/v2/processes/connectors/sharepoint.py +++ b/unstructured_ingest/v2/processes/connectors/sharepoint.py @@ -101,7 +101,7 @@ def get_client(self) -> "ClientContext": app = ConfidentialClientApplication( authority=f"{self.permissions_config.authority_url.get_secret_value()}/" 
f"{self.permissions_config.permissions_tenant}", - client_id=self.permissions_config.permissions_application_id, + client_id=self.client_id, client_credential=self.permissions_config.permissions_client_cred.get_secret_value(), # noqa: E501 ) token_result = app.acquire_token_for_client( From f5cd905a4be1399050a78883765e63ea3a13a37b Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Wed, 29 Jan 2025 13:28:09 +0100 Subject: [PATCH 10/13] Obtaining permission token from graph --- .../v2/processes/connectors/sharepoint.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/sharepoint.py b/unstructured_ingest/v2/processes/connectors/sharepoint.py index db5ba182b..3ace7a950 100644 --- a/unstructured_ingest/v2/processes/connectors/sharepoint.py +++ b/unstructured_ingest/v2/processes/connectors/sharepoint.py @@ -94,21 +94,9 @@ class SharepointConnectionConfig(ConnectionConfig): @requires_dependencies(["msal"], extras="sharepoint") def get_client(self) -> "ClientContext": from msal import ConfidentialClientApplication - from office365.sharepoint.client_context import ClientContext try: - # Acquire the token using MSAL, similar to get_permissions_token - app = ConfidentialClientApplication( - authority=f"{self.permissions_config.authority_url.get_secret_value()}/" - f"{self.permissions_config.permissions_tenant}", - client_id=self.client_id, - client_credential=self.permissions_config.permissions_client_cred.get_secret_value(), # noqa: E501 - ) - token_result = app.acquire_token_for_client( - scopes=[ - f"https://{self.permissions_config.permissions_tenant}.sharepoint.com/.default" - ] - ) + token_result = self.get_permissions_token() if "access_token" not in token_result: raise SourceConnectionNetworkError( f"Failed to obtain token for SharePoint: \ @@ -118,6 +106,7 @@ def get_client(self) -> "ClientContext": # Then set up the SharePoint client context with that token site_client = 
ClientContext(self.site).with_access_token(access_token) + except Exception as e: logger.error(f"Couldn't set Sharepoint client: {e}") raise e From eab8adfcc251a7e07572230bfeeea5b6138b5eca Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Fri, 31 Jan 2025 00:36:44 +0100 Subject: [PATCH 11/13] Couple of fixes to the token handling in test and connector --- test/integration/connectors/test_sharepoint.py | 7 ++++--- unstructured_ingest/v2/processes/connectors/sharepoint.py | 8 ++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/test/integration/connectors/test_sharepoint.py b/test/integration/connectors/test_sharepoint.py index bd1de0d70..81fc9873b 100644 --- a/test/integration/connectors/test_sharepoint.py +++ b/test/integration/connectors/test_sharepoint.py @@ -2,7 +2,7 @@ import os import pytest - +from pydantic import SecretStr from test.integration.connectors.utils.validation.source import ( SourceValidationConfigs, source_connector_validation, @@ -42,9 +42,10 @@ async def test_sharepoint_source(temp_dir): access_config = SharepointAccessConfig(client_cred=client_cred) permissions_config = SharepointPermissionsConfig( - permissions_application_id=None, + permissions_application_id=client_id, + permissions_client_cred=SecretStr(client_cred), permissions_tenant=tenant, - permissions_client_cred=None, # or SecretStr(...) 
+ authority_url=SecretStr("https://login.microsoftonline.com"), ) connection_config = SharepointConnectionConfig( diff --git a/unstructured_ingest/v2/processes/connectors/sharepoint.py b/unstructured_ingest/v2/processes/connectors/sharepoint.py index 3ace7a950..2683e568f 100644 --- a/unstructured_ingest/v2/processes/connectors/sharepoint.py +++ b/unstructured_ingest/v2/processes/connectors/sharepoint.py @@ -91,9 +91,11 @@ class SharepointConnectionConfig(ConnectionConfig): access_config: Secret[SharepointAccessConfig] permissions_config: Optional[SharepointPermissionsConfig] = None - @requires_dependencies(["msal"], extras="sharepoint") + @requires_dependencies(["msal", "office365"], extras="sharepoint") def get_client(self) -> "ClientContext": from msal import ConfidentialClientApplication + from office365.sharepoint.client_context import ClientContext + from office365.runtime.auth.token_manager import TokenManager try: token_result = self.get_permissions_token() @@ -105,7 +107,9 @@ def get_client(self) -> "ClientContext": access_token = token_result["access_token"] # Then set up the SharePoint client context with that token - site_client = ClientContext(self.site).with_access_token(access_token) + site_client = ClientContext(self.site).with_access_token( + TokenManager(lambda: {"tokenType": "Bearer", "accessToken": access_token}) + ) except Exception as e: logger.error(f"Couldn't set Sharepoint client: {e}") From 986d2035064fd2b24bcbce158ffbd5cb47a72c57 Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Fri, 31 Jan 2025 00:43:53 +0100 Subject: [PATCH 12/13] Fixed token bearer function Still unauthorized --- unstructured_ingest/v2/processes/connectors/sharepoint.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/sharepoint.py b/unstructured_ingest/v2/processes/connectors/sharepoint.py index 2683e568f..a6c2f3d51 100644 --- a/unstructured_ingest/v2/processes/connectors/sharepoint.py +++ 
b/unstructured_ingest/v2/processes/connectors/sharepoint.py @@ -95,7 +95,7 @@ class SharepointConnectionConfig(ConnectionConfig): def get_client(self) -> "ClientContext": from msal import ConfidentialClientApplication from office365.sharepoint.client_context import ClientContext - from office365.runtime.auth.token_manager import TokenManager + from types import SimpleNamespace try: token_result = self.get_permissions_token() @@ -106,10 +106,10 @@ def get_client(self) -> "ClientContext": ) access_token = token_result["access_token"] + def token_callback(): + return SimpleNamespace(tokenType="Bearer", accessToken=access_token) # Then set up the SharePoint client context with that token - site_client = ClientContext(self.site).with_access_token( - TokenManager(lambda: {"tokenType": "Bearer", "accessToken": access_token}) - ) + site_client = ClientContext(self.site).with_access_token(token_callback) except Exception as e: logger.error(f"Couldn't set Sharepoint client: {e}") From 644ca1f738342d29e94e14b5b7972f0b021689b9 Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Fri, 31 Jan 2025 11:33:39 +0100 Subject: [PATCH 13/13] Fixed test and connector to work properly with ingest creds --- test/integration/connectors/test_sharepoint.py | 2 +- .../v2/processes/connectors/sharepoint.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/test/integration/connectors/test_sharepoint.py b/test/integration/connectors/test_sharepoint.py index 81fc9873b..4f9795bb9 100644 --- a/test/integration/connectors/test_sharepoint.py +++ b/test/integration/connectors/test_sharepoint.py @@ -45,7 +45,7 @@ async def test_sharepoint_source(temp_dir): permissions_application_id=client_id, permissions_client_cred=SecretStr(client_cred), permissions_tenant=tenant, - authority_url=SecretStr("https://login.microsoftonline.com"), + authority_url=SecretStr("https://login.microsoftonline.com/"), ) connection_config = SharepointConnectionConfig( diff --git 
a/unstructured_ingest/v2/processes/connectors/sharepoint.py b/unstructured_ingest/v2/processes/connectors/sharepoint.py index a6c2f3d51..b02b5f388 100644 --- a/unstructured_ingest/v2/processes/connectors/sharepoint.py +++ b/unstructured_ingest/v2/processes/connectors/sharepoint.py @@ -98,7 +98,8 @@ def get_client(self) -> "ClientContext": from types import SimpleNamespace try: - token_result = self.get_permissions_token() + token_scope = f"{self.site}/.default" + token_result = self.get_permissions_token(token_scope=token_scope) if "access_token" not in token_result: raise SourceConnectionNetworkError( f"Failed to obtain token for SharePoint: \ @@ -117,18 +118,18 @@ def token_callback(): return site_client @requires_dependencies(["msal"], extras="sharepoint") - def get_permissions_token(self): + def get_permissions_token(self, token_scope: Optional[str] = "https://graph.microsoft.com/.default"): from msal import ConfidentialClientApplication try: client_credential = self.permissions_config.permissions_client_cred.get_secret_value() app = ConfidentialClientApplication( - authority=f"{self.permissions_config.authority_url.get_secret_value()}/" + authority=f"{self.permissions_config.authority_url.get_secret_value()}" f"{self.permissions_config.permissions_tenant}", client_id=self.permissions_config.permissions_application_id, client_credential=client_credential, ) - token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"]) + token = app.acquire_token_for_client(scopes=[token_scope]) except ValueError as exc: logger.error("Couldn't set up credentials for Sharepoint") raise exc