diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d850d345..91f59667b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## 0.3.16-dev1 + +### Enhancements + +* **Sharepoint connector no longer uses deprecated authorisation method. Uses EntraID instead via msal.** + ## 0.3.16-dev0 ### Fixes diff --git a/test/integration/connectors/test_sharepoint.py b/test/integration/connectors/test_sharepoint.py new file mode 100644 index 000000000..4f9795bb9 --- /dev/null +++ b/test/integration/connectors/test_sharepoint.py @@ -0,0 +1,89 @@ +# test_sharepoint_integration.py +import os + +import pytest +from pydantic import SecretStr +from test.integration.connectors.utils.validation.source import ( + SourceValidationConfigs, + source_connector_validation, +) +from test.integration.utils import requires_env +from unstructured_ingest.v2.processes.connectors.sharepoint import ( + CONNECTOR_TYPE, + SharepointAccessConfig, + SharepointConnectionConfig, + SharepointDownloader, + SharepointDownloaderConfig, + SharepointIndexer, + SharepointIndexerConfig, + SharepointPermissionsConfig, +) + +SOURCE_TAG = "source" + + +@pytest.mark.asyncio +@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG) +@requires_env( + "SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "SHAREPOINT_PERMISSIONS_TENANT", "SHAREPOINT_SITE" +) +async def test_sharepoint_source(temp_dir): + """ + Integration test that: + 1) Creates a SharepointIndexer to list/enumerate items in a given site + 2) Creates a SharepointDownloader to fetch each enumerated item + 3) Runs a validation helper to confirm the end-to-end pipeline + """ + client_id = os.getenv("SHAREPOINT_CLIENT_ID") + client_cred = os.getenv("SHAREPOINT_CRED") + tenant = os.getenv("SHAREPOINT_PERMISSIONS_TENANT") + site_url = os.getenv("SHAREPOINT_SITE") + + access_config = SharepointAccessConfig(client_cred=client_cred) + + permissions_config = SharepointPermissionsConfig( + permissions_application_id=client_id, + permissions_client_cred=SecretStr(client_cred), + 
permissions_tenant=tenant, + authority_url=SecretStr("https://login.microsoftonline.com/"), + ) + + connection_config = SharepointConnectionConfig( + client_id=client_id, + site=site_url, + access_config=access_config, + permissions_config=permissions_config, + ) + + index_config = SharepointIndexerConfig( + path=None, + recursive=True, + omit_files=False, + omit_pages=False, + omit_lists=True, + ) + + download_config = SharepointDownloaderConfig( + download_dir=temp_dir # Directory where the files get saved + ) + + indexer = SharepointIndexer( + connection_config=connection_config, + index_config=index_config, + ) + downloader = SharepointDownloader( + connection_config=connection_config, + download_config=download_config, + ) + + expected_files = 7 + + await source_connector_validation( + indexer=indexer, + downloader=downloader, + configs=SourceValidationConfigs( + test_id="sharepoint", + expected_num_files=expected_files, + validate_downloaded_files=True, + ), + ) diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py index 649b22338..5b02e498d 100644 --- a/unstructured_ingest/__version__.py +++ b/unstructured_ingest/__version__.py @@ -1 +1 @@ -__version__ = "0.3.16-dev0" # pragma: no cover +__version__ = "0.3.16-dev1" # pragma: no cover diff --git a/unstructured_ingest/v2/processes/connectors/sharepoint.py b/unstructured_ingest/v2/processes/connectors/sharepoint.py index 6376b7a34..b02b5f388 100644 --- a/unstructured_ingest/v2/processes/connectors/sharepoint.py +++ b/unstructured_ingest/v2/processes/connectors/sharepoint.py @@ -91,34 +91,45 @@ class SharepointConnectionConfig(ConnectionConfig): access_config: Secret[SharepointAccessConfig] permissions_config: Optional[SharepointPermissionsConfig] = None - @requires_dependencies(["office365"], extras="sharepoint") + @requires_dependencies(["msal", "office365"], extras="sharepoint") def get_client(self) -> "ClientContext": - from office365.runtime.auth.client_credential import 
ClientCredential + from msal import ConfidentialClientApplication from office365.sharepoint.client_context import ClientContext + from types import SimpleNamespace try: - credentials = ClientCredential( - self.client_id, self.access_config.get_secret_value().client_cred - ) - site_client = ClientContext(self.site).with_credentials(credentials) + token_scope = f"{self.site}/.default" + token_result = self.get_permissions_token(token_scope=token_scope) + if "access_token" not in token_result: + raise SourceConnectionNetworkError( + f"Failed to obtain token for SharePoint: \ + {token_result.get('error_description', '')}" + ) + access_token = token_result["access_token"] + + def token_callback(): + return SimpleNamespace(tokenType="Bearer", accessToken=access_token) + # Then set up the SharePoint client context with that token + site_client = ClientContext(self.site).with_access_token(token_callback) + except Exception as e: logger.error(f"Couldn't set Sharepoint client: {e}") raise e return site_client @requires_dependencies(["msal"], extras="sharepoint") - def get_permissions_token(self): + def get_permissions_token(self, token_scope: Optional[str] = "https://graph.microsoft.com/.default"): from msal import ConfidentialClientApplication try: client_credential = self.permissions_config.permissions_client_cred.get_secret_value() app = ConfidentialClientApplication( - authority=f"{self.permissions_config.authority_url.get_secret_value()}/" + authority=f"{self.permissions_config.authority_url.get_secret_value()}" f"{self.permissions_config.permissions_tenant}", client_id=self.permissions_config.permissions_application_id, client_credential=client_credential, ) - token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"]) + token = app.acquire_token_for_client(scopes=[token_scope]) except ValueError as exc: logger.error("Couldn't set up credentials for Sharepoint") raise exc