Commit f722d47

tw4l and emma-sg authored

Shareable workflows (API + frontend) (#2783)
Fixes #2784
Part of #2762

Changes:
- New `CrawlConfig.shareable` field that can be set via the POST and PATCH endpoints
- New public replay.json API endpoint for workflows that returns the replay.json for the last successful crawl (or a 404 if the workflow is not found, not shareable, or does not have at least one successfully finished crawl)
- New page search API endpoint for shareable workflows to support loading and searching the pages in a workflow's last successful crawl (or a 404 under the same conditions as above)
- Tests for all added functionality

---------

Co-authored-by: Emma Segal-Grossman <[email protected]>
1 parent ce04751 commit f722d47

10 files changed: +239 −13 lines changed
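
For orientation, a minimal sketch of how a client might call the two new public endpoints once a workflow has been marked shareable. The base URL, org ID, and workflow ID below are hypothetical placeholders, not values from this commit; the request pattern mirrors the tests added in backend/test/test_crawlconfigs.py.

    import requests

    # Hypothetical placeholders, not values from this commit
    BASE_URL = "https://btrix.example.com"
    ORG_ID = "<org-uuid>"
    WORKFLOW_ID = "<workflow-uuid>"

    # Public replay.json for the last successful crawl of a shareable workflow;
    # returns 404 if the workflow is missing, not shareable, or has no successful crawl
    replay = requests.get(
        f"{BASE_URL}/api/orgs/{ORG_ID}/crawlconfigs/{WORKFLOW_ID}/public/replay.json"
    )
    replay.raise_for_status()
    data = replay.json()

    # pagesQueryUrl points at the new public pagesSearch endpoint for this workflow
    pages = requests.get(
        data["pagesQueryUrl"], params={"search": "example", "pageSize": 10}
    )
    pages.raise_for_status()
    for page in pages.json()["items"]:
        print(page["url"])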

backend/btrixcloud/basecrawls.py

Lines changed: 12 additions & 3 deletions
@@ -166,6 +166,7 @@ async def get_crawl_out(
         type_: Optional[str] = None,
         skip_resources=False,
         headers: Optional[dict] = None,
+        cid: Optional[UUID] = None,
     ) -> CrawlOutWithResources:
         """Get crawl data for api output"""
         res = await self.get_crawl_raw(crawlid, org, type_)
@@ -187,9 +188,17 @@ async def get_crawl_out(
         oid = res.get("oid")
         if oid:
             origin = get_origin(headers)
-            res["pagesQueryUrl"] = (
-                origin + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
-            )
+            # If cid is passed, construct pagesSearch query for public
+            # shareable workflow
+            if cid:
+                res["pagesQueryUrl"] = (
+                    origin
+                    + f"/api/orgs/{oid}/crawlconfigs/{cid}/public/pagesSearch"
+                )
+            else:
+                res["pagesQueryUrl"] = (
+                    origin + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
+                )
 
         # this will now disable the downloadUrl in RWP
         res["downloadUrl"] = None

backend/btrixcloud/crawlconfigs.py

Lines changed: 65 additions & 1 deletion
@@ -16,7 +16,7 @@
 import urllib.parse
 
 import aiohttp
-from fastapi import APIRouter, Depends, HTTPException, Query
+from fastapi import APIRouter, Depends, HTTPException, Query, Request, Response
 import pymongo
 
 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
@@ -27,17 +27,20 @@
     CrawlConfigOut,
     CrawlConfigTags,
     CrawlOut,
+    CrawlOutWithResources,
     UpdateCrawlConfig,
     Organization,
     User,
     PaginatedCrawlConfigOutResponse,
     PaginatedSeedResponse,
     PaginatedConfigRevisionResponse,
+    SUCCESSFUL_STATES,
     FAILED_STATES,
     CrawlerChannel,
     CrawlerChannels,
     StartedResponse,
     SuccessResponse,
+    EmptyResponse,
     CrawlConfigAddedResponse,
     CrawlConfigSearchValues,
     CrawlConfigUpdateResponse,
@@ -339,6 +342,7 @@ async def add_crawl_config(
             proxyId=config_in.proxyId,
             firstSeed=first_seed,
             seedCount=seed_count,
+            shareable=config_in.shareable,
         )
 
         if config_in.runNow:
@@ -573,6 +577,9 @@ async def update_crawl_config(
         changed = changed or self.check_attr_changed(
             orig_crawl_config, update, "browserWindows"
         )
+        changed = changed or (
+            self.check_attr_changed(orig_crawl_config, update, "shareable")
+        )
 
         schedule_changed = self.check_attr_changed(
             orig_crawl_config, update, "schedule"
@@ -849,6 +856,30 @@ async def get_running_crawl(self, cid: UUID) -> Optional[CrawlOut]:
 
         return None
 
+    async def get_last_successful_crawl_out(
+        self,
+        cid: UUID,
+        org: Organization,
+        request: Optional[Request] = None,
+    ) -> Optional[CrawlOutWithResources]:
+        """Return the last successful crawl out with resources for this config, if any"""
+        headers = dict(request.headers) if request else None
+        match_query = {
+            "cid": cid,
+            "oid": org.id,
+            "finished": {"$ne": None},
+            "state": {"$in": SUCCESSFUL_STATES},
+        }
+        last_crawl = await self.crawls.find_one(
+            match_query, sort=[("finished", pymongo.DESCENDING)]
+        )
+        if last_crawl:
+            return await self.crawl_ops.get_crawl_out(
+                last_crawl["_id"], org, "crawl", headers=headers, cid=cid
+            )
+
+        return None
+
     async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
         """recompute stats by incrementing size counter and number of crawls"""
         update_query: dict[str, object] = {}
@@ -1503,6 +1534,7 @@ def init_crawl_config_api(
 
     org_crawl_dep = org_ops.org_crawl_dep
     org_viewer_dep = org_ops.org_viewer_dep
+    org_public = org_ops.org_public
 
     @router.get("", response_model=PaginatedCrawlConfigOutResponse)
     async def get_crawl_configs(
@@ -1619,6 +1651,38 @@ async def get_all_crawler_proxies(
 
         return ops.get_crawler_proxies()
 
+    @app.get(
+        "/orgs/{oid}/crawlconfigs/{cid}/public/replay.json",
+        response_model=CrawlOutWithResources,
+    )
+    async def get_crawl_config_latest_crawl_public_replay(
+        request: Request,
+        response: Response,
+        cid: UUID,
+        org: Organization = Depends(org_public),
+    ):
+        crawl_config = await ops.get_crawl_config(cid, org.id, active_only=True)
+        if not crawl_config.shareable:
+            raise HTTPException(status_code=404, detail="crawl_config_not_found")
+
+        last_successful_crawl_out = await ops.get_last_successful_crawl_out(
+            cid, org, request
+        )
+
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return last_successful_crawl_out
+
+    @app.options(
+        "orgs/{oid}/crawlconfigs/{cid}/public/replay.json",
+        response_model=EmptyResponse,
+    )
+    async def get_replay_preflight(response: Response):
+        response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return {}
+
     @router.get("/{cid}/seeds", response_model=PaginatedSeedResponse)
     async def get_crawl_config_seeds(
         cid: UUID,

backend/btrixcloud/main.py

Lines changed: 1 addition & 0 deletions
@@ -269,6 +269,7 @@ def main() -> None:
         storage_ops,
         background_job_ops,
         coll_ops,
+        crawl_config_ops,
         current_active_user,
     )

backend/btrixcloud/models.py

Lines changed: 5 additions & 0 deletions
@@ -406,6 +406,8 @@ class CrawlConfigIn(BaseModel):
 
     crawlFilenameTemplate: Optional[str] = None
 
+    shareable: bool = False
+
 
 # ============================================================================
 class ConfigRevision(BaseMongoModel):
@@ -497,6 +499,8 @@ class CrawlConfigAdditional(BaseModel):
 
     crawlFilenameTemplate: Optional[str] = None
 
+    shareable: Optional[bool] = False
+
 
 # ============================================================================
 class CrawlConfig(CrawlConfigCore, CrawlConfigAdditional):
@@ -555,6 +559,7 @@ class UpdateCrawlConfig(BaseModel):
     browserWindows: Optional[BrowserWindowCount] = None
     crawlFilenameTemplate: Optional[str] = None
     config: Optional[RawCrawlConfig] = None
+    shareable: Optional[bool] = None
 
 
 # ============================================================================

backend/btrixcloud/pages.py

Lines changed: 50 additions & 1 deletion
@@ -1084,7 +1084,15 @@ async def process_finished_crawls():
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
 def init_pages_api(
-    app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops, user_dep
+    app,
+    mdb,
+    crawl_ops,
+    org_ops,
+    storage_ops,
+    background_job_ops,
+    coll_ops,
+    crawl_config_ops,
+    user_dep,
 ) -> PageOps:
     """init pages API"""
     # pylint: disable=invalid-name
@@ -1336,6 +1344,47 @@ async def get_search_pages_list(
         )
         return {"items": pages}
 
+    @app.get(
+        "/orgs/{oid}/crawlconfigs/{cid}/public/pagesSearch",
+        tags=["pages", "crawlconfigs"],
+        response_model=PageOutItemsResponse,
+    )
+    async def get_search_pages_list_shareable_crawl_config(
+        cid: UUID,
+        org: Organization = Depends(org_public),
+        search: Optional[str] = None,
+        url: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        isSeed: Optional[bool] = None,
+        depth: Optional[int] = None,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+    ):
+        """Retrieve paginated list of pages for last successful crawl of workflow"""
+        crawl_config = await crawl_config_ops.get_crawl_config(
+            cid, org.id, active_only=True
+        )
+        if not crawl_config.shareable:
+            raise HTTPException(status_code=404, detail="crawl_config_not_found")
+
+        last_successful_crawl_out = (
+            await crawl_config_ops.get_last_successful_crawl_out(cid, org)
+        )
+
+        pages, _ = await ops.list_pages(
+            crawl_ids=[last_successful_crawl_out.id],
+            search=search,
+            url=url,
+            ts=ts,
+            is_seed=isSeed,
+            depth=depth,
+            org=org,
+            page_size=pageSize,
+            page=page,
+            include_total=False,
+        )
+        return {"items": pages}
+
     @app.get(
         "/orgs/{oid}/collections/{coll_id}/public/pages",
         tags=["pages", "collections"],

backend/test/test_crawlconfigs.py

Lines changed: 79 additions & 0 deletions
@@ -995,6 +995,85 @@ def test_delete_seed_file_in_use_crawlconfig(
     assert r.json()["id"] == seed_file_id
 
 
+def test_shareable_workflow(admin_auth_headers, default_org_id, admin_crawl_id):
+    # Verify workflow is not shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json()["shareable"] is False
+
+    # Verify public replay.json returns 404 while not shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/replay.json"
+    )
+    assert r.status_code == 404
+    assert r.json()["detail"] == "crawl_config_not_found"
+
+    # Verify public pagesSearch endpoint returns 404 while not shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/pagesSearch"
+    )
+    assert r.status_code == 404
+
+    # Mark workflow as shareable
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/",
+        headers=admin_auth_headers,
+        json={"shareable": True},
+    )
+    assert r.status_code == 200
+
+    data = r.json()
+    assert data["updated"]
+    assert data["settings_changed"]
+    assert data["metadata_changed"] is False
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json()["shareable"]
+
+    # Verify public replay.json returns last successful crawl while shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/replay.json"
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["id"] == admin_crawl_id
+    assert data["oid"] == default_org_id
+    assert data["cid"] == _admin_crawl_cid
+    assert data["type"] == "crawl"
+    assert data["state"] == "complete"
+
+    resources = data["resources"]
+    assert resources
+    assert resources[0]["path"]
+
+    assert len(data["initialPages"]) == 4
+
+    pages_query_url = data["pagesQueryUrl"]
+    assert pages_query_url.endswith(
+        f"/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/pagesSearch"
+    )
+    assert data["downloadUrl"] is None
+
+    # Verify pages search endpoint is accessible and works
+    r = requests.get(pages_query_url)
+    assert r.status_code == 200
+    data = r.json()
+    assert data["items"]
+    for page in data["items"]:
+        assert page["id"]
+        assert page["oid"] == default_org_id
+        assert page["crawl_id"] == admin_crawl_id
+        assert page["url"]
+
+
 def test_add_crawl_config_fail_on_content_check_no_profile(
     crawler_auth_headers, default_org_id, sample_crawl_data
 ):

frontend/src/features/crawl-workflows/templates/shareable-notice.ts

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+import { msg } from "@lit/localize";
+import { html } from "lit";
+
+export const ShareableNotice = () =>
+  html`<btrix-popover
+    content=${msg(
+      "The latest crawl from this workflow is publicly accessible to anyone with the link. This can be changed with the Browsertrix API.",
+    )}
+  >
+    <btrix-badge class="part-[base]:min-h-5" variant="warning">
+      <sl-icon name="info-circle" class="align-icon mr-1"></sl-icon>
+      ${msg("Public")}
+    </btrix-badge>
+  </btrix-popover>`;

frontend/src/features/crawl-workflows/workflow-list.ts

Lines changed: 5 additions & 1 deletion
@@ -19,6 +19,9 @@ import {
   query,
   queryAssignedElements,
 } from "lit/decorators.js";
+import { when } from "lit/directives/when.js";
+
+import { ShareableNotice } from "./templates/shareable-notice";
 
 import { BtrixElement } from "@/classes/BtrixElement";
 import type { OverflowDropdown } from "@/components/ui/overflow-dropdown";
@@ -250,7 +253,8 @@ export class WorkflowListItem extends BtrixElement {
         }}
       >
         <div class="col">
-          <div class="detail url truncate">
+          <div class="detail url items-center truncate">
+            ${when(this.workflow?.shareable, ShareableNotice)}
             ${this.safeRender(this.renderName)}
           </div>
           <div class="desc">
