Commit f722d47

tw4l and emma-sg authored

Shareable workflows (API + frontend) (#2783)
Fixes #2784
Part of #2762

Changes:
- New `CrawlConfig.shareable` field that can be set via the POST and PATCH endpoints
- New public replay.json API endpoint for workflows that returns the replay.json for the last successful crawl (or a 404 if the workflow is not found, not shareable, or does not have at least one successfully finished crawl)
- New page search API endpoint for shareable workflows to support loading and searching the pages in a workflow's last successful crawl (or a 404 under the same conditions as above)
- Tests for all added functionality

---------

Co-authored-by: Emma Segal-Grossman <[email protected]>
1 parent ce04751 commit f722d47

10 files changed: +239 −13 lines changed
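
For orientation, a minimal sketch of how a client might call the two new public endpoints once a workflow has been marked shareable. The base URL, org ID, and workflow ID below are hypothetical placeholders, not values from this commit; the request pattern mirrors the tests added in backend/test/test_crawlconfigs.py.

    import requests

    # Hypothetical placeholders, not values from this commit
    BASE_URL = "https://btrix.example.com"
    ORG_ID = "<org-uuid>"
    WORKFLOW_ID = "<workflow-uuid>"

    # Public replay.json for the last successful crawl of a shareable workflow;
    # returns 404 if the workflow is missing, not shareable, or has no successful crawl
    replay = requests.get(
        f"{BASE_URL}/api/orgs/{ORG_ID}/crawlconfigs/{WORKFLOW_ID}/public/replay.json"
    )
    replay.raise_for_status()
    data = replay.json()

    # pagesQueryUrl points at the new public pagesSearch endpoint for this workflow
    pages = requests.get(
        data["pagesQueryUrl"], params={"search": "example", "pageSize": 10}
    )
    pages.raise_for_status()
    for page in pages.json()["items"]:
        print(page["url"])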

backend/btrixcloud/basecrawls.py

Lines changed: 12 additions & 3 deletions
@@ -166,6 +166,7 @@ async def get_crawl_out(
         type_: Optional[str] = None,
         skip_resources=False,
         headers: Optional[dict] = None,
+        cid: Optional[UUID] = None,
     ) -> CrawlOutWithResources:
         """Get crawl data for api output"""
         res = await self.get_crawl_raw(crawlid, org, type_)
@@ -187,9 +188,17 @@ async def get_crawl_out(
         oid = res.get("oid")
         if oid:
             origin = get_origin(headers)
-            res["pagesQueryUrl"] = (
-                origin + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
-            )
+            # If cid is passed, construct pagesSearch query for public
+            # shareable workflow
+            if cid:
+                res["pagesQueryUrl"] = (
+                    origin
+                    + f"/api/orgs/{oid}/crawlconfigs/{cid}/public/pagesSearch"
+                )
+            else:
+                res["pagesQueryUrl"] = (
+                    origin + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
+                )
 
         # this will now disable the downloadUrl in RWP
         res["downloadUrl"] = None

backend/btrixcloud/crawlconfigs.py

Lines changed: 65 additions & 1 deletion
@@ -16,7 +16,7 @@
 import urllib.parse
 
 import aiohttp
-from fastapi import APIRouter, Depends, HTTPException, Query
+from fastapi import APIRouter, Depends, HTTPException, Query, Request, Response
 import pymongo
 
 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
@@ -27,17 +27,20 @@
     CrawlConfigOut,
     CrawlConfigTags,
     CrawlOut,
+    CrawlOutWithResources,
     UpdateCrawlConfig,
     Organization,
     User,
     PaginatedCrawlConfigOutResponse,
     PaginatedSeedResponse,
     PaginatedConfigRevisionResponse,
+    SUCCESSFUL_STATES,
     FAILED_STATES,
     CrawlerChannel,
     CrawlerChannels,
     StartedResponse,
     SuccessResponse,
+    EmptyResponse,
     CrawlConfigAddedResponse,
     CrawlConfigSearchValues,
     CrawlConfigUpdateResponse,
@@ -339,6 +342,7 @@ async def add_crawl_config(
             proxyId=config_in.proxyId,
             firstSeed=first_seed,
             seedCount=seed_count,
+            shareable=config_in.shareable,
         )
 
         if config_in.runNow:
@@ -573,6 +577,9 @@ async def update_crawl_config(
         changed = changed or self.check_attr_changed(
             orig_crawl_config, update, "browserWindows"
         )
+        changed = changed or (
+            self.check_attr_changed(orig_crawl_config, update, "shareable")
+        )
 
         schedule_changed = self.check_attr_changed(
             orig_crawl_config, update, "schedule"
@@ -849,6 +856,30 @@ async def get_running_crawl(self, cid: UUID) -> Optional[CrawlOut]:
 
         return None
 
+    async def get_last_successful_crawl_out(
+        self,
+        cid: UUID,
+        org: Organization,
+        request: Optional[Request] = None,
+    ) -> Optional[CrawlOutWithResources]:
+        """Return the last successful crawl out with resources for this config, if any"""
+        headers = dict(request.headers) if request else None
+        match_query = {
+            "cid": cid,
+            "oid": org.id,
+            "finished": {"$ne": None},
+            "state": {"$in": SUCCESSFUL_STATES},
+        }
+        last_crawl = await self.crawls.find_one(
+            match_query, sort=[("finished", pymongo.DESCENDING)]
+        )
+        if last_crawl:
+            return await self.crawl_ops.get_crawl_out(
+                last_crawl["_id"], org, "crawl", headers=headers, cid=cid
+            )
+
+        return None
+
     async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
         """recompute stats by incrementing size counter and number of crawls"""
         update_query: dict[str, object] = {}
@@ -1503,6 +1534,7 @@ def init_crawl_config_api(
 
     org_crawl_dep = org_ops.org_crawl_dep
     org_viewer_dep = org_ops.org_viewer_dep
+    org_public = org_ops.org_public
 
     @router.get("", response_model=PaginatedCrawlConfigOutResponse)
     async def get_crawl_configs(
@@ -1619,6 +1651,38 @@ async def get_all_crawler_proxies(
 
         return ops.get_crawler_proxies()
 
+    @app.get(
+        "/orgs/{oid}/crawlconfigs/{cid}/public/replay.json",
+        response_model=CrawlOutWithResources,
+    )
+    async def get_crawl_config_latest_crawl_public_replay(
+        request: Request,
+        response: Response,
+        cid: UUID,
+        org: Organization = Depends(org_public),
+    ):
+        crawl_config = await ops.get_crawl_config(cid, org.id, active_only=True)
+        if not crawl_config.shareable:
+            raise HTTPException(status_code=404, detail="crawl_config_not_found")
+
+        last_successful_crawl_out = await ops.get_last_successful_crawl_out(
+            cid, org, request
+        )
+
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return last_successful_crawl_out
+
+    @app.options(
+        "orgs/{oid}/crawlconfigs/{cid}/public/replay.json",
+        response_model=EmptyResponse,
+    )
+    async def get_replay_preflight(response: Response):
+        response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return {}
+
     @router.get("/{cid}/seeds", response_model=PaginatedSeedResponse)
     async def get_crawl_config_seeds(
         cid: UUID,

backend/btrixcloud/main.py

Lines changed: 1 addition & 0 deletions
@@ -269,6 +269,7 @@ def main() -> None:
         storage_ops,
         background_job_ops,
         coll_ops,
+        crawl_config_ops,
         current_active_user,
     )

backend/btrixcloud/models.py

Lines changed: 5 additions & 0 deletions
@@ -406,6 +406,8 @@ class CrawlConfigIn(BaseModel):
 
     crawlFilenameTemplate: Optional[str] = None
 
+    shareable: bool = False
+
 
 # ============================================================================
 class ConfigRevision(BaseMongoModel):
@@ -497,6 +499,8 @@ class CrawlConfigAdditional(BaseModel):
 
     crawlFilenameTemplate: Optional[str] = None
 
+    shareable: Optional[bool] = False
+
 
 # ============================================================================
 class CrawlConfig(CrawlConfigCore, CrawlConfigAdditional):
@@ -555,6 +559,7 @@ class UpdateCrawlConfig(BaseModel):
     browserWindows: Optional[BrowserWindowCount] = None
     crawlFilenameTemplate: Optional[str] = None
     config: Optional[RawCrawlConfig] = None
+    shareable: Optional[bool] = None
 
 
 # ============================================================================

backend/btrixcloud/pages.py

Lines changed: 50 additions & 1 deletion
@@ -1084,7 +1084,15 @@ async def process_finished_crawls():
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
 def init_pages_api(
-    app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops, user_dep
+    app,
+    mdb,
+    crawl_ops,
+    org_ops,
+    storage_ops,
+    background_job_ops,
+    coll_ops,
+    crawl_config_ops,
+    user_dep,
 ) -> PageOps:
     """init pages API"""
     # pylint: disable=invalid-name
@@ -1336,6 +1344,47 @@ async def get_search_pages_list(
         )
         return {"items": pages}
 
+    @app.get(
+        "/orgs/{oid}/crawlconfigs/{cid}/public/pagesSearch",
+        tags=["pages", "crawlconfigs"],
+        response_model=PageOutItemsResponse,
+    )
+    async def get_search_pages_list_shareable_crawl_config(
+        cid: UUID,
+        org: Organization = Depends(org_public),
+        search: Optional[str] = None,
+        url: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        isSeed: Optional[bool] = None,
+        depth: Optional[int] = None,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+    ):
+        """Retrieve paginated list of pages for last successful crawl of workflow"""
+        crawl_config = await crawl_config_ops.get_crawl_config(
+            cid, org.id, active_only=True
+        )
+        if not crawl_config.shareable:
+            raise HTTPException(status_code=404, detail="crawl_config_not_found")
+
+        last_successful_crawl_out = (
+            await crawl_config_ops.get_last_successful_crawl_out(cid, org)
+        )
+
+        pages, _ = await ops.list_pages(
+            crawl_ids=[last_successful_crawl_out.id],
+            search=search,
+            url=url,
+            ts=ts,
+            is_seed=isSeed,
+            depth=depth,
+            org=org,
+            page_size=pageSize,
+            page=page,
+            include_total=False,
+        )
+        return {"items": pages}
+
     @app.get(
         "/orgs/{oid}/collections/{coll_id}/public/pages",
         tags=["pages", "collections"],

backend/test/test_crawlconfigs.py

Lines changed: 79 additions & 0 deletions
@@ -995,6 +995,85 @@ def test_delete_seed_file_in_use_crawlconfig(
     assert r.json()["id"] == seed_file_id
 
 
+def test_shareable_workflow(admin_auth_headers, default_org_id, admin_crawl_id):
+    # Verify workflow is not shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json()["shareable"] is False
+
+    # Verify public replay.json returns 404 while not shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/replay.json"
+    )
+    assert r.status_code == 404
+    assert r.json()["detail"] == "crawl_config_not_found"
+
+    # Verify public pagesSearch endpoint returns 404 while not shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/pagesSearch"
+    )
+    assert r.status_code == 404
+
+    # Mark workflow as shareable
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/",
+        headers=admin_auth_headers,
+        json={"shareable": True},
+    )
+    assert r.status_code == 200
+
+    data = r.json()
+    assert data["updated"]
+    assert data["settings_changed"]
+    assert data["metadata_changed"] is False
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json()["shareable"]
+
+    # Verify public replay.json returns last successful crawl while shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/replay.json"
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["id"] == admin_crawl_id
+    assert data["oid"] == default_org_id
+    assert data["cid"] == _admin_crawl_cid
+    assert data["type"] == "crawl"
+    assert data["state"] == "complete"
+
+    resources = data["resources"]
+    assert resources
+    assert resources[0]["path"]
+
+    assert len(data["initialPages"]) == 4
+
+    pages_query_url = data["pagesQueryUrl"]
+    assert pages_query_url.endswith(
+        f"/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/pagesSearch"
+    )
+    assert data["downloadUrl"] is None
+
+    # Verify pages search endpoint is accessible and works
+    r = requests.get(pages_query_url)
+    assert r.status_code == 200
+    data = r.json()
+    assert data["items"]
+    for page in data["items"]:
+        assert page["id"]
+        assert page["oid"] == default_org_id
+        assert page["crawl_id"] == admin_crawl_id
+        assert page["url"]
+
+
 def test_add_crawl_config_fail_on_content_check_no_profile(
     crawler_auth_headers, default_org_id, sample_crawl_data
 ):

frontend/src/features/crawl-workflows/templates/shareable-notice.ts

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+import { msg } from "@lit/localize";
+import { html } from "lit";
+
+export const ShareableNotice = () =>
+  html`<btrix-popover
+    content=${msg(
+      "The latest crawl from this workflow is publicly accessible to anyone with the link. This can be changed with the Browsertrix API.",
+    )}
+  >
+    <btrix-badge class="part-[base]:min-h-5" variant="warning">
+      <sl-icon name="info-circle" class="align-icon mr-1"></sl-icon>
+      ${msg("Public")}
+    </btrix-badge>
+  </btrix-popover>`;

frontend/src/features/crawl-workflows/workflow-list.ts

Lines changed: 5 additions & 1 deletion
@@ -19,6 +19,9 @@ import {
   query,
   queryAssignedElements,
 } from "lit/decorators.js";
+import { when } from "lit/directives/when.js";
+
+import { ShareableNotice } from "./templates/shareable-notice";
 
 import { BtrixElement } from "@/classes/BtrixElement";
 import type { OverflowDropdown } from "@/components/ui/overflow-dropdown";
@@ -250,7 +253,8 @@ export class WorkflowListItem extends BtrixElement {
         }}
       >
         <div class="col">
-          <div class="detail url truncate">
+          <div class="detail url items-center truncate">
+            ${when(this.workflow?.shareable, ShareableNotice)}
             ${this.safeRender(this.renderName)}
           </div>
           <div class="desc">
