|
16 | 16 | import urllib.parse |
17 | 17 |
|
18 | 18 | import aiohttp |
19 | | -from fastapi import APIRouter, Depends, HTTPException, Query |
| 19 | +from fastapi import APIRouter, Depends, HTTPException, Query, Request, Response |
20 | 20 | import pymongo |
21 | 21 |
|
22 | 22 | from .pagination import DEFAULT_PAGE_SIZE, paginated_format |
|
27 | 27 | CrawlConfigOut, |
28 | 28 | CrawlConfigTags, |
29 | 29 | CrawlOut, |
| 30 | + CrawlOutWithResources, |
30 | 31 | UpdateCrawlConfig, |
31 | 32 | Organization, |
32 | 33 | User, |
33 | 34 | PaginatedCrawlConfigOutResponse, |
34 | 35 | PaginatedSeedResponse, |
35 | 36 | PaginatedConfigRevisionResponse, |
| 37 | + SUCCESSFUL_STATES, |
36 | 38 | FAILED_STATES, |
37 | 39 | CrawlerChannel, |
38 | 40 | CrawlerChannels, |
39 | 41 | StartedResponse, |
40 | 42 | SuccessResponse, |
| 43 | + EmptyResponse, |
41 | 44 | CrawlConfigAddedResponse, |
42 | 45 | CrawlConfigSearchValues, |
43 | 46 | CrawlConfigUpdateResponse, |
@@ -339,6 +342,7 @@ async def add_crawl_config( |
339 | 342 | proxyId=config_in.proxyId, |
340 | 343 | firstSeed=first_seed, |
341 | 344 | seedCount=seed_count, |
| 345 | + shareable=config_in.shareable, |
342 | 346 | ) |
343 | 347 |
|
344 | 348 | if config_in.runNow: |
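
The new `shareable` flag is now accepted at creation time via `config_in`. A minimal client sketch of creating a shareable workflow follows; the endpoint URL, auth scheme, and payload fields other than `shareable` are assumptions based on the surrounding API, not taken from this diff:

```python
import aiohttp


async def create_shareable_config(api_url: str, token: str) -> dict:
    """POST a new crawl workflow with the shareable flag enabled.
    Payload fields other than `shareable` follow the existing crawl
    config input model and are abbreviated here for illustration."""
    payload = {
        "name": "Shareable example crawl",
        "config": {"seeds": [{"url": "https://example.com/"}]},
        "shareable": True,  # new field wired through by this change
    }
    headers = {"Authorization": f"Bearer {token}"}
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.post(api_url, json=payload) as resp:
            resp.raise_for_status()
            return await resp.json()
```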
@@ -573,6 +577,9 @@ async def update_crawl_config( |
573 | 577 | changed = changed or self.check_attr_changed( |
574 | 578 | orig_crawl_config, update, "browserWindows" |
575 | 579 | ) |
| 580 | + changed = changed or self.check_attr_changed( |
| 581 | + orig_crawl_config, update, "shareable" |
| 582 | + ) |
576 | 583 |
|
577 | 584 | schedule_changed = self.check_attr_changed( |
578 | 585 | orig_crawl_config, update, "schedule" |
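
`check_attr_changed` itself is outside this diff. A hypothetical sketch of such a helper, assuming it treats unset (`None`) update fields as "not provided" and therefore unchanged:

```python
from typing import Any, Optional


def check_attr_changed(orig: Any, update: Any, attr_name: str) -> bool:
    """Hypothetical helper: report whether `update` sets `attr_name` to a
    value different from the one stored on `orig`. Fields left as None on
    the update object are treated as not provided, hence unchanged."""
    new_value: Optional[Any] = getattr(update, attr_name, None)
    if new_value is None:
        return False
    return getattr(orig, attr_name, None) != new_value
```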
@@ -849,6 +856,30 @@ async def get_running_crawl(self, cid: UUID) -> Optional[CrawlOut]: |
849 | 856 |
|
850 | 857 | return None |
851 | 858 |
|
| 859 | + async def get_last_successful_crawl_out( |
| 860 | + self, |
| 861 | + cid: UUID, |
| 862 | + org: Organization, |
| 863 | + request: Optional[Request] = None, |
| 864 | + ) -> Optional[CrawlOutWithResources]: |
| 865 | + """Return the last successful crawl out with resources for this config, if any""" |
| 866 | + headers = dict(request.headers) if request else None |
| 867 | + match_query = { |
| 868 | + "cid": cid, |
| 869 | + "oid": org.id, |
| 870 | + "finished": {"$ne": None}, |
| 871 | + "state": {"$in": SUCCESSFUL_STATES}, |
| 872 | + } |
| 873 | + last_crawl = await self.crawls.find_one( |
| 874 | + match_query, sort=[("finished", pymongo.DESCENDING)] |
| 875 | + ) |
| 876 | + if last_crawl: |
| 877 | + return await self.crawl_ops.get_crawl_out( |
| 878 | + last_crawl["_id"], org, "crawl", headers=headers, cid=cid |
| 879 | + ) |
| 880 | + |
| 881 | + return None |
| 882 | + |
852 | 883 | async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1): |
853 | 884 | """recompute stats by incrementing size counter and number of crawls""" |
854 | 885 | update_query: dict[str, object] = {} |
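
The new method selects only finished, successful crawls and sorts on `finished` descending, so the first match is the most recent. A self-contained sketch of the same lookup with motor; the connection URI, database and collection names, and the concrete `SUCCESSFUL_STATES` values are placeholders:

```python
from typing import Optional
from uuid import UUID

import pymongo
from motor.motor_asyncio import AsyncIOMotorClient

# Assumed values for illustration; the real SUCCESSFUL_STATES tuple is
# imported from the models module and may contain additional states.
SUCCESSFUL_STATES = ("complete", "stopped_by_user")


async def find_last_successful_crawl(cid: UUID, oid: UUID) -> Optional[dict]:
    """Return the most recently finished successful crawl document for a
    crawl config, or None if no crawl has completed successfully."""
    client = AsyncIOMotorClient("mongodb://localhost:27017")  # placeholder URI
    crawls = client["browsertrix"]["crawls"]  # placeholder db/collection names
    return await crawls.find_one(
        {
            "cid": cid,
            "oid": oid,
            "finished": {"$ne": None},  # exclude still-running crawls
            "state": {"$in": list(SUCCESSFUL_STATES)},
        },
        sort=[("finished", pymongo.DESCENDING)],  # newest finish first
    )
```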
@@ -1503,6 +1534,7 @@ def init_crawl_config_api( |
1503 | 1534 |
|
1504 | 1535 | org_crawl_dep = org_ops.org_crawl_dep |
1505 | 1536 | org_viewer_dep = org_ops.org_viewer_dep |
| 1537 | + org_public = org_ops.org_public |
1506 | 1538 |
|
1507 | 1539 | @router.get("", response_model=PaginatedCrawlConfigOutResponse) |
1508 | 1540 | async def get_crawl_configs( |
@@ -1619,6 +1651,38 @@ async def get_all_crawler_proxies( |
1619 | 1651 |
|
1620 | 1652 | return ops.get_crawler_proxies() |
1621 | 1653 |
|
| 1654 | + @app.get( |
| 1655 | + "/orgs/{oid}/crawlconfigs/{cid}/public/replay.json", |
| 1656 | + response_model=CrawlOutWithResources, |
| 1657 | + ) |
| 1658 | + async def get_crawl_config_latest_crawl_public_replay( |
| 1659 | + request: Request, |
| 1660 | + response: Response, |
| 1661 | + cid: UUID, |
| 1662 | + org: Organization = Depends(org_public), |
| 1663 | + ): |
| 1664 | + crawl_config = await ops.get_crawl_config(cid, org.id, active_only=True) |
| 1665 | + if not crawl_config.shareable: |
| 1666 | + raise HTTPException(status_code=404, detail="crawl_config_not_found") |
| 1667 | + |
| 1668 | + last_successful_crawl_out = await ops.get_last_successful_crawl_out( |
| 1669 | + cid, org, request |
| 1670 | + ) |
| 1671 | + |
| 1672 | + response.headers["Access-Control-Allow-Origin"] = "*" |
| 1673 | + response.headers["Access-Control-Allow-Headers"] = "*" |
| 1674 | + return last_successful_crawl_out |
| 1675 | + |
| 1676 | + @app.options( |
| 1677 | + "/orgs/{oid}/crawlconfigs/{cid}/public/replay.json", |
| 1678 | + response_model=EmptyResponse, |
| 1679 | + ) |
| 1680 | + async def get_replay_preflight(response: Response): |
| 1681 | + response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS" |
| 1682 | + response.headers["Access-Control-Allow-Origin"] = "*" |
| 1683 | + response.headers["Access-Control-Allow-Headers"] = "*" |
| 1684 | + return {} |
| 1685 | + |
1622 | 1686 | @router.get("/{cid}/seeds", response_model=PaginatedSeedResponse) |
1623 | 1687 | async def get_crawl_config_seeds( |
1624 | 1688 | cid: UUID, |
|
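
Together, the GET route and the OPTIONS preflight expose the latest successful crawl of a shareable workflow without authentication, with CORS headers permitting replay from any origin. A hedged client sketch; the host, `/api` prefix, and UUIDs below are placeholders:

```python
import asyncio

import aiohttp


async def fetch_public_replay(base_url: str, oid: str, cid: str) -> dict:
    """Fetch the latest successful crawl's replay.json for a shareable
    crawl config. No Authorization header is sent: the route is public,
    and the server answers 404 if the config is not marked shareable."""
    url = f"{base_url}/api/orgs/{oid}/crawlconfigs/{cid}/public/replay.json"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            resp.raise_for_status()  # 404 -> crawl_config_not_found
            return await resp.json()


if __name__ == "__main__":
    # Placeholder host and IDs for illustration only
    data = asyncio.run(
        fetch_public_replay(
            "https://btrix.example.org",
            "00000000-0000-0000-0000-000000000000",
            "11111111-1111-1111-1111-111111111111",
        )
    )
    print(data.get("resources"))
```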