|
27 | 27 | import uuid
|
28 | 28 | import warnings
|
29 | 29 |
|
30 |
| -from ..job.job import RayJobSpec |
31 |
| - |
32 | 30 | from ...common.kubernetes_cluster.auth import (
|
33 | 31 | config_check,
|
34 | 32 | get_api_client,
|
@@ -609,298 +607,6 @@ def _component_resources_down(
|
609 | 607 | yamls = yaml.safe_load_all(self.resource_yaml)
|
610 | 608 | _delete_resources(yamls, namespace, api_instance, cluster_name)
|
611 | 609 |
|
612 |
| - @staticmethod |
613 |
| - def run_job_with_managed_cluster( |
614 |
| - cluster_config: ClusterConfiguration, |
615 |
| - job_config: RayJobSpec, |
616 |
| - job_cr_name: Optional[str] = None, |
617 |
| - submission_mode: str = "K8sJobMode", |
618 |
| - shutdown_after_job_finishes: bool = True, |
619 |
| - ttl_seconds_after_finished: Optional[int] = None, |
620 |
| - suspend_rayjob_creation: bool = False, |
621 |
| - wait_for_completion: bool = True, |
622 |
| - job_timeout_seconds: Optional[int] = 3600, |
623 |
| - job_polling_interval_seconds: int = 10, |
624 |
| - ): |
625 |
| - """ |
626 |
| - Manages the lifecycle of a Ray cluster and a job by creating a RayJob custom resource. |
627 |
| - KubeRay operator will then create/delete the RayCluster based on the RayJob definition. |
628 |
| -
|
629 |
| - This method will: |
630 |
| - 1. Generate a RayCluster specification from the provided 'cluster_config'. |
631 |
| - 2. Construct a RayJob custom resource definition using 'job_config' and embedding the RayCluster spec. |
632 |
| - 3. Create the RayJob resource in Kubernetes. |
633 |
| - 4. Optionally, wait for the RayJob to complete or timeout, monitoring its status. |
634 |
| - 5. The RayCluster lifecycle (creation and deletion) is managed by KubeRay |
635 |
| - based on the RayJob's 'shutdownAfterJobFinishes' field. |
636 |
| -
|
637 |
| - Args: |
638 |
| - cluster_config: Configuration for the Ray cluster to be created by RayJob. |
639 |
| - job_config: RayJobSpec object containing job-specific details like entrypoint, runtime_env, etc. |
640 |
| - job_cr_name: Name for the RayJob Custom Resource. If None, a unique name is generated. |
641 |
| - submission_mode: How the job is submitted ("K8sJobMode" or "RayClientMode"). |
642 |
| - shutdown_after_job_finishes: If True, RayCluster is deleted after job finishes. |
643 |
| - ttl_seconds_after_finished: TTL for RayJob after it's finished. |
644 |
| - suspend_rayjob_creation: If True, creates the RayJob in a suspended state. |
645 |
| - wait_for_completion: If True, waits for the job to finish. |
646 |
| - job_timeout_seconds: Timeout for waiting for job completion. |
647 |
| - job_polling_interval_seconds: Interval for polling job status. |
648 |
| -
|
649 |
| - Returns: |
650 |
| - A dictionary containing details like RayJob CR name, reported job submission ID, |
651 |
| - final job status, dashboard URL, and the RayCluster name. |
652 |
| -
|
653 |
| - Raises: |
654 |
| - TimeoutError: If the job doesn't complete within the specified timeout. |
655 |
| - ApiException: For Kubernetes API errors. |
656 |
| - ValueError: For configuration issues. |
657 |
| - """ |
658 |
| - config_check() |
659 |
| - k8s_co_api = k8s_client.CustomObjectsApi(get_api_client()) |
660 |
| - namespace = cluster_config.namespace |
661 |
| - |
662 |
| - if not job_config.entrypoint: |
663 |
| - raise ValueError("job_config.entrypoint must be specified.") |
664 |
| - |
665 |
| - # Warn if Pydantic V1/V2 specific fields in RayJobSpec are set, as they are not used for RayJob CR. |
666 |
| - if ( |
667 |
| - job_config.entrypoint_num_cpus is not None |
668 |
| - or job_config.entrypoint_num_gpus is not None |
669 |
| - or job_config.entrypoint_memory is not None |
670 |
| - ): |
671 |
| - warnings.warn( |
672 |
| - "RayJobSpec fields 'entrypoint_num_cpus', 'entrypoint_num_gpus', 'entrypoint_memory' " |
673 |
| - "are not directly used when creating a RayJob CR. They are primarily for the Ray Job Submission Client. " |
674 |
| - "Resource requests for the job driver pod should be configured in the RayCluster head node spec via ClusterConfiguration.", |
675 |
| - UserWarning, |
676 |
| - ) |
677 |
| - |
678 |
| - # Generate rayClusterSpec from ClusterConfiguration |
679 |
| - temp_config_for_spec = copy.deepcopy(cluster_config) |
680 |
| - temp_config_for_spec.appwrapper = False |
681 |
| - |
682 |
| - with warnings.catch_warnings(): |
683 |
| - warnings.simplefilter("ignore", UserWarning) |
684 |
| - dummy_cluster_for_spec = Cluster(temp_config_for_spec) |
685 |
| - |
686 |
| - ray_cluster_cr_dict = dummy_cluster_for_spec.resource_yaml |
687 |
| - if ( |
688 |
| - not isinstance(ray_cluster_cr_dict, dict) |
689 |
| - or "spec" not in ray_cluster_cr_dict |
690 |
| - ): |
691 |
| - raise ValueError( |
692 |
| - "Failed to generate RayCluster CR dictionary from ClusterConfiguration. " |
693 |
| - f"Got: {type(ray_cluster_cr_dict)}" |
694 |
| - ) |
695 |
| - ray_cluster_spec = ray_cluster_cr_dict["spec"] |
696 |
| - |
697 |
| - # Prepare RayJob CR |
698 |
| - actual_job_cr_name = job_cr_name or f"rayjob-{uuid.uuid4().hex[:10]}" |
699 |
| - |
700 |
| - runtime_env_yaml_str = "" |
701 |
| - if job_config.runtime_env: |
702 |
| - try: |
703 |
| - runtime_env_yaml_str = yaml.dump(job_config.runtime_env) |
704 |
| - except yaml.YAMLError as e: |
705 |
| - raise ValueError( |
706 |
| - f"Invalid job_config.runtime_env, failed to dump to YAML: {e}" |
707 |
| - ) |
708 |
| - |
709 |
| - ray_job_cr_spec = { |
710 |
| - "entrypoint": job_config.entrypoint, |
711 |
| - "shutdownAfterJobFinishes": shutdown_after_job_finishes, |
712 |
| - "rayClusterSpec": ray_cluster_spec, |
713 |
| - "submissionMode": submission_mode, |
714 |
| - } |
715 |
| - |
716 |
| - if runtime_env_yaml_str: |
717 |
| - ray_job_cr_spec["runtimeEnvYAML"] = runtime_env_yaml_str |
718 |
| - if job_config.submission_id: |
719 |
| - ray_job_cr_spec["jobId"] = job_config.submission_id |
720 |
| - if job_config.metadata: |
721 |
| - ray_job_cr_spec["metadata"] = job_config.metadata |
722 |
| - if ttl_seconds_after_finished is not None: |
723 |
| - ray_job_cr_spec["ttlSecondsAfterFinished"] = ttl_seconds_after_finished |
724 |
| - if suspend_rayjob_creation: |
725 |
| - ray_job_cr_spec["suspend"] = True |
726 |
| - if job_config.entrypoint_resources: |
727 |
| - ray_job_cr_spec["entrypointResources"] = job_config.entrypoint_resources |
728 |
| - |
729 |
| - ray_job_cr = { |
730 |
| - "apiVersion": "ray.io/v1", |
731 |
| - "kind": "RayJob", |
732 |
| - "metadata": { |
733 |
| - "name": actual_job_cr_name, |
734 |
| - "namespace": namespace, |
735 |
| - }, |
736 |
| - "spec": ray_job_cr_spec, |
737 |
| - } |
738 |
| - |
739 |
| - returned_job_submission_id = None |
740 |
| - final_job_status = "UNKNOWN" |
741 |
| - dashboard_url = None |
742 |
| - ray_cluster_name_actual = None |
743 |
| - |
744 |
| - try: |
745 |
| - print( |
746 |
| - f"Submitting RayJob '{actual_job_cr_name}' to namespace '{namespace}'..." |
747 |
| - ) |
748 |
| - k8s_co_api.create_namespaced_custom_object( |
749 |
| - group="ray.io", |
750 |
| - version="v1", |
751 |
| - namespace=namespace, |
752 |
| - plural="rayjobs", |
753 |
| - body=ray_job_cr, |
754 |
| - ) |
755 |
| - print(f"RayJob '{actual_job_cr_name}' created successfully.") |
756 |
| - |
757 |
| - if wait_for_completion: |
758 |
| - print(f"Waiting for RayJob '{actual_job_cr_name}' to complete...") |
759 |
| - start_time = time.time() |
760 |
| - while True: |
761 |
| - try: |
762 |
| - ray_job_status_cr = ( |
763 |
| - k8s_co_api.get_namespaced_custom_object_status( |
764 |
| - group="ray.io", |
765 |
| - version="v1", |
766 |
| - namespace=namespace, |
767 |
| - plural="rayjobs", |
768 |
| - name=actual_job_cr_name, |
769 |
| - ) |
770 |
| - ) |
771 |
| - except ApiException as e: |
772 |
| - if e.status == 404: |
773 |
| - print( |
774 |
| - f"RayJob '{actual_job_cr_name}' status not found yet, retrying..." |
775 |
| - ) |
776 |
| - time.sleep(job_polling_interval_seconds) |
777 |
| - continue |
778 |
| - raise |
779 |
| - |
780 |
| - status_field = ray_job_status_cr.get("status", {}) |
781 |
| - job_deployment_status = status_field.get( |
782 |
| - "jobDeploymentStatus", "UNKNOWN" |
783 |
| - ) |
784 |
| - current_job_status = status_field.get("jobStatus", "PENDING") |
785 |
| - |
786 |
| - dashboard_url = status_field.get("dashboardURL", dashboard_url) |
787 |
| - ray_cluster_name_actual = status_field.get( |
788 |
| - "rayClusterName", ray_cluster_name_actual |
789 |
| - ) |
790 |
| - returned_job_submission_id = status_field.get( |
791 |
| - "jobId", job_config.submission_id |
792 |
| - ) |
793 |
| - |
794 |
| - final_job_status = current_job_status |
795 |
| - print( |
796 |
| - f"RayJob '{actual_job_cr_name}' status: JobDeployment='{job_deployment_status}', Job='{current_job_status}'" |
797 |
| - ) |
798 |
| - |
799 |
| - if current_job_status in ["SUCCEEDED", "FAILED", "STOPPED"]: |
800 |
| - break |
801 |
| - |
802 |
| - if ( |
803 |
| - job_timeout_seconds |
804 |
| - and (time.time() - start_time) > job_timeout_seconds |
805 |
| - ): |
806 |
| - try: |
807 |
| - ray_job_status_cr_final = ( |
808 |
| - k8s_co_api.get_namespaced_custom_object_status( |
809 |
| - group="ray.io", |
810 |
| - version="v1", |
811 |
| - namespace=namespace, |
812 |
| - plural="rayjobs", |
813 |
| - name=actual_job_cr_name, |
814 |
| - ) |
815 |
| - ) |
816 |
| - status_field_final = ray_job_status_cr_final.get( |
817 |
| - "status", {} |
818 |
| - ) |
819 |
| - final_job_status = status_field_final.get( |
820 |
| - "jobStatus", final_job_status |
821 |
| - ) |
822 |
| - returned_job_submission_id = status_field_final.get( |
823 |
| - "jobId", returned_job_submission_id |
824 |
| - ) |
825 |
| - dashboard_url = status_field_final.get( |
826 |
| - "dashboardURL", dashboard_url |
827 |
| - ) |
828 |
| - ray_cluster_name_actual = status_field_final.get( |
829 |
| - "rayClusterName", ray_cluster_name_actual |
830 |
| - ) |
831 |
| - except Exception: |
832 |
| - pass |
833 |
| - raise TimeoutError( |
834 |
| - f"RayJob '{actual_job_cr_name}' timed out after {job_timeout_seconds} seconds. Last status: {final_job_status}" |
835 |
| - ) |
836 |
| - |
837 |
| - time.sleep(job_polling_interval_seconds) |
838 |
| - |
839 |
| - print( |
840 |
| - f"RayJob '{actual_job_cr_name}' finished with status: {final_job_status}" |
841 |
| - ) |
842 |
| - else: |
843 |
| - try: |
844 |
| - ray_job_status_cr = k8s_co_api.get_namespaced_custom_object_status( |
845 |
| - group="ray.io", |
846 |
| - version="v1", |
847 |
| - namespace=namespace, |
848 |
| - plural="rayjobs", |
849 |
| - name=actual_job_cr_name, |
850 |
| - ) |
851 |
| - status_field = ray_job_status_cr.get("status", {}) |
852 |
| - final_job_status = status_field.get("jobStatus", "SUBMITTED") |
853 |
| - returned_job_submission_id = status_field.get( |
854 |
| - "jobId", job_config.submission_id |
855 |
| - ) |
856 |
| - dashboard_url = status_field.get("dashboardURL", dashboard_url) |
857 |
| - ray_cluster_name_actual = status_field.get( |
858 |
| - "rayClusterName", ray_cluster_name_actual |
859 |
| - ) |
860 |
| - except ApiException as e: |
861 |
| - if e.status == 404: |
862 |
| - final_job_status = "SUBMITTED_NOT_FOUND" |
863 |
| - else: |
864 |
| - print( |
865 |
| - f"Warning: Could not fetch initial status for RayJob '{actual_job_cr_name}': {e}" |
866 |
| - ) |
867 |
| - final_job_status = "UNKNOWN_API_ERROR" |
868 |
| - |
869 |
| - return { |
870 |
| - "job_cr_name": actual_job_cr_name, |
871 |
| - "job_submission_id": returned_job_submission_id, |
872 |
| - "job_status": final_job_status, |
873 |
| - "dashboard_url": dashboard_url, |
874 |
| - "ray_cluster_name": ray_cluster_name_actual, |
875 |
| - } |
876 |
| - |
877 |
| - except ApiException as e: |
878 |
| - print( |
879 |
| - f"Kubernetes API error during RayJob '{actual_job_cr_name}' management: {e.reason} (status: {e.status})" |
880 |
| - ) |
881 |
| - final_status_on_error = "ERROR_BEFORE_SUBMISSION" |
882 |
| - if actual_job_cr_name: |
883 |
| - try: |
884 |
| - ray_job_status_cr = k8s_co_api.get_namespaced_custom_object_status( |
885 |
| - group="ray.io", |
886 |
| - version="v1", |
887 |
| - namespace=namespace, |
888 |
| - plural="rayjobs", |
889 |
| - name=actual_job_cr_name, |
890 |
| - ) |
891 |
| - status_field = ray_job_status_cr.get("status", {}) |
892 |
| - final_status_on_error = status_field.get( |
893 |
| - "jobStatus", "UNKNOWN_AFTER_K8S_ERROR" |
894 |
| - ) |
895 |
| - except Exception: |
896 |
| - final_status_on_error = "UNKNOWN_FINAL_STATUS_FETCH_FAILED" |
897 |
| - raise |
898 |
| - except Exception as e: |
899 |
| - print( |
900 |
| - f"An unexpected error occurred during managed RayJob execution for '{actual_job_cr_name}': {e}" |
901 |
| - ) |
902 |
| - raise |
903 |
| - |
904 | 610 |
|
905 | 611 | def list_all_clusters(namespace: str, print_to_console: bool = True):
|
906 | 612 | """
|
@@ -1057,21 +763,15 @@ def get_cluster(
|
1057 | 763 | head_extended_resource_requests=head_extended_resources,
|
1058 | 764 | worker_extended_resource_requests=worker_extended_resources,
|
1059 | 765 | )
|
1060 |
| - # 1. Prepare RayClusterSpec from ClusterConfiguration |
1061 |
| - # Create a temporary config with appwrapper=False to ensure build_ray_cluster returns RayCluster YAML |
1062 |
| - temp_cluster_config_dict = cluster_config.dict( |
1063 |
| - exclude_none=True |
1064 |
| - ) # Assuming Pydantic V1 or similar .dict() method |
1065 |
| - temp_cluster_config_dict["appwrapper"] = False |
1066 |
| - temp_cluster_config_for_spec = ClusterConfiguration(**temp_cluster_config_dict) |
| 766 | + |
1067 | 767 | # Ignore the warning here for the lack of a ClusterConfiguration
|
1068 | 768 | with warnings.catch_warnings():
|
1069 | 769 | warnings.filterwarnings(
|
1070 | 770 | "ignore",
|
1071 | 771 | message="Please provide a ClusterConfiguration to initialise the Cluster object",
|
1072 | 772 | )
|
1073 | 773 | cluster = Cluster(None)
|
1074 |
| - cluster.config = temp_cluster_config_for_spec |
| 774 | + cluster.config = cluster_config |
1075 | 775 |
|
1076 | 776 | # Remove auto-generated fields like creationTimestamp, uid and etc.
|
1077 | 777 | remove_autogenerated_fields(resource)
|
|
0 commit comments