@@ -15,6 +15,11 @@
 )
 from v03_pipeline.lib.tasks.dataproc.misc import get_cluster_name, to_kebab_str_args

+FAILURE_STATUSES = {
+    google.cloud.dataproc_v1.types.jobs.JobStatus.State.CANCELLED,
+    google.cloud.dataproc_v1.types.jobs.JobStatus.State.ERROR,
+    google.cloud.dataproc_v1.types.jobs.JobStatus.State.ATTEMPT_FAILURE,
+}
 SEQR_PIPELINE_RUNNER_BUILD = f'gs://seqr-pipeline-runner-builds/{Env.DEPLOYMENT_TYPE}/{Env.PIPELINE_RUNNER_APP_VERSION}'
 TIMEOUT_S = 172800  # 2 days

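Hoisting the terminal failure states into a module-level `FAILURE_STATUSES` set lets `complete()` and the polling loop in `run()` share one definition. A minimal sketch of that membership check, with an assumed `is_failed` helper name that is not part of this PR:

```python
import google.cloud.dataproc_v1

# Shared set of terminal failure states, mirroring FAILURE_STATUSES above.
State = google.cloud.dataproc_v1.types.jobs.JobStatus.State
FAILURE_STATUSES = {State.CANCELLED, State.ERROR, State.ATTEMPT_FAILURE}

def is_failed(job: google.cloud.dataproc_v1.types.jobs.Job) -> bool:
    # A single membership test keeps complete() and run() consistent.
    return job.status.state in FAILURE_STATUSES
```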
@@ -44,7 +49,9 @@ def job_id(self):
     def requires(self) -> [luigi.Task]:
         return [self.clone(CreateDataprocClusterTask)]

-    def complete(self) -> bool:
+    def safely_get_job(
+        self,
+    ):
         try:
             job = self.client.get_job(
                 request={
@@ -54,12 +61,15 @@ def complete(self) -> bool:
                 },
             )
         except google.api_core.exceptions.NotFound:
+            return None
+        else:
+            return job
+
+    def complete(self) -> bool:
+        job = self.safely_get_job()
+        if not job:
             return False
-        if job.status.state in {
-            google.cloud.dataproc_v1.types.jobs.JobStatus.State.CANCELLED,
-            google.cloud.dataproc_v1.types.jobs.JobStatus.State.ERROR,
-            google.cloud.dataproc_v1.types.jobs.JobStatus.State.ATTEMPT_FAILURE,
-        }:
+        if job.status.state in FAILURE_STATUSES:
             msg = f'Job {self.job_id} entered {job.status.state.name} state'
             logger.error(msg)
             logger.error(job.status.details)
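The get-or-None pattern above treats a `NotFound` from the Dataproc Jobs API as an expected pre-submission state rather than an error. A self-contained sketch of the same idea, with placeholder `project_id`/`region`/`job_id` parameters standing in for the task's `Env` values:

```python
import google.api_core.exceptions
import google.cloud.dataproc_v1

def safely_get_job(
    client: google.cloud.dataproc_v1.JobControllerClient,
    project_id: str,
    region: str,
    job_id: str,
):
    try:
        # get_job raises NotFound if the job was never submitted.
        return client.get_job(
            request={
                'project_id': project_id,
                'region': region,
                'job_id': job_id,
            },
        )
    except google.api_core.exceptions.NotFound:
        return None
```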
@@ -68,43 +78,52 @@ def complete(self) -> bool:
         )

     def run(self):
-        operation = self.client.submit_job_as_operation(
-            request={
-                'project_id': Env.GCLOUD_PROJECT,
-                'region': Env.GCLOUD_REGION,
-                'job': {
-                    'reference': {
-                        'job_id': self.job_id,
-                    },
-                    'placement': {
-                        'cluster_name': get_cluster_name(
-                            self.reference_genome,
-                            self.run_id,
-                        ),
-                    },
-                    'pyspark_job': {
-                        'main_python_file_uri': f'{SEQR_PIPELINE_RUNNER_BUILD}/bin/run_task.py',
-                        'args': [
-                            self.task.task_family,
-                            '--local-scheduler',
-                            *to_kebab_str_args(self),
-                        ],
-                        'python_file_uris': [
-                            f'{SEQR_PIPELINE_RUNNER_BUILD}/pyscripts.zip',
-                        ],
+        job = self.safely_get_job()
+        if not job:
+            self.client.submit_job_as_operation(
+                request={
+                    'project_id': Env.GCLOUD_PROJECT,
+                    'region': Env.GCLOUD_REGION,
+                    'job': {
+                        'reference': {
+                            'job_id': self.job_id,
+                        },
+                        'placement': {
+                            'cluster_name': get_cluster_name(
+                                self.reference_genome,
+                                self.run_id,
+                            ),
+                        },
+                        'pyspark_job': {
+                            'main_python_file_uri': f'{SEQR_PIPELINE_RUNNER_BUILD}/bin/run_task.py',
+                            'args': [
+                                self.task.task_family,
+                                '--local-scheduler',
+                                *to_kebab_str_args(self),
+                            ],
+                            'python_file_uris': [
+                                f'{SEQR_PIPELINE_RUNNER_BUILD}/pyscripts.zip',
+                            ],
+                        },
                     },
                 },
-            },
-        )
+            )
         wait_s = 0
         while wait_s < TIMEOUT_S:
-            if operation.done():
-                operation.result()  # Will throw on failure!
-                msg = f'Finished {self.job_id}'
+            job = self.safely_get_job()
+            if (
+                job.status.state
+                == google.cloud.dataproc_v1.types.jobs.JobStatus.State.DONE
+            ):
+                msg = f'Job {self.job_id} is complete'
                 logger.info(msg)
                 break
+            if job.status.state in FAILURE_STATUSES:
+                msg = f'Job {self.job_id} entered {job.status.state.name} state'
+                logger.error(msg)
+                raise RuntimeError(msg)
             logger.info(
-                f'Waiting for job completion {self.job_id}',
+                f'Waiting for Job completion {self.job_id}',
             )
             time.sleep(3)
             wait_s += 3
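With this change, `run()` no longer holds the long-running operation handle; it re-fetches the job by its deterministic `job_id` on each iteration, so a restarted worker can resume watching a job that was already submitted. A hedged sketch of that poll-until-terminal loop, with assumed `wait_for_job`/`get_job` names and the same 3-second/2-day cadence as the diff; it also guards the `None` case, which the loop above assumes cannot occur after submission:

```python
import time
import google.cloud.dataproc_v1

State = google.cloud.dataproc_v1.types.jobs.JobStatus.State
FAILURE_STATUSES = {State.CANCELLED, State.ERROR, State.ATTEMPT_FAILURE}

def wait_for_job(get_job, timeout_s: int = 172800, poll_s: int = 3):
    # get_job: zero-arg callable returning a Job or None (e.g. safely_get_job).
    wait_s = 0
    while wait_s < timeout_s:
        job = get_job()
        if job and job.status.state == State.DONE:
            return job
        if job and job.status.state in FAILURE_STATUSES:
            raise RuntimeError(f'Job entered {job.status.state.name} state')
        time.sleep(poll_s)
        wait_s += poll_s
    raise TimeoutError(f'Gave up after {timeout_s}s')
```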