Skip to content

Commit 9481437

Browse files
committed
[merge] Merge the latest update in refactor_developtment
1 parent eceae70 commit 9481437

File tree

1 file changed

+59
-42
lines changed

1 file changed

+59
-42
lines changed

autoPyTorch/api/base_task.py

Lines changed: 59 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525

2626
import pandas as pd
2727

28-
from smac.runhistory.runhistory import RunHistory
28+
from smac.runhistory.runhistory import DataOrigin, RunHistory
2929
from smac.stats.stats import Stats
3030
from smac.tae import StatusType
3131

@@ -172,11 +172,10 @@ def __init__(
172172
self.search_space: Optional[ConfigurationSpace] = None
173173
self._metric: Optional[autoPyTorchMetric] = None
174174
self._logger: Optional[PicklableClientLogger] = None
175-
self.run_history: Optional[RunHistory] = None
175+
self.run_history: RunHistory = RunHistory()
176176
self.trajectory: Optional[List] = None
177177
self.dataset_name: Optional[str] = None
178178
self.cv_models_: Dict = {}
179-
self.num_run: int = 1
180179
self.experiment_task_name: str = 'runSearch'
181180

182181
# By default try to use the TCP logging port or get a new port
@@ -492,6 +491,9 @@ def _do_dummy_prediction(self) -> None:
492491
assert self._metric is not None
493492
assert self._logger is not None
494493

494+
# For dummy estimator, we always expect the num_run to be 1
495+
num_run = 1
496+
495497
self._logger.info("Starting to create dummy predictions.")
496498

497499
memory_limit = self._memory_limit
@@ -511,14 +513,14 @@ def _do_dummy_prediction(self) -> None:
511513
logger_port=self._logger_port,
512514
cost_for_crash=get_cost_of_crash(self._metric),
513515
abort_on_first_run_crash=False,
514-
initial_num_run=self.num_run,
516+
initial_num_run=num_run,
515517
stats=stats,
516518
memory_limit=memory_limit,
517519
disable_file_output=True if len(self._disable_file_output) > 0 else False,
518520
all_supported_metrics=self._all_supported_metrics
519521
)
520522

521-
status, cost, runtime, additional_info = ta.run(self.num_run, cutoff=self._total_walltime_limit)
523+
status, cost, runtime, additional_info = ta.run(num_run, cutoff=self._total_walltime_limit)
522524
if status == StatusType.SUCCESS:
523525
self._logger.info("Finished creating dummy predictions.")
524526
else:
@@ -560,20 +562,12 @@ def _do_traditional_prediction(self, time_left: int) -> None:
560562
This method currently only supports classification.
561563
562564
Args:
563-
num_run: (int)
564-
An identifier to indicate the current machine learning algorithm
565-
being processed
566565
time_left: (int)
567566
Hard limit on how many machine learning algorithms can be fit. Depending on how
568567
fast a traditional machine learning algorithm trains, it will allow multiple
569568
models to be fitted.
570569
func_eval_time_limit_secs: (int)
571570
Maximum training time each algorithm is allowed to take, during training
572-
573-
Returns:
574-
num_run: (int)
575-
The incremented identifier index. This depends on how many machine learning
576-
models were fitted.
577571
"""
578572

579573
# Mypy Checkings -- Traditional prediction is only called for search
@@ -582,16 +576,19 @@ def _do_traditional_prediction(self, time_left: int) -> None:
582576
assert self._logger is not None
583577
assert self._dask_client is not None
584578

585-
self.num_run += 1
579+
self._logger.info("Starting to create traditional classifier predictions.")
580+
581+
# Initialise run history for the traditional classifiers
582+
run_history = RunHistory()
586583

587584
memory_limit = self._memory_limit
588585
if memory_limit is not None:
589586
memory_limit = int(math.ceil(memory_limit))
590587
available_classifiers = get_available_classifiers()
591588
dask_futures = []
592589

593-
total_number_classifiers = len(available_classifiers) + self.num_run
594-
for n_r, classifier in enumerate(available_classifiers, start=self.num_run):
590+
total_number_classifiers = len(available_classifiers)
591+
for n_r, classifier in enumerate(available_classifiers):
595592

596593
# Only launch a task if there is time
597594
start_time = time.time()
@@ -610,7 +607,7 @@ def _do_traditional_prediction(self, time_left: int) -> None:
610607
logger_port=self._logger_port,
611608
cost_for_crash=get_cost_of_crash(self._metric),
612609
abort_on_first_run_crash=False,
613-
initial_num_run=n_r,
610+
initial_num_run=self._backend.get_next_num_run(),
614611
stats=stats,
615612
memory_limit=memory_limit,
616613
disable_file_output=True if len(self._disable_file_output) > 0 else False,
@@ -624,9 +621,6 @@ def _do_traditional_prediction(self, time_left: int) -> None:
624621
)
625622
])
626623

627-
# Increment the launched job index
628-
self.num_run = n_r
629-
630624
# When managing time, we need to take into account the allocated time resources,
631625
# which are dependent on the number of cores. 'dask_futures' is a proxy to the number
632626
# of workers /n_jobs that we have, in that if there are 4 cores allocated, we can run at most
@@ -653,6 +647,11 @@ def _do_traditional_prediction(self, time_left: int) -> None:
653647
if status == StatusType.SUCCESS:
654648
self._logger.info(
655649
f"Fitting {cls} took {runtime}s, performance:{cost}/{additional_info}")
650+
configuration = additional_info['pipeline_configuration']
651+
origin = additional_info['configuration_origin']
652+
run_history.add(config=configuration, cost=cost,
653+
time=runtime, status=status, seed=self.seed,
654+
origin=origin)
656655
else:
657656
if additional_info.get('exitcode') == -6:
658657
self._logger.error(
@@ -679,6 +678,13 @@ def _do_traditional_prediction(self, time_left: int) -> None:
679678
"Please consider increasing the run time to further improve performance.")
680679
break
681680

681+
self._logger.debug("Run history traditional: {}".format(run_history))
682+
# add run history of traditional to api run history
683+
self.run_history.update(run_history, DataOrigin.EXTERNAL_SAME_INSTANCES)
684+
run_history.save_json(os.path.join(self._backend.internals_directory, 'traditional_run_history.json'),
685+
save_external=True)
686+
return
687+
682688
def _run_dummy_predictions(self) -> None:
683689
dummy_task_name = 'runDummy'
684690
self._stopwatch.start_task(dummy_task_name)
@@ -727,13 +733,17 @@ def _run_ensemble(self, dataset: BaseDataset, optimize_metric: str,
727733
dataset_name=dataset.dataset_name,
728734
output_type=STRING_TO_OUTPUT_TYPES[dataset.output_type],
729735
task_type=STRING_TO_TASK_TYPES[self.task_type],
730-
metrics=[self._metric], opt_metric=optimize_metric,
736+
metrics=[self._metric],
737+
opt_metric=optimize_metric,
731738
ensemble_size=self.ensemble_size,
732739
ensemble_nbest=self.ensemble_nbest,
733740
max_models_on_disc=self.max_models_on_disc,
741+
seed=self.seed,
742+
max_iterations=None,
743+
read_at_most=sys.maxsize,
734744
ensemble_memory_limit=self._memory_limit,
735-
seed=self.seed, max_iterations=None, random_state=self.seed,
736-
read_at_most=sys.maxsize, precision=precision,
745+
random_state=self.seed,
746+
precision=precision,
737747
logger_port=self._logger_port
738748
)
739749
self._stopwatch.stop_task(ensemble_task_name)
@@ -756,8 +766,9 @@ def _start_smac(self, proc_smac: AutoMLSMBO) -> None:
756766
assert self._logger is not None
757767

758768
try:
759-
self.run_history, self.trajectory, budget_type = \
769+
run_history, self.trajectory, budget_type = \
760770
proc_smac.run_smbo()
771+
self.run_history.update(run_history, DataOrigin.INTERNAL)
761772
trajectory_filename = os.path.join(
762773
self._backend.get_smac_output_directory_for_run(self.seed),
763774
'trajectory.json')
@@ -802,8 +813,10 @@ def _run_smac(self, dataset: BaseDataset, proc_ensemble: EnsembleBuilderManager,
802813
func_eval_time_limit_secs=self._func_eval_time_limit_secs,
803814
dask_client=self._dask_client,
804815
memory_limit=self._memory_limit,
805-
n_jobs=self.n_jobs, watcher=self._stopwatch,
806-
metric=self._metric, seed=self.seed,
816+
n_jobs=self.n_jobs,
817+
watcher=self._stopwatch,
818+
metric=self._metric,
819+
seed=self.seed,
807820
include=self.include_components,
808821
exclude=self.exclude_components,
809822
disable_file_output=self._disable_file_output,
@@ -813,7 +826,7 @@ def _run_smac(self, dataset: BaseDataset, proc_ensemble: EnsembleBuilderManager,
813826
pipeline_config={**self.pipeline_options, **budget_config},
814827
ensemble_callback=proc_ensemble,
815828
logger_port=self._logger_port,
816-
start_num_run=self.num_run,
829+
start_num_run=self._backend.get_next_num_run(peek=True),
817830
search_space_updates=self.search_space_updates
818831
)
819832

@@ -930,18 +943,23 @@ def _finish_experiment(self, proc_ensemble: EnsembleBuilderManager,
930943
self._logger.info("Starting to clean up the logger")
931944
self._clean_logger()
932945

933-
def _search(self, optimize_metric: str,
934-
dataset: BaseDataset, budget_type: Optional[str] = None,
935-
budget: Optional[float] = None,
936-
total_walltime_limit: int = 100,
937-
func_eval_time_limit_secs: Optional[int] = None,
938-
enable_traditional_pipeline: bool = True,
939-
memory_limit: Optional[int] = 4096,
940-
smac_scenario_args: Optional[Dict[str, Any]] = None,
941-
get_smac_object_callback: Optional[Callable] = None,
942-
all_supported_metrics: bool = True,
943-
precision: int = 32, disable_file_output: List = [],
944-
load_models: bool = True) -> 'BaseTask':
946+
def _search(
947+
self,
948+
optimize_metric: str,
949+
dataset: BaseDataset,
950+
budget_type: Optional[str] = None,
951+
budget: Optional[float] = None,
952+
total_walltime_limit: int = 100,
953+
func_eval_time_limit_secs: Optional[int] = None,
954+
enable_traditional_pipeline: bool = True,
955+
memory_limit: Optional[int] = 4096,
956+
smac_scenario_args: Optional[Dict[str, Any]] = None,
957+
get_smac_object_callback: Optional[Callable] = None,
958+
all_supported_metrics: bool = True,
959+
precision: int = 32,
960+
disable_file_output: List = [],
961+
load_models: bool = True
962+
) -> 'BaseTask':
945963
"""
946964
Search for the best pipeline configuration for the given dataset.
947965
@@ -1033,7 +1051,6 @@ def _search(self, optimize_metric: str,
10331051
total_walltime_limit=total_walltime_limit)
10341052

10351053
self._adapt_time_resource_allocation()
1036-
self.num_run = 1
10371054
self._run_dummy_predictions()
10381055

10391056
if enable_traditional_pipeline:
@@ -1098,7 +1115,7 @@ def refit(
10981115
'train_indices': dataset.splits[split_id][0],
10991116
'val_indices': dataset.splits[split_id][1],
11001117
'split_id': split_id,
1101-
'num_run': 0
1118+
'num_run': self._backend.get_next_num_run(),
11021119
})
11031120
X.update({**self.pipeline_options, **budget_config})
11041121
if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None:
@@ -1175,7 +1192,7 @@ def fit(self,
11751192
'train_indices': dataset.splits[split_id][0],
11761193
'val_indices': dataset.splits[split_id][1],
11771194
'split_id': split_id,
1178-
'num_run': 0
1195+
'num_run': self._backend.get_next_num_run(),
11791196
})
11801197
X.update({**self.pipeline_options, **budget_config})
11811198

0 commit comments

Comments (0)