diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index c5468eae7..e50862948 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -128,16 +128,6 @@ class BaseTask(ABC): Number of threads to use for each process. logging_config (Optional[Dict]): Specifies configuration for logging, if None, it is loaded from the logging.yaml - ensemble_size (int: default=50): - Number of models added to the ensemble built by - Ensemble selection from libraries of models. - Models are drawn with replacement. - ensemble_nbest (int: default=50): - Only consider the ensemble_nbest models to build the ensemble - max_models_on_disc (int: default=50): - Maximum number of models saved to disc. It also controls the size of - the ensemble as any additional models will be deleted. - Must be greater than or equal to 1. temporary_directory (str): Folder to store configuration output and log file output_directory (str): @@ -173,9 +163,6 @@ def __init__( n_jobs: int = 1, n_threads: int = 1, logging_config: Optional[Dict] = None, - ensemble_size: int = 50, - ensemble_nbest: int = 50, - max_models_on_disc: int = 50, temporary_directory: Optional[str] = None, output_directory: Optional[str] = None, delete_tmp_folder_after_terminate: bool = True, @@ -195,9 +182,6 @@ def __init__( self.seed = seed self.n_jobs = n_jobs self.n_threads = n_threads - self.ensemble_size = ensemble_size - self.ensemble_nbest = ensemble_nbest - self.max_models_on_disc = max_models_on_disc self.logging_config: Optional[Dict] = logging_config self.include_components: Optional[Dict] = include_components self.exclude_components: Optional[Dict] = exclude_components @@ -227,6 +211,7 @@ def __init__( self._scoring_functions: Optional[List[autoPyTorchMetric]] = None self._logger: Optional[PicklableClientLogger] = None self.dataset_name: Optional[str] = None + self.dataset: Optional[BaseDataset] = None self.cv_models_: Dict = {} self._results_manager = ResultsManager() @@ -700,23 +685,26 @@ 
def _load_best_individual_model(self) -> SingleBest: run_history=self.run_history, backend=self._backend, ) - if self._logger is None: - warnings.warn( - "No valid ensemble was created. Please check the log" - "file for errors. Default to the best individual estimator:{}".format( - ensemble.identifiers_ - ) - ) - else: - self._logger.exception( - "No valid ensemble was created. Please check the log" - "file for errors. Default to the best individual estimator:{}".format( - ensemble.identifiers_ - ) - ) return ensemble + def _cleanup(self) -> None: + """ + Closes the different servers created during api search. + Returns: + None + """ + if hasattr(self, '_logger') and self._logger is not None: + self._logger.info("Closing the dask infrastructure") + self._close_dask_client() + self._logger.info("Finished closing the dask infrastructure") + + # Clean up the logger + self._logger.info("Starting to clean up the logger") + self._clean_logger() + else: + self._close_dask_client() + def _do_dummy_prediction(self) -> None: assert self._metric is not None @@ -914,6 +902,35 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs: save_external=True) return + def run_traditional_ml( + self, + current_task_name: str, + runtime_limit: int, + func_eval_time_limit_secs: int + ) -> None: + """ + This function can be used to run the suite of traditional machine + learning models during the current task (for e.g, ensemble fit, search) + + Args: + current_task_name (str): name of the current task, + runtime_limit (int): time limit for fitting traditional models, + func_eval_time_limit_secs (int): Time limit + for a single call to the machine learning model. + Model fitting will be terminated if the machine + learning algorithm runs over the time limit. 
+ """ + assert self._logger is not None # for mypy compliancy + traditional_task_name = 'runTraditional' + self._stopwatch.start_task(traditional_task_name) + elapsed_time = self._stopwatch.wall_elapsed(current_task_name) + time_for_traditional = int(runtime_limit - elapsed_time) + self._do_traditional_prediction( + func_eval_time_limit_secs=func_eval_time_limit_secs, + time_left=time_for_traditional, + ) + self._stopwatch.stop_task(traditional_task_name) + def _search( self, optimize_metric: str, @@ -934,6 +951,9 @@ def _search( load_models: bool = True, portfolio_selection: Optional[str] = None, dask_client: Optional[dask.distributed.Client] = None, + ensemble_size: int = 50, + ensemble_nbest: int = 50, + max_models_on_disc: int = 50, **kwargs: Any ) -> 'BaseTask': """ @@ -1062,6 +1082,16 @@ def _search( Additionally, the keyword 'greedy' is supported, which would use the default portfolio from `AutoPyTorch Tabular `_ + ensemble_size (int: default=50): + Number of models added to the ensemble built by + Ensemble selection from libraries of models. + Models are drawn with replacement. + ensemble_nbest (int: default=50): + Only consider the ensemble_nbest models to build the ensemble + max_models_on_disc (int: default=50): + Maximum number of models saved to disc. It also controls the size of + the ensemble as any additional models will be deleted. + Must be greater than or equal to 1. kwargs: Any additional arguments that are customed by some specific task. 
For instance, forecasting tasks require: @@ -1070,6 +1100,7 @@ def _search( hyperparameters are determined by the default configurations custom_init_setting_path (str): The path to the initial hyperparameter configurations set by the users + Returns: self @@ -1102,13 +1133,14 @@ def _search( self._disable_file_output = disable_file_output if disable_file_output is not None else [] if ( DisableFileOutputParameters.y_optimization in self._disable_file_output - and self.ensemble_size > 1 + and ensemble_size > 1 ): self._logger.warning(f"No ensemble will be created when {DisableFileOutputParameters.y_optimization}" f" is in disable_file_output") self._memory_limit = memory_limit self._time_for_task = total_walltime_limit + # Save start time to backend self._backend.save_start_time(str(self.seed)) @@ -1172,7 +1204,7 @@ def _search( # Make sure that at least 2 models are created for the ensemble process num_models = time_left_for_modelfit // func_eval_time_limit_secs - if num_models < 2 and self.ensemble_size > 0: + if num_models < 2 and ensemble_size > 0: func_eval_time_limit_secs = time_left_for_modelfit // 2 self._logger.warning( "Capping the func_eval_time_limit_secs to {} to have " @@ -1182,66 +1214,48 @@ def _search( ) # ============> Run dummy predictions - dummy_task_name = 'runDummy' - self._stopwatch.start_task(dummy_task_name) - self._do_dummy_prediction() - self._stopwatch.stop_task(dummy_task_name) + # We only want to run dummy predictions in case we want to build an ensemble + if ensemble_size > 0: + dummy_task_name = 'runDummy' + self._stopwatch.start_task(dummy_task_name) + self._do_dummy_prediction() + self._stopwatch.stop_task(dummy_task_name) # ============> Run traditional ml - - if enable_traditional_pipeline: - traditional_task_name = 'runTraditional' - self._stopwatch.start_task(traditional_task_name) - elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) - # We want time for at least 1 Neural network in SMAC - time_for_traditional = int( 
- self._time_for_task - elapsed_time - func_eval_time_limit_secs - ) - self._do_traditional_prediction( - func_eval_time_limit_secs=func_eval_time_limit_secs, - time_left=time_for_traditional, - ) - self._stopwatch.stop_task(traditional_task_name) + # We only want to run traditional predictions in case we want to build an ensemble + # We want time for at least 1 Neural network in SMAC + if enable_traditional_pipeline and ensemble_size > 0: + traditional_runtime_limit = int(self._time_for_task - func_eval_time_limit_secs) + self.run_traditional_ml(current_task_name=self.dataset_name, + runtime_limit=traditional_runtime_limit, + func_eval_time_limit_secs=func_eval_time_limit_secs) # ============> Starting ensemble + self.precision = precision + self.opt_metric = optimize_metric elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) time_left_for_ensembles = max(0, total_walltime_limit - elapsed_time) proc_ensemble = None if time_left_for_ensembles <= 0: # Fit only raises error when ensemble_size is not zero but # time_left_for_ensembles is zero. - if self.ensemble_size > 0: + if ensemble_size > 0: raise ValueError("Not starting ensemble builder because there " "is no time left. 
Try increasing the value " "of time_left_for_this_task.") - elif self.ensemble_size <= 0: + elif ensemble_size <= 0: self._logger.info("Not starting ensemble builder as ensemble size is 0") else: self._logger.info("Starting ensemble") ensemble_task_name = 'ensemble' self._stopwatch.start_task(ensemble_task_name) - proc_ensemble = EnsembleBuilderManager( - start_time=time.time(), - time_left_for_ensembles=time_left_for_ensembles, - backend=copy.deepcopy(self._backend), - dataset_name=str(dataset.dataset_name), - output_type=STRING_TO_OUTPUT_TYPES[dataset.output_type], - task_type=STRING_TO_TASK_TYPES[self.task_type], - metrics=[self._metric], - opt_metric=optimize_metric, - ensemble_size=self.ensemble_size, - ensemble_nbest=self.ensemble_nbest, - max_models_on_disc=self.max_models_on_disc, - seed=self.seed, - max_iterations=None, - read_at_most=sys.maxsize, - ensemble_memory_limit=self._memory_limit, - random_state=self.seed, - precision=precision, - logger_port=self._logger_port, - pynisher_context=self._multiprocessing_context, - metrics_kwargs=self._metrics_kwargs, - ) + proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_ensembles, + ensemble_size=ensemble_size, + ensemble_nbest=ensemble_nbest, + precision=precision, + optimize_metric=self.opt_metric, + max_models_on_disc=max_models_on_disc + ) self._stopwatch.stop_task(ensemble_task_name) # ==> Run SMAC @@ -1311,22 +1325,7 @@ def _search( self._logger.info("Starting Shutdown") if proc_ensemble is not None: - self._results_manager.ensemble_performance_history = list(proc_ensemble.history) - - if len(proc_ensemble.futures) > 0: - # Also add ensemble runs that did not finish within smac time - # and add them into the ensemble history - self._logger.info("Ensemble script still running, waiting for it to finish.") - result = proc_ensemble.futures.pop().result() - if result: - ensemble_history, _, _, _ = result - self._results_manager.ensemble_performance_history.extend(ensemble_history) 
- self._logger.info("Ensemble script finished, continue shutdown.") - - # save the ensemble performance history file - if len(self.ensemble_performance_history) > 0: - pd.DataFrame(self.ensemble_performance_history).to_json( - os.path.join(self._backend.internals_directory, 'ensemble_history.json')) + self._collect_results_ensemble(proc_ensemble) self._logger.info("Closing the dask infrastructure") self._close_dask_client() @@ -1337,9 +1336,15 @@ def _search( self._load_models() self._logger.info("Finished loading models...") - # Clean up the logger - self._logger.info("Starting to clean up the logger") - self._clean_logger() + if isinstance(self.ensemble_, SingleBest) and ensemble_size > 0: + self._logger.warning( + "No valid ensemble was created. Please check the log" + "file for errors. Default to the best individual estimator:{}".format( + self.ensemble_.identifiers_ + ) + ) + + self._cleanup() return self @@ -1440,7 +1445,7 @@ def refit( split_id=split_id) fit_and_suppress_warnings(self._logger, model, X, y=None) - self._clean_logger() + self._cleanup() return self @@ -1688,7 +1693,7 @@ def fit_pipeline( disable_file_output=disable_file_output ) - self._clean_logger() + self._cleanup() return fitted_pipeline, run_info, run_value, dataset @@ -1723,6 +1728,240 @@ def _get_fitted_pipeline( budget=float(run_info.budget), ) + def fit_ensemble( + self, + optimize_metric: Optional[str] = None, + precision: Optional[int] = None, + ensemble_nbest: int = 50, + ensemble_size: int = 50, + max_models_on_disc: int = 50, + load_models: bool = True, + time_for_task: int = 100, + func_eval_time_limit_secs: int = 50, + enable_traditional_pipeline: bool = True, + ) -> 'BaseTask': + """ + Enables post-hoc fitting of the ensemble after the `search()` + method is finished. This method creates an ensemble using all + the models stored on disk during the smbo run. + + Args: + optimize_metric (str): name of the metric that is used to + evaluate a pipeline. 
if not specified, value passed to search will be used + precision (Optional[int]): Numeric precision used when loading + ensemble data. Can be either 16, 32 or 64. + ensemble_size (int: default=50): + Number of models added to the ensemble built by + Ensemble selection from libraries of models. + Models are drawn with replacement. + ensemble_nbest (int: default=50): + Only consider the ensemble_nbest models to build the ensemble + max_models_on_disc (int: default=50): + Maximum number of models saved to disc. It also controls the size of + the ensemble as any additional models will be deleted. + Must be greater than or equal to 1. + enable_traditional_pipeline (bool), (default=True): + We fit traditional machine learning algorithms + (LightGBM, CatBoost, RandomForest, ExtraTrees, KNN, SVM) + prior building PyTorch Neural Networks. You can disable this + feature by turning this flag to False. All machine learning + algorithms that are fitted during search() are considered for + ensemble building. + load_models (bool), (default=True): Whether to load the + models after fitting AutoPyTorch. + time_for_task (int), (default=100): Time limit + in seconds for the search of appropriate models. + By increasing this value, autopytorch has a higher + chance of finding better models. + func_eval_time_limit_secs (int), (default=None): Time limit + for a single call to the machine learning model. + Model fitting will be terminated if the machine + learning algorithm runs over the time limit. Set + this value high enough so that typical machine + learning algorithms can be fit on the training + data. + When set to None, this time will automatically be set to + total_walltime_limit // 2 to allow enough time to fit + at least 2 individual machine learning algorithms. + Set to np.inf in case no time limit is desired. 
+ + Returns: + self + """ + # Make sure that input is valid + if self.dataset is None or self.opt_metric is None: + raise ValueError("fit_ensemble() can only be called after `search()`. " + "Please call the `search()` method of {} prior to " + "fit_ensemble().".format(self.__class__.__name__)) + + precision = precision if precision is not None else self.precision + if precision not in [16, 32, 64]: + raise ValueError("precision must be one of 16, 32, 64 but got {}".format(precision)) + + if self._logger is None: + self._logger = self._get_logger(self.dataset.dataset_name) + + # Create a client if needed + if self._dask_client is None: + self._create_dask_client() + else: + self._is_dask_client_internally_created = False + + ensemble_fit_task_name = 'EnsembleFit' + self._stopwatch.start_task(ensemble_fit_task_name) + if enable_traditional_pipeline: + if func_eval_time_limit_secs > time_for_task: + self._logger.warning( + 'Time limit for a single run is higher than total time ' + 'limit. Capping the limit for a single run to the total ' + 'time given to Ensemble fit (%f)' % time_for_task + ) + func_eval_time_limit_secs = time_for_task + + # Make sure that at least 2 models are created for the ensemble process + num_models = time_for_task // func_eval_time_limit_secs + if num_models < 2: + func_eval_time_limit_secs = time_for_task // 2 + self._logger.warning( + "Capping the func_eval_time_limit_secs to {} to have " + "time for at least 2 models to ensemble.".format( + func_eval_time_limit_secs + ) + ) + # ============> Run Dummy predictions + dummy_task_name = 'runDummy' + self._stopwatch.start_task(dummy_task_name) + self._do_dummy_prediction() + self._stopwatch.stop_task(dummy_task_name) + + # ============> Run traditional ml + if enable_traditional_pipeline: + self.run_traditional_ml(current_task_name=ensemble_fit_task_name, + runtime_limit=time_for_task, + func_eval_time_limit_secs=func_eval_time_limit_secs) + + elapsed_time = 
self._stopwatch.wall_elapsed(ensemble_fit_task_name) + time_left_for_ensemble = int(time_for_task - elapsed_time) + manager = self._init_ensemble_builder( + time_left_for_ensembles=time_left_for_ensemble, + optimize_metric=self.opt_metric if optimize_metric is None else optimize_metric, + precision=precision, + ensemble_size=ensemble_size, + ensemble_nbest=ensemble_nbest, + max_models_on_disc=max_models_on_disc + ) + + manager.build_ensemble(self._dask_client) + if manager is not None: + self._collect_results_ensemble(manager) + + if load_models: + self._load_models() + + self._stopwatch.stop_task(ensemble_fit_task_name) + + self._cleanup() + + return self + + def _init_ensemble_builder( + self, + time_left_for_ensembles: float, + optimize_metric: str, + ensemble_nbest: int, + ensemble_size: int, + max_models_on_disc: int = 50, + precision: int = 32, + ) -> EnsembleBuilderManager: + """ + Initializes an `EnsembleBuilderManager`. + Args: + time_left_for_ensembles (float): + Time (in seconds) allocated to building the ensemble + optimize_metric (str): + Name of the metric to optimize the ensemble. + ensemble_size (int): + Number of models added to the ensemble built by + Ensemble selection from libraries of models. + Models are drawn with replacement. + ensemble_nbest (int): + Only consider the ensemble_nbest models to build the ensemble + max_models_on_disc (int: default=50): + Maximum number of models saved to disc. It also controls the size of + the ensemble as any additional models will be deleted. + Must be greater than or equal to 1. + precision (int: default=32): Numeric precision used when loading + ensemble data. Can be either 16, 32 or 64. + + Returns: + EnsembleBuilderManager + """ + if self._logger is None: + raise ValueError("logger should be initialized to fit ensemble") + if self.dataset is None: + raise ValueError("ensemble can only be initialised after or during `search()`. 
" + "Please call the `search()` method of {}.".format(self.__class__.__name__)) + + self._logger.info("Starting ensemble") + ensemble_task_name = 'ensemble' + self._stopwatch.start_task(ensemble_task_name) + + # Use the current thread to start the ensemble builder process + # The function ensemble_builder_process will internally create a ensemble + # builder in the provide dask client + required_dataset_properties = {'task_type': self.task_type, + 'output_type': self.dataset.output_type} + proc_ensemble = EnsembleBuilderManager( + start_time=time.time(), + time_left_for_ensembles=time_left_for_ensembles, + backend=copy.deepcopy(self._backend), + dataset_name=str(self.dataset.dataset_name), + output_type=STRING_TO_OUTPUT_TYPES[self.dataset.output_type], + task_type=STRING_TO_TASK_TYPES[self.task_type], + metrics=get_metrics( + dataset_properties=required_dataset_properties, + names=[optimize_metric]), + opt_metric=optimize_metric, + ensemble_size=ensemble_size, + ensemble_nbest=ensemble_nbest, + max_models_on_disc=max_models_on_disc, + seed=self.seed, + max_iterations=None, + read_at_most=sys.maxsize, + ensemble_memory_limit=self._memory_limit, + random_state=self.seed, + precision=precision, + logger_port=self._logger_port, + ) + self._stopwatch.stop_task(ensemble_task_name) + + return proc_ensemble + + def _collect_results_ensemble( + self, + manager: EnsembleBuilderManager + ) -> None: + + if self._logger is None: + raise ValueError("logger should be initialized to fit ensemble") + + self._results_manager.ensemble_performance_history = list(manager.history) + + if len(manager.futures) > 0: + # Also add ensemble runs that did not finish within smac time + # and add them into the ensemble history + self._logger.info("Ensemble script still running, waiting for it to finish.") + result = manager.futures.pop().result() + if result: + ensemble_history, _, _, _ = result + self._results_manager.ensemble_performance_history.extend(ensemble_history) + 
self._logger.info("Ensemble script finished, continue shutdown.") + + # save the ensemble performance history file + if len(self.ensemble_performance_history) > 0: + pd.DataFrame(self.ensemble_performance_history).to_json( + os.path.join(self._backend.internals_directory, 'ensemble_history.json')) + def predict( self, X_test: np.ndarray, @@ -1774,7 +2013,7 @@ def predict( predictions = self.ensemble_.predict(all_predictions) - self._clean_logger() + self._cleanup() return predictions diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index facb59f99..3ce8ef216 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -39,18 +39,6 @@ class TabularClassificationTask(BaseTask): number of threads to use for each process. logging_config (Optional[Dict]): Specifies configuration for logging, if None, it is loaded from the logging.yaml - ensemble_size (int: default=50): - Number of models added to the ensemble built by - Ensemble selection from libraries of models. - Models are drawn with replacement. - ensemble_nbest (int: default=50): - Only consider the ensemble_nbest - models to build the ensemble - max_models_on_disc (int: default=50): - Maximum number of models saved to disc. - Also, controls the size of the ensemble - as any additional models will be deleted. - Must be greater than or equal to 1. 
temporary_directory (str): Folder to store configuration output and log file output_directory (str): @@ -85,9 +73,6 @@ def __init__( n_jobs: int = 1, n_threads: int = 1, logging_config: Optional[Dict] = None, - ensemble_size: int = 50, - ensemble_nbest: int = 50, - max_models_on_disc: int = 50, temporary_directory: Optional[str] = None, output_directory: Optional[str] = None, delete_tmp_folder_after_terminate: bool = True, @@ -104,9 +89,6 @@ def __init__( n_jobs=n_jobs, n_threads=n_threads, logging_config=logging_config, - ensemble_size=ensemble_size, - ensemble_nbest=ensemble_nbest, - max_models_on_disc=max_models_on_disc, temporary_directory=temporary_directory, output_directory=output_directory, delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate, @@ -260,6 +242,9 @@ def search( load_models: bool = True, portfolio_selection: Optional[str] = None, dataset_compression: Union[Mapping[str, Any], bool] = False, + ensemble_size: int = 50, + ensemble_nbest: int = 50, + max_models_on_disc: int = 50, ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -429,6 +414,18 @@ def search( Subsampling takes into account classification labels and stratifies accordingly. We guarantee that at least one occurrence of each label is included in the sampled set. + ensemble_size (int: default=50): + Number of models added to the ensemble built by + Ensemble selection from libraries of models. + Models are drawn with replacement. + ensemble_nbest (int: default=50): + Only consider the ensemble_nbest + models to build the ensemble + max_models_on_disc (int: default=50): + Maximum number of models saved to disc. + Also, controls the size of the ensemble + as any additional models will be deleted. + Must be greater than or equal to 1. 
Returns: self @@ -464,6 +461,9 @@ def search( disable_file_output=disable_file_output, load_models=load_models, portfolio_selection=portfolio_selection, + ensemble_size=ensemble_size, + ensemble_nbest=ensemble_nbest, + max_models_on_disc=max_models_on_disc, ) def predict( diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index e0c1e4eac..3c8f42aad 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -39,18 +39,6 @@ class TabularRegressionTask(BaseTask): number of threads to use for each process. logging_config (Optional[Dict]): Specifies configuration for logging, if None, it is loaded from the logging.yaml - ensemble_size (int: default=50): - Number of models added to the ensemble built by - Ensemble selection from libraries of models. - Models are drawn with replacement. - ensemble_nbest (int: default=50): - Only consider the ensemble_nbest - models to build the ensemble - max_models_on_disc (int: default=50): - Maximum number of models saved to disc. - Also, controls the size of the ensemble - as any additional models will be deleted. - Must be greater than or equal to 1. 
temporary_directory (str): Folder to store configuration output and log file output_directory (str): @@ -86,9 +74,6 @@ def __init__( n_jobs: int = 1, n_threads: int = 1, logging_config: Optional[Dict] = None, - ensemble_size: int = 50, - ensemble_nbest: int = 50, - max_models_on_disc: int = 50, temporary_directory: Optional[str] = None, output_directory: Optional[str] = None, delete_tmp_folder_after_terminate: bool = True, @@ -105,9 +90,6 @@ def __init__( n_jobs=n_jobs, n_threads=n_threads, logging_config=logging_config, - ensemble_size=ensemble_size, - ensemble_nbest=ensemble_nbest, - max_models_on_disc=max_models_on_disc, temporary_directory=temporary_directory, output_directory=output_directory, delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate, @@ -259,7 +241,10 @@ def search( disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, dataset_compression: Union[Mapping[str, Any], bool] = False, + ensemble_size: int = 50, + ensemble_nbest: int = 50, + max_models_on_disc: int = 50, ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -429,6 +414,18 @@ def search( Subsampling takes into account classification labels and stratifies accordingly. We guarantee that at least one occurrence of each label is included in the sampled set. + ensemble_size (int: default=50): + Number of models added to the ensemble built by + Ensemble selection from libraries of models. + Models are drawn with replacement. 
+ ensemble_nbest (int: default=50): + Only consider the ensemble_nbest + models to build the ensemble + max_models_on_disc (int: default=50): + Maximum number of models saved to disc. + Also, controls the size of the ensemble + as any additional models will be deleted. + Must be greater than or equal to 1. Returns: self @@ -465,6 +462,9 @@ def search( disable_file_output=disable_file_output, load_models=load_models, portfolio_selection=portfolio_selection, + ensemble_size=ensemble_size, + ensemble_nbest=ensemble_nbest, + max_models_on_disc=max_models_on_disc, ) def predict( diff --git a/examples/40_advanced/example_posthoc_ensemble_fit.py b/examples/40_advanced/example_posthoc_ensemble_fit.py new file mode 100644 index 000000000..801f35bf1 --- /dev/null +++ b/examples/40_advanced/example_posthoc_ensemble_fit.py @@ -0,0 +1,82 @@ +""" +===================================================== +Tabular Classification with Post-Hoc Ensemble Fitting +===================================================== + +The following example shows how to fit a sample classification model +and create an ensemble post-hoc with AutoPyTorch +""" +import os +import tempfile as tmp +import warnings + +os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() +os.environ['OMP_NUM_THREADS'] = '1' +os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ['MKL_NUM_THREADS'] = '1' + +warnings.simplefilter(action='ignore', category=UserWarning) +warnings.simplefilter(action='ignore', category=FutureWarning) + +import sklearn.datasets +import sklearn.model_selection + +from autoPyTorch.api.tabular_classification import TabularClassificationTask + + +############################################################################ +# Data Loading +# ============ +X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, + y, + random_state=42, +) + 
+############################################################################ +# Build and fit a classifier +# ========================== +api = TabularClassificationTask( + seed=42, +) + +############################################################################ +# Search for the best neural network +# ================================== +api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='accuracy', + total_walltime_limit=100, + func_eval_time_limit_secs=50, + ensemble_size=0, +) + +############################################################################ +# Print the final performance of the incumbent neural network +# =========================================================== +print(api.run_history, api.trajectory) +y_pred = api.predict(X_test) +score = api.score(y_pred, y_test) +print(score) + +############################################################################ +# Fit an ensemble with the neural networks fitted during the search +# ================================================================= + +api.fit_ensemble(ensemble_size=5, + # Set the enable_traditional_pipeline=True + # to also include traditional models + # in the ensemble + enable_traditional_pipeline=False) +# Print the final ensemble built by AutoPyTorch +y_pred = api.predict(X_test) +score = api.score(y_pred, y_test) +print(score) +print(api.show_models()) + +# Print statistics from search +print(api.sprint_statistics()) \ No newline at end of file diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 465d74c6b..d3bb71119 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -609,7 +609,6 @@ def test_tabular_input_support(openml_id, backend): estimator = TabularClassificationTask( backend=backend, resampling_strategy=HoldoutValTypes.holdout_validation, - ensemble_size=0, ) estimator._do_dummy_prediction = unittest.mock.MagicMock() @@ -624,6 +623,7 @@ def 
test_tabular_input_support(openml_id, backend): func_eval_time_limit_secs=50, enable_traditional_pipeline=False, load_models=False, + ensemble_size=0, ) @@ -633,7 +633,6 @@ def test_do_dummy_prediction(dask_client, fit_dictionary_tabular): estimator = TabularClassificationTask( backend=backend, resampling_strategy=HoldoutValTypes.holdout_validation, - ensemble_size=0, ) # Setup pre-requisites normally set by search() @@ -759,7 +758,6 @@ def test_do_traditional_pipeline(fit_dictionary_tabular): estimator = TabularClassificationTask( backend=backend, resampling_strategy=HoldoutValTypes.holdout_validation, - ensemble_size=0, ) # Setup pre-requisites normally set by search() diff --git a/test/test_api/test_base_api.py b/test/test_api/test_base_api.py index bb8f9c061..edc9499d7 100644 --- a/test/test_api/test_base_api.py +++ b/test/test_api/test_base_api.py @@ -12,7 +12,10 @@ from autoPyTorch.api.base_task import BaseTask, _pipeline_predict from autoPyTorch.constants import TABULAR_CLASSIFICATION, TABULAR_REGRESSION +from autoPyTorch.datasets.base_dataset import BaseDataset from autoPyTorch.datasets.resampling_strategy import NoResamplingStrategyTypes +from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager +from autoPyTorch.pipeline.components.training.metrics.metrics import accuracy from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline @@ -115,7 +118,7 @@ def test_set_pipeline_config(): ]) def test_pipeline_get_budget(fit_dictionary_tabular, min_budget, max_budget, budget_type, expected): BaseTask.__abstractmethods__ = set() - estimator = BaseTask(task_type='tabular_classification', ensemble_size=0) + estimator = BaseTask(task_type='tabular_classification') # Fixture pipeline config default_pipeline_config = { @@ -138,7 +141,7 @@ def test_pipeline_get_budget(fit_dictionary_tabular, min_budget, max_budget, bud smac_mock.return_value = smac estimator._search(optimize_metric='accuracy', dataset=dataset, 
tae_func=pipeline_fit, min_budget=min_budget, max_budget=max_budget, budget_type=budget_type, - enable_traditional_pipeline=False, + ensemble_size=0, enable_traditional_pipeline=False, total_walltime_limit=20, func_eval_time_limit_secs=10, load_models=False) assert list(smac_mock.call_args)[1]['ta_kwargs']['pipeline_config'] == default_pipeline_config @@ -201,3 +204,32 @@ def test_pipeline_get_budget_forecasting(fit_dictionary_forecasting, min_budget, assert list(smac_mock.call_args)[1]['ta_kwargs']['pipeline_config'] == default_pipeline_config assert list(smac_mock.call_args)[1]['max_budget'] == max_budget assert list(smac_mock.call_args)[1]['initial_budget'] == min_budget + + +def test_init_ensemble_builder(backend): + BaseTask.__abstractmethods__ = set() + estimator = BaseTask( + backend=backend, + ) + + # Setup pre-requisites normally set by search() + estimator._logger = estimator._get_logger('test') + estimator.task_type = "tabular_classification" + estimator._memory_limit = 60 + estimator.dataset = MagicMock(spec=BaseDataset) + estimator.dataset.output_type = 'binary' + estimator.dataset.dataset_name = 'dummy' + + proc_ensemble = estimator._init_ensemble_builder( + time_left_for_ensembles=60, + optimize_metric='accuracy', + ensemble_nbest=10, + ensemble_size=5) + + assert isinstance(proc_ensemble, EnsembleBuilderManager) + assert proc_ensemble.opt_metric == 'accuracy' + assert proc_ensemble.metrics[0] == accuracy + + estimator._cleanup() + + del estimator