# Source code for ddl.deep

"""Deep destructors module."""
from __future__ import division, print_function

import collections
import logging
import warnings
from itertools import cycle, islice

import numpy as np
from sklearn.base import clone
from sklearn.model_selection import check_cv
from sklearn.utils.validation import check_array, check_X_y

# noinspection PyProtectedMember
from .base import (CompositeDestructor, IdentityDestructor, _check_global_random_state,
                   create_implicit_density)

# Module-level logger named after this module; the debug messages of
# ``DeepDestructorCV`` are emitted through it (unless ``silent`` is set).
logger = logging.getLogger(__name__)


class DeepDestructor(CompositeDestructor):
    """Destructor formed by composing copies of some atomic destructor.

    This destructor creates a dynamic composite destructor that includes
    an optional initial destructor (parameter `init_destructor`) followed
    by multiple copies of a canonical destructor (parameter
    `canonical_destructor`). The `init_destructor` is often used for
    preprocessing steps such as standardization. If the training data's
    domain/support is not the unit hypercube, an initial destructor is
    required---this initial destructor should have a domain that matches
    the training data (by the definition of a destructor, the range of
    the destructor is the unit hypercube and thus the initial destructor
    will project the data onto the canonical domain).

    This is a relatively thin wrapper around
    :class:`~ddl.base.CompositeDestructor` that creates copies of the
    canonical destructor to create a deep composite destructor with
    destructors (or "layers") that are similar in structure because they
    have the same hyperparameters.

    See Also
    --------
    DeepDestructorCV
        A deep destructor whose number of destructors/layers is chosen
        automatically based on cross-validation test likelihood.
    ddl.base.CompositeDestructor

    Parameters
    ----------
    canonical_destructor : estimator or list
        The canonical destructor(s) that will be cloned to build up a
        deep destructor. Parameter `canonical_destructor` can be a list
        of canonical destructors. The list will be cycled through to get
        as many canonical destructors as needed. If None, an
        :class:`~ddl.base.IdentityDestructor` is used.

    init_destructor : estimator, optional
        Initial destructor (e.g. preprocessing or just to project to
        canonical domain).

    n_canonical_destructors : int, default=1
        Number of cloned canonical destructors to add to the deep
        destructor.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, `random_state` is the seed used by the random number
        generator; If :class:`~numpy.random.RandomState` instance,
        `random_state` is the random number generator; If None, the
        random number generator is the :class:`~numpy.random.RandomState`
        instance used by :mod:`numpy.random`.

    Attributes
    ----------
    fitted_destructors_ : list
        List of fitted (sub)destructors. See `fitted_destructors_` of
        :class:`~ddl.base.CompositeDestructor`.

    density_ : estimator
        *Implicit* density of deep destructor.

    """

    # noinspection PyMissingConstructor
    def __init__(self, canonical_destructor=None, init_destructor=None,
                 n_canonical_destructors=1, random_state=None):
        self.canonical_destructor = canonical_destructor
        self.init_destructor = init_destructor
        self.n_canonical_destructors = n_canonical_destructors
        self.random_state = random_state

    def _get_canonical_destructors(self):
        """Get canonical destructors as a sequence and handle single case.

        Returns
        -------
        canonical_destructors : list, tuple or array
            The value of `canonical_destructor` if it already is a
            collection of destructors; otherwise a one-element list that
            wraps the single destructor (or a new `IdentityDestructor`
            when `canonical_destructor` is None).

        """
        canonical_destructors = self.canonical_destructor
        if canonical_destructors is None:
            canonical_destructors = IdentityDestructor()

        # Anything exposing `get_params` is treated as ONE estimator. Checking
        # this first avoids handing a sequence-like estimator (one defining
        # `__len__`/`__getitem__`) to `np.array`, which would unpack it and
        # make it look like a list of destructors.
        if hasattr(canonical_destructors, 'get_params'):
            return [canonical_destructors]

        # Plain lists/tuples are already collections; skipping `np.array` here
        # also avoids coercion problems when the elements are sequence-like.
        if isinstance(canonical_destructors, (list, tuple)):
            return canonical_destructors

        # Fallback for everything else (e.g. arrays): a zero-dimensional
        # value is a single item and gets wrapped in a list.
        if len(np.array(canonical_destructors).shape) < 1:
            canonical_destructors = [canonical_destructors]
        return canonical_destructors

    def _get_destructor_iterable(self):
        """Build the unfitted destructors that make up the deep destructor.

        Returns
        -------
        destructors : array of object, shape (n_destructors,)
            Clones of the initial destructor (if any) followed by
            `n_canonical_destructors` clones obtained by cycling through
            the canonical destructors.

        """
        templates = []
        if self.init_destructor is not None:
            templates.append(self.init_destructor)
        templates.extend(_take(cycle(self._get_canonical_destructors()),
                               self.n_canonical_destructors))

        # Fill a pre-allocated object array one element at a time so that the
        # result is always one-dimensional; `np.array([...])` would try to
        # unpack sequence-like estimators into extra dimensions.
        destructors = np.empty(len(templates), dtype=object)
        for idx, template in enumerate(templates):
            destructors[idx] = clone(template)
        return destructors
class DeepDestructorCV(DeepDestructor):
    """Deep destructor whose number of destructors/layers is determined by CV.

    Nearly the same as `DeepDestructor` except that the number of
    canonical destructors (i.e. the number of layers) is automatically
    determined using cross validation. The likelihood of held-out data
    in each CV fold is used to determine the number of layers.

    This destructor is computationally more efficient than using
    :class:`sklearn.model_selection.GridSearchCV` because the deep
    destructor can be built one layer at a time and the test likelihood
    can be accumulated one layer at a time.

    See Also
    --------
    DeepDestructor

    Parameters
    ----------
    canonical_destructor : estimator or list
        The canonical destructor(s) that will be cloned to build up a
        deep destructor. Parameter `canonical_destructor` can be a list
        of canonical destructors. The list will be cycled through to get
        as many canonical destructors as needed.

    init_destructor : estimator, optional
        Initial destructor (e.g. preprocessing or just to project to
        canonical domain).

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy. The value is
        passed to :func:`sklearn.model_selection.check_cv`.
        Possible inputs for cv are:

        - None, to use the default K-fold cross validation of the
          installed scikit-learn (3-fold in older releases, 5-fold
          since scikit-learn 0.22),
        - integer, to specify the number of folds in a `KFold`,
        - An object to be used as a cross-validation generator.
        - An iterable yielding train, test splits.

    stop_tol : float, default=1e-3
        Relative difference at which to stop adding destructors. For
        example, if set to 0.0, then the algorithm will stop if the
        test log likelihood ever decreases.

    max_canonical_destructors : int or None, default=None
        The maximum number of canonical destructors to add to the deep
        destructor. The initial destructor (if any) is *not* counted
        against this limit. If set to None, then the number of
        destructors is unbounded.

    n_extend : int, default=1
        The number of destructors/layers to extend even after the
        stopping tolerance defined by `stop_tol` has been reached. This
        could be useful if the destructors are random or not guaranteed
        to always increase likelihood. If `n_extend` is 1, then the
        optimization will stop as soon as the test log likelihood
        decreases. Must be at least 1.

    refit : bool, default=True
        Whether to refit the entire deep destructor with the selected
        number of layers or just extract the fit from the first fold.

    silent : bool, default=False
        If True, suppress the debug messages that are otherwise emitted
        via :mod:`logging`. Note that logging messages are not output to
        standard out automatically. Please see the Python module
        :mod:`logging` for more information.

    log_prefix : str, default=''
        Prefix of debug logging messages. See `silent` parameter.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, `random_state` is the seed used by the random number
        generator; If :class:`~numpy.random.RandomState` instance,
        `random_state` is the random number generator; If None, the
        random number generator is the :class:`~numpy.random.RandomState`
        instance used by :mod:`numpy.random`.

    Attributes
    ----------
    fitted_destructors_ : array, shape = [n_layers]
        Array of fitted destructors. See `fitted_destructors_` of
        `base.CompositeDestructor`.

    density_ : estimator
        *Implicit* density of deep destructor.

    cv_train_scores_ : array, shape = [n_layers, n_splits]
        Cross validation train scores, accumulated over layers. Here
        `n_layers` is the largest number of layers fitted on any split.

    cv_test_scores_ : array, shape = [n_layers, n_splits]
        Cross validation scores on the held-out part of each split,
        accumulated over layers.

    best_n_layers_ : int
        Best number of layers as selected by cross validation.

    """

    # noinspection PyMissingConstructor
    def __init__(self, canonical_destructor=None, init_destructor=None, cv=None,
                 stop_tol=1e-3, max_canonical_destructors=None, n_extend=1, refit=True,
                 silent=False, log_prefix='', random_state=None):
        self.canonical_destructor = canonical_destructor
        self.init_destructor = init_destructor
        self.cv = cv
        self.stop_tol = stop_tol
        self.max_canonical_destructors = max_canonical_destructors
        self.n_extend = n_extend
        self.silent = silent
        self.log_prefix = log_prefix
        self.refit = refit
        self.random_state = random_state

    @_check_global_random_state
    def fit(self, X, y=None, X_test=None, first_score_zero=False, **fit_params):
        """Fit the deep destructor, selecting the number of layers by CV.

        Layers are added split by split until the stopping rule fires,
        then every split is extended to the same depth so that the
        cumulative held-out scores can be averaged. The depth with the
        best average cumulative held-out score is selected.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        y : array-like, shape (n_samples,), optional
            Targets. If given, `X` and `y` are validated together and
            `y` is passed on to the `fit`, `score` and `transform`
            calls of every layer.

        X_test : array-like, optional
            Extra held-out data. If given, every layer is additionally
            scored on it and the result is included in the debug log.
            These scores are not used to select the number of layers.
            No test targets are forwarded by this method.

        first_score_zero : bool, default=False
            Hack so that init destructor is not taken into account for
            determining when to stop for classifier destructors. The
            scores of the first layer are replaced by 0.

        fit_params : dict, optional
            Accepted for API compatibility; not used by this method.

        Returns
        -------
        self : object
            The fitted estimator.

        Raises
        ------
        ValueError
            If `n_extend` is smaller than 1.
        RuntimeError
            If the collected CV scores do not have shape
            `(n_splits, n_layers, 2)`.

        """
        # Setup parameters
        if self.n_extend < 1:
            raise ValueError('n_extend should be greater than or equal to 1')
        if y is not None:
            X, y = check_X_y(X, y)
        else:
            X = check_array(X)
        cv = check_cv(self.cv)
        splits = list(cv.split(X))

        # CV path fit and transform: one (initially empty) list of fitted
        # destructors and one list of cumulative scores per split
        cv_destructors_arr = [[] for _ in splits]
        scores_arr = [[] for _ in splits]
        cv_destructors_arr, scores_arr, splits = self._fit_cv_destructors(
            X, y, cv_destructors_arr, scores_arr, splits, X_test=X_test,
            first_score_zero=first_score_zero)

        # Add layers as needed up to max # of layers of all splits
        if not self.silent:
            logger.debug(self.log_prefix + 'Fitting extra needed layers')
        best_n_layers_over_folds = max(
            len(cv_destructors) for cv_destructors in cv_destructors_arr)
        cv_destructors_arr, scores_arr, splits = self._fit_cv_destructors(
            X, y, cv_destructors_arr, scores_arr, splits, X_test=X_test,
            selected_n_layers=best_n_layers_over_folds, first_score_zero=first_score_zero)

        # Determine best number of layers
        scores_mat = np.array(scores_arr)
        # Compare shapes as tuples: an elementwise comparison against an array
        # cannot be broadcast when the number of dimensions differs, which
        # would hide the intended error below.
        expected_shape = (len(splits), best_n_layers_over_folds, 2)
        if tuple(scores_mat.shape) != expected_shape:
            raise RuntimeError('scores_mat does not seem to be the correct shape')
        scores_avg = np.mean(scores_mat, axis=0)  # Average over different splits
        cum_validation_avg = scores_avg[:, 1].ravel()
        best_n_layers = int(1 + np.argmax(cum_validation_avg))  # Best cumulative score
        best_score = np.max(cum_validation_avg)

        # Final fitting with best # of layers
        if self.refit:
            if not self.silent:
                logger.debug(self.log_prefix
                             + 'Fitting final model with %d layers with score=%g'
                             % (best_n_layers, best_score))
            destructors = []
            Z = X.copy()
            for i, d in enumerate(islice(iter(self._get_destructor_iterable()),
                                         best_n_layers)):
                d.fit(Z, y)
                score = d.score(Z, y)
                # Only the first layer's score is zeroed, matching the
                # behavior of the cross-validation path.
                if first_score_zero and i == 0:
                    score = 0
                Z = d.transform(Z, y)
                destructors.append(d)
                if not self.silent:
                    logger.debug(self.log_prefix
                                 + '(Final fit layer=%d) local layer score=%g'
                                 % (i + 1, score))
        else:
            # Use already fitted destructor from CV array
            if len(cv_destructors_arr) > 1:
                warnings.warn('refit=False but len(cv_destructors_arr) > 1 so just using fitted '
                              'destructors from the first split.')
            destructors = np.array(cv_destructors_arr[0])[:best_n_layers]

        self.fitted_destructors_ = np.array(destructors)
        self.density_ = create_implicit_density(self)
        # scores_mat is (n_splits, n_layers, 2); expose as (n_layers, n_splits)
        self.cv_train_scores_ = scores_mat[:, :, 0].transpose()
        self.cv_test_scores_ = scores_mat[:, :, 1].transpose()
        self.best_n_layers_ = best_n_layers
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit estimator to X and then transform X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : None, default=None
            Passed on to `fit` and `transform`.

        fit_params : dict, optional
            Parameters to pass to the fit method.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_features)
            Transformed data.

        """
        self.fit(X, y, **fit_params)
        return self.transform(X, y)

    def _fit_cv_destructors(self, X, y, cv_destructors_arr, scores_arr, splits, X_test=None,
                            y_test=None, selected_n_layers=None, first_score_zero=False):
        """Grow the per-split deep destructors layer by layer.

        The lists inside `cv_destructors_arr` and `scores_arr` are
        extended in place. A split that already holds fitted layers is
        resumed: its data is pushed through the existing layers and new
        layers are appended after them.

        Parameters
        ----------
        X, y : arrays
            Full training data (and optional targets) indexed by the
            entries of `splits`.

        cv_destructors_arr : list of list
            Fitted destructors per split.

        scores_arr : list of list
            Per split, one cumulative `[train, validation]` score pair
            per fitted layer.

        splits : list of (train, validation) index pairs

        X_test, y_test : arrays, optional
            Extra data that is scored for logging only.

        selected_n_layers : int, optional
            If given, every split is extended to exactly this many
            layers (or to the global maximum, whichever comes first) and
            the `stop_tol` rule is not applied.

        first_score_zero : bool
            If True, the scores of the first layer are replaced by 0.

        Returns
        -------
        cv_destructors_arr, scores_arr, splits
            The same objects that were passed in.

        """
        compute_test = X_test is not None
        for i, (cv_destructors, scores, (train, validation)) in enumerate(
                zip(cv_destructors_arr, scores_arr, splits)):
            Z_train = X[train, :].copy()
            Z_validation = X[validation, :].copy()
            if y is not None:
                y_train = y[train]
                y_validation = y[validation]
            else:
                y_train = None
                y_validation = None
            Z_test = X_test.copy() if compute_test else None

            # If some destructors are already fit
            destructor_iterator = iter(self._get_destructor_iterable())
            if selected_n_layers is not None and len(cv_destructors) == selected_n_layers:
                # Don't need to fit any more destructors
                if not self.silent:
                    logger.debug(self.log_prefix
                                 + 'Already done fitting cv=%i deep destructor' % i)
                continue
            elif len(cv_destructors) > 0:
                if not self.silent:
                    logger.debug(self.log_prefix
                                 + 'Re-fitting extra destructors for cv=%i deep destructor' % i)
                # Pop off destructors that were already fit from the destructor iterator
                _consume(destructor_iterator, len(cv_destructors))
                # Replay the existing layers so the data is at the current depth
                for d in cv_destructors:
                    Z_train = d.transform(Z_train, y_train)
                    Z_validation = d.transform(Z_validation, y_validation)
                    if compute_test:
                        Z_test = d.transform(Z_test, y_test)

            stop = False
            # NOTE: restarts at 0 when a split is resumed, so the logged
            # cumulative test score then only covers the newly added layers.
            cum_test_score = 0
            while not stop:  # Add layers until all are ready to stop
                # Fit only on training data
                destructor = next(destructor_iterator)
                destructor.fit(Z_train, y_train)

                # Score and then transform data
                is_first_layer = len(cv_destructors) == 0
                train_score = destructor.score(Z_train, y_train)
                validation_score = destructor.score(Z_validation, y_validation)
                if first_score_zero and is_first_layer:
                    train_score = 0
                    validation_score = 0
                Z_train = destructor.transform(Z_train, y_train)
                Z_validation = destructor.transform(Z_validation, y_validation)
                if compute_test:
                    test_score = destructor.score(Z_test, y_test)
                    if first_score_zero and is_first_layer:
                        test_score = 0
                    Z_test = destructor.transform(Z_test, y_test)
                    cum_test_score += test_score
                    test_score_str = ' test=%g, cum_test=%g' % (test_score, cum_test_score)
                else:
                    test_score_str = ''
                if not self.silent:
                    logger.debug(self.log_prefix
                                 + '(CV sp=%d, L=%d) Scores: train=%g val=%g%s'
                                 % (i + 1, len(cv_destructors) + 1, train_score,
                                    validation_score, test_score_str))

                # Update cv_destructors
                cv_destructors.append(destructor)

                # Maintain cumulative scores
                previous_scores = scores[-1] if len(scores) > 0 else np.array([0, 0])
                cum_scores = previous_scores + np.array([train_score, validation_score])
                scores.append(cum_scores)

                # Stop if global max layers is reached
                if self.max_canonical_destructors is not None:
                    global_max_layers = self.max_canonical_destructors
                    if self.init_destructor is not None:
                        # The initial destructor does not count against the limit
                        global_max_layers += 1
                    # `>=` rather than `==` so that a limit below 1 still
                    # terminates the loop after the first layer.
                    if len(cv_destructors) >= global_max_layers:
                        stop = True
                        continue

                if selected_n_layers is not None:
                    # Keep going until the requested number of layers is fit,
                    # no matter what (i.e. don't check n_extend in this case)
                    if len(cv_destructors) == selected_n_layers:
                        stop = True
                elif len(cv_destructors) > self.n_extend:
                    # If we have n_extend + 1 layers then check cumulative scores
                    cur_score = scores[-1][1]
                    max_previous_scores = np.max([sc[1] for sc in scores[:-self.n_extend]])
                    if max_previous_scores == 0:
                        rel_diff = cur_score - max_previous_scores
                    else:
                        rel_diff = (cur_score - max_previous_scores) / np.abs(
                            max_previous_scores)
                    if not self.silent:
                        logger.debug(self.log_prefix
                                     + '(CV sp=%d, L=%d) Relative diff=%g'
                                     % (i + 1, len(cv_destructors), rel_diff))
                    if rel_diff < self.stop_tol:
                        # The most recent cumulative score does not improve
                        # enough on the best score among all layers except the
                        # last n_extend ones, so stop
                        stop = True
        return cv_destructors_arr, scores_arr, splits

    def _get_destructor_iterable(self):
        """Yield an infinite sequence of destructors.

        The sequence starts with a clone of the initial destructor (if
        any) and continues with clones obtained by cycling through the
        canonical destructors.
        """

        def _destructor_generator():
            if self.init_destructor is not None:
                yield clone(self.init_destructor)
            for d in cycle(self._get_canonical_destructors()):
                yield clone(d)

        return _destructor_generator()
def _take(iterable, n):
    """Collect at most the first ``n`` items of ``iterable`` into a list."""
    head = islice(iterable, n)
    return list(head)


def _consume(iterator, n):
    """Advance ``iterator`` by ``n`` items; exhaust it when ``n`` is None.

    Nothing is returned and running out of items early is not an error.
    """
    if n is not None:
        # An empty slice positioned at ``n`` skips the first ``n`` items; the
        # default passed to ``next`` absorbs the resulting StopIteration.
        skipper = islice(iterator, n, n)
        next(skipper, None)
        return
    # A zero-length deque discards every item it is fed.
    collections.deque(iterator, maxlen=0)