Source code for ddl.externals.mlpack._mlpack_estimators
"""Private module for loading mlpack estimators."""
from __future__ import division, print_function
import logging
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.utils import check_array
try:
from ._det import PyDTree
except ImportError:
# Just ignoring import error because mlpack isn't required
pass
logger = logging.getLogger(__name__)
[docs]class MlpackDensityTreeEstimator(BaseEstimator):
"""Density tree estimator via mlpack (mlpack.org).
This estimator leverages the methods for Density Estimation Trees (DET,
see Ram & Gray 2011 paper below) that are implemented in mlpack (see the
DET method in mlpack's documentation at `mlpack.org`_). Essentially,
this class provides a simple wrapper around the C++ functions in mlpack
and thus must be compiled with mlpack source code.
.. _`mlpack.org`: http://mlpack.org/
Parameters
----------
max_leaf_nodes : int or None, default=None
Maximum number of leaf nodes in final tree. The tree will be fully
grown based on `min_samples_leaf` and then pruned until the number
of leaf nodes is less than `max_leaf_nodes`. If None,
then `max_leaf_nodes` is considered to be infinite. This parameter
can be useful for simple regularization of the density tree.
max_depth : int or None, default=None
Maximum depth of final tree. The tree will be fully grown based on
`min_samples_leaf` and then pruned until the depth of the tree is
less than `max_depth`. If None, then `max_depth` is considered to be
infinite. This parameter can be useful for simple regularization of
the density tree.
min_samples_leaf : int, default=1
Minimum number of samples required at all leaf nodes. Main parameter
for growing the tree initially before pruning. This parameter is
mainly here for computational reasons on large datasets. This
parameter could also be used as regularization.
Attributes
----------
tree_ : arrayed_tree
The tree structure represented using arrays similar to the trees
used in sklearn (e.g. :class:`sklearn.tree.DecisionTreeClassifier`).
References
----------
Ram, P. and Gray, A. G. Density Estimation Trees. In Proceedings of the
17th ACM SIGKDD International Conference on Knowledge Discovery and Data
Mining, 2011.
"""
[docs] def __init__(self, max_leaf_nodes=None, max_depth=None, min_samples_leaf=1):
self.max_leaf_nodes = max_leaf_nodes
self.max_depth = max_depth
self.min_samples_leaf = min_samples_leaf
[docs] def fit(self, X, y=None):
"""Fit estimator to X.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Training data, where `n_samples` is the number of samples and
`n_features` is the number of features.
y : None, default=None
Not used in the fitting process but kept for compatibility.
Returns
-------
self : estimator
Returns the instance itself.
"""
fit_params = dict() # Leave as defaults unless overriden
if self.max_leaf_nodes is not None:
fit_params['max_leaf_nodes'] = self.max_leaf_nodes
if self.max_depth is not None:
fit_params['max_depth'] = self.max_depth
if self.min_samples_leaf is not None:
# Note the different parameter name
fit_params['min_leaf_size'] = self.min_samples_leaf
# Setup canonical decision tree
X = check_array(X)
n_samples, n_features = X.shape
try:
py_dtree = PyDTree(min_vals=np.zeros(n_features), max_vals=np.ones(n_features),
total_points=n_samples)
# Make a copy so original data is not mutated when passed to fit below
py_dtree.fit(X.copy(), **fit_params)
except NameError:
raise RuntimeError(
'Mlpack estimator fitting failed because either mlpack or '
'the corresponding wrappers were not installed correctly.')
# Extract arrayed representation of the tree (similar representation to sklearn)
self.tree_ = py_dtree.get_arrayed_tree()
return self