Source code for ddl.datasets

"""Simple module to generate toy datasets."""
import warnings

import numpy as np
# noinspection PyProtectedMember
from sklearn.utils import check_random_state, shuffle


def make_toy_data(data_name, n_samples=1000, random_state=None, **maker_kwargs):
    """Make simple toy datasets.

    Useful for illustrating density destructors.

    Parameters
    ----------
    data_name : str
        Should be one of the following strings:
        ``{'concentric_circles', 'grid', 'gaussian_grid',
        'manifold_gaussian_grid', 'uniform_grid', 'rotated_uniform',
        'autoregressive', 'sin_wave', 'rbig_sin_wave', 'quadratic'}``.

    n_samples : int, default=1000
        Number of samples to make.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, `random_state` is the seed used by the random number
        generator; If :class:`~numpy.random.RandomState` instance,
        `random_state` is the random number generator; If None, the random
        number generator is the :class:`~numpy.random.RandomState` instance
        used by :mod:`numpy.random`. Note that the ``uniform_grid`` and
        ``manifold_gaussian_grid`` makers overwrite this value with a
        fixed seed.

    maker_kwargs : dict, optional
        Other keyword arguments to pass to the associated maker.

    Returns
    -------
    data : object
        Data object with the following attributes::

            X : array-like with shape (n_samples, n_features)
            y : array-like with shape (n_samples,) or None
            data_name : the `data_name` that was requested
            is_canonical_domain : bool, whether domain is [0, 1]

    Raises
    ------
    ValueError
        If no maker is registered under ``'_make_' + data_name``.

    """
    # Wrap the operand in a 1-tuple so that a tuple-valued ``data_name`` is
    # formatted as a single value instead of being unpacked by ``%``.
    maker_name = '_make_%s' % (data_name,)
    try:
        maker = _makers_dict[maker_name]
    except KeyError:
        # The KeyError is an implementation detail of the lookup table, so
        # hide it from the traceback shown to the caller.
        raise ValueError('Invalid data_name of "%s"' % (data_name,)) from None
    X, y, is_canonical_domain = maker(n_samples, random_state=random_state,
                                      **maker_kwargs)
    return _Data(X=X, y=y, data_name=data_name,
                 is_canonical_domain=is_canonical_domain)
class _Data(object):
    """Simple class to hold data values and attributes.

    Every keyword argument passed to the constructor becomes an instance
    attribute with the same name.
    """

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)


def _make_rotated_uniform(n_samples, scale=None, Q=None, random_state=0):
    """Sample a centered 2D uniform box, scale each axis and multiply by `Q`.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    scale : array-like, optional
        Per-feature multiplier applied to the uniform draws in
        ``[-0.5, 0.5)``. Defaults to ``np.array([1, 3])``.
    Q : array-like, optional
        Matrix that the scaled samples are right-multiplied by. Defaults to
        the Q factor of the QR decomposition of a random Gaussian matrix
        drawn from the same random number generator.
    random_state : int, RandomState instance or None, default=0
        Passed to ``check_random_state``.

    Returns
    -------
    X : ndarray
    y : None
    is_canonical_domain : bool
        Always False.
    """
    n_features = 2
    rng = check_random_state(random_state)
    if scale is None:
        scale = np.array([1, 3])
    if Q is None:
        # Drawn before the samples so the stream order matches for a seed.
        Q = np.linalg.qr(rng.randn(n_features, n_features))[0]
    U = rng.rand(n_samples, n_features) - 0.5
    X = np.dot(U * scale, Q)
    return X, None, False


def _make_autoregressive(
        n_samples, func=None, x_scale=2, y_std=1, flip_x_y=False,
        x_distribution='uniform', random_state=0
):
    """Sample ``x`` and then ``y = func(x) + noise``.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    func : callable, optional
        Applied to the vector of ``x`` values. Defaults to ``np.sin``.
    x_scale : float, default=2
        Multiplier applied to the raw ``x`` draws.
    y_std : float, default=1
        Multiplier applied to the standard normal noise added to ``y``.
    flip_x_y : bool, default=False
        If True the columns of the result are ``(y, x)`` instead of
        ``(x, y)``.
    x_distribution : {'uniform', 'gaussian', 'abs-gaussian'}
        How the raw ``x`` values are drawn.
    random_state : int, RandomState instance or None, default=0
        Passed to ``check_random_state``.

    Returns
    -------
    X : ndarray
    y : None
    is_canonical_domain : bool
        Always False.

    Raises
    ------
    ValueError
        If `x_distribution` is not one of the supported names.
    """
    if func is None:
        func = np.sin
    rng = check_random_state(random_state)

    # Get x values
    if x_distribution == 'gaussian':
        x = x_scale * rng.randn(n_samples)
    elif x_distribution == 'abs-gaussian':
        x = np.abs(x_scale * rng.randn(n_samples))
    elif x_distribution == 'uniform':
        x = x_scale * rng.rand(n_samples)
    else:
        raise ValueError('x_distribution should be "gaussian", "uniform", or "abs-gaussian"')

    # Compute y from x and add some noise
    y = func(x) + y_std * rng.randn(n_samples)

    # Flip x and y
    if flip_x_y:
        X = np.array([y, x]).T
    else:
        X = np.array([x, y]).T
    return X, None, False


def _make_sin_wave(n_samples, x_scale=2 * np.pi, y_std=0.2, **kwargs):
    """Autoregressive data with ``func`` fixed to ``np.sin``.

    Remaining keyword arguments are forwarded to ``_make_autoregressive``.

    Raises
    ------
    ValueError
        If ``func`` is supplied, since this maker always uses ``np.sin``.
    """
    if 'func' in kwargs:
        raise ValueError('func is overridden by _make_sin_wave')
    return _make_autoregressive(n_samples, func=np.sin, x_scale=x_scale,
                                y_std=y_std, **kwargs)


def _make_rbig_sin_wave(n_samples, random_state=0):
    """Sine wave with ``x_scale=2``, ``y_std=0.25`` and abs-gaussian ``x``."""
    # Example from [Laparra et al. 2011]
    # Code at https://www.uv.es/vista/vistavalencia/RBIG.htm
    return _make_sin_wave(n_samples, x_scale=2, y_std=0.25,
                          x_distribution='abs-gaussian',
                          random_state=random_state)


def _make_quadratic(n_samples, random_state=0):
    """Autoregressive data with ``y = x ** 2 / 4 + noise`` and flipped columns."""
    # Example from [Papamakarios et al. 2017]
    # `random_state` must be forwarded explicitly; otherwise the caller's
    # seed would be silently replaced by the default of the underlying maker.
    return _make_autoregressive(n_samples, func=lambda x: (1 / 4) * x ** 2,
                                x_distribution='gaussian', x_scale=2, y_std=1,
                                flip_x_y=True, random_state=random_state)


def _make_grid(n_samples, n_grid=5, sigma=None, Q=None, perc_filled=0.5,
               random_state=0, kind='gaussian'):
    """Sample from a mixture placed on a subset of an ``n_grid x n_grid`` grid.

    Parameters
    ----------
    n_samples : int
        Total number of rows to generate across all components.
    n_grid : int, default=5
        Number of grid points along each of the two axes.
    sigma : float, optional
        Multiplier of the standard normal noise when ``kind='gaussian'``
        (defaults to 0.2). Ignored, with a warning, when ``kind='uniform'``.
    Q : array-like, optional
        Matrix that the grid positions are right-multiplied by. Defaults
        to the identity.
    perc_filled : float, default=0.5
        Fraction of the ``n_grid ** 2`` grid positions that receive a
        component; the count is rounded with ``np.round``.
    random_state : int, RandomState instance or None, default=0
        Passed to ``check_random_state``.
    kind : {'gaussian', 'uniform'}
        ``'gaussian'`` centers normal noise on integer grid positions;
        ``'uniform'`` fills cells of width ``1 / n_grid`` whose corners
        are spaced evenly over ``[0, 1)``.

    Returns
    -------
    X : ndarray
    y : ndarray
        Component index of every row (see ``_get_y_and_shuffle``).
    is_canonical_domain : bool
        True for ``kind='uniform'``, False for ``kind='gaussian'``.

    Raises
    ------
    ValueError
        If `kind` is unknown, or if `perc_filled` selects fewer than one or
        more than ``n_grid ** 2`` grid positions.
    """
    rng = check_random_state(random_state)
    n_features = 2
    if Q is None:
        Q = np.eye(n_features, n_features)

    if kind == 'gaussian':
        if sigma is None:
            sigma = 0.2
        query = np.array(range(n_grid))
        is_bounded = False

        def sample(pos, n, d):
            return sigma * rng.randn(n, d) + pos
    elif kind == 'uniform':
        if sigma is not None:
            warnings.warn('sigma is ignored when kind="uniform"')
        query = np.linspace(0, 1, n_grid, endpoint=False)
        scale = 1 / n_grid
        is_bounded = True

        def sample(pos, n, d):
            return scale * rng.rand(n, d) + pos
    else:
        raise ValueError('kind should be "gaussian" or "uniform"')

    X_grid, Y_grid = np.meshgrid(query, query)
    positions = np.array([X_grid.ravel(), Y_grid.ravel()]).T
    positions = np.dot(positions, Q)

    # Filter to only certain components based on percent filled
    n_cells = n_grid ** 2
    n_components = int(np.round(perc_filled * n_cells))
    if not 1 <= n_components <= n_cells:
        # Zero components would divide by zero below, and more components
        # than cells would silently drop samples when zipping.
        raise ValueError('perc_filled should select between 1 and '
                         'n_grid ** 2 grid positions')
    perm_idx = rng.permutation(n_cells)
    positions = positions[perm_idx[:n_components], :]

    # Split the samples evenly (in expectation) across the kept components.
    n_per_component = rng.multinomial(
        n_samples, 1 / n_components * np.ones(n_components))
    X = np.vstack([
        sample(pos, n, n_features)
        for pos, n in zip(positions, n_per_component)
    ])
    X, y = _get_y_and_shuffle(X, n_per_component)
    return X, y, is_bounded


def _make_gaussian_grid(n_samples, **kwargs):
    """Call ``_make_grid`` with ``kind`` forced to ``'gaussian'``."""
    kwargs['kind'] = 'gaussian'
    return _make_grid(n_samples, **kwargs)


def _make_manifold_gaussian_grid(n_samples, sigma=0.05, **kwargs):
    """Call ``_make_grid`` with a small default ``sigma`` and a fixed seed."""
    # NOTE: any caller-supplied random_state is overwritten with 2 here.
    kwargs['random_state'] = 2
    return _make_grid(n_samples, sigma=sigma, **kwargs)


def _make_uniform_grid(n_samples, **kwargs):
    """Call ``_make_grid`` with ``kind='uniform'`` and a fixed seed."""
    kwargs['kind'] = 'uniform'
    # NOTE: any caller-supplied random_state is overwritten with 1 here.
    kwargs['random_state'] = 1
    return _make_grid(n_samples, **kwargs)


def _make_concentric_circles(n_samples, n_circles=4, noise_std=0.1,
                             random_state=0):
    """Sample noisy points on circles of radius ``1, 2, ..., n_circles``.

    Parameters
    ----------
    n_samples : int
        Total number of rows to generate across all circles.
    n_circles : int, default=4
        Number of circles.
    noise_std : float, default=0.1
        Multiplier of the standard normal noise added to each radius.
    random_state : int, RandomState instance or None, default=0
        Passed to ``check_random_state``.

    Returns
    -------
    X : ndarray
    y : ndarray
        Circle index of every row (see ``_get_y_and_shuffle``).
    is_canonical_domain : bool
        Always False.
    """
    rng = check_random_state(random_state)
    radius = np.array(range(n_circles)) + 1
    # Allocate samples in proportion to each circle's circumference.
    circum = 2 * np.pi * radius
    perc_per_circle = circum / np.sum(circum)
    n_per_component = rng.multinomial(n_samples, perc_per_circle)
    # All angles are drawn before any radial noise, so the order of draws
    # from `rng` is: counts, then angles, then noise.
    theta = [
        2 * np.pi * rng.rand(n)
        for n in n_per_component
    ]
    X = np.vstack([
        ((r + noise_std * rng.randn(len(th)))
         * np.array([np.cos(th), np.sin(th)])).T
        for th, r in zip(theta, radius)
    ])
    X, y = _get_y_and_shuffle(X, n_per_component)
    return X, y, False


def _get_y_and_shuffle(X, n_per_component):
    """Build per-row component labels and shuffle `X` and the labels together.

    Parameters
    ----------
    X : ndarray
        Rows grouped by component, in component order.
    n_per_component : sequence of int
        Number of rows belonging to each component.

    Returns
    -------
    X : ndarray
        Shuffled rows.
    y : ndarray
        Float labels; component ``j`` is labeled ``j``.
    """
    y = np.concatenate([
        j * np.ones(n)
        for j, n in enumerate(n_per_component)
    ])
    # Shuffle (the seed is fixed, so the permutation depends only on length)
    X, y = shuffle(X, y, random_state=0)
    return X, y


# Registry of maker functions keyed by their own names. Iterate over a
# snapshot of the module namespace so that the namespace is never walked
# while it is being modified, and keep only callables whose name carries the
# ``_make_`` prefix that ``make_toy_data`` looks up.
_makers_dict = {
    key: val
    for key, val in list(globals().items())
    if key.startswith('_make_') and callable(val)
}