# Source code for ddl.datasets

"""Simple module to generate toy datasets."""
import warnings

import numpy as np
# noinspection PyProtectedMember
from sklearn.utils import check_random_state, shuffle


def make_toy_data(data_name, n_samples=1000, random_state=None, **maker_kwargs):
    """Make simple toy datasets.

    Useful for illustrating density destructors.

    Parameters
    ----------
    data_name : str
        Name of the dataset; the private maker ``_make_<data_name>`` is
        looked up and called. Should be one of the following strings:
        ``{'concentric_circles', 'grid', 'gaussian_grid',
        'manifold_gaussian_grid', 'uniform_grid', 'rotated_uniform',
        'autoregressive', 'sin_wave', 'rbig_sin_wave', 'quadratic'}``.

    n_samples : int, default=1000
        Number of samples to make.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, `random_state` is the seed used by the random number
        generator; If :class:`~numpy.random.RandomState` instance,
        `random_state` is the random number generator; If None, the random
        number generator is the :class:`~numpy.random.RandomState` instance
        used by :mod:`numpy.random`. The value is always forwarded to the
        maker, but some makers (e.g. ``'uniform_grid'`` and
        ``'manifold_gaussian_grid'``) replace it with a fixed seed.

    maker_kwargs : dict, optional
        Other keyword arguments to pass to the associated maker.

    Returns
    -------
    data : object
        Data object with the following attributes::

            X : array-like with shape (n_samples, n_features)
            y : array-like with shape (n_samples,) or None
            data_name : str, the `data_name` that was requested
            is_canonical_domain : bool, whether domain is [0, 1]

    Raises
    ------
    ValueError
        If no maker is registered for `data_name`.

    """
    # Only the registry lookup is guarded, so a KeyError raised inside a maker
    # is never mistaken for an unknown dataset name.
    try:
        maker = _makers_dict['_make_%s' % data_name]
    except KeyError:
        raise ValueError('Invalid data_name of "%s"' % data_name)
    X, y, is_canonical_domain = maker(n_samples, random_state=random_state, **maker_kwargs)
    return _Data(X=X, y=y, data_name=data_name, is_canonical_domain=is_canonical_domain)


class _Data(object):
    """Plain attribute container for a generated dataset.

    Every keyword argument given to the constructor becomes an instance
    attribute of the same name.
    """

    def __init__(self, **kwargs):
        # Write straight into the instance dictionary, one entry per keyword.
        attributes = vars(self)
        for name, value in kwargs.items():
            attributes[name] = value


def _make_rotated_uniform(n_samples, scale=None, Q=None, random_state=0):
    """Sample a centered 2D uniform box, scale it per axis and multiply by `Q`.

    Parameters
    ----------
    n_samples : int
        Number of samples to draw.
    scale : array-like, optional
        Per-feature scale applied to the centered uniform samples.
        Defaults to ``[1, 3]``.
    Q : array-like with shape (2, 2), optional
        Matrix the scaled samples are right-multiplied by. If None, the Q
        factor of the QR decomposition of a random Gaussian matrix is used.
    random_state : int, RandomState instance or None, default=0
        Passed to ``check_random_state``.

    Returns
    -------
    X : array with shape (n_samples, 2)
    y : None
    is_canonical_domain : bool
        Always False.
    """
    n_dims = 2
    rng = check_random_state(random_state)
    scale = np.array([1, 3]) if scale is None else scale
    if Q is None:
        # Drawn before the uniform samples; the order of RNG calls fixes the
        # output for a given seed.
        gaussian = rng.randn(n_dims, n_dims)
        Q = np.linalg.qr(gaussian)[0]
    centered = rng.rand(n_samples, n_dims) - 0.5
    rotated = np.dot(centered * scale, Q)
    return rotated, None, False


def _make_autoregressive(
        n_samples, func=None, x_scale=2, y_std=1, flip_x_y=False,
        x_distribution='uniform', random_state=0
):
    """Sample 2D points where one coordinate is a noisy function of the other.

    Parameters
    ----------
    n_samples : int
        Number of samples to draw.
    func : callable, optional
        Function applied to ``x`` to get the noiseless ``y``.
        Defaults to ``np.sin``.
    x_scale : float, default=2
        Multiplier applied to the raw ``x`` draws.
    y_std : float, default=1
        Multiplier of the standard normal noise added to ``func(x)``.
    flip_x_y : bool, default=False
        If True the columns of the output are ``(y, x)`` instead of
        ``(x, y)``.
    x_distribution : {'uniform', 'gaussian', 'abs-gaussian'}
        How the raw ``x`` values are drawn.
    random_state : int, RandomState instance or None, default=0
        Passed to ``check_random_state``.

    Returns
    -------
    X : array with shape (n_samples, 2)
    y : None
    is_canonical_domain : bool
        Always False.

    Raises
    ------
    ValueError
        If `x_distribution` is not one of the supported names.
    """
    if func is None:
        func = np.sin
    rng = check_random_state(random_state)

    # Draw the independent coordinate.
    if x_distribution == 'uniform':
        x = x_scale * rng.rand(n_samples)
    elif x_distribution == 'gaussian':
        x = x_scale * rng.randn(n_samples)
    elif x_distribution == 'abs-gaussian':
        x = np.abs(x_scale * rng.randn(n_samples))
    else:
        raise ValueError('x_distribution should be "gaussian", "uniform", or "abs-gaussian"')

    # Dependent coordinate: deterministic part first, then additive noise.
    signal = func(x)
    noise = y_std * rng.randn(n_samples)
    y = signal + noise

    # Optionally swap which coordinate comes first.
    columns = [y, x] if flip_x_y else [x, y]
    return np.array(columns).T, None, False


def _make_sin_wave(n_samples, x_scale=2 * np.pi, y_std=0.2, **kwargs):
    """Autoregressive toy data with ``func`` fixed to ``np.sin``.

    Remaining keyword arguments are forwarded to ``_make_autoregressive``.

    Raises
    ------
    ValueError
        If the caller tries to pass ``func``.
    """
    if 'func' in kwargs:
        raise ValueError('func is overridden by _make_sin_wave')
    # x_scale and y_std are named parameters, so they can never already be
    # present in kwargs; merging them in cannot clobber a caller value.
    kwargs.update(func=np.sin, x_scale=x_scale, y_std=y_std)
    return _make_autoregressive(n_samples, **kwargs)


def _make_rbig_sin_wave(n_samples, random_state=0):
    """Sine-wave toy data with the settings of the RBIG example.

    Example from [Laparra et al. 2011]; code at
    https://www.uv.es/vista/vistavalencia/RBIG.htm
    """
    settings = dict(
        x_scale=2,
        y_std=0.25,
        x_distribution='abs-gaussian',
        random_state=random_state,
    )
    return _make_sin_wave(n_samples, **settings)


def _make_quadratic(n_samples, random_state=0):
    """Quadratic autoregressive toy data.

    Example from [Papamakarios et al. 2017]. ``x`` is Gaussian with scale 2,
    ``y = x ** 2 / 4`` plus unit-variance Gaussian noise, and the output
    columns are flipped to ``(y, x)``.

    Parameters
    ----------
    n_samples : int
        Number of samples to draw.
    random_state : int, RandomState instance or None, default=0
        Forwarded to ``_make_autoregressive``.

    Returns
    -------
    X, y, is_canonical_domain
        Whatever ``_make_autoregressive`` returns.
    """
    def quarter_square(x):
        return 0.25 * x ** 2

    # random_state must be forwarded explicitly; otherwise the callee's own
    # default seed is used and the caller's value is silently ignored.
    return _make_autoregressive(n_samples, func=quarter_square,
                                x_distribution='gaussian', x_scale=2, y_std=1,
                                flip_x_y=True, random_state=random_state)


def _make_grid(n_samples, n_grid=5, sigma=None, Q=None, perc_filled=0.5, random_state=0,
               kind='gaussian'):
    """Sample 2D points from a randomly chosen subset of cells of a square grid.

    Parameters
    ----------
    n_samples : int
        Total number of samples, split over the selected components by a
        multinomial draw with equal probabilities.
    n_grid : int, default=5
        Number of grid positions along each axis (``n_grid ** 2`` cells).
    sigma : float, optional
        Standard deviation of each component when ``kind='gaussian'``
        (defaults to 0.2). Ignored, with a warning, when ``kind='uniform'``.
    Q : array-like with shape (2, 2), optional
        Matrix the grid positions are right-multiplied by. Defaults to the
        identity. Only the component positions are transformed; the
        per-component noise is added afterwards.
    perc_filled : float, default=0.5
        Fraction of the grid cells that receive a component. The number of
        components is ``round(perc_filled * n_grid ** 2)``.
    random_state : int, RandomState instance or None, default=0
        Passed to ``check_random_state``.
    kind : {'gaussian', 'uniform'}
        ``'gaussian'`` places Gaussian blobs at integer positions
        ``0..n_grid-1``; ``'uniform'`` fills cells of width ``1 / n_grid``
        whose lower corners lie in ``[0, 1)``.

    Returns
    -------
    X : array with shape (n_samples, 2)
    y : array with shape (n_samples,)
        Index of the component each sample came from.
    is_canonical_domain : bool
        True for ``kind='uniform'``, False for ``kind='gaussian'``.
        NOTE(review): with a non-identity `Q` the uniform samples may leave
        [0, 1]; the flag does not account for that.

    Raises
    ------
    ValueError
        If `kind` is unknown, or if `perc_filled` and `n_grid` do not yield
        between 1 and ``n_grid ** 2`` components.
    """
    rng = check_random_state(random_state)
    n_features = 2

    # Validate up front: zero components would divide by zero below, and more
    # components than cells would make zip() silently drop samples.
    n_cells = n_grid ** 2
    n_components = int(np.round(perc_filled * n_cells))
    if not 1 <= n_components <= n_cells:
        raise ValueError(
            'perc_filled * n_grid ** 2 must round to a value between 1 and '
            'n_grid ** 2 (got %d components for %d grid cells)'
            % (n_components, n_cells))

    if Q is None:
        Q = np.eye(n_features, n_features)
    if kind == 'gaussian':
        if sigma is None:
            sigma = 0.2
        query = np.arange(n_grid)
        is_bounded = False

        def sample(pos, n, d):
            return sigma * rng.randn(n, d) + pos
    elif kind == 'uniform':
        if sigma is not None:
            warnings.warn('sigma is ignored when kind="uniform"')
        query = np.linspace(0, 1, n_grid, endpoint=False)
        # Float literal keeps this a true division on every interpreter.
        scale = 1.0 / n_grid
        is_bounded = True

        def sample(pos, n, d):
            return scale * rng.rand(n, d) + pos
    else:
        raise ValueError('kind should be "gaussian" or "uniform"')

    X_grid, Y_grid = np.meshgrid(query, query)
    positions = np.array([X_grid.ravel(), Y_grid.ravel()]).T
    positions = np.dot(positions, Q)

    # Keep only a random subset of the cells, based on percent filled.
    perm_idx = rng.permutation(n_cells)
    positions = positions[perm_idx[:n_components], :]

    # Equal-probability split of the samples over the kept components.
    n_per_component = rng.multinomial(
        n_samples, np.full(n_components, 1.0 / n_components))

    X = np.vstack([
        sample(pos, n, n_features)
        for pos, n in zip(positions, n_per_component)
    ])
    X, y = _get_y_and_shuffle(X, n_per_component)
    return X, y, is_bounded


def _make_gaussian_grid(n_samples, **kwargs):
    """Call ``_make_grid`` with ``kind`` forced to ``'gaussian'``.

    All other keyword arguments are forwarded unchanged.
    """
    settings = dict(kwargs, kind='gaussian')
    return _make_grid(n_samples, **settings)


def _make_manifold_gaussian_grid(n_samples, sigma=0.05, **kwargs):
    """Call ``_make_grid`` with a small default `sigma` and a fixed seed.

    Any ``random_state`` supplied by the caller is replaced by ``2``; all
    other keyword arguments are forwarded unchanged.
    """
    settings = dict(kwargs, random_state=2)
    return _make_grid(n_samples, sigma=sigma, **settings)


def _make_uniform_grid(n_samples, **kwargs):
    """Call ``_make_grid`` with ``kind='uniform'`` and a fixed seed.

    Any ``kind`` or ``random_state`` supplied by the caller is replaced
    (``random_state`` becomes ``1``); all other keyword arguments are
    forwarded unchanged.
    """
    settings = dict(kwargs, kind='uniform', random_state=1)
    return _make_grid(n_samples, **settings)


def _make_concentric_circles(n_samples, n_circles=4, noise_std=0.1, random_state=0):
    """Sample noisy points on concentric circles of radius 1..n_circles.

    Parameters
    ----------
    n_samples : int
        Total number of samples, split over the circles by a multinomial
        draw with probabilities proportional to each circumference.
    n_circles : int, default=4
        Number of circles.
    noise_std : float, default=0.1
        Multiplier of the standard normal noise added to each radius.
    random_state : int, RandomState instance or None, default=0
        Passed to ``check_random_state``.

    Returns
    -------
    X : array with shape (n_samples, 2)
    y : array with shape (n_samples,)
        Index of the circle each sample came from.
    is_canonical_domain : bool
        Always False.
    """
    rng = check_random_state(random_state)
    radii = np.arange(1, n_circles + 1)
    circumferences = 2 * np.pi * radii
    weights = circumferences / np.sum(circumferences)

    counts = rng.multinomial(n_samples, weights)

    # Every angle is drawn before any radial noise; keeping the two passes
    # separate keeps the RNG stream, and hence the output, unchanged.
    angles = [2 * np.pi * rng.rand(count) for count in counts]

    rings = []
    for radius, angle in zip(radii, angles):
        noisy_radius = radius + noise_std * rng.randn(len(angle))
        unit_circle = np.array([np.cos(angle), np.sin(angle)])
        rings.append((noisy_radius * unit_circle).T)
    X = np.vstack(rings)

    X, y = _get_y_and_shuffle(X, counts)
    return X, y, False


def _get_y_and_shuffle(X, n_per_component):
    """Build component labels for `X` and shuffle both together.

    `X` is expected to hold the samples of component 0 first, then
    component 1, and so on, with ``n_per_component[j]`` rows for
    component ``j``.

    Returns
    -------
    X, y
        The shuffled samples and their float labels. The shuffle always
        uses ``random_state=0``.
    """
    label_blocks = [
        np.full(count, idx, dtype=float)
        for idx, count in enumerate(n_per_component)
    ]
    labels = np.concatenate(label_blocks)
    X_shuffled, y_shuffled = shuffle(X, labels, random_state=0)
    return X_shuffled, y_shuffled


# Registry of maker callables keyed by their own name. It picks up every
# module-level name containing '_make_', so it must stay below all of the
# maker definitions. Kept as a comprehension so that its loop variables do
# not add names to the namespace being iterated.
_makers_dict = {
    name: obj
    for name, obj in locals().items()
    if '_make_' in name
}