Source code for deep_bottleneck.datasets.harmonics

import scipy.io as sio
from pathlib2 import Path
from collections import namedtuple

import numpy as np

from tensorflow.python.keras import utils as keras_utils
from deep_bottleneck import utils


def load(nb_dir = ''):
    """Load the Information Bottleneck harmonics dataset.

    The arrays are read from the cached archive
    ``datasets/IB_data_<ID>.npz``. If that file does not exist yet, it is
    first generated by calling ``import_IB_data_from_mat``.

    The output follows the following naming convention:

        - X is the data
        - y is the class label as stored in the archive; it is one-hot
          encoded with ``n_classes = 2``, so labels are expected to be 0 or 1
        - Y is the class, coded as a 2-dim float32 vector with one entry set
          to 1 at the column index corresponding to the class
        - n_classes is the number of classes (2)

    Args:
        nb_dir: Prefix prepended verbatim to the ``datasets/`` path. It is
            joined by plain string concatenation, so a non-empty value must
            end with a path separator.

    Returns:
        Two namedtuples, the first one containing training and the second
        one containing test data. Both come with the fields
        X, Y, y and n_classes.
    """
    ID = '2017_12_21_16_51_3_275766'
    n_classes = 2

    # Build the archive path once instead of repeating the concatenation.
    data_path = nb_dir + 'datasets/IB_data_' + str(ID) + '.npz'
    if not Path(data_path).is_file():
        # No cached archive yet: create it from the original .mat file.
        import_IB_data_from_mat(ID, nb_dir)

    # An .npz archive keeps its file handle open until it is closed, so read
    # all arrays inside a context manager to release it deterministically.
    with np.load(data_path) as data:
        X_train = data['X_train']
        y_train = data['y_train']
        X_test = data['X_test']
        y_test = data['y_test']

    Y_train = keras_utils.to_categorical(y_train, n_classes).astype('float32')
    Y_test = keras_utils.to_categorical(y_test, n_classes).astype('float32')

    Dataset = namedtuple('Dataset', ['X', 'Y', 'y', 'n_classes'])
    training = Dataset(X_train, Y_train, y_train, n_classes)
    test = Dataset(X_test, Y_test, y_test, n_classes)
    return training, test


def import_IB_data_from_mat(name_ID, nb_dir = ''):
    """Convert the harmonics ``.mat`` file into a compressed ``.npz`` archive.

    Reads ``datasets/var_u.mat``, splits its contents into a training and a
    test part via ``utils.data_shuffle`` and stores the arrays ``X_train``,
    ``y_train``, ``X_test`` and ``y_test`` in
    ``datasets/IB_data_<name_ID>.npz``.

    Args:
        name_ID: Identifier which is going to be part of the output filename.
        nb_dir: Prefix prepended verbatim (plain string concatenation) to
            both the input and the output ``datasets/`` paths.

    Returns:
        None
    """
    print('Loading Data...')
    mat_contents = sio.loadmat(nb_dir + 'datasets/var_u.mat')
    features = mat_contents['F']
    raw_labels = mat_contents['y']

    # Two-column label matrix: column 0 holds y, column 1 holds 1 - y.
    expanded = raw_labels[None, :]
    stacked = np.concatenate((expanded, 1 - expanded), axis=0)
    label_matrix = np.squeeze(stacked.T)

    # utils.data_shuffle reads ``.data`` and ``.labels`` from a plain
    # attribute container, so build a minimal ad-hoc object for it.
    container_cls = type('type_C', (object,), {})
    original = container_cls()
    original.data = features
    original.labels = label_matrix

    # NOTE(review): 80 is presumably the training percentage -- confirm
    # against utils.data_shuffle.
    shuffled = utils.data_shuffle(original, 80, shuffle_data=True)

    # Keep only column 0 of the split label matrices (the y column, assuming
    # data_shuffle leaves the column order untouched).
    train_part = shuffled.train
    X_train = train_part.data
    y_train = train_part.labels[:, 0]
    test_part = shuffled.test
    X_test = test_part.data
    y_test = test_part.labels[:, 0]

    # savez_compressed appends the ``.npz`` extension to the given name.
    out_name = nb_dir + 'datasets/IB_data_' + str(name_ID)
    np.savez_compressed(
        out_name,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
    )