Source code for deep_bottleneck.datasets.mushroom

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import namedtuple
from tensorflow import keras
from tensorflow.python.keras import utils as keras_utils


def load():
    """Load the mushroom dataset and split it into training and test sets.

    Mushrooms are to be classified as either edible or poisonous, so this
    is a two-class problem. The data is read from ``datasets/mushroom.csv``,
    a path that is resolved relative to the current working directory.

    The output follows the following naming convention:

        - X is the data: every column of the CSV except ``class=e`` and
          ``class=p``, as an array
        - y is the class label taken from the ``class=e`` column
          (presumably 1 for edible and 0 for poisonous -- confirm against
          the CSV)
        - Y is the class, but coded as a float32 vector of length
          ``n_classes`` (2) with one entry set to 1 at the column index
          corresponding to the class
        - n_classes is the number of classes, always 2

    Returns:
        Two ``Dataset`` namedtuples, the first one containing training
        and the second one containing test data respectively. Both come
        with fields ``X``, ``Y``, ``y`` and ``n_classes`` (in that order).
    """
    mushrooms = pd.read_csv('datasets/mushroom.csv')

    n_classes = 2
    # Only 'class=e' is used as the label; 'class=p' is dropped together
    # with it so that no label column remains among the features.
    y = mushrooms['class=e']
    X = mushrooms.drop(['class=e', 'class=p'], axis=1).values

    # Fixed random_state keeps the 80/20 split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

    # One-hot encode the labels for use as network targets.
    Y_train = keras_utils.to_categorical(y_train, n_classes).astype('float32')
    Y_test = keras_utils.to_categorical(y_test, n_classes).astype('float32')

    Dataset = namedtuple('Dataset', ['X', 'Y', 'y', 'n_classes'])
    training = Dataset(X_train, Y_train, y_train, n_classes)
    test = Dataset(X_test, Y_test, y_test, n_classes)

    return training, test