Source code for lathe.data

import re
import arff
import numpy as np
import sklearn.model_selection
import sklearn.preprocessing


def minmax_scale(data, axis=0):
    """Transforms features by scaling `data` along `axis` between 0-1.

    NaN entries are ignored when the minimum and maximum are computed and
    stay NaN in the result. A slice whose range is zero (all of its non-NaN
    values are equal) is scaled to 0 rather than divided by zero.

    Args:
        data (`np.ndarray`): The data to scale.
        axis (int): The axis to scale along.

    Returns:
        (`np.ndarray`): The scaled data.
    """
    # keepdims=True keeps the reduced axis, so the statistics broadcast
    # against `data` for any `axis` and not only for axis=0.
    lowest = np.nanmin(data, axis=axis, keepdims=True)
    span = np.nanmax(data, axis=axis, keepdims=True) - lowest
    # A constant slice has a span of 0; divide by 1 there so it maps to 0.
    span = np.where(span == 0, 1, span)
    return (data - lowest) / span


def shuffle(features, labels):
    """Shuffles `features` and `labels` together along their first axis.

    Args:
        features (`np.ndarray`): The feature rows.
        labels (`np.ndarray`): The labels; must have as many rows as
            `features`.

    Returns:
        (`np.ndarray`, `np.ndarray`): `features` and `labels`, both indexed
        by the same random permutation of the row indices.

    Raises:
        ValueError: If `features` and `labels` differ in their number of
            rows.
    """
    if features.shape[0] != labels.shape[0]:
        raise ValueError(
            "Features {} and labels {} must have the same number of rows".
            format(features.shape, labels.shape))
    # A single permutation indexes both arrays, so row i of `features`
    # stays paired with row i of `labels`.
    permutation = np.random.permutation(features.shape[0])
    return features[permutation], labels[permutation]


def k_fold(data, n_splits, shuffle=False):
    """Creates the K-fold splits of `data`.

    Thin wrapper around `sklearn.model_selection.KFold`.

    Args:
        data: The samples to split; passed straight to `KFold.split`.
        n_splits (int): The number of folds.
        shuffle (bool, optional): Forwarded to `KFold` as its `shuffle`
            argument. Defaults to `False`.

    Returns:
        The result of `KFold(n_splits, shuffle=shuffle).split(data)`.
    """
    return sklearn.model_selection.KFold(n_splits, shuffle=shuffle).split(data)


def _split(data, index):
    """Cuts the 2D array `data` column-wise at `index`.

    Args:
        data (`np.ndarray`): The array to cut.
        index (int): The column at which to cut.

    Returns:
        (`np.ndarray`, `np.ndarray`): The columns before `index` and the
        columns from `index` onwards.
    """
    leading_columns = data[:, :index]
    trailing_columns = data[:, index:]
    return leading_columns, trailing_columns


# def split(data, percent_chunks, axis=0):
#     if data.ndim != 2:
#         raise ValueError("data to split must be a 2D numpy array")
#     splits = np.cumsum(percent_chunks)
#     if splits[-1] == 100:
#         splits = np.divide(splits, 100.)
#     if splits[-1] != 1.:
#         raise ValueError("Percents must sum to 1.0 or 100")
#     # np.random.shuffle(data)
#     return np.split(data, splits[:-1] * data.shape[1], axis)


def split(features, labels, percent):
    """Splits `features` and `labels` into two consecutive parts by rows.

    The first `int(percent * features.shape[0])` rows form the first part
    and the remaining rows form the second part. No shuffling is done.

    Args:
        features (`np.ndarray`): The feature rows.
        labels (`np.ndarray`): The labels, cut at the same row as
            `features`.
        percent (float): Fraction of the rows to put in the first part;
            converted with `float()` and must be strictly between 0 and 1.

    Returns:
        tuple: (`first_features`, `second_features`, `first_labels`,
        `second_labels`).

    Raises:
        ValueError: If `percent` is not strictly between 0 and 1.
    """
    percent = float(percent)
    if not 0 < percent < 1:
        raise ValueError("percent must be in range: 0-1")
    # int() truncates, so a fractional row count rounds towards zero.
    index = int(percent * features.shape[0])
    return features[:index], features[index:], labels[:index], labels[index:]


# HACK: arff.load only accepts an open file object, and BYU CS uses a custom
# ARFF format (`continuous` instead of `real`), so the file is patched in place
def _fix_attribute_types(f):
    """Rewrites the open file `f`, turning ``continuous`` into ``real``.

    Every case-insensitive occurrence of ``continuous`` in the file is
    replaced, wherever it appears. `f` must be open for both reading and
    writing; on return its position is back at the start of the file.

    Args:
        f (file): An open, seekable, readable and writable text file.
    """
    # TODO: do not load entire contents of file into RAM at once
    f.seek(0)
    original_text = f.read()
    f.seek(0)
    pattern = re.compile(r'continuous', re.IGNORECASE)
    patched_text = pattern.sub('real', original_text)
    f.write(patched_text)
    # 'real' is shorter than 'continuous', so drop the leftover tail.
    f.truncate()
    f.seek(0)


# TODO: do we need this function as well?
def _find_nominal_index(data):
    """Finds the positions of the nominal attributes in `data`.

    Args:
        data (list): (attribute_name, attribute_type) pairs.

    Returns:
        list: The positions whose type is none of 'REAL', 'INTEGER',
        'NUMERIC' or 'STRING'.
    """
    # Kept as a list: membership is tested by equality, so an unhashable
    # attribute type (e.g. a list of nominal values) is still accepted.
    non_nominal_kinds = ['REAL', 'INTEGER', 'NUMERIC', 'STRING']
    positions = []
    for position, (_, kind) in enumerate(data):
        if kind in non_nominal_kinds:
            continue
        positions.append(position)
    return positions


def get_continuous_index(attributes):
    """Builds a boolean mask that marks the continuous attributes.

    An attribute is continuous when its type is 'REAL', 'NUMERIC' or
    'INTEGER'. Nominal attributes (whose type is a list of values) and
    'STRING' attributes are not.

    Args:
        attributes (list): (attribute_name, attribute_type) tuples.

    Returns:
        (`np.ndarray`): Boolean array with one entry per attribute, `True`
        where the attribute is continuous.
    """
    continuous_kinds = ('REAL', 'NUMERIC', 'INTEGER')
    # dtype=bool keeps the result a valid mask even for an empty list.
    return np.array(
        [x[1] in continuous_kinds for x in attributes], dtype=bool)


def get_nominal_index(attributes):
    """Builds a boolean mask that marks the nominal attributes.

    An attribute is nominal when its type is none of 'REAL', 'INTEGER',
    'NUMERIC' or 'STRING', i.e. the same rule `_find_nominal_index` uses.

    Args:
        attributes (list): (attribute_name, attribute_type) tuples.

    Returns:
        (`np.ndarray`): Boolean array with one entry per attribute, `True`
        where the attribute is nominal.
    """
    # A tuple (not a set): nominal types are lists and therefore unhashable.
    non_nominal_kinds = ('REAL', 'INTEGER', 'NUMERIC', 'STRING')
    # dtype=bool keeps the result a valid mask even for an empty list.
    return np.array(
        [x[1] not in non_nominal_kinds for x in attributes], dtype=bool)


def _one_hot(data, index):
    """One-hot encodes the columns of `data` selected by `index`.

    Args:
        data (`np.ndarray`): The data to encode.
        index: The columns to encode; passed to the encoder as
            `categorical_features`.

    Returns:
        The output of `OneHotEncoder.fit_transform(data)`.
    """
    # NOTE(review): `categorical_features` and `sparse` look like keyword
    # names from older scikit-learn releases -- confirm the pinned version.
    encoder_options = dict(
        categorical_features=index, sparse=False, handle_unknown='ignore')
    # TODO: How to not screw up the index? Use encoder.feature_indices_?
    return sklearn.preprocessing.OneHotEncoder(
        **encoder_options).fit_transform(data)


def load(file_path,
         label_size=0,
         encode_nominal=True,
         one_hot_data=False,
         one_hot_targets=False,
         imputer=None,
         normalizer=None,
         shuffle=False):
    """Load an ARFF file.

    The file is first opened read-only. If parsing fails with
    `arff.BadAttributeType`, the file is reopened for writing, patched on
    disk by `_fix_attribute_types` and parsed again.

    Args:
        file_path (str): The path of the ARFF formatted file to load.
        label_size (int, optional): The number of labels (outputs) the dataset
            to load has. The labels are the last `label_size` columns.
        encode_nominal (bool, optional): Whether or not to encode nominal
            attributes as integers.
        one_hot_data (bool, optional): Whether or not to use a one-hot encoder
            for nominal attributes in `data`. Defaults to `False`; pass
            `None` to use whatever the value of `encode_nominal` is.
        one_hot_targets (bool, optional): Whether or not to use a one-hot
            encoder for nominal attributes in `targets`.
        imputer (function, optional): A 1 arity function that accepts the
            dataset to impute missing values over.
            e.g: `sklearn.preprocessing.Imputer().fit_transform`.
            Defaults to `None`.
        normalizer (function, optional): A 1 arity function that accepts the
            data to be scaled as a parameter and returns the scaled data.
            e.g: `lathe.minmax_scale`. It is applied only to the columns
            selected by `get_continuous_index`. Defaults to `None`.
        shuffle (bool, optional): Whether or not to shuffle the rows of the
            loaded data (done before the targets are split off).

    Returns:
        (list, `numpy.ndarray`, `numpy.ndarray`): Tuple containing
        (`attributes`, `data`, `targets`). Where `attributes` is a list of
        tuples containing (attribute_name, attribute_type), `data` are the
        features and `targets` are the expected output for the dataset.

    Raises:
        ValueError: If `label_size` is negative, or if `one_hot_targets` is
            set while `label_size` is 0.

    Note:
        `targets` will be `None` unless `label_size` >= 1.

    See Also:
        - http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
        - http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html
        - http://www.cs.waikato.ac.nz/ml/weka/arff.html
    """
    if label_size < 0:
        raise ValueError('label_size must be >= 0')

    if label_size == 0 and one_hot_targets:
        raise ValueError('label_size must be > 0 if one_hot_targets == True')

    if one_hot_data is None:
        one_hot_data = encode_nominal

    try:
        # Read-only first: a file that parses cleanly need not be writable.
        with open(file_path, 'r') as f:
            arff_data = arff.load(f, encode_nominal=encode_nominal)
    except arff.BadAttributeType:
        # Reopen writable so the attribute types can be patched on disk
        # before the second attempt.
        with open(file_path, 'r+') as f:
            _fix_attribute_types(f)
            arff_data = arff.load(f, encode_nominal=encode_nominal)

    # The builtin `float` is what the removed `np.float` alias referred to.
    dtype = float if encode_nominal else None
    data = np.atleast_2d(np.array(arff_data['data'], dtype=dtype))
    if not encode_nominal:
        # Without a float dtype, missing values are `None`; turn them
        # into NaN.
        data = np.where(data == np.array(None), np.nan, data)

    if imputer:
        data = imputer(data)

    if shuffle:
        # Shuffles the rows in place; features and targets are still in the
        # same array here, so they stay aligned.
        np.random.shuffle(data)

    # Negative index: the labels are the trailing `label_size` columns.
    # `None` makes the attribute slices below cover every attribute.
    index = -label_size if label_size else None
    attributes = arff_data['attributes']

    if normalizer:
        cont_index = get_continuous_index(attributes)
        data[:, cont_index] = normalizer(data[:, cont_index])

    targets = None
    if label_size != 0:
        data, targets = _split(data, index)

    # have to do this twice because sklearn screws with the indices
    if one_hot_data:
        data_index = _find_nominal_index(attributes[:index])
        data = _one_hot(data, data_index)
    if one_hot_targets:
        target_index = _find_nominal_index(attributes[index:])
        targets = _one_hot(targets, target_index)

    return attributes, data, targets