# Source code for pml.data.model

# Copyright (C) 2012 David Rusk
#
# Permission is hereby granted, free of charge, to any person obtaining a copy 
# of this software and associated documentation files (the "Software"), to 
# deal in the Software without restriction, including without limitation the 
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 
# sell copies of the Software, and to permit persons to whom the Software is 
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in 
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 
# IN THE SOFTWARE.
"""
Models for the data being analysed and manipulated.

@author: drusk
"""

import random as rand

import pandas as pd

from pml.utils import plotting
from pml.utils.errors import InconsistentSampleIdError
from pml.utils.errors import UnlabelledDataSetError
from pml.utils.pandas_util import get_indices_with_value

class DataSet(object):
    """
    A collection of data that may be analysed and manipulated.
    
    Columns are interpreted as features in the data set, and rows are samples 
    or observations.  The feature data is held in a pandas DataFrame and the 
    optional classification labels in a pandas Series (the ``labels`` 
    attribute, None when the data set is unlabelled).
    """
    
    def __init__(self, data, labels=None):
        """
        Creates a new DataSet from data of an unknown type.  If data is itself 
        a DataSet object, then its feature data is copied and a new DataSet 
        is created from the copy.  NOTE: the labels of a DataSet passed as 
        data are not carried over; they must be passed explicitly through the 
        labels argument.
    
        Args:
          data: 
            Data of unknown type.  The supported types are:
                1) pandas DataFrame (used directly, not copied)
                2) Python lists
                3) an existing DataSet object 
          labels: pandas Series, Python list or Python dictionary
            The classification labels for the samples in data.  If they are 
            not known (i.e. it is an unlabelled data set) the value None 
            should be used.  Default value is None (unlabelled).
        
        Raises:
          ValueError if the data or labels are not of a supported type.  
          
          InconsistentSampleIdError if labels were provided whose sample ids 
          do not match those of the data.    
        """
        if isinstance(data, pd.DataFrame):
            self._dataframe = data
        elif isinstance(data, list):
            self._dataframe = pd.DataFrame(data)
        elif isinstance(data, DataSet):
            self._dataframe = data._dataframe.copy()
        else:
            raise ValueError("Unsupported representation of data set")

        if isinstance(labels, (list, dict)):
            self.labels = pd.Series(labels)
        elif labels is None or isinstance(labels, pd.Series):
            self.labels = labels
        else:
            raise ValueError("Unsupported representation of labels")
            
        if self.labels is not None:
            label_ids = self.labels.index
            sample_ids = self._dataframe.index
            # Compare lengths first: an elementwise comparison of indexes of 
            # different lengths raises a generic ValueError in current pandas 
            # rather than reporting the mismatch as inconsistent sample ids.
            if (len(label_ids) != len(sample_ids) or 
                not (label_ids == sample_ids).all()):
                raise InconsistentSampleIdError(("The sample ids for the data "
                                                 "and the labels do not match."))
        
    def __str__(self):
        """
        Returns:
          This object's string representation, primarily for debugging 
          purposes.  It is the same text produced by __repr__.
        """
        return self.__repr__()
    
    def __repr__(self):
        """
        This gets called when the object's name is typed into IPython on its 
        own line, causing a string representation of the object to be 
        displayed.
        
        Returns:
          This object's string representation, providing some summary 
          information about it to the user: the feature list, the number of 
          samples, and whether it has missing values and labels.
        """
        def display(boolean):
            return "yes" if boolean else "no"
        
        summary = (
            # Wrap in a 1-tuple so the feature list is always formatted as a 
            # single value.
            "Features: %s" % (self.feature_list(),), 
            "Samples: %d" % self.num_samples(),
            "Missing values? %s" % display(self.has_missing_values()),
            "Labelled? %s" % display(self.is_labelled()),
        )
        return "\n".join(summary)

    def copy(self):
        """
        Creates a copy of this dataset.  Changes made to one dataset will not 
        affect the other.
        
        Returns:
          A new DataSet with copies of the current data and labels.
        """
        labels_copy = self.labels.copy() if self.is_labelled() else None
        return DataSet(self._dataframe.copy(), labels=labels_copy)

    def get_data_frame(self):
        """
        Retrieve the DataSet's underlying data as a pandas DataFrame object.
        
        See also get_labelled_data_frame().
        
        Returns:
          The pandas DataFrame holding the DataSet's main data, but no 
          labels.  This is the internal object itself, not a copy.
        """
        return self._dataframe

    def get_labelled_data_frame(self):
        """
        Retrieve the DataSet's underlying data as a pandas DataFrame object, 
        including any labels.
        
        See also get_data_frame().
        
        Returns:
          A pandas DataFrame with the DataSet's main data and the labels if 
          they are present attached as the rightmost column.  If the data set 
          is unlabelled the result is the same as get_data_frame().
        """
        if not self.is_labelled():
            return self.get_data_frame()
        
        return pd.concat([self.get_data_frame(), pd.DataFrame(self.labels)], 
                         axis=1)
        
    def num_samples(self):
        """
        Returns:
          The number of samples (rows) in the data set.
        """    
        return self._dataframe.shape[0]
    
    def num_features(self):
        """
        Returns:
          The number of features (columns) in the data set.
        """
        return self._dataframe.shape[1]
    
    def is_labelled(self):
        """
        Returns:
          True if the dataset has classification labels, False otherwise.
        """
        return self.labels is not None
    
    def has_missing_values(self):
        """
        Returns:
          True if the dataset is missing values.  These will be represented 
          as np.NaN.
        """
        # isnull returns booleans for each data point (True if null).  The 
        # first any checks columns for any True, producing a 1d array of 
        # booleans.  The second any checks that 1d array.
        return pd.isnull(self._dataframe).any().any()
    
    def feature_list(self):
        """
        Returns:
          The list of features (column labels) in the dataset. 
        """
        return self._dataframe.columns.tolist()
    
    def get_sample_ids(self):
        """
        Returns:
          A Python list of the ids of the samples in the dataset.
        """
        return self.get_data_frame().index.tolist()
    
    def get_labels(self, indices=None):
        """
        Selects classification labels for the specified samples (rows) in the 
        DataSet.

        Args:
          indices: list
            The list of row indices (0 based) which should be selected.  
            Defaults to None, in which case all labels are selected.
        
        Returns:
          A pandas Series with the classification labels.  When indices is 
          None the stored labels are returned as-is, which is None for an 
          unlabelled data set.
        """
        if indices is None:
            return self.labels
        return self.labels.take(indices)
    
    def get_feature_values(self, feature):
        """
        Retrieves the set of values for a given feature.
        
        Args:
          feature: string
            The feature whose unique values will be retrieved.
            
        Returns:
          value_set: set
            The set of unique values for a feature.
        """
        return set(self.get_feature_value_counts(feature).index)
    
    def get_feature_value_counts(self, feature):
        """
        Count the number of occurrences of each value of a given feature in 
        the data set.
        
        Args:
          feature: string
            The feature whose values will be counted.
        
        Returns:
          value_counts: pandas.Series
            A Series containing the counts of each value.  It is indexable by 
            value.  The index is ordered from highest to lowest count.
        """
        return self.get_column(feature).value_counts()
    
    def get_label_value_counts(self):
        """
        Count the number of occurrences of each label.
        
        NOTE: If the data set is unlabelled an empty set of results will be 
        returned.
        
        Returns:
          value_counts: pandas.Series
            A Series containing the counts of each label.  It is indexable by 
            label.  The index is ordered from highest to lowest count.
        """
        if not self.is_labelled():
            # Blank result.  The dtype is given explicitly because counts are 
            # integers and a dtype-less empty Series is ambiguous.
            return pd.Series(dtype="int64")
        return self.labels.value_counts()
    
    def reduce_rows(self, function):
        """
        Performs a row-wise reduction of the data set.
        
        Args:
          function: 
            The function which will be applied to each row in the data set.
        
        Returns:
          A pandas Series object which is the one dimensional result of the 
          reduction (one value corresponding to each row).
        """
        return self._dataframe.apply(function, axis=1)

    def reduce_features(self, function):
        """
        Performs a feature-wise (i.e. column-wise) reduction of the data set.
        
        Args:
          function:
            The function which will be applied to each feature in the data set.
            
        Returns:
          A pandas Series object which is the one dimensional result of the 
          reduction (one value corresponding to each feature).
        """
        return self._dataframe.apply(function, axis=0)

    def _get_filtered_labels_if_exist(self, indices):
        """
        Internal method used to filter the data set's labels if there are any.
        
        Args:
          indices:
            The sample ids of the labels to keep.
            
        Returns:
          labels:
            If the data set is labelled, this will be the labels at the 
            specified sample ids.  If the data set is unlabelled, None will 
            be returned.
        """
        if not self.is_labelled():
            return None
        # Label-based selection, matching how sample_filter selects the rows 
        # of the data itself.
        return self.labels.loc[indices]

    def sample_filter(self, samples_to_keep):
        """
        Filters the data set based on its sample ids.
        
        Args:
          samples_to_keep:
            The sample ids of the samples which should be kept.  All others 
            will be removed.
            
        Returns:
          filtered: model.DataSet
            The filtered data set.
        """
        # .loc replaces the .ix indexer, which has been removed from pandas.  
        # Sample ids are index labels, so label-based selection is used.
        return DataSet(self._dataframe.loc[samples_to_keep], 
                       self._get_filtered_labels_if_exist(samples_to_keep))

    def value_filter(self, feature, value):
        """
        Filters the data set based on its values for a given feature.
        
        Args:
            feature: string
              The name of the feature whose value will be examined for each 
              sample.
            value:
              The value which all samples passing through the filter should 
              have for the specified feature.
        
        Returns:
          filtered: model.DataSet
            The filtered data set.
        """
        # NOTE(review): get_indices_with_value is presumed to return sample 
        # ids (index labels) -- confirm against pml.utils.pandas_util.
        samples = get_indices_with_value(self.get_column(feature), value)
        return self.sample_filter(samples)

    def label_filter(self, label):
        """
        Filters the data set based on its labels.
        
        Args:
          label:
            Samples with this label value will remain in the filtered data 
            set.  All others will be removed.
        
        Returns:
          filtered: model.DataSet
            The filtered data set.
        
        Raises:
          UnlabelledDataSetError if the data set is not labelled.
        """
        if not self.is_labelled():
            raise UnlabelledDataSetError()
        
        return self.sample_filter(get_indices_with_value(self.labels, label))

    def drop_column(self, index):
        """
        Creates a new data set with a specified column removed.
        
        Args:
          index: 
            The label of the column to drop.  If the columns are unnamed 
            this is the 0-based column number.
          
        Returns:
          A new DataSet with the specified column removed.  The original 
          DataSet remains unaltered.  The labels object is shared with the 
          original rather than copied.
        """
        return DataSet(self._dataframe.drop(index, axis=1), 
                       labels=self.labels)

    def get_column(self, index):
        """
        Selects a column from the data set.
        
        Args:
          index: 
            The column index.  If the columns are named, this is the column 
            name.  Otherwise it is the 0-based index.
          
        Returns:
          The column at the specified index as a pandas Series object.  
          NOTE: whether this Series is a view on the data set or a copy 
          depends on the pandas version in use, so do not rely on changes to 
          it being applied to the data set; use set_column instead.
        """
        return self._dataframe[index]

    def set_column(self, index, new_column):
        """
        Set the new values for a column.  Can be used to create a new column.
        
        Args:
          index: 
            The column index.  If the columns are named, this is the column 
            name.  Otherwise it is the 0-based index.
          new_column: pandas.Series or compatible object
            The new column data to be placed at the specified index.
        """
        self._dataframe[index] = new_column

    def get_row(self, identifier):
        """
        Selects a single row from the dataset.
        
        Args:
          identifier:
            The id of the row to select.  If the DataSet has special indices 
            set up (ex: through a call to load with has_ids=True) these can 
            be used.  The integer index (0 based) can also be used.
            
        Returns:
          A pandas Series object representing the desired row.  NOTE: 
          whether this Series is a view on the data set or a copy depends on 
          the pandas version in use.
        """
        frame = self._dataframe
        try:
            return frame.loc[identifier]
        except (KeyError, TypeError):
            # The removed .ix indexer fell back to positional lookup when an 
            # integer was used against a non-integer index; keep supporting 
            # that documented behaviour.  For integer indexes the lookup 
            # stays purely label-based, so the error is re-raised.
            positional = (isinstance(identifier, int) and 
                          frame.index.inferred_type != "integer")
            if not positional:
                raise
            return frame.iloc[identifier]

    def get_rows(self, indices):
        """
        Selects specified rows from the dataset.
        
        Args:
          indices: list
            The list of row indices (0 based) which should be selected.
        
        Returns:
          A new DataSet with the specified rows from the original.
        """
        labels = self.labels.take(indices) if self.is_labelled() else None
        return DataSet(self._dataframe.take(indices), labels=labels)

    def split(self, percent, random=False):
        """
        Splits the dataset in two.
        
        Args:
          percent: float
            The percentage of the original dataset samples which should be 
            placed in the first dataset returned.  The remainder are placed 
            in the second dataset.  This percentage must be specified as a 
            value between 0 and 1 inclusive.
          random: boolean
            Set to True if the samples selected for each new dataset should 
            be picked randomly.  Defaults to False, meaning the samples are 
            taken in their existing order.
        
        Returns:
          dataset1: DataSet object
            A subset of the original dataset with <percent> samples.
          dataset2: DataSet object
            A subset of the original dataset with 1-<percent> samples.
            
        Raises:
          ValueError if percent < 0 or percent > 1.
        """
        if percent < 0 or percent > 1:
            raise ValueError("Percentage value must be >= 0 and <= 1.")
        
        total_samples = self.num_samples()
        num_set1_samples = int(percent * total_samples)
        
        # A real list is required: on Python 3 range() is immutable, so it 
        # cannot be shuffled in place.
        all_rows = list(range(total_samples))
        if random:
            rand.shuffle(all_rows)
    
        return (self.get_rows(all_rows[:num_set1_samples]), 
                self.get_rows(all_rows[num_set1_samples:]))
    
    def fill_missing(self, fill_value):
        """
        Fill in missing data with a constant value.  Changes are made in-place.
        
        Args:
          fill_value:
            The value to insert wherever data is missing.
            
        Returns:
          Void.  The changes to the DataSet are made in-place.
        """
        self._dataframe.fillna(fill_value, inplace=True)
        return None
    
    def combine_labels(self, to_combine, new_label):
        """
        Combines classification labels to have some new value.
        
        For example, consider a dataset with labels "cat", "crow" and 
        "pidgeon".  Maybe you are only really worried about whether something 
        is a cat or a bird, so you want to combine the "crow" and "pidgeon" 
        labels into a new one called "bird".
        
        Args:
          to_combine: list
            The list of labels which will be combined to form one new 
            classification label.
          new_label: string
            The new classification label for those which were combined.
        """
        # pd.Series.replace returns a new Series, leaves original unmodified
        self.labels = self.labels.replace(to_combine, value=new_label)
        
    def plot_radviz(self):
        """
        Generates a RadViz plot of the data set.  Radviz is useful for 
        visualizing data with more than two dimensions.
        
        Returns:
          void, but a plot is generated.
        """
        plotting.plot_radviz(self)


def as_dataset(data):
    """
    Creates a DataSet from the provided data.  If data is already a DataSet, 
    return it directly.  Use this instead of the DataSet constructor if you 
    don't know whether your data is a DataSet already, but you don't want to 
    create a new one if it already is.
    
    Args:
      data: 
        Data of unknown type.  It may be a Python list or pandas DataFrame or 
        DataSet object.
        
    Returns:
      A DataSet object.  If the data was already a DataSet then the input 
      object will be directly returned (the same object, not a copy).  
      Otherwise a new, unlabelled DataSet is built from the data.
          
    Raises:
      ValueError if the data is not of a supported type.    
    """
    if isinstance(data, DataSet):
        return data
    return DataSet(data)
# Source code for pml.data.model
#
# Project Versions
#
# This Page
#
# Navigation
#
# Source code for pml.data.model
#
# Project Versions
#
# RTD Search
#
# This Page
#
# Quick search
#
# Navigation