# Source code for pml.supervised.classifiers
# Copyright (C) 2012 David Rusk
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
"""
Classification algorithms for supervised learning tasks.
@author: drusk
"""
import pandas as pd
from pml.data import model
from pml.utils.errors import UnlabelledDataSetError, InconsistentFeaturesError
class AbstractClassifier(object):
    """
    This is the base class which classification algorithms should extend. It
    provides the common functionality for each classifier: checking that the
    training set is labelled, checking that samples have the expected
    features, and classifying whole datasets. Subclasses provide the actual
    algorithm by implementing the '_classify' method.
    """

    def __init__(self, training_set):
        """
        Constructs the classifier. Subclasses may have additional parameters
        in their constructors.

        Args:
          training_set:
            A labelled DataSet object used to train the classifier.

        Raises:
          UnlabelledDataSetError if the training set is not labelled.
        """
        if not training_set.is_labelled():
            raise UnlabelledDataSetError(custom_message=("Training set must "
                                                         "be labelled."))

        self.training_set = training_set

    def __repr__(self):
        """
        This gets called when the object's name is typed into IPython on its
        own line, causing a string representation of the object to be
        displayed.

        NOTE: override __str__ in subclass to set this.

        Returns:
          This object's string representation, primarily for debugging
          purposes. If the subclass has not overridden __str__, the default
          object representation is returned.
        """
        # The inherited object.__str__ is implemented in terms of repr(), so
        # delegating to it from here would recurse without bound. Only
        # delegate when __str__ has actually been overridden.
        if type(self).__str__ is object.__str__:
            return object.__repr__(self)

        return self.__str__()

    def classify_all(self, dataset):
        """
        Predicts the classification of each sample in a dataset.

        Args:
          dataset: DataSet compatible object (see DataSet constructor)
            the dataset whose samples (observations) will be classified.

        Returns:
          A ClassifiedDataSet which contains the classification results for
          each sample. It also contains the original data.
        """
        # Normalise the input first so that the DataSet API is available.
        dataset = model.as_dataset(dataset)
        classifications = dataset.reduce_rows(self.classify)
        return ClassifiedDataSet(dataset, classifications)

    def classify(self, sample):
        """
        Predicts a sample's classification based on the training set.

        Args:
          sample:
            the sample or observation to be classified. It is converted to a
            pandas Series, whose index is taken as the sample's feature list.

        Returns:
          The sample's classification.

        Raises:
          InconsistentFeaturesError if the sample doesn't have the same
            features as the training data.
        """
        sample = pd.Series(sample)
        # Validate before handing over to the algorithm so that subclasses
        # can rely on the sample having the training set's features.
        self._check_feature_list(sample)
        return self._classify(sample)

    def _classify(self, sample):
        """
        Classifiers which subclass this AbstractClassifier must implement
        this method to provide the classification algorithm.

        Args:
          sample: pandas.Series
            the sample to classify, already checked by 'classify' to have
            the same features as the training set.

        Raises:
          NotImplementedError always, in this base implementation.
        """
        raise NotImplementedError(("Classifiers must implement the "
                                   "'_classify' method."))

    def _check_feature_list(self, sample):
        """
        Raises an InconsistentFeaturesError if the sample does not have
        the same features as the training data.

        Args:
          sample: pandas.Series
            the sample whose index holds its feature names.
        """
        expected_features = self.training_set.feature_list()
        actual_features = sample.index.tolist()

        # Compared as sets: the order of the features is not significant.
        if set(expected_features) != set(actual_features):
            raise InconsistentFeaturesError(expected_features,
                                            actual_features)
class ClassifiedDataSet(model.DataSet):
    """
    A collection of data which has been analysed by a classification
    algorithm. It contains both the original DataSet and the results of
    the classification. It provides methods for analysing these
    classification results.
    """

    def __init__(self, dataset, classifications):
        """
        Creates a new ClassifiedDataSet.

        Args:
          dataset: model.DataSet
            A dataset which has been classified but does not hold the results.
          classifications: pandas.Series
            A Series with the classification results.
        """
        # Re-initialise the base DataSet from the original's data frame and
        # labels so this object carries the original data as well.
        super(ClassifiedDataSet, self).__init__(dataset.get_data_frame(),
                                                dataset.get_labels())
        self.classifications = classifications

    def get_classifications(self):
        """
        Retrieves the classifications computed for this dataset.

        Returns:
          A pandas Series containing each sample's classification.
        """
        return self.classifications

    def compute_accuracy(self):
        """
        Calculates the accuracy of the classification results.

        Returns:
          The fraction of samples which were correctly classified, i.e. the
          number of samples whose classification equals their label divided
          by the total number of samples. A floating point number between
          0 and 1.

        Raises:
          UnlabelledDataSetError if the dataset is not labelled.
          ZeroDivisionError if there are no classifications.
        """
        if not self.is_labelled():
            raise UnlabelledDataSetError()

        # Classifications and labels are matched up by index value.
        correct = sum(1 for ind in self.classifications.index
                      if self.classifications[ind] == self.labels[ind])

        # float() keeps this a true division under Python 2 as well.
        return float(correct) / len(self.classifications)