Source code for pml.supervised.naive_bayes

# Copyright (C) 2012 David Rusk
#
# Permission is hereby granted, free of charge, to any person obtaining a copy 
# of this software and associated documentation files (the "Software"), to 
# deal in the Software without restriction, including without limitation the 
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 
# sell copies of the Software, and to permit persons to whom the Software is 
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in 
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 
# IN THE SOFTWARE.
"""
Naive Bayes classification algorithm.

@author: drusk
"""

from pml.supervised.classifiers import AbstractClassifier
from pml.utils import collection_utils

[docs]class NaiveBayes(AbstractClassifier):
    """
    Naive Bayes classifier.
    
    This algorithm classifies samples using probabilities calculated based 
    on applying Bayes' theorem.
    
    The algorithm is said to be naive because it assumes all features are 
    independent of each other.  While not generally true, the approach is 
    still quite effective and allows the training set to be much smaller.
    """
    
[docs]    def __init__(self, training_set):
        """
        Constructs a new NaiveBayes classifier.
        
        Args:
          training_set: model.DataSet
            The data used to train the classifier.
        """
        super(NaiveBayes, self).__init__(training_set)
    
    def _classify(self, sample):
        """
        Predicts a sample's classification based on the training set.
        
        Args:
          sample: dict or pandas.Series
            the sample or observation to be classified.
          
        Returns:
          The sample's classification.
        """
        class_probabilities = self.get_classification_probabilities(sample)
        return collection_utils.get_key_with_highest_value(class_probabilities)
    
[docs]    def get_classification_probabilities(self, sample):
        """
        Determines the probability that a sample belongs to each class that 
        was seen in the training set.
        
        Args:
          sample: dict or pandas.Series
            The sample or observation to be classified.
        
        Returns:
          probabilities: dict
            A dictionary of classifications and their probabilities.
        """
        class_probabilities = {}

        for clazz in set(self.training_set.get_labels()):
            prob_clazz = self._calc_prob_class(clazz)
            
            likelihood = 1
            for feature in self.training_set.feature_list():
                likelihood *= self._calc_prob_feature_given_class(clazz, feature, 
                                                            sample[feature])
            class_probabilities[clazz] = prob_clazz * likelihood
            
        return class_probabilities
    
    def _calc_prob_class(self, clazz):
        """
        Calculate the probability of a training example belonging to the 
        given class.
        
        Args:
          clazz:
            The class which examples must belong to.
            
        Returns:
          probability: float
            The probability as a floating point number between 0.0 and 1.0.
        """
        clazz_count = self.training_set.get_label_value_counts()[clazz]
        return float(clazz_count) / self.training_set.num_samples()
    
    def _calc_prob_feature_given_class(self, clazz, feature, feature_val):
        """
        Calculates the probability of a training example having a given class 
        as well as the given value of the specified feature.

        Args:
          clazz:
            A class from the training set.
          feature:
            The feature whose value must match the provided feature_val.
          feature_val:
            The value of feature which must be matched.
            
        Returns:
          probability: float
            The probability as a floating point number between 0.0 and 1.0. 
        """
        n = self.training_set.get_label_value_counts()[clazz]
        n_c = self._count_examples(clazz, feature, feature_val)
        
        num_feature_vals = len(set(self.training_set.get_column(feature)))
        p = float(1) / num_feature_vals
        m = num_feature_vals
        
        # the use of m and p is called 'm-estimates' and is for the case 
        # where n_c = 0 because otherwise that would make the product of the 
        # probabilities.
        
        return float(n_c + m*p) / (n + m)
        
    def _count_examples(self, clazz, feature, feature_val):
        """
        Counts the training set examples which have the specified class as 
        well as the specified value of the given feature.
        
        Args:
          clazz:
            The class which training examples must belong to in order to be 
            counted.
          feature:
            The feature for which the training examples must have the value 
            feature_val.
          feature_val: 
            The value of feature which must be matched in order to count an 
            example.
            
        Returns:
          count: int
            The number of training examples with the specified class and same 
            value as the sample for the specified feature.
        """
        training_classes = self.training_set.get_labels()
        training_feature_vals = self.training_set.get_column(feature)
        
        match_classes = training_classes == clazz
        match_feature_vals = training_feature_vals == feature_val
        
        match_both = match_classes & match_feature_vals
        return match_both.value_counts()[True]
Source code for pml.supervised.naive_bayes

Project Versions

This Page

Navigation

Source code for pml.supervised.naive_bayes

Project Versions

RTD Search

This Page

Quick search

Navigation