Source code for pml.supervised.naive_bayes

# Copyright (C) 2012 David Rusk
#
# Permission is hereby granted, free of charge, to any person obtaining a copy 
# of this software and associated documentation files (the "Software"), to 
# deal in the Software without restriction, including without limitation the 
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 
# sell copies of the Software, and to permit persons to whom the Software is 
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in 
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 
# IN THE SOFTWARE.
"""
Naive Bayes classification algorithm.

@author: drusk
"""

from pml.supervised.classifiers import AbstractClassifier
from pml.utils import collection_utils

class NaiveBayes(AbstractClassifier):
    """
    Naive Bayes classifier.

    This algorithm classifies samples using probabilities calculated by
    applying Bayes' theorem.  The algorithm is said to be naive because it
    assumes all features are independent of each other.  While this is not
    generally true, the approach is still quite effective and allows the
    training set to be much smaller.
    """

    def __init__(self, training_set):
        """
        Constructs a new NaiveBayes classifier.

        Args:
          training_set: model.DataSet
            The data used to train the classifier.
        """
        super(NaiveBayes, self).__init__(training_set)
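
    # Usage sketch (hypothetical): assumes a model.DataSet named
    # training_set has already been built, and that the AbstractClassifier
    # base class exposes classify() for single samples, which delegates to
    # _classify() below.
    #
    #   classifier = NaiveBayes(training_set)
    #   prediction = classifier.classify(sample)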

    def _classify(self, sample):
        """
        Predicts a sample's classification based on the training set.

        Args:
          sample: dict or pandas.Series
            The sample or observation to be classified.

        Returns:
          The sample's classification.
        """
        class_probabilities = self.get_classification_probabilities(sample)
        return collection_utils.get_key_with_highest_value(class_probabilities)

    def get_classification_probabilities(self, sample):
        """
        Determines the probability that a sample belongs to each class seen
        in the training set.

        Note that the returned values are the priors multiplied by the
        likelihoods; they are proportional to the posterior probabilities
        but are not normalized to sum to one.

        Args:
          sample: dict or pandas.Series
            The sample or observation to be classified.

        Returns:
          probabilities: dict
            A dictionary mapping each classification to its probability.
        """
        class_probabilities = {}

        for clazz in set(self.training_set.get_labels()):
            prob_clazz = self._calc_prob_class(clazz)

            likelihood = 1
            for feature in self.training_set.feature_list():
                likelihood *= self._calc_prob_feature_given_class(
                    clazz, feature, sample[feature])

            class_probabilities[clazz] = prob_clazz * likelihood

        return class_probabilities
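
    # Worked example (hypothetical numbers): with training labels
    # {"spam": 3, "ham": 7}, the prior for "spam" is 0.3.  If a sample's two
    # feature values have smoothed likelihoods 0.5 and 0.2 given "spam", its
    # unnormalized posterior is 0.3 * 0.5 * 0.2 = 0.03; _classify() then
    # picks the class with the highest such value.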

    def _calc_prob_class(self, clazz):
        """
        Calculates the probability of a training example belonging to the
        given class.

        Args:
          clazz:
            The class which examples must belong to.

        Returns:
          probability: float
            The probability as a floating point number between 0.0 and 1.0.
        """
        clazz_count = self.training_set.get_label_value_counts()[clazz]
        return float(clazz_count) / self.training_set.num_samples()

    def _calc_prob_feature_given_class(self, clazz, feature, feature_val):
        """
        Calculates the probability of a training example having the given
        value of the specified feature, given that it belongs to the
        specified class.

        Args:
          clazz:
            A class from the training set.
          feature:
            The feature whose value must match the provided feature_val.
          feature_val:
            The value of feature which must be matched.

        Returns:
          probability: float
            The probability as a floating point number between 0.0 and 1.0.
        """
        n = self.training_set.get_label_value_counts()[clazz]
        n_c = self._count_examples(clazz, feature, feature_val)

        num_feature_vals = len(set(self.training_set.get_column(feature)))
        p = float(1) / num_feature_vals
        m = num_feature_vals

        # The use of m and p is called 'm-estimates' and handles the case
        # where n_c = 0, which would otherwise make the whole product of
        # probabilities zero.
        return float(n_c + m * p) / (n + m)

    def _count_examples(self, clazz, feature, feature_val):
        """
        Counts the training set examples which have the specified class as
        well as the specified value of the given feature.

        Args:
          clazz:
            The class which training examples must belong to in order to be
            counted.
          feature:
            The feature for which the training examples must have the value
            feature_val.
          feature_val:
            The value of feature which must be matched in order to count an
            example.

        Returns:
          count: int
            The number of training examples with the specified class and
            same value as the sample for the specified feature.
        """
        training_classes = self.training_set.get_labels()
        training_feature_vals = self.training_set.get_column(feature)

        match_classes = training_classes == clazz
        match_feature_vals = training_feature_vals == feature_val
        match_both = match_classes & match_feature_vals

        # Summing the boolean Series counts the True entries and, unlike
        # indexing value_counts() with True, returns 0 when nothing matches.
        return int(match_both.sum())
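

# Minimal, self-contained sketch of the m-estimate arithmetic used by
# _calc_prob_feature_given_class above.  The toy data and the helper name
# _m_estimate_sketch are hypothetical; this only illustrates the formula
# (n_c + m*p) / (n + m) and uses plain Python so it runs stand-alone.
def _m_estimate_sketch():
    # Toy training data: (feature value, class label) pairs.
    examples = [("sunny", "play"), ("sunny", "play"),
                ("rainy", "stay"), ("rainy", "stay"), ("sunny", "stay")]

    clazz = "play"
    feature_val = "sunny"

    n = sum(1 for _, label in examples if label == clazz)   # 2 "play" examples
    n_c = sum(1 for val, label in examples
              if label == clazz and val == feature_val)     # both are "sunny"
    num_feature_vals = len({val for val, _ in examples})    # 2 distinct values
    p = 1.0 / num_feature_vals                              # uniform prior estimate
    m = num_feature_vals                                    # equivalent sample size

    # (n_c + m*p) / (n + m) = (2 + 2*0.5) / (2 + 2) = 0.75
    return float(n_c + m * p) / (n + m)


if __name__ == "__main__":
    print(_m_estimate_sketch())  # prints 0.75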
