2 August 2024

Implementing the Naive Bayes Classifier on the Iris Dataset

by Jacob Dichter

August 2, 2024

The code below implements a Naive Bayes classifier for the Iris dataset by first loading and randomizing the dataset and then splitting it into training and test sets. It calculates the mean and variance for each flower class (setosa, versicolor, and virginica) and uses these to compute Gaussian probabilities for classification. The classifier predicts the species of each test instance and calculates the classification accuracy based on the proportion of correct predictions.

import sys
import numpy as np
import pandas as pd
import math

##---Return the product of the per-feature Gaussian probability densities
def compute_gaussian(x, mean, var):
    """Return the product of univariate Gaussian densities over the features of x.

    For every position ``i`` in ``x`` the normal density with mean ``mean[i]``
    and variance ``var[i]`` is evaluated at ``x[i]``; the densities are
    multiplied together. An empty ``x`` yields ``1``.
    """
    likelihood = 1

    for idx, value in enumerate(x):
        mu = mean[idx]
        sigma_sq = var[idx]
        # exponent of the normal density: -(x - mu)^2 / (2 * sigma^2)
        exponent = -((value - mu) ** 2 / (2 * sigma_sq))
        # normalising constant: 1 / sqrt(2 * pi * sigma^2)
        normalizer = 1 / (math.sqrt(2 * math.pi * sigma_sq))
        likelihood *= normalizer * math.exp(exponent)

    return likelihood

### ---- IRIS DATASET ---- ###

def main():
    """Train and evaluate a Gaussian Naive Bayes classifier on ``iris.csv``.

    Steps:
      1. Load ``iris.csv`` from the working directory and shuffle it with a
         fixed seed so runs are reproducible.
      2. Use the first 100 shuffled rows for training and the rest for testing.
      3. Estimate the per-class mean and variance of the four feature columns
         from the TRAINING rows only, so test rows never influence the model.
      4. Classify every test row as the class with the largest product of
         per-feature Gaussian densities and print the accuracy in percent.

    Intermediate data frames and statistics are printed along the way.
    Returns None.
    """
    class_names = ('setosa', 'versicolor', 'virginica')

    #---Load in the Iris dataset using pandas read_csv
    df = pd.read_csv("iris.csv")

    #---Randomize the data and assign the shuffled rows to dfrandom
    #--.sample(frac=1) shuffles our data
    #--.reset_index creates a new index for the shuffled data and drops the old index
    dfrandom = df.sample(frac=1, random_state=1119).reset_index(drop=True)
    print(dfrandom)

    #---convert the first 4 columns to float
    df1 = dfrandom.iloc[:, 0:4].astype(float)
    print(df1)

    #---separate out the last column
    df2 = dfrandom.iloc[:, 4]

    #---combine the 4 numerical columns and the last column that has the flower category
    dfrandom = pd.concat([df1, df2], axis=1)
    print(dfrandom)

    #---separate the data into training and test parts
    dftrain = dfrandom.iloc[0:100, :]
    print(dftrain)
    dftest = dfrandom.iloc[100:, :]
    print(dftest)

    #---assemble the TRAINING data by categories, i.e., classes
    #--(only training rows are used, so the test rows stay unseen by the model)
    class_frames = {}
    for name in class_names:
        class_frames[name] = dftrain[dftrain['species'] == name]
        print(class_frames[name])

    #---------find mean of each class---------
    means = {}
    for name in class_names:
        means[name] = class_frames[name].iloc[:, 0:4].mean(axis=0)
        print(f'mean {name}\n', means[name])

    #---------find variance of each class---------
    variances = {}
    for name in class_names:
        variances[name] = class_frames[name].iloc[:, 0:4].var(axis=0)
        print(f'var {name}\n', variances[name])

    #---do prediction on the test set via Naive Bayes
    num_test = len(dftest)
    print(num_test)
    if num_test == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        print('classification accuracy = undefined (empty test set)')
        return

    count_correct = 0
    for i in range(num_test):
        x = dftest.iloc[i, 0:4].values
        probs = np.array([
            compute_gaussian(x, means[name].values, variances[name].values)
            for name in class_names
        ])
        predicted = class_names[probs.argmax(axis=0)]

        # Compare by label so an unrecognised species simply counts as a miss.
        if predicted == dftest.iloc[i, 4]:
            count_correct = count_correct + 1

    # Report once, after every test row has been classified.
    print('classification accuracy =', count_correct / num_test * 100)
        
# Run the classifier only when this file is executed as a script; importing
# the module must not trigger a training run.
if __name__ == '__main__':
    sys.exit(int(main() or 0))

tags: