Write a Naive Bayes Classifier from scratch

Tags: ML Coding

Writing a classifier from scratch without relying on high-level machine learning libraries like TensorFlow or scikit-learn is an excellent exercise to test a candidate's fundamental understanding of machine learning algorithms. A simple yet educational task is to implement a Naive Bayes classifier or a Perceptron from scratch using Python. These models are relatively straightforward and provide insight into basic ML concepts.

Task: Write a Naive Bayes Classifier for Text Classification

Prompt: "Using Python, write a Naive Bayes classifier from scratch to categorize text documents into two classes. You can assume the input is a list of sentences (documents) and their associated labels. Also, implement basic preprocessing of the text."

Example Implementation

Here's a simplified version of how one might approach the problem. Note that, to keep the example short, it implements a Gaussian Naive Bayes classifier on numeric features rather than a count-based text model; the overall structure (estimate priors and per-class likelihoods, then pick the class with the highest log posterior) carries over to the text setting.
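In this Gaussian variant, training estimates a class prior P(c) plus a per-feature mean and variance for each class, and classification scores each class by its log posterior

    log P(c | x) = log P(c) + sum_i log N(x_i; mean_c_i, var_c_i) + constant

where N is the Gaussian density and the constant does not depend on the class; the predicted label is the argmax over classes. Working in log space keeps the product of many small likelihoods numerically stable.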

import numpy as np

class NaiveBayes:
    """Gaussian Naive Bayes classifier written from scratch with NumPy."""

    def __init__(self):
        self.classes = []
        self.n = 0
        self.means = []
        self.vars = []
        self.prior = []

    def train(self, X, y):
        # X: training samples, shape (n_samples, n_features)
        # y: class labels, shape (n_samples,)
        self.classes = np.unique(y)
        self.n = len(self.classes)
        # reset the parameter lists so train() can safely be called more than once
        self.means, self.vars, self.prior = [], [], []

        for c in self.classes:
            subset = X[y == c]
            self.means.append(np.mean(subset, axis=0))       # per-feature mean for class c
            self.vars.append(np.var(subset, axis=0))         # per-feature variance for class c
            self.prior.append(subset.shape[0] / X.shape[0])  # prior P(c)

    def classify(self, x):
        est_prob = []

        # compute the log posterior for each class
        for idx, (mean, var) in enumerate(zip(self.means, self.vars)):
            posterior = np.log(self.prior[idx]) + np.sum(np.log(self.gauss(mean, var, x)))
            est_prob.append(posterior)
        idx = np.argmax(est_prob, axis=0)
        est_labels = self.classes[idx]
        return est_prob, est_labels

    def gauss(self, mean, var, x):
        # element-wise Gaussian density of x under N(mean, var)
        num = np.exp(-((x - mean) ** 2) / (2 * var))
        den = np.sqrt(2 * np.pi * var)
        return num / den

# quick sanity check on synthetic 2-D data: label 1 when the two features sum to more than 10
dataset = np.random.rand(100, 2) * 10
labels = np.where(dataset[:, 0] + dataset[:, 1] > 10, 1, 0)

print(dataset[0:10], labels[0:10])

# shuffle and split into 80% train / 20% test
data_with_labels = np.column_stack((dataset, labels))
np.random.shuffle(data_with_labels)
split = int(0.8 * len(data_with_labels))
train, test = data_with_labels[:split, :], data_with_labels[split:, :]
x_train, y_train = train[:, :-1], train[:, -1]
x_test, y_test = test[:, :-1], test[:, -1]

naivebayes = NaiveBayes()
naivebayes.train(x_train, y_train)

preds = []
for x in x_test:
    _, pred = naivebayes.classify(x)
    preds.append(pred)
preds = np.array(preds)  # convert to an array so the element-wise comparisons below work

# evaluate without any external libraries
accuracy = np.sum(preds == y_test) / len(y_test)
TP = np.sum((preds == 1) & (y_test == 1))
FP = np.sum((preds == 1) & (y_test == 0))
TN = np.sum((preds == 0) & (y_test == 0))
FN = np.sum((preds == 0) & (y_test == 1))
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)
print(f"accuracy={accuracy:.3f} precision={precision:.3f} recall={recall:.3f} f1={f1_score:.3f}")

A fuller reference test, on data where the two classes are better separated, is included in the Background Knowledge section at the end of this note.

Associated Questions

  1. Feature Representation: "How did you represent the text data for modeling? Why is this representation effective for Naive Bayes?"
  2. Probability Estimation: "Can you explain how you calculated the prior and likelihood in your implementation? Why did you apply Laplace smoothing?"
  3. Model Evaluation: "Without using any libraries, how would you evaluate the performance of your classifier?"
  4. Improvements: "What limitations does the Naive Bayes classifier have? How could you potentially improve your implementation?"
  5. Scalability: "How does your implementation scale with the size of the dataset? What would you do to handle very large datasets or vocabularies?"

These questions challenge the candidate to demonstrate their understanding of basic machine learning principles, data preprocessing techniques, and the mathematics underpinning Naive Bayes. Additionally, discussing evaluation and scalability touches on practical aspects of machine learning applications.
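To make the first two questions concrete, here is a minimal sketch of the count-based (multinomial) Naive Bayes variant that the text prompt asks for, assuming a bag-of-words representation and Laplace (add-one) smoothing. The names used below (MultinomialNaiveBayesText, tokenize, alpha) are illustrative and not part of the Gaussian implementation above.

import math
import re
from collections import Counter, defaultdict

def tokenize(text):
    # very basic preprocessing: lowercase and keep alphanumeric tokens
    return re.findall(r"[a-z0-9']+", text.lower())

class MultinomialNaiveBayesText:
    def __init__(self, alpha=1.0):
        self.alpha = alpha       # Laplace (add-one) smoothing strength
        self.log_prior = {}      # log P(c), estimated from document frequencies
        self.word_counts = {}    # per-class token counts
        self.class_totals = {}   # total token count per class
        self.vocab = set()

    def train(self, documents, labels):
        class_doc_counts = Counter(labels)
        n_docs = len(documents)
        self.word_counts = {c: defaultdict(int) for c in class_doc_counts}
        self.class_totals = {c: 0 for c in class_doc_counts}
        for doc, c in zip(documents, labels):
            for token in tokenize(doc):
                self.word_counts[c][token] += 1
                self.class_totals[c] += 1
                self.vocab.add(token)
        self.log_prior = {c: math.log(count / n_docs) for c, count in class_doc_counts.items()}

    def classify(self, document):
        vocab_size = len(self.vocab)
        scores = {}
        for c in self.log_prior:
            score = self.log_prior[c]
            for token in tokenize(document):
                # Laplace-smoothed likelihood: (count + alpha) / (total + alpha * |V|)
                count = self.word_counts[c].get(token, 0)
                score += math.log((count + self.alpha) / (self.class_totals[c] + self.alpha * vocab_size))
            scores[c] = score
        return max(scores, key=scores.get)

# tiny usage example
docs = ["great movie, loved it", "terrible film, waste of time",
        "loved the acting", "waste of money, terrible"]
doc_labels = ["pos", "neg", "pos", "neg"]
clf = MultinomialNaiveBayesText()
clf.train(docs, doc_labels)
print(clf.classify("loved this movie"))  # expected: pos

The bag-of-words representation reduces each document to token counts, which is exactly what the multinomial likelihood models, and the add-one smoothing prevents a zero probability (and an undefined log) for words that never occur with a class during training.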

Background Knowledge

Reference test script on better-separated, Gaussian-distributed data:

import numpy as np

# This script reuses the NaiveBayes class defined above.
# Set a random seed for reproducibility
np.random.seed(42)
# Number of samples per class
num_samples = 10000
# Feature 1 for Class 0 (normally distributed with mean=2, std=1.5)
feature1_class0 = np.random.normal(2, 1.5, num_samples)
# Feature 2 for Class 0 (normally distributed with mean=3, std=1)
feature2_class0 = np.random.normal(3, 1, num_samples)
# Feature 1 for Class 1 (normally distributed with mean=5, std=1.5)
feature1_class1 = np.random.normal(5, 1.5, num_samples)
# Feature 2 for Class 1 (normally distributed with mean=8, std=1)
feature2_class1 = np.random.normal(8, 1, num_samples)

# Combine features and labels
data = np.concatenate([np.vstack([feature1_class0, feature2_class0]).T,
                       np.vstack([feature1_class1, feature2_class1]).T])
labels = np.concatenate([np.zeros(num_samples), np.ones(num_samples)])

# Shuffle indices
indices = np.arange(len(labels))
np.random.shuffle(indices)
# Define the split ratio
split_ratio = 0.8
# Determine the split point
split_point = int(len(labels) * split_ratio)
# Split the dataset
train_indices, test_indices = indices[:split_point], indices[split_point:]
X_train, X_test = data[train_indices], data[test_indices]
y_train, y_test = labels[train_indices], labels[test_indices]

# instantiate, train and predict Naive Bayes Classifier
nb = NaiveBayes()
nb.train(X_train, y_train)
predictions = []
for x in X_test:
    prob, pred_label = nb.classify(x)
    predictions.append(pred_label)
predictions = np.array(predictions)
# helper function to calculate accuracy
def get_accuracy(y_true, y_hat):
    return np.sum(y_true==y_hat) / len(y_true)
  
# print results
print('Naive Bayes Accuracy: ', get_accuracy(y_test, predictions))
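
# The same classifier should also handle the multi-class Iris dataset; the commented
# block below uses scikit-learn only to load and split the data.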
    
# from sklearn import datasets
# from sklearn.model_selection import train_test_split

# # load iris dataset
# iris = datasets.load_iris()
# X = iris.data
# y = iris.target

# # split into train and test data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# # instantiate, train and predict Naive Bayes Classifier
# nb = NaiveBayes()
# nb.train(X_train, y_train)
# predictions = []
# for x in X_test:
#     prob, pred_label = nb.classify(x)
#     predictions.append(pred_label)
# predictions = np.array(predictions)
# # helper function to calculate accuracy
# def get_accuracy(y_true, y_hat):
#     return np.sum(y_true==y_hat) / len(y_true)
  
# # print results
# print('Naive Bayes Accuracy: ', get_accuracy(y_test, predictions))