Logistic Regression
IBM Data Science Specialization: Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
confusion_matrix, classification_report, jaccard_score, log_loss
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
df = pd.read_csv('./data/ibm/churn_data.csv')
columns = [
'tenure',
'age',
'address',
'income',
'ed',
'employ',
'equip',
'callcard',
'wireless',
'churn'
]
churn = df[columns].copy()
churn['churn'] = churn['churn'].astype('int')
scaler = StandardScaler()
columns = [
'tenure',
'age',
'address',
'income',
'ed',
'employ',
'equip'
]
X = np.asarray(churn[columns])
X = scaler.fit_transform(X)
y = np.asarray(churn['churn'])
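As a quick sanity check on the scaling step above, StandardScaler transforms each feature column to (approximately) zero mean and unit variance. A minimal sketch on a toy matrix (the values here are hypothetical, not the churn data):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

# Toy feature matrix standing in for the churn features (hypothetical values)
X_demo = np.array([[1.0, 200.0],
                   [2.0, 400.0],
                   [3.0, 600.0]])
X_scaled = StandardScaler().fit_transform(X_demo)

# After scaling, each column has (approximately) zero mean and unit variance
print(X_scaled.mean(axis=0))
print(X_scaled.std(axis=0))
```

Scaling matters here because features like income and tenure live on very different numeric ranges, and regularized logistic regression (the C parameter) penalizes coefficients uniformly.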
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
lr = LogisticRegression(C=0.01, solver='liblinear').fit(X_train, y_train)
y_hat = lr.predict(X_test)
y_hat_prob = lr.predict_proba(X_test)
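Note that predict_proba returns one column per class, ordered by the classifier's classes_ attribute, and each row sums to 1. A minimal sketch on a tiny synthetic problem (hypothetical data, not the churn set):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

# Tiny synthetic binary problem (hypothetical, for illustration only)
X_demo = np.array([[0.0], [1.0], [2.0], [3.0]])
y_demo = np.array([0, 0, 1, 1])

clf = LogisticRegression(solver='liblinear').fit(X_demo, y_demo)

proba = clf.predict_proba(X_demo)
# Columns follow clf.classes_, so proba[:, 1] is P(class = 1)
print(clf.classes_)
# Each row of probabilities sums to 1
print(proba.sum(axis=1))
```

So in the churn example, y_hat_prob[:, 1] is the estimated probability of churn for each test customer.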
print(confusion_matrix(y_test, y_hat))
print(classification_report(y_test, y_hat))
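jaccard_score is imported above but not yet used. It measures the overlap between the predicted-positive and actual-positive sets: intersection over union for the churn = 1 class. A minimal sketch with hypothetical labels standing in for y_test and y_hat:

```python
import numpy as np
from sklearn.metrics import jaccard_score

# Hypothetical true labels and predictions (not the actual test results)
y_true = np.array([0, 1, 1, 0, 1])
y_pred = np.array([0, 1, 0, 0, 1])

# Positive-class Jaccard: |pred 1 AND true 1| / |pred 1 OR true 1| = 2 / 3
print(jaccard_score(y_true, y_pred, pos_label=1))
```

In the notebook, the same call would be jaccard_score(y_test, y_hat, pos_label=1); a score of 1.0 means predicted and actual churners coincide exactly.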
Log loss
Now, let's try log loss for evaluation. In logistic regression, the model's output is the probability that a customer churns (i.e., that churn equals 1), a value between 0 and 1. Log loss (logarithmic loss) measures the performance of a classifier whose predicted output is such a probability.
log_loss(y_test, y_hat_prob)
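Under the hood, log_loss averages the negative log-likelihood -[y * log(p) + (1 - y) * log(1 - p)] over the samples, where p is the predicted probability of class 1. A minimal check against sklearn with hypothetical labels and probabilities:

```python
import numpy as np
from sklearn.metrics import log_loss

# Hypothetical labels and predicted probabilities of class 1
y_true = np.array([1, 0, 1, 0])
p = np.array([0.9, 0.2, 0.6, 0.4])

# Log loss: mean of -[y*log(p) + (1-y)*log(1-p)]
manual = -np.mean(y_true * np.log(p) + (1 - y_true) * np.log(1 - p))
print(manual)
print(log_loss(y_true, p))  # matches the manual computation
```

Lower values are better: confident correct predictions (p near the true label) contribute little, while confident wrong predictions are penalized heavily.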