Case Study Solution by Nandini Saagar¶

Question 1¶

As a decision maker, looking into these given booked accounts, how would you want to take a decision to approve/decline the same applications again in future? What would be your screening criterion(s), if any?

Explanation:¶

If a given person is creditworthy, their loan application should be approved; if not, it should be declined. The following features are relevant to making that decision (a rule-of-thumb sketch of such a screen follows the list):

  • loan_amnt
  • term
  • installment
  • emp_length
  • home_ownership
  • annual_income
  • verification_status
  • dti
  • delinq_2yrs
  • mths_since_last_delinq
  • open_acc
  • pub_rec
  • revol_bal
  • total_acc
  • fico_score
  • lti
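
As a rough sketch of how such a screen could be expressed (every cutoff below is a hypothetical placeholder for illustration, not a value derived from the data):

def screen_application(app):
    """Return True to approve the application, False to decline it."""
    # Hypothetical cutoffs -- in practice these would be calibrated
    # against the booked-accounts data (see the models in Question 2)
    if app["fico_score"] < 660:   # weak credit history
        return False
    if app["dti"] > 25:           # heavily leveraged relative to income
        return False
    if app["lti"] > 0.5:          # loan too large relative to income
        return False
    if app["delinq_2yrs"] > 2:    # repeated recent delinquencies
        return False
    if app["pub_rec"] > 0:        # derogatory public records on file
        return False
    return True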

Question 2¶

Please provide a data-driven rationale to substantiate the appropriateness of the characteristics you have identified in the screening logic.

Explanation:¶

To assess the appropriateness of the identified features, let's build several models using them and then evaluate and compare the models' accuracies.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
df = pd.read_csv("loan_dataset_final.csv", encoding="ISO-8859-1")
df.head()
Out[2]:
id member_id loan_amnt funded_amnt funded_amnt_inv term installment emp_title emp_length home_ownership ... collections_12_mths_ex_med pub_rec_bankruptcies interest_rate revol_utilization number_bc_gt_75 fico_score lti month_since_oldest_tl race_name gender
0 1000007 1225615 5150 5150 5150.0 60 months 132.58 atlantic tomorrows office 1 year RENT ... 0.0 1.0 0.1864 0.873 2 709 0.147176 124.0 White Female
1 1000030 1225638 20000 20000 20000.0 36 months 635.07 The Red Threads Inc. 6 years RENT ... 0.0 0.0 0.0890 0.354 0 744 0.277778 149.0 African American Female
2 1000033 1225642 12800 12800 12750.0 60 months 316.54 T-Mobile USA Inc 9 years MORTGAGE ... 0.0 0.0 0.1677 0.754 0 719 0.156655 176.0 White Female
3 1000045 1225655 14000 14000 14000.0 60 months 349.98 Trader Joe's 9 years MORTGAGE ... 0.0 0.0 0.1727 0.357 0 714 0.194444 140.0 White Female
4 1000067 1225680 15000 15000 14975.0 60 months 370.94 Truevance Engineering < 1 year RENT ... 0.0 0.0 0.1677 0.369 0 709 0.208333 136.0 White Female

5 rows × 47 columns

Identifying the defaulters¶

In [3]:
df["is_defaulter"] = df["loan_status"].apply(
    lambda x: 1 if x in ["Charged Off", "Default"] else 0
)

Here, a new column named "is_defaulter" is added to the DataFrame df based on the values in the existing "loan_status" column.

The apply function applies a function to each element of the "loan_status" column; in this case, the function is defined with a lambda expression.

lambda x: 1 if x in ["Charged Off", "Default"] else 0 is applied to each element x of the "loan_status" column and checks whether the value of x is either "Charged Off" or "Default":

  • If x is either "Charged Off" or "Default", the lambda function returns 1.
  • Otherwise, it returns 0.
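
Note that an equivalent, typically faster vectorized form uses pandas' isin:

df["is_defaulter"] = df["loan_status"].isin(["Charged Off", "Default"]).astype(int)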
In [4]:
df.head()
Out[4]:
id member_id loan_amnt funded_amnt funded_amnt_inv term installment emp_title emp_length home_ownership ... pub_rec_bankruptcies interest_rate revol_utilization number_bc_gt_75 fico_score lti month_since_oldest_tl race_name gender is_defaulter
0 1000007 1225615 5150 5150 5150.0 60 months 132.58 atlantic tomorrows office 1 year RENT ... 1.0 0.1864 0.873 2 709 0.147176 124.0 White Female 0
1 1000030 1225638 20000 20000 20000.0 36 months 635.07 The Red Threads Inc. 6 years RENT ... 0.0 0.0890 0.354 0 744 0.277778 149.0 African American Female 0
2 1000033 1225642 12800 12800 12750.0 60 months 316.54 T-Mobile USA Inc 9 years MORTGAGE ... 0.0 0.1677 0.754 0 719 0.156655 176.0 White Female 0
3 1000045 1225655 14000 14000 14000.0 60 months 349.98 Trader Joe's 9 years MORTGAGE ... 0.0 0.1727 0.357 0 714 0.194444 140.0 White Female 0
4 1000067 1225680 15000 15000 14975.0 60 months 370.94 Truevance Engineering < 1 year RENT ... 0.0 0.1677 0.369 0 709 0.208333 136.0 White Female 0

5 rows × 48 columns

Performing Exploratory Data Analysis:¶

In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42535 entries, 0 to 42534
Data columns (total 48 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          42535 non-null  int64  
 1   member_id                   42535 non-null  int64  
 2   loan_amnt                   42535 non-null  int64  
 3   funded_amnt                 42535 non-null  int64  
 4   funded_amnt_inv             42535 non-null  float64
 5   term                        42535 non-null  object 
 6   installment                 42535 non-null  float64
 7   emp_title                   39909 non-null  object 
 8   emp_length                  41423 non-null  object 
 9   home_ownership              42535 non-null  object 
 10  annual_inc                  42531 non-null  float64
 11  verification_status         42535 non-null  object 
 12  issue_d                     42535 non-null  object 
 13  loan_status                 42535 non-null  object 
 14  pymnt_plan                  42535 non-null  object 
 15  desc                        29014 non-null  object 
 16  purpose                     42535 non-null  object 
 17  title                       42522 non-null  object 
 18  addr_state                  42535 non-null  object 
 19  dti                         42535 non-null  float64
 20  delinq_2yrs                 42506 non-null  float64
 21  inq_last_6mths              42506 non-null  float64
 22  mths_since_last_delinq      15609 non-null  float64
 23  mths_since_last_record      3651 non-null   float64
 24  open_acc                    42506 non-null  float64
 25  pub_rec                     42506 non-null  float64
 26  revol_bal                   42535 non-null  int64  
 27  total_acc                   42506 non-null  float64
 28  out_prncp                   42535 non-null  float64
 29  out_prncp_inv               42535 non-null  float64
 30  total_pymnt                 42535 non-null  float64
 31  total_pymnt_inv             42535 non-null  float64
 32  total_rec_prncp             42535 non-null  float64
 33  total_rec_int               42535 non-null  float64
 34  total_rec_late_fee          42535 non-null  float64
 35  recoveries                  42535 non-null  float64
 36  collection_recovery_fee     42535 non-null  float64
 37  collections_12_mths_ex_med  42390 non-null  float64
 38  pub_rec_bankruptcies        41170 non-null  float64
 39  interest_rate               42535 non-null  float64
 40  revol_utilization           42445 non-null  float64
 41  number_bc_gt_75             42535 non-null  int64  
 42  fico_score                  42535 non-null  int64  
 43  lti                         42531 non-null  float64
 44  month_since_oldest_tl       42506 non-null  float64
 45  race_name                   42535 non-null  object 
 46  gender                      42535 non-null  object 
 47  is_defaulter                42535 non-null  int64  
dtypes: float64(26), int64(8), object(14)
memory usage: 15.6+ MB
In [6]:
df.describe()
Out[6]:
id member_id loan_amnt funded_amnt funded_amnt_inv installment annual_inc dti delinq_2yrs inq_last_6mths ... collection_recovery_fee collections_12_mths_ex_med pub_rec_bankruptcies interest_rate revol_utilization number_bc_gt_75 fico_score lti month_since_oldest_tl is_defaulter
count 4.253500e+04 4.253500e+04 42535.000000 42535.000000 42535.000000 42535.000000 4.253100e+04 42535.000000 42506.000000 42506.000000 ... 42535.000000 42390.0 41170.000000 42535.000000 42445.000000 42535.000000 42535.000000 42531.000000 42506.000000 42535.000000
mean 6.645799e+05 8.257026e+05 11089.722581 10821.585753 10139.830603 322.623063 6.913656e+04 13.373043 0.152449 1.306357 ... 13.956737 0.0 0.045227 0.121650 0.442584 0.499988 723.065240 0.186105 107.567073 0.132926
std 2.193022e+05 2.795409e+05 7410.938391 7146.914675 7131.686447 208.927216 6.409635e+04 6.726315 0.512406 1.711415 ... 159.621861 0.0 0.208737 0.037079 0.306107 0.785378 30.241456 0.116228 58.874550 0.339499
min 5.473400e+04 7.047300e+04 500.000000 500.000000 0.000000 15.670000 1.896000e+03 0.000000 0.000000 0.000000 ... 0.000000 0.0 0.000000 0.054200 0.000000 0.000000 619.000000 0.000789 1.000000 0.000000
25% 4.983925e+05 6.384795e+05 5200.000000 5000.000000 4950.000000 165.520000 4.000000e+04 8.200000 0.000000 0.000000 ... 0.000000 0.0 0.000000 0.096300 0.166000 0.000000 704.000000 0.097674 67.000000 0.000000
50% 6.442500e+05 8.241780e+05 9700.000000 9600.000000 8500.000000 277.690000 5.900000e+04 13.470000 0.000000 1.000000 ... 0.000000 0.0 0.000000 0.119900 0.446000 0.000000 719.000000 0.162813 98.000000 0.000000
75% 8.258225e+05 1.033946e+06 15000.000000 15000.000000 14000.000000 428.180000 8.250000e+04 18.680000 0.000000 2.000000 ... 0.000000 0.0 0.000000 0.147200 0.701000 1.000000 739.000000 0.250000 134.000000 0.000000
max 1.077501e+06 1.314167e+06 35000.000000 35000.000000 35000.000000 1305.190000 6.000000e+06 29.990000 13.000000 33.000000 ... 7002.190000 0.0 2.000000 0.245900 1.088000 3.000000 829.000000 1.337500 685.000000 1.000000

8 rows × 34 columns

In [7]:
df['is_defaulter'].value_counts()
Out[7]:
is_defaulter
0    36881
1     5654
Name: count, dtype: int64
In [8]:
sns.countplot(data=df,x='is_defaulter')
Out[8]:
<Axes: xlabel='is_defaulter', ylabel='count'>
In [9]:
sns.boxplot(x='is_defaulter',y='dti',data=df)
Out[9]:
<Axes: xlabel='is_defaulter', ylabel='dti'>
In [10]:
sns.boxplot(x='is_defaulter',y='delinq_2yrs',data=df)
Out[10]:
<Axes: xlabel='is_defaulter', ylabel='delinq_2yrs'>
In [11]:
sns.boxplot(
    x='is_defaulter',
    y='mths_since_last_delinq',
    data=df
)
Out[11]:
<Axes: xlabel='is_defaulter', ylabel='mths_since_last_delinq'>
In [12]:
sns.boxplot(
    x='is_defaulter',
    y='fico_score',
    data=df
)
Out[12]:
<Axes: xlabel='is_defaulter', ylabel='fico_score'>
In [13]:
sns.boxplot(
    x='is_defaulter',
    y='lti',
    data=df
)
Out[13]:
<Axes: xlabel='is_defaulter', ylabel='lti'>
In [14]:
sns.scatterplot(
    x='fico_score',
    y='lti',
    data=df,
    hue='is_defaulter'
)
Out[14]:
<Axes: xlabel='fico_score', ylabel='lti'>
In [15]:
sns.pairplot(
    df[[
        'loan_amnt', 
        'annual_inc', 
        'dti', 
        'fico_score', 
        'lti', 
        'is_defaulter'
    ]],
    hue='is_defaulter' 
)
Out[15]:
<seaborn.axisgrid.PairGrid at 0x137178c10>

Preparing the Data¶

Encoding Categorical Features:¶

Here, the categorical features are encoded as integers using the LabelEncoder class from the sklearn.preprocessing module.

In addition, we handle the special columns that do not hold plain numerical values and convert them into numbers:

  1. Processing the "term" column:

    • It converts "term" values like "36 months" or "60 months" to integers 36 and 60 by splitting the string at spaces and taking the first part.
  2. Defining and applying parse_emp_length:

    • It defines a function parse_emp_length to handle different formats in the "emp_length" column:
      • If the value is a float (e.g., NaN), it returns the value as is.
      • It strips any leading or trailing spaces from the string.
      • It converts "< 1 year" to 0.5 and "10+ years" to 10.
      • For other values like "3 years", it splits the string and takes the first part, converting it to an integer.
    • This function is applied to the "emp_length" column, standardizing the values into numerical format.
In [16]:
# Encoding categorical features
from sklearn.preprocessing import LabelEncoder

for feature in [
    "home_ownership",
    "verification_status",
]:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])

# Handle specific columns with special-format data
df["term"] = df["term"].apply(lambda x: int(x.split()[0]))


def parse_emp_length(x):
    # Missing values come through as float NaN -- pass them through unchanged
    if isinstance(x, float):
        return x
    x = x.strip()
    if x == "< 1 year":
        return 0.5
    if x == "10+ years":
        return 10
    # e.g. "3 years" -> 3
    return int(x.split()[0])


df["emp_length"] = df["emp_length"].apply(parse_emp_length)

df.head()
Out[16]:
id member_id loan_amnt funded_amnt funded_amnt_inv term installment emp_title emp_length home_ownership ... pub_rec_bankruptcies interest_rate revol_utilization number_bc_gt_75 fico_score lti month_since_oldest_tl race_name gender is_defaulter
0 1000007 1225615 5150 5150 5150.0 60 132.58 atlantic tomorrows office 1.0 4 ... 1.0 0.1864 0.873 2 709 0.147176 124.0 White Female 0
1 1000030 1225638 20000 20000 20000.0 36 635.07 The Red Threads Inc. 6.0 4 ... 0.0 0.0890 0.354 0 744 0.277778 149.0 African American Female 0
2 1000033 1225642 12800 12800 12750.0 60 316.54 T-Mobile USA Inc 9.0 0 ... 0.0 0.1677 0.754 0 719 0.156655 176.0 White Female 0
3 1000045 1225655 14000 14000 14000.0 60 349.98 Trader Joe's 9.0 0 ... 0.0 0.1727 0.357 0 714 0.194444 140.0 White Female 0
4 1000067 1225680 15000 15000 14975.0 60 370.94 Truevance Engineering 0.5 4 ... 0.0 0.1677 0.369 0 709 0.208333 136.0 White Female 0

5 rows × 48 columns

While selecting features, we keep the following points in mind:

  • All of these features must be available at the time of application.
  • They should ideally not contain any personally identifiable information, so that the model does not pose privacy risks.
  • Attributes like race, gender, and age should not be used, as they can lead to biased predictions.
  • The features should at least loosely correlate with the target variable and with the applicant's financial health, as dictated by common sense.

After that, we also drop the columns with a large number of NaN values and fill the remaining missing values with the column mean, so that the model does not have to handle missing data.

In [17]:
features = [
    "loan_amnt",
    "term",
    "installment",
    "emp_length",
    "home_ownership",
    "annual_inc",
    "verification_status",
    "dti",
    "delinq_2yrs",
    "inq_last_6mths",
    "mths_since_last_delinq",
    "open_acc",
    "pub_rec",
    "revol_bal",
    "total_acc",
    "fico_score",
    "lti",
]

# Remove mths_since_last_delinq and emp_length columns as they have a lot of NaN values
features.remove("mths_since_last_delinq")
features.remove("emp_length")

# Fill NaN values with the mean of the column
for feature in features:
    df[feature] = df[feature].fillna(df[feature].mean())

df = df[features + ["is_defaulter"]]
df.head()
Out[17]:
loan_amnt term installment home_ownership annual_inc verification_status dti delinq_2yrs inq_last_6mths open_acc pub_rec revol_bal total_acc fico_score lti is_defaulter
0 5150 60 132.58 4 34992.0 1 17.59 0.0 3.0 15.0 1.0 15187 21.0 709 0.147176 0
1 20000 36 635.07 4 72000.0 2 7.80 0.0 2.0 6.0 0.0 15637 14.0 744 0.277778 0
2 12800 60 316.54 0 81708.0 1 14.22 1.0 1.0 7.0 0.0 905 23.0 719 0.156655 0
3 14000 60 349.98 0 72000.0 2 20.35 0.0 1.0 16.0 0.0 9218 39.0 714 0.194444 0
4 15000 60 370.94 4 72000.0 2 20.50 0.0 0.0 12.0 0.0 10891 30.0 709 0.208333 0

Prepare the Data for Training¶

In [18]:
X = df.drop('is_defaulter',axis=1)
y = df['is_defaulter']
In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1
)

scaler = StandardScaler()

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)
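
Note: since only about 13% of the loans in this dataset are defaults, passing stratify=y and a fixed random_state to train_test_split would keep the class balance consistent between the train and test sets and make the numbers below reproducible, e.g.:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)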

Logistic Regression Model¶

In [20]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression()
log_model.fit(scaled_X_train, y_train)
Out[20]:
LogisticRegression()
In [21]:
log_model.coef_
Out[21]:
array([[ 0.8466957 ,  0.12428815, -0.82952042, -0.03614688, -0.06289597,
         0.09657995, -0.12918231, -0.20017166,  0.39097567, -0.17862258,
        -0.06189653, -0.05227772,  0.17617472, -1.93008062,  0.34631879]])

Performance¶

In [22]:
from sklearn.metrics import accuracy_score,classification_report

y_pred = log_model.predict(scaled_X_test)
accuracy_score(y_test, y_pred)
Out[22]:
0.8930418429713212
In [23]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cm = confusion_matrix(y_test,y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()
In [24]:
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      3685
           1       0.70      0.35      0.47       569

    accuracy                           0.89      4254
   macro avg       0.80      0.66      0.70      4254
weighted avg       0.88      0.89      0.88      4254

Model Accuracy¶

89%
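
For context, a trivial model that always predicts "non-defaulter" would already score about 87% on this dataset, so accuracy alone overstates performance here:

# Majority-class baseline: 36881 non-defaulters out of 42535 loans
print(36881 / 42535)  # ~0.867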

Evaluating Curves and AUC¶

In [25]:
from sklearn.metrics import PrecisionRecallDisplay
from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)

roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc)
display.plot()
plt.show()

PrecisionRecallDisplay.from_predictions(y_test, y_pred)
plt.show()
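
Note: the curves above are computed from the hard 0/1 predictions, which gives only a single operating point. A sketch of the more informative version, using the predicted probabilities of the defaulter class instead:

y_proba = log_model.predict_proba(scaled_X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_proba)
metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=metrics.auc(fpr, tpr)).plot()
plt.show()

PrecisionRecallDisplay.from_predictions(y_test, y_proba)
plt.show()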

Decision Tree Classifier¶

In [26]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
Out[26]:
DecisionTreeClassifier()
In [27]:
base_pred = model.predict(X_test)
print(confusion_matrix(y_test, base_pred))

cm = confusion_matrix(y_test, base_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()
[[3408  277]
 [ 218  351]]
In [28]:
print(classification_report(y_test,base_pred))
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      3685
           1       0.56      0.62      0.59       569

    accuracy                           0.88      4254
   macro avg       0.75      0.77      0.76      4254
weighted avg       0.89      0.88      0.89      4254

Model Accuracy¶

88%

In [29]:
model.feature_importances_
Out[29]:
array([0.03380809, 0.01203519, 0.07125944, 0.01148217, 0.05051299,
       0.01777959, 0.07939944, 0.00651266, 0.10958668, 0.03386672,
       0.0048568 , 0.07596503, 0.04619592, 0.36237202, 0.08436725])
In [30]:
feat_imp = pd.DataFrame(
    index=X.columns,
    data=model.feature_importances_,
    columns=['Feature Importance']
)
# sort_values with inplace=True returns None, so sort the named
# DataFrame in place and display it afterwards
feat_imp.sort_values('Feature Importance', ascending=False, inplace=True)
feat_imp
Out[30]:
Feature Importance
fico_score 0.362372
inq_last_6mths 0.109587
lti 0.084367
dti 0.079399
revol_bal 0.075965
installment 0.071259
annual_inc 0.050513
total_acc 0.046196
open_acc 0.033867
loan_amnt 0.033808
verification_status 0.017780
term 0.012035
home_ownership 0.011482
delinq_2yrs 0.006513
pub_rec 0.004857
In [31]:
sns.boxplot(x='fico_score',y='lti',data=df)
Out[31]:
<Axes: xlabel='fico_score', ylabel='lti'>

Random Forest Classifier¶

In [32]:
from sklearn.ensemble import RandomForestClassifier

# Use 10 random trees
model = RandomForestClassifier(n_estimators=10)
model.fit(X_train, y_train)
Out[32]:
RandomForestClassifier(n_estimators=10)
In [33]:
base_pred = model.predict(X_test)

#  Plot the confusion matrix
cm = confusion_matrix(y_test, base_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()
In [34]:
print(classification_report(y_test, base_pred))
              precision    recall  f1-score   support

           0       0.93      0.98      0.96      3685
           1       0.83      0.52      0.64       569

    accuracy                           0.92      4254
   macro avg       0.88      0.75      0.80      4254
weighted avg       0.92      0.92      0.91      4254

Model Accuracy¶

92%

In [35]:
model.feature_importances_
Out[35]:
array([0.05017686, 0.02063   , 0.06817122, 0.01269615, 0.06348725,
       0.01818864, 0.06878396, 0.01016184, 0.14855666, 0.04503741,
       0.00443753, 0.07022704, 0.05319116, 0.29134898, 0.07490531])
In [36]:
feat_imp = pd.DataFrame(
    index=X.columns, 
    data=model.feature_importances_, 
    columns=["Feature Importance"]
)
feat_imp.sort_values(by="Feature Importance", ascending=False, inplace=True)
feat_imp
Out[36]:
Feature Importance
fico_score 0.291349
inq_last_6mths 0.148557
lti 0.074905
revol_bal 0.070227
dti 0.068784
installment 0.068171
annual_inc 0.063487
total_acc 0.053191
loan_amnt 0.050177
open_acc 0.045037
term 0.020630
verification_status 0.018189
home_ownership 0.012696
delinq_2yrs 0.010162
pub_rec 0.004438

Comparing Model Accuracies¶

  • Logistic Regression: 89%
  • Decision Tree: 88%
  • Random Forest: 92%
In [37]:
data = {
    "Logistic Regression": 89,
    "Decision Tree": 88,
    "Random Forest": 92,
}
plt.bar(data.keys(), data.values())
plt.ylabel("Accuracy")
plt.show()

Question 3¶

Given that you have a decision engine which helps you take a call based on the given data, how would you identify if your process has an unintentional bias and discriminates against your customers based on some of the sensitive attributes available in this dataset?

Explanation:¶

None of the models trained above uses sensitive attributes such as race or gender. They are trained only on features that are available at the time of application and are not personally identifiable. These features also correlate (at least loosely) with the target variable and the applicant's financial health, not with the applicant's background.

As an experiment, we can also train a model that includes features like race and gender and see how much they influence its predictions.

In [38]:
# Reload the dataset from scratch
df = pd.read_csv("loan_dataset_final.csv", encoding="ISO-8859-1")

df["is_defaulter"] = df["loan_status"].apply(
    lambda x: 1 if x in ["Charged Off", "Default"] else 0
)
df["term"] = df["term"].apply(lambda x: int(x.split()[0]))
df["emp_length"] = df["emp_length"].apply(parse_emp_length)
for feature in ["home_ownership", "verification_status", "race_name", "gender"]:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])

df.head()
Out[38]:
id member_id loan_amnt funded_amnt funded_amnt_inv term installment emp_title emp_length home_ownership ... pub_rec_bankruptcies interest_rate revol_utilization number_bc_gt_75 fico_score lti month_since_oldest_tl race_name gender is_defaulter
0 1000007 1225615 5150 5150 5150.0 60 132.58 atlantic tomorrows office 1.0 4 ... 1.0 0.1864 0.873 2 709 0.147176 124.0 3 0 0
1 1000030 1225638 20000 20000 20000.0 36 635.07 The Red Threads Inc. 6.0 4 ... 0.0 0.0890 0.354 0 744 0.277778 149.0 0 0 0
2 1000033 1225642 12800 12800 12750.0 60 316.54 T-Mobile USA Inc 9.0 0 ... 0.0 0.1677 0.754 0 719 0.156655 176.0 3 0 0
3 1000045 1225655 14000 14000 14000.0 60 349.98 Trader Joe's 9.0 0 ... 0.0 0.1727 0.357 0 714 0.194444 140.0 3 0 0
4 1000067 1225680 15000 15000 14975.0 60 370.94 Truevance Engineering 0.5 4 ... 0.0 0.1677 0.369 0 709 0.208333 136.0 3 0 0

5 rows × 48 columns

In [39]:
unbiased_features = [
    "loan_amnt",
    "term",
    "installment",
    "home_ownership",
    "annual_inc",
    "verification_status",
    "dti",
    "delinq_2yrs",
    "inq_last_6mths",
    "open_acc",
    "pub_rec",
    "revol_bal",
    "total_acc",
    "fico_score",
    "lti",
]
biased_features = unbiased_features + ["race_name", "gender"]

all_features = biased_features + ["is_defaulter"]
df = df[all_features].fillna(df[all_features].mean())

df.head()
Out[39]:
loan_amnt term installment home_ownership annual_inc verification_status dti delinq_2yrs inq_last_6mths open_acc pub_rec revol_bal total_acc fico_score lti race_name gender is_defaulter
0 5150 60 132.58 4 34992.0 1 17.59 0.0 3.0 15.0 1.0 15187 21.0 709 0.147176 3 0 0
1 20000 36 635.07 4 72000.0 2 7.80 0.0 2.0 6.0 0.0 15637 14.0 744 0.277778 0 0 0
2 12800 60 316.54 0 81708.0 1 14.22 1.0 1.0 7.0 0.0 905 23.0 719 0.156655 3 0 0
3 14000 60 349.98 0 72000.0 2 20.35 0.0 1.0 16.0 0.0 9218 39.0 714 0.194444 3 0 0
4 15000 60 370.94 4 72000.0 2 20.50 0.0 0.0 12.0 0.0 10891 30.0 709 0.208333 3 0 0
In [40]:
df_train, df_test = train_test_split(df, test_size=0.2)

# Train the model with unbiased features and get the accuracy
x_unbiased_train = df_train[unbiased_features]
y_unbiased_train = df_train["is_defaulter"]

x_unbiased_test = df_test[unbiased_features]
y_unbiased_test = df_test["is_defaulter"]

model = RandomForestClassifier(n_estimators=10)
model.fit(x_unbiased_train, y_unbiased_train)

y_unbiased_pred = model.predict(x_unbiased_test)
accuracy_score(y_unbiased_test, y_unbiased_pred)
Out[40]:
0.9187727753614671
In [41]:
# Train the model with biased features and get the accuracy
x_biased_train = df_train[biased_features]
y_biased_train = df_train["is_defaulter"]

x_biased_test = df_test[biased_features]
y_biased_test = df_test["is_defaulter"]

model = RandomForestClassifier(n_estimators=10)
model.fit(x_biased_train, y_biased_train)

y_biased_pred = model.predict(x_biased_test)
accuracy_score(y_biased_test, y_biased_pred)
Out[41]:
0.9173621723286705

Since the difference in accuracy between the two models is not significant, we can conclude that the model is not deciding loan approval based on gender or race. We can further examine the applicants for whom the model trained with the sensitive attributes predicts incorrectly.

In [42]:
# Entries where the model trained with sensitive features predicts incorrectly
misclassified_index = y_biased_pred != y_biased_test
misclassified_entries = df_test[misclassified_index]

print(f"Misclassified predictions: {len(misclassified_entries)/len(df_test)*100:.2f}%")
Misclassified predictions: 8.26%
In [43]:
# Plot the frequency of the race_name, gender and is_defaulter columns
for col in ["race_name", "gender", "is_defaulter"]:
    sns.countplot(data=df, x=col)
    plt.show()

The model's errors are split almost equally between the two genders, and somewhat unequally across the is_defaulter and race_name groups; the latter differences can be shown to mirror each group's share of the dataset as a whole. Thus the model does not appear biased even in the presence of columns like race and gender.
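
A more direct check, reusing the predictions from the cells above, is to compare misclassification rates across the (label-encoded) sensitive groups rather than eyeballing the count plots; roughly similar rates would indicate that the errors are not concentrated on any one group:

# Per-group misclassification rate of the model trained with sensitive features
errors = pd.Series(y_biased_pred != y_biased_test.values, index=y_biased_test.index)
for col in ["race_name", "gender"]:
    print(f"Misclassification rate by {col}:")
    print(errors.groupby(df_test[col]).mean(), "\n")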



Further Improvements that can be implemented in the Model¶

While these runs give a baseline model that can be used for predictions, there are several further ideas that could improve its accuracy and performance.

Some of them that I would like to explore as future scope include:

  • Moving from accuracy to F1 score as the target metric, given the skewed nature of the dataset
  • Using GridSearchCV to find the best hyperparameters for the models (a sketch follows below)
  • Searching over subsets (a bitmask) of all the features to find the best feature set for the model
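
As an illustration of the GridSearchCV idea, a minimal sketch (the parameter grid below is a hypothetical starting point, not a tuned recommendation):

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Using F1 as the scoring metric also addresses the class skew noted above
param_grid = {"n_estimators": [10, 50, 100], "max_depth": [None, 5, 10]}
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring="f1",
    cv=5,
)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)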