For this project, we'll be looking at the Medical Cost Data Set from Kaggle. This dataset contains information on individual medical insurance bills. Each bill is associated with some demographic and personal characteristics of the person who received it.
For our regression problem, we're interested in how these characteristics relate to the total medical cost. The cost is a continuous, positive number, which makes it a good candidate for linear regression. For this guided project, we want to construct the best possible predictive model of cost, given some information about the patient. Predicting medical costs matters because it helps hospitals forecast revenue and plan for the procedures their patient population will need.
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
# Load in the insurance dataset
insurance = pd.read_csv("insurance.csv")
# Columns in the dataset
insurance.columns
Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')
insurance.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   age       1338 non-null   int64
 1   sex       1338 non-null   object
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64
 4   smoker    1338 non-null   object
 5   region    1338 non-null   object
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
The charges column is our outcome; the other columns are the potential predictors available to the model.
insurance.hist("charges")
array([[<Axes: title={'center': 'charges'}>]], dtype=object)
The charges column is highly skewed to the right: most bills are moderate, but a long tail of extremely expensive charges pulls the distribution out, and these extreme high values occur far more often than extremely small ones. This makes it unlikely that the errors in a model of the raw charges will truly be centered at zero, so it is worth log-transforming the outcome.
insurance["log_charges"] = np.log2(insurance["charges"])
insurance.hist("log_charges")
array([[<Axes: title={'center': 'log_charges'}>]], dtype=object)
The log-transformed charges values are much more symmetric, which is what we wanted. This makes it more plausible that the model's errors will be unbiased.
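As a quick numerical check on the two histograms, we can compare the skewness of the raw and log-transformed charges; positive skew indicates a right tail. This is a small sketch using pandas' built-in skew(), with the output omitted here.
# Skewness of the raw vs. log-transformed charges (positive = right-skewed)
insurance[["charges", "log_charges"]].skew()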
# Checking the correlation between the continuous columns in the insurance data
insurance.corr(numeric_only=True)
|             | age      | bmi      | children | charges  | log_charges |
|-------------|----------|----------|----------|----------|-------------|
| age         | 1.000000 | 0.109272 | 0.042469 | 0.299008 | 0.527834    |
| bmi         | 0.109272 | 1.000000 | 0.012759 | 0.198341 | 0.132669    |
| children    | 0.042469 | 0.012759 | 1.000000 | 0.067998 | 0.161336    |
| charges     | 0.299008 | 0.198341 | 0.067998 | 1.000000 | 0.892964    |
| log_charges | 0.527834 | 0.132669 | 0.161336 | 0.892964 | 1.000000    |
age has a correlation of about 0.30 with charges, bmi about 0.20, and children about 0.07. Against the log-transformed outcome, age's correlation is noticeably higher, at about 0.53.
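Since the model will target the log-transformed outcome, it can be more useful to rank the numeric predictors by their correlation with log_charges directly. A small sketch of that check (just a reordering of the table above):
# Correlations with the log-transformed outcome, strongest first
insurance.corr(numeric_only=True)["log_charges"].sort_values(ascending=False)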
insurance.boxplot(column = ["log_charges"], by = "sex")
<Axes: title={'center': 'log_charges'}, xlabel='sex'>
insurance.boxplot(column = ["log_charges"], by = "smoker")
<Axes: title={'center': 'log_charges'}, xlabel='smoker'>
insurance.boxplot(column = ["log_charges"], by = "region")
<Axes: title={'center': 'log_charges'}, xlabel='region'>
Males seem to have a wider spread of charges than females. Smokers have much higher costs than non-smokers. There don't appear to be any appreciable differences between regions.
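To put a rough number on the smoker gap visible in the boxplots, a quick group summary of the raw charges helps; this is a sketch, with the output omitted.
# Median charges for smokers vs. non-smokers
insurance.groupby("smoker")["charges"].median()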
Based on the univariate relationships shown above, age, bmi, and smoking status are all positively associated with higher charges. We'll include these predictors in our final model.
# Encode smoker as a boolean predictor, then split the data into training and test sets
insurance["is_smoker"] = insurance["smoker"] == "yes"
X = insurance[["age", "bmi", "is_smoker"]]
y = insurance["log_charges"]
# 75% for training set, 25% for test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25,
random_state = 1)
# Training and checking model performance on training set
insurance_model = LinearRegression()
insurance_model.fit(X_train, y_train)
LinearRegression()
# Get predicted values by model
y_pred = insurance_model.predict(X_train)
# MSE on the log scale for the insurance charges
mean_squared_error(y_train, y_pred)
0.4546665339270644
# Back-transforming the log-scale MSE with base 2 (since we used log2);
# this is only a rough multiplicative error factor, not the MSE in dollars
np.exp2(mean_squared_error(y_train, y_pred))
1.370466
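Exponentiating a log-scale MSE only gives a crude multiplicative summary. If we want an error measure in actual dollars, one option is to back-transform the predictions themselves (again base 2, to invert the log2 transform) and compare them to the raw charges. A sketch of that idea, using mean absolute error for interpretability; the resulting value isn't shown here.
# Back-transform predictions to dollars and measure error on the original scale
from sklearn.metrics import mean_absolute_error
train_pred_dollars = np.exp2(y_pred)       # 2 ** predicted log2 charges
train_true_dollars = np.exp2(y_train)      # recovers the original charges
mean_absolute_error(train_true_dollars, train_pred_dollars)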
# Coefficient of determination
r2_score(y_train, y_pred)
0.7421118855283421
# Quick visual check of residuals
check = pd.DataFrame()
check["residuals"] = y_train - y_pred
check["fitted"] = y_pred
check.plot.scatter(x = "fitted", y = "residuals")
<Axes: xlabel='fitted', ylabel='residuals'>
The residuals suggest some violations of the linear regression assumptions. As the fitted values get larger, the residuals trend downward, whereas we would expect an even band centered around zero. This doesn't necessarily make the predictions unusable, but it does call the model's assumptions into question.
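To see where this pattern comes from, one option is to color the residuals by smoking status, since smoking was by far the strongest categorical predictor. This is a sketch building on the check DataFrame above; the added smoker column and the color mapping are illustrative choices.
# Color residuals by smoker status to look for group-specific structure
check["smoker"] = insurance.loc[check.index, "smoker"]
colors = check["smoker"].map({"yes": "red", "no": "blue"})
check.plot.scatter(x = "fitted", y = "residuals", c = colors)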
# Getting the non-intercept coefficients
insurance_model.coef_
array([0.04892865, 0.01523672, 2.23063344])
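Because the outcome is log2(charges), each coefficient can be read on a multiplicative scale by raising 2 to its power. A small sketch pairing the coefficients with their feature names (coef_table is just an illustrative name):
# Pair each coefficient with its feature and express it as a multiplier on charges
coef_table = pd.DataFrame({
    "feature": X_train.columns,
    "coef_log2": insurance_model.coef_,
    "multiplier_per_unit": np.exp2(insurance_model.coef_),
})
coef_table
For example, the smoker coefficient of about 2.23 on the log2 scale corresponds to roughly a 2 ** 2.23 ≈ 4.7-fold increase in predicted charges, holding age and bmi fixed.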
Note: we are not testing whether these associations are statistically significant, so we can't claim they are truly non-zero. Our primary goal here is prediction.
# Getting MSE on the test set
test_pred = insurance_model.predict(X_test)
mean_squared_error(y_test, test_pred)
0.4355350875308211
# Back-transforming the test-set MSE with base 2 (again, only a rough multiplicative measure)
np.exp2(mean_squared_error(y_test, test_pred))
1.352412
The test MSE was about 0.435, while the training MSE was about 0.454. The two errors match closely (the test error is even slightly lower), so the model does not appear to be overfit. The residual plot, however, suggests that the model predicts far lower costs for some subjects who were actually charged much more, so it struggles with these high-cost cases. On the whole, its predictions are too conservative.
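As an extra guard against overfitting beyond a single train/test comparison, we could also cross-validate the model on the training set. A brief sketch, assuming the same three predictors; cv_scores is an illustrative name and the fold count is arbitrary.
# 5-fold cross-validated MSE on the log2 scale (sklearn negates the score by convention)
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(LinearRegression(), X_train, y_train,
                            scoring = "neg_mean_squared_error", cv = 5)
-cv_scores.mean()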
We might improve the model by including more complex terms in the regression, such as interactions or quadratic terms.
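One way to try this in scikit-learn is to expand the feature matrix with PolynomialFeatures before fitting, which adds squared terms and pairwise interactions such as age × is_smoker. A minimal sketch of that idea; the degree is illustrative, not tuned.
# Degree-2 expansion: original features, their squares, and pairwise interactions
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
poly_model = make_pipeline(PolynomialFeatures(degree = 2, include_bias = False),
                           LinearRegression())
poly_model.fit(X_train, y_train)
mean_squared_error(y_test, poly_model.predict(X_test))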