K-Nearest Neighbors Classifier: Predicting Heart Disease¶


Cardiovascular diseases (CVDs) remain a leading cause of mortality globally, with an estimated 17.9 million people dying from them annually. The World Health Organization (WHO) considers CVDs a serious public health challenge and recommends early identification of risk factors as a crucial step towards reducing the burden of these diseases. Risk factors such as unhealthy diet, physical inactivity, and mental illness are known to contribute significantly to the development of CVDs. Identifying these risk factors early is therefore imperative to prevent the onset of CVDs and reduce premature deaths.

In this project, we will use a Kaggle dataset to build a K-Nearest Neighbors classifier that accurately predicts the likelihood of a patient having heart disease.

Project and Data Overview¶

Our R&D company specializes in healthcare solutions and has collected anonymized patient data from multiple hospitals to predict the likelihood of a new patient having heart disease. The dataset includes relevant information for each patient, such as personal information and medical data, including whether or not they have been diagnosed with heart disease.

The dataset features are:

  • Age: age of the patient [years]
  • Sex: sex of the patient [M: Male, F: Female]
  • ChestPainType: chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
  • RestingBP: resting blood pressure [mm Hg]
  • Cholesterol: serum cholesterol [mg/dl]
  • FastingBS: fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
  • RestingECG: resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]
  • MaxHR: maximum heart rate achieved [Numeric value between 60 and 202]
  • ExerciseAngina: exercise-induced angina [Y: Yes, N: No]
  • Oldpeak: ST depression induced by exercise relative to rest [numeric value]
  • ST_Slope: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
  • HeartDisease: output class [1: heart disease, 0: Normal]

To accurately predict the likelihood of a new patient having heart disease in the future, we will conduct exploratory data analysis on the dataset before building a model.

Import Libraries and Read Data¶

To proceed with the Exploratory Data Analysis (EDA) phase, we first need to ensure that the necessary libraries for the project have been imported and the dataset has been loaded into our environment.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

# Set the maximum column width
pd.set_option('display.max_colwidth', 100)
# Magic function to display plots in notebook
%matplotlib inline
In [2]:
# Read dataset
heart_df = pd.read_csv('heart.csv')

# View first five rows
heart_df.head()
Out[2]:
Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR ExerciseAngina Oldpeak ST_Slope HeartDisease
0 40 M ATA 140 289 0 Normal 172 N 0.0 Up 0
1 49 F NAP 160 180 0 Normal 156 N 1.0 Flat 1
2 37 M ATA 130 283 0 ST 98 N 0.0 Up 0
3 48 F ASY 138 214 0 Normal 108 Y 1.5 Flat 1
4 54 M NAP 150 195 0 Normal 122 N 0.0 Up 0
In [3]:
print(f'Number of features: {heart_df.shape[1]}')
print(f'Number of observations: {heart_df.shape[0]}')
Number of features: 12
Number of observations: 918
In [4]:
# Check data types of the features
heart_df.dtypes
Out[4]:
Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

Exploratory Data Analysis¶

Descriptive Statistics¶

After examining the list of features and inspecting some of the rows from the dataset, we have identified both categorical and numerical features. The categorical features have a data type of object in this dataset. Notably, two columns - FastingBS and HeartDisease - have a data type of int64, but are categorical variables since they only take the values 0 and 1.
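We can confirm this split programmatically. Here's a quick sketch using pandas' select_dtypes (the column names come from the dataset above):

# Columns stored as 'object' are the categorical features in this dataset
cat_like = heart_df.select_dtypes(include='object').columns.tolist()

# Integer columns that are really binary categoricals
binary_like = [col for col in ['FastingBS', 'HeartDisease'] if heart_df[col].nunique() == 2]

print(cat_like + binary_like)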

Our initial focus will be on investigating and analyzing the numerical columns.

In [5]:
heart_df.describe()
Out[5]:
Age RestingBP Cholesterol FastingBS MaxHR Oldpeak HeartDisease
count 918.000000 918.000000 918.000000 918.000000 918.000000 918.000000 918.000000
mean 53.510893 132.396514 198.799564 0.233115 136.809368 0.887364 0.553377
std 9.432617 18.514154 109.384145 0.423046 25.460334 1.066570 0.497414
min 28.000000 0.000000 0.000000 0.000000 60.000000 -2.600000 0.000000
25% 47.000000 120.000000 173.250000 0.000000 120.000000 0.000000 0.000000
50% 54.000000 130.000000 223.000000 0.000000 138.000000 0.600000 1.000000
75% 60.000000 140.000000 267.000000 0.000000 156.000000 1.500000 1.000000
max 77.000000 200.000000 603.000000 1.000000 202.000000 6.200000 1.000000

Visualizations¶

Based on the provided descriptive statistics, we can make the following observations:

  • The average age of the patients is approximately 53 years.
  • The median Cholesterol value (223 mg/dl) is higher than the mean (roughly 199 mg/dl) by about 25 mg/dl, suggesting a left-skewed distribution with low outliers pulling the mean down.
  • RestingBP and Cholesterol have a minimum value of zero, which is not expected for these variables.
  • There don't seem to be any missing values in the columns.

Two observations stand out. First, a minimum value of zero for RestingBP and Cholesterol is implausible: a resting blood pressure of zero is incompatible with life, and a serum cholesterol of zero is likewise impossible, since cholesterol is always present in the blood.

Second, questions like this call for domain expertise. For example, we could reach out to a subject matter expert or conduct further research on our own. As per the American Heart Association, serum cholesterol:

"is a composite of different measurements. Your total blood cholesterol is calculated by adding your HDL and LDL cholesterol levels, plus 20% of your triglyceride level."

Although these insights do not directly help us with our prediction model, they do provide us with valuable information regarding the quality of our dataset.

We will now move on to explore the categorical features. We will create visualizations to understand how the data is distributed in relation to our target variable -- HeartDisease. For example, we will look at how many patients had a resting ECG categorized as ST and how many female patients had heart disease.

In [6]:
# Calculate missing values in all columns
heart_df.isnull().sum()
Out[6]:
Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

We'll create a bar chart for each categorical column in the heart_df dataset, including FastingBS and HeartDisease. These charts will display the number of rows for each category in the column, with appropriate axis labels, data labels, and a title to make the visualization clear and informative.

In [7]:
# Quick sanity check: enumerate pairs each element with its index,
# which we'll use below to position each subplot in the grid
for i, col in enumerate([1,2,3,4,5]):
    print(i, ',', col)
0 , 1
1 , 2
2 , 3
3 , 4
4 , 5
In [8]:
# Create a list of categorical columns
cat_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope', 'FastingBS', 'HeartDisease']

# Define number of rows and columns for the subplots
num_rows = 4
num_cols = 2

# Set up the subplots
fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(20,18))

# Loop over each categorical column to create a bar plot
for i, col in enumerate(cat_cols):
    # Calculate the row and column index for the current subplot
    row_idx = i // num_cols
    col_idx = i % num_cols
    
    # Count the number of rows for each category in the column (frequency distribution)
    counts = heart_df[col].value_counts()
    
    # Create a bar plot of the counts in the current subplot
    ax = counts.plot(kind='bar', rot=0, color=cm.Blues(counts/counts.max()), ax=axes[row_idx, col_idx])
    
    # Add labels for the x and y axes
    ax.set_xlabel(col, size=12)
    ax.set_ylabel('Count', size=12)
    
    # Add a title for the plot
    ax.set_title(f'Count of {col}', size=16)
    
    # Add data labels to the bars
    for container in ax.containers:
        ax.bar_label(container, label_type='edge')

# If there are not enough columns to fill in the last row, create a blank plot
if len(cat_cols) % num_cols != 0:
    blank_ax = axes[num_rows-1, num_cols-1]
    blank_ax.axis('off')

# Adjust the spacing between subplots
fig.tight_layout()
plt.show()

Here are some relevant insights that can be gathered from the plot of each categorical column:

  • There are significantly more male patients (725) than female patients (193).
  • Most patients (496) have the type of chest pain known as "ASY" (asymptomatic), followed by "NAP" (non-anginal pain) and "ATA" (atypical angina).
  • The majority of patients (552) have a normal resting electrocardiogram (ECG), while 188 have left ventricular hypertrophy (LVH) and 178 have ST-T wave abnormalities.
  • There are more patients without exercise-induced angina (547) than with it (371).
  • Most patients (460) have a flat ST segment slope during peak exercise, while 395 have an upward sloping ST segment and only 63 have a downward sloping ST segment.
  • The majority of patients (704) have a fasting blood sugar level below 120 mg/dl, while 214 have a level equal to or above 120 mg/dl.
  • There are more patients with heart disease (508) than without (410).

Our next step is to create bar charts for each categorical column except HeartDisease (but including FastingBS). These bar charts will display the number of rows for each category of that column, grouped by HeartDisease.

In [9]:
# Exclude 'HeartDisease' column from 'cat_cols' for grouping
cat_cols_2 = cat_cols[:-1]

# Set up figure and axis objects
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(14,12))

# Loop over categorical columns and create bar chart for each
for i, col in enumerate(cat_cols_2):
    # Calculate the row and column index for the current subplot
    row_idx = i // 2
    col_idx = i % 2
    
    # Group data by HeartDisease and the categorical column
    counts = heart_df.groupby(['HeartDisease', col]).size().reset_index(name='count')
    
    # Create bar chart
    ax = counts.pivot(index=col, columns='HeartDisease', values='count').plot(kind='bar', ax=axes[row_idx, col_idx], rot=0)
    
    # Add labels and title
    ax.set_xlabel(col, size=12)
    ax.set_ylabel('Count', size=12)
    ax.set_title(f'Count of {col} grouped by HeartDisease', size=16)
    
    # Add legend
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles=handles, labels=['No Heart Disease', 'Heart Disease'], loc='upper right')
    
    # Add data labels above each bar
    for p in ax.patches:
        ax.annotate(str(p.get_height()), xy=(p.get_x() + p.get_width() / 2., p.get_height()), xytext=(0, 5), 
                    textcoords='offset points', ha='center', va='center')
        
plt.tight_layout()
plt.show()

Based on the bar plots, we can make the following observations:

  • The number of males with heart disease (458) is much higher than the number of females with heart disease (50).
  • ASY is the most common chest pain type among patients with heart disease, while ATA is the most common chest pain type among patients without heart disease.
  • The majority of patients, both with and without heart disease, have a normal resting electrocardiogram (ECG).
  • Exercise-induced angina is more common among patients with heart disease (316) compared to those without heart disease (55).
  • Flat ST slope is the most common type among patients with heart disease, while up-sloping ST segment is the most common among patients without heart disease.
  • Patients with heart disease are more likely to have fasting blood sugar levels greater than 120 mg/dL compared to those without heart disease.

We will continue our exploration of the data by creating various groupings and visualizations to extract more insights. Two questions that we will focus on are:

  • What percentage of patients with heart disease are male and over 50 years old?
  • What is the median age of patients who were diagnosed with heart disease?

We will begin with the first question: What percentage of patients with heart disease are male and over 50 years old?

In [10]:
# Filter for patients with heart disease
heart_disease_df = heart_df[heart_df['HeartDisease'] == 1]

# Calculate percentage of patients with heart disease who are male and over 50 years old
male_over_50 = ((heart_disease_df['Sex'] == 'M') & (heart_disease_df['Age'] > 50)).sum()
total_heart_disease = len(heart_disease_df)
percent_male_over_50 = male_over_50 / total_heart_disease * 100

# Create pie chart
labels = ['Male and over 50', 'Other']
sizes = [percent_male_over_50, 100 - percent_male_over_50]
explode = (0.1, 0)

fig, ax = plt.subplots()
ax.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', startangle=90)
ax.axis('equal')  # equal aspect ratio ensures that pie is drawn as a circle
ax.set_title('Percentage of Heart Disease Patients Who are Male and Over 50')
plt.show()

The graph above shows that approximately 68% of the patients with heart disease are male and over 50 years old. This is a significant finding as it indicates that there may be a higher risk of developing heart disease in males over 50 years old. This information could be valuable for developing targeted prevention and treatment strategies for this population.

Moving on to the second question: What is the median age of patients who were diagnosed with heart disease?

In [11]:
# Set figure size
plt.figure(figsize=(8, 6))

# Create box plot
bp = plt.boxplot(x=[heart_df[heart_df['HeartDisease'] == 0]['Age'], heart_df[heart_df['HeartDisease'] == 1]['Age']],
                 labels=['No Heart Disease', 'Heart Disease'], patch_artist=True, widths=0.5, showfliers=True,
                 flierprops={'marker':'o', 'markerfacecolor':'black', 'markersize':5})

# Set boxplot colors
colors = ['#1f77b4', '#ff7f0e']
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)

# Set median line color
for median in bp['medians']:
    median.set(color='black', linewidth=2)

# Set axis labels and title
plt.xlabel('Heart Disease Diagnosis')
plt.ylabel('Age')
plt.title('Distribution of Age by Heart Disease Diagnosis')

# Set y-axis range
plt.ylim(20, 90)

# Add horizontal grid lines
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()

Based on the boxplot visualization, we can see that the median age of patients without heart disease is 51 years old, while the median age of patients with heart disease is 57 years old. This suggests that age may be a risk factor for developing heart disease, as the median age of those with heart disease is higher than those without. However, it's important to note that there is overlap in the age distributions of those with and without heart disease, indicating that age is not the only factor influencing the development of heart disease.
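We can corroborate the medians read off the plot directly:

# Median age for each diagnosis group (0: no heart disease, 1: heart disease)
heart_df.groupby('HeartDisease')['Age'].median()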

Data Cleaning¶

In our previous analysis, we calculated the number of missing values in our entire dataset and didn't find any. Additionally, we observed some interesting patterns in our categorical columns:

  • The dataset is heavily biased towards male patients with 725 rows compared to only 193 rows for female patients. This could potentially impact the accuracy of our model when making predictions for female patients.
  • The target variable HeartDisease is relatively evenly distributed, with 410 patients not diagnosed with heart disease and 508 diagnosed with heart disease.
  • A significant number of patients diagnosed with heart disease (392) have asymptomatic chest pain (ASY), which implies that chest pain may not always be a reliable predictor of heart disease.
  • A high number (170) of patients with blood sugar levels greater than 120 mg/dl were diagnosed with heart disease.

Now, before proceeding with feature selection, we need to clean up our data. We noticed that some columns have 0 values that don't make sense. To handle these values, we have a few options:

  • Remove the rows containing these values if their frequency is low.
  • Impute the zeros with the median value of the column or the median value of the column grouped by HeartDisease.
  • Use a more complex approach, such as imputing with the median of the values grouped by age, sex, and/or resting ECG of the patient (sketched below).
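For reference, the third option might look something like the sketch below (a hypothetical grouping by Sex and RestingECG; we do not use this approach here):

# Hypothetical sketch of option 3: impute zero Cholesterol values with the
# median of the non-zero values within each (Sex, RestingECG) group
grouped_median = (heart_df
                  .groupby(['Sex', 'RestingECG'])['Cholesterol']
                  .transform(lambda x: x[x != 0].median()))
cholesterol_v3 = heart_df['Cholesterol'].mask(heart_df['Cholesterol'] == 0, grouped_median)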

While the third option could potentially give a more accurate result, our dataset is relatively small, so we will set it aside for now. Instead, we will count the zero values in our dataset and choose the best approach to handle them based on how frequently they occur.

In [12]:
print(f"Number of rows that have 0 values for RestingBP: {(heart_df['RestingBP'] == 0).sum()}")
print(f"Number of rows that have 0 values for Cholesterol: {(heart_df['Cholesterol'] == 0).sum()}")
Number of rows that have 0 values for RestingBP: 1
Number of rows that have 0 values for Cholesterol: 172

Since our dataset only contains 918 rows, dropping the 172 rows with zero Cholesterol values is not ideal, as we would lose a significant amount of data and be left with only 746 rows. Therefore, we will opt for the second approach: imputing zeros with the median value of the column grouped by HeartDisease.

Note: This approach will only work if there are no missing values in the HeartDisease column.

In [13]:
# Create a copy of the original dataframe
heart_clean_df = heart_df.copy()

# Impute 0 values in RestingBP with median value of the column grouped by HeartDisease
heart_clean_df['RestingBP'] = heart_clean_df.groupby('HeartDisease', group_keys=False)['RestingBP'].apply(lambda x: x.replace(0, x.median()))

# Impute 0 values in Cholesterol with median value of the column grouped by HeartDisease
heart_clean_df['Cholesterol'] = heart_clean_df.groupby('HeartDisease', group_keys=False)['Cholesterol'].apply(lambda x: x.replace(0, x.median()))

Let's verify the number of rows with 0 values in the RestingBP and Cholesterol columns after the imputation process.

In [14]:
heart_clean_df[["Cholesterol", "RestingBP"]].describe()
Out[14]:
Cholesterol RestingBP
count 918.000000 918.000000
mean 239.675381 132.540305
std 54.328249 17.989941
min 85.000000 80.000000
25% 214.000000 120.000000
50% 225.000000 130.000000
75% 267.000000 140.000000
max 603.000000 200.000000

The minimum values for both columns have been updated and there are no more zero values present in either of them.
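For a more direct confirmation, we can count the zeros explicitly (this should return 0 for both columns):

# Count remaining zero values after imputation
(heart_clean_df[['RestingBP', 'Cholesterol']] == 0).sum()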

Feature Selection¶

After cleaning our data, we can now move forward with feature selection for our model. Based on our EDA and a general understanding of the features, we have identified the following features as a starting point:

  • Age
  • Sex
  • ChestPainType
  • Cholesterol
  • FastingBS

Please note: The above list of features is not exhaustive. We may also want to explore and select different features for the model, providing reasoning for those choices.

We'll also explore how the columns correlate to one another. Before we attempt that, we'll convert our categorical columns into dummy variables.

In [15]:
# Convert categorical variable into dummy variables
heart_clean_df = pd.get_dummies(heart_clean_df, drop_first=True)

# View results
heart_clean_df.head()
Out[15]:
Age RestingBP Cholesterol FastingBS MaxHR Oldpeak HeartDisease Sex_M ChestPainType_ATA ChestPainType_NAP ChestPainType_TA RestingECG_Normal RestingECG_ST ExerciseAngina_Y ST_Slope_Flat ST_Slope_Up
0 40 140 289 0 172 0.0 0 1 1 0 0 1 0 0 0 1
1 49 160 180 0 156 1.0 1 0 0 1 0 1 0 0 1 0
2 37 130 283 0 98 0.0 0 1 1 0 0 0 1 0 0 1
3 48 138 214 0 108 1.5 1 0 0 0 0 1 0 1 1 0
4 54 150 195 0 122 0.0 0 1 0 1 0 1 0 0 0 1
In [16]:
print(f'Number of columns with dummy variables: {heart_clean_df.shape[1]}')
Number of columns with dummy variables: 16

We can observe that the number of columns has increased from 12 to 16 after converting categorical features into dummy variables. With this expanded dataset, we can now create a Pearson's correlation heat map to explore how the columns correlate with one another.

Pearson correlation coefficient¶
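Recall that Pearson's correlation coefficient between two variables $x$ and $y$ is defined as

$$ r_{xy} = \frac{\sum_{i=1}^{n}(x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_{i=1}^{n}(x_i - \bar{x})^2}\,\sqrt{\sum_{i=1}^{n}(y_i - \bar{y})^2}} $$

It ranges from -1 to 1. We take absolute values below so that strong negative relationships rank alongside strong positive ones.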

In [17]:
# Calculate Pearson's correlation matrix
corr_mat = abs(heart_clean_df.corr())

# Create heatmap
fig, ax = plt.subplots(figsize=(12,8))
sns.heatmap(corr_mat, annot=True, cmap='Blues')
plt.title("Pearson's Correlation Heatmap for Heart Disease Data", fontsize=16)
plt.show()

Let's determine which features are moderately correlated to HeartDisease by analyzing the heat map.

In [18]:
# Create a heatmap of the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(corr_mat[(corr_mat > 0.3) & (corr_mat < 1)], annot=True, cmap='Blues')
plt.title('Features Moderately Correlated with HeartDisease', size=16)
plt.show()

Square of the Pearson correlation coefficient¶

In [19]:
# Select top 7 features using square of the Pearson correlation
top_7_features = (corr_mat['HeartDisease'] ** 2).sort_values(ascending=False)[1:8].index.tolist()

# Create bar plot
plt.figure(figsize=(10, 5))
plt.bar(x=top_7_features, height=corr_mat.loc[top_7_features, 'HeartDisease'])
plt.xticks(rotation=45)
plt.title('Top 7 Correlated Features to Heart Disease: Bar Plot', fontsize=16)
plt.ylabel('Absolute Pearson Correlation Coefficient')
plt.xlabel('Features')
plt.show()

After analyzing the correlation heat map, we have selected the following features, most of which have an absolute correlation coefficient greater than 0.3 with HeartDisease (recall that we took absolute values, so strong negative correlations qualify as well):

  • Oldpeak
    • It's worth noting that we could also select MaxHR or ChestPainType_ATA as additional features to consider.
  • Sex_M
    • Although it has a relatively low correlation coefficient, we have decided to take it into account based on our EDA observations.
  • ExerciseAngina_Y
  • ST_Slope_Flat
  • ST_Slope_Up

Interestingly, Cholesterol is not strongly correlated to HeartDisease, so we can consider ignoring it for now.

Based on our findings, we can narrow down the features that may be relevant and proceed with creating and training multiple models using one feature at a time.

Single-Feature Classifier Building¶

We will begin by creating a separate model for each of the selected features and evaluating their performance using accuracy as a metric. In order to determine the optimal split percentage, we will experiment with different values of test_size. First, we need to split the data into training and validation sets.

In [20]:
# Split data into features 'X' and target variable 'y'
X = heart_clean_df.drop('HeartDisease', axis=1)
y = heart_clean_df['HeartDisease']

# Create list of selected features
sel_features = [
                'Oldpeak',
                'Sex_M',
                'ExerciseAngina_Y',
                'ST_Slope_Flat',
                'ST_Slope_Up'
]

# Create list of test sizes to experiment with
test_sizes = [0.10, 0.15, 0.20, 0.25]

# Create dictionary of dictionaries to store results
results = {}

# Loop over test sizes
for test_size in test_sizes:
    # Split data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                      test_size=test_size,
                                                      random_state=42)
    
    # Create dictionary to store results for current test size
    test_results = {}
    
    # Loop over selected features
    for feature in sel_features:
        # Fit knn classifier with current feature set
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(X_train[[feature]], y_train)
        
        # Evaluate accuracy on validation set
        accuracy = knn.score(X_val[[feature]], y_val)
        
        # Store accuracy in test_results dictionary
        test_results[feature] = round(accuracy, 4)
        
    # Store results from current test size in main dictionary
    results[test_size] = test_results

To gain a better understanding of our findings, we can create a DataFrame from the results dictionary. This will allow us to easily view the accuracy scores for each feature set and test size value combination.

In [21]:
# Create list to store dictionaries
results_list = []

# Loop over test size values
for size, features in results.items():
    # Loop over features for current test size
    for feature, accuracy in features.items():
        # Create dictionary with test size, feature and accuracy keys
        result_dict = {'size': size, 'feature': feature, 'accuracy': accuracy}
        # Append to results list
        results_list.append(result_dict)

# Convert list of dictionaries to DataFrame
test_size_df = pd.DataFrame(results_list)
test_size_df
Out[21]:
size feature accuracy
0 0.10 Oldpeak 0.6848
1 0.10 Sex_M 0.4130
2 0.10 ExerciseAngina_Y 0.6630
3 0.10 ST_Slope_Flat 0.5870
4 0.10 ST_Slope_Up 0.7935
5 0.15 Oldpeak 0.7029
6 0.15 Sex_M 0.4058
7 0.15 ExerciseAngina_Y 0.6522
8 0.15 ST_Slope_Flat 0.7319
9 0.15 ST_Slope_Up 0.7899
10 0.20 Oldpeak 0.6467
11 0.20 Sex_M 0.6413
12 0.20 ExerciseAngina_Y 0.6576
13 0.20 ST_Slope_Flat 0.7500
14 0.20 ST_Slope_Up 0.7989
15 0.25 Oldpeak 0.6174
16 0.25 Sex_M 0.6565
17 0.25 ExerciseAngina_Y 0.5739
18 0.25 ST_Slope_Flat 0.7435
19 0.25 ST_Slope_Up 0.8000

Based on the results, we can draw the following conclusions:

  • The selected features have different impacts on the accuracy of the KNN classifier. ST_Slope_Up consistently shows the highest accuracy across all test sizes, while Sex_M consistently shows the lowest accuracy.
  • The optimal test size depends on the selected features. For example, when using the Oldpeak feature, the optimal test size is 0.15, while when using the ST_Slope_Up feature, the optimal test size is 0.25.
  • Overall, the accuracy of the KNN classifier is relatively low, ranging from 0.413 to 0.8. This suggests that the selected features may not be sufficient to accurately predict heart disease, and other features or more complex models may be necessary.
  • It is important to note that the results obtained may be sensitive to the number of neighbors (n_neighbors) used in the KNN classifier. Therefore, to obtain reliable results, it is recommended to run the analysis multiple times with different values of n_neighbors and compare the results.
  • Based on the given results, a test size of 0.15 or 0.2 may be a good choice to balance the trade-off between variance and bias.

We have decided to proceed with a test size of 0.15 for our experiments. Our next step will be to explore different values of n_neighbors in order to determine the optimal value for our model.

In [22]:
# Split data into training (85%) and validation (15%)
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.15,
                                                  random_state=42)

# List of k values (n_neighbors)
k_values = [1, 3, 5, 7, 10]

# Create dictionary of dictionaries to store results
results = {}

# Loop over k values
for value in k_values:
    
    # Create dictionary to store results for current k value
    test_results = {}
    
    # Loop over selected features
    for feature in sel_features:
        # Fit knn classifier with current feature set
        knn = KNeighborsClassifier(n_neighbors=value)
        knn.fit(X_train[[feature]], y_train)
        
        # Evaluate accuracy on validation set
        accuracy = knn.score(X_val[[feature]], y_val)
        
        # Store accuracy in test_results dictionary
        test_results[feature] = round(accuracy, 4)
        
    # Store results from current k value in main dictionary
    results[value] = test_results
In [23]:
# Create list to store dictionaries
results_list = []

# Loop over k values
for k, features in results.items():
    # Loop over features for current k value
    for feature, accuracy in features.items():
        # Create dictionary with k, feature and accuracy keys
        result_dict = {'n_neighbor': k, 'feature': feature, 'accuracy': accuracy}
        # Append to results list
        results_list.append(result_dict)

# Convert list of dictionaries to DataFrame
k_values_df = pd.DataFrame(results_list)
k_values_df
Out[23]:
n_neighbor feature accuracy
0 1 Oldpeak 0.6594
1 1 Sex_M 0.6522
2 1 ExerciseAngina_Y 0.6522
3 1 ST_Slope_Flat 0.7319
4 1 ST_Slope_Up 0.7899
5 3 Oldpeak 0.7029
6 3 Sex_M 0.4058
7 3 ExerciseAngina_Y 0.6522
8 3 ST_Slope_Flat 0.7319
9 3 ST_Slope_Up 0.7899
10 5 Oldpeak 0.7101
11 5 Sex_M 0.6522
12 5 ExerciseAngina_Y 0.6522
13 5 ST_Slope_Flat 0.7319
14 5 ST_Slope_Up 0.7899
15 7 Oldpeak 0.7029
16 7 Sex_M 0.6522
17 7 ExerciseAngina_Y 0.6522
18 7 ST_Slope_Flat 0.7319
19 7 ST_Slope_Up 0.7899
20 10 Oldpeak 0.6957
21 10 Sex_M 0.6522
22 10 ExerciseAngina_Y 0.6522
23 10 ST_Slope_Flat 0.7319
24 10 ST_Slope_Up 0.7899

Based on our experiments, the accuracy of the k-Nearest Neighbors model varies only slightly with n_neighbors; as the table shows, most features yield identical scores across the tested values. Since increasing n_neighbors does not significantly improve performance, we will keep the default value of 5 for n_neighbors.

Multi-Feature Classifier Building¶

In our previous experiment, we got the highest accuracy with ST_Slope_Up as our feature. But what if we trained a model on all of the selected features together?

Let's combine all of these features and train a model on them to see if we can achieve a higher accuracy than before. Before doing so, however, we need to normalize our data, because the features are not yet on the same scale. We will scale the selected features, and only those, to the range (0, 1).
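Min-max scaling maps each feature to [0, 1] column by column using x' = (x - x_min) / (x_max - x_min). A minimal sketch of what MinMaxScaler computes for a single feature:

import numpy as np

x = np.array([2.0, 5.0, 8.0])

# Equivalent to MinMaxScaler for one column: (x - min) / (max - min)
x_scaled = (x - x.min()) / (x.max() - x.min())
print(x_scaled)  # [0.  0.5 1. ]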

After conducting two experiments, we have decided to select the following hyperparameters for our k-NN model:

  • test_size = 0.15
  • n_neighbors = 5 (default value)
In [24]:
# Split data for training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X[sel_features], y,
                                                  test_size=0.15,
                                                  random_state=42)


# Instantiate scaler
scaler = MinMaxScaler()

# Fit and transform selected features in the training set
X_train_scaled = scaler.fit_transform(X_train)

# Transform same features in the validation set
X_val_scaled = scaler.transform(X_val)

# Fit the model on scaled features (using default n_neighbors value)
knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)

# Evaluate the model on scaled features
accuracy = knn.score(X_val_scaled, y_val)

print(f'Model accuracy: {accuracy*100:.2f}%')
Model accuracy: 75.36%

In the previous experiment, an accuracy of 78.99% was achieved with the feature ST_Slope_Up using a 15% test size split. However, when all selected features were scaled and used together to train the model, the accuracy decreased to 75.36%. This suggests that the selected features have varying levels of importance and may not contribute equally to the performance of the model.

Hyperparameter Tuning¶

We can now experiment with the hyperparameters of our model to improve its performance. To do so, we'll utilize GridSearchCV, which searches over a grid of parameter values for a scikit-learn estimator. For the KNeighborsClassifier, we can experiment with the following hyperparameters, among others:

  • n_neighbors
  • weights
  • metric

It's worth noting that GridSearchCV performs cross-validation internally, creating its own validation folds from the training data, which means we'll have to update our workflow accordingly.

In [25]:
# Split data into training (85%) and test (15%)
X_train, X_test, y_train, y_test = train_test_split(X[sel_features], y,
                                                    test_size=0.15,
                                                    random_state=42)

# List of selected features
sel_features = [
                'Oldpeak',
                'Sex_M',
                'ExerciseAngina_Y',
                'ST_Slope_Flat',
                'ST_Slope_Up'
]

# Instantiate scaler
scaler = MinMaxScaler()

# Scale training set
X_train_scaled = scaler.fit_transform(X_train)

We will conduct a grid search to explore the following hyperparameter values:

  • For n_neighbors, we will search in the range of 1 to 20.
  • We will try both uniform and distance for weights.
  • Two distance metrics will be experimented with: minkowski (default) and manhattan.

Let's see how these hyperparameters can improve the performance of our model.

In [26]:
# Define the parameter grid for GridSearchCV
params = {'n_neighbors': list(range(1,21)),
          'weights': ['uniform', 'distance'],
          'metric': ['minkowski', 'manhattan']}

# Instantiate model
knn = KNeighborsClassifier()

# Create GridSearchCV instance
knn_grid = GridSearchCV(estimator=knn, param_grid=params, scoring='accuracy')

# Fit the instance (knn_grid)
knn_grid.fit(X_train_scaled, y_train)
Out[26]:
GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'metric': ['minkowski', 'manhattan'],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20],
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

Let's retrieve the best accuracy and the corresponding parameters.

In [27]:
# Get best score and best parameters
best_score = knn_grid.best_score_
best_params = knn_grid.best_params_

print(f'Best score: {best_score*100:0.2f}%')
print(f'Best parameters: {best_params}')
Best score: 85.64%
Best parameters: {'metric': 'minkowski', 'n_neighbors': 8, 'weights': 'uniform'}

The GridSearchCV results indicate that the best KNN classifier achieved a cross-validation accuracy of 85.64% with the Minkowski distance metric, 8 neighbors, and uniform weighting. This is a marked improvement over our earlier experiments and suggests that, with tuned hyperparameters, the KNN classifier is a promising approach for this task.

Model Evaluation on Test Set¶

Next, we will evaluate our model's performance on the test set. Remember, we must normalize the test set with the same scaler that was fit on the training data; this time we only transform the test set, without refitting the scaler.

Conveniently, scikit-learn exposes the model found by GridSearchCV through the best_estimator_ attribute, which we can use directly to make predictions on the test set.

In [28]:
# Scale same features from the test set
X_test_scaled = scaler.transform(X_test)

# Get the best estimator to make predictions
best_model = knn_grid.best_estimator_

# Make predictions on test set
predictions = best_model.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f'{accuracy*100:.2f}%')
74.64%

After evaluating the KNN classifier on the held-out test set, the accuracy dropped to 74.64%. This suggests that the model may have overfit the training set and that the cross-validation score overestimated its real-world performance. Even though we scaled the test set with the same scaler used for training and predicted with the best estimator from GridSearchCV, the lower accuracy indicates that further optimization, or a different modeling approach, may be necessary for this classification task.

Random State Splits¶

random_state is a parameter in train_test_split that sets a specific seed for the random number generator. This allows for reproducible results when generating random data splits. To optimize our model's performance, we will use GridSearchCV with various random_state values to determine the hyperparameters that maximize test set accuracy.
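As a quick illustration of this reproducibility (a minimal sketch with toy data):

import numpy as np
from sklearn.model_selection import train_test_split

data = np.arange(10)

# The same seed always produces the same partition
a_train, a_test = train_test_split(data, test_size=0.3, random_state=42)
b_train, b_test = train_test_split(data, test_size=0.3, random_state=42)
print((a_train == b_train).all() and (a_test == b_test).all())  # True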

There is no universally agreed-upon value for random_state; different seeds simply produce different splits of the data. In our experiment, we will try a few other commonly used values for random_state, namely 0, 415, 417, 419, and 2023, since we have already tried 42.

We will use the same setup as before for the data split and for GridSearchCV:

  • Training set (85%) and Test set (15%)
  • n_neighbors: 1 to 20
  • weights: 'uniform' and 'distance'
  • metric: 'minkowski' and 'manhattan'

Note: The X_test and y_test variables are not used in this experiment, since they are not needed for finding the optimal hyperparameters on the training set.

In [29]:
# Create an empty dictionary to store the results
results = {}

# Split the data into training and testing sets using different random_state values
random_states = [0, 415, 417, 419, 2023]

# Loop over random_states values
for state in random_states:
    X_train, _, y_train, _ = train_test_split(X[sel_features], y, test_size=0.15, random_state=state)
    
    # Instantiate scaler
    scaler = MinMaxScaler()
    
    # Rescale training features
    X_train_scaled = scaler.fit_transform(X_train)
    
    # Create a KNN classifier
    knn = KNeighborsClassifier()
    
    # Define the parameter grid for GridSearchCV
    params = {'n_neighbors': list(range(1, 21)), 'weights': ['uniform', 'distance'], 'metric': ['minkowski', 'manhattan']}
    
    # Create GridSearchCV object with different parameters
    knn_grid = GridSearchCV(estimator=knn, param_grid=params, scoring='accuracy')
    
    # Fit the model to the training data
    knn_grid.fit(X_train_scaled, y_train)
    
    # Get best score and best parameters
    best_score = knn_grid.best_score_
    best_params = knn_grid.best_params_
    
    # Store the results in the dictionary
    results[f'random_state_{state}'] = {'best_score': round(best_score*100,2), 'best_params': best_params}
In [30]:
# Convert results dictionary to a dataframe
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df
Out[30]:
best_score best_params
random_state_0 85.38 {'metric': 'minkowski', 'n_neighbors': 14, 'weights': 'uniform'}
random_state_415 83.72 {'metric': 'minkowski', 'n_neighbors': 9, 'weights': 'uniform'}
random_state_417 84.62 {'metric': 'minkowski', 'n_neighbors': 12, 'weights': 'uniform'}
random_state_419 83.72 {'metric': 'minkowski', 'n_neighbors': 14, 'weights': 'uniform'}
random_state_2023 83.46 {'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'uniform'}

Based on the experiment results, we can see that the highest accuracy achieved on the training set was 85.38%, which was obtained using the following hyperparameters: {'metric': 'minkowski', 'n_neighbors': 14, 'weights': 'uniform'} with a random state of 0.

Build Model with Optimal Hyperparameters¶

To evaluate the performance of the model with the optimal hyperparameters on the test set, we'll follow these steps:

  • Split the data into training (85%) and testing sets (15%) using the optimal random state value of 0.
  • Instantiate a scaler object, and use it to rescale the training and testing features.
  • Create a KNN classifier object using the optimal hyperparameters found during the experiment.
  • Fit the KNN classifier to the training data.
  • Evaluate the performance of the KNN classifier on the testing data by calculating the accuracy score.
In [31]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X[sel_features], y,
                                                    test_size=0.15,
                                                    random_state=0)

# Instantiate MinMax scaler
scaler = MinMaxScaler()

# Rescale training and testing features
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build model
knn = KNeighborsClassifier(n_neighbors=14, weights='uniform', metric='minkowski')

# Fit the model with training set
knn.fit(X_train_scaled, y_train)

# Make model predictions on test data
predictions = knn.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f'Model accuracy on test set: {accuracy*100:.2f}%')
Model accuracy on test set: 81.88%

The model achieves an accuracy of 81.88% on a previously unseen test set, meaning it correctly classifies 81.88% of the test instances. While this is slightly lower than the 85.38% cross-validation score on the training set, it is still a reasonably good result, and the model appears to generalize well to new data. That said, performance may vary with the specific test split used, so evaluating the model on multiple splits would give a more robust estimate of its performance.

Next Step¶

Our final model was developed by training it on a selected set of features that include:

  • Oldpeak
  • Sex_M
  • ExerciseAngina_Y
  • ST_Slope_Flat
  • ST_Slope_Up

The model achieved a test set accuracy of 81.88%. However, it's worth noting that the accuracy achieved might not be indicative of a well-performing model given the limitations of the data.

Moving forward, there are several steps we can take to improve our results, for example:

  • We can explore and use different features to train the model.
    • One potential approach is to use all features in the dataset and see how the model performs. This can provide insights into whether certain features are more important in predicting heart disease than others; a sketch of that experiment follows below.
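Here is a minimal sketch of that follow-up, reusing the split, scaling, and hyperparameters from our final model (the resulting accuracy is unknown until the cell is run):

# Hypothetical follow-up: train on every feature instead of the selected five
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.15,
                                                    random_state=0)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn_all = KNeighborsClassifier(n_neighbors=14, weights='uniform', metric='minkowski')
knn_all.fit(X_train_scaled, y_train)

print(f'All-features accuracy: {knn_all.score(X_test_scaled, y_test)*100:.2f}%')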

Conclusion¶

Based on the insights gained from this project, we have developed a predictive model for heart disease detection using the provided dataset. Our EDA process helped us identify and address data quality issues, such as the imputation of zero values. After selecting relevant features through correlation analysis, we experimented with different configurations of a KNN classifier and ultimately achieved a test accuracy of 81.88%.

This project holds potential in developing a predictive model for heart disease detection using the provided dataset. However, it is important to note some limitations and potential drawbacks before implementing this model in a real-world healthcare setting.

Pros of using this model in a real-world healthcare setting:

  • Early identification of patients at risk of heart disease could lead to early intervention and prevention of heart disease.
  • Automated detection of heart disease could lead to more efficient use of healthcare resources and improved patient outcomes.
  • Machine learning models can analyze large amounts of data quickly, providing healthcare professionals with valuable insights into patient risk factors.

Cons of using this model in a real-world healthcare setting:

  • The model is based on retrospective data, which may not accurately reflect the current population or demographic changes.
  • The model's accuracy may be affected by differences in data collection across different hospitals and healthcare systems.
  • The model's performance may degrade over time as patient populations and risk factors change.
  • There may be ethical and legal considerations related to the use of machine learning models in healthcare decision-making.

In conclusion, our predictive model has shown promise in identifying patients at risk of heart disease. However, it is important to consider its limitations and potential drawbacks before implementing it in a real-world healthcare setting. Continuous validation and monitoring will be necessary to ensure its continued accuracy and usefulness.