import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline


df = pd.read_csv('crowdness_gym_data.csv')


df.head()


print('Data contains', df.shape[0], 'rows and', df.shape[1], 'columns')

Data contains 62184 rows and 11 columns


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62184 entries, 0 to 62183
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   number_people         62184 non-null  int64  
 1   date                  62184 non-null  object 
 2   timestamp             62184 non-null  int64  
 3   day_of_week           62184 non-null  int64  
 4   is_weekend            62184 non-null  int64  
 5   is_holiday            62184 non-null  int64  
 6   temperature           62184 non-null  float64
 7   is_start_of_semester  62184 non-null  int64  
 8   is_during_semester    62184 non-null  int64  
 9   month                 62184 non-null  int64  
 10  hour                  62184 non-null  int64  
dtypes: float64(1), int64(9), object(1)
memory usage: 5.2+ MB


df.describe()


df.isnull().sum()

number_people           0
date                    0
timestamp               0
day_of_week             0
is_weekend              0
is_holiday              0
temperature             0
is_start_of_semester    0
is_during_semester      0
month                   0
hour                    0
dtype: int64


df = df.drop('date', axis=1)
df.head()


fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3,figsize=(18,6))

ax1.hist(df['month'])
ax1.set_title("Observations per Month of the Year")
ax1.set_xlabel('Month')
ax1.set_ylabel('No. of Observavtions')

sns.histplot(df['day_of_week'] + 1, color='g', ax=ax2)
ax2.set_title("Observations per Day of the Week")
ax2.set_xlabel('Day of the Week')
ax2.set_ylabel('No. of Observavtions')

sns.histplot(df['hour'], color='r', ax=ax3)
ax3.set_title("Observations per Hour of the Day")
ax3.set_xlabel('Hour')
ax3.set_ylabel('No. of Observavtions')

plt.show()


fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))

ax1.scatter(df['month'], df['number_people'])
ax1.set_title("Number of People VS Month")
ax1.set_xlabel('Month')
ax1.set_ylabel('Number of People')

ax2.scatter(df['day_of_week'], df['number_people'])
ax2.set_title("Number of People VS Day of the Week")
ax2.set_xlabel('Day of the Week')
ax2.set_ylabel('Number of People')

ax3.scatter(df['hour'], df['number_people'])
ax3.set_title("Number of People VS Hour")
ax3.set_xlabel('Hour')
ax3.set_ylabel('Number of People')

plt.show()


df.corr(numeric_only=True)


sns.pairplot(df)

<seaborn.axisgrid.PairGrid at 0x1ff254d3290>


plt.figure(figsize=(6, 6))
sns.heatmap(df.corr(numeric_only=True), cmap='coolwarm')

<Axes: >


df = df.drop('timestamp', axis=1)
df.head()


from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor


data = df.values
X= data[:,1:]
y= data[:,0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


print(f"Training features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")
print(f"Training label shape: {y_train.shape}")
print(f"Testing label shape: {y_test.shape}")

Training features shape: (43528, 8)
Testing features shape: (18656, 8)
Training label shape: (43528,)
Testing label shape: (18656,)


sgd_v1 = SGDRegressor(alpha=1e-4, learning_rate='optimal', penalty='elasticnet',
                      loss='huber', random_state=52)


sgd_v1.fit(X_train,y_train)

SGDRegressor(learning_rate='optimal', loss='huber', penalty='elasticnet',
             random_state=52)

SGDRegressor(learning_rate='optimal', loss='huber', penalty='elasticnet',
             random_state=52)


y_pred_v1 = sgd_v1.predict(X_test)  # Predict labels


# Let's evaluate the performance of the model.

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# The mean squared error
print(f"Mean squared error: {round( mean_squared_error(y_test, y_pred_v1),3)}")
# Explained variance score: 1 is perfect prediction
print(f"R2 score: {round(r2_score(y_test, y_pred_v1),3)}")
# Mean Absolute Error
print(f"Mean absolute error: { round(mean_absolute_error(y_test, y_pred_v1),3)}")

Mean squared error: 254.545
R2 score: 0.506
Mean absolute error: 12.135


from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


sgd_v2 = SGDRegressor(alpha=0.0001, learning_rate='optimal', loss='huber', 
    penalty='elasticnet', random_state = 52)


sgd_v2.fit(X_train_scaled, y_train)

SGDRegressor(learning_rate='optimal', loss='huber', penalty='elasticnet',
             random_state=52)

SGDRegressor(learning_rate='optimal', loss='huber', penalty='elasticnet',
             random_state=52)


y_pred_v2 = sgd_v2.predict(X_test_scaled)  # Predict labels


# The mean squared error
print(f"Mean squared error: {round( mean_squared_error(y_test, y_pred_v2),3)}")
# Explained variance score: 1 is perfect prediction
print(f"R2 score: {round(r2_score(y_test, y_pred_v2),3)}")
# Mean Absolute Error
print(f"Mean absolute error: { round(mean_absolute_error(y_test, y_pred_v2),3)}")

Mean squared error: 254.325
R2 score: 0.507
Mean absolute error: 12.049


plt.figure(figsize=(15, 15))

x_ax = range(len(y_test))
plt.plot(x_ax, y_test, linewidth=1, label="original")
plt.plot(x_ax, y_pred_v1, linewidth=1.1, label="predicted")
plt.title("y-test and y-predicted data Model 1")
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.legend(loc='best',fancybox=True, shadow=True)
plt.grid(True)
plt.show()


# Model v2

plt.figure(figsize=(15, 15))

x_ax = range(len(y_test))
plt.plot(x_ax, y_test, linewidth=1, label="original")
plt.plot(x_ax, y_pred_v2, linewidth=1.1, label="predicted")
plt.title("y-test and y-predicted data Model 2")
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.legend(loc='best',fancybox=True, shadow=True)
plt.grid(True)
plt.show()

	number_people	date	timestamp	day_of_week	temperature	month	hour
0	37	2015-08-14 17:00:11-07:00	61211	4	71.76	8	17
1	45	2015-08-14 17:20:14-07:00	62414	4	71.76	8	17
2	40	2015-08-14 17:30:15-07:00	63015	4	71.76	8	17
3	44	2015-08-14 17:40:16-07:00	63616	4	71.76	8	17
4	45	2015-08-14 17:50:17-07:00	64217	4	71.76	8	17

	number_people	timestamp	day_of_week	is_weekend	is_holiday	temperature	is_start_of_semester	is_during_semester	month	hour
count	62184.000000	62184.000000	62184.000000	62184.000000	62184.000000	62184.000000	62184.000000	62184.000000	62184.000000	62184.000000
mean	29.072543	45799.437958	2.982504	0.282870	0.002573	58.557108	0.078831	0.660218	7.439824	12.236460
std	22.689026	24211.275891	1.996825	0.450398	0.050660	6.316396	0.269476	0.473639	3.445069	6.717631
min	0.000000	0.000000	0.000000	0.000000	0.000000	38.140000	0.000000	0.000000	1.000000	0.000000
25%	9.000000	26624.000000	1.000000	0.000000	0.000000	55.000000	0.000000	0.000000	5.000000	7.000000
50%	28.000000	46522.500000	3.000000	0.000000	0.000000	58.340000	0.000000	1.000000	8.000000	12.000000
75%	43.000000	66612.000000	5.000000	1.000000	0.000000	62.280000	0.000000	1.000000	10.000000	18.000000
max	145.000000	86399.000000	6.000000	1.000000	1.000000	87.170000	1.000000	1.000000	12.000000	23.000000

	number_people	timestamp	day_of_week	temperature	month	hour
0	37	61211	4	71.76	8	17
1	45	62414	4	71.76	8	17
2	40	63015	4	71.76	8	17
3	44	63616	4	71.76	8	17
4	45	64217	4	71.76	8	17

	number_people	timestamp	day_of_week	is_weekend	is_holiday	temperature	is_start_of_semester	is_during_semester	month	hour
number_people	1.000000	0.550218	-0.162062	-0.173958	-0.048249	0.373327	0.182683	0.335350	-0.097854	0.552049
timestamp	0.550218	1.000000	-0.001793	-0.000509	0.002851	0.184849	0.009551	0.044676	-0.023221	0.999077
day_of_week	-0.162062	-0.001793	1.000000	0.791338	-0.075862	0.011169	-0.011782	-0.004824	0.015559	-0.001914
is_weekend	-0.173958	-0.000509	0.791338	1.000000	-0.031899	0.020673	-0.016646	-0.036127	0.008462	-0.000517
is_holiday	-0.048249	0.002851	-0.075862	-0.031899	1.000000	-0.088527	-0.014858	-0.070798	-0.094942	0.002843
temperature	0.373327	0.184849	0.011169	0.020673	-0.088527	1.000000	0.093242	0.152476	0.063125	0.185121
is_start_of_semester	0.182683	0.009551	-0.011782	-0.016646	-0.014858	0.093242	1.000000	0.209862	-0.137160	0.010091
is_during_semester	0.335350	0.044676	-0.004824	-0.036127	-0.070798	0.152476	0.209862	1.000000	0.096556	0.045581
month	-0.097854	-0.023221	0.015559	0.008462	-0.094942	0.063125	-0.137160	0.096556	1.000000	-0.023624
hour	0.552049	0.999077	-0.001914	-0.000517	0.002843	0.185121	0.010091	0.045581	-0.023624	1.000000

	number_people	day_of_week	temperature	month	hour
0	37	4	71.76	8	17
1	45	4	71.76	8	17
2	40	4	71.76	8	17
3	44	4	71.76	8	17
4	45	4	71.76	8	17

Stochastic Gradient Descent on Linear Regression: Crowdedness in the Gym¶

Import Libraries and Load the Data¶

EDA and Cleaning up the Data¶

Plots¶

Getting ready to build our model with Stochastic Gradient Descent¶

Measure the Performance of the Mode¶

Visualizing the Results¶

Summarize your Results¶