import pandas as pd

# Load the I-94 traffic dataset from a CSV file in the working directory.
i_94 = pd.read_csv('Metro_Interstate_Traffic_Volume.csv')
# Preview the first rows of the frame.
i_94.head()


# Preview the last rows of the frame.
i_94.tail()


# Summary statistics for the numeric columns.
i_94.describe()


# Column names, dtypes, non-null counts and memory usage.
i_94.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48204 entries, 0 to 48203
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   holiday              48204 non-null  object 
 1   temp                 48204 non-null  float64
 2   rain_1h              48204 non-null  float64
 3   snow_1h              48204 non-null  float64
 4   clouds_all           48204 non-null  int64  
 5   weather_main         48204 non-null  object 
 6   weather_description  48204 non-null  object 
 7   date_time            48204 non-null  object 
 8   traffic_volume       48204 non-null  int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 3.3+ MB


import matplotlib.pyplot as plt
%matplotlib inline

# Histogram of hourly traffic volume across the full dataset.
i_94['traffic_volume'].plot(kind='hist')
plt.show()


# Numeric summary of the same column.
i_94['traffic_volume'].describe()

count    48204.000000
mean      3259.818355
std       1986.860670
min          0.000000
25%       1193.000000
50%       3380.000000
75%       4933.000000
max       7280.000000
Name: traffic_volume, dtype: float64


# Parse the timestamp strings so the .dt accessor can be used below.
i_94['date_time'] = pd.to_datetime(i_94['date_time'])

# Daytime covers hours 7-18, nighttime covers hours 19-23 and 0-6.
# Filter first and copy the result: copying the whole frame before applying
# the mask duplicates every row only to throw about half of them away, while
# .copy() on the filtered rows still yields an independent frame that can be
# given new columns later.
day = i_94[(i_94['date_time'].dt.hour >= 7) & (i_94['date_time'].dt.hour < 19)].copy()
print(day.shape)
night = i_94[(i_94['date_time'].dt.hour >= 19) | (i_94['date_time'].dt.hour < 7)].copy()
print(night.shape)

(23877, 9)
(24327, 9)


# Side-by-side histograms of traffic volume for the day and night subsets.
# Both panels share identical axis limits so the two distributions can be
# compared directly.
plt.figure(figsize=(11, 3.5))

# Left panel: daytime rows.
plt.subplot(1, 2, 1)
plt.hist(day['traffic_volume'])
plt.xlim(-100, 7500)
plt.ylim(0, 8000)
plt.title('Traffic Volume: Day')
plt.xlabel('Traffic Volume')
plt.ylabel('Frequency')

# Right panel: nighttime rows.
plt.subplot(1, 2, 2)
plt.hist(night['traffic_volume'])
plt.xlim(-100, 7500)
plt.ylim(0, 8000)
plt.title('Traffic Volume: Night')
plt.xlabel('Traffic Volume')
plt.ylabel('Frequency')

plt.show()


# Summary statistics of hourly traffic volume for the daytime subset.
day['traffic_volume'].describe()

count    23877.000000
mean      4762.047452
std       1174.546482
min          0.000000
25%       4252.000000
50%       4820.000000
75%       5559.000000
max       7280.000000
Name: traffic_volume, dtype: float64


# Summary statistics of hourly traffic volume for the nighttime subset.
night['traffic_volume'].describe()

count    24327.000000
mean      1785.377441
std       1441.951197
min          0.000000
25%        530.000000
50%       1287.000000
75%       2819.000000
max       6386.000000
Name: traffic_volume, dtype: float64


# Extract the calendar month from each daytime timestamp.
day['month'] = day['date_time'].dt.month
# Average every numeric column per month; non-numeric columns are skipped.
by_month = day.groupby('month').mean(numeric_only=True)
# Display the monthly mean traffic volume.
by_month['traffic_volume']

month
1     4495.613727
2     4711.198394
3     4889.409560
4     4906.894305
5     4911.121609
6     4898.019566
7     4595.035744
8     4928.302035
9     4870.783145
10    4921.234922
11    4704.094319
12    4374.834566
Name: traffic_volume, dtype: float64


# Line plot of the mean daytime traffic volume for each calendar month.
# Title and axis labels are set explicitly so the figure is readable on its
# own, matching the labelled plots elsewhere in this notebook.
plt.plot(by_month['traffic_volume'])
plt.title('Daytime Traffic by Month')
plt.xlabel('Month')
plt.ylabel('Traffic Volume')
plt.show()


# Tag each daytime row with its calendar year, then isolate July.
day['year'] = day['date_time'].dt.year
only_july = day[day['month'] == 7]

# Mean July traffic volume for each year, plotted as a line.
july_volume_by_year = only_july.groupby('year')['traffic_volume'].mean()
plt.plot(july_volume_by_year)
plt.show()


# pandas encodes the day of the week as Monday=0 through Sunday=6.
day['day_of_week'] = day['date_time'].dt.dayofweek
by_day_of_week = day.groupby('day_of_week').mean(numeric_only=True)
by_day_of_week['traffic_volume'].plot.line()
# Render the figure explicitly, like every other plotting cell; without this
# the cell's value is the Axes object and the figure is left open.
plt.show()

<Axes: xlabel='day_of_week'>


# Hour of day for each daytime row.
day['hour'] = day['date_time'].dt.hour

# day_of_week runs Monday=0 .. Sunday=6, so values below 5 are business days
# and 5/6 are the weekend. Filter first, then copy: copying the whole frame
# before masking duplicates rows that are immediately discarded.
weekday = day[day['day_of_week'] < 5].copy()
weekend = day[day['day_of_week'] >= 5].copy()

# Mean of every numeric column for each hour, separately per subset.
by_hour_weekday = weekday.groupby('hour').mean(numeric_only=True)
by_hour_weekend = weekend.groupby('hour').mean(numeric_only=True)

# Both panels share the same axis limits so the two curves compare directly.
plt.figure(figsize=(11, 3.5))

# Left panel: business days.
plt.subplot(1, 2, 1)
plt.plot(by_hour_weekday['traffic_volume'])
plt.xlim(6, 20)
plt.ylim(1500, 6500)
plt.xlabel('Hour')
plt.ylabel('Traffic Volume')
plt.title('Weekday Traffic by Hour')

# Right panel: weekend.
plt.subplot(1, 2, 2)
plt.plot(by_hour_weekend['traffic_volume'])
plt.xlim(6, 20)
plt.ylim(1500, 6500)
plt.xlabel('Hour')
plt.ylabel('Traffic Volume')
plt.title('Weekend Traffic by Hour')

plt.show()


# Correlation of every numeric daytime column with traffic volume.
# numeric_only=True is passed explicitly: the frame also holds text and
# datetime columns, and relying on the deprecated default emits a
# FutureWarning and stops working once that default becomes False.
day.corr(numeric_only=True)['traffic_volume']

C:\Users\Clark\AppData\Local\Temp\ipykernel_20108\3421110943.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  day.corr()['traffic_volume']

temp              0.128317
rain_1h           0.003697
snow_1h           0.001265
clouds_all       -0.032932
traffic_volume    1.000000
month            -0.022337
year             -0.003557
day_of_week      -0.416453
hour              0.172704
Name: traffic_volume, dtype: float64


# Scatter of traffic volume against temperature for the daytime rows.
day.plot.scatter('traffic_volume', 'temp')
# Two erroneous 0 K readings would otherwise stretch the y-axis, so clamp it
# to 230-320. The bounds must be two separate arguments; a single float
# would only move the lower limit.
plt.ylim(230, 320)
plt.show()


# Mean of each numeric column per high-level weather category, with the
# traffic volume shown as a horizontal bar chart.
by_weather_main = day.groupby('weather_main').mean(numeric_only=True)
by_weather_main['traffic_volume'].plot(kind='barh')
plt.show()


# Same aggregation for the detailed weather description; the figure is made
# taller to leave room for one bar per description.
by_weather_description = day.groupby('weather_description').mean(numeric_only=True)
by_weather_description['traffic_volume'].plot(kind='barh', figsize=(5, 15))
plt.show()

	holiday	temp	clouds_all	weather_main	weather_description	date_time	traffic_volume
0	None	288.28	40	Clouds	scattered clouds	2012-10-02 09:00:00	5545
1	None	289.36	75	Clouds	broken clouds	2012-10-02 10:00:00	4516
2	None	289.58	90	Clouds	overcast clouds	2012-10-02 11:00:00	4767
3	None	290.13	90	Clouds	overcast clouds	2012-10-02 12:00:00	5026
4	None	291.14	75	Clouds	broken clouds	2012-10-02 13:00:00	4918

	holiday	temp	clouds_all	weather_main	weather_description	date_time	traffic_volume
48199	None	283.45	75	Clouds	broken clouds	2018-09-30 19:00:00	3543
48200	None	282.76	90	Clouds	overcast clouds	2018-09-30 20:00:00	2781
48201	None	282.73	90	Thunderstorm	proximity thunderstorm	2018-09-30 21:00:00	2159
48202	None	282.09	90	Clouds	overcast clouds	2018-09-30 22:00:00	1450
48203	None	282.12	90	Clouds	overcast clouds	2018-09-30 23:00:00	954

Indicators of Heavy Traffic on I-94

The I-94 Traffic Dataset

Analyzing Traffic Volume

Traffic Volume: Day vs. Night

Traffic Volume: Day vs. Night (II)

Time Indicators

Time Indicators (II)

Time Indicators (III)

Weather Indicators

Weather Types

Conclusion

	temp	rain_1h	snow_1h	clouds_all	traffic_volume
count	48204.000000	48204.000000	48204.000000	48204.000000	48204.000000
mean	281.205870	0.334264	0.000222	49.362231	3259.818355
std	13.338232	44.789133	0.008168	39.015750	1986.860670
min	0.000000	0.000000	0.000000	0.000000	0.000000
25%	272.160000	0.000000	0.000000	1.000000	1193.000000
50%	282.450000	0.000000	0.000000	64.000000	3380.000000
75%	291.806000	0.000000	0.000000	90.000000	4933.000000
max	310.070000	9831.300000	0.510000	100.000000	7280.000000