import pandas as pd
accidents = pd.read_csv('../content/sample_data/US_Accidents_March23.csv', low_memory = False)
display(accidents.head())
display(accidents.dtypes)

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-2-31b73d8a5ac5> in <cell line: 2>()
      1 import pandas as pd
----> 2 accidents = pd.read_csv('../content/sample_data/US_Accidents_March23.csv', low_memory = False)
      3 display(accidents.head())
      4 display(accidents.dtypes)

/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
    910     kwds.update(kwds_defaults)
    911 
--> 912     return _read(filepath_or_buffer, kwds)
    913 
    914 

/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py in _read(filepath_or_buffer, kwds)
    575 
    576     # Create the parser.
--> 577     parser = TextFileReader(filepath_or_buffer, **kwds)
    578 
    579     if chunksize or iterator:

/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py in __init__(self, f, engine, **kwds)
   1405 
   1406         self.handles: IOHandles | None = None
-> 1407         self._engine = self._make_engine(f, self.engine)
   1408 
   1409     def close(self) -> None:

/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py in _make_engine(self, f, engine)
   1659                 if "b" not in mode:
   1660                     mode += "b"
-> 1661             self.handles = get_handle(
   1662                 f,
   1663                 mode,

/usr/local/lib/python3.10/dist-packages/pandas/io/common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    857         if ioargs.encoding and "b" not in ioargs.mode:
    858             # Encoding
--> 859             handle = open(
    860                 handle,
    861                 ioargs.mode,

FileNotFoundError: [Errno 2] No such file or directory: '../content/sample_data/US_Accidents_March23.csv'


import zipfile
!unzip /content/US_ACC.zip

Archive:  /content/US_ACC.zip
  inflating: US_Accidents_March23.csv


import pandas as pd
accidents_test = pd.read_csv("/content/US_Accidents_March23.csv")
display(accidents_test.head())
display(accidents_test.dtypes)

ID                        object
Source                    object
Severity                   int64
Start_Time                object
End_Time                  object
Start_Lat                float64
Start_Lng                float64
End_Lat                  float64
End_Lng                  float64
Distance(mi)             float64
Description               object
Street                    object
City                      object
County                    object
State                     object
Zipcode                   object
Country                   object
Timezone                  object
Airport_Code              object
Weather_Timestamp         object
Temperature(F)           float64
Wind_Chill(F)            float64
Humidity(%)              float64
Pressure(in)             float64
Visibility(mi)           float64
Wind_Direction            object
Wind_Speed(mph)          float64
Precipitation(in)        float64
Weather_Condition         object
Amenity                     bool
Bump                        bool
Crossing                    bool
Give_Way                    bool
Junction                    bool
No_Exit                     bool
Railway                     bool
Roundabout                  bool
Station                     bool
Stop                        bool
Traffic_Calming             bool
Traffic_Signal              bool
Turning_Loop                bool
Sunrise_Sunset            object
Civil_Twilight            object
Nautical_Twilight         object
Astronomical_Twilight     object
dtype: object


accidents = accidents_test


accidents.fillna(0,inplace=True)


accidents_weather = accidents[['Severity','State','Temperature(F)',
                               'Wind_Chill(F)','Humidity(%)','Pressure(in)',
                                'Visibility(mi)','Wind_Direction','Wind_Speed(mph)',
                                'Precipitation(in)','Weather_Condition']]
pd.set_option("display.max_rows", 150)
accidents_weatherNJLA = accidents_weather[(accidents_weather['State'] == 'LA') | (accidents_weather['State'] == 'NJ')]
color = accidents_weatherNJLA["State"].map({
    "LA": "Blue",
    "NJ": "Red"
})
accidents_weatherNJLA.plot.scatter(x='Precipitation(in)',y="Temperature(F)", c = color)
#this is just a starting visualization of the accidents in LA and NJ based on precipitation and temperature
#we still have to normalize for frequency of precipation and the outliers.

<Axes: xlabel='Precipitation(in)', ylabel='Temperature(F)'>


pd.set_option("display.max_rows", 10)
accidents_weather.groupby("State")["Severity"].mean().sort_values()

State
ND    2.013479
MT    2.037233
OK    2.077313
SC    2.111055
OR    2.112407
        ...   
CO    2.443902
KY    2.454176
RI    2.458252
WI    2.473939
GA    2.506931
Name: Severity, Length: 49, dtype: float64


import seaborn as sns
accidents_sv_weather = accidents[['Severity','Temperature(F)',
                               'Wind_Chill(F)','Humidity(%)','Pressure(in)',
                                'Visibility(mi)','Wind_Speed(mph)',
                                'Precipitation(in)']]

sns.heatmap(accidents_sv_weather.corr(), annot = True, square = True);


acc_road_con = accidents[['Severity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'Stop', 'No_Exit', 'Traffic_Signal']]

sns.heatmap(acc_road_con.corr(), annot = True, square = True);


import numpy as np
import matplotlib.pyplot as plt

road_cons = ['Bump', 'Crossing', 'Give_Way', 'Junction', 'Stop', 'No_Exit', 'Traffic_Signal']

fig, ax = plt.subplots(2, 4, figsize=(20,5))
# ax[0,0].pie(acc_road_con['Bump'].value_counts(), labels = ['False', 'True'],

# create subplots with pie graphs showing distribution of road conditions
for i in range(len(road_cons)):
  if (i<4):
    ax[0, i].pie(acc_road_con[road_cons[i]].value_counts(), labels = ['False', 'True'],
            autopct='%1.4f%%', labeldistance= 1.15)
    ax[0, i].title.set_text(road_cons[i] + " Percentage")
  if (i>=4):
    ax[1, i-4].pie(acc_road_con[road_cons[i]].value_counts(), labels = ['False', 'True'],
            autopct='%1.4f%%', labeldistance= 1.15)
    ax[1, i-4].title.set_text(road_cons[i] + " Percentage")

#remove last plot
ax.flat[-1].set_visible(False)


import pandas as pd
import matplotlib.pyplot as plt

top_weather_cons = accidents['Weather_Condition'].value_counts().head(6).index.to_list()


severity_levels = accidents['Severity'].unique()

grouped_data = accidents.groupby(['Weather_Condition', 'Severity']).size().unstack()
fig, ax = plt.subplots()
bar_width = 0.2
index = range(len(top_weather_cons))

# Plotting bars for each severity level
for i, severity in enumerate(severity_levels):
    bar_positions = [x + i * bar_width for x in index]
    bar_values = grouped_data[severity].fillna(0).reindex(top_weather_cons, fill_value=0)  # Fill missing values

    ax.bar(bar_positions, bar_values, bar_width, label=severity)


ax.set_xlabel('Weather Condition')
ax.set_ylabel('Frequency')
ax.set_title('Weather Condition vs. Severity')
ax.set_xticks([x + 0.5 * bar_width for x in index])
ax.set_xticklabels(top_weather_cons)
ax.legend()


plt.show()
# print(top_weather_cons)
# print(weather_conditions)


start = pd.to_datetime(accidents['Start_Time'],
               format = 'mixed')
#End time here refers to when the impact of accident on traffic flow was dismissed.
end = pd.to_datetime(accidents['End_Time'],
                     format = 'mixed')

#Create a new attribute, duration - duration of the accidents effect on traffic.
accidents['Duration'] = end - start
correlation_matrix = accidents[['Duration','Temperature(F)',
                               'Wind_Chill(F)','Humidity(%)','Pressure(in)',
                                'Visibility(mi)','Wind_Speed(mph)',
                                'Precipitation(in)']].corr()

#Print the correlation matrix
sns.heatmap(correlation_matrix, annot = True, square = True);


from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score, accuracy_score

# define the training data
x_train = accidents[['Temperature(F)','Wind_Chill(F)','Humidity(%)','Pressure(in)',
                      'Visibility(mi)','Wind_Speed(mph)','Precipitation(in)']]
y_train = accidents["Severity"]

# standardize the data
scaler = StandardScaler()
scaler.fit(x_train)
x_train_sc = scaler.transform(x_train)

# fit the 5-nearest neighbors model
model = KNeighborsClassifier(n_neighbors=5)
model.fit(x_train_sc, y_train)

KNeighborsClassifier()

KNeighborsClassifier()


#We do the tests on a random set of 10000 observations because it takes too much time to do it on more than 100000 observations
y_train_pred = model.predict(x_train_sc[2000000:2020000])

print("f1 score is", f1_score(y_train[2000000:2020000], y_train_pred,  average='micro'))

f1 score is 0.7357


y_train_pred = model.predict(x_train_sc[2000900:2000920])
y_df = pd.DataFrame({'Actual Severity': y_train[2000900:2000920], 'Predicted Severity': y_train_pred})
y_df.plot.bar(title = "Actual vs Predicted Severity ")
plt.legend(loc=1)

<matplotlib.legend.Legend at 0x7d39d003ddb0>


# define the training data
x_train_rd = accidents[['Bump', 'Crossing', 'Give_Way', 'Junction', 'Stop', 'No_Exit', 'Traffic_Signal']]
y_train_rd = accidents["Severity"]

# standardize the data
scaler = StandardScaler()
scaler.fit(x_train_rd)
x_train_rd_sc = scaler.transform(x_train_rd)

# fit the 5-nearest neighbors model
model_rd = KNeighborsClassifier(n_neighbors=5)
model_rd.fit(x_train_rd_sc, y_train_rd)

KNeighborsClassifier()

KNeighborsClassifier()


y_train_rd_pred = model_rd.predict(x_train_rd_sc[2000000:2002000])

print("f1 score is", f1_score(y_train_rd[2000000:2002000], y_train_rd_pred,  average='micro'))

f1 score is 0.7405999999999999


y_train_rd_pred = model_rd.predict(x_train_rd_sc[2000900:2000920])
y_df = pd.DataFrame({'Actual Severity': y_train_rd[2000900:2000920], 'Predicted Severity': y_train_rd_pred})
y_df.plot.bar(title = "Actual vs Predicted Severity ")
plt.legend(loc=1)

<matplotlib.legend.Legend at 0x7d39db457d60>


from sklearn.neighbors import KNeighborsRegressor

#weather condition regression
y_train_reg = accidents["Duration"]

# standardize the data
scaler = StandardScaler()
scaler.fit(x_train)
x_train_sc = scaler.transform(x_train)

# fit the 5-nearest neighbors model
model_reg = KNeighborsRegressor(n_neighbors=10)
model_reg.fit(x_train_sc, y_train_reg)

KNeighborsRegressor(n_neighbors=10)

KNeighborsRegressor(n_neighbors=10)


y_train_reg_pred = model_reg.predict(x_train_sc[2000900:2000920])
y_reg_df = pd.DataFrame({'Actual Duration': y_train_reg[2000900:2000920], 'Predicted Duration': y_train_reg_pred})
y_reg_df.plot.bar(title = "Actual vs Predicted Duration ")
plt.legend(loc=1)

<matplotlib.legend.Legend at 0x7d39db252080>


train = accidents.sample(frac=.5)
val = accidents.drop(train.index)
features = ['Temperature(F)','Wind_Chill(F)','Humidity(%)','Pressure(in)',
                      'Visibility(mi)','Wind_Speed(mph)','Precipitation(in)']
x_train_dict = train[features].to_dict(orient='records')
x_val_dict = val[features].to_dict(orient='records')

y_train = train['Duration']
y_val = val['Duration']


y_train = train['Duration'] / pd.Timedelta(minutes=1)
y_val = val['Duration'] / pd.Timedelta(minutes=1)


from sklearn.feature_extraction import DictVectorizer
def get_val_error_kneigh(X_train_dict, y_train, X_val_dict, y_val, k):

    # convert categorical variables to dummy variables
    vec = DictVectorizer(sparse=False)
    vec.fit(X_train_dict)
    X_train = vec.transform(X_train_dict)
    X_val = vec.transform(X_val_dict)

    # standardize the data
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_sc = scaler.transform(X_train)
    X_val_sc = scaler.transform(X_val)

    # Fit a 10-nearest neighbors model.
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train_sc, y_train)

    # Make predictions on the validation set.
    y_val_pred = model.predict(X_val_sc[2000900:2001000])
    rmse = np.sqrt(((y_val[2000900:2001000] - y_val_pred) ** 2).mean())

    return rmse


print((get_val_error_kneigh(x_train_dict, y_train,x_val_dict, y_val,1) +
       get_val_error_kneigh(x_val_dict, y_val, x_train_dict, y_train,1)) /2)
print((get_val_error_kneigh(x_train_dict, y_train, x_val_dict, y_val,10) +
       get_val_error_kneigh(x_val_dict, y_val, x_train_dict, y_train,10)) /2)

205.2884295944479
165.4329359935768


def cross_rmse(k):
  return (get_val_error_kneigh(x_train_dict, y_train,x_val_dict, y_val,k) + get_val_error_kneigh(x_val_dict, y_val, x_train_dict, y_train,k))/2

ks = pd.Series(range(5, 20))
ks.index = range(5, 20)
test_errs = ks.apply(cross_rmse)

test_errs.plot.line()
test_errs.sort_values()

8      164.089451
9      165.093459
10     165.432936
11     165.513040
6      166.210178
         ...     
17    1649.370131
16    1745.927383
15    1854.986174
14    1979.287852
13    2125.443891
Length: 15, dtype: float64


#road condition regression
y_train_reg = accidents["Duration"]

# standardize the data
scaler = StandardScaler()
scaler.fit(x_train_rd)
x_train_rd_sc = scaler.transform(x_train_rd)

# fit the 5-nearest neighbors model
model_reg_rd = KNeighborsRegressor(n_neighbors=10)
model_reg_rd.fit(x_train_rd_sc, y_train_reg)

KNeighborsRegressor(n_neighbors=10)

KNeighborsRegressor(n_neighbors=10)


y_train_reg_pred_rd = model_reg_rd.predict(x_train_sc[2000900:2000920])
y_reg_rd_df = pd.DataFrame({'Actual Duration': y_train_reg[2000900:2000920], 'Predicted Duration': y_train_reg_pred_rd})
y_reg_rd_df.plot.bar(title = "Actual vs Predicted Duration ")
plt.legend(loc=1)

<matplotlib.legend.Legend at 0x7d39db250280>


%%shell
jupyter nbconvert --to html /content/sample_data/Milestone_2.ipynb

	ID	Source	Severity	Start_Time	End_Time	Start_Lat	Start_Lng	End_Lat	End_Lng	Distance(mi)	...	Roundabout	Station	Stop	Traffic_Calming	Traffic_Signal	Turning_Loop	Sunrise_Sunset	Civil_Twilight	Nautical_Twilight	Astronomical_Twilight
0	A-1	Source2	3	2016-02-08 05:46:00	2016-02-08 11:00:00	39.865147	-84.058723	NaN	NaN	0.01	...	False	False	False	False	False	False	Night	Night	Night	Night
1	A-2	Source2	2	2016-02-08 06:07:59	2016-02-08 06:37:59	39.928059	-82.831184	NaN	NaN	0.01	...	False	False	False	False	False	False	Night	Night	Night	Day
2	A-3	Source2	2	2016-02-08 06:49:27	2016-02-08 07:19:27	39.063148	-84.032608	NaN	NaN	0.01	...	False	False	False	False	True	False	Night	Night	Day	Day
3	A-4	Source2	3	2016-02-08 07:23:34	2016-02-08 07:53:34	39.747753	-84.205582	NaN	NaN	0.01	...	False	False	False	False	False	False	Night	Day	Day	Day
4	A-5	Source2	2	2016-02-08 07:39:07	2016-02-08 08:09:07	39.627781	-84.188354	NaN	NaN	0.01	...	False	False	False	False	True	False	Day	Day	Day	Day

Analysis of US Accidents¶

Exploratory Of Weather and Road Structures¶

Models for predicting severity¶

Regression Model For Duration¶

Conclusion¶