import pandas as pd
# Read the hourly bike rental records and print the first five rows as a quick sanity check.
bike_rentals = pd.read_csv("bike_rental_hour.csv")
print(bike_rentals.head(n=5))
# Plotting "cnt" column
import matplotlib.pyplot as plt
%matplotlib inline
# Histogram of the target column to see how rental counts are distributed.
plt.hist(bike_rentals["cnt"])
# Correlation of each column with "cnt". Only numeric/bool columns are kept
# first: on pandas >= 2.0 DataFrame.corr() raises on non-numeric columns
# (e.g. a date-string column, if the CSV has one) instead of silently
# skipping them, which is what older pandas versions did.
bike_rentals.select_dtypes(include=["number", "bool"]).corr()["cnt"]
# Creating "time_label" column, which will give our algorithm information about how certain hours are related (Morning, Afternoon, etc.)
def assign_label(hr):
    """Map an hour of the day to a coarse time-of-day label.

    Grouping hours into a few buckets (morning, afternoon, ...) lets a model
    treat neighbouring hours as related.

    Args:
        hr: Hour of the day as a number. 0-23 is the usual range; 24 is also
            accepted and falls into the last evening bucket.

    Returns:
        1 when 6 <= hr < 12, 2 when 12 <= hr < 18, 3 when 18 <= hr <= 24,
        4 when 0 <= hr < 6, and None for anything outside 0-24 (including
        NaN, for which every comparison is false).
    """
    if 6 <= hr < 12:
        return 1
    if 12 <= hr < 18:
        return 2
    if 18 <= hr <= 24:
        return 3
    if 0 <= hr < 6:
        return 4
    # Out-of-range input gets no label; made explicit rather than implicit.
    return None
# Label every row with the time-of-day bucket that its hour falls into.
hour_of_day = bike_rentals["hr"]
bike_rentals["time_label"] = hour_of_day.apply(assign_label)
# Error metric: the target ("cnt") is continuous numeric data, so mean squared error (MSE) is a suitable error metric here.
# Splitting the dataframe: a seeded 80% random sample for training, the remaining rows for testing.
train = bike_rentals.sample(frac=0.8, random_state=1)
in_train = bike_rentals.index.isin(train.index)
test = bike_rentals.loc[~in_train]
# Feature columns passed to every model below.
cols = [
    "season",
    "yr",
    "mnth",
    "hr",
    "time_label",
    "holiday",
    "weekday",
    "workingday",
    "weathersit",
    "temp",
    "atemp",
    "hum",
    "windspeed",
]
# Baseline: fit a linear regression on the training rows and report its
# mean squared error on the test rows.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lr = LinearRegression().fit(train[cols], train["cnt"])
predictions = lr.predict(test[cols])
mse = mean_squared_error(y_true=test["cnt"], y_pred=predictions)
print(mse)
# Training and testing a Decision Tree model with default hyperparameters,
# and then determining its mean squared error on the test rows.
from sklearn.tree import DecisionTreeRegressor

# Seeded like the train/test split: the tree breaks ties between equally good
# splits at random, so an unseeded tree can report a different error per run.
dtr = DecisionTreeRegressor(random_state=1)
dtr.fit(train[cols], train["cnt"])
predictions_dtr = dtr.predict(test[cols])
mse_dtr = mean_squared_error(test["cnt"], predictions_dtr)
print(mse_dtr)
# Adjusting parameters of the DecisionTreeRegressor class to minimize model error:
# limiting depth and requiring several samples per leaf constrains tree growth.
# random_state is fixed (same seed as the train/test split) so the error
# printed here is repeatable and can be compared across runs and settings.
dtr2 = DecisionTreeRegressor(max_depth=15, min_samples_leaf=3, random_state=1)
dtr2.fit(train[cols], train["cnt"])
predictions_dtr2 = dtr2.predict(test[cols])
mse_dtr2 = mean_squared_error(test["cnt"], predictions_dtr2)
print(mse_dtr2)
# Training and testing a Random Forest model with default hyperparameters,
# and then determining its mean squared error on the test rows.
from sklearn.ensemble import RandomForestRegressor

# The forest draws random bootstrap samples when building its trees; fixing
# random_state (same seed as the train/test split) makes the error repeatable.
rfr = RandomForestRegressor(random_state=1)
rfr.fit(train[cols], train["cnt"])
predictions_rfr = rfr.predict(test[cols])
mse_rfr = mean_squared_error(test["cnt"], predictions_rfr)
print(mse_rfr)
# Adjusting parameters of the RandomForestRegressor class to minimize model error.
# random_state is fixed (same seed as the train/test split) so that a change in
# the printed error reflects the hyperparameters rather than run-to-run randomness.
rfr2 = RandomForestRegressor(max_depth=17, min_samples_leaf=2, random_state=1)
rfr2.fit(train[cols], train["cnt"])
predictions_rfr2 = rfr2.predict(test[cols])
mse_rfr2 = mean_squared_error(test["cnt"], predictions_rfr2)
print(mse_rfr2)