library(tidyverse)
library(tidymodels)
library(patchwork)
weather <- read.csv("data/weatherAUS.csv", header = TRUE)
We will start by transform any character variables that need to be transformed into categorical. Use the following code to identify them and trasform them into factors.
variables_to_transform = weather %>%
select(where(is.character),-Date) %>% names()
weather <- weather %>%
mutate_at(vars(all_of(variables_to_transform)),factor)
To simplify things, today we will not be using some categorical
explanatory variables, because they have a very large number of
categories and might make our model interpretation more complex.
Specifically we will exclude WindGustDir
,
WindDir9am
and WindDir3pm
.
weather <- weather %>%
select(-WindGustDir,-WindDir9am,-WindDir3pm)
Note that some of these variables have a large number of missing values:
weather %>%
select(where(~ any(is.na(.)))) %>%
summarise(across(everything(), ~mean(is.na(.)))) %>%
pivot_longer(col = everything(), names_to = "Variable", values_to = "Prop_NA") %>%
arrange(desc(Prop_NA))
## # A tibble: 18 × 2
## Variable Prop_NA
## <chr> <dbl>
## 1 Sunshine 0.480
## 2 Evaporation 0.432
## 3 Cloud3pm 0.408
## 4 Cloud9am 0.384
## 5 Pressure9am 0.104
## 6 Pressure3pm 0.103
## 7 WindGustSpeed 0.0706
## 8 Humidity3pm 0.0310
## 9 Temp3pm 0.0248
## 10 RainTomorrow 0.0225
## 11 Rainfall 0.0224
## 12 RainToday 0.0224
## 13 WindSpeed3pm 0.0211
## 14 Humidity9am 0.0182
## 15 WindSpeed9am 0.0121
## 16 Temp9am 0.0121
## 17 MinTemp 0.0102
## 18 MaxTemp 0.00867
RainTomorrow
? If so, we filter them out and save the new
dataset as weather_noNA
.Yes, there are 3267 missing cases in weather
.
weather_noNA <- weather %>%
filter(!is.na(RainTomorrow))
The remainig number of observations in weather_noNA
are
142193.
RainToday
variable.weather_noNA %>% group_by(Location) %>%
summarize(avg_raindays = mean(RainToday == "Yes", na.rm = TRUE)) %>%
arrange(desc(avg_raindays))
## # A tibble: 49 × 2
## Location avg_raindays
## <fct> <dbl>
## 1 Portland 0.365
## 2 Walpole 0.335
## 3 Cairns 0.317
## 4 Dartmoor 0.313
## 5 NorfolkIsland 0.310
## 6 MountGambier 0.304
## 7 Albany 0.298
## 8 Witchcliffe 0.298
## 9 CoffsHarbour 0.294
## 10 MountGinini 0.277
## # ℹ 39 more rows
We will focus our analysis on the city of Portland
.
weather_Portland <- weather_noNA %>%
filter(Location == "Portland")
RainTomorrow
by fitting a linear
regression using the variable RainToday
and print the
output using tidy()
.Portland_fit <- logistic_reg() %>%
set_engine("glm") %>%
fit(RainTomorrow ~ RainToday,
data = weather_Portland, family = "binomial")
tidy(Portland_fit)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.08 0.0527 -20.4 1.51e-92
## 2 RainTodayYes 1.33 0.0807 16.5 4.39e-61
Portland_pred <- predict(Portland_fit, weather_Portland, type = "prob")
ggplot(Portland_pred, aes(x = .pred_Yes)) +
geom_histogram(bins = 5) +
labs(x = "Predicted probabilities",
y = "Count",
title = "Histogram of the predicted probabilies",
subtitle = "RainTomorrow ~ RainToday")
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
The predictive probability for each day can only take two values:
0.2542194 or 0.5634191. This is due to the fact that the only
explanatory variable RainToday
is a categorical variable
with two levels, so the two probabilities correspond to whether it was
raining on that day or not.
Yes, there are 12 missing values, which correspond to the days where
the variable RainToday
was missing:
weather_Portland %>%
mutate(Pred_Yes = Portland_pred$.pred_Yes) %>%
filter(is.na(Pred_Yes)) %>%
select(RainToday)
## RainToday
## 1 <NA>
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## 7 <NA>
## 8 <NA>
## 9 <NA>
## 10 <NA>
## 11 <NA>
## 12 <NA>
Let us set a seed and perform a split of our data.
set.seed(111723)
# Put 80% of the data into the training set
weather_split <- initial_split(weather_Portland, prop = 0.80)
# Create data frames for the two sets:
train_data <- training(weather_split)
test_data <- testing(weather_split)
RainToday
as
predictor on this training data, using tidymodels
recipes
and workflows.step_naomit()
and
finally use step_dummy
to convert categorical to dummy
variables.weather_rec1 <- recipe(
RainTomorrow ~ RainToday,
data = weather_Portland
) %>%
step_naomit(all_predictors()) %>%
step_dummy(all_nominal_predictors())
weather_mod1 <- logistic_reg() %>%
set_engine("glm")
weather_wflow1 <- workflow() %>%
add_model(weather_mod1) %>%
add_recipe(weather_rec1)
weather_fit1 <- weather_wflow1 %>%
fit(data = train_data)
tidy(weather_fit1)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.07 0.0589 -18.1 2.76e-73
## 2 RainToday_Yes 1.33 0.0902 14.7 3.27e-49
RainTomorrow
. We will use
as predictors the variables MinTemp
, MaxTemp
,
RainToday
and Rainfall
. Similarly to question
2, use workflows to fit this model on our training data.weather_rec2 <- recipe(
RainTomorrow ~ MinTemp+MaxTemp+RainToday+Rainfall, # formula
data = weather_Portland
) %>%
step_naomit(all_predictors()) %>%
step_dummy(all_nominal(), -all_outcomes())
weather_mod2 <- logistic_reg() %>%
set_engine("glm")
weather_wflow2 <- workflow() %>%
add_model(weather_mod2) %>%
add_recipe(weather_rec2)
weather_fit2 <- weather_wflow2 %>%
fit(data = train_data)
tidy(weather_fit2)
## # A tibble: 5 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.422 0.217 1.94 0.0519
## 2 MinTemp 0.00872 0.0159 0.550 0.582
## 3 MaxTemp -0.0827 0.0137 -6.06 0.00000000139
## 4 Rainfall 0.0490 0.0124 3.95 0.0000791
## 5 RainToday_Yes 0.715 0.122 5.87 0.00000000436
weather_pred1 <- predict(weather_fit1, test_data, type = "prob") %>%
bind_cols(test_data)
## Warning: ! There are new levels in a factor: `NA`.
weather_pred1 %>%
roc_curve(
truth = RainTomorrow,
.pred_Yes,
event_level = "second"
) %>%
autoplot()
weather_pred1 %>%
roc_auc(
truth = RainTomorrow,
.pred_Yes,
event_level = "second"
)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.654
weather_pred2 <- predict(weather_fit2, test_data, type = "prob") %>%
bind_cols(test_data)
## Warning: ! There are new levels in a factor: `NA`.
weather_pred2 %>%
roc_curve(
truth = RainTomorrow,
.pred_Yes,
event_level = "second"
) %>%
autoplot()
weather_pred2 %>%
roc_auc(
truth = RainTomorrow,
.pred_Yes,
event_level = "second"
)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.714
The second model, the one with multiple predictors, has a larger AUC. Moreover, its ROC curve is further away from the diagonal compared to the ROC curve of the simpler simple logistic regression model.
RainTomorrow
and look at the number of false
positives and false negatives. For example:cutoff_prob <- 0.5
weather_pred2 %>%
mutate(
RainTomorrow = if_else(RainTomorrow == "Yes", "It rains", "It does not rain"),
RainTomorrow_pred = if_else(.pred_Yes > cutoff_prob, "Predicted rain", "Predicted no rain")
) %>%
na.omit() %>%
count(RainTomorrow_pred, RainTomorrow)
## # A tibble: 4 × 3
## RainTomorrow_pred RainTomorrow n
## <chr> <chr> <int>
## 1 Predicted no rain It does not rain 177
## 2 Predicted no rain It rains 62
## 3 Predicted rain It does not rain 51
## 4 Predicted rain It rains 76
cutoff_prob = 0.3
?The false positive rate is given by FP / (FP + TN), so
51 / (51+177) = 0.22
.
The false negative rate is given by FN / (TP + FN), so
62 / (62+76) = 0.44
.
We will now try to improve our fit by building a model using additional explanatory variables.
train_data %>%
select(where(is.factor), -RainTomorrow, -Location) %>%
names()
## [1] "RainToday"
p_RainToday <- ggplot(train_data, aes(x = RainToday, fill = RainTomorrow)) +
geom_bar() +
scale_fill_manual(values = c("#E48957", "#CA235F"))
p_RainToday
No.
RainTomorrow
?sd_num = train_data %>%
group_by(RainTomorrow) %>%
select(where(is.numeric)) %>%
summarise_all(sd,na.rm=TRUE)
## Adding missing grouping variables: `RainTomorrow`
sd_num
## # A tibble: 2 × 17
## RainTomorrow MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustSpeed
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 No 3.79 5.43 3.60 2.72 3.82 12.0
## 2 Yes 3.21 4.39 6.74 2.37 2.99 16.6
## # ℹ 10 more variables: WindSpeed9am <dbl>, WindSpeed3pm <dbl>,
## # Humidity9am <dbl>, Humidity3pm <dbl>, Pressure9am <dbl>, Pressure3pm <dbl>,
## # Cloud9am <dbl>, Cloud3pm <dbl>, Temp9am <dbl>, Temp3pm <dbl>
No, the minimum sd is 1.7879861.
Date
. We will use Ludbridate
again: extract
the month and year.library(lubridate)
weather_Portland %>%
mutate(Date = lubridate::date(Date)) %>%
mutate(month = factor(month(Date)),
year = year(Date)) %>%
select(Date, month, year) %>%
sample_n(size = 5)
## Date month year
## 1 2014-11-03 11 2014
## 2 2015-07-03 7 2015
## 3 2014-07-06 7 2014
## 4 2014-07-24 7 2014
## 5 2009-06-25 6 2009
weather_rec3 <- recipe(
RainTomorrow ~ ., # formula
data = weather_Portland
) %>%
step_rm(Location) %>%
step_mutate(Date = lubridate::date(Date)) %>%
step_date(Date, features = c("month", "year")) %>%
step_rm(Date) %>%
step_dummy(all_nominal(), -all_outcomes())
weather_mod3 <- logistic_reg() %>%
set_engine("glm")
weather_wflow3 <- workflow() %>%
add_model(weather_mod3) %>%
add_recipe(weather_rec3)
weather_fit3 <- weather_wflow3 %>%
fit(data = train_data)
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `Date = lubridate::date(Date)`.
## Caused by warning:
## ! tz(): Don't know how to compute timezone for object of class factor; returning "UTC".
## Warning: ! There are new levels in a factor: `NA`.
tidy(weather_fit3)
## # A tibble: 30 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 104. 70.7 1.46 1.43e- 1
## 2 MinTemp -0.143 0.0413 -3.45 5.52e- 4
## 3 MaxTemp -0.0876 0.0668 -1.31 1.90e- 1
## 4 Rainfall 0.0140 0.0173 0.809 4.19e- 1
## 5 Evaporation 0.0704 0.0455 1.55 1.22e- 1
## 6 Sunshine -0.0730 0.0276 -2.64 8.30e- 3
## 7 WindGustSpeed 0.0697 0.0104 6.72 1.82e-11
## 8 WindSpeed9am -0.0191 0.0146 -1.31 1.91e- 1
## 9 WindSpeed3pm -0.00528 0.0143 -0.368 7.13e- 1
## 10 Humidity9am -0.00472 0.00751 -0.628 5.30e- 1
## # ℹ 20 more rows
weather_pred3 <- predict(weather_fit3, test_data, type = "prob") %>%
bind_cols(test_data)
## Warning: ! There are new levels in a factor: `NA`.
weather_pred3 %>%
roc_curve(
truth = RainTomorrow,
.pred_Yes,
event_level = "second"
) %>%
autoplot()
weather_pred3 %>%
roc_auc(
truth = RainTomorrow,
.pred_Yes,
event_level = "second"
)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.864
Yes, the AUC for this model is higher and the ROC curve is closer to the top-left corner of the plot.