library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Load the forest fire observations from the CSV file (path is relative to the
# working directory).
forest_fires <- read_csv(file = "forestfires.csv")
## Rows: 517 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): month, day
## dbl (11): X, Y, FFMC, DMC, DC, ISI, temp, RH, wind, rain, area
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

The Importance of Forest Fire Data

# List the column names of the loaded data.
names(forest_fires)
##  [1] "X"     "Y"     "month" "day"   "FFMC"  "DMC"   "DC"    "ISI"   "temp" 
## [10] "RH"    "wind"  "rain"  "area"

We know that the columns correspond to the following information:

The acronym FWI stands for “fire weather index”, a method used by scientists to quantify risk factors for forest fires.

A single row corresponds to the location of a fire and some characteristics about the fire itself. Higher water presence is typically associated with less fire spread, so we might expect the water-related variables (DMC and rain) to be related to area.

Data Processing

month and day are character variables, but we know that there is an inherent order to them. We’ll convert these variables into factors so that they’ll be sorted into the correct order when we plot them.

# Distinct month labels, in order of first appearance in the data.
unique(forest_fires$month)
##  [1] "mar" "oct" "aug" "sep" "apr" "jun" "jul" "feb" "jan" "dec" "may" "nov"
# Distinct weekday labels, in order of first appearance in the data.
unique(forest_fires$day)
## [1] "fri" "tue" "sat" "sun" "mon" "wed" "thu"
# Factor levels in calendar order: lower-case three-letter month abbreviations
# ("jan" ... "dec") and weekdays starting on Monday.
month_order <- tolower(month.abb)
week_order <- c("mon", "tue", "wed", "thu", "fri", "sat", "sun")

# Convert `month` and `day` to factors so that plots and summaries follow the
# level order above instead of alphabetical order.
forest_fires <- mutate(
  forest_fires,
  month = factor(month, levels = month_order),
  day = factor(day, levels = week_order)
)

When Do Most Forest Fires Occur?

We need to create a summary tibble that counts the number of fires that appear in each month. Then, we’ll be able to use this tibble in a visualization. We can consider month and day to be different grouping variables, so our code to produce the tibbles and plots will look similar.

Month Level

# Number of fires (rows) recorded in each month, stored in `sum_occur`.
# `count()` tallies rows per group directly, so the result does not depend on
# the length of any particular column.
month_occur <- forest_fires %>%
  count(month, name = "sum_occur")

# Bar chart of the number of fires in each month.
ggplot(month_occur, aes(x = month, y = sum_occur)) +
  geom_col() +
  labs(
    title = "sum of the fire by month",
    x = "month",
    y = "total fire"
  )

# Number of fires (rows) recorded on each day of the week, stored in
# `sum_occur`.
week_occur <- count(forest_fires, day, name = "sum_occur")

# Bar chart of the number of fires on each day of the week.
ggplot(week_occur, aes(x = day, y = sum_occur)) +
  geom_col() +
  labs(
    title = "sum of the fire by weekday",
    x = "weekday",
    y = "total fire"
  )

From our graphs, we saw that August and September see more forest fires than other months. It also looks as though the days at and around the weekend (Friday, Saturday, and Sunday) have more forest fires than days in the middle of the week.

Plotting Other Variables Against Time

# Reshape to long format: the eight measurement columns are stacked so that
# each row holds one variable name (`var_col`) and its reading (`value`); all
# other columns are carried along unchanged.
forest_fires_long <- pivot_longer(
  forest_fires,
  cols = c(FFMC, DMC, DC, ISI, temp, RH, wind, rain),
  names_to = "var_col",
  values_to = "value"
)

# Box plots of each measured variable by month, one panel per variable, with
# an independent y-axis scale in every panel.
ggplot(forest_fires_long, aes(x = month, y = value)) +
  geom_boxplot() +
  facet_wrap(~ var_col, scales = "free_y")

Examining Forest Fire Severity

# Scatter plots of burned area against each measured variable, one panel per
# variable with its own x-axis scale, drawn on a white panel background.
ggplot(forest_fires_long, aes(x = value, y = area)) +
  geom_point() +
  facet_wrap(~ var_col, scales = "free_x") +
  labs(
    title = "Relationships between other variables and area burned",
    x = "Value of the variable",
    y = "Area burned (hectare)"
  ) +
  theme(panel.background = element_rect(fill = "white"))

# Outlier Problems

It seems that there are two rows where area is so large that they still distort the scale of the visualization. Let’s make a similar visualization that excludes these observations so that we can better see how each variable relates to area.

# Same scatter plots as before, restricted to fires with a burned area below
# 250 so that the largest fires no longer dominate the y-axis scale.
forest_fires_long %>%
  filter(area < 250) %>%
  ggplot(aes(x = value, y = area)) +
  geom_point() +
  facet_wrap(vars(var_col), scales = "free_x") +
  labs(
    # Spelling of "variable" corrected in the title and x-axis label.
    title = "Relationships between other variables and area < 250",
    x = "the value of the variable",
    y = "area burned in hectare"
  )