The first step when web scraping is loading the web page and the necessary packages.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(rvest)
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:readr':
##
## guess_encoding
library(ggplot2)
# Address of the IMDb title search to scrape. The query string asks for
# title_type=feature, release_date=2020-03-01,2020-07-31 and count=30.
url <- 'https://www.imdb.com/search/title/?title_type=feature&release_date=2020-03-01,2020-07-31&count=30'

# Download and parse the page once; every extraction further down reads from
# this parsed document.
wp_content <- read_html(url)
# Examples of the helpers used to turn scraped text into numbers and clean
# strings. Lines starting with `##` are the printed results recorded when the
# document was rendered.
as.numeric("125")
## [1] 125
# it yields 125 as numeric
as.numeric(c("5.14", "2.84"))
## [1] 5.14 2.84
# it yields 5.14 2.84 as numeric
as.numeric("1,025")
## Warning: NAs introduced by coercion
## [1] NA
# as.numeric() cannot handle the comma: it yields NA (with a warning) instead
# of 1025. Use readr::parse_number() instead
parse_number("1,025")
## [1] 1025
# it yields 1025 as numeric
parse_number(c("3km", "40 mins", "(+2.5)"))
## [1] 3.0 40.0 2.5
# it yields 3.0 40.0 2.5 as numeric
str_trim(" Remove whitespaces before and after ")
## [1] "Remove whitespaces before and after"
# it yields "Remove whitespaces before and after"
str_replace_all(" Remove whitespaces before and after "," ","")
## [1] "Removewhitespacesbeforeandafter"
# unlike str_trim(), this removes every space, including the ones between
# words; the same call with "\n" as the pattern can be used to remove newlines
# Further examples of the same functions
as.numeric("10.50")
## [1] 10.5
as.numeric(c("14.59", "3.14", "55"))
## [1] 14.59 3.14 55.00
# parse_number() drops the commas inside a number (they are treated as grouping
# marks), so "17,35" is read as 1735 and "1,2,3,4" as 1234
parse_number(c("14 min", "17,35", "(2012)", "1,2,3,4"))
## [1] 14 1735 2012 1234
str_trim(" Spaces before and after should disappear ")
## [1] "Spaces before and after should disappear"
Inspect the page's HTML to find the CSS selectors of the elements we want to extract. Then use the SelectorGadget tool to check how many elements each selector successfully captures, and whether any are missing.
# Each field is pulled out of the parsed page with a CSS selector, reduced to
# text (or an attribute value), and then converted to its final type.

# Titles: text of the links inside the `.lister-item-header` elements.
titles <- html_text(html_nodes(wp_content, '.lister-item-header a'))

# Release years: parse_number() extracts the number from the scraped text.
years <- html_text(html_nodes(wp_content, '.lister-item-year'))
years <- parse_number(years)

# Runtimes: parse_number() extracts the number from the scraped text.
runtimes <- html_text(html_nodes(wp_content, '.runtime'))
runtimes <- parse_number(runtimes)

# Genres: strip the leading and trailing whitespace from the scraped text.
genres <- str_trim(html_text(html_nodes(wp_content, '.genre')))

# User ratings: the bare `strong` selector matches more elements than wanted,
# so only matches 3 through 32 are kept.
# NOTE(review): presumably the first two matches are not ratings and the page
# lists 30 movies -- confirm against the page.
ratings <- html_text(html_nodes(wp_content, 'strong'))
ratings <- as.numeric(ratings[3:32])

# Metascores: converted to numeric as scraped.
metascores <- as.numeric(html_text(html_nodes(wp_content, '.metascore')))

# Votes: read from the `data-value` attribute instead of the element text.
votes <- html_attr(
  html_nodes(wp_content, '.sort-num_votes-visible span:nth-child(2)'),
  'data-value'
)
votes <- parse_number(votes)
#' Insert values into a vector after the given positions
#'
#' @param vector Vector to insert into.
#' @param inserted_indices Positions of `vector` (counted before any
#'   insertion) after which a new element is placed; one element is inserted
#'   per entry.
#' @param values Values to insert, matched to `inserted_indices` by position.
#'   They are recycled to the length of `inserted_indices`, so a single value
#'   (e.g. `NA`) fills every gap.
#' @return `vector` with the new elements inserted; its length is
#'   `length(vector) + length(inserted_indices)`.
append_vector <- function(vector, inserted_indices, values){
  n_inserted <- length(inserted_indices)

  ## Recycle `values` so there is exactly one value per insertion point.
  ## Without this, a scalar such as "X" would fill only the first gap and the
  ## remaining gaps would silently become NA through out-of-range indexing.
  values <- rep(values, length.out = n_inserted)

  ## Sort keys of the existing elements: 1, 2, ..., n.
  ## seq_along() (unlike 1:length()) yields an empty sequence for empty input.
  vector_current_indices <- seq_along(vector)

  ## Sort keys of the new elements: the target position plus an offset between
  ## 0 and 0.9, so each one lands after its position but before the next
  ## existing element, and repeated positions keep their given order.
  new_inserted_indices <- inserted_indices +
    seq(0, 0.9, length.out = n_inserted)

  ## order() leaves ties in their original order, so an existing element is
  ## always placed before a value inserted at the same key.
  ordered_indices <- order(c(vector_current_indices, new_inserted_indices))

  ## Append the new values, then rearrange everything by the sort keys.
  c(vector, values)[ordered_indices]
}
# Demo input: the letters A to J with "D" and "H" left out.
vector <- c(
  "A", "B", "C",
  "E", "F", "G",
  "I", "J"
)

# Filling the gaps after positions 3 and 6 with NA
append_vector(vector, inserted_indices = c(3, 6), values = NA)
## [1] "A" "B" "C" NA "E" "F" "G" NA "I" "J"

# Filling the same gaps with the missing letters D and H
append_vector(vector, inserted_indices = c(3, 6), values = c("D", "H"))
## [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J"
Create a data frame from the data we previously extracted: titles,
years, runtimes, genres, user ratings, metascores, and votes. Keep
only the integer part of the user ratings by using the floor()
function. For example, 3.4 becomes 3.
# Pad the scraped metascores with an NA after original positions 19, 26 and 27
# so the vector can sit next to the other columns.
# NOTE(review): presumably these are the movies listed without a metascore --
# the positions are hard-coded, so confirm them against the page.
metascores <- append_vector(
  metascores,
  inserted_indices = c(19, 26, 27),
  values = rep(NA, 3)
)

# One row per movie. Ratings are reduced to their integer part with floor().
movie_df <- tibble::tibble(
  title = titles,
  year = years,
  runtime = runtimes,
  genre = genres,
  rating = floor(ratings),
  metascore = metascores,
  vote = votes
)
# Boxplots of the vote counts, one box per rating value (`group = rating`).
ggplot(movie_df, aes(rating, vote, group = rating)) +
  geom_boxplot()
From the boxplots, we see that the lowest- and highest-rated films have the fewest and the most votes, respectively. On average, there is also a positive linear correlation between the ratings and the votes.
# Correlation matrix of votes and ratings; each coefficient is computed from
# the pairs of observations that are complete for those two columns.
correlation <- cor(
  select(movie_df, vote, rating),
  use = 'pairwise.complete.obs'
)

# Print the matrix.
correlation
## vote rating
## vote 1.0000000 0.2675205
## rating 0.2675205 1.0000000