library(rvest)
## Read in HTML data
game_of_thrones <- read_html("http://www.imdb.com/title/tt0944947/fullcredits?ref_=tt_ql_1")
## Find all the actors on the page
actors <- html_nodes(game_of_thrones, "td[class = 'primary_photo']") # Find actor entries
actor_names <- html_nodes(actors, "img") # Names are stored within the tag
actor_names <- html_attr(actor_names, "alt") # Names are stored as the "alt" attribute
# Find all actors' pages on imdb
actors_urls <- html_nodes(actors, "a") # Urls are stored within the tag
actors_urls <- html_attr(actors_urls, "href") # Urls are stored as the "href" attribute
actors_urls <- paste0("http://www.imdb.com", actors_urls) # Append the beginning of the URL
## Find all the characters on the page
characters <- html_nodes(game_of_thrones, 'td[class="character"]') # Find the character entries
characters <- html_node(characters, "a") # Character names are stored within the tag
characters <- html_text(characters) # Character names are the text associated with the tag
head(characters)
## Find the number of episodes in which each actor appeared
episodes <- html_nodes(game_of_thrones, 'td[class="character"]') # Find the n episode entries
episodes <- html_text(episodes) # Episodes are a part of a bunch of text associated with this tag
episode_start <- regexpr("episode", episodes) # Find the beginning of the word "episodes"
episodes <- substring(episodes, episode_start-3, episode_start-1) # Subset the string to the number of episodes
episodes <- gsub("\\(","", episodes) # Remove parentheses
episodes <- gsub(" ","",episodes) # Remove white space
episodes <- as.numeric(episodes) # Convert to numeric
got_data <- data.frame(actor_names, characters, episodes, actors_urls, stringsAsFactors = F)
head(got_data)
barplot(table(got_data$episodes), xlab = "Number of episodes", ylab = "Frequency")
## What is the gender distribution in Game of Thrones?
# Example for first character in list
tyrion <- read_html(got_data$actors_urls[1])
tyrion_gender <- html_node(tyrion, "div[id='name-job-categories']") %>% html_node("a")
tyrion_gender <- html_text(tyrion_gender)
## Loop over all characters (I have commented this out as it takes a few hours!)
#got_data$actor_gender <- NA
#for(i in 1:dim(got_data)[1]){
# Sys.sleep(1)
# cat(".")
# tmp_actor_page <- read_html(got_data$actors_urls[i])
# tmp_actor_gender <- html_node(tmp_actor_page, "div[id='name-job-categories']") %>% html_node("a")
# tmp_actor_gender <- html_text(tmp_actor_gender)
# got_data$actor_gender[i] <- tolower(gsub("\\n| ","",tmp_actor_gender))
# print(got_data$actor_gender[i])
# }
#save(got_data, file = "got_data.Rdata")
load("got_data.Rdata")
head(got_data)
got_data <- got_data[got_data$actor_gender %in% c("actor", "actress"),]
prop.table(table(got_data$actor_gender))
aggregate(got_data$episodes, by = list(got_data$actor_gender), FUN = mean)
summary(lm(episodes ~ actor_gender, got_data))