First, I build a data frame for each season I want to scrape, containing the game info for every game in that season. The resulting columns for this data frame are as follows:
Date,Time,Away,Score_Away,Home,Score_Home,GameID,OTs,Attend,Notes,Start_A1,Start_A2,Start_A3,Start_A4,Start_A5,Start_H1,Start_H2,Start_H3,Start_H4,Start_H5,Season
Here's my code for this part of my process:
Code: Select all
# Script to webscrape all NBA GameIDs from basketball-reference.com
# RainmanTrail
# 11/8/2020
library(rvest)
library(RSelenium)
library(stringr)
library(dplyr)
library(tidyverse)
# Calendar of seasons and months to scrape.
# Season end-years 1997 through 2020; month names in lowercase to match
# basketball-reference's URL scheme (".../NBA_<year>_games-<month>.html").
years <- 1997:2020
months <- tolower(month.name)
# Launch a Selenium-driven Firefox so we can scrape content rendered by
# JavaScript (plain rvest GETs miss the dynamically served tables).
# `rD$client` is the remote-driver handle used for all navigation below.
# Use TRUE/FALSE literals, not T/F (T/F are reassignable variables in R).
rD <- rsDriver(browser = "firefox", port = 4444L, verbose = FALSE)
remDr <- rD$client
# Scrape GameIDs (and starting lineups) one season at a time, writing one
# CSV per season.
#
# BUG FIX vs. original: when a month had no schedule table, the original
# only printed a message but then fell through and re-processed the STALE
# `game_tbl` from the previous month (re-scraping every box score and
# appending duplicate rows via rbind) — and errored outright if the first
# month of a season was empty, since `game_tbl` did not exist yet. We now
# `next` past empty months. Monthly tables are collected in a list and
# bound once per season instead of growing a data frame with rbind.
for (i in years) {
  season_tbls <- list() # one entry per month with games; bound at season end
  for (j in months) {
    url <- paste0("https://www.basketball-reference.com/leagues/NBA_",
                  i, "_games-", j, ".html")
    remDr$navigate(url)                    # Navigate to the month's schedule page
    raw_html <- remDr$getPageSource()[[1]] # Rendered HTML as a single string
    page <- read_html(raw_html)

    # Skip months whose page has no schedule table (presumably off-season
    # months — TODO confirm).
    # NOTE: This will crap out for Sept 2019 season games. Could add an exception.
    if (!str_detect(raw_html, "div_schedule")) {
      print("No games this month")
      next
    }

    # GameIDs come from the box-score link hrefs, e.g.
    # "/boxscores/199611010CHI.html" -> "199611010CHI".
    game_list <- page %>%
      html_node("#schedule") %>%
      html_nodes("[data-stat='box_score_text']") %>%
      html_nodes("a") %>%
      html_attr("href")
    game_list <- gsub("/boxscores/(.*)\\.html", "\\1", game_list)

    game_tbl <- page %>%
      html_node("#schedule") %>%
      html_table()
    # Fix number of columns for years < 2001 since 'Time' column is missing:
    # insert an empty Time column and reorder it into second position.
    if (i < 2001) {
      game_tbl$Time <- ""
      game_tbl <- game_tbl[, c(1, 10, 2:9)]
    }
    names(game_tbl) <- c("Date", "Time", "Away", "Score_Away", "Home",
                         "Score_Home", "GameID", "OTs", "Attend", "Notes")
    # Drop repeated header rows / "Playoffs" divider rows embedded in the body,
    # so rows align 1:1 with the box-score links scraped above.
    game_tbl <- game_tbl[!game_tbl$Date %in% c("Date", "Playoffs"), ]
    game_tbl$GameID <- game_list

    # OPTIONAL: Scrape the starting lineups for each game and add them to the
    # data frame. The first and third sortable stats tables on a box-score
    # page are assumed to be the away and home basic box scores, whose first
    # five player rows are the starters — TODO confirm against the site.
    for (k in seq_along(game_tbl$GameID)) {
      g <- game_tbl$GameID[k]
      url <- paste0("https://www.basketball-reference.com/boxscores/", g, ".html")
      remDr$navigate(url)
      raw_html <- remDr$getPageSource()[[1]]
      page <- read_html(raw_html)
      box <- page %>% html_nodes("[class='sortable stats_table now_sortable']")
      start_away <- box[1] %>% html_nodes("[data-stat='player']") %>%
        html_nodes("a") %>% html_attr("href")
      # "/players/j/jordami01.html" -> "jordami01"
      start_away <- gsub("/players/[a-z]/(\\w+)\\.html", "\\1", start_away[1:5])
      start_home <- box[3] %>% html_nodes("[data-stat='player']") %>%
        html_nodes("a") %>% html_attr("href")
      start_home <- gsub("/players/[a-z]/(\\w+)\\.html", "\\1", start_home[1:5])
      game_tbl[k, c("Start_A1", "Start_A2", "Start_A3", "Start_A4", "Start_A5")] <- start_away
      game_tbl[k, c("Start_H1", "Start_H2", "Start_H3", "Start_H4", "Start_H5")] <- start_home
    }
    season_tbls[[j]] <- game_tbl
  }
  games <- bind_rows(season_tbls)
  # Tag every row with the season end-year (the "Season" column promised in
  # the output schema, which the original script never actually added).
  games$Season <- i
  # Write the season's results to file, e.g. "NBA_Game_Info_1997.csv".
  write.csv(games, file = paste0("NBA_Game_Info_", i, ".csv"), row.names = FALSE)
}
# Close remote driver and stop Selenium server
# (both are required: closing the client session alone leaves the Selenium
# server process — started by rsDriver above — holding port 4444).
remDr$close()
rD$server$stop()