Presentation Exercise

Prompt 1:

I’m using FiveThirtyEight’s nbaallelo.csv. How can I recreate a ‘winning odds’ graph like the overlapping home vs away distributions?

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.0     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.2     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gt)
library(gtExtras)
library(scales)

Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor

Load and summarize data

nba <- read_csv("nbaallelo.csv")
Rows: 126314 Columns: 23
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (10): game_id, lg_id, date_game, team_id, fran_id, opp_id, opp_fran, gam...
dbl (13): gameorder, _iscopy, year_id, seasongame, is_playoffs, pts, elo_i, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(nba)
Rows: 126,314
Columns: 23
$ gameorder     <dbl> 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10…
$ game_id       <chr> "194611010TRH", "194611010TRH", "194611020CHS", "1946110…
$ lg_id         <chr> "NBA", "NBA", "NBA", "NBA", "NBA", "NBA", "NBA", "NBA", …
$ `_iscopy`     <dbl> 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1,…
$ year_id       <dbl> 1947, 1947, 1947, 1947, 1947, 1947, 1947, 1947, 1947, 19…
$ date_game     <chr> "11/1/1946", "11/1/1946", "11/2/1946", "11/2/1946", "11/…
$ seasongame    <dbl> 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 3,…
$ is_playoffs   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ team_id       <chr> "TRH", "NYK", "CHS", "NYK", "DTF", "WSC", "BOS", "PRO", …
$ fran_id       <chr> "Huskies", "Knicks", "Stags", "Knicks", "Falcons", "Capi…
$ pts           <dbl> 66, 68, 63, 47, 33, 50, 53, 59, 51, 56, 60, 71, 56, 71, …
$ elo_i         <dbl> 1300.000, 1300.000, 1300.000, 1306.723, 1300.000, 1300.0…
$ elo_n         <dbl> 1293.277, 1306.723, 1309.652, 1297.071, 1279.619, 1320.3…
$ win_equiv     <dbl> 40.29483, 41.70517, 42.01226, 40.69278, 38.86405, 43.135…
$ opp_id        <chr> "NYK", "TRH", "NYK", "CHS", "WSC", "DTF", "PRO", "BOS", …
$ opp_fran      <chr> "Knicks", "Huskies", "Knicks", "Stags", "Capitols", "Fal…
$ opp_pts       <dbl> 68, 66, 47, 63, 50, 33, 59, 53, 56, 51, 71, 60, 71, 56, …
$ opp_elo_i     <dbl> 1300.000, 1300.000, 1306.723, 1300.000, 1300.000, 1300.0…
$ opp_elo_n     <dbl> 1306.723, 1293.277, 1297.071, 1309.652, 1320.381, 1279.6…
$ game_location <chr> "H", "A", "H", "A", "H", "A", "A", "H", "A", "H", "A", "…
$ game_result   <chr> "L", "W", "W", "L", "L", "W", "L", "W", "L", "W", "L", "…
$ forecast      <dbl> 0.6400650, 0.3599350, 0.6311012, 0.3688987, 0.6400650, 0…
$ notes         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
summary(nba)
   gameorder       game_id             lg_id              _iscopy   
 Min.   :    1   Length:126314      Length:126314      Min.   :0.0  
 1st Qu.:15790   Class :character   Class :character   1st Qu.:0.0  
 Median :31579   Mode  :character   Mode  :character   Median :0.5  
 Mean   :31579                                         Mean   :0.5  
 3rd Qu.:47368                                         3rd Qu.:1.0  
 Max.   :63157                                         Max.   :1.0  
    year_id      date_game           seasongame      is_playoffs     
 Min.   :1947   Length:126314      Min.   :  1.00   Min.   :0.00000  
 1st Qu.:1975   Class :character   1st Qu.: 22.00   1st Qu.:0.00000  
 Median :1990   Mode  :character   Median : 43.00   Median :0.00000  
 Mean   :1988                      Mean   : 43.53   Mean   :0.06386  
 3rd Qu.:2003                      3rd Qu.: 65.00   3rd Qu.:0.00000  
 Max.   :2015                      Max.   :108.00   Max.   :1.00000  
   team_id            fran_id               pts            elo_i     
 Length:126314      Length:126314      Min.   :  0.0   Min.   :1092  
 Class :character   Class :character   1st Qu.: 93.0   1st Qu.:1417  
 Mode  :character   Mode  :character   Median :103.0   Median :1501  
                                       Mean   :102.7   Mean   :1495  
                                       3rd Qu.:112.0   3rd Qu.:1576  
                                       Max.   :186.0   Max.   :1853  
     elo_n        win_equiv        opp_id            opp_fran        
 Min.   :1086   Min.   :10.15   Length:126314      Length:126314     
 1st Qu.:1417   1st Qu.:34.10   Class :character   Class :character  
 Median :1501   Median :42.11   Mode  :character   Mode  :character  
 Mean   :1495   Mean   :41.71                                        
 3rd Qu.:1576   3rd Qu.:49.64                                        
 Max.   :1853   Max.   :71.11                                        
    opp_pts        opp_elo_i      opp_elo_n    game_location     
 Min.   :  0.0   Min.   :1092   Min.   :1086   Length:126314     
 1st Qu.: 93.0   1st Qu.:1417   1st Qu.:1417   Class :character  
 Median :103.0   Median :1501   Median :1501   Mode  :character  
 Mean   :102.7   Mean   :1495   Mean   :1495                     
 3rd Qu.:112.0   3rd Qu.:1576   3rd Qu.:1576                     
 Max.   :186.0   Max.   :1853   Max.   :1853                     
 game_result           forecast          notes          
 Length:126314      Min.   :0.02045   Length:126314     
 Class :character   1st Qu.:0.32799   Class :character  
 Mode  :character   Median :0.50000   Mode  :character  
                    Mean   :0.50000                     
                    3rd Qu.:0.67201                     
                    Max.   :0.97955                     

Convert data to plot

# Keep only one row per game (home team rows)
home_games <- nba %>%
  mutate(forecast = as.numeric(forecast)) %>%
  filter(year_id >= 2010,
         game_location == "H",
         !is.na(forecast)) %>%
  transmute(
    p_home = forecast,
    p_away = 1 - forecast
  )

# Convert format to plot
odds_long <- home_games %>%
  pivot_longer(
    cols = c(p_home, p_away),
    names_to = "side",
    values_to = "win_prob"
  ) %>%
  mutate(
    side = recode(side,
                  p_home = "Home Team",
                  p_away = "Away Team")
  )

Prompt 2:

Can you write ggplot code to plot win probability distributions for home vs away teams (2010+), and make it look professional?

Prompt 3:

Add a dashed vertical reference line at 50%.

Plot

p <- ggplot(odds_long, aes(x = win_prob, fill = side)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 45,
    alpha = 0.65,
    position = "identity"
  ) +
  geom_vline(
    xintercept = 0.5,
    linetype = "dashed",
    linewidth = 0.7,
    color = "gray40"
  ) +
  scale_x_continuous(
    limits = c(0, 1),
    labels = percent_format(accuracy = 1)
  ) +
  labs(
    title = "Winning odds for NBA games (2010+)",
    subtitle = "Pregame Elo win probabilities; home team distribution is shifted higher",
    x = "Win probability",
    y = "Density",
    fill = NULL,
    caption = "Source: FiveThirtyEight nba-elo (nbaallelo.csv)"
  ) +
  theme_classic(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12),
    legend.position = "top",
    axis.title = element_text(face = "bold")
  )

# Print the plot
p
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_bar()`).

Original Plot

Save plot

ggsave(
  filename = "reproduced-odds.png",
  plot = p,
  width = 8,
  height = 5,
  dpi = 300
)
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_bar()`).

Publication Quality Table

Prompt 1

Write R code to create a professional table with: bold/italic text, conditional formatting, a spanner header, a caption, and a footnote.

Prompt 2

Add an advanced feature like sparklines and make the formatting automatic.

Convert data to put into the table

# Game-level home win probability (one row per game)
games_home <- nba %>%
  mutate(forecast = as.numeric(forecast)) %>%
  filter(game_location == "H", !is.na(forecast)) %>%
  transmute(
    year_id,
    p_home = forecast,
    p_away = 1 - forecast
  )

# Summary stats by season
season_summary <- games_home %>%
  group_by(year_id) %>%
  summarise(
    games = n(),
    mean_home = mean(p_home),            
    mean_away = mean(p_away),        
    median_home = median(p_home),
    home_adv = mean_home - mean_away,    # == 2*mean_home - 1
    .groups = "drop"
  )

# Home team distribution over probability bins per season 
bin_edges <- seq(0, 1, by = 0.05)

spark_data <- games_home %>%
  mutate(bin = cut(p_home, breaks = bin_edges, include.lowest = TRUE)) %>%
  count(year_id, bin) %>%
  group_by(year_id) %>%
  summarise(spark = list(n), .groups = "drop")

tbl_data <- season_summary %>%
  left_join(spark_data, by = "year_id") %>%
  arrange(desc(year_id)) %>%
  slice(1:12) 

Table

tbl_data %>%
  gt(rowname_col = "year_id") %>%
  tab_header(
    title = md("**Home-court advantage in FiveThirtyEight Elo forecasts**"),
    subtitle = "Season-level summary of pregame win probabilities (home vs away)"
  ) %>%
  tab_caption(
    md("*Caption:* This table summarizes how strongly the Elo model favored home teams across seasons. Higher **Home advantage** indicates larger model-implied home edge.")
  ) %>%
  tab_spanner(
    label = md("**Average win probability**"),
    columns = c(mean_home, mean_away, median_home)
  ) %>%
  cols_label(
    mean_home = "Home (mean)",
    mean_away = "Away (mean)",
    median_home = "Home (median)",
    home_adv = "Home advantage",
    games = "Games",
    spark = "Home odds shape"
  ) %>%
  fmt_percent(columns = c(mean_home, mean_away, median_home, home_adv), decimals = 1) %>%
  fmt_number(columns = games, decimals = 0) %>%

  tab_stubhead(label = md("**Season**<br><span style='font-weight:normal; font-style:italic;'>year_id</span>")) %>%

  data_color(
    columns = home_adv,
    method = "numeric",
    palette = c("#f7fbff", "#6baed6", "#08306b")
  ) %>%
  data_color(
    columns = mean_home,
    method = "numeric",
    palette = c("#fff5f0", "#fc9272", "#cb181d")
  ) %>%

  tab_style(
    style = list(cell_text(weight = "bold")),
    locations = cells_body(
      columns = home_adv,
      rows = home_adv == max(home_adv, na.rm = TRUE)
    )
  ) %>%

  gt_plt_sparkline(
    column = spark,
    same_limit = TRUE
  ) %>%
 
# Footnote
  tab_footnote(
    footnote = "Forecast is the Elo-based pregame win probability in nbaallelo.csv. Away probability is computed as 1 − home probability using the home-team row per game.",
    locations = cells_column_labels(columns = c(mean_home, mean_away))
  ) %>%

  # Clean up
  tab_options(
    table.font.size = px(13),
    data_row.padding = px(3),
    heading.title.font.size = px(18),
    heading.subtitle.font.size = px(13)
  )
Caption: This table summarizes how strongly the Elo model favored home teams across seasons. Higher Home advantage indicates larger model-implied home edge.
Home-court advantage in FiveThirtyEight Elo forecasts
Season-level summary of pregame win probabilities (home vs away)
Season
year_id
Games
Average win probability
Home advantage Home odds shape
Home (mean)1 Away (mean)1 Home (median)
2015 1,309 62.1% 37.9% 64.4% 24.2% 4.0
2014 1,318 62.2% 37.8% 63.8% 24.4% 5.0
2013 1,313 62.2% 37.8% 64.4% 24.5% 6.0
2012 1,074 62.4% 37.6% 64.6% 24.9% 1.0
2011 1,309 61.8% 38.2% 64.0% 23.6% 48.0
2010 1,312 61.7% 38.3% 63.7% 23.4% 6.0
2009 1,315 61.9% 38.1% 65.0% 23.9% 4.0
2008 1,316 62.1% 37.9% 64.0% 24.3% 1.0
2007 1,309 62.7% 37.3% 64.3% 25.4% 21.0
2006 1,319 62.4% 37.6% 63.6% 24.8% 27.0
2005 1,314 62.4% 37.6% 63.9% 24.7% 2.0
2004 1,269 62.8% 37.2% 64.9% 25.7% 15.0
1 Forecast is the Elo-based pregame win probability in nbaallelo.csv. Away probability is computed as 1 − home probability using the home-team row per game.