cdcdata-exercise – Rebecca Basta Data Analysis Portfolio

This dataset comes from CDC’s Smoking-Attributable Mortality, Morbidity, and Economic Costs (SAMMEC). It looks at estimates of smoking-attributable health costs (millions of dollars)catergorized by states during the years of 2005-2009.

Link to the source: https://data.cdc.gov/Health-Consequences-and-Costs/Smoking-Attributable-Mortality-Morbidity-and-Econo/ezab-8sq5/about_data

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.0     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.2     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)

# File path (your uploaded CSV in this workspace)
file_path <- "sae.csv"

## Read the data
sae_raw <- read.csv(file_path, stringsAsFactors = FALSE)

## Select variables that I will be using
sae_clean <- sae_raw %>%
  select(Year, LocationDesc, MeasureDesc, Data_Value, Data_Value_Unit)

## Clean Data_Value
sae_raw$Data_Value <- as.numeric(gsub(",", "", sae_raw$Data_Value))

## Variables
sae <- sae_raw[, c(
  "Year",
  "LocationAbbr",
  "LocationDesc",
  "TopicDesc",
  "MeasureDesc",
  "Variable",
  "Data_Value",
  "Data_Value_Type",
  "Data_Value_Unit"
)]

## Summary
summary(sae)

      Year      LocationAbbr       LocationDesc        TopicDesc        
 Min.   :2005   Length:1560        Length:1560        Length:1560       
 1st Qu.:2006   Class :character   Class :character   Class :character  
 Median :2007   Mode  :character   Mode  :character   Mode  :character  
 Mean   :2007                                                           
 3rd Qu.:2008                                                           
 Max.   :2009                                                           
 MeasureDesc          Variable           Data_Value       Data_Value_Type   
 Length:1560        Length:1560        Min.   :     8.4   Length:1560       
 Class :character   Class :character   1st Qu.:   105.7   Class :character  
 Mode  :character   Mode  :character   Median :   306.1   Mode  :character  
                                       Mean   :  1545.2                     
                                       3rd Qu.:   842.3                     
                                       Max.   :132459.8                     
 Data_Value_Unit   
 Length:1560       
 Class :character  
 Mode  :character

anyNA(sae$Data_Value)   # check missing numeric values

[1] FALSE

## Data after cleaning
head(sae, 10)

   Year LocationAbbr  LocationDesc                                TopicDesc
1  2009           RI  Rhode Island Smoking-Attributable Expenditures (SAEs)
2  2007           SD  South Dakota Smoking-Attributable Expenditures (SAEs)
3  2009           NH New Hampshire Smoking-Attributable Expenditures (SAEs)
4  2006           MD      Maryland Smoking-Attributable Expenditures (SAEs)
5  2009           LA     Louisiana Smoking-Attributable Expenditures (SAEs)
6  2006           TN     Tennessee Smoking-Attributable Expenditures (SAEs)
7  2005           WI     Wisconsin Smoking-Attributable Expenditures (SAEs)
8  2005           LA     Louisiana Smoking-Attributable Expenditures (SAEs)
9  2007           MO      Missouri Smoking-Attributable Expenditures (SAEs)
10 2008           TX         Texas Smoking-Attributable Expenditures (SAEs)
       MeasureDesc           Variable Data_Value     Data_Value_Type
1  Type of Expense              Other       47.2 Millions of Dollars
2  Type of Expense       Nursing Home       29.2 Millions of Dollars
3  Type of Expense              Other       41.4 Millions of Dollars
4  Type of Expense Prescription Drugs      465.2 Millions of Dollars
5  Type of Expense              Other      117.5 Millions of Dollars
6  Type of Expense Prescription Drugs      555.8 Millions of Dollars
7  Type of Expense         Ambulatory      376.0 Millions of Dollars
8  Type of Expense Prescription Drugs      319.0 Millions of Dollars
9  Type of Expense Prescription Drugs      503.5 Millions of Dollars
10 Type of Expense           Hospital     4369.6 Millions of Dollars
   Data_Value_Unit
1                $
2                $
3                $
4                $
5                $
6                $
7                $
8                $
9                $
10               $

##Summarize data
str(sae)

'data.frame':   1560 obs. of  9 variables:
 $ Year           : int  2009 2007 2009 2006 2009 2006 2005 2005 2007 2008 ...
 $ LocationAbbr   : chr  "RI" "SD" "NH" "MD" ...
 $ LocationDesc   : chr  "Rhode Island" "South Dakota" "New Hampshire" "Maryland" ...
 $ TopicDesc      : chr  "Smoking-Attributable Expenditures (SAEs)" "Smoking-Attributable Expenditures (SAEs)" "Smoking-Attributable Expenditures (SAEs)" "Smoking-Attributable Expenditures (SAEs)" ...
 $ MeasureDesc    : chr  "Type of Expense" "Type of Expense" "Type of Expense" "Type of Expense" ...
 $ Variable       : chr  "Other" "Nursing Home" "Other" "Prescription Drugs" ...
 $ Data_Value     : num  47.2 29.2 41.4 465.2 117.5 ...
 $ Data_Value_Type: chr  "Millions of Dollars" "Millions of Dollars" "Millions of Dollars" "Millions of Dollars" ...
 $ Data_Value_Unit: chr  "$" "$" "$" "$" ...

summary(sae)

      Year      LocationAbbr       LocationDesc        TopicDesc        
 Min.   :2005   Length:1560        Length:1560        Length:1560       
 1st Qu.:2006   Class :character   Class :character   Class :character  
 Median :2007   Mode  :character   Mode  :character   Mode  :character  
 Mean   :2007                                                           
 3rd Qu.:2008                                                           
 Max.   :2009                                                           
 MeasureDesc          Variable           Data_Value       Data_Value_Type   
 Length:1560        Length:1560        Min.   :     8.4   Length:1560       
 Class :character   Class :character   1st Qu.:   105.7   Class :character  
 Mode  :character   Mode  :character   Median :   306.1   Mode  :character  
                                       Mean   :  1545.2                     
                                       3rd Qu.:   842.3                     
                                       Max.   :132459.8                     
 Data_Value_Unit   
 Length:1560       
 Class :character  
 Mode  :character

## Distribution by year
sae %>%
  count(Year) %>%
  mutate(percent = n / sum(n) * 100)

  Year   n percent
1 2005 312      20
2 2006 312      20
3 2007 312      20
4 2008 312      20
5 2009 312      20

## Distrubution by state
sae %>%
  count(LocationDesc) %>%
  mutate(percent = n / sum(n) * 100) %>%
  arrange(desc(percent))

           LocationDesc  n  percent
1               Alabama 30 1.923077
2                Alaska 30 1.923077
3               Arizona 30 1.923077
4              Arkansas 30 1.923077
5            California 30 1.923077
6              Colorado 30 1.923077
7           Connecticut 30 1.923077
8              Delaware 30 1.923077
9  District of Columbia 30 1.923077
10              Florida 30 1.923077
11              Georgia 30 1.923077
12               Hawaii 30 1.923077
13                Idaho 30 1.923077
14             Illinois 30 1.923077
15              Indiana 30 1.923077
16                 Iowa 30 1.923077
17               Kansas 30 1.923077
18             Kentucky 30 1.923077
19            Louisiana 30 1.923077
20                Maine 30 1.923077
21             Maryland 30 1.923077
22        Massachusetts 30 1.923077
23             Michigan 30 1.923077
24            Minnesota 30 1.923077
25          Mississippi 30 1.923077
26             Missouri 30 1.923077
27              Montana 30 1.923077
28             Nebraska 30 1.923077
29               Nevada 30 1.923077
30        New Hampshire 30 1.923077
31           New Jersey 30 1.923077
32           New Mexico 30 1.923077
33             New York 30 1.923077
34       North Carolina 30 1.923077
35         North Dakota 30 1.923077
36                 Ohio 30 1.923077
37             Oklahoma 30 1.923077
38               Oregon 30 1.923077
39         Pennsylvania 30 1.923077
40         Rhode Island 30 1.923077
41       South Carolina 30 1.923077
42         South Dakota 30 1.923077
43            Tennessee 30 1.923077
44                Texas 30 1.923077
45        United States 30 1.923077
46                 Utah 30 1.923077
47              Vermont 30 1.923077
48             Virginia 30 1.923077
49           Washington 30 1.923077
50        West Virginia 30 1.923077
51            Wisconsin 30 1.923077
52              Wyoming 30 1.923077

## Histogram showing expenditure
ggplot(sae, aes(x = Data_Value)) +
  geom_histogram(bins = 30) +
    labs(title = "Distribution of Smoking-Attributable Expenditures",
        x = "Expenditure (Millions of Dollars)",
        y = "Count")

mean_value <- mean(sae$Data_Value, na.rm = TRUE)
sd_value   <- sd(sae$Data_Value, na.rm = TRUE)

mean_value

[1] 1545.223

sd_value

[1] 7868.553

summary(sae$Data_Value)

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
     8.4    105.7    306.1   1545.2    842.3 132459.8

The distribution of smoking-attributable expenditures is skewed to the right. Most values show low expenditure levels, while a few observations show large values. The mean will probably be inflated due to the large values.

##Boxplot
ggplot(sae, aes(y = Data_Value)) +
  geom_boxplot() +
  labs(title = "Boxplot of Smoking-Attributable Expenditures")

The boxplot of smoking-attributable expenditures is skewed to the right, with most of the observations having low values, while a few observations have large numbers. The large numbers in the data set increase variability.

##Distribution by year
sae %>%
  group_by(Year) %>%
  summarise(mean_expenditure = mean(Data_Value, na.rm = TRUE)) %>%
  ggplot(aes(x = Year, y = mean_expenditure)) + geom_line() + geom_point() +
  labs(title = "Mean Smoking-Attributable Expenditures by Year",
       x = "Year",
       y = "Mean Expenditure (Millions of Dollars)")

The mean smoking-attributable expenditures increased from 2005 to 2009, showing a positive linear trend over time. There is a consistent upward trend in smoking-related healthcare costs from 2005-2009.

##Distribution by state
ggplot(sae, aes(x = reorder(LocationDesc, Data_Value, median),
                y = Data_Value)) + geom_boxplot() + 
  labs(title = "Distribution of Smoking-Attributable Expenditures by State",
       x = "State",
       y = "Expenditure (Millions of Dollars)")

The boxplot shows a lot of variability in smoking-attributable expenditures across states. Most states have low expenditure values, while a few states have higher expenditure values, which is why the boxplot is skewed to the right. The differences across states could be because of geographic and population factors.

Synthetic data (This section contributed by Cassie Cui)

# AI Assistance:
# I used an LLM to help draft general R code for generating a synthetic dataset with the same column structure as the cleaned data. I described the # panel structure (Year, Location, expense category) and that the outcome variable was right-skewed.

library(dplyr)
library(lubridate)

set.seed(123)

# Define number of rows to generate (match the cleaned data)
n_obs <- nrow(sae)


syn_dat <- data.frame(
  Year            = integer(n_obs),
  LocationAbbr    = character(n_obs),
  LocationDesc    = character(n_obs),
  Variable        = character(n_obs),
  TopicDesc       = character(n_obs),
  MeasureDesc     = character(n_obs),
  Data_Value      = numeric(n_obs),
  Data_Value_Type = character(n_obs),
  Data_Value_Unit = character(n_obs),
  stringsAsFactors = FALSE
)
# Year
year_levels <- sort(unique(sae$Year))
year_prob <- as.numeric(prop.table(table(sae$Year)))
syn_dat$Year <- sample(year_levels, size = n_obs, replace = TRUE, prob = year_prob)

# LocationAbbr
loc_levels <- unique(sae$LocationAbbr)
loc_prob <- as.numeric(prop.table(table(sae$LocationAbbr)))
syn_dat$LocationAbbr <- sample(loc_levels, size = n_obs, replace = TRUE, prob = loc_prob)

# Map Abbr -> Desc (lookup, no join)
loc_tbl <- sae %>% select(LocationAbbr, LocationDesc) %>% distinct()
syn_dat$LocationDesc <- loc_tbl$LocationDesc[match(syn_dat$LocationAbbr, loc_tbl$LocationAbbr)]

# Variable
var_levels <- unique(as.character(sae$Variable))
var_prob <- as.numeric(prop.table(table(as.character(sae$Variable))))
syn_dat$Variable <- sample(var_levels, size = n_obs, replace = TRUE, prob = var_prob)

# Fill label/metadata columns 
syn_dat$TopicDesc <- if ("TopicDesc" %in% names(sae)) {
  sae$TopicDesc[which(!is.na(sae$TopicDesc))[1]]
} else "Smoking-Attributable Expenditures (SAEs)"

syn_dat$MeasureDesc <- if ("MeasureDesc" %in% names(sae)) {
  sae$MeasureDesc[which(!is.na(sae$MeasureDesc))[1]]
} else "Type of Expense"

syn_dat$Data_Value_Type <- if ("Data_Value_Type" %in% names(sae)) {
  sae$Data_Value_Type[which(!is.na(sae$Data_Value_Type))[1]]
} else "Millions of Dollars"

syn_dat$Data_Value_Unit <- if ("Data_Value_Unit" %in% names(sae)) {
  sae$Data_Value_Unit[which(!is.na(sae$Data_Value_Unit))[1]]
} else "$"

# Generate synthetic Data_Value (right-skewed) by Variable
syn_dat$Data_Value <- NA_real_
vars <- unique(var_levels)



for (v in vars) {
  idx <- syn_dat$Variable == v

  x <- sae$Data_Value[as.character(sae$Variable) == v]
  x <- x[is.finite(x) & !is.na(x)]
  x <- pmax(x, 1e-6)  # guard against zeros

  meanlog_v <- mean(log(x))
  sdlog_v   <- sd(log(x))

  syn_dat$Data_Value[idx] <- round(rlnorm(sum(idx), meanlog = meanlog_v, sdlog = sdlog_v), 1)
}

dim(syn_dat)

[1] 1560    9

head(syn_dat)

  Year LocationAbbr LocationDesc           Variable
1 2007           KY     Kentucky Prescription Drugs
2 2009           CO     Colorado         Ambulatory
3 2008           MN    Minnesota              Other
4 2005           ND North Dakota              Other
5 2005           ID        Idaho           Hospital
6 2006           NV       Nevada              Other
                                 TopicDesc     MeasureDesc Data_Value
1 Smoking-Attributable Expenditures (SAEs) Type of Expense     1284.8
2 Smoking-Attributable Expenditures (SAEs) Type of Expense      202.8
3 Smoking-Attributable Expenditures (SAEs) Type of Expense       50.1
4 Smoking-Attributable Expenditures (SAEs) Type of Expense       49.4
5 Smoking-Attributable Expenditures (SAEs) Type of Expense      453.1
6 Smoking-Attributable Expenditures (SAEs) Type of Expense       16.6
      Data_Value_Type Data_Value_Unit
1 Millions of Dollars               $
2 Millions of Dollars               $
3 Millions of Dollars               $
4 Millions of Dollars               $
5 Millions of Dollars               $
6 Millions of Dollars               $

ggplot(syn_dat, aes(x = Data_Value)) +
  geom_histogram(bins = 30) +
  labs(title = "Synthetic Distribution of SAEs")

ggplot(syn_dat, aes(y = Data_Value)) +
  geom_boxplot() +
  labs(title = "Synthetic Boxplot of SAEs")

syn_dat %>%
  group_by(Year) %>%
  summarise(mean_val = mean(Data_Value)) %>%
  ggplot(aes(x = Year, y = mean_val)) +
  geom_line() +
  geom_point() +
  labs(title = "Synthetic Mean by Year")

syn_dat %>%
  group_by(LocationDesc) %>%
  summarise(mean_val = mean(Data_Value)) %>%
  ggplot(aes(x = reorder(LocationDesc, mean_val), y = mean_val)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Synthetic Mean by State")

The synthetic dataset preserves the same panel structure and the overall right-skewed distribution of Data_Value. Because values were simulated independently (within expense categories), the year-to-year mean trend is not forced to match the original increasing pattern, so some fluctuations are expected.

The synthetic data preserves the marginal distributions of Year, Location, and expense category, as well as the overall right-skewed pattern of expenditures. However, because state-level effects were not explicitly modeled, the ranking and spread of mean expenditures across states differ from the original dataset. This reflects the independence assumptions in the simulation.