Working with ACS Data Profile Tables

Introduction

The American Community Survey (ACS) Data Profile tables provide pre-calculated percentages and key demographic/economic indicators that are commonly used in analysis. This post demonstrates how to work with these tables using the tidycensus package, focusing on pulling data across multiple geographies efficiently.

if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

suppressPackageStartupMessages(
  pacman::p_load(
    tidycensus,
    dplyr,
    tidyr,
    DT,
    sf,
    GGally,
    ggplot2,
    knitr
  )
)

Setting up variable mappings

First, we’ll define a lookup table of useful variables from the Census Data Profile tables. These tables contain pre-calculated percentages and key demographic/economic indicators that are commonly used in analysis. The lookup table has three columns:

Variable Name: Descriptive names we use in our code (e.g., pct_bachelor_degree_plus_adults25plus)
Short Label: Short labels for display in plots and tables (e.g., “Bachelor’s+ %”)
Census Code: The official Census Bureau variable codes used by tidycensus (e.g., “DP02_0068P”)

# Lookup table tying each analysis variable to its display label and its ACS
# Data Profile code. Every entry is written as one
# (var_name, short_label, census_code) triple so a name cannot silently drift
# out of alignment with its label or code.
#
# Two codes were corrected from the earlier version of this table:
#   - pct_no_vehicle_households: DP03_0057P is a household-income bracket;
#     "no vehicles available" is DP04_0058P.
#   - pct_gov_employment_employed16plus: DP03_0042P is an industry category
#     (educational services, health care and social assistance); the
#     class-of-worker "government workers" row is DP03_0048P.
#
# NOTE(review): Data Profile codes can be renumbered between ACS releases.
# These target the 2023 5-year profile; confirm against
# tidycensus::load_variables(2023, "acs5/profile") before changing the year.
dp_vars_lookup <- as.data.frame(
  matrix(
    c(
      # DP02 - Social Characteristics
      "pct_bachelor_degree_plus_adults25plus", "Bachelor's+ %", "DP02_0068P",
      "pct_high_school_graduate_adults25plus", "HS Grad %", "DP02_0067P",
      "pct_english_only_home_pop5plus", "English Only %", "DP02_0113P",
      "pct_spanish_home_pop5plus", "Spanish Home %", "DP02_0116P",
      "pct_broadband_subscription_households", "Broadband %", "DP02_0154P",

      # DP03 - Economic Characteristics
      "per_capita_income", "Per Capita Income", "DP03_0088",
      "median_household_income", "Median HH Income", "DP03_0062",
      "pct_below_poverty_all_people", "Poverty %", "DP03_0128P",
      "pct_unemployed_labor_force", "Unemployment %", "DP03_0009P",
      # Vehicle availability is published in DP04 (housing); the row stays
      # here so the table order is unchanged.
      "pct_no_vehicle_households", "No Vehicle %", "DP04_0058P",
      "pct_gov_employment_employed16plus", "Gov Employment %", "DP03_0048P",
      "pct_snap_benefits_households", "SNAP Benefits %", "DP03_0074P",

      # DP04 - Housing Characteristics
      "pct_owner_occupied_housing_units", "Owner Occupied %", "DP04_0046P",

      # DP05 - Demographics
      "pct_under_5_total_pop", "Under 5 %", "DP05_0005P",
      "pct_under_18_total_pop", "Under 18 %", "DP05_0019P",
      "pct_over_65_total_pop", "Over 65 %", "DP05_0024P",

      # Race/Ethnicity
      "pct_hispanic_any_race_total_pop", "Hispanic %", "DP05_0076P",
      "pct_white_non_hispanic_total_pop", "White NH %", "DP05_0082P",
      "pct_black_non_hispanic_total_pop", "Black NH %", "DP05_0083P",
      "pct_asian_non_hispanic_total_pop", "Asian NH %", "DP05_0085P",
      "pct_native_american_non_hispanic_total_pop", "Native Am. NH %",
      "DP05_0084P"
    ),
    ncol = 3,
    # Fill row by row so each triple above becomes one row of the table.
    byrow = TRUE,
    dimnames = list(NULL, c("var_name", "short_label", "census_code"))
  ),
  stringsAsFactors = FALSE
)

Here’s the full variable lookup table for reference:

# Render the lookup as an interactive table: friendly column headers, ten rows
# per page, no search box, and horizontal scrolling when the table is wider
# than the page.
dp_vars_lookup |>
  DT::datatable(
    colnames = c("Variable Name", "Short Label", "Census Code"),
    rownames = FALSE,
    options = list(pageLength = 10, searching = FALSE, scrollX = TRUE)
  )

Pulling data for multiple geographies

We’ll create a function that pulls data for a single geography level, then call it three times: for the state as a whole, for state senate districts, and for state house districts. The ACS 5-year estimates provide the most reliable data for smaller geographies:

# Census codes as a character vector named by var_name; tidycensus uses the
# names of a named `variables` vector as the values of the `variable` column.
dp_vars_expanded <- structure(
  dp_vars_lookup$census_code,
  names = dp_vars_lookup$var_name
)

# Fetch ACS estimates for one geography level and tag every row with a label.
#
# Arguments:
#   geography  geography level, passed straight through to tidycensus::get_acs()
#   geo_label  value written into the added `geo_type` column
#   variables  variable codes (optionally named), passed through to get_acs()
#   year       ACS year (default 2023)
#   survey     ACS product (default "acs5")
#   state      state to query (default "NM")
#
# Returns the get_acs() result with one extra column, `geo_type`.
pull_acs_data <- function(geography,
                          geo_label,
                          variables,
                          year = 2023,
                          survey = "acs5",
                          state = "NM") {
  acs_rows <- tidycensus::get_acs(
    geography = geography,
    variables = variables,
    year = year,
    survey = survey,
    state = state
  )

  # The label lets pulls from different geography levels be stacked and still
  # told apart afterwards.
  dplyr::mutate(acs_rows, geo_type = geo_label)
}

# Pull the same variable set at three geography levels, relying on the
# function's defaults for year, survey, and state.
nm_state <- pull_acs_data("state", "State", variables = dp_vars_expanded)

nm_senate <- pull_acs_data(
  "state legislative district (upper chamber)",
  "Senate District",
  variables = dp_vars_expanded
)

nm_house <- pull_acs_data(
  "state legislative district (lower chamber)",
  "House District",
  variables = dp_vars_expanded
)

# Stack the three pulls into one long table; `geo_type` identifies the source
all_nm_data <- dplyr::bind_rows(list(nm_state, nm_senate, nm_house))

Exploring the data

Let’s take a look at the data for one senate district to see what we’ve pulled. The data includes estimates and margins of error for each variable:

# Use whichever senate district appears first in the pulled data as the example
first_district <- nm_senate[["NAME"]][1]

# Show that district's rows as an interactive table. geo_type is dropped
# because it is the same on every row of nm_senate.
nm_senate |>
  dplyr::filter(NAME == first_district) |>
  dplyr::select(-geo_type) |>
  DT::datatable(
    caption = paste("ACS Data Profile variables for", first_district),
    rownames = FALSE,
    options = list(pageLength = 5, searching = FALSE, scrollX = TRUE)
  )

Exploring relationships

We can explore relationships between variables by creating a pairs plot using the GGally package and New Mexico Senate District data. The pairs plot shows:

Lower triangle: Scatter plots with loess smoothing lines showing relationships between variable pairs
Diagonal: Density plots showing the distribution of each variable
Upper triangle: Correlation coefficients between variable pairs

First, we’ll join the lookup table to get short labels, then pivot to wide format for visualization:

# Variables to compare. These are var_name values because the `variable`
# column carries the names of the named vector passed to tidycensus.
key_vars <- c(
  "pct_bachelor_degree_plus_adults25plus",
  "pct_snap_benefits_households",
  "pct_below_poverty_all_people",
  "pct_unemployed_labor_force",
  "pct_owner_occupied_housing_units",
  "pct_broadband_subscription_households"
)

# Attach short labels, keep the selected variables, and reshape so each
# district is one row with one column per short label.
senate_wide <- nm_senate |>
  dplyr::left_join(dp_vars_lookup, by = c("variable" = "var_name")) |>
  dplyr::filter(variable %in% key_vars) |>
  dplyr::select(NAME, short_label, estimate) |>
  tidyr::pivot_wider(names_from = short_label, values_from = estimate)

# Pairs plot over every column except the first (NAME):
#   upper triangle - correlation coefficients
#   diagonal       - density of each variable
#   lower triangle - scatter plot with a red loess smoother and its band
p <- GGally::ggpairs(
  senate_wide,
  columns = seq.int(2L, ncol(senate_wide)),
  title = "Variable relationships across New Mexico Senate Districts",
  upper = list(continuous = GGally::wrap("cor", size = 4)),
  diag = list(continuous = GGally::wrap("densityDiag", alpha = 0.5)),
  lower = list(continuous = \(data, mapping, ...) {
    scatter <- ggplot2::ggplot(data = data, mapping = mapping) +
      ggplot2::geom_point(alpha = 0.6, size = 1)
    scatter +
      ggplot2::geom_smooth(
        method = "loess",
        se = TRUE,
        color = "red",
        linewidth = 0.5
      )
  })
) +
  ggplot2::theme_minimal()

p

Summary

The Data Profile tables are particularly useful because they contain many commonly-used demographic and economic indicators already calculated as percentages, saving you from having to compute them manually from detailed tables.