Introduction

The American Community Survey (ACS) Data Profile tables provide pre-calculated percentages and key demographic/economic indicators that are commonly used in analysis. This post demonstrates how to work with these tables using the tidycensus package, focusing on pulling data across multiple geographies efficiently.

pacman::p_load(tidycensus, dplyr, tidyr, DT, sf, GGally, ggplot2, knitr)

Set Up Variable Mappings

First, we’ll define a lookup table of useful variables from the Census Data Profile tables. These tables contain pre-calculated percentages and key demographic/economic indicators that are commonly used in analysis. The lookup table has three columns:

  • Variable Name: Descriptive names we use in our code (e.g., pct_bachelor_degree_plus_adults25plus)
  • Short Label: Short labels for display in plots and tables (e.g., “Bachelor’s+ %”)
  • Census Code: The official Census Bureau variable codes used by tidycensus (e.g., “DP02_0068P”)
# Lookup table for the ACS Data Profile variables used in this post.
#
# Columns (all character):
#   var_name    - descriptive name used in code and as the tidycensus label
#   short_label - compact label for plots and tables
#   census_code - Census Bureau Data Profile variable code
#
# Each row is written as one (var_name, short_label, census_code) triple so a
# name can be checked against its code at a glance. Codes ending in "P" are the
# percent columns; the two income variables use the plain estimate code.
#
# NOTE: Data Profile codes can be renumbered between ACS releases. These are
# intended for the 2023 5-year release; confirm against
# tidycensus::load_variables(2023, "acs5/profile") before changing the year.
dp_vars_lookup <- as.data.frame(
  matrix(
    c(
      # DP02 - Social Characteristics
      "pct_bachelor_degree_plus_adults25plus", "Bachelor's+ %", "DP02_0068P",
      "pct_high_school_graduate_adults25plus", "HS Grad %", "DP02_0067P",
      "pct_english_only_home_pop5plus", "English Only %", "DP02_0113P",
      "pct_spanish_home_pop5plus", "Spanish Home %", "DP02_0116P",
      "pct_broadband_subscription_households", "Broadband %", "DP02_0154P",

      # DP03 - Economic Characteristics
      "per_capita_income", "Per Capita Income", "DP03_0088",
      "median_household_income", "Median HH Income", "DP03_0062",
      "pct_below_poverty_all_people", "Poverty %", "DP03_0128P",
      "pct_unemployed_labor_force", "Unemployment %", "DP03_0009P",
      # Class of worker: government workers. (DP03_0042P is the educational
      # services / health care industry share, not government employment.)
      "pct_gov_employment_employed16plus", "Gov Employment %", "DP03_0048P",
      "pct_snap_benefits_households", "SNAP Benefits %", "DP03_0074P",

      # DP04 - Housing Characteristics
      "pct_owner_occupied_housing_units", "Owner Occupied %", "DP04_0046P",
      # Vehicles available lives in the housing profile. (DP03_0057P is the
      # $50,000-$74,999 household income bracket, not vehicle availability.)
      "pct_no_vehicle_households", "No Vehicle %", "DP04_0058P",

      # DP05 - Demographics
      "pct_under_5_total_pop", "Under 5 %", "DP05_0005P",
      "pct_under_18_total_pop", "Under 18 %", "DP05_0019P",
      "pct_over_65_total_pop", "Over 65 %", "DP05_0024P",

      # Race/Ethnicity
      "pct_hispanic_any_race_total_pop", "Hispanic %", "DP05_0076P",
      "pct_white_non_hispanic_total_pop", "White NH %", "DP05_0082P",
      "pct_black_non_hispanic_total_pop", "Black NH %", "DP05_0083P",
      "pct_asian_non_hispanic_total_pop", "Asian NH %", "DP05_0085P",
      "pct_native_american_non_hispanic_total_pop", "Native Am. NH %", "DP05_0084P"
    ),
    ncol = 3,
    byrow = TRUE,
    dimnames = list(NULL, c("var_name", "short_label", "census_code"))
  ),
  stringsAsFactors = FALSE
)

Here’s the full variable lookup table for reference: all 21 variables span social characteristics, economic conditions, housing tenure, age structure, and race/ethnicity.

# Show the lookup as an interactive table with friendly column headers:
# ten rows per page, search box off, horizontal scrolling on.
dp_vars_lookup |>
  DT::datatable(
    colnames = c("Variable Name", "Short Label", "Census Code"),
    rownames = FALSE,
    options = list(scrollX = TRUE, searching = FALSE, pageLength = 10)
  )

Pull Data for Multiple Geographies

We’ll create a function to pull data for a given geography, then apply it at three levels for New Mexico: the state as a whole, state senate districts, and state house districts. The ACS 5-year estimates provide the most reliable data for smaller geographies:

# Named character vector for tidycensus: the values are the Census codes to
# request and the names (var_name) are the labels attached to them.
dp_vars_expanded <- dp_vars_lookup$census_code
names(dp_vars_expanded) <- dp_vars_lookup$var_name

# Fetch ACS estimates for one geography level and tag every row with a label.
#
# Arguments:
#   geography - geography level, passed to tidycensus::get_acs()
#   geo_label - value stored in the added `geo_type` column
#   variables - variable codes (optionally named), passed to get_acs()
#   year      - passed to get_acs(); defaults to 2023
#   survey    - passed to get_acs(); defaults to "acs5"
#   state     - passed to get_acs(); defaults to "NM"
#
# Returns the get_acs() result with one extra column, `geo_type`.
pull_acs_data <- function(geography, geo_label, variables, year = 2023, survey = "acs5", state = "NM") {
  acs_result <- tidycensus::get_acs(
    geography = geography,
    variables = variables,
    year = year,
    survey = survey,
    state = state
  )

  # Label the rows so pulls for different geography levels can be stacked.
  dplyr::mutate(acs_result, geo_type = geo_label)
}

# Request the same variable set at three geography levels, relying on the
# year/survey/state defaults of pull_acs_data().
nm_state <- pull_acs_data("state", "State", variables = dp_vars_expanded)

nm_senate <- pull_acs_data(
  "state legislative district (upper chamber)",
  "Senate District",
  variables = dp_vars_expanded
)

nm_house <- pull_acs_data(
  "state legislative district (lower chamber)",
  "House District",
  variables = dp_vars_expanded
)

# Stack the three pulls; the geo_type column records which pull a row is from.
all_nm_data <- dplyr::bind_rows(nm_state, nm_senate, nm_house)

Explore the Data

Let’s take a look at the data for one senate district to see what we’ve pulled. The data includes estimates and margins of error for each variable:

# Use whichever senate district appears first in the pulled data as the example
first_district <- nm_senate[["NAME"]][1]

# Show that district's rows (geo_type dropped, since it is constant here)
# in a paged, horizontally scrollable table without a search box.
nm_senate |>
  dplyr::filter(NAME == first_district) |>
  dplyr::select(-geo_type) |>
  DT::datatable(
    caption = paste("ACS Data Profile variables for", first_district),
    rownames = FALSE,
    options = list(scrollX = TRUE, searching = FALSE, pageLength = 5)
  )

Explore Relationships

The pairs plot below shows pairwise relationships among six key variables across all New Mexico Senate Districts. The lower triangle shows scatter plots with a loess curve, the diagonal shows each variable’s density distribution, and the upper triangle reports the Pearson correlation coefficient. Values near ±1 indicate a strong relationship; values near 0 indicate little linear association.

First, we join the lookup table to get short labels, then pivot to wide format for visualization:

# The six indicators to compare. They are referenced by var_name because the
# `variable` column returned by tidycensus carries the names of the request
# vector rather than the raw Census codes.
key_vars <- c(
  "pct_bachelor_degree_plus_adults25plus",
  "pct_snap_benefits_households",
  "pct_below_poverty_all_people",
  "pct_unemployed_labor_force",
  "pct_owner_occupied_housing_units",
  "pct_broadband_subscription_households"
)

# Keep only the key indicators, attach their display labels, then spread to
# one row per district with one column per indicator (named by short label).
senate_wide <- nm_senate |>
  dplyr::filter(variable %in% key_vars) |>
  dplyr::left_join(dp_vars_lookup, by = c("variable" = "var_name")) |>
  dplyr::select(NAME, short_label, estimate) |>
  tidyr::pivot_wider(names_from = short_label, values_from = estimate)

# Panel used in the lower triangle: a scatter plot of the two mapped variables
# with a red loess smoother and its confidence band.
scatter_with_loess <- function(data, mapping, ...) {
  ggplot2::ggplot(data = data, mapping = mapping) +
    ggplot2::geom_point(alpha = 0.6, size = 1) +
    ggplot2::geom_smooth(method = "loess", se = TRUE, color = "red", linewidth = 0.5)
}

# Pairs plot over every column except the first (NAME): scatter + loess below
# the diagonal, densities on it, correlation coefficients above it.
p <- GGally::ggpairs(
  senate_wide,
  columns = 2:ncol(senate_wide),
  title = "Variable relationships across New Mexico Senate Districts",
  upper = list(continuous = GGally::wrap("cor", size = 4)),
  diag = list(continuous = GGally::wrap("densityDiag", alpha = 0.5)),
  lower = list(continuous = scatter_with_loess)
)
p <- p + ggplot2::theme_minimal()

p


Summary

Reach for the Data Profile tables when you need a broad demographic snapshot quickly and do not want to aggregate raw counts from the Detailed Tables yourself. Because the Census Bureau pre-computes the percentages – poverty rates, educational attainment shares, homeownership rates – you avoid a class of derivation errors and the margin-of-error (MOE) arithmetic that comes with deriving them.

This workflow is especially well suited for comparative questions: how do legislative districts, counties, or places rank against each other on a set of standard indicators? It also serves as a fast first pass before committing to a deeper pull from the detailed B- or C-series tables when a specific variable needs finer disaggregation than the Data Profile provides.