There are over 1.8 million parcels in Cook County, Illinois [1]. Each of these parcels is assessed every three years using three to five years of prior sales information [1]. For this analysis, we will look at single-family home assessments within three of the 36 political townships: Lemont (19), Palos (30), and Orland (28). Using data provided on the Cook County data portal, we will create two models: the first will predict the likelihood of overassessment for 2022, and the second will predict home value for 2023.
Inclusion Criteria
We have 4 inclusion criteria:
Year - 2021, 2022, 2023
Class - 202, 203, 204, 205, 206, 207, 208, 209, 210, 234, 278. These classes correspond to single-family homes (a filtering sketch follows below).
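As a rough illustration, the inclusion criteria could be applied with a simple dplyr filter. This is only a sketch: the raw table name parcels_raw is a placeholder for whichever table holds the unfiltered parcels.
Code
library(dplyr)

# Hypothetical sketch: apply the year and class inclusion criteria
# to a raw parcel table (`parcels_raw` is a placeholder name).
sfh_classes <- c("202", "203", "204", "205", "206",
                 "207", "208", "209", "210", "234", "278")

sfh2123 <- parcels_raw %>%
  filter(
    year %in% 2021:2023,    # tax/sale years of interest
    class %in% sfh_classes  # single-family home classes
  )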
sfh2123_sales_AT <- sfh2123_sales_dt %>%
  lazy_dt() %>%
  left_join(sfh2123_assessment_dt, by = c("pin", "township_code", "class", "year" = "tax_year")) %>%
  left_join(sfh2123_characteristics_dt, by = c("pin", "township_code", "class", "year" = "tax_year")) %>%
  left_join(sfh2123_universe_dt, by = c("pin", "neighborhood_code", "township_code", "township_name", "class", "year" = "tax_year")) %>%
  mutate(sale_price_ratio = 10 * certified_tot / sale_price) %>%
  collect()

sfh2123_assessment_AT <- sfh2123_assessment_dt %>%
  lazy_dt() %>%
  left_join(sfh2123_characteristics_dt, by = c("pin", "township_code", "class", "tax_year")) %>%
  left_join(sfh2123_universe_dt, by = c("pin", "township_code", "township_name", "class", "tax_year")) %>%
  collect()
Check For Missingness
Code
library(ggplot2)

sfh2123_sales_AT %>%
  mutate(id = row_number()) %>%
  tidyr::gather(-id, key = "key", value = "val") %>%
  mutate(isna = is.na(val)) %>%
  ggplot(aes(key, id, fill = isna)) +
  geom_raster(alpha = 0.8) +
  scale_fill_manual(
    name = "",
    values = c("steelblue", "tomato3"),
    labels = c("Present", "Missing")
  ) +
  labs(x = "Variable", y = "Row Number", title = "Missing Values for Housing Sales") +
  coord_flip()
It seems we have several measures that are either entirely missing or have many missing values. This is acceptable because we won't be using all of the variables; limiting the predictor set reduces the complexity and increases the interpretability of our model.
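As a quick numeric complement to the plot, the count of missing values per column can be tallied directly; this sketch just uses dplyr and tidyr on the joined sales table.
Code
# Count missing values per column and sort from most to least missing.
sfh2123_sales_AT %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "n_missing") %>%
  arrange(desc(n_missing))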
There seems to be a high level of correlation between many of the housing characteristics, so a model that takes multicollinearity into account may be a good idea.
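One quick way to check this is a correlation matrix of the numeric housing characteristics; below is a sketch using base cor() on a handful of the columns explored in the plots that follow.
Code
# Rough multicollinearity check among housing characteristics.
sfh2123_sales_AT %>%
  select(building_sqft, land_sqft, num_rooms, num_bedrooms,
         num_full_baths, num_half_baths, year_built) %>%
  cor(use = "pairwise.complete.obs") %>%
  round(2)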
library(scales)

sfh2123_sales_AT %>%
  ggplot(aes(x = year_built, y = sale_price)) +
  geom_point(size = 0.1) +
  geom_smooth(method = "gam", color = "springgreen4") +
  scale_y_continuous(labels = label_currency()) +
  labs(
    title = "Single Family Home Year Built \nand Sale Price in Cook County, IL",
    x = "Year Built",
    y = "Sale Price"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
More recently built houses seem to sell for more.
Code
sfh2123_sales_AT %>%
  ggplot(aes(x = building_sqft, y = sale_price)) +
  geom_point(size = 0.1) +
  geom_smooth(method = "gam", color = "springgreen4") +
  scale_y_continuous(labels = label_currency()) +
  labs(
    title = "Single Family Home Building Square Footage \nand Sale Price in Cook County, IL",
    x = "Building Square Footage",
    y = "Sale Price"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
Houses with larger buildings tend to sell for more.
Code
sfh2123_sales_AT %>%
  ggplot(aes(x = land_sqft, y = sale_price)) +
  geom_point(size = 0.1) +
  geom_smooth(method = "gam", color = "springgreen4") +
  scale_y_continuous(labels = label_currency()) +
  labs(
    title = "Single Family Home Land Square Footage \nand Sale Price in Cook County, IL",
    x = "Land Square Footage",
    y = "Sale Price"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
The relationship here isn’t entirely clear, but it seems that more land sells for more, up to a point.
Code
sfh2123_sales_AT %>%
  ggplot() +
  geom_boxplot(aes(x = num_bedrooms, y = sale_price, fill = factor(num_bedrooms))) +
  scale_x_continuous(n.breaks = 9) +
  scale_y_continuous(labels = label_currency()) +
  labs(
    title = "Single Family Home Number of Bedrooms \nand Sale Price in Cook County, IL",
    x = "Number of Bedrooms",
    y = "Sale Price"
  ) +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
Houses with more bedrooms tend to sell for more.
Code
sfh2123_sales_AT %>%
  ggplot() +
  geom_boxplot(aes(x = num_rooms, y = sale_price, fill = factor(num_rooms))) +
  geom_smooth(aes(x = num_rooms, y = sale_price), method = "gam", color = "black") +
  scale_x_continuous(n.breaks = 20) +
  scale_y_continuous(labels = label_currency()) +
  labs(
    title = "Single Family Home Number of Rooms \nand Sale Price in Cook County, IL",
    x = "Number of Rooms",
    y = "Sale Price"
  ) +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
Houses with more rooms tend to sell for more.
Code
sfh2123_sales_AT %>%
  ggplot() +
  geom_boxplot(aes(x = num_full_baths + num_half_baths * 0.5, y = sale_price, fill = factor(num_full_baths + num_half_baths * 0.5))) +
  geom_smooth(aes(x = num_full_baths + num_half_baths * 0.5, y = sale_price), method = "gam", color = "black") +
  scale_x_continuous(n.breaks = 20) +
  scale_y_continuous(labels = label_currency()) +
  labs(
    title = "Single Family Home Total Number of Baths \nand Sale Price in Cook County, IL",
    x = "Total Number of Baths",
    y = "Sale Price"
  ) +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
Houses with more bathrooms tend to sell for more.
Code
sfh2123_sales_AT %>%
  ggplot() +
  geom_boxplot(aes(x = cmap_walkability_total_score, y = sale_price, fill = factor(cmap_walkability_total_score))) +
  geom_smooth(aes(x = cmap_walkability_total_score, y = sale_price), method = "gam", color = "black") +
  scale_x_continuous(n.breaks = 20) +
  scale_y_continuous(labels = label_currency()) +
  labs(
    title = "Single Family Home CMAP Walkability and \nSale Price in Cook County, IL",
    x = "Total Walkability Score",
    y = "Sale Price"
  ) +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
Houses in less walkable communities tend to sell for more. Although individuals might prefer to live in more walkable communities, the kind of large single-family homes that sell for more actually contribute toward reducing walkability, which is most likely why we see a downward trend.
Mapping Our Outcome Variables
Let us take a look at how our outcome variables are distributed spatially. Clustering might indicate that a spatial modeling technique is required.
It seems that there is clustering of similarly priced homes within our three townships. Lemont has the highest median sale price, while Palos and Orland have similar median sale prices. Perhaps a spatial modeling technique like a spatial Bayesian or conditional autoregressive model is warranted.
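Before committing to such a model, the clustering could be quantified with a global Moran's I test from spdep. The sketch below assumes the sales table has coordinate columns named lon and lat, which may differ from the actual column names in the parcel universe data.
Code
library(spdep)

# Sketch: global Moran's I for sale price over a k-nearest-neighbor graph.
# Coordinate column names `lon` and `lat` are assumptions.
sales_pts <- sfh2123_sales_AT %>%
  filter(!is.na(sale_price), !is.na(lon), !is.na(lat))

coords <- cbind(sales_pts$lon, sales_pts$lat)
nb     <- knn2nb(knearneigh(coords, k = 5))  # 5 nearest neighbors
lw     <- nb2listw(nb, style = "W")          # row-standardized weights

moran.test(sales_pts$sale_price, lw)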
For our over/under assessment model, I will define overassessment as a sale_price_ratio greater than the median. Theoretically, this threshold should be 1, but parcels are so routinely underassessed relative to their true sale value that a threshold of 1 would place almost all houses in the underassessed category. I also chose a two-class outcome to avoid the complexities of multiclass classification models. I considered building a regression model that predicts assessment ratios and simply discretizing the predictions later, but opted against it.
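A minimal sketch of how that binary outcome could be constructed; the cutoff and the factor labels ("over"/"under") are my own choices here.
Code
# Sketch: flag parcels whose assessment ratio exceeds the median ratio.
median_ratio <- median(sfh2123_sales_AT$sale_price_ratio, na.rm = TRUE)

sfh2123_sales_AT <- sfh2123_sales_AT %>%
  mutate(
    overassessed = factor(
      if_else(sale_price_ratio > median_ratio, "over", "under"),
      levels = c("over", "under")
    )
  )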
It is also important to note that our assessment model will only include data from 2022, while our sale price model will include data from 2021 and 2022. Although prices may differ slightly between these years, the relationships between home characteristics and sale price should be broadly similar.
I chose to use variables that are typically considered when buying a house, but we could certainly construct a workflow that evaluates models with different sets of predictors.
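If we wanted to compare predictor sets more systematically, the workflowsets package from tidymodels can pair several recipes with one model specification. The recipes below are illustrative only, not the exact predictor sets used in this analysis.
Code
library(tidymodels)
library(workflowsets)

# Illustrative recipes with different predictor sets (assumed, not the ones used here).
rec_basic <- recipe(sale_price ~ building_sqft + num_bedrooms + num_full_baths,
                    data = sfh2123_sales_AT)
rec_full  <- recipe(sale_price ~ building_sqft + land_sqft + num_rooms +
                      num_bedrooms + num_full_baths + year_built,
                    data = sfh2123_sales_AT)

wf_set <- workflow_set(
  preproc = list(basic = rec_basic, full = rec_full),
  models  = list(lm = linear_reg())
)
workflow_map() could then fit each recipe/model combination over the same resamples for comparison.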
To reduce model tuning time, we will use a repeated-measures ANOVA racing approach to eliminate tuning parameter combinations that are unlikely to yield the best results.
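This racing approach is implemented in the finetune package. A sketch of how it might be wired up, assuming a tunable workflow (wf) and cross-validation folds (folds) defined elsewhere; the grid size and metrics are placeholders.
Code
library(finetune)

# Sketch: racing with repeated-measures ANOVA drops poor tuning candidates early.
# `wf` and `folds` are assumed objects (workflow and CV folds) defined elsewhere.
race_results <- tune_race_anova(
  wf,
  resamples = folds,
  grid      = 25,
  metrics   = metric_set(rmse, mae),
  control   = control_race(verbose_elim = TRUE)
)

show_best(race_results, metric = "mae")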
It seems the average deviation of our predicted prices from the true sale price is about $97,500. That is pretty high, all things considered; we definitely have quite a lot of refining to do.
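If this figure is the mean absolute error, it can be computed with yardstick roughly as follows; the prediction tibble name (test_aug) and the metric choice are assumptions on my part.
Code
library(yardstick)

# Sketch: mean absolute error of predicted vs. actual sale price.
# `test_aug` is a placeholder for a tibble of test-set predictions with a `.pred` column.
test_aug %>%
  mae(truth = sale_price, estimate = .pred)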
Building square footage is by far the most important variable in our model. This makes sense given our prior EDA and basic intuition, and the results line up well with what one would expect.
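Variable importance like this can be pulled from a fitted tidymodels workflow with the vip package; a quick sketch, where final_fit is assumed to be the final fitted workflow object.
Code
library(vip)

# Sketch: plot the top variable importances from the fitted model.
# `final_fit` is a placeholder for the final fitted workflow.
final_fit %>%
  extract_fit_parsnip() %>%
  vip(num_features = 10)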
Part B
We are going to conduct “out-of-sample” predictions for the entire year of 2023, but first, let’s try to create some new features that may allow us to improve on our assessments.
Code
sfh_assessment_fe <- sfh2123_assessment_dt %>%
  lazy_dt() %>%
  left_join(sfh2123_characteristics_dt, by = c("pin", "township_code", "class", "tax_year")) %>%
  left_join(sfh2123_universe_dt, by = c("pin", "tax_year", "class", "township_code", "township_name")) %>%
  left_join(sfh2123_sales_dt, by = c("pin", "township_code", "class", "neighborhood_code", "tax_year" = "year")) %>%
  collect()
We are going to get some age and education features from the ACS.
Code
library(tidycensus)

acs2022vars <- load_variables(2022, "acs5")
tables <- list("B01001", "B15003")

acs2022vars <- acs2022vars %>%
  mutate(table = str_sub(name, 1, 7)) %>%
  filter(table %in% str_c(tables, "_")) %>%
  mutate(table = str_sub(table, 1, 6)) %>%
  mutate(name = str_c("estimate_", name)) %>%
  mutate(label = str_sub(label, 11)) %>%
  separate_wider_delim(label, "!!", names = c("1st", "2nd", "3rd", "4th", "5th", "6th"), too_few = "align_start")

ACS_data <- map_df(tables, \(table)
  get_acs(geography = "tract", state = "IL", county = "Cook", table = table, year = 2022)
) %>%
  pivot_wider(names_from = variable, values_from = c(estimate, moe))

# gets the sum of ACS estimates for a single tract
get_var_rowsums <- function(ACS_data, var_df) {
  ACS_data %>%
    select(., var_df %>% select(name) %>% unlist() %>% matches()) %>%
    rowSums()
}

processed_ACS_data <- ACS_data %>%
  mutate(
    TotalPop = get_var_rowsums(., acs2022vars %>% filter(table == "B01001", `1st` == "Total:", is.na(`2nd`))),
    Age00_17 = get_var_rowsums(., acs2022vars %>% filter(table == "B01001", `3rd` %in% c("Under 5 years", "5 to 9 years", "10 to 14 years", "15 to 17 years"), is.na(`4th`))),
    Age65p = get_var_rowsums(., acs2022vars %>% filter(table == "B01001", `3rd` %in% c("65 and 66 years", "67 to 69 years", "70 to 74 years", "75 to 79 years", "80 to 84 years", "85 years and over"), is.na(`4th`))),
    Pct00_17 = Age00_17 / TotalPop,
    Pct65p = Age65p / TotalPop,
    EduHS = get_var_rowsums(., acs2022vars %>% filter(table == "B15003", `2nd` %in% c("Regular high school diploma", "GED or alternative credential", "Some college, less than 1 year", "Some college, 1 or more years, no degree"), is.na(`3rd`))),
    TotalEdu = get_var_rowsums(., acs2022vars %>% filter(table == "B15003", `1st` == "Total:", is.na(`2nd`))),
    EduAS = get_var_rowsums(., acs2022vars %>% filter(table == "B15003", `2nd` == "Associate's degree", is.na(`3rd`))),
    EduBA = get_var_rowsums(., acs2022vars %>% filter(table == "B15003", `2nd` == "Bachelor's degree", is.na(`3rd`))),
    EduGA = get_var_rowsums(., acs2022vars %>% filter(table == "B15003", `2nd` %in% c("Master's degree", "Professional school degree", "Doctorate degree"), is.na(`3rd`))),
    PctEdHS = EduHS / TotalEdu,
    PctEduAS = EduAS / TotalEdu,
    PctEduBA = EduBA / TotalEdu,
    PctEduGA = EduGA / TotalEdu
  ) %>%
  select(GEOID, starts_with("Pct"))
We will combine these variables with some of our geographic variables from the geouniverse dataset to try to improve our model.
Code
sfh_assessment_fe <- sfh_assessment_fe %>%
  left_join(processed_ACS_data, by = c("census_tract_geoid" = "GEOID"))

sfh_assessment_fe_known <- sfh_assessment_fe %>%
  filter(!is.na(sale_price))
For some reason, our model has gotten slightly worse. I wonder if the added features are not being incorporated into the model in a meaningful way; they may simply be adding noise.
It’s pretty close, but this is not as good as it might seem: we saw in Part 1 that assessments tend to come in below 85% of the sale price, so this is not a high bar.
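To put a number on this, we could compare the median ratio of assessed and modeled prices to the actual sale price. A rough sketch using the augmented prediction table (s_aug) from the chunk below, with the same certified_tot and .pred columns it relies on.
Code
# Sketch: median assessment and model ratios relative to the actual sale price.
s_aug %>%
  summarise(
    median_assessed_ratio = median(10 * certified_tot / sale_price, na.rm = TRUE),
    median_modeled_ratio  = median(.pred / sale_price, na.rm = TRUE)
  )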
Nevertheless, let’s see how the distributions of the Cook County assessed prices, our modeled assessment prices, and the actual sale prices compare.
Code
s_aug %>%
  mutate(assessed_price = 10 * certified_tot) %>%
  mutate(modeled_price = .pred) %>%
  select(pin, ends_with("price")) %>%
  pivot_longer(ends_with("price"), names_to = "metric", values_to = "value") %>%
  ggplot(aes(value)) +
  geom_density(position = "stack", na.rm = TRUE) +
  facet_wrap(vars(metric), nrow = 3) +
  scale_x_continuous(labels = label_currency()) +
  labs(title = "Sale Price Distribution of Assessed Prices, Modeled Prices, and Sale Prices")
The distributions of assessed and modeled prices seem to shrink toward the mean. This is most likely because of the unbalanced nature of our data: there are far more homes in the $200,000 to $600,000 range than outside it, so the assessed and modeled prices tend toward that range.