Chapter 2 PreProcessing
The first step for any data driven project is getting to know the data. In this section we will look at the raw data that was provided by the competition and then do a little pre processing that will make the data easier to work with for the rest of the project
2.1 The Raw Data
The raw data from zillow contains the following data (desrciptions from the the data page)
properties_2016.csv - all the properties with their home features for 2016. Note: Some 2017 new properties don’t have any data yet except for their parcelid’s. Those data points should be populated when properties_2017.csv is available.
properties_2017.csv - all the properties with their home features for 2017 (released on 10/2/2017)
train_2016.csv - the training set with transactions from 1/1/2016 to 12/31/2016
train_2017.csv - the training set with transactions from 1/1/2017 to 9/15/2017 (released on 10/2/2017)
sample_submission.csv - a sample submission file in the correct format
zillow_data_dictionary.xlsx - Field Definitions and coded value meanings
2.1.1 Saving Raw Data Using feather
The R binding for the feather data store provides the ability for very fast read and write of data. For speed purposes we will save all raw data into a .feather file format to make all other read faster
library(feather)
library(readr)
dir.create("data-raw/feather")
prop_16 <- read_csv("data-raw/properties_2016.csv")
prop_17 <- read_csv("data-raw/properties_2017.csv")
train_16 <- read_csv("data-raw/train_2016_v2.csv")
train_17 <- read_csv("data-raw/train_2017.csv")
write_feather(prop_16, "data-raw/feather/properties_2016.feather")
write_feather(prop_17, "data-raw/feather/properties_2017.feather")
write_feather(train_16, "data-raw/feather/train_2016_v2.feather")
write_feather(train_17, "data-raw/feather/train_2017.feather")2.2 Renaming Variables
Many of the feature names are not very consistant. To take advatange of helpful functions from the tidyverse set of packages, such as starts_with() and one_of() Let’s rename them to something more consistant and easier to work with.
2.2.1 Renaming properties Features
library(tidyverse)
prop_16 <- read_feather("data-raw/feather/properties_2016.feather")
prop_17 <- read_feather("data-raw/feather/properties_2017.feather")
prop_16 <- prop_16 %>%
rename(
id_parcel = parcelid,
build_year = yearbuilt,
area_basement = basementsqft,
area_patio = yardbuildingsqft17,
area_shed = yardbuildingsqft26,
area_pool = poolsizesum,
area_lot = lotsizesquarefeet,
area_garage = garagetotalsqft,
area_firstfloor_finished_1 = finishedfloor1squarefeet,
area_firstfloor_finished_2 = finishedsquarefeet50,
area_living_finished_calc = calculatedfinishedsquarefeet,
area_base = finishedsquarefeet6,
area_living_finished = finishedsquarefeet12,
area_living_perimeter = finishedsquarefeet13,
area_total = finishedsquarefeet15,
num_unit = unitcnt,
num_story = numberofstories,
num_room = roomcnt,
num_bathroom = bathroomcnt,
num_bedroom = bedroomcnt,
num_bathroom_calc = calculatedbathnbr,
num_bath = fullbathcnt,
num_75_bath = threequarterbathnbr,
num_fireplace = fireplacecnt,
num_pool = poolcnt,
num_garage = garagecarcnt,
region_county = regionidcounty,
region_city = regionidcity,
region_zip = regionidzip,
region_neighbor = regionidneighborhood,
tax_total = taxvaluedollarcnt,
tax_building = structuretaxvaluedollarcnt,
tax_land = landtaxvaluedollarcnt,
tax_property = taxamount,
tax_year = assessmentyear,
tax_delinquency = taxdelinquencyflag,
tax_delinquency_year = taxdelinquencyyear,
zoning_property = propertyzoningdesc,
zoning_landuse = propertylandusetypeid,
zoning_landuse_county = propertycountylandusecode,
str_flag_fireplace = fireplaceflag,
str_flag_tub = hashottuborspa,
str_quality = buildingqualitytypeid,
str_framing = buildingclasstypeid,
str_material = typeconstructiontypeid,
str_deck = decktypeid,
str_story = storytypeid,
str_heating = heatingorsystemtypeid,
str_aircon = airconditioningtypeid,
str_arch_style = architecturalstyletypeid
)
# use 2016 names to rename 17
names(prop_17) <- names(prop_16)2.2.2 renaming train features
trans_16 <- read_feather("data-raw/feather/train_2016_v2.feather")
trans_17 <- read_feather("data-raw/feather/train_2017.feather")
trans_16 <- trans_16 %>%
rename(
id_parcel = parcelid,
date = transactiondate,
log_error = logerror
)
# use 2016 names to rename 17
names(trans_17) <- names(trans_16)2.2.3 Basic Transformations
Based on the definitions in the zillow_data_dictionary.xlsx we can recode some of the features to have be more interpretable while we are exploring.
2.2.3.1 Properties
library(forcats)
prop_16 <- prop_16 %>%
mutate(
tax_delinquency = ifelse(tax_delinquency == "Y", "Yes", "No") %>%
as_factor(),
str_flag_fireplace = ifelse(str_flag_fireplace == "Y", "Yes", "No") %>%
as_factor(),
str_flag_tub = ifelse(str_flag_tub == "Y", "Yes", "No") %>%
as_factor(),
zoning_landuse = factor(zoning_landuse, levels = sort(unique(zoning_landuse))),
zoning_landuse = fct_recode(zoning_landuse,
"Commercial/Office/Residential Mixed Used" = "31",
"Multi-Story Store" = "46",
"Store/Office (Mixed Use)" = "47",
"Duplex (2 Units Any Combination)" = "246",
"Triplex (3 Units Any Combination)" = "247",
"Quadruplex (4 Units Any Combination)" = "248",
"Residential General" = "260",
"Single Family Residential" = "261",
"Rural Residence" = "262",
"Mobile Home" = "263",
"Townhouse" = "264",
"Cluster Home" = "265",
"Condominium" = "266",
"Cooperative" = "267",
"Row House" = "268",
"Planned Unit Development" = "269",
"Residential Common Area" = "270",
"Timeshare" = "271",
"Bungalow" = "273",
"Zero Lot Line" = "274",
"Manufactured Modular Prefabricated Homes" = "275",
"Patio Home" = "276",
"Inferred Single Family Residential" = "279",
"Vacant Land - General" = "290",
"Residential Vacant Land" = "291"
)
)
prop_17 <- prop_17 %>%
mutate(
tax_delinquency = ifelse(tax_delinquency == "Y", "Yes", "No") %>%
as_factor(),
str_flag_fireplace = ifelse(str_flag_fireplace == "Y", "Yes", "No") %>%
as_factor(),
str_flag_tub = ifelse(str_flag_tub == "Y", "Yes", "No") %>%
as_factor(),
zoning_landuse = factor(zoning_landuse, levels = sort(unique(zoning_landuse))),
zoning_landuse = fct_recode(zoning_landuse,
"Commercial/Office/Residential Mixed Used" = "31",
"Multi-Story Store" = "46",
"Store/Office (Mixed Use)" = "47",
"Duplex (2 Units Any Combination)" = "246",
"Triplex (3 Units Any Combination)" = "247",
"Quadruplex (4 Units Any Combination)" = "248",
"Residential General" = "260",
"Single Family Residential" = "261",
"Rural Residence" = "262",
"Mobile Home" = "263",
"Townhouse" = "264",
"Cluster Home" = "265",
"Condominium" = "266",
"Cooperative" = "267",
"Row House" = "268",
"Planned Unit Development" = "269",
"Residential Common Area" = "270",
"Timeshare" = "271",
"Bungalow" = "273",
"Zero Lot Line" = "274",
"Manufactured Modular Prefabricated Homes" = "275",
"Patio Home" = "276",
"Inferred Single Family Residential" = "279",
"Vacant Land - General" = "290",
"Residential Vacant Land" = "291"
)
)2.2.3.2 Transactions
The transactions tables are where our response variable log_error (name changed from orginal logerror) and the dates of the transactions are recorded.
To make them easier to work with, let’s combine all the transactions into one table and create a few basic transformations of the date (name changed from original transactiondate)
library(lubridate)
# combine transactions into one data frame
trans <- trans_16 %>%
bind_rows(trans_17) %>%
mutate(
abs_log_error = abs(log_error),
year = year(date),
month_year = make_date(year(date), month(date)),
month = month(date, label = TRUE),
week = floor_date(date, unit = "week"),
week_of_year = week(date),
week_since_start = (min(date) %--% date %/% dweeks()) + 1,
wday = wday(date, label = TRUE),
day_of_month = day(date)
)Save our output
write_feather(prop_16, "data/properties_16.feather")
write_feather(prop_17, "data/properties_17.feather")
write_feather(trans, "data/transactions.feather")2.3 Extracting Geographic Information
As noted in the 1 we are going to break from the rules of the competition and use external information to (hopefully) help improve our predictions. Since the data we are using relate to locations of individual properties and we have each of their geographic coordinates, latitude and longitude let’s use those to get the U.S. Census Geographies they are apart of that we can make use of when adding external information.
The original data contain the fields rawcensustractandblock and censustractandblock but after trying to parse those into a usable format and failing, I figured it was just easier to use the latitude and longitude fields and then join that to the Census information.
library(sf)
library(tidycensus)
# NAD83 / California zone 5 (ftUS)
# https://epsg.io/2229
crs_id <- 2229
api_key <- Sys.getenv("CENSUS_API_KEY")
census_api_key(api_key)
# some obs have no data at all included lat/long
# the original lat / lon are mulitpled by 10e5 so divide to
# get lat lon back when converting to sf
properties <- read_feather("data-raw/properties_2017") %>%
filter(!is.na(latitude)) %>%
mutate(
lat = latitude / 10e5,
lon = longitude / 10e5
) %>%
st_as_sf(
coords = c("lon", "lat"),
crs = 4326, # WGS 84
remove = FALSE # keep lat/long fields
) %>%
st_transform(crs_id)
census_bgs <- get_acs(
geography = "block group",
variables = "B19013_001",
state = "CA",
county = c("Los Angeles", "Orange", "Ventura"),
geometry = TRUE,
keep_geo_vars = TRUE
) %>%
st_transform(crs_id)
# inner join
# due to lat / lon error some points didn't intersect
# with block groups left = FALSE is inner join
properties_geo <- properties %>%
st_join(census_bgs, left = FALSE)
# find all of the points that didn't intersect
# buffer them and then join to closest block
# then add back the already joined points
# the buffer distance I just played around with until
# all points joined with a block group
properties_geo <- properties %>%
filter(!parcelid %in% properties_geo$parcelid) %>%
st_buffer(dist = 1500) %>% # units are in us-ft based on crs_id
st_join(census_bgs, left = FALSE, largest = TRUE) %>%
rbind(properties_geo)
# remove geometry b/c feather can't store lists
properties_geo <- properties_geo %>%
select(
id_parcel = parcelid,
id_geo_state = STATEFP,
id_geo_county = COUNTYFP,
id_geo_tract = TRACTCE,
id_geo_bg = BLKGRPCE,
id_geo_bg_fips = GEOID,
id_geo_bg_name = NAME.y,
geo_bg_arealand = ALAND,
geo_bg_areawater = AWATER,
lat,
lon
) %>%
mutate(
id_geo_county_fips = paste0(id_geo_state, id_geo_county),
id_geo_tract_fips = paste0(id_geo_county_fips, id_geo_tract),
id_geo_county_name = factor(id_geo_county) %>%
fct_recode(
"Los Angeles" = "037",
"Orange" = "059",
"Ventura" = "111"
)
)Now save our geographic features
# remove geometry b/c feather can't store lists
# add back in when needed from lat lon
properties_geo$geometry <- NULL
write_feather(properties_geo, "data/properties_geo_only.feather")