Housing Prices Analysis

The goal of this short project is to document a fairly realistic ML pipeline on the California housing dataset, covering data cleaning, data visualization, and model development.


Requisite modules

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

Import data

housing = pd.read_csv(
  "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv"
)

We can quickly summarize the housing data.

housing.describe()
          longitude      latitude  housing_median_age   total_rooms  total_bedrooms    population    households  median_income  median_house_value
count  20640.000000  20640.000000        20640.000000  20640.000000    20433.000000  20640.000000  20640.000000   20640.000000        20640.000000
mean    -119.569704     35.631861           28.639486   2635.763081      537.870553   1425.476744    499.539680       3.870671       206855.816909
std        2.003532      2.135952           12.585558   2181.615252      421.385070   1132.462122    382.329753       1.899822       115395.615874
min     -124.350000     32.540000            1.000000      2.000000        1.000000      3.000000      1.000000       0.499900        14999.000000
25%     -121.800000     33.930000           18.000000   1447.750000      296.000000    787.000000    280.000000       2.563400       119600.000000
50%     -118.490000     34.260000           29.000000   2127.000000      435.000000   1166.000000    409.000000       3.534800       179700.000000
75%     -118.010000     37.710000           37.000000   3148.000000      647.000000   1725.000000    605.000000       4.743250       264725.000000
max     -114.310000     41.950000           52.000000  39320.000000     6445.000000  35682.000000   6082.000000      15.000100       500001.000000
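
Note that total_bedrooms has a count of 20433 versus 20640 for every other column, so 207 districts are missing that value; we'll impute those with the median later in the pipeline. A quick check (a minimal sketch):

# Count missing values per column
housing.isna().sum()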

Now that we have a quick overview of the data, we can plot histograms of all the numeric variables.

housing.hist(bins = 50, figsize = (12, 8));
plt.show()

We’re going to bin the median_income variable to allow for stratified sampling within income bins.

housing["median_income_bin"] = pd.cut(
  housing["median_income"],
  bins = [0, 1.5, 3, 4.5, 6, np.inf],
  labels = [1, 2, 3, 4, 5]
)
# Plot histogram of counts
housing["median_income_bin"].hist();
plt.show()
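
As a sanity check on the binning, we can also look at the share of districts in each income bin (a quick sketch; the exact proportions come from the data):

# Proportion of districts per income bin
housing["median_income_bin"].value_counts(normalize = True).sort_index()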

Visualize data

Now, let’s visualize the median house prices by plotting them geographically.

(
  housing.
  rename(columns = {"median_house_value": "Median House Value"}).
  plot(
    kind = "scatter",
    x = "longitude",
    y = "latitude",
    alpha = 0.1,
    s = housing["population"]/100,
    c = "Median House Value",
    colormap = "jet",
    colorbar = True,
    title = "Median House Prices by Population",
    xlabel = "Longitude",
    ylabel = "Latitude"
  )
)
plt.show()

Let’s also look at the correlation between a few of our numeric variables.

pd.plotting.scatter_matrix(
  housing[
    [
     "median_house_value",
     "median_income",
     "total_rooms",
     "housing_median_age"
    ]
  ],
  alpha = 0.1,
  figsize = (10, 8)
);
plt.show()
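
The scatter matrix gives a visual read; we can also compute the correlations with the target numerically. A minimal sketch (the numeric_only flag assumes a recent pandas, and skips the string and categorical columns):

# Correlation of each numeric variable with the target
housing.corr(numeric_only = True)["median_house_value"].sort_values(ascending = False)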

Let’s specifically take a look at the relationship between median_income and median_house_value.

(
  housing.
  rename(
    columns = {
      "median_income": "Median Income",
      "median_house_value": "Median House Value"
    }
  ).
  plot(
    kind = "scatter",
    x = "Median Income",
    y = "Median House Value",
    alpha = 0.1
  )
)
plt.show()
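
One thing this plot makes visible is the cap at the dataset's maximum value of 500,001 (see the summary table above), which shows up as a horizontal line of points. We can count how many districts sit at that cap (a quick sketch):

# Number of districts at the capped maximum house value
(housing["median_house_value"] == 500001).sum()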

Feature engineering

We want to create a few new features: rooms_per_household, bedrooms_per_room, and pop_per_household.

housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["pop_per_household"] = housing["population"]/housing["households"]

Modeling prep

Let’s do a 70-30 train-test split, stratified on the income bins we created earlier.

split = StratifiedShuffleSplit(
  n_splits = 1,
  test_size = 0.3,
  random_state = 123
)
for train_idx, test_idx in split.split(housing, housing["median_income_bin"]):
  train = housing.loc[train_idx].drop(
    ["median_income_bin", "median_house_value"],
    axis = 1
  )
  train_labels = housing.loc[train_idx, "median_house_value"]
  test = housing.loc[test_idx].drop(
    ["median_income_bin", "median_house_value"],
    axis = 1
  )
  test_labels = housing.loc[test_idx, "median_house_value"]
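
It's worth verifying that the stratification worked: the income-bin proportions in the training set should closely match those of the full dataset. A quick check (train_idx holds positional indices, which match the default RangeIndex here):

# Compare income-bin proportions: full data vs. training split
print(housing["median_income_bin"].value_counts(normalize = True).sort_index())
print(housing.loc[train_idx, "median_income_bin"].value_counts(normalize = True).sort_index())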

Now, let’s prep the data for modeling.

# Numeric transformations
num_transform = Pipeline(
  [
    ("imputer", SimpleImputer(strategy = "median")),
    ("scaler", StandardScaler())
  ]
)
full_transform = ColumnTransformer(
  [
    (
     "numeric",
     num_transform,
     list(train.drop("ocean_proximity", axis = 1).columns)
    ),
    ("categorical", OneHotEncoder(), ["ocean_proximity"])
  ]
)
train = full_transform.fit_transform(train)
test = full_transform.transform(test)
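
The transformer returns a NumPy array, so the column names are gone; with scikit-learn 1.0+ we can recover them from the fitted ColumnTransformer (a sketch, assuming that version):

# Recover output column names (numeric columns first, then one-hot categories)
full_transform.get_feature_names_out()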

Train models

Let’s quickly write a helper function to print cross-validation (CV) metrics.

def metrics_summary(scores):
  # Print the mean and standard deviation of a set of CV scores
  print("Mean: ", scores.mean().round(2))
  print("Standard Dev.: ", scores.std().round(2))

Linear Regression

The first model is ordinary least-squares linear regression.

# RMSE from 10-fold CV (cross_val_score returns negative MSE, so negate before the square root)
lin_reg_scores = np.sqrt(
  -cross_val_score(
     LinearRegression(),
     train,
     train_labels,
     scoring = "neg_mean_squared_error",
     cv = 10
   )
)
# Summarize score info
metrics_summary(lin_reg_scores)
Mean:  68006.23
Standard Dev.:  2417.77
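
For context, a model that always predicts the mean house value would have an RMSE of roughly the target's standard deviation, about 115,396 per the summary table above, so the linear model is a clear improvement. A quick baseline sketch:

# RMSE of a constant mean-value predictor on the training labels
baseline_rmse = np.sqrt(((train_labels - train_labels.mean()) ** 2).mean())
print("Baseline RMSE: ", baseline_rmse.round(2))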

Decision Tree

The next model is a single decision tree.

# RMSE from 10-fold CV
tree_reg_scores = np.sqrt(
  -cross_val_score(
     DecisionTreeRegressor(),
     train,
     train_labels,
     scoring = "neg_mean_squared_error",
     cv = 10
   )
)
# Summarize score info
metrics_summary(tree_reg_scores)
Mean:  71848.76
Standard Dev.:  1912.97

Random Forest

The unpruned tree actually scores worse under cross-validation than linear regression, a sign of overfitting. Now, we will run a random forest model, which averages many trees to reduce that variance.

# RMSE from 10-fold CV
forest_reg_scores = np.sqrt(
  -cross_val_score(
     RandomForestRegressor(n_estimators = 30),
     train,
     train_labels,
     scoring = "neg_mean_squared_error",
     cv = 10
   )
)
# Summarize score info
metrics_summary(forest_reg_scores)
Mean:  51307.82
Standard Dev.:  1670.66

And finally, we will tune our random forest model via grid search. To keep the run fast, the grid below contains only a single parameter combination; in practice you would list several candidate values per parameter.

# Set up the grid search
forest_reg_grid = GridSearchCV(
  RandomForestRegressor(),
  param_grid = {
    "n_estimators": [30],
    "min_samples_leaf": [5],
    "max_features": ["sqrt"]
  },
  scoring = "neg_mean_squared_error",
  cv = 5,
  return_train_score = True
)
forest_reg_grid_fit = forest_reg_grid.fit(train, train_labels)
# Get CV results
cv_results = forest_reg_grid_fit.cv_results_
# Summarize score info
for mean_sc, param in zip(cv_results["mean_test_score"], cv_results["params"]):
  print("Mean: ", np.sqrt(-mean_sc).round(2))
  print("Parameters: ", param)
Mean:  51546.77
Parameters:  {'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 30}
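
We can also inspect which features the tuned forest leans on. The order of feature_importances_ matches the ColumnTransformer output (numeric columns first, then the one-hot ocean_proximity categories); this sketch assumes scikit-learn 1.0+ for get_feature_names_out:

# Rank features by importance in the best estimator
importances = forest_reg_grid_fit.best_estimator_.feature_importances_
for name, score in sorted(
  zip(full_transform.get_feature_names_out(), importances),
  key = lambda pair: pair[1],
  reverse = True
):
  print(round(score, 3), name)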

Final predictions

Finally, we evaluate the tuned model on the held-out test set.

# Extract model
final_model = forest_reg_grid_fit.best_estimator_
# Make predictions
final_preds = final_model.predict(test)
# Final RMSE
print(
  "Test RMSE: ", np.sqrt(mean_squared_error(test_labels, final_preds)).round(2)
)
Test RMSE:  50199.17