# Housing Prices Analysis

The goal of this short project is to document a fairly realistic ML pipeline, including data cleaning, data visualization, and model development.

## Requisite modules

```python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
```

## Import data
```python
housing = pd.read_csv(
    "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv"
)
```

We can quickly summarize the housing data.
```python
housing.describe()
```

| | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value |
|---|---|---|---|---|---|---|---|---|---|
| count | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20433.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 |
| mean | -119.569704 | 35.631861 | 28.639486 | 2635.763081 | 537.870553 | 1425.476744 | 499.539680 | 3.870671 | 206855.816909 |
| std | 2.003532 | 2.135952 | 12.585558 | 2181.615252 | 421.385070 | 1132.462122 | 382.329753 | 1.899822 | 115395.615874 |
| min | -124.350000 | 32.540000 | 1.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 0.499900 | 14999.000000 |
| 25% | -121.800000 | 33.930000 | 18.000000 | 1447.750000 | 296.000000 | 787.000000 | 280.000000 | 2.563400 | 119600.000000 |
| 50% | -118.490000 | 34.260000 | 29.000000 | 2127.000000 | 435.000000 | 1166.000000 | 409.000000 | 3.534800 | 179700.000000 |
| 75% | -118.010000 | 37.710000 | 37.000000 | 3148.000000 | 647.000000 | 1725.000000 | 605.000000 | 4.743250 | 264725.000000 |
| max | -114.310000 | 41.950000 | 52.000000 | 39320.000000 | 6445.000000 | 35682.000000 | 6082.000000 | 15.000100 | 500001.000000 |
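One thing the summary reveals is that `total_bedrooms` has only 20,433 non-null values, so we will need to impute missing values before modeling. A quick check (a minimal sketch of one way to count them):

```python
# Count missing values per column; total_bedrooms should show 207 NaNs
print(housing.isna().sum())
```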
Now that we have a quick overview of the data, we can plot histograms of all the numeric variables.
```python
housing.hist(bins = 50, figsize = (12, 8))
plt.show()
```

We’re going to bin the `median_income` variable to allow for stratified sampling within income bins.

```python
housing["median_income_bin"] = pd.cut(
housing["median_income"],
bins = [0, 1.5, 3, 4.5, 6, np.inf],
labels = [1, 2, 3, 4, 5]
)
# Plot a histogram of counts per income bin
housing["median_income_bin"].hist()
plt.show()
```
## Visualize data

Now, let’s visualize the median house prices by plotting them geographically.

```python
(
housing.
rename(columns = {"median_house_value": "Median House Value"}).
plot(
kind = "scatter",
x = "longitude",
y = "latitude",
alpha = 0.1,
s = housing["population"]/100,
c = "Median House Value",
colormap = plt.get_cmap("jet"),
colorbar = True,
title = "Median House Prices by Population",
xlabel = "Longitude",
ylabel = "Latitude"
)
)
plt.show()
```

Let’s also look at the correlation between a few of our numeric variables.

```python
pd.plotting.scatter_matrix(
housing[
[
"median_house_value",
"median_income",
"total_rooms",
"housing_median_age"
]
],
alpha = 0.1,
figsize = (10, 8)
)
plt.show()
```

Let’s take a closer look at the relationship between `median_income` and `median_house_value`.

```python
(
housing.
rename(
columns = {
"median_income": "Median Income",
"median_house_value": "Median House Value"
}
).
plot(
kind = "scatter",
x = "Median Income",
y = "Median House Value",
alpha = 0.1
)
)
plt.show()
```

## Feature engineering
We want to create a few new features: `rooms_per_household`, `bedrooms_per_room`, and `pop_per_household`.

```python
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["pop_per_household"] = housing["population"]/housing["households"]Modeling prep
## Modeling prep

Let’s do a 70-30 split of the data, stratified on the income bins.

```python
split = StratifiedShuffleSplit(
n_splits = 1,
test_size = 0.3,
random_state = 123
)
# Stratify on the income bins so train and test keep the same income mix
for train_idx, test_idx in split.split(housing, housing["median_income_bin"]):
    train = housing.loc[train_idx].drop(
        ["median_income_bin", "median_house_value"],
        axis = 1
    )
    train_labels = housing.loc[train_idx, "median_house_value"]
    test = housing.loc[test_idx].drop(
        ["median_income_bin", "median_house_value"],
        axis = 1
    )
    test_labels = housing.loc[test_idx, "median_house_value"]
```
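As a sanity check, we can verify that the stratified split preserved the income-bin proportions. A quick sketch comparing the training indices against the full dataset:

```python
# Income-bin proportions: full dataset vs. stratified training split;
# the two columns should be nearly identical
compare_props = pd.DataFrame({
    "overall": housing["median_income_bin"].value_counts(normalize = True),
    "train": housing.loc[train_idx, "median_income_bin"].value_counts(normalize = True)
}).sort_index()
print(compare_props)
```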
Now, let’s prep the data for modeling.

```python
# Numeric transformations
num_transform = Pipeline(
[
("imputer", SimpleImputer(strategy = "median")),
("scaler", StandardScaler())
]
)
full_transform = ColumnTransformer(
[
(
"numeric",
num_transform,
list(train.drop("ocean_proximity", axis = 1).columns)
),
("categorical", OneHotEncoder(), ["ocean_proximity"])
]
)
train = full_transform.fit_transform(train)
test = full_transform.transform(test)
```
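If you later want to know which column each transformed feature corresponds to (for example, to interpret feature importances), recent scikit-learn versions (1.0+) can report the output feature names. A small sketch:

```python
# Names of the transformed output columns, including the one-hot
# categories generated from ocean_proximity (scikit-learn >= 1.0)
print(full_transform.get_feature_names_out())
```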
## Train models

Let’s quickly write a helper function to print cross-validation metrics.

```python
def metrics_summary(scores):
    print("Mean: ", scores.mean().round(2))
    print("Standard Dev.: ", scores.std().round(2))
```

### Linear Regression
The first model is a simple linear regression. Note that `cross_val_score` returns the *negative* MSE (its scoring convention is “higher is better”), so we negate the scores and take the square root to get RMSE values.

```python
# Fit model
lin_reg_scores = np.sqrt(
-cross_val_score(
LinearRegression(),
train,
train_labels,
scoring = "neg_mean_squared_error",
cv = 10
)
)
# Summarize score info
metrics_summary(lin_reg_scores)
```

```
Mean: 68006.23
Standard Dev.: 2417.77
```
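An RMSE around $68,000 is fairly high given the scale of house values. As a quick check on whether the model is underfitting, here is a sketch comparing the CV error against the error on the full training set:

```python
# Fit on the entire training set and compute the training RMSE;
# a training error close to the CV error suggests underfitting
# rather than overfitting
lin_reg = LinearRegression().fit(train, train_labels)
train_preds = lin_reg.predict(train)
print("Train RMSE: ", np.sqrt(mean_squared_error(train_labels, train_preds)).round(2))
```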
### Decision Tree
The next model is a single decision tree.

```python
# Fit model
tree_reg_scores = np.sqrt(
-cross_val_score(
DecisionTreeRegressor(),
train,
train_labels,
scoring = "neg_mean_squared_error",
cv = 10
)
)
# Summarize score info
metrics_summary(tree_reg_scores)
```

```
Mean: 71848.76
Standard Dev.: 1912.97
```

### Random Forest
Now, we will run a random forest model.

```python
# Fit model
forest_reg_scores = np.sqrt(
-cross_val_score(
RandomForestRegressor(n_estimators = 30),
train,
train_labels,
scoring = "neg_mean_squared_error",
cv = 10
)
)
# Summarize score info
metrics_summary(forest_reg_scores)
```

```
Mean: 51307.82
Standard Dev.: 1670.66
```

And finally, we will tune our random forest model via grid search. Note that this grid contains only one value per hyperparameter, so the “search” here just cross-validates a single configuration; in practice you would supply several candidate values for each.

```python
# Fit model
forest_reg_grid = GridSearchCV(
RandomForestRegressor(),
param_grid = {
"n_estimators": [30],
"min_samples_leaf": [5],
"max_features": ["sqrt"]
},
scoring = "neg_mean_squared_error",
cv = 5,
return_train_score = True
)
forest_reg_grid_fit = forest_reg_grid.fit(train, train_labels)
# Get CV results
cv_results = forest_reg_grid_fit.cv_results_
# Summarize score info
for mean_sc, param in zip(cv_results["mean_test_score"], cv_results["params"]):
    print("Mean: ", np.sqrt(-mean_sc).round(2))
    print("Parameters: ", param)
```

```
Mean: 51546.77
Parameters: {'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 30}
```
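Since `GridSearchCV` refits the best configuration on the full training set by default, we can also pull the winning parameters and score directly. A small sketch:

```python
# Best hyperparameters and the corresponding CV RMSE
print("Best parameters: ", forest_reg_grid_fit.best_params_)
print("Best CV RMSE: ", np.sqrt(-forest_reg_grid_fit.best_score_).round(2))
```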
## Final predictions

Finally, we extract the best estimator from the grid search and evaluate it on the held-out test set.

```python
# Extract model
final_model = forest_reg_grid_fit.best_estimator_
# Make predictions
final_preds = final_model.predict(test)
# Final RMSE
print(
"Test RMSE: ", np.sqrt(mean_squared_error(test_labels, final_preds)).round(2)
)
```

```
Test RMSE: 50199.17
```
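A point estimate alone can be misleading, so it can help to attach a rough confidence interval to the test RMSE. Here is a sketch using `scipy.stats` (it assumes the per-observation squared errors are roughly independent, and the exact numbers will vary with the random seeds above):

```python
from scipy import stats

# 95% t-based confidence interval for the generalization RMSE,
# built from the per-observation squared errors
squared_errors = (final_preds - test_labels) ** 2
ci = np.sqrt(
    stats.t.interval(
        0.95,
        len(squared_errors) - 1,
        loc = squared_errors.mean(),
        scale = stats.sem(squared_errors)
    )
)
print("95% CI for RMSE: ", ci.round(2))
```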