# Housing Prices Analysis

The goal of this short project is to document a fairly realistic ML pipeline, including data cleaning, data visualization, and model development.

## Requisite modules

```python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
```

## Import data
```python
housing = pd.read_csv(
    "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv"
)
```

We can quickly summarize the housing data.
```python
housing.describe()
```

| | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value |
|---|---|---|---|---|---|---|---|---|---|
| count | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20433.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 |
| mean | -119.569704 | 35.631861 | 28.639486 | 2635.763081 | 537.870553 | 1425.476744 | 499.539680 | 3.870671 | 206855.816909 |
| std | 2.003532 | 2.135952 | 12.585558 | 2181.615252 | 421.385070 | 1132.462122 | 382.329753 | 1.899822 | 115395.615874 |
| min | -124.350000 | 32.540000 | 1.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 0.499900 | 14999.000000 |
| 25% | -121.800000 | 33.930000 | 18.000000 | 1447.750000 | 296.000000 | 787.000000 | 280.000000 | 2.563400 | 119600.000000 |
| 50% | -118.490000 | 34.260000 | 29.000000 | 2127.000000 | 435.000000 | 1166.000000 | 409.000000 | 3.534800 | 179700.000000 |
| 75% | -118.010000 | 37.710000 | 37.000000 | 3148.000000 | 647.000000 | 1725.000000 | 605.000000 | 4.743250 | 264725.000000 |
| max | -114.310000 | 41.950000 | 52.000000 | 39320.000000 | 6445.000000 | 35682.000000 | 6082.000000 | 15.000100 | 500001.000000 |
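One thing the summary reveals is that `total_bedrooms` has only 20,433 non-null values, so we will need to impute missing values before modeling. A quick check (a minimal sketch of one way to count them):

```python
# Count missing values per column; total_bedrooms should show 207 NaNs
print(housing.isna().sum())
```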
Now that we have a quick overview of the data, we can plot histograms of all the numeric variables.
```python
housing.hist(bins = 50, figsize = (12, 8))
plt.show()
```

We’re going to bin the `median_income` variable to allow for stratified sampling within income bins.

```python
housing["median_income_bin"] = pd.cut(
housing["median_income"],
bins = [0, 1.5, 3, 4.5, 6, np.inf],
labels = [1, 2, 3, 4, 5]
)
# Plot a histogram of counts per income bin
housing["median_income_bin"].hist()
plt.show()
```
## Visualize data

Now, let’s visualize the median house prices by plotting them geographically.

```python
(
housing.
rename(columns = {"median_house_value": "Median House Value"}).
plot(
kind = "scatter",
x = "longitude",
y = "latitude",
alpha = 0.1,
s = housing["population"]/100,
c = "Median House Value",
colormap = plt.get_cmap("jet"),
colorbar = True,
title = "Median House Prices by Population",
xlabel = "Longitude",
ylabel = "Latitude"
)
)
plt.show()
```

Let’s also look at the correlation between a few of our numeric variables.

```python
pd.plotting.scatter_matrix(
housing[
[
"median_house_value",
"median_income",
"total_rooms",
"housing_median_age"
]
],
alpha = 0.1,
figsize = (10, 8)
)
plt.show()
```

Let’s take a closer look at the relationship between `median_income` and `median_house_value`.

```python
(
housing.
rename(
columns = {
"median_income": "Median Income",
"median_house_value": "Median House Value"
}
).
plot(
kind = "scatter",
x = "Median Income",
y = "Median House Value",
alpha = 0.1
)
)
plt.show()
```

## Feature engineering
We want to create a few new features: `rooms_per_household`, `bedrooms_per_room`, and `pop_per_household`.

```python
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["pop_per_household"] = housing["population"]/housing["households"]Modeling prep
## Modeling prep

Let’s do a 70-30 split of the data, stratified on the income bins.

```python
split = StratifiedShuffleSplit(
n_splits = 1,
test_size = 0.3,
random_state = 123
)
# Stratify on the income bins so train and test keep the same income mix
for train_idx, test_idx in split.split(housing, housing["median_income_bin"]):
    train = housing.loc[train_idx].drop(
        ["median_income_bin", "median_house_value"],
        axis = 1
    )
    train_labels = housing.loc[train_idx, "median_house_value"]
    test = housing.loc[test_idx].drop(
        ["median_income_bin", "median_house_value"],
        axis = 1
    )
    test_labels = housing.loc[test_idx, "median_house_value"]
```
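As a sanity check, we can verify that the stratified split preserved the income-bin proportions. A quick sketch comparing the training indices against the full dataset:

```python
# Income-bin proportions: full dataset vs. stratified training split;
# the two columns should be nearly identical
compare_props = pd.DataFrame({
    "overall": housing["median_income_bin"].value_counts(normalize = True),
    "train": housing.loc[train_idx, "median_income_bin"].value_counts(normalize = True)
}).sort_index()
print(compare_props)
```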
Now, let’s prep the data for modeling.

```python
# Numeric transformations
num_transform = Pipeline(
[
("imputer", SimpleImputer(strategy = "median")),
("scaler", StandardScaler())
]
)
full_transform = ColumnTransformer(
[
(
"numeric",
num_transform,
list(train.drop("ocean_proximity", axis = 1).columns)
),
("categorical", OneHotEncoder(), ["ocean_proximity"])
]
)
train = full_transform.fit_transform(train)
test = full_transform.transform(test)
```
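If you later want to know which column each transformed feature corresponds to (for example, to interpret feature importances), recent scikit-learn versions (1.0+) can report the output feature names. A small sketch:

```python
# Names of the transformed output columns, including the one-hot
# categories generated from ocean_proximity (scikit-learn >= 1.0)
print(full_transform.get_feature_names_out())
```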
## Train models

Let’s quickly write a helper function to print cross-validation metrics.

```python
def metrics_summary(scores):
    print("Mean: ", scores.mean().round(2))
    print("Standard Dev.: ", scores.std().round(2))
```

### Linear Regression
The first model is a simple linear regression. Note that `cross_val_score` returns the *negative* MSE (its scoring convention is “higher is better”), so we negate the scores and take the square root to get RMSE values.

```python
# Fit model
lin_reg_scores = np.sqrt(
-cross_val_score(
LinearRegression(),
train,
train_labels,
scoring = "neg_mean_squared_error",
cv = 10
)
)
# Summarize score info
metrics_summary(lin_reg_scores)
```

```
Mean: 68006.23
Standard Dev.: 2417.77
```
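An RMSE around $68,000 is fairly high given the scale of house values. As a quick check on whether the model is underfitting, here is a sketch comparing the CV error against the error on the full training set:

```python
# Fit on the entire training set and compute the training RMSE;
# a training error close to the CV error suggests underfitting
# rather than overfitting
lin_reg = LinearRegression().fit(train, train_labels)
train_preds = lin_reg.predict(train)
print("Train RMSE: ", np.sqrt(mean_squared_error(train_labels, train_preds)).round(2))
```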
### Decision Tree
The next model is a single decision tree.

```python
# Fit model
tree_reg_scores = np.sqrt(
-cross_val_score(
DecisionTreeRegressor(),
train,
train_labels,
scoring = "neg_mean_squared_error",
cv = 10
)
)
# Summarize score info
metrics_summary(tree_reg_scores)
```

```
Mean: 71848.76
Standard Dev.: 1912.97
```

### Random Forest
Now, we will run a random forest model.

```python
# Fit model
forest_reg_scores = np.sqrt(
-cross_val_score(
RandomForestRegressor(n_estimators = 30),
train,
train_labels,
scoring = "neg_mean_squared_error",
cv = 10
)
)
# Summarize score info
metrics_summary(forest_reg_scores)
```

```
Mean: 51307.82
Standard Dev.: 1670.66
```

And finally, we will tune our random forest model via grid search. Note that this grid contains only one value per hyperparameter, so the “search” here just cross-validates a single configuration; in practice you would supply several candidate values for each.

```python
# Fit model
forest_reg_grid = GridSearchCV(
RandomForestRegressor(),
param_grid = {
"n_estimators": [30],
"min_samples_leaf": [5],
"max_features": ["sqrt"]
},
scoring = "neg_mean_squared_error",
cv = 5,
return_train_score = True
)
forest_reg_grid_fit = forest_reg_grid.fit(train, train_labels)
# Get CV results
cv_results = forest_reg_grid_fit.cv_results_
# Summarize score info
for mean_sc, param in zip(cv_results["mean_test_score"], cv_results["params"]):
    print("Mean: ", np.sqrt(-mean_sc).round(2))
    print("Parameters: ", param)
```

```
Mean: 51546.77
Parameters: {'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 30}
```
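Since `GridSearchCV` refits the best configuration on the full training set by default, we can also pull the winning parameters and score directly. A small sketch:

```python
# Best hyperparameters and the corresponding CV RMSE
print("Best parameters: ", forest_reg_grid_fit.best_params_)
print("Best CV RMSE: ", np.sqrt(-forest_reg_grid_fit.best_score_).round(2))
```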
## Final predictions

Finally, we extract the best estimator from the grid search and evaluate it on the held-out test set.

```python
# Extract model
final_model = forest_reg_grid_fit.best_estimator_
# Make predictions
final_preds = final_model.predict(test)
# Final RMSE
print(
"Test RMSE: ", np.sqrt(mean_squared_error(test_labels, final_preds)).round(2)
)
```

```
Test RMSE: 50199.17
```
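A point estimate alone can be misleading, so it can help to attach a rough confidence interval to the test RMSE. Here is a sketch using `scipy.stats` (it assumes the per-observation squared errors are roughly independent, and the exact numbers will vary with the random seeds above):

```python
from scipy import stats

# 95% t-based confidence interval for the generalization RMSE,
# built from the per-observation squared errors
squared_errors = (final_preds - test_labels) ** 2
ci = np.sqrt(
    stats.t.interval(
        0.95,
        len(squared_errors) - 1,
        loc = squared_errors.mean(),
        scale = stats.sem(squared_errors)
    )
)
print("95% CI for RMSE: ", ci.round(2))
```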