Housing Prices Analysis
The goal of this short project is to document a fairly realistic ML pipeline, including data cleaning, data visualization, and model development.
Requisite modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
Import data
housing = pd.read_csv(
    "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv"
)
We can quickly summarize the housing data.
housing.describe()
|       | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value |
|-------|-----------|----------|--------------------|-------------|----------------|------------|------------|---------------|--------------------|
| count | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20433.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 |
| mean | -119.569704 | 35.631861 | 28.639486 | 2635.763081 | 537.870553 | 1425.476744 | 499.539680 | 3.870671 | 206855.816909 |
| std | 2.003532 | 2.135952 | 12.585558 | 2181.615252 | 421.385070 | 1132.462122 | 382.329753 | 1.899822 | 115395.615874 |
| min | -124.350000 | 32.540000 | 1.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 0.499900 | 14999.000000 |
| 25% | -121.800000 | 33.930000 | 18.000000 | 1447.750000 | 296.000000 | 787.000000 | 280.000000 | 2.563400 | 119600.000000 |
| 50% | -118.490000 | 34.260000 | 29.000000 | 2127.000000 | 435.000000 | 1166.000000 | 409.000000 | 3.534800 | 179700.000000 |
| 75% | -118.010000 | 37.710000 | 37.000000 | 3148.000000 | 647.000000 | 1725.000000 | 605.000000 | 4.743250 | 264725.000000 |
| max | -114.310000 | 41.950000 | 52.000000 | 39320.000000 | 6445.000000 | 35682.000000 | 6082.000000 | 15.000100 | 500001.000000 |
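Note that describe() only summarizes the numeric columns, and the total_bedrooms count (20433 vs. 20640) already hints at missing values. As a quick sketch (not part of the original output), we can inspect the lone categorical column and the missingness directly:
# Inspect the categorical column and count missing values.
# (That ocean_proximity is the only non-numeric column is an assumption.)
print(housing["ocean_proximity"].value_counts())
print(housing.isna().sum())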
Now that we have a quick overview of the data, we can plot histograms of all the numeric variables.
housing.hist(bins = 50, figsize = (12, 8));
plt.show()
We’re going to bin the median_income variable to allow for stratified sampling within income bins.
"median_income_bin"] = pd.cut(
housing["median_income"],
housing[= [0, 1.5, 3, 4.5, 6, np.inf],
bins = [1, 2, 3, 4, 5]
labels
)# Plot histogram of counts
"median_income_bin"].hist();
housing[ plt.show()
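As a sanity check (a sketch, not in the original), each bin should hold enough districts for stratified sampling to preserve the income distribution:
# Share of districts in each income bin (sanity-check sketch).
print(housing["median_income_bin"].value_counts(normalize = True).sort_index())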
Visualize data
Now, let’s visualize the median house prices by plotting them geographically.
(
    housing.
    rename(columns = {"median_house_value": "Median House Value"}).
    plot(
        kind = "scatter",
        x = "longitude",
        y = "latitude",
        alpha = 0.1,
        s = housing["population"]/100,
        c = "Median House Value",
        colormap = plt.get_cmap("jet"),
        colorbar = True,
        title = "Median House Prices by Population",
        xlabel = "Longitude",
        ylabel = "Latitude"
    )
)
plt.show()
Let’s also look at the correlations among a few of our numeric variables.
pd.plotting.scatter_matrix(
    housing[
        [
            "median_house_value",
            "median_income",
            "total_rooms",
            "housing_median_age"
        ]
    ],
    alpha = 0.1,
    figsize = (10, 8)
);
plt.show()
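The scatter matrix is qualitative; as a complementary sketch (not in the original), we can rank the linear correlations with the target directly:
# Pearson correlations with median_house_value, strongest first.
# (The numeric_only argument assumes a reasonably recent pandas.)
print(
    housing.corr(numeric_only = True)["median_house_value"].sort_values(ascending = False)
)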
Let’s take a closer look at the relationship between median_income and median_house_value.
(
    housing.
    rename(
        columns = {
            "median_income": "Median Income",
            "median_house_value": "Median House Value"
        }
    ).
    plot(
        kind = "scatter",
        x = "Median Income",
        y = "Median House Value",
        alpha = 0.1
    )
)
plt.show()
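The summary table shows median_house_value topping out at exactly 500001, which looks like a cap and appears as a horizontal line of points in this plot. A quick hedged check (not in the original) of how many districts sit at that value:
# Count districts at the apparent $500,001 cap (treating the max from
# describe() as a cap is an assumption, though a common one for this dataset).
print((housing["median_house_value"] == 500001).sum())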
Feature engineering
We want to create a few new features: rooms_per_household, bedrooms_per_room, and pop_per_household.
"rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["pop_per_household"] = housing["population"]/housing["households"] housing[
Modeling prep
Let’s start with a stratified 70-30 split of the data.
split = StratifiedShuffleSplit(
    n_splits = 1,
    test_size = 0.3,
    random_state = 123
)
for train_idx, test_idx in split.split(housing, housing["median_income_bin"]):
    train = housing.loc[train_idx].drop(
        ["median_income_bin", "median_house_value"],
        axis = 1
    )
    train_labels = housing.loc[train_idx, "median_house_value"]
    test = housing.loc[test_idx].drop(
        ["median_income_bin", "median_house_value"],
        axis = 1
    )
    test_labels = housing.loc[test_idx, "median_house_value"]
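To confirm the stratification behaved as intended, here is a small sketch (not in the original) comparing income-bin proportions in the full data against the test split:
# Bin proportions should match closely between the full dataset and the
# test indices (test_idx comes from the loop above).
print(housing["median_income_bin"].value_counts(normalize = True).sort_index())
print(housing.loc[test_idx, "median_income_bin"].value_counts(normalize = True).sort_index())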
Now, let’s prep the data for modeling.
# Numeric transformations
num_transform = Pipeline(
    [
        ("imputer", SimpleImputer(strategy = "median")),
        ("scaler", StandardScaler())
    ]
)
full_transform = ColumnTransformer(
    [
        (
            "numeric",
            num_transform,
            list(train.drop("ocean_proximity", axis = 1).columns)
        ),
        ("categorical", OneHotEncoder(), ["ocean_proximity"])
    ]
)
train = full_transform.fit_transform(train)
test = full_transform.transform(test)
Train models
Let’s quickly write a helper function to print CV metrics.
def metrics_summary(scores):
print("Mean: ", scores.mean().round(2))
print("Standard Dev.: ", scores.std().round(2))
Linear Regression
The first model is a simple linear regression model.
# Fit model
lin_reg_scores = np.sqrt(
    -cross_val_score(
        LinearRegression(),
        train,
        train_labels,
        scoring = "neg_mean_squared_error",
        cv = 10
    )
)
# Summarize score info
metrics_summary(lin_reg_scores)
Mean: 68006.23
Standard Dev.: 2417.77
Decision Tree
The next model is a single decision tree.
# Fit model
tree_reg = np.sqrt(
    -cross_val_score(
        DecisionTreeRegressor(),
        train,
        train_labels,
        scoring = "neg_mean_squared_error",
        cv = 10
    )
)
# Summarize score info
metrics_summary(tree_reg)
Mean: 71848.76
Standard Dev.: 1912.97
Random Forest
Now, we will run a random forest model.
# Fit model
forest_reg = np.sqrt(
    -cross_val_score(
        RandomForestRegressor(n_estimators = 30),
        train,
        train_labels,
        scoring = "neg_mean_squared_error",
        cv = 10
    )
)
# Summarize score info
metrics_summary(forest_reg)
Mean: 51307.82
Standard Dev.: 1670.66
And finally, we will tune our random forest model via grid search.
# Fit model
forest_reg_grid = GridSearchCV(
    RandomForestRegressor(),
    param_grid = {
        "n_estimators": [30],
        "min_samples_leaf": [5],
        "max_features": ["sqrt"]
    },
    scoring = "neg_mean_squared_error",
    cv = 5,
    return_train_score = True
)
forest_reg_grid_fit = forest_reg_grid.fit(train, train_labels)
# Get CV results
cv_results = forest_reg_grid_fit.cv_results_
# Summarize score info
for mean_sc, param in zip(cv_results["mean_test_score"], cv_results["params"]):
    print("Mean: ", np.sqrt(-mean_sc).round(2))
    print("Parameters: ", param)
Mean: 51546.77
Parameters: {'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 30}
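Note that the grid above contains a single candidate per parameter, so it mostly acts as a cross-validated refit of one configuration. A wider search might look like the sketch below (the candidate values are illustrative assumptions, not tuned choices):
# Hypothetical wider grid; values are illustrative and not from the original.
param_grid_wide = {
    "n_estimators": [30, 100, 300],
    "min_samples_leaf": [1, 5, 10],
    "max_features": ["sqrt", 0.5, 1.0]
}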
Final predictions
# Extract model
final_model = forest_reg_grid_fit.best_estimator_
# Make predictions
final_preds = final_model.predict(test)
# Final RMSE
print(
    "Test RMSE: ", np.sqrt(mean_squared_error(test_labels, final_preds)).round(2)
)
Test RMSE: 50199.17
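A single test RMSE is only a point estimate. As a closing sketch (not in the original analysis), a 95% t-interval over the per-observation squared errors gives a rough confidence interval for the generalization RMSE:
# Approximate 95% confidence interval for the test RMSE (a sketch; the
# t-interval over squared errors is an assumption, not the original method).
from scipy import stats

squared_errors = (test_labels - final_preds) ** 2
rmse_ci = np.sqrt(
    stats.t.interval(
        0.95,
        len(squared_errors) - 1,
        loc = squared_errors.mean(),
        scale = stats.sem(squared_errors)
    )
)
print("95% CI for RMSE: ", rmse_ci.round(2))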