House Price Prediction through Regression
Through the Ames dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LogisticRegression as logR
from sklearn.linear_model import ElasticNet
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import seaborn as sns
%matplotlib inline
#https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data
First let’s load both datasets and take some quick looks at the data
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
print(df_train.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id 1460 non-null int64
MSSubClass 1460 non-null int64
MSZoning 1460 non-null object
LotFrontage 1201 non-null float64
LotArea 1460 non-null int64
Street 1460 non-null object
Alley 91 non-null object
LotShape 1460 non-null object
LandContour 1460 non-null object
Utilities 1460 non-null object
LotConfig 1460 non-null object
LandSlope 1460 non-null object
Neighborhood 1460 non-null object
Condition1 1460 non-null object
Condition2 1460 non-null object
BldgType 1460 non-null object
HouseStyle 1460 non-null object
OverallQual 1460 non-null int64
OverallCond 1460 non-null int64
YearBuilt 1460 non-null int64
YearRemodAdd 1460 non-null int64
RoofStyle 1460 non-null object
RoofMatl 1460 non-null object
Exterior1st 1460 non-null object
Exterior2nd 1460 non-null object
MasVnrType 1452 non-null object
MasVnrArea 1452 non-null float64
ExterQual 1460 non-null object
ExterCond 1460 non-null object
Foundation 1460 non-null object
BsmtQual 1423 non-null object
BsmtCond 1423 non-null object
BsmtExposure 1422 non-null object
BsmtFinType1 1423 non-null object
BsmtFinSF1 1460 non-null int64
BsmtFinType2 1422 non-null object
BsmtFinSF2 1460 non-null int64
BsmtUnfSF 1460 non-null int64
TotalBsmtSF 1460 non-null int64
Heating 1460 non-null object
HeatingQC 1460 non-null object
CentralAir 1460 non-null object
Electrical 1459 non-null object
1stFlrSF 1460 non-null int64
2ndFlrSF 1460 non-null int64
LowQualFinSF 1460 non-null int64
GrLivArea 1460 non-null int64
BsmtFullBath 1460 non-null int64
BsmtHalfBath 1460 non-null int64
FullBath 1460 non-null int64
HalfBath 1460 non-null int64
BedroomAbvGr 1460 non-null int64
KitchenAbvGr 1460 non-null int64
KitchenQual 1460 non-null object
TotRmsAbvGrd 1460 non-null int64
Functional 1460 non-null object
Fireplaces 1460 non-null int64
FireplaceQu 770 non-null object
GarageType 1379 non-null object
GarageYrBlt 1379 non-null float64
GarageFinish 1379 non-null object
GarageCars 1460 non-null int64
GarageArea 1460 non-null int64
GarageQual 1379 non-null object
GarageCond 1379 non-null object
PavedDrive 1460 non-null object
WoodDeckSF 1460 non-null int64
OpenPorchSF 1460 non-null int64
EnclosedPorch 1460 non-null int64
3SsnPorch 1460 non-null int64
ScreenPorch 1460 non-null int64
PoolArea 1460 non-null int64
PoolQC 7 non-null object
Fence 281 non-null object
MiscFeature 54 non-null object
MiscVal 1460 non-null int64
MoSold 1460 non-null int64
YrSold 1460 non-null int64
SaleType 1460 non-null object
SaleCondition 1460 non-null object
SalePrice 1460 non-null int64
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
None
print(df_test.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
Id 1459 non-null int64
MSSubClass 1459 non-null int64
MSZoning 1455 non-null object
LotFrontage 1232 non-null float64
LotArea 1459 non-null int64
Street 1459 non-null object
Alley 107 non-null object
LotShape 1459 non-null object
LandContour 1459 non-null object
Utilities 1457 non-null object
LotConfig 1459 non-null object
LandSlope 1459 non-null object
Neighborhood 1459 non-null object
Condition1 1459 non-null object
Condition2 1459 non-null object
BldgType 1459 non-null object
HouseStyle 1459 non-null object
OverallQual 1459 non-null int64
OverallCond 1459 non-null int64
YearBuilt 1459 non-null int64
YearRemodAdd 1459 non-null int64
RoofStyle 1459 non-null object
RoofMatl 1459 non-null object
Exterior1st 1458 non-null object
Exterior2nd 1458 non-null object
MasVnrType 1443 non-null object
MasVnrArea 1444 non-null float64
ExterQual 1459 non-null object
ExterCond 1459 non-null object
Foundation 1459 non-null object
BsmtQual 1415 non-null object
BsmtCond 1414 non-null object
BsmtExposure 1415 non-null object
BsmtFinType1 1417 non-null object
BsmtFinSF1 1458 non-null float64
BsmtFinType2 1417 non-null object
BsmtFinSF2 1458 non-null float64
BsmtUnfSF 1458 non-null float64
TotalBsmtSF 1458 non-null float64
Heating 1459 non-null object
HeatingQC 1459 non-null object
CentralAir 1459 non-null object
Electrical 1459 non-null object
1stFlrSF 1459 non-null int64
2ndFlrSF 1459 non-null int64
LowQualFinSF 1459 non-null int64
GrLivArea 1459 non-null int64
BsmtFullBath 1457 non-null float64
BsmtHalfBath 1457 non-null float64
FullBath 1459 non-null int64
HalfBath 1459 non-null int64
BedroomAbvGr 1459 non-null int64
KitchenAbvGr 1459 non-null int64
KitchenQual 1458 non-null object
TotRmsAbvGrd 1459 non-null int64
Functional 1457 non-null object
Fireplaces 1459 non-null int64
FireplaceQu 729 non-null object
GarageType 1383 non-null object
GarageYrBlt 1381 non-null float64
GarageFinish 1381 non-null object
GarageCars 1458 non-null float64
GarageArea 1458 non-null float64
GarageQual 1381 non-null object
GarageCond 1381 non-null object
PavedDrive 1459 non-null object
WoodDeckSF 1459 non-null int64
OpenPorchSF 1459 non-null int64
EnclosedPorch 1459 non-null int64
3SsnPorch 1459 non-null int64
ScreenPorch 1459 non-null int64
PoolArea 1459 non-null int64
PoolQC 3 non-null object
Fence 290 non-null object
MiscFeature 51 non-null object
MiscVal 1459 non-null int64
MoSold 1459 non-null int64
YrSold 1459 non-null int64
SaleType 1458 non-null object
SaleCondition 1459 non-null object
dtypes: float64(11), int64(26), object(43)
memory usage: 912.0+ KB
None
Let’s take a look at the sale price, the variable we are to model and predict
plt.hist(df_train['SalePrice'], 100)
(array([ 5., 0., 5., 6., 6., 7., 32., 29., 23., 35., 59.,
67., 73., 92., 89., 91., 62., 60., 57., 73., 67., 54.,
45., 33., 30., 31., 31., 33., 26., 23., 17., 19., 18.,
18., 14., 11., 5., 8., 13., 12., 10., 6., 6., 5.,
1., 3., 5., 6., 5., 4., 3., 2., 3., 1., 3.,
1., 2., 2., 0., 2., 0., 1., 1., 0., 1., 0.,
0., 0., 0., 1., 0., 0., 2., 0., 0., 0., 1.,
0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
1.]),
array([ 34900., 42101., 49302., 56503., 63704., 70905.,
78106., 85307., 92508., 99709., 106910., 114111.,
121312., 128513., 135714., 142915., 150116., 157317.,
164518., 171719., 178920., 186121., 193322., 200523.,
207724., 214925., 222126., 229327., 236528., 243729.,
250930., 258131., 265332., 272533., 279734., 286935.,
294136., 301337., 308538., 315739., 322940., 330141.,
337342., 344543., 351744., 358945., 366146., 373347.,
380548., 387749., 394950., 402151., 409352., 416553.,
423754., 430955., 438156., 445357., 452558., 459759.,
466960., 474161., 481362., 488563., 495764., 502965.,
510166., 517367., 524568., 531769., 538970., 546171.,
553372., 560573., 567774., 574975., 582176., 589377.,
596578., 603779., 610980., 618181., 625382., 632583.,
639784., 646985., 654186., 661387., 668588., 675789.,
682990., 690191., 697392., 704593., 711794., 718995.,
726196., 733397., 740598., 747799., 755000.]),
<a list of 100 Patch objects>)

First we clean the dataset. Let’s combine the two first so we can perform the same operations on both. We will split it up later.
df = pd.concat([df_train, df_test])
print(df.info())
df.head()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 0 to 1458
Data columns (total 81 columns):
1stFlrSF 2919 non-null int64
2ndFlrSF 2919 non-null int64
3SsnPorch 2919 non-null int64
Alley 198 non-null object
BedroomAbvGr 2919 non-null int64
BldgType 2919 non-null object
BsmtCond 2837 non-null object
BsmtExposure 2837 non-null object
BsmtFinSF1 2918 non-null float64
BsmtFinSF2 2918 non-null float64
BsmtFinType1 2840 non-null object
BsmtFinType2 2839 non-null object
BsmtFullBath 2917 non-null float64
BsmtHalfBath 2917 non-null float64
BsmtQual 2838 non-null object
BsmtUnfSF 2918 non-null float64
CentralAir 2919 non-null object
Condition1 2919 non-null object
Condition2 2919 non-null object
Electrical 2918 non-null object
EnclosedPorch 2919 non-null int64
ExterCond 2919 non-null object
ExterQual 2919 non-null object
Exterior1st 2918 non-null object
Exterior2nd 2918 non-null object
Fence 571 non-null object
FireplaceQu 1499 non-null object
Fireplaces 2919 non-null int64
Foundation 2919 non-null object
FullBath 2919 non-null int64
Functional 2917 non-null object
GarageArea 2918 non-null float64
GarageCars 2918 non-null float64
GarageCond 2760 non-null object
GarageFinish 2760 non-null object
GarageQual 2760 non-null object
GarageType 2762 non-null object
GarageYrBlt 2760 non-null float64
GrLivArea 2919 non-null int64
HalfBath 2919 non-null int64
Heating 2919 non-null object
HeatingQC 2919 non-null object
HouseStyle 2919 non-null object
Id 2919 non-null int64
KitchenAbvGr 2919 non-null int64
KitchenQual 2918 non-null object
LandContour 2919 non-null object
LandSlope 2919 non-null object
LotArea 2919 non-null int64
LotConfig 2919 non-null object
LotFrontage 2433 non-null float64
LotShape 2919 non-null object
LowQualFinSF 2919 non-null int64
MSSubClass 2919 non-null int64
MSZoning 2915 non-null object
MasVnrArea 2896 non-null float64
MasVnrType 2895 non-null object
MiscFeature 105 non-null object
MiscVal 2919 non-null int64
MoSold 2919 non-null int64
Neighborhood 2919 non-null object
OpenPorchSF 2919 non-null int64
OverallCond 2919 non-null int64
OverallQual 2919 non-null int64
PavedDrive 2919 non-null object
PoolArea 2919 non-null int64
PoolQC 10 non-null object
RoofMatl 2919 non-null object
RoofStyle 2919 non-null object
SaleCondition 2919 non-null object
SalePrice 1460 non-null float64
SaleType 2918 non-null object
ScreenPorch 2919 non-null int64
Street 2919 non-null object
TotRmsAbvGrd 2919 non-null int64
TotalBsmtSF 2918 non-null float64
Utilities 2917 non-null object
WoodDeckSF 2919 non-null int64
YearBuilt 2919 non-null int64
YearRemodAdd 2919 non-null int64
YrSold 2919 non-null int64
dtypes: float64(12), int64(26), object(43)
memory usage: 1.8+ MB
None
| 1stFlrSF | 2ndFlrSF | 3SsnPorch | Alley | BedroomAbvGr | BldgType | BsmtCond | BsmtExposure | BsmtFinSF1 | BsmtFinSF2 | ... | SaleType | ScreenPorch | Street | TotRmsAbvGrd | TotalBsmtSF | Utilities | WoodDeckSF | YearBuilt | YearRemodAdd | YrSold | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 856 | 854 | 0 | NaN | 3 | 1Fam | TA | No | 706.0 | 0.0 | ... | WD | 0 | Pave | 8 | 856.0 | AllPub | 0 | 2003 | 2003 | 2008 |
| 1 | 1262 | 0 | 0 | NaN | 3 | 1Fam | TA | Gd | 978.0 | 0.0 | ... | WD | 0 | Pave | 6 | 1262.0 | AllPub | 298 | 1976 | 1976 | 2007 |
| 2 | 920 | 866 | 0 | NaN | 3 | 1Fam | TA | Mn | 486.0 | 0.0 | ... | WD | 0 | Pave | 6 | 920.0 | AllPub | 0 | 2001 | 2002 | 2008 |
| 3 | 961 | 756 | 0 | NaN | 3 | 1Fam | Gd | No | 216.0 | 0.0 | ... | WD | 0 | Pave | 7 | 756.0 | AllPub | 0 | 1915 | 1970 | 2006 |
| 4 | 1145 | 1053 | 0 | NaN | 4 | 1Fam | TA | Av | 655.0 | 0.0 | ... | WD | 0 | Pave | 9 | 1145.0 | AllPub | 192 | 2000 | 2000 | 2008 |
5 rows × 81 columns
Time for data cleaning.
Columns with fewer datapoints than the total number of rows likely possess NaN values
Columns with only 1-2 missing row elements are the simplest case, someone probably messed up during data entry. As it’s only one element, try to set to a default/mean/most likely value, or an “other” category if there is one
print(df['SaleType'][df['SaleType'].isnull()])
print(df['KitchenQual'][df['KitchenQual'].isnull()])
print(df['Electrical'][df['Electrical'].isnull()])
print(df['Exterior1st'][df['Exterior1st'].isnull()])
print(df['Exterior2nd'][df['Exterior2nd'].isnull()])
print(df['GarageArea'][df['GarageArea'].isnull()])
print(df['GarageCars'][df['GarageCars'].isnull()])
print(df['Utilities'][df['Utilities'].isnull()])
print(df['Functional'][df['Functional'].isnull()])
print(df['BsmtFullBath'][df['BsmtFullBath'].isnull()])
print(df['BsmtHalfBath'][df['BsmtHalfBath'].isnull()])
1029 NaN
Name: SaleType, dtype: object
95 NaN
Name: KitchenQual, dtype: object
1379 NaN
Name: Electrical, dtype: object
691 NaN
Name: Exterior1st, dtype: object
691 NaN
Name: Exterior2nd, dtype: object
1116 NaN
Name: GarageArea, dtype: float64
1116 NaN
Name: GarageCars, dtype: float64
455 NaN
485 NaN
Name: Utilities, dtype: object
756 NaN
1013 NaN
Name: Functional, dtype: object
660 NaN
728 NaN
Name: BsmtFullBath, dtype: float64
660 NaN
728 NaN
Name: BsmtHalfBath, dtype: float64
df.Functional.value_counts()
Typ 2717
Min2 70
Min1 65
Mod 35
Maj1 19
Maj2 9
Sev 2
Name: Functional, dtype: int64
Note: while going through this dataset I looked at each variable’s distribution first before deciding to fill in N/A values with the mean or the most common value or the most “default” value.
To help do this, I made use of either the histogram, as seen in Sale Price, or the .value_counts() command above, or the .unique() command
For numerical variables one can do a mini-interpolation to fill in the blanks if there are a lot of N/A values
df['SaleType'] = df['SaleType'].fillna("Oth")
df['KitchenQual'] = df['KitchenQual'].fillna('TA')
df['Electrical'] = df['Electrical'].fillna('SBrkr')
df['Exterior1st'] = df['Exterior1st'].fillna('VinylSd') #Other
df['Exterior2nd'] = df['Exterior2nd'].fillna('VinylSd')
df['GarageArea'] = df['GarageArea'].fillna(np.mean(df['GarageArea']))
df['GarageCars'] = df['GarageCars'].fillna(0)
df['Utilities'] = df['Utilities'].fillna('AllPub')
df['Functional'] = df['Functional'].fillna('Typ')
df['BsmtFullBath'] = df['BsmtFullBath'].fillna(0)
df['BsmtHalfBath'] = df['BsmtHalfBath'].fillna(0)
print(df['Utilities'].unique())
print(df.Utilities.value_counts())
df['Utilities'][df['Utilities'] == 'NoSeWa']
# Thought to consider - can we get rid of 'Utilities'?
['AllPub' 'NoSeWa']
AllPub 2918
NoSeWa 1
Name: Utilities, dtype: int64
944 NoSeWa
Name: Utilities, dtype: object
For some others, it appears that the “NaN” values are intended to be a category amongst themselves.
For these we substitute a new categorical result, “None”
print(df['Alley'].unique())
df['Alley'] = df['Alley'].fillna('None')
[nan 'Grvl' 'Pave']
print(df['BsmtCond'].unique())
print(df['BsmtExposure'].unique())
df['BsmtCond'] = df['BsmtCond'].fillna('None')
df['BsmtExposure'] = df['BsmtExposure'].fillna('None')
#print(df['BsmtCond'].unique())
#print(df['BsmtExposure'].unique())
['TA' 'Gd' nan 'Fa' 'Po']
['No' 'Gd' 'Mn' 'Av' nan]
df.MiscFeature.value_counts()
Shed 95
Gar2 5
Othr 4
TenC 1
Name: MiscFeature, dtype: int64
print(df['BsmtFinSF1'][df['BsmtFinSF1'].isnull()])
print(df['BsmtFinSF2'][df['BsmtFinSF2'].isnull()])
print(df['TotalBsmtSF'][df['TotalBsmtSF'].isnull()])
print(df['BsmtUnfSF'][df['BsmtUnfSF'].isnull()])
df['BsmtFinSF1'].fillna(np.mean(df['BsmtFinSF1'].dropna()), inplace=True)
df['BsmtFinSF2'].fillna(np.mean(df['BsmtFinSF2'].dropna()), inplace=True)
df['TotalBsmtSF'].fillna(np.mean(df['TotalBsmtSF'].dropna()), inplace=True)
df['BsmtUnfSF'].fillna(np.mean(df['BsmtUnfSF'].dropna()), inplace=True)
660 NaN
Name: BsmtFinSF1, dtype: float64
660 NaN
Name: BsmtFinSF2, dtype: float64
660 NaN
Name: TotalBsmtSF, dtype: float64
660 NaN
Name: BsmtUnfSF, dtype: float64
df[['BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', 'BsmtUnfSF', 'SaleType', 'KitchenQual', \
'Electrical', 'Exterior1st', 'Exterior2nd', 'GarageArea', 'GarageCars', 'Utilities', \
'Functional', 'BsmtFullBath', 'BsmtHalfBath']].head(10)
| BsmtFinSF1 | BsmtFinSF2 | TotalBsmtSF | BsmtUnfSF | SaleType | KitchenQual | Electrical | Exterior1st | Exterior2nd | GarageArea | GarageCars | Utilities | Functional | BsmtFullBath | BsmtHalfBath | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 706.0 | 0.0 | 856.0 | 150.0 | WD | Gd | SBrkr | VinylSd | VinylSd | 548.0 | 2.0 | AllPub | Typ | 1.0 | 0.0 |
| 1 | 978.0 | 0.0 | 1262.0 | 284.0 | WD | TA | SBrkr | MetalSd | MetalSd | 460.0 | 2.0 | AllPub | Typ | 0.0 | 1.0 |
| 2 | 486.0 | 0.0 | 920.0 | 434.0 | WD | Gd | SBrkr | VinylSd | VinylSd | 608.0 | 2.0 | AllPub | Typ | 1.0 | 0.0 |
| 3 | 216.0 | 0.0 | 756.0 | 540.0 | WD | Gd | SBrkr | Wd Sdng | Wd Shng | 642.0 | 3.0 | AllPub | Typ | 1.0 | 0.0 |
| 4 | 655.0 | 0.0 | 1145.0 | 490.0 | WD | Gd | SBrkr | VinylSd | VinylSd | 836.0 | 3.0 | AllPub | Typ | 1.0 | 0.0 |
| 5 | 732.0 | 0.0 | 796.0 | 64.0 | WD | TA | SBrkr | VinylSd | VinylSd | 480.0 | 2.0 | AllPub | Typ | 1.0 | 0.0 |
| 6 | 1369.0 | 0.0 | 1686.0 | 317.0 | WD | Gd | SBrkr | VinylSd | VinylSd | 636.0 | 2.0 | AllPub | Typ | 1.0 | 0.0 |
| 7 | 859.0 | 32.0 | 1107.0 | 216.0 | WD | TA | SBrkr | HdBoard | HdBoard | 484.0 | 2.0 | AllPub | Typ | 1.0 | 0.0 |
| 8 | 0.0 | 0.0 | 952.0 | 952.0 | WD | TA | FuseF | BrkFace | Wd Shng | 468.0 | 2.0 | AllPub | Min1 | 0.0 | 0.0 |
| 9 | 851.0 | 0.0 | 991.0 | 140.0 | WD | TA | SBrkr | MetalSd | MetalSd | 205.0 | 1.0 | AllPub | Typ | 1.0 | 0.0 |
print(df['BsmtFinType1'].unique()) # Here NaN means no basement
print(df['BsmtFinType2'].unique())
df['BsmtFinType1']=df['BsmtFinType1'].fillna('None')
df['BsmtFinType2']=df['BsmtFinType2'].fillna('None')
['GLQ' 'ALQ' 'Unf' 'Rec' 'BLQ' nan 'LwQ']
['Unf' 'BLQ' nan 'ALQ' 'Rec' 'LwQ' 'GLQ']
df['BsmtQual'] = df['BsmtQual'].fillna('None')
df['Fence'] = df['Fence'].fillna('None')
print(df.FireplaceQu.value_counts())
df['FireplaceQu']= df['FireplaceQu'].fillna('None')
Gd 744
TA 592
Fa 74
Po 46
Ex 43
Name: FireplaceQu, dtype: int64
df.Fireplaces.value_counts()
0 1420
1 1268
2 219
3 11
4 1
Name: Fireplaces, dtype: int64
Sanity check: this XOR should return an empty dataframe, because the number of fireplaces should be 0 at the same time the fireplace quality is nothing
df[(df['FireplaceQu']== 'None') ^ (df['Fireplaces'] ==0)]
| 1stFlrSF | 2ndFlrSF | 3SsnPorch | Alley | BedroomAbvGr | BldgType | BsmtCond | BsmtExposure | BsmtFinSF1 | BsmtFinSF2 | ... | SaleType | ScreenPorch | Street | TotRmsAbvGrd | TotalBsmtSF | Utilities | WoodDeckSF | YearBuilt | YearRemodAdd | YrSold |
|---|
0 rows × 81 columns
Next, what to do with the year the garage was built, if the house does not have a garage?
NaN values will confuse a retrieval algorithm. For now, the simple mean of the years is substituted in, as a neutral value.
print(df['GarageCond'].unique())
print(df['GarageFinish'].unique())
print(df['GarageQual'].unique())
print(df['GarageType'].unique())
print(df['GarageYrBlt'].unique())
df['GarageCond'].fillna('None', inplace=True)
df['GarageFinish'].fillna('None', inplace=True)
df['GarageQual'].fillna('None', inplace=True)
df['GarageType'].fillna('None', inplace=True)
df['GarageYrBlt'].fillna(np.mean(df['GarageYrBlt']), inplace=True)
['TA' 'Fa' nan 'Gd' 'Po' 'Ex']
['RFn' 'Unf' 'Fin' nan]
['TA' 'Fa' 'Gd' nan 'Ex' 'Po']
['Attchd' 'Detchd' 'BuiltIn' 'CarPort' nan 'Basment' '2Types']
[ 2003. 1976. 2001. 1998. 2000. 1993. 2004. 1973. 1931. 1939.
1965. 2005. 1962. 2006. 1960. 1991. 1970. 1967. 1958. 1930.
2002. 1968. 2007. 2008. 1957. 1920. 1966. 1959. 1995. 1954.
1953. nan 1983. 1977. 1997. 1985. 1963. 1981. 1964. 1999.
1935. 1990. 1945. 1987. 1989. 1915. 1956. 1948. 1974. 2009.
1950. 1961. 1921. 1900. 1979. 1951. 1969. 1936. 1975. 1971.
1923. 1984. 1926. 1955. 1986. 1988. 1916. 1932. 1972. 1918.
1980. 1924. 1996. 1940. 1949. 1994. 1910. 1978. 1982. 1992.
1925. 1941. 2010. 1927. 1947. 1937. 1942. 1938. 1952. 1928.
1922. 1934. 1906. 1914. 1946. 1908. 1929. 1933. 1917. 1896.
1895. 2207. 1943. 1919.]
df[['LotArea', 'LotConfig', 'LotFrontage', 'LotShape']][df['LotFrontage'].isnull()]
| LotArea | LotConfig | LotFrontage | LotShape | |
|---|---|---|---|---|
| 7 | 10382 | Corner | NaN | IR1 |
| 12 | 12968 | Inside | NaN | IR2 |
| 14 | 10920 | Corner | NaN | IR1 |
| 16 | 11241 | CulDSac | NaN | IR1 |
| 24 | 8246 | Inside | NaN | IR1 |
| 31 | 8544 | CulDSac | NaN | IR1 |
| 42 | 9180 | CulDSac | NaN | IR1 |
| 43 | 9200 | CulDSac | NaN | IR1 |
| 50 | 13869 | Corner | NaN | IR2 |
| 64 | 9375 | Inside | NaN | Reg |
| 66 | 19900 | Inside | NaN | Reg |
| 76 | 8475 | Inside | NaN | IR1 |
| 84 | 8530 | Inside | NaN | IR1 |
| 95 | 9765 | Corner | NaN | IR2 |
| 100 | 10603 | Inside | NaN | IR1 |
| 104 | 7758 | Corner | NaN | Reg |
| 111 | 7750 | Inside | NaN | IR1 |
| 113 | 21000 | Corner | NaN | Reg |
| 116 | 11616 | Inside | NaN | Reg |
| 120 | 21453 | CulDSac | NaN | IR1 |
| 126 | 4928 | Inside | NaN | IR1 |
| 131 | 12224 | Corner | NaN | IR1 |
| 133 | 6853 | Inside | NaN | IR1 |
| 136 | 10355 | Corner | NaN | IR1 |
| 147 | 9505 | CulDSac | NaN | IR1 |
| 149 | 6240 | Inside | NaN | Reg |
| 152 | 14803 | CulDSac | NaN | IR1 |
| 153 | 13500 | Inside | NaN | Reg |
| 160 | 11120 | CulDSac | NaN | IR1 |
| 166 | 10708 | Inside | NaN | IR1 |
| ... | ... | ... | ... | ... |
| 1224 | 12585 | Inside | NaN | IR1 |
| 1240 | 9019 | Inside | NaN | IR1 |
| 1243 | 9240 | Inside | NaN | IR1 |
| 1244 | 9308 | CulDSac | NaN | IR1 |
| 1246 | 8638 | Inside | NaN | IR1 |
| 1247 | 13052 | CulDSac | NaN | IR1 |
| 1248 | 8020 | Inside | NaN | IR1 |
| 1249 | 8789 | Inside | NaN | IR1 |
| 1254 | 2998 | Inside | NaN | Reg |
| 1255 | 4447 | FR2 | NaN | IR1 |
| 1264 | 9759 | Inside | NaN | IR1 |
| 1267 | 10368 | CulDSac | NaN | IR1 |
| 1277 | 8917 | Inside | NaN | IR1 |
| 1278 | 12700 | Inside | NaN | IR1 |
| 1281 | 9610 | Corner | NaN | IR1 |
| 1304 | 18275 | Inside | NaN | IR1 |
| 1347 | 11327 | Inside | NaN | IR1 |
| 1350 | 9535 | Inside | NaN | IR1 |
| 1351 | 7176 | CulDSac | NaN | IR1 |
| 1352 | 9662 | Inside | NaN | IR1 |
| 1354 | 17529 | Inside | NaN | Reg |
| 1355 | 20355 | Inside | NaN | Reg |
| 1358 | 1700 | Inside | NaN | Reg |
| 1379 | 8685 | CulDSac | NaN | IR1 |
| 1385 | 9930 | Inside | NaN | IR1 |
| 1387 | 11088 | Corner | NaN | Reg |
| 1390 | 21533 | FR2 | NaN | IR2 |
| 1440 | 50102 | Inside | NaN | IR1 |
| 1441 | 8098 | Inside | NaN | IR1 |
| 1448 | 11836 | Corner | NaN | IR1 |
486 rows × 4 columns
There isn’t quite a clear explanation for why “Lot Frontage” has NaN values, so here the mean will be substituted in.
print(df['LotFrontage'].unique())
df['LotFrontage'].hist(bins=20)
df['LotFrontage'].fillna(np.mean(df['LotFrontage']), inplace=True)
[ 65. 80. 68. 60. 84. 85. 75. nan 51. 50. 70. 91.
72. 66. 101. 57. 44. 110. 98. 47. 108. 112. 74. 115.
61. 48. 33. 52. 100. 24. 89. 63. 76. 81. 95. 69.
21. 32. 78. 121. 122. 40. 105. 73. 77. 64. 94. 34.
90. 55. 88. 82. 71. 120. 107. 92. 134. 62. 86. 141.
97. 54. 41. 79. 174. 99. 67. 83. 43. 103. 93. 30.
129. 140. 35. 37. 118. 87. 116. 150. 111. 49. 96. 59.
36. 56. 102. 58. 38. 109. 130. 53. 137. 45. 106. 104.
42. 39. 144. 114. 128. 149. 313. 168. 182. 138. 160. 152.
124. 153. 46. 26. 25. 119. 31. 28. 117. 113. 125. 135.
136. 22. 123. 195. 155. 126. 200. 131. 133.]

For some cases marking as “unknown” is fine. Marking something as unknown is far less satisfying.
It’s not quite clear, for example, with Masonry Veneer Type (‘MasVnrType’) whether the NaNs and Nones are the same as both exist in the dataset as is; it’s only our educated best guess that they’re the same
For Misc. Features there’s an “other” category and a “None” category. They’re technically distinct but in this case “other” is vague enough that we will link them together
print(df.MSZoning.value_counts())
print(df.MiscFeature.value_counts())
df['MSZoning'].fillna('RL', inplace=True) # RL most common
df['MasVnrArea'].fillna(0,inplace=True) #fill in median np.mean(df['MasVnrArea']), inplace=True)
df['MasVnrType'].fillna('None', inplace=True)
df['MiscFeature'].fillna('Othr', inplace=True)
df['PoolQC'].fillna('No Pool', inplace=True)
RL 2265
RM 460
FV 139
RH 26
C (all) 25
Name: MSZoning, dtype: int64
Shed 95
Gar2 5
Othr 4
TenC 1
Name: MiscFeature, dtype: int64
df['CentralAir'] = df['CentralAir'].map({'Y': 1, 'N':0})
Here, note that ‘MSSubClass’ shoud be a categorical variable
#Strangely this code reduces my accuracy. Is information being lost, e.g. this is listed in an order of some kind?
df['MSSubClass']=df['MSSubClass'].map({20:'1-STORY 1946 & NEWER ALL STYLES', 30: '1-STORY 1945 & OLDER', 40: '1-STORY W/FINISHED ATTIC ALL AGES', 45: '1-1/2 STORY - UNFINISHED ALL AGES', 50: '1-1/2 STORY FINISHED ALL AGES', 60: '2-STORY 1946 & NEWER', 70: '2-STORY 1945 & OLDER', 75: '2-1/2 STORY ALL AGES', 80: 'SPLIT OR MULTI-LEVEL', 85: 'SPLIT FOYER', 90: 'DUPLEX - ALL STYLES AND AGES', 120: '1-STORY PUD (Planned Unit Development) - 1946 & NEWER', 150: '1-1/2 STORY PUD - ALL AGES', 160: '2-STORY PUD - 1946 & NEWER', 180: 'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER', 190: '2 FAMILY CONVERSION - ALL STYLES AND AGES'})
df.MSSubClass.value_counts()
1-STORY 1946 & NEWER ALL STYLES 1079
2-STORY 1946 & NEWER 575
1-1/2 STORY FINISHED ALL AGES 287
1-STORY PUD (Planned Unit Development) - 1946 & NEWER 182
1-STORY 1945 & OLDER 139
2-STORY PUD - 1946 & NEWER 128
2-STORY 1945 & OLDER 128
SPLIT OR MULTI-LEVEL 118
DUPLEX - ALL STYLES AND AGES 109
2 FAMILY CONVERSION - ALL STYLES AND AGES 61
SPLIT FOYER 48
2-1/2 STORY ALL AGES 23
1-1/2 STORY - UNFINISHED ALL AGES 18
PUD - MULTILEVEL - INCL SPLIT LEV/FOYER 17
1-STORY W/FINISHED ATTIC ALL AGES 6
1-1/2 STORY PUD - ALL AGES 1
Name: MSSubClass, dtype: int64
df['GarageType'].unique()
array(['Attchd', 'Detchd', 'BuiltIn', 'CarPort', 'None', 'Basment',
'2Types'], dtype=object)
df['Utilities'].unique()
array(['AllPub', 'NoSeWa'], dtype=object)
fig = plt.figure(figsize= (10,10))
sns.heatmap(df.corr())
/Users/rosscheung/anaconda/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
if self._edgecolors == str('face'):
<matplotlib.axes._subplots.AxesSubplot at 0x13fa52550>

Let’s plot some of the data that’s more strongly correlated with the Sale Price
fig = plt.figure(figsize= (10,10))
ax = fig.add_subplot(221)
ax.plot(df['GrLivArea'], df['SalePrice'], '.') # Note that there is a long tail; someone makes 8 million a year
ax.set_title('Above grade (ground) living area square feet')
ax = fig.add_subplot(222)
ax.plot(df['BsmtFinSF1'], df['SalePrice'], '.')
ax.set_title('Type 1 finished square feet')
ax = fig.add_subplot(223)
ax.plot(df['TotalBsmtSF'], df['SalePrice'], '.')
ax.set_title('Total square feet of basement area')
ax = fig.add_subplot(224)
ax.plot(df['GarageArea'], df['SalePrice'], '.')
ax.set_title('Size of garage in square feet')
<matplotlib.text.Text at 0x1406436a0>

It’s worth noting that there are two datapoints with large living area but low prices, which buck the trends
Deciding what to remove as outliers is always a tricky matter, which should be done sparingly and with thought. But later (after splitting the train/test sets) we will do it.
df[df['GrLivArea']>4000][['SalePrice', 'GrLivArea', 'BsmtFinSF1', 'TotalBsmtSF', 'GarageArea']]
| SalePrice | GrLivArea | BsmtFinSF1 | TotalBsmtSF | GarageArea | |
|---|---|---|---|---|---|
| 523 | 184750.0 | 4676 | 2260.0 | 3138.0 | 884.0 |
| 691 | 755000.0 | 4316 | 1455.0 | 2444.0 | 832.0 |
| 1182 | 745000.0 | 4476 | 2096.0 | 2396.0 | 813.0 |
| 1298 | 160000.0 | 5642 | 5644.0 | 6110.0 | 1418.0 |
| 1089 | NaN | 5095 | 4010.0 | 5095.0 | 1154.0 |
print(df.groupby('MoSold')['SalePrice'].mean())
plt.bar(np.arange(1,13), df.groupby('MoSold')['SalePrice'].mean(), )
df['MoSold'] = df['MoSold'].map({1: 'January', 2: 'February', 3: 'March', 4:'April', 5:'May', 6:'June', 7:'July',8:'August', 9:'September', 10:'October',11:'November',12:'December'})
MoSold
1 183256.258621
2 177882.000000
3 183253.924528
4 171503.262411
5 172307.269608
6 177395.735178
7 186331.192308
8 184651.827869
9 195683.206349
10 179563.977528
11 192210.911392
12 186518.966102
Name: SalePrice, dtype: float64

Feature Engineering midwesterners living the American dream
There appears to be two values where the living area is greater than 4000, but the sale price is super low. We may consider removing these as outliers much later (after splitting up the training and test sets)
By domain knowledge (thanks Michele!), a “4 bed-2 bath” house is especially prized. Let’s consider this special category of a house and flag it.
print(df['BedroomAbvGr'].unique())
print(df['FullBath'].unique())
df_4bed_2bath = df[(df['BedroomAbvGr'] ==4) & (df['FullBath']==2)]
df_everthingelse = df[~((df['BedroomAbvGr'] ==4) & (df['FullBath']==2))]
df_4bed_2bath.head()
fig = plt.figure(figsize= (10,10))
ax = fig.add_subplot(211)
ax.hist(df_everthingelse['SalePrice'].dropna(), 100)
ax.set_xlim([0, 500000])
ax = fig.add_subplot(212)
ax.hist(df_4bed_2bath['SalePrice'].dropna().values, 100)
ax.set_xlim([0, 500000])
df_4bed_2bath['SalePrice'].mean(), df_everthingelse['SalePrice'].mean()
[3 4 1 2 0 5 6 8]
[2 1 3 0 4]
(212340.12820512822, 177162.48926380367)

When implemented, this does improve score very slightly (0.00014)
df['4bed2bath'] = 0
df['4bed2bath']= (df['BedroomAbvGr'] ==4) & (df['FullBath']==2)
Can we create a new variable, floor space, out of existing ones?
For those of you doing the Titanic challenge, a similar trick can be done to create a “total family size” variable out of siblings and married couples, etc. It in fact adds more information.
df['TotalFloorSpace'] = df['1stFlrSF'] + df['2ndFlrSF'] + df['TotalBsmtSF']
Neighborhoods
For a lot of homeowners the choice of a neighborhood may influence buying houses for a number of factors, including the fact that neighborhood is part of people’s identity and outward appearance.
(note: the author grew up in an affluent suburb and has some familiarity with how yuppies think)
df['Neighborhood'].unique()
array(['CollgCr', 'Veenker', 'Crawfor', 'NoRidge', 'Mitchel', 'Somerst',
'NWAmes', 'OldTown', 'BrkSide', 'Sawyer', 'NridgHt', 'NAmes',
'SawyerW', 'IDOTRR', 'MeadowV', 'Edwards', 'Timber', 'Gilbert',
'StoneBr', 'ClearCr', 'NPkVill', 'Blmngtn', 'BrDale', 'SWISU',
'Blueste'], dtype=object)
neighborhood_price = df.groupby('Neighborhood')['SalePrice'].mean()
neighborhood_floorspace = df.groupby('Neighborhood')['GrLivArea'].mean()
print(neighborhood_price)
neighborhood_price.sort_values().plot(kind='bar')
Neighborhood
Blmngtn 194870.882353
Blueste 137500.000000
BrDale 104493.750000
BrkSide 124834.051724
ClearCr 212565.428571
CollgCr 197965.773333
Crawfor 210624.725490
Edwards 128219.700000
Gilbert 192854.506329
IDOTRR 100123.783784
MeadowV 98576.470588
Mitchel 156270.122449
NAmes 145847.080000
NPkVill 142694.444444
NWAmes 189050.068493
NoRidge 335295.317073
NridgHt 316270.623377
OldTown 128225.300885
SWISU 142591.360000
Sawyer 136793.135135
SawyerW 186555.796610
Somerst 225379.837209
StoneBr 310499.000000
Timber 242247.447368
Veenker 238772.727273
Name: SalePrice, dtype: float64
<matplotlib.axes._subplots.AxesSubplot at 0x14071a710>

neighborhood_floorspace.sort_values().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x1410c0be0>

Places like New York City and San Francisco are testimonials that floor space and wealthy areas aren’t necessarily correlated. However price, tempered by floor space is a useful metric for determining this.
Admittedly in creating a binary variable here, I just “eyeballed” a cutoff. Perhaps a more robust method can be determined in the future
# rich places: NoRidge, NridgeHt, StoneBr,
rich_neighborhoods = ['NoRidge', 'StoneBr', 'NridgHt', 'Veenker', 'ClearCr', 'Crawfor', 'Timber','Somerst']
df['RichNeighborhood'] = df['Neighborhood'].isin(rich_neighborhoods) * 1
df[['Neighborhood', 'RichNeighborhood']]
| Neighborhood | RichNeighborhood | |
|---|---|---|
| 0 | CollgCr | 0 |
| 1 | Veenker | 1 |
| 2 | CollgCr | 0 |
| 3 | Crawfor | 1 |
| 4 | NoRidge | 1 |
| 5 | Mitchel | 0 |
| 6 | Somerst | 1 |
| 7 | NWAmes | 0 |
| 8 | OldTown | 0 |
| 9 | BrkSide | 0 |
| 10 | Sawyer | 0 |
| 11 | NridgHt | 1 |
| 12 | Sawyer | 0 |
| 13 | CollgCr | 0 |
| 14 | NAmes | 0 |
| 15 | BrkSide | 0 |
| 16 | NAmes | 0 |
| 17 | Sawyer | 0 |
| 18 | SawyerW | 0 |
| 19 | NAmes | 0 |
| 20 | NridgHt | 1 |
| 21 | IDOTRR | 0 |
| 22 | CollgCr | 0 |
| 23 | MeadowV | 0 |
| 24 | Sawyer | 0 |
| 25 | NridgHt | 1 |
| 26 | NAmes | 0 |
| 27 | NridgHt | 1 |
| 28 | NAmes | 0 |
| 29 | BrkSide | 0 |
| ... | ... | ... |
| 1429 | IDOTRR | 0 |
| 1430 | IDOTRR | 0 |
| 1431 | IDOTRR | 0 |
| 1432 | IDOTRR | 0 |
| 1433 | IDOTRR | 0 |
| 1434 | Crawfor | 1 |
| 1435 | Crawfor | 1 |
| 1436 | Mitchel | 0 |
| 1437 | Mitchel | 0 |
| 1438 | Mitchel | 0 |
| 1439 | Mitchel | 0 |
| 1440 | Timber | 1 |
| 1441 | Timber | 1 |
| 1442 | Timber | 1 |
| 1443 | Timber | 1 |
| 1444 | Mitchel | 0 |
| 1445 | Mitchel | 0 |
| 1446 | MeadowV | 0 |
| 1447 | Mitchel | 0 |
| 1448 | Mitchel | 0 |
| 1449 | MeadowV | 0 |
| 1450 | MeadowV | 0 |
| 1451 | Mitchel | 0 |
| 1452 | MeadowV | 0 |
| 1453 | MeadowV | 0 |
| 1454 | MeadowV | 0 |
| 1455 | MeadowV | 0 |
| 1456 | Mitchel | 0 |
| 1457 | Mitchel | 0 |
| 1458 | Mitchel | 0 |
2919 rows × 2 columns
neighborhood_floorspace
Neighborhood
Blmngtn 1404.892857
Blueste 1159.700000
BrDale 1115.233333
BrkSide 1234.907407
ClearCr 1744.386364
CollgCr 1496.119850
Crawfor 1722.796117
Edwards 1337.737113
Gilbert 1620.896970
IDOTRR 1205.247312
MeadowV 1066.702703
Mitchel 1327.991228
NAmes 1292.054176
NPkVill 1244.086957
NWAmes 1688.770992
NoRidge 2480.633803
NridgHt 1942.638554
OldTown 1431.974895
SWISU 1636.479167
Sawyer 1183.026490
SawyerW 1604.064000
Somerst 1604.829670
StoneBr 1949.215686
Timber 1714.638889
Veenker 1819.541667
Name: GrLivArea, dtype: float64
df['Neighborhood'].head(10)
0 CollgCr
1 Veenker
2 CollgCr
3 Crawfor
4 NoRidge
5 Mitchel
6 Somerst
7 NWAmes
8 OldTown
9 BrkSide
Name: Neighborhood, dtype: object
Difference in house floor space from neighbords
Let’s create a new variable that is the ratio between a home, and the mean floor space of the neighborhood, assuming people don’t want to be stuck with the smallest home in a neighborhood. One has to show off to ones neighbors after all.
df['SizeDifferenceFromNeighbors'] = df['TotalFloorSpace'] / df['Neighborhood'].map(neighborhood_floorspace)
plt.scatter( df['SizeDifferenceFromNeighbors'],df['SalePrice'])
<matplotlib.collections.PathCollection at 0x1407557f0>
/Users/rosscheung/anaconda/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
if self._edgecolors == str('face'):

What about bedrooms and bathrooms?
If there are not enough bedrooms and bathrooms, especially with kids, people may be unhappy
df['numBathrooms'] = df['BsmtFullBath'] + df['FullBath'] + df['BsmtHalfBath']
# Question: what about basement half bathrooms?
df['Bathroom-Bedroom ratio'] = df['numBathrooms']/ (df['BedroomAbvGr']+1)
# There doesn't seem to be any improvement if this ratio is higher than 1 (house occupants can't use more than 1 at a time)
plt.scatter(df['Bathroom-Bedroom ratio'], df['SalePrice'])
<matplotlib.collections.PathCollection at 0x13fad84e0>
/Users/rosscheung/anaconda/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
if self._edgecolors == str('face'):

fig = plt.figure(figsize= (10,10))
ax = fig.add_subplot(321)
ax.plot(df['FullBath'], df['SalePrice'], '.')
ax.set_title('Number of Bathrooms')
ax = fig.add_subplot(322)
ax.plot(df['HalfBath'], df['SalePrice'], '.')
ax.set_title('Number of Half Bathrooms')
ax = fig.add_subplot(323)
ax.plot(df['numBathrooms'], df['SalePrice'], '.')
ax.set_title('Total Bathrooms')
ax = fig.add_subplot(324)
ax.plot(df['BedroomAbvGr'], df['SalePrice'], '.')
ax.set_title('Above ground Bedrooms')
ax = fig.add_subplot(325)
ax.plot(df['BedroomAbvGr'], df['SalePrice'], '.')
ax.set_title('Above ground Bedrooms')
df['BathroomSq'] = df['FullBath'] * df['FullBath']
df['TotalBathroomSq'] = df['numBathrooms'] * df['numBathrooms']

#df['KitchenQual'] = df['KitchenQual'].map({'Ex':5, 'Gd':4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0})
#df['FireplaceQu']= df['FireplaceQu'].map({'Ex':5, 'Gd':4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0})
#df['GarageQual']= df['GarageQual'].map({'Ex':5, 'Gd':4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0})
#df['GarageCond']= df['GarageCond'].map({'Ex':5, 'Gd':4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0})
#df['HeatingQC'] = df['HeatingQC'].map({'Ex':5, 'Gd':4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0})
#df['PoolQC'] = df['PoolQC'].map({'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'No Pool': 0})
#df['BsmtCond'] = df['BsmtCond'].map({'Ex':5, 'Gd':4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0})
#df['ExterCond']= df['ExterCond'].map({'Ex':5, 'Gd':4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0})
#df['ExterQual']= df['ExterQual'].map({'Ex':5, 'Gd':4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0})
#df['BsmtFinType2'].map({'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2,'Unf': 1, 'None': 0})
#df['BsmtFinType2']
#df['BsmtFinType1']
#df['Fence'].unique()
#df['GarageFinish'] = df['GarageFinish'].map({'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0})
#sns.barplot(x= 'Fence', y = 'SalePrice', data = df_train)
fig = plt.figure(figsize= (10,10))
ax = fig.add_subplot(321)
plt.plot(df['OverallCond'], df['SalePrice'], '.')
ax = fig.add_subplot(322)
plt.plot(df['YrSold'], df['SalePrice'], '.')
#ax = fig.add_subplot(223)
#plt.plot(df['Functional'], df['SalePrice'], '.')
ax = fig.add_subplot(323)
plt.plot(df['GrLivArea'], df['SalePrice'], '.')
ax = fig.add_subplot(324)
plt.plot(df['LotFrontage'], df['SalePrice'], '.')
#plt.plot(df['LowQualFinSF'], df['SalePrice'], '.')
ax = fig.add_subplot(325)
plt.plot(df['LotArea'], df['SalePrice'], '.')
ax = fig.add_subplot(326)
plt.plot(np.log1p(df['LotArea']), df['SalePrice'], '.')
[<matplotlib.lines.Line2D at 0x1382315c0>]

df['YrSold'] = df['YrSold'].astype('str')
plt.figure(1)
plt.plot(df_train['YearBuilt'], df_train['SalePrice'], '.')
#plt.figure(1)
#plt.plot(df_train['YearRemodAdd'], df_train['SalePrice'], '.r')
[<matplotlib.lines.Line2D at 0x13f2fa8d0>]

#Heating
#plt.plot( df_train['SalePrice'],df_train['SaleType'], kind='bar')
#df['YearRemodAdd'] = 2017 - df['YearRemodAdd']
#df['YearBuilt'] = 2017 - df['YearBuilt']
print(df['Condition1'].unique())
print(df['Condition2'].unique())
['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']
['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']
df['Location_Norm']= (df['Condition1'] == 'Norm') | (df['Condition2'] == 'Norm')
df['Location_Feedr'] = ((df['Condition1'] == 'Feedr') | (df['Condition2'] == 'Feedr'))
df['Location_Artery'] = ((df['Condition1'] == 'Artery') | (df['Condition2'] == 'Artery'))
df['Location_RRNn'] = ((df['Condition1'] == 'RRNn') | (df['Condition2'] == 'RRNn'))
df['Location_RRAn'] = ((df['Condition1'] == 'RRAn') | (df['Condition2'] == 'RRAn'))
df['Location_PosN'] = ((df['Condition1'] == 'PosN') | (df['Condition2'] == 'PosN'))
df['Location_PosA'] = ((df['Condition1'] == 'PosA') | (df['Condition2'] == 'PosA'))
df['Location_RRNe'] = ((df['Condition1'] == 'RRNe') | (df['Condition2'] == 'RRNe'))
df['Location_RRAe'] = ((df['Condition1'] == 'RRAe') | (df['Condition2'] == 'RRAe'))
df['Location_Norm'] = df['Location_Norm'].map(lambda x: 1 if x else 0)
df['Location_Feedr'] = df['Location_Feedr'].map(lambda x: 1 if x else 0)
df['Location_Artery'] = df['Location_Artery'].map(lambda x: 1 if x else 0)
df['Location_RRNn'] = df['Location_RRNn'].map(lambda x: 1 if x else 0)
df['Location_RRAn'] = df['Location_RRAn'].map(lambda x: 1 if x else 0)
df['Location_PosN'] = df['Location_PosN'].map(lambda x: 1 if x else 0)
df['Location_PosA'] = df['Location_PosA'].map(lambda x: 1 if x else 0)
df['Location_RRNe'] = df['Location_RRNe'].map(lambda x: 1 if x else 0)
df['Location_RRAe'] = df['Location_RRAe'].map(lambda x: 1 if x else 0)
df.drop(['Condition1', 'Condition2'], axis=1, inplace=True)
df['Exterior1st'].unique()
array(['VinylSd', 'MetalSd', 'Wd Sdng', 'HdBoard', 'BrkFace', 'WdShing',
'CemntBd', 'Plywood', 'AsbShng', 'Stucco', 'BrkComm', 'AsphShn',
'Stone', 'ImStucc', 'CBlock'], dtype=object)
Note: to convert a boolean (True/False) to a number (0 or 1), another way is to just do an arithmetic option like adding 0 or multiplying by 1. Then Python will just convert the boolean to a numeric.
#Repeat for exterior 1st and 2nd?
df['Exterior_AsbShng']= ((df['Exterior1st'] == 'AsbShng') | (df['Exterior2nd'] == 'AsbShng')) + 0
df['Exterior_AsphShn']= ((df['Exterior1st'] == 'AsphShn') | (df['Exterior2nd'] == 'AsphShn')) + 0
df['Exterior_BrkComm']= ((df['Exterior1st'] == 'BrkComm') | (df['Exterior2nd'] == 'BrkComm')) + 0
df['Exterior_BrkFace']= ((df['Exterior1st'] == 'BrkFace') | (df['Exterior2nd'] == 'BrkFace')) + 0
df['Exterior_CBlock']= ((df['Exterior1st'] == 'CBlock') | (df['Exterior2nd'] == 'CBlock')) + 0
df['Exterior_CemntBd']= ((df['Exterior1st'] == 'CemntBd') | (df['Exterior2nd'] == 'CemntBd')) + 0
df['Exterior_HdBoard']= ((df['Exterior1st'] == 'HdBoard') | (df['Exterior2nd'] == 'HdBoard')) + 0
df['Exterior_ImStucc']= ((df['Exterior1st'] == 'ImStucc') | (df['Exterior2nd'] == 'ImStucc')) + 0
df['Exterior_MetalSd']= ((df['Exterior1st'] == 'MetalSd') | (df['Exterior2nd'] == 'MetalSd')) + 0
df['Exterior_Other']= ((df['Exterior1st'] == 'Other') | (df['Exterior2nd'] == 'Other')) + 0
df['Exterior_Plywood']= ((df['Exterior1st'] == 'Plywood') | (df['Exterior2nd'] == 'Plywood')) + 0
df['Exterior_PreCast']= ((df['Exterior1st'] == 'PreCast') | (df['Exterior2nd'] == 'PreCast')) + 0
df['Exterior_Stone']= ((df['Exterior1st'] == 'Stone') | (df['Exterior2nd'] == 'Stone')) + 0
df['Exterior_Stucco']= ((df['Exterior1st'] == 'Stucco') | (df['Exterior2nd'] == 'Stucco')) + 0
df['Exterior_VinylSd']= ((df['Exterior1st'] == 'VinylSd') | (df['Exterior2nd'] == 'VinylSd')) + 0
df['Exterior_Wd_Sdng']= ((df['Exterior1st'] == 'Wd Sdng') | (df['Exterior2nd'] == 'Wd Sdng')) + 0
df['Exterior_WdShing']= ((df['Exterior1st'] == 'WdShing') | (df['Exterior2nd'] == 'WdShing')) + 0
#df[['Exterior1st', 'Exterior2nd', 'Exterior_VinylSd', 'Exterior_AsphShn','Exterior_BrkComm']]
df.drop(['Exterior1st', 'Exterior2nd'], axis=1, inplace=True)
df['BsmtFinType2'].unique()
array(['Unf', 'BLQ', 'None', 'ALQ', 'Rec', 'LwQ', 'GLQ'], dtype=object)
df['BsmtFinType_GLQ']= ((df['BsmtFinType1'] == 'GLQ') | (df['BsmtFinType2'] == 'GLQ')) + 0
df['BsmtFinType_ALQ']= ((df['BsmtFinType1'] == 'ALQ') | (df['BsmtFinType2'] == 'ALQ')) + 0
df['BsmtFinType_BLQ']= ((df['BsmtFinType1'] == 'BLQ') | (df['BsmtFinType2'] == 'BLQ')) + 0
df['BsmtFinType_Rec']= ((df['BsmtFinType1'] == 'Rec') | (df['BsmtFinType2'] == 'Rec')) + 0
df['BsmtFinType_LwQ']= ((df['BsmtFinType1'] == 'LwQ') | (df['BsmtFinType2'] == 'LwQ')) + 0
df['BsmtFinType_Unf']= ((df['BsmtFinType1'] == 'Unf') | (df['BsmtFinType2'] == 'Unf')) + 0
df['BsmtFinType_None']= ((df['BsmtFinType1'] == 'None') | (df['BsmtFinType2'] == 'None')) + 0
df.drop(['BsmtFinType1', 'BsmtFinType2'], axis=1, inplace=True)
Experiment with dropping rows with next to no information
correlations= df.corr()['SalePrice'].sort_values(ascending=False )
print(correlations)
SalePrice 1.000000
OverallQual 0.790982
TotalFloorSpace 0.782260
GrLivArea 0.708624
GarageCars 0.640409
GarageArea 0.623431
TotalBsmtSF 0.613581
1stFlrSF 0.605852
RichNeighborhood 0.585580
BathroomSq 0.577302
numBathrooms 0.574226
TotalBathroomSq 0.561809
FullBath 0.560664
TotRmsAbvGrd 0.533723
YearBuilt 0.522897
YearRemodAdd 0.507101
MasVnrArea 0.472614
GarageYrBlt 0.471062
Fireplaces 0.466929
SizeDifferenceFromNeighbors 0.436532
BsmtFinType_GLQ 0.430415
Bathroom-Bedroom ratio 0.408142
BsmtFinSF1 0.386420
LotFrontage 0.334820
WoodDeckSF 0.324413
2ndFlrSF 0.319334
OpenPorchSF 0.315856
Exterior_VinylSd 0.302553
HalfBath 0.284108
LotArea 0.263843
...
Location_RRAn 0.002967
Exterior_Stone -0.000998
Location_RRNn -0.001367
BsmtFinSF2 -0.011378
BsmtHalfBath -0.016844
MiscVal -0.021190
Id -0.021917
Exterior_AsphShn -0.024524
Exterior_CBlock -0.025028
LowQualFinSF -0.025606
Exterior_Stucco -0.038470
Location_RRAe -0.043813
Exterior_BrkComm -0.051264
Exterior_WdShing -0.051317
Exterior_Plywood -0.054547
OverallCond -0.077856
BsmtFinType_ALQ -0.086367
Exterior_HdBoard -0.091262
BsmtFinType_LwQ -0.091463
Location_Artery -0.106401
Exterior_AsbShng -0.109712
Location_Feedr -0.123694
EnclosedPorch -0.128578
KitchenAbvGr -0.135907
BsmtFinType_Rec -0.139119
BsmtFinType_None -0.145274
BsmtFinType_BLQ -0.146839
Exterior_Wd_Sdng -0.154201
Exterior_MetalSd -0.168906
Exterior_PreCast NaN
Name: SalePrice, dtype: float64
#df['OverallQualSq'] = df['OverallQual']**2
#df['TotalFloorSpaceSq']= df['TotalFloorSpace'] ** 2
#df['GrLivAreaSq']=df['GrLivArea'] ** 2
#df['GarageCarsSq'] =df['GarageCars'] ** 2
#df['GarageAreaSq'] = df['GarageArea'] ** 2
#correlations= df.corr()['SalePrice'].sort_values(ascending=False )
#print(correlations)
fig = plt.figure(figsize= (10,10))
values_to_consider = ['SalePrice', 'OverallQual', 'TotalFloorSpace', 'GrLivArea','GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'FullBath','TotRmsAbvGrd', 'GarageYrBlt', 'YearRemodAdd', 'YearBuilt']
#sns.heatmap(df[['SalePrice', 'GarageArea', 'GarageCars']].corr(), annot=True)
sns.heatmap(df[values_to_consider].corr(), annot=True)
/Users/rosscheung/anaconda/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
if self._edgecolors == str('face'):
<matplotlib.axes._subplots.AxesSubplot at 0x133baff60>

Multicollinearity
It was found that dropping one of garagecars and garagearea greatly improves the prediction. MMulticollinearity is s genuine concern! The same is true of “TotRmsAbvGrd” and “GrLivArea”.
df.drop('GarageArea',axis=1, inplace=True)
df.drop('TotRmsAbvGrd', axis=1,inplace=True)
#df.drop('GrLivArea', axis=1,inplace=True)
#What about garag year and
Let’s fit the log(1+x) of salesprice
Sales price is initially skewed, but the log should fix this and make it gaussian. This should be okay as long as we remember to do the inverse of this to our final modeled sales prices.
See: https://docs.scipy.org/doc/numpy/reference/generated/numpy.log1p.html
df['SalePrice'] = np.log1p(df['SalePrice'])
#hist(df['SalePrice'], 100)
plt.hist(df['SalePrice'].dropna(), 50)
(array([ 2., 2., 1., 0., 0., 0., 2., 3., 4.,
3., 5., 1., 5., 21., 22., 23., 18., 29.,
58., 56., 65., 100., 122., 93., 90., 82., 108.,
91., 64., 55., 58., 51., 46., 42., 23., 29.,
22., 13., 13., 13., 7., 5., 4., 1., 2.,
2., 2., 0., 0., 2.]),
array([ 10.46027076, 10.52175483, 10.5832389 , 10.64472298,
10.70620705, 10.76769112, 10.82917519, 10.89065926,
10.95214334, 11.01362741, 11.07511148, 11.13659555,
11.19807962, 11.25956369, 11.32104777, 11.38253184,
11.44401591, 11.50549998, 11.56698405, 11.62846813,
11.6899522 , 11.75143627, 11.81292034, 11.87440441,
11.93588849, 11.99737256, 12.05885663, 12.1203407 ,
12.18182477, 12.24330884, 12.30479292, 12.36627699,
12.42776106, 12.48924513, 12.5507292 , 12.61221328,
12.67369735, 12.73518142, 12.79666549, 12.85814956,
12.91963363, 12.98111771, 13.04260178, 13.10408585,
13.16556992, 13.22705399, 13.28853807, 13.35002214,
13.41150621, 13.47299028, 13.53447435]),
<a list of 50 Patch objects>)

plt.scatter(df['SalePrice'], df['SizeDifferenceFromNeighbors'])
<matplotlib.collections.PathCollection at 0x140935978>
/Users/rosscheung/anaconda/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
if self._edgecolors == str('face'):

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(221)
df['TotalFloorSpace'].hist(bins=20)
ax.set_title('Total Floor Space')
ax = fig.add_subplot(222)
(df['GrLivArea']).hist(bins=20)
ax.set_title('Above grade (ground) living area square feet')
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/Users/rosscheung/anaconda/lib/python3.4/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
2133 try:
-> 2134 return self._engine.get_loc(key)
2135 except KeyError:
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()
KeyError: 'GrLivArea'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-1142-a97cfd45e2c4> in <module>()
6
7 ax = fig.add_subplot(222)
----> 8 (df['GrLivArea']).hist(bins=20)
9 ax.set_title('Above grade (ground) living area square feet')
/Users/rosscheung/anaconda/lib/python3.4/site-packages/pandas/core/frame.py in __getitem__(self, key)
2057 return self._getitem_multilevel(key)
2058 else:
-> 2059 return self._getitem_column(key)
2060
2061 def _getitem_column(self, key):
/Users/rosscheung/anaconda/lib/python3.4/site-packages/pandas/core/frame.py in _getitem_column(self, key)
2064 # get column
2065 if self.columns.is_unique:
-> 2066 return self._get_item_cache(key)
2067
2068 # duplicate columns & possible reduce dimensionality
/Users/rosscheung/anaconda/lib/python3.4/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
1384 res = cache.get(item)
1385 if res is None:
-> 1386 values = self._data.get(item)
1387 res = self._box_item_values(item, values)
1388 cache[item] = res
/Users/rosscheung/anaconda/lib/python3.4/site-packages/pandas/core/internals.py in get(self, item, fastpath)
3541
3542 if not isnull(item):
-> 3543 loc = self.items.get_loc(item)
3544 else:
3545 indexer = np.arange(len(self.items))[isnull(self.items)]
/Users/rosscheung/anaconda/lib/python3.4/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
2134 return self._engine.get_loc(key)
2135 except KeyError:
-> 2136 return self._engine.get_loc(self._maybe_cast_indexer(key))
2137
2138 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()
KeyError: 'GrLivArea'

#Observation: The various "square feet" variables are also skewed. Let's do the same for them
#df['TotalFloorSpace'] = np.log1p(df['TotalFloorSpace'])
#df['GrLivArea'] = np.log1p(df['GrLivArea'])
#df['TotalBsmtSF'] = np.log1p(df['TotalBsmtSF'])
#df['2ndFlrSF'] = np.log1p(df['2ndFlrSF'])
#df['1stFlrSF'] = np.log1p(df['1stFlrSF'])
#df['BsmtFinSF1'] = np.log1p(df['BsmtFinSF1'])
#df['BsmtFinSF2'] = np.log1p(df['BsmtFinSF2'])
#plt.hist(.dropna(), 100)
df['LotArea'] = np.log1p(df['LotArea'])
#sns.distplot(df['LotArea'])
df.drop(['Neighborhood'], axis=1, inplace=True)
Creating Dummy Variables
The dataset should be cleaned enough that we can create dummy variables. Be warned, as you can see, the list of variables grows by a lot.
df_dummies = pd.get_dummies(df, drop_first=True)
df_dummies.info()
col_names = list(df_dummies.columns.values)
print(col_names)
#col_names
#neighborhood_corr = df[['SalePrice', 'Neighborhood_Blueste', 'Neighborhood_BrDale', 'Neighborhood_BrkSide', 'Neighborhood_ClearCr']]
Let’s split into dependent and independent variables
dependent_vars = df_dummies['SalePrice']
df_houseprice = df_dummies[['Id', 'SalePrice']][1460:]
#not_sure = df_dummies[['MiscVal', 'Utilities_NoSeWa']]
#independent_vars = df_dummies[['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BedroomAbvGr', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF', 'CentralAir', 'EnclosedPorch', 'Fireplaces', 'FullBath', 'GarageArea', 'GarageCars', 'GarageYrBlt', 'GrLivArea', 'HalfBath', 'KitchenAbvGr', 'LotArea', 'LotFrontage', 'LowQualFinSF', 'MSSubClass', 'MasVnrArea', 'OpenPorchSF', 'OverallCond', 'OverallQual', 'PoolArea', 'ScreenPorch', 'TotRmsAbvGrd', 'TotalBsmtSF', 'WoodDeckSF', 'YearBuilt', 'YearRemodAdd', 'YrSold', 'Alley_None', 'Alley_Pave', 'BldgType_2fmCon', 'BldgType_Duplex', 'BldgType_Twnhs', 'BldgType_TwnhsE', 'BsmtCond_Gd', 'BsmtCond_None', 'BsmtCond_Po', 'BsmtCond_TA', 'BsmtExposure_Gd', 'BsmtExposure_Mn', 'BsmtExposure_No', 'BsmtExposure_None', 'BsmtFinType1_BLQ', 'BsmtFinType1_GLQ', 'BsmtFinType1_LwQ', 'BsmtFinType1_None', 'BsmtFinType1_Rec', 'BsmtFinType1_Unf', 'BsmtFinType2_BLQ', 'BsmtFinType2_GLQ', 'BsmtFinType2_LwQ', 'BsmtFinType2_None', 'BsmtFinType2_Rec', 'BsmtFinType2_Unf', 'BsmtQual_Fa', 'BsmtQual_Gd', 'BsmtQual_None', 'BsmtQual_TA', 'Condition1_Feedr', 'Condition1_Norm', 'Condition1_PosA', 'Condition1_PosN', 'Condition1_RRAe', 'Condition1_RRAn', 'Condition1_RRNe', 'Condition1_RRNn', 'Condition2_Feedr', 'Condition2_Norm', 'Condition2_PosA', 'Condition2_PosN', 'Condition2_RRAe', 'Condition2_RRAn', 'Condition2_RRNn', 'Electrical_FuseF', 'Electrical_FuseP', 'Electrical_Mix', 'Electrical_SBrkr', 'ExterCond_Fa', 'ExterCond_Gd', 'ExterCond_Po', 'ExterCond_TA', 'ExterQual_Fa', 'ExterQual_Gd', 'ExterQual_TA', 'Exterior1st_AsphShn', 'Exterior1st_BrkComm', 'Exterior1st_BrkFace', 'Exterior1st_CBlock', 'Exterior1st_CemntBd', 'Exterior1st_HdBoard', 'Exterior1st_ImStucc', 'Exterior1st_MetalSd', 'Exterior1st_Plywood', 'Exterior1st_Stone', 'Exterior1st_Stucco', 'Exterior1st_VinylSd', 'Exterior1st_Wd Sdng', 'Exterior1st_WdShing', 'Exterior2nd_AsphShn', 'Exterior2nd_Brk Cmn', 'Exterior2nd_BrkFace', 'Exterior2nd_CBlock', 'Exterior2nd_CmentBd', 'Exterior2nd_HdBoard', 'Exterior2nd_ImStucc', 'Exterior2nd_MetalSd', 'Exterior2nd_Other', 'Exterior2nd_Plywood', 'Exterior2nd_Stone', 'Exterior2nd_Stucco', 'Exterior2nd_VinylSd', 'Exterior2nd_Wd Sdng', 'Exterior2nd_Wd Shng', 'Fence_GdWo', 'Fence_MnPrv', 'Fence_MnWw', 'Fence_None', 'FireplaceQu_Fa', 'FireplaceQu_Gd', 'FireplaceQu_None', 'FireplaceQu_Po', 'FireplaceQu_TA', 'Foundation_CBlock', 'Foundation_PConc', 'Foundation_Slab', 'Foundation_Stone', 'Foundation_Wood', 'Functional_Maj2', 'Functional_Min1', 'Functional_Min2', 'Functional_Mod', 'Functional_Sev', 'Functional_Typ', 'GarageCond_Fa', 'GarageCond_Gd', 'GarageCond_None', 'GarageCond_Po', 'GarageCond_TA', 'GarageFinish_None', 'GarageFinish_RFn', 'GarageFinish_Unf', 'GarageQual_Fa', 'GarageQual_Gd', 'GarageQual_None', 'GarageQual_Po', 'GarageQual_TA', 'GarageType_Attchd', 'GarageType_Basment', 'GarageType_BuiltIn', 'GarageType_CarPort', 'GarageType_Detchd', 'GarageType_None', 'Heating_GasA', 'Heating_GasW', 'Heating_Grav', 'Heating_OthW', 'Heating_Wall', 'HeatingQC_Fa', 'HeatingQC_Gd', 'HeatingQC_Po', 'HeatingQC_TA', 'HouseStyle_1.5Unf', 'HouseStyle_1Story', 'HouseStyle_2.5Fin', 'HouseStyle_2.5Unf', 'HouseStyle_2Story', 'HouseStyle_SFoyer', 'HouseStyle_SLvl', 'KitchenQual_Fa', 'KitchenQual_Gd', 'KitchenQual_TA', 'LandContour_HLS', 'LandContour_Low', 'LandContour_Lvl', 'LandSlope_Mod', 'LandSlope_Sev', 'LotConfig_CulDSac', 'LotConfig_FR2', 'LotConfig_FR3', 'LotConfig_Inside', 'LotShape_IR2', 'LotShape_IR3', 'LotShape_Reg', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM', 'MSZoning_Unknown', 'MasVnrType_BrkFace', 'MasVnrType_None', 'MasVnrType_Stone', 'MiscFeature_Othr', 'MiscFeature_Shed', 'MiscFeature_TenC', 'Neighborhood_Blueste', 'Neighborhood_BrDale', 'Neighborhood_BrkSide', 'Neighborhood_ClearCr', 'Neighborhood_CollgCr', 'Neighborhood_Crawfor', 'Neighborhood_Edwards', 'Neighborhood_Gilbert', 'Neighborhood_IDOTRR', 'Neighborhood_MeadowV', 'Neighborhood_Mitchel', 'Neighborhood_NAmes', 'Neighborhood_NPkVill', 'Neighborhood_NWAmes', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_OldTown', 'Neighborhood_SWISU', 'Neighborhood_Sawyer', 'Neighborhood_SawyerW', 'Neighborhood_Somerst', 'Neighborhood_StoneBr', 'Neighborhood_Timber', 'Neighborhood_Veenker', 'PavedDrive_P', 'PavedDrive_Y', 'PoolQC_Fa', 'PoolQC_Gd', 'PoolQC_No Pool', 'RoofMatl_CompShg', 'RoofMatl_Membran', 'RoofMatl_Metal', 'RoofMatl_Roll', 'RoofMatl_Tar&Grv', 'RoofMatl_WdShake', 'RoofMatl_WdShngl', 'RoofStyle_Gable', 'RoofStyle_Gambrel', 'RoofStyle_Hip', 'RoofStyle_Mansard', 'RoofStyle_Shed', 'SaleCondition_AdjLand', 'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal', 'SaleCondition_Partial', 'SaleType_CWD', 'SaleType_Con', 'SaleType_ConLD', 'SaleType_ConLI', 'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth', 'SaleType_WD', 'Street_Pave', 'Utilities_NoSeWa']]
vars_to_fit = col_names
vars_to_fit.remove('SalePrice')
vars_to_fit.remove('Id')
#vars_to_fit.remove('MiscVal')
#vars_to_fit.remove('MoSold')
vars_to_fit.remove('Utilities_NoSeWa')
independent_vars = df_dummies[vars_to_fit]
x_unknown = independent_vars[1460:]
x = independent_vars[0:1460]
y = dependent_vars[0:1460]
Now, after data cleaning but before splitting into test/train sets, let’s remove those two outliers
#x[x['GrLivArea']>5000][['GrLivArea', 'BsmtFinSF1', 'TotalBsmtSF', 'GarageArea']]
print(x[x['GrLivArea']>4500][['GrLivArea', 'BsmtFinSF1', 'TotalBsmtSF']])
outlier_index = x[x['GrLivArea']>4500][['GrLivArea', 'BsmtFinSF1', 'TotalBsmtSF']].index
x.drop(outlier_index, axis=0, inplace=True)
y.drop(outlier_index,inplace=True)
x.info()
x.reset_index(inplace=True)
x.drop('index',axis=1, inplace=True)
#y.reset_index(inplace=True)
#y.drop('index',axis=1, inplace=True)
y.shape
Split training data into train/test datasets, so that we can diagnose how good each algorithm is
x_train, x_test, y_train, y_test = train_test_split(x,y, train_size = 0.8, random_state = 0)
Now let’s perform our fits
#parameters = { 'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.8, 1.0]} # could in theory test more but whatever
#clf = GridSearchCV(linear_model.Ridge(), parameters)
#clf.fit(x_train, y_train)
#clf.best_params_
#ridgereg = linear_model.Ridge(alpha = 1.0) #alpha is learning rate, Can change (to 0.01, etc)
ridgereg = linear_model.Ridge(alpha = 0.05) #alpha is learning rate, Can change (to 0.01, etc)
ridgereg.fit(x_train, y_train)
y_pred_ridge = ridgereg.predict(x_test)
ridge_corr = ridgereg.score(x_test, y_test)
ridge_error = mean_squared_error(y_test, y_pred_ridge)**0.5
ridge_error, ridge_corr
#(0.11030102597135479, 0.92821064496608097)
#lassoreg = linear_model.Lasso(alpha = .001)
lassoreg = linear_model.Lasso(alpha = .01)
lassoreg.fit(x_train, y_train)
y_pred_lasso = lassoreg.predict(x_test)
lasso_corr = lassoreg.score(x_test, y_test)
lasso_error = mean_squared_error(y_test, y_pred_lasso)**0.5
lasso_error, lasso_corr
#parameters = {'learning_rate': [0.02, 0.03, 0.04, 0.05, 0.06], 'n_estimators': [ 1800, 1900, 2000, 2100, 2200]} # could in theory test more but whatever
#clf = GridSearchCV(GradientBoostingRegressor(), parameters)
#clf.fit(x_train, y_train)
#clf.best_params_
# Do a gridsearch in sklearn
#
# Lasso does feature selection, then do gradient boosting?
# results of gridsearch
# {'learning_rate': 0.03, 'n_estimators': 2000}
#{'learning_rate': 0.05, 'n_estimators': 1900}
#'learning_rate': [0.02, 0.03, 0.04, 0.05, 0.06], 'n_estimators': [1700, 1800, 1900, 2000 ] {'learning_rate': 0.03, 'n_estimators': 1800}
#GradientBoosting = GradientBoostingRegressor(learning_rate = 0.02, n_estimators=3000, loss="huber")
#GradientBoosting = GradientBoosting.fit(x_train, y_train)
#y_pred_gradientboosting = GradientBoosting.predict(x_test)
#gradient_corr = GradientBoosting.score(x_test, y_test)
#gradient_error = mean_squared_error(y_test, y_pred_gradientboosting)**0.5
#gradient_error, gradient_corr
#parameters = { 'n_estimators': [1900, 2000, 2100, 2200, 2300]} # could in theory test more but whatever
#clf = GridSearchCV(RandomForestRegressor(), parameters)
#clf.fit(x_train, y_train)
#clf.best_params_
#RandomForest = RandomForestRegressor(n_estimators=1900)
#RandomForest = RandomForest.fit(x_train, y_train)
#y_pred_randomforest = RandomForest.predict(x_test)
#randomforest_corr = RandomForest.score(x_test, y_test)
#randomforest_error = mean_squared_error(y_test, y_pred_randomforest)**0.5
#randomforest_error, randomforest_corr
#parameters = { 'learning_rate': [0.1, 0.5, 0.7, 1.0],'n_estimators': [1900, 2000, 2100]} # could in theory test more but whatever
#clf = GridSearchCV(AdaBoostRegressor(), parameters)
#clf.fit(x_train, y_train)
#clf.best_params_
#AdaBoost= AdaBoostRegressor(learning_rate= 1.0, n_estimators=2000)
#AdaBoost = AdaBoost.fit(x_train, y_train)
#y_pred_adaboost = AdaBoost.predict(x_test)
#adaboost_corr = AdaBoost.score(x_test, y_test)
#adaboost_error = mean_squared_error(y_test, y_pred_adaboost)**0.5
#adaboost_error, adaboost_corr
#parameters = { 'learning_rate': [0.001, 0.01, 0.02, 0.1, 0.5, 0.7, 1.0],'n_estimators': [1000, 3000, 7200, 9000], } # could in theory test more but whatever
#clf = GridSearchCV(xgb.XGBRegressor(), parameters)
#clf.fit(x_train, y_train)
#clf.best_params_
#parameters = { 'learning_rate': [0.03],'n_estimators': [2500,2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000], } # could in theory test more but whatever
#clf = GridSearchCV(xgb.XGBRegressor(), parameters)
#clf.fit(x_train, y_train)
#clf.best_params_
#xgboost = xgb.XGBRegressor(learning_rate = 0.03, n_estimators=1275, max_depth=3)
xgboost = xgb.XGBRegressor(learning_rate = 0.03, n_estimators=2600, max_depth=3)
xgboost = xgboost.fit(x_train, y_train)
y_pred_xgboost = xgboost.predict(x_test)
xgboost_corr = xgboost.score(x_test, y_test)
xgboost_error = mean_squared_error(y_test, y_pred_xgboost)**0.5
xgboost_error, xgboost_corr
#(0.11091205185460548, 0.92741307029539688)
#(0.11286956997136902, 0.92482824524109142)
#(0.11129650831565363, 0.92690897934993766)
#(0.10996132516749133, 0.92865215216847619) 0.03, 4000
#(0.10991824002441893, 0.92870805236778797) 0.03, 3000
#(0.10961827680696469, 0.92909662814121641) 0.03, 2000
#(0.10916191236725561, 0.92968577116229045) 0.03, 1500
#(0.10911116020722886, 0.92975113772376539) 0.03, 1250
#### enet = ElasticNet(alpha = 1.0, l1_ratio=0.1)
#enet = enet.fit(x_train, y_train)
#y_pred_enet = enet.predict(x_test)
#enet_corr = enet.score(x_test, y_test)
#enet_error = mean_squared_error(y_test, y_pred_xgboost) **0.5
#enet_error, enet_corr
#parameters = {'l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
#clf = GridSearchCV(ElasticNet(), parameters)
#clf.fit(x_train, y_train)
#clf.best_params_
#y_ensemble_pred = (y_pred_gradientboosting/gradient_error**2 + y_pred_randomforest/randomforest_error**2 + y_pred_xgboost/xgboost_error**2+ y_pred_ridge/ridge_error**2+y_pred_lasso/lasso_error**2 )
#y_ensemble_pred = y_ensemble_pred / (1/randomforest_error**2 + 1/gradient_error**2 + 1/ridge_error**2 + 1/lasso_error**2 + 1/xgboost_error**2 )
y_ensemble_pred = ( y_pred_xgboost/xgboost_error**2+ y_pred_ridge/ridge_error**2+y_pred_lasso/lasso_error**2 )
y_ensemble_pred = y_ensemble_pred / ( 1/ridge_error**2 + 1/lasso_error**2 + 1/xgboost_error**2 )
ensemble_error = mean_squared_error(y_test, y_ensemble_pred)**0.5
ensemble_error
# Current highest scoring (read: lowest) ensemble error
#0.099778052606703235
#0.099586287880297747
# With Lasso and Ridge scores set quite low
Unsurprisingly the ensemble methods vastly outperform the linear ones
Now, output to a CSV, and remember to undo the log(x+1) done previously
#GradientBoosting = GradientBoosting.fit(x, y) # Fit again since we only used 80% of the data last time
#y_gradientboosting = np.expm1(GradientBoosting.predict(x_unknown))
#df_houseprice['SalePrice'] = y_gradientboosting
#df_houseprice.to_csv('output_gradient_boosting.csv',index=False)
#RandomForest = RandomForest.fit(x, y) # Fit again since we only used 80% of the data last time
#y_randomforest = np.expm1(RandomForest.predict(x_unknown))
#df_houseprice['SalePrice'] = y_randomforest
#df_houseprice.to_csv('output_random_forest.csv',index=False)
xgboost = xgboost.fit(x, y)
y_xgboost = np.expm1(xgboost.predict(x_unknown))
df_houseprice['SalePrice'] = y_xgboost
df_houseprice.to_csv('output_XGboost.csv',index=False)
#AdaBoost = AdaBoost.fit(x,y)
#y_adaboost = np.expm1(AdaBoost.predict(x_unknown))
#df_houseprice['SalePrice'] = y_adaboost
#df_houseprice.to_csv('output_adaboost.csv',index=False)
ridgereg = ridgereg.fit(x,y)
y_ridgereg = np.expm1(ridgereg.predict(x_unknown))
#df_houseprice['SalePrice'] = y_adaboost
#df_houseprice.to_csv('output_adaboost.csv',index=False)
lassoreg = lassoreg.fit(x,y)
y_lassoreg = np.expm1(lassoreg.predict(x_unknown))
#temp = GradientBoosting.predict(x_unknown)
#df_houseprice['SalePrice'].plot('hist', 100)
#temp
# To do: revisit "condition"
# Take the categorical variables which should be ordered, and make them ordered/numerical
# Calculate log of sales price, etc
# Neighborhood - do feature engineering on it
# Will dropping some highly correlated variables help?
# Raising more correlated features by power of 2 or 3
#
# Elastic Net? Extra Trees?
# Order those non-equal categorical variables!
# Bed/Bath ratio
df_houseprice.head(20)
# log average
#ensemble = np.exp((np.log(y_gradientboosting) + np.log(y_randomforest) + np.log(y_xgboost)+np.log(y_adaboost) + np.log(y_ridgereg)+np.log(y_lassoreg))/6)
#More standard mean
#ensemble = ((y_gradientboosting + y_randomforest + y_xgboost+y_adaboost + y_ridgereg+y_lassoreg)/6)
#ensemble = (y_gradientboosting/gradient_error + y_randomforest/randomforest_error + y_xgboost/xgboost_error+y_adaboost/adaboost_error + y_ridgereg/ridge_error+y_lassoreg/lasso_error)
#ensemble = ensemble / (1/randomforest_error + 1/gradient_error + 1/ridge_error + 1/lasso_error + 1/adaboost_error+ 1/xgboost_error)
#ensemble = (y_gradientboosting/gradient_error**2 + y_randomforest/randomforest_error**2 + y_xgboost/xgboost_error**2+ y_ridgereg/ridge_error**2+y_lassoreg/lasso_error**2)
#ensemble = ensemble / (1/randomforest_error**2 + 1/gradient_error**2 + 1/ridge_error**2 + 1/lasso_error**2 + 1/xgboost_error**2)
ensemble = ( y_xgboost/xgboost_error**2+ y_ridgereg/ridge_error**2+y_lassoreg/lasso_error**2)
ensemble = ensemble / ( 1/ridge_error**2 + 1/lasso_error**2 + 1/xgboost_error**2)
df_houseprice['SalePrice'] = ensemble
df_houseprice.to_csv('output_ensemble.csv',index=False)
xstuff = np.linspace(1.,len(y_xgboost),len(y_xgboost))
plt.plot(xstuff, ensemble, 'b-', xstuff, y_xgboost, 'r-')
Let’s try model stacking
Some resources consulted for this section:
https://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/
https://mlwave.com/kaggle-ensembling-guide/
Elements of Statistical Learning (http://web.stanford.edu/~hastie/ElemStatLearn/)
kfold = KFold(n_splits=5, shuffle = True)
#allFolds = kfold.split(x,y)
allBasePredictions = np.zeros((x.shape[0],3)) # One column for every base model to train
allBasePredictions
There’s a weird dataframe to numpy conversion that’s necessary to avoid an out of bounds error. See below link:
https://stackoverflow.com/questions/30023927/sklearn-cross-validation-stratifiedshufflesplit-error-indices-are-out-of-bou
x_numpy = x.values
for k, (index_train, index_test) in enumerate(kfold.split(x,y)):
# index_train.reshape(1, len(index_train))
first = x.loc[index_train].reset_index()
second = x.loc[index_test].reset_index()#.reindex()
lassoreg.fit(x.iloc[index_train], y[index_train])
y_predicted = lassoreg.predict(x.iloc[index_test])
allBasePredictions[index_test,0] = y_predicted
#x_numpy = x.values
#for k, (index_train, index_test) in enumerate(kfold.split(x,y)):
# ridgereg.fit(x_numpy[index_train], y[index_train])
# y_predicted = ridgereg.predict(x_numpy[index_test])
# allBasePredictions[index_test,1] = y_predicted
#x_numpy = x.values
#for k, (index_train, index_test) in enumerate(kfold.split(x,y)):
# xgboost.fit(x_numpy[index_train], y[index_train])
# y_predicted = xgboost.predict(x_numpy[index_test])
# allBasePredictions[index_test,2] = y_predicted
first.isnull().any()
first = x.loc[index_train].reset_index()
first.drop(['index'], axis=1, inplace=True)
first
x.loc[index_train].isnull()[1030:1040]
x[1290:1300]
index_train[1030:1040]
x.iloc[index_train[1030:1040]]
first[1030:1040]
z = y[index_train]
z
lassoreg.fit(first.dropna, z)
first.head()
first[first['1stFlrSF'].isnull()]
http://stackoverflow.com/questions/21320456/scikit-nan-or-infinity-error-message
x[1034]