Predicting Car Prices¶

Goal is to practice the ML workflow

In [1]:
import pandas as pd
import numpy as np
cols = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 
        'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 
        'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']
cars=pd.read_csv('imports-85.data',names=cols)
In [2]:
cars
Out[2]:
symboling normalized-losses make fuel-type aspiration num-of-doors body-style drive-wheels engine-location wheel-base ... engine-size fuel-system bore stroke compression-rate horsepower peak-rpm city-mpg highway-mpg price
0 3 ? alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.00 111 5000 21 27 13495
1 3 ? alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.00 111 5000 21 27 16500
2 1 ? alfa-romero gas std two hatchback rwd front 94.5 ... 152 mpfi 2.68 3.47 9.00 154 5000 19 26 16500
3 2 164 audi gas std four sedan fwd front 99.8 ... 109 mpfi 3.19 3.40 10.00 102 5500 24 30 13950
4 2 164 audi gas std four sedan 4wd front 99.4 ... 136 mpfi 3.19 3.40 8.00 115 5500 18 22 17450
5 2 ? audi gas std two sedan fwd front 99.8 ... 136 mpfi 3.19 3.40 8.50 110 5500 19 25 15250
6 1 158 audi gas std four sedan fwd front 105.8 ... 136 mpfi 3.19 3.40 8.50 110 5500 19 25 17710
7 1 ? audi gas std four wagon fwd front 105.8 ... 136 mpfi 3.19 3.40 8.50 110 5500 19 25 18920
8 1 158 audi gas turbo four sedan fwd front 105.8 ... 131 mpfi 3.13 3.40 8.30 140 5500 17 20 23875
9 0 ? audi gas turbo two hatchback 4wd front 99.5 ... 131 mpfi 3.13 3.40 7.00 160 5500 16 22 ?
10 2 192 bmw gas std two sedan rwd front 101.2 ... 108 mpfi 3.50 2.80 8.80 101 5800 23 29 16430
11 0 192 bmw gas std four sedan rwd front 101.2 ... 108 mpfi 3.50 2.80 8.80 101 5800 23 29 16925
12 0 188 bmw gas std two sedan rwd front 101.2 ... 164 mpfi 3.31 3.19 9.00 121 4250 21 28 20970
13 0 188 bmw gas std four sedan rwd front 101.2 ... 164 mpfi 3.31 3.19 9.00 121 4250 21 28 21105
14 1 ? bmw gas std four sedan rwd front 103.5 ... 164 mpfi 3.31 3.19 9.00 121 4250 20 25 24565
15 0 ? bmw gas std four sedan rwd front 103.5 ... 209 mpfi 3.62 3.39 8.00 182 5400 16 22 30760
16 0 ? bmw gas std two sedan rwd front 103.5 ... 209 mpfi 3.62 3.39 8.00 182 5400 16 22 41315
17 0 ? bmw gas std four sedan rwd front 110.0 ... 209 mpfi 3.62 3.39 8.00 182 5400 15 20 36880
18 2 121 chevrolet gas std two hatchback fwd front 88.4 ... 61 2bbl 2.91 3.03 9.50 48 5100 47 53 5151
19 1 98 chevrolet gas std two hatchback fwd front 94.5 ... 90 2bbl 3.03 3.11 9.60 70 5400 38 43 6295
20 0 81 chevrolet gas std four sedan fwd front 94.5 ... 90 2bbl 3.03 3.11 9.60 70 5400 38 43 6575
21 1 118 dodge gas std two hatchback fwd front 93.7 ... 90 2bbl 2.97 3.23 9.41 68 5500 37 41 5572
22 1 118 dodge gas std two hatchback fwd front 93.7 ... 90 2bbl 2.97 3.23 9.40 68 5500 31 38 6377
23 1 118 dodge gas turbo two hatchback fwd front 93.7 ... 98 mpfi 3.03 3.39 7.60 102 5500 24 30 7957
24 1 148 dodge gas std four hatchback fwd front 93.7 ... 90 2bbl 2.97 3.23 9.40 68 5500 31 38 6229
25 1 148 dodge gas std four sedan fwd front 93.7 ... 90 2bbl 2.97 3.23 9.40 68 5500 31 38 6692
26 1 148 dodge gas std four sedan fwd front 93.7 ... 90 2bbl 2.97 3.23 9.40 68 5500 31 38 7609
27 1 148 dodge gas turbo ? sedan fwd front 93.7 ... 98 mpfi 3.03 3.39 7.60 102 5500 24 30 8558
28 -1 110 dodge gas std four wagon fwd front 103.3 ... 122 2bbl 3.34 3.46 8.50 88 5000 24 30 8921
29 3 145 dodge gas turbo two hatchback fwd front 95.9 ... 156 mfi 3.60 3.90 7.00 145 5000 19 24 12964
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
175 -1 65 toyota gas std four hatchback fwd front 102.4 ... 122 mpfi 3.31 3.54 8.70 92 4200 27 32 9988
176 -1 65 toyota gas std four sedan fwd front 102.4 ... 122 mpfi 3.31 3.54 8.70 92 4200 27 32 10898
177 -1 65 toyota gas std four hatchback fwd front 102.4 ... 122 mpfi 3.31 3.54 8.70 92 4200 27 32 11248
178 3 197 toyota gas std two hatchback rwd front 102.9 ... 171 mpfi 3.27 3.35 9.30 161 5200 20 24 16558
179 3 197 toyota gas std two hatchback rwd front 102.9 ... 171 mpfi 3.27 3.35 9.30 161 5200 19 24 15998
180 -1 90 toyota gas std four sedan rwd front 104.5 ... 171 mpfi 3.27 3.35 9.20 156 5200 20 24 15690
181 -1 ? toyota gas std four wagon rwd front 104.5 ... 161 mpfi 3.27 3.35 9.20 156 5200 19 24 15750
182 2 122 volkswagen diesel std two sedan fwd front 97.3 ... 97 idi 3.01 3.40 23.00 52 4800 37 46 7775
183 2 122 volkswagen gas std two sedan fwd front 97.3 ... 109 mpfi 3.19 3.40 9.00 85 5250 27 34 7975
184 2 94 volkswagen diesel std four sedan fwd front 97.3 ... 97 idi 3.01 3.40 23.00 52 4800 37 46 7995
185 2 94 volkswagen gas std four sedan fwd front 97.3 ... 109 mpfi 3.19 3.40 9.00 85 5250 27 34 8195
186 2 94 volkswagen gas std four sedan fwd front 97.3 ... 109 mpfi 3.19 3.40 9.00 85 5250 27 34 8495
187 2 94 volkswagen diesel turbo four sedan fwd front 97.3 ... 97 idi 3.01 3.40 23.00 68 4500 37 42 9495
188 2 94 volkswagen gas std four sedan fwd front 97.3 ... 109 mpfi 3.19 3.40 10.00 100 5500 26 32 9995
189 3 ? volkswagen gas std two convertible fwd front 94.5 ... 109 mpfi 3.19 3.40 8.50 90 5500 24 29 11595
190 3 256 volkswagen gas std two hatchback fwd front 94.5 ... 109 mpfi 3.19 3.40 8.50 90 5500 24 29 9980
191 0 ? volkswagen gas std four sedan fwd front 100.4 ... 136 mpfi 3.19 3.40 8.50 110 5500 19 24 13295
192 0 ? volkswagen diesel turbo four sedan fwd front 100.4 ... 97 idi 3.01 3.40 23.00 68 4500 33 38 13845
193 0 ? volkswagen gas std four wagon fwd front 100.4 ... 109 mpfi 3.19 3.40 9.00 88 5500 25 31 12290
194 -2 103 volvo gas std four sedan rwd front 104.3 ... 141 mpfi 3.78 3.15 9.50 114 5400 23 28 12940
195 -1 74 volvo gas std four wagon rwd front 104.3 ... 141 mpfi 3.78 3.15 9.50 114 5400 23 28 13415
196 -2 103 volvo gas std four sedan rwd front 104.3 ... 141 mpfi 3.78 3.15 9.50 114 5400 24 28 15985
197 -1 74 volvo gas std four wagon rwd front 104.3 ... 141 mpfi 3.78 3.15 9.50 114 5400 24 28 16515
198 -2 103 volvo gas turbo four sedan rwd front 104.3 ... 130 mpfi 3.62 3.15 7.50 162 5100 17 22 18420
199 -1 74 volvo gas turbo four wagon rwd front 104.3 ... 130 mpfi 3.62 3.15 7.50 162 5100 17 22 18950
200 -1 95 volvo gas std four sedan rwd front 109.1 ... 141 mpfi 3.78 3.15 9.50 114 5400 23 28 16845
201 -1 95 volvo gas turbo four sedan rwd front 109.1 ... 141 mpfi 3.78 3.15 8.70 160 5300 19 25 19045
202 -1 95 volvo gas std four sedan rwd front 109.1 ... 173 mpfi 3.58 2.87 8.80 134 5500 18 23 21485
203 -1 95 volvo diesel turbo four sedan rwd front 109.1 ... 145 idi 3.01 3.40 23.00 106 4800 26 27 22470
204 -1 95 volvo gas turbo four sedan rwd front 109.1 ... 141 mpfi 3.78 3.15 9.50 114 5400 19 25 22625

205 rows × 26 columns

In [3]:
cars.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized-losses    205 non-null object
make                 205 non-null object
fuel-type            205 non-null object
aspiration           205 non-null object
num-of-doors         205 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-type          205 non-null object
num-of-cylinders     205 non-null object
engine-size          205 non-null int64
fuel-system          205 non-null object
bore                 205 non-null object
stroke               205 non-null object
compression-rate     205 non-null float64
horsepower           205 non-null object
peak-rpm             205 non-null object
city-mpg             205 non-null int64
highway-mpg          205 non-null int64
price                205 non-null object
dtypes: float64(5), int64(5), object(16)
memory usage: 41.7+ KB

['price'] is the target column.

['normalized-losses', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price'] are continuous values as seen from the database description.

It looks like some values have '?' as an entry instead of null. Lets check our null values. Also lets convert all to type float.

In [4]:
cont_car=cars[['normalized-losses', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']].copy()
cont_car.replace('?',np.nan,inplace=True)
cont_car=cont_car.astype(float)
cont_car.isnull().sum()
Out[4]:
normalized-losses    41
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-size           0
bore                  4
stroke                4
compression-rate      0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

We will want to remove all rows with a null 'price' value. Our model can't take null values at all so we will replace other null values with column means.

In [5]:
cont_car.dropna(subset=['price'],inplace=True)
cont_car=cont_car.apply(lambda x:x.fillna(x.mean()))
cont_car.isnull().sum()
Out[5]:
normalized-losses    0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-size          0
bore                 0
stroke               0
compression-rate     0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

Now rescale all values to be contained in [0,1]

In [6]:
norm_cont_car=cont_car.copy()
norm_cont_car=(norm_cont_car-norm_cont_car.min())/(norm_cont_car.max()-norm_cont_car.min())
norm_cont_car['price']=cont_car['price']

Univariate Model¶

Test all possible univariate models. 50/50 split on test/train. RMSE.

In [7]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

def knn_train_test(df,features,target):
    
    np.random.seed(1)
    df=df.iloc[np.random.permutation(len(df))].copy()
    
    split=int(len(df)/2)
    train=df[:split].copy()
    test=df[split:].copy()
    
    knn=KNeighborsRegressor()
    knn.fit(train[features],train[target])
    p=knn.predict(test[features])
    
    return mean_squared_error(test[target],p)**.5

all_features=norm_cont_car.columns.drop('price')
univar=[[i,knn_train_test(norm_cont_car,[i],'price')] for i in all_features]
univar.sort(key=lambda x: x[1])
univar
Out[7]:
[['engine-size', 3238.4628296477176],
 ['horsepower', 4037.0377131537603],
 ['curb-weight', 4401.118254793124],
 ['highway-mpg', 4630.026798588056],
 ['width', 4704.482589704386],
 ['city-mpg', 4766.422505090134],
 ['length', 5427.2009608367125],
 ['wheel-base', 5461.553997873057],
 ['compression-rate', 6610.812153159129],
 ['bore', 6780.627784685976],
 ['normalized-losses', 7330.197653434445],
 ['peak-rpm', 7697.4596964334805],
 ['stroke', 8006.529544647101],
 ['height', 8144.441042663747]]

Looks like 'engine-size' performed the best. Now let's modify the function to accept a parameter, k, the number of neighbors.

In [8]:
def knn_train_test(df,features,target,k):
    
    np.random.seed(1)
    df=df.iloc[np.random.permutation(len(df))].copy()
    
    split=int(len(df)/2)
    train=df[:split].copy()
    test=df[split:].copy()
    
    knn=KNeighborsRegressor(n_neighbors=k)
    knn.fit(train[features],train[target])
    p=knn.predict(test[features])
    
    return mean_squared_error(test[target],p)**.5

results=[]
for i in all_features:
    for k in [1,3,5,7,9]:
        results.append([i,k,knn_train_test(norm_cont_car,[i],'price',k)])
results
Out[8]:
[['normalized-losses', 1, 7846.750605148984],
 ['normalized-losses', 3, 7500.5698123109905],
 ['normalized-losses', 5, 7330.197653434445],
 ['normalized-losses', 7, 7756.421586234123],
 ['normalized-losses', 9, 7688.096096891432],
 ['wheel-base', 1, 4493.734068810494],
 ['wheel-base', 3, 5120.161506064513],
 ['wheel-base', 5, 5461.553997873057],
 ['wheel-base', 7, 5448.1070513823315],
 ['wheel-base', 9, 5738.405685192312],
 ['length', 1, 4628.45550121557],
 ['length', 3, 5129.8358210721635],
 ['length', 5, 5427.2009608367125],
 ['length', 7, 5313.427720847974],
 ['length', 9, 5383.054514833446],
 ['width', 1, 4559.257297950061],
 ['width', 3, 4606.413692169901],
 ['width', 5, 4704.482589704386],
 ['width', 7, 4571.485046194653],
 ['width', 9, 4652.914172067787],
 ['height', 1, 8904.04645636071],
 ['height', 3, 8277.609643045525],
 ['height', 5, 8144.441042663747],
 ['height', 7, 7679.598124393773],
 ['height', 9, 7811.03606291223],
 ['curb-weight', 1, 5264.290230758878],
 ['curb-weight', 3, 5022.318011757233],
 ['curb-weight', 5, 4401.118254793124],
 ['curb-weight', 7, 4330.608104418053],
 ['curb-weight', 9, 4632.044474454401],
 ['engine-size', 1, 3258.4861059962027],
 ['engine-size', 3, 2840.562805643501],
 ['engine-size', 5, 3238.4628296477176],
 ['engine-size', 7, 3563.086774256415],
 ['engine-size', 9, 3831.8244149840766],
 ['bore', 1, 8602.58848450066],
 ['bore', 3, 6984.239489480916],
 ['bore', 5, 6780.627784685976],
 ['bore', 7, 6878.097965921532],
 ['bore', 9, 6866.808502038413],
 ['stroke', 1, 9116.495955406906],
 ['stroke', 3, 7338.68466990294],
 ['stroke', 5, 8006.529544647101],
 ['stroke', 7, 7803.937796804327],
 ['stroke', 9, 7735.554366079291],
 ['compression-rate', 1, 8087.205346523092],
 ['compression-rate', 3, 7375.063685578359],
 ['compression-rate', 5, 6610.812153159129],
 ['compression-rate', 7, 6732.801282941515],
 ['compression-rate', 9, 7024.485525463435],
 ['horsepower', 1, 4170.054848037801],
 ['horsepower', 3, 4020.8492630885394],
 ['horsepower', 5, 4037.0377131537603],
 ['horsepower', 7, 4353.811860277134],
 ['horsepower', 9, 4515.135617419103],
 ['peak-rpm', 1, 9511.480067750124],
 ['peak-rpm', 3, 8537.550899973421],
 ['peak-rpm', 5, 7697.4596964334805],
 ['peak-rpm', 7, 7510.294160083481],
 ['peak-rpm', 9, 7340.041341263401],
 ['city-mpg', 1, 5901.143574354764],
 ['city-mpg', 3, 4646.746408727155],
 ['city-mpg', 5, 4766.422505090134],
 ['city-mpg', 7, 5232.523034167316],
 ['city-mpg', 9, 5465.209492527533],
 ['highway-mpg', 1, 6025.594966720739],
 ['highway-mpg', 3, 4617.305019788554],
 ['highway-mpg', 5, 4630.026798588056],
 ['highway-mpg', 7, 4796.061440186946],
 ['highway-mpg', 9, 5278.358056953987]]
In [9]:
import matplotlib.pyplot as plt
%matplotlib inline

x=[]
y=[]
for i in range(len(results)):
    x.append(results[i][1])
    y.append(results[i][2])
    if i%5==4:
        plt.plot(x,y)
        x=[]
        y=[]
    
plt.ylabel('rmse')
plt.xlabel('n_neighbors')
Out[9]:
<matplotlib.text.Text at 0x7fb8ee593630>
No description has been provided for this image

Looks like k=3 and k=5 produced the best results for most. Now lets look at a Multivariate Model.

Multivariate Model¶

Use all of the columns

In [10]:
knn_train_test(
    norm_cont_car,
    all_features,
    'price',
    5)
Out[10]:
3726.6377183145073

Now use 2 best, 3 best, 4 best, and 5 best columns

In [11]:
multi={}
for i in range(2,6):
    
    multi[str(i)+' best features']=knn_train_test(
        norm_cont_car,
        [univar[j][0] for j in range(i)],
        'price',
        5)
multi
Out[11]:
{'2 best features': 2949.8817277180374,
 '3 best features': 3435.2601095401815,
 '4 best features': 3286.5597353252815,
 '5 best features': 3410.2170133901805}

Now adjust hyperparameters for these results and visualize

In [12]:
final=[]
for i in range(2,6):
    for k in range(1,26):
        final.append([i,k,knn_train_test(
                            norm_cont_car,
                            [univar[j][0] for j in range(i)],
                            'price',
                            k)])        
final
Out[12]:
[[2, 1, 2783.6204237227344],
 [2, 2, 2657.7963807419765],
 [2, 3, 2792.586573031673],
 [2, 4, 2891.5329686923255],
 [2, 5, 2949.8817277180374],
 [2, 6, 3096.402601694776],
 [2, 7, 3164.681969020496],
 [2, 8, 3413.228359192009],
 [2, 9, 3748.6716603306486],
 [2, 10, 4080.7125057341937],
 [2, 11, 4215.6372280600335],
 [2, 12, 4275.421524277872],
 [2, 13, 4373.901683035496],
 [2, 14, 4424.285137239815],
 [2, 15, 4539.505493095937],
 [2, 16, 4667.307671446768],
 [2, 17, 4729.605305844226],
 [2, 18, 4790.556632159094],
 [2, 19, 4824.3866193292615],
 [2, 20, 4840.850914693829],
 [2, 21, 4837.429062000271],
 [2, 22, 4831.16988267597],
 [2, 23, 4861.679492959275],
 [2, 24, 4903.346008862579],
 [2, 25, 4955.892429427362],
 [3, 1, 3131.798233957262],
 [3, 2, 3005.5737903777945],
 [3, 3, 3016.72346189447],
 [3, 4, 3144.445503247916],
 [3, 5, 3435.2601095401815],
 [3, 6, 3490.638342356411],
 [3, 7, 3734.328302612299],
 [3, 8, 3936.1203798667007],
 [3, 9, 3976.4878693193755],
 [3, 10, 4124.277723900439],
 [3, 11, 4213.73633336988],
 [3, 12, 4230.509703492906],
 [3, 13, 4176.851250534326],
 [3, 14, 4292.415585753593],
 [3, 15, 4382.50190463831],
 [3, 16, 4418.525629386618],
 [3, 17, 4427.787909307598],
 [3, 18, 4493.877120715579],
 [3, 19, 4591.965375700132],
 [3, 20, 4597.552075200799],
 [3, 21, 4674.306863451769],
 [3, 22, 4684.262850205994],
 [3, 23, 4718.175304198165],
 [3, 24, 4757.187227038854],
 [3, 25, 4783.665528691579],
 [4, 1, 2927.820116343107],
 [4, 2, 2600.431466008274],
 [4, 3, 2799.0983199667144],
 [4, 4, 3074.9999988931822],
 [4, 5, 3286.5597353252815],
 [4, 6, 3705.1702717384333],
 [4, 7, 3822.180444798032],
 [4, 8, 3986.7014080509525],
 [4, 9, 4210.340136384571],
 [4, 10, 4213.315457869819],
 [4, 11, 4182.323353796191],
 [4, 12, 4288.696189501458],
 [4, 13, 4249.894628724804],
 [4, 14, 4357.287937912011],
 [4, 15, 4451.408927529352],
 [4, 16, 4459.2244133241875],
 [4, 17, 4495.028598279714],
 [4, 18, 4546.50352887804],
 [4, 19, 4643.185310790349],
 [4, 20, 4668.634597012364],
 [4, 21, 4698.7862455275545],
 [4, 22, 4700.1491313974975],
 [4, 23, 4711.210423921079],
 [4, 24, 4739.97994062864],
 [4, 25, 4766.177179071813],
 [5, 1, 2824.7061233282866],
 [5, 2, 2915.6731645496975],
 [5, 3, 3012.4204546509704],
 [5, 4, 3202.8876051367483],
 [5, 5, 3410.2170133901805],
 [5, 6, 3618.4509432660384],
 [5, 7, 3622.6290209234803],
 [5, 8, 3848.635835654326],
 [5, 9, 3977.8149139381726],
 [5, 10, 3994.8132211260104],
 [5, 11, 4159.843526607947],
 [5, 12, 4294.3389473154875],
 [5, 13, 4380.848359486949],
 [5, 14, 4466.368754416089],
 [5, 15, 4522.420711094978],
 [5, 16, 4536.427578452413],
 [5, 17, 4587.098443664006],
 [5, 18, 4622.107837952761],
 [5, 19, 4612.890107622797],
 [5, 20, 4632.693976139521],
 [5, 21, 4712.917548435062],
 [5, 22, 4676.301064518744],
 [5, 23, 4691.189310956096],
 [5, 24, 4755.990767231825],
 [5, 25, 4804.323266214411]]
In [13]:
#Visualize
x=[]
y=[]
for i in range(len(final)):
    x.append(final[i][1])
    y.append(final[i][2])
    if i%25==24:
        label=str(final[i][0])+' best features'
        plt.plot(x,y,label=label)
        x=[]
        y=[]
    
plt.ylabel('rmse')
plt.xlabel('n_neighbors')
plt.legend(loc='lower right')
Out[13]:
<matplotlib.legend.Legend at 0x7fb8ee577d30>
No description has been provided for this image

Looks like 4 best features at k=2 performed the best.

In [ ]: