Predicting Car Prices¶
Goal is to practice the ML workflow
import pandas as pd
import numpy as np
cols = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style',
'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']
cars=pd.read_csv('imports-85.data',names=cols)
cars
symboling | normalized-losses | make | fuel-type | aspiration | num-of-doors | body-style | drive-wheels | engine-location | wheel-base | ... | engine-size | fuel-system | bore | stroke | compression-rate | horsepower | peak-rpm | city-mpg | highway-mpg | price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | ? | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.00 | 111 | 5000 | 21 | 27 | 13495 |
1 | 3 | ? | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.00 | 111 | 5000 | 21 | 27 | 16500 |
2 | 1 | ? | alfa-romero | gas | std | two | hatchback | rwd | front | 94.5 | ... | 152 | mpfi | 2.68 | 3.47 | 9.00 | 154 | 5000 | 19 | 26 | 16500 |
3 | 2 | 164 | audi | gas | std | four | sedan | fwd | front | 99.8 | ... | 109 | mpfi | 3.19 | 3.40 | 10.00 | 102 | 5500 | 24 | 30 | 13950 |
4 | 2 | 164 | audi | gas | std | four | sedan | 4wd | front | 99.4 | ... | 136 | mpfi | 3.19 | 3.40 | 8.00 | 115 | 5500 | 18 | 22 | 17450 |
5 | 2 | ? | audi | gas | std | two | sedan | fwd | front | 99.8 | ... | 136 | mpfi | 3.19 | 3.40 | 8.50 | 110 | 5500 | 19 | 25 | 15250 |
6 | 1 | 158 | audi | gas | std | four | sedan | fwd | front | 105.8 | ... | 136 | mpfi | 3.19 | 3.40 | 8.50 | 110 | 5500 | 19 | 25 | 17710 |
7 | 1 | ? | audi | gas | std | four | wagon | fwd | front | 105.8 | ... | 136 | mpfi | 3.19 | 3.40 | 8.50 | 110 | 5500 | 19 | 25 | 18920 |
8 | 1 | 158 | audi | gas | turbo | four | sedan | fwd | front | 105.8 | ... | 131 | mpfi | 3.13 | 3.40 | 8.30 | 140 | 5500 | 17 | 20 | 23875 |
9 | 0 | ? | audi | gas | turbo | two | hatchback | 4wd | front | 99.5 | ... | 131 | mpfi | 3.13 | 3.40 | 7.00 | 160 | 5500 | 16 | 22 | ? |
10 | 2 | 192 | bmw | gas | std | two | sedan | rwd | front | 101.2 | ... | 108 | mpfi | 3.50 | 2.80 | 8.80 | 101 | 5800 | 23 | 29 | 16430 |
11 | 0 | 192 | bmw | gas | std | four | sedan | rwd | front | 101.2 | ... | 108 | mpfi | 3.50 | 2.80 | 8.80 | 101 | 5800 | 23 | 29 | 16925 |
12 | 0 | 188 | bmw | gas | std | two | sedan | rwd | front | 101.2 | ... | 164 | mpfi | 3.31 | 3.19 | 9.00 | 121 | 4250 | 21 | 28 | 20970 |
13 | 0 | 188 | bmw | gas | std | four | sedan | rwd | front | 101.2 | ... | 164 | mpfi | 3.31 | 3.19 | 9.00 | 121 | 4250 | 21 | 28 | 21105 |
14 | 1 | ? | bmw | gas | std | four | sedan | rwd | front | 103.5 | ... | 164 | mpfi | 3.31 | 3.19 | 9.00 | 121 | 4250 | 20 | 25 | 24565 |
15 | 0 | ? | bmw | gas | std | four | sedan | rwd | front | 103.5 | ... | 209 | mpfi | 3.62 | 3.39 | 8.00 | 182 | 5400 | 16 | 22 | 30760 |
16 | 0 | ? | bmw | gas | std | two | sedan | rwd | front | 103.5 | ... | 209 | mpfi | 3.62 | 3.39 | 8.00 | 182 | 5400 | 16 | 22 | 41315 |
17 | 0 | ? | bmw | gas | std | four | sedan | rwd | front | 110.0 | ... | 209 | mpfi | 3.62 | 3.39 | 8.00 | 182 | 5400 | 15 | 20 | 36880 |
18 | 2 | 121 | chevrolet | gas | std | two | hatchback | fwd | front | 88.4 | ... | 61 | 2bbl | 2.91 | 3.03 | 9.50 | 48 | 5100 | 47 | 53 | 5151 |
19 | 1 | 98 | chevrolet | gas | std | two | hatchback | fwd | front | 94.5 | ... | 90 | 2bbl | 3.03 | 3.11 | 9.60 | 70 | 5400 | 38 | 43 | 6295 |
20 | 0 | 81 | chevrolet | gas | std | four | sedan | fwd | front | 94.5 | ... | 90 | 2bbl | 3.03 | 3.11 | 9.60 | 70 | 5400 | 38 | 43 | 6575 |
21 | 1 | 118 | dodge | gas | std | two | hatchback | fwd | front | 93.7 | ... | 90 | 2bbl | 2.97 | 3.23 | 9.41 | 68 | 5500 | 37 | 41 | 5572 |
22 | 1 | 118 | dodge | gas | std | two | hatchback | fwd | front | 93.7 | ... | 90 | 2bbl | 2.97 | 3.23 | 9.40 | 68 | 5500 | 31 | 38 | 6377 |
23 | 1 | 118 | dodge | gas | turbo | two | hatchback | fwd | front | 93.7 | ... | 98 | mpfi | 3.03 | 3.39 | 7.60 | 102 | 5500 | 24 | 30 | 7957 |
24 | 1 | 148 | dodge | gas | std | four | hatchback | fwd | front | 93.7 | ... | 90 | 2bbl | 2.97 | 3.23 | 9.40 | 68 | 5500 | 31 | 38 | 6229 |
25 | 1 | 148 | dodge | gas | std | four | sedan | fwd | front | 93.7 | ... | 90 | 2bbl | 2.97 | 3.23 | 9.40 | 68 | 5500 | 31 | 38 | 6692 |
26 | 1 | 148 | dodge | gas | std | four | sedan | fwd | front | 93.7 | ... | 90 | 2bbl | 2.97 | 3.23 | 9.40 | 68 | 5500 | 31 | 38 | 7609 |
27 | 1 | 148 | dodge | gas | turbo | ? | sedan | fwd | front | 93.7 | ... | 98 | mpfi | 3.03 | 3.39 | 7.60 | 102 | 5500 | 24 | 30 | 8558 |
28 | -1 | 110 | dodge | gas | std | four | wagon | fwd | front | 103.3 | ... | 122 | 2bbl | 3.34 | 3.46 | 8.50 | 88 | 5000 | 24 | 30 | 8921 |
29 | 3 | 145 | dodge | gas | turbo | two | hatchback | fwd | front | 95.9 | ... | 156 | mfi | 3.60 | 3.90 | 7.00 | 145 | 5000 | 19 | 24 | 12964 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
175 | -1 | 65 | toyota | gas | std | four | hatchback | fwd | front | 102.4 | ... | 122 | mpfi | 3.31 | 3.54 | 8.70 | 92 | 4200 | 27 | 32 | 9988 |
176 | -1 | 65 | toyota | gas | std | four | sedan | fwd | front | 102.4 | ... | 122 | mpfi | 3.31 | 3.54 | 8.70 | 92 | 4200 | 27 | 32 | 10898 |
177 | -1 | 65 | toyota | gas | std | four | hatchback | fwd | front | 102.4 | ... | 122 | mpfi | 3.31 | 3.54 | 8.70 | 92 | 4200 | 27 | 32 | 11248 |
178 | 3 | 197 | toyota | gas | std | two | hatchback | rwd | front | 102.9 | ... | 171 | mpfi | 3.27 | 3.35 | 9.30 | 161 | 5200 | 20 | 24 | 16558 |
179 | 3 | 197 | toyota | gas | std | two | hatchback | rwd | front | 102.9 | ... | 171 | mpfi | 3.27 | 3.35 | 9.30 | 161 | 5200 | 19 | 24 | 15998 |
180 | -1 | 90 | toyota | gas | std | four | sedan | rwd | front | 104.5 | ... | 171 | mpfi | 3.27 | 3.35 | 9.20 | 156 | 5200 | 20 | 24 | 15690 |
181 | -1 | ? | toyota | gas | std | four | wagon | rwd | front | 104.5 | ... | 161 | mpfi | 3.27 | 3.35 | 9.20 | 156 | 5200 | 19 | 24 | 15750 |
182 | 2 | 122 | volkswagen | diesel | std | two | sedan | fwd | front | 97.3 | ... | 97 | idi | 3.01 | 3.40 | 23.00 | 52 | 4800 | 37 | 46 | 7775 |
183 | 2 | 122 | volkswagen | gas | std | two | sedan | fwd | front | 97.3 | ... | 109 | mpfi | 3.19 | 3.40 | 9.00 | 85 | 5250 | 27 | 34 | 7975 |
184 | 2 | 94 | volkswagen | diesel | std | four | sedan | fwd | front | 97.3 | ... | 97 | idi | 3.01 | 3.40 | 23.00 | 52 | 4800 | 37 | 46 | 7995 |
185 | 2 | 94 | volkswagen | gas | std | four | sedan | fwd | front | 97.3 | ... | 109 | mpfi | 3.19 | 3.40 | 9.00 | 85 | 5250 | 27 | 34 | 8195 |
186 | 2 | 94 | volkswagen | gas | std | four | sedan | fwd | front | 97.3 | ... | 109 | mpfi | 3.19 | 3.40 | 9.00 | 85 | 5250 | 27 | 34 | 8495 |
187 | 2 | 94 | volkswagen | diesel | turbo | four | sedan | fwd | front | 97.3 | ... | 97 | idi | 3.01 | 3.40 | 23.00 | 68 | 4500 | 37 | 42 | 9495 |
188 | 2 | 94 | volkswagen | gas | std | four | sedan | fwd | front | 97.3 | ... | 109 | mpfi | 3.19 | 3.40 | 10.00 | 100 | 5500 | 26 | 32 | 9995 |
189 | 3 | ? | volkswagen | gas | std | two | convertible | fwd | front | 94.5 | ... | 109 | mpfi | 3.19 | 3.40 | 8.50 | 90 | 5500 | 24 | 29 | 11595 |
190 | 3 | 256 | volkswagen | gas | std | two | hatchback | fwd | front | 94.5 | ... | 109 | mpfi | 3.19 | 3.40 | 8.50 | 90 | 5500 | 24 | 29 | 9980 |
191 | 0 | ? | volkswagen | gas | std | four | sedan | fwd | front | 100.4 | ... | 136 | mpfi | 3.19 | 3.40 | 8.50 | 110 | 5500 | 19 | 24 | 13295 |
192 | 0 | ? | volkswagen | diesel | turbo | four | sedan | fwd | front | 100.4 | ... | 97 | idi | 3.01 | 3.40 | 23.00 | 68 | 4500 | 33 | 38 | 13845 |
193 | 0 | ? | volkswagen | gas | std | four | wagon | fwd | front | 100.4 | ... | 109 | mpfi | 3.19 | 3.40 | 9.00 | 88 | 5500 | 25 | 31 | 12290 |
194 | -2 | 103 | volvo | gas | std | four | sedan | rwd | front | 104.3 | ... | 141 | mpfi | 3.78 | 3.15 | 9.50 | 114 | 5400 | 23 | 28 | 12940 |
195 | -1 | 74 | volvo | gas | std | four | wagon | rwd | front | 104.3 | ... | 141 | mpfi | 3.78 | 3.15 | 9.50 | 114 | 5400 | 23 | 28 | 13415 |
196 | -2 | 103 | volvo | gas | std | four | sedan | rwd | front | 104.3 | ... | 141 | mpfi | 3.78 | 3.15 | 9.50 | 114 | 5400 | 24 | 28 | 15985 |
197 | -1 | 74 | volvo | gas | std | four | wagon | rwd | front | 104.3 | ... | 141 | mpfi | 3.78 | 3.15 | 9.50 | 114 | 5400 | 24 | 28 | 16515 |
198 | -2 | 103 | volvo | gas | turbo | four | sedan | rwd | front | 104.3 | ... | 130 | mpfi | 3.62 | 3.15 | 7.50 | 162 | 5100 | 17 | 22 | 18420 |
199 | -1 | 74 | volvo | gas | turbo | four | wagon | rwd | front | 104.3 | ... | 130 | mpfi | 3.62 | 3.15 | 7.50 | 162 | 5100 | 17 | 22 | 18950 |
200 | -1 | 95 | volvo | gas | std | four | sedan | rwd | front | 109.1 | ... | 141 | mpfi | 3.78 | 3.15 | 9.50 | 114 | 5400 | 23 | 28 | 16845 |
201 | -1 | 95 | volvo | gas | turbo | four | sedan | rwd | front | 109.1 | ... | 141 | mpfi | 3.78 | 3.15 | 8.70 | 160 | 5300 | 19 | 25 | 19045 |
202 | -1 | 95 | volvo | gas | std | four | sedan | rwd | front | 109.1 | ... | 173 | mpfi | 3.58 | 2.87 | 8.80 | 134 | 5500 | 18 | 23 | 21485 |
203 | -1 | 95 | volvo | diesel | turbo | four | sedan | rwd | front | 109.1 | ... | 145 | idi | 3.01 | 3.40 | 23.00 | 106 | 4800 | 26 | 27 | 22470 |
204 | -1 | 95 | volvo | gas | turbo | four | sedan | rwd | front | 109.1 | ... | 141 | mpfi | 3.78 | 3.15 | 9.50 | 114 | 5400 | 19 | 25 | 22625 |
205 rows × 26 columns
cars.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 205 entries, 0 to 204 Data columns (total 26 columns): symboling 205 non-null int64 normalized-losses 205 non-null object make 205 non-null object fuel-type 205 non-null object aspiration 205 non-null object num-of-doors 205 non-null object body-style 205 non-null object drive-wheels 205 non-null object engine-location 205 non-null object wheel-base 205 non-null float64 length 205 non-null float64 width 205 non-null float64 height 205 non-null float64 curb-weight 205 non-null int64 engine-type 205 non-null object num-of-cylinders 205 non-null object engine-size 205 non-null int64 fuel-system 205 non-null object bore 205 non-null object stroke 205 non-null object compression-rate 205 non-null float64 horsepower 205 non-null object peak-rpm 205 non-null object city-mpg 205 non-null int64 highway-mpg 205 non-null int64 price 205 non-null object dtypes: float64(5), int64(5), object(16) memory usage: 41.7+ KB
['price'] is the target column.
['normalized-losses', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price'] are continuous values as seen from the database description.
It looks like some values have '?' as an entry instead of null. Lets check our null values. Also lets convert all to type float.
cont_car=cars[['normalized-losses', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']].copy()
cont_car.replace('?',np.nan,inplace=True)
cont_car=cont_car.astype(float)
cont_car.isnull().sum()
normalized-losses 41 wheel-base 0 length 0 width 0 height 0 curb-weight 0 engine-size 0 bore 4 stroke 4 compression-rate 0 horsepower 2 peak-rpm 2 city-mpg 0 highway-mpg 0 price 4 dtype: int64
We will want to remove all rows with a null 'price' value. Our model can't take null values at all so we will replace other null values with column means.
cont_car.dropna(subset=['price'],inplace=True)
cont_car=cont_car.apply(lambda x:x.fillna(x.mean()))
cont_car.isnull().sum()
normalized-losses 0 wheel-base 0 length 0 width 0 height 0 curb-weight 0 engine-size 0 bore 0 stroke 0 compression-rate 0 horsepower 0 peak-rpm 0 city-mpg 0 highway-mpg 0 price 0 dtype: int64
Now rescale all values to be contained in [0,1]
norm_cont_car=cont_car.copy()
norm_cont_car=(norm_cont_car-norm_cont_car.min())/(norm_cont_car.max()-norm_cont_car.min())
norm_cont_car['price']=cont_car['price']
Univariate Model¶
Test all possible univariate models. 50/50 split on test/train. RMSE.
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
def knn_train_test(df,features,target):
np.random.seed(1)
df=df.iloc[np.random.permutation(len(df))].copy()
split=int(len(df)/2)
train=df[:split].copy()
test=df[split:].copy()
knn=KNeighborsRegressor()
knn.fit(train[features],train[target])
p=knn.predict(test[features])
return mean_squared_error(test[target],p)**.5
all_features=norm_cont_car.columns.drop('price')
univar=[[i,knn_train_test(norm_cont_car,[i],'price')] for i in all_features]
univar.sort(key=lambda x: x[1])
univar
[['engine-size', 3238.4628296477176], ['horsepower', 4037.0377131537603], ['curb-weight', 4401.118254793124], ['highway-mpg', 4630.026798588056], ['width', 4704.482589704386], ['city-mpg', 4766.422505090134], ['length', 5427.2009608367125], ['wheel-base', 5461.553997873057], ['compression-rate', 6610.812153159129], ['bore', 6780.627784685976], ['normalized-losses', 7330.197653434445], ['peak-rpm', 7697.4596964334805], ['stroke', 8006.529544647101], ['height', 8144.441042663747]]
Looks like 'engine-size' performed the best. Now let's modify the function to accept a parameter, k, the number of neighbors.
def knn_train_test(df,features,target,k):
np.random.seed(1)
df=df.iloc[np.random.permutation(len(df))].copy()
split=int(len(df)/2)
train=df[:split].copy()
test=df[split:].copy()
knn=KNeighborsRegressor(n_neighbors=k)
knn.fit(train[features],train[target])
p=knn.predict(test[features])
return mean_squared_error(test[target],p)**.5
results=[]
for i in all_features:
for k in [1,3,5,7,9]:
results.append([i,k,knn_train_test(norm_cont_car,[i],'price',k)])
results
[['normalized-losses', 1, 7846.750605148984], ['normalized-losses', 3, 7500.5698123109905], ['normalized-losses', 5, 7330.197653434445], ['normalized-losses', 7, 7756.421586234123], ['normalized-losses', 9, 7688.096096891432], ['wheel-base', 1, 4493.734068810494], ['wheel-base', 3, 5120.161506064513], ['wheel-base', 5, 5461.553997873057], ['wheel-base', 7, 5448.1070513823315], ['wheel-base', 9, 5738.405685192312], ['length', 1, 4628.45550121557], ['length', 3, 5129.8358210721635], ['length', 5, 5427.2009608367125], ['length', 7, 5313.427720847974], ['length', 9, 5383.054514833446], ['width', 1, 4559.257297950061], ['width', 3, 4606.413692169901], ['width', 5, 4704.482589704386], ['width', 7, 4571.485046194653], ['width', 9, 4652.914172067787], ['height', 1, 8904.04645636071], ['height', 3, 8277.609643045525], ['height', 5, 8144.441042663747], ['height', 7, 7679.598124393773], ['height', 9, 7811.03606291223], ['curb-weight', 1, 5264.290230758878], ['curb-weight', 3, 5022.318011757233], ['curb-weight', 5, 4401.118254793124], ['curb-weight', 7, 4330.608104418053], ['curb-weight', 9, 4632.044474454401], ['engine-size', 1, 3258.4861059962027], ['engine-size', 3, 2840.562805643501], ['engine-size', 5, 3238.4628296477176], ['engine-size', 7, 3563.086774256415], ['engine-size', 9, 3831.8244149840766], ['bore', 1, 8602.58848450066], ['bore', 3, 6984.239489480916], ['bore', 5, 6780.627784685976], ['bore', 7, 6878.097965921532], ['bore', 9, 6866.808502038413], ['stroke', 1, 9116.495955406906], ['stroke', 3, 7338.68466990294], ['stroke', 5, 8006.529544647101], ['stroke', 7, 7803.937796804327], ['stroke', 9, 7735.554366079291], ['compression-rate', 1, 8087.205346523092], ['compression-rate', 3, 7375.063685578359], ['compression-rate', 5, 6610.812153159129], ['compression-rate', 7, 6732.801282941515], ['compression-rate', 9, 7024.485525463435], ['horsepower', 1, 4170.054848037801], ['horsepower', 3, 4020.8492630885394], ['horsepower', 5, 4037.0377131537603], ['horsepower', 7, 4353.811860277134], ['horsepower', 9, 4515.135617419103], ['peak-rpm', 1, 9511.480067750124], ['peak-rpm', 3, 8537.550899973421], ['peak-rpm', 5, 7697.4596964334805], ['peak-rpm', 7, 7510.294160083481], ['peak-rpm', 9, 7340.041341263401], ['city-mpg', 1, 5901.143574354764], ['city-mpg', 3, 4646.746408727155], ['city-mpg', 5, 4766.422505090134], ['city-mpg', 7, 5232.523034167316], ['city-mpg', 9, 5465.209492527533], ['highway-mpg', 1, 6025.594966720739], ['highway-mpg', 3, 4617.305019788554], ['highway-mpg', 5, 4630.026798588056], ['highway-mpg', 7, 4796.061440186946], ['highway-mpg', 9, 5278.358056953987]]
import matplotlib.pyplot as plt
%matplotlib inline
x=[]
y=[]
for i in range(len(results)):
x.append(results[i][1])
y.append(results[i][2])
if i%5==4:
plt.plot(x,y)
x=[]
y=[]
plt.ylabel('rmse')
plt.xlabel('n_neighbors')
<matplotlib.text.Text at 0x7fb8ee593630>
Looks like k=3 and k=5 produced the best results for most. Now lets look at a Multivariate Model.
Multivariate Model¶
Use all of the columns
knn_train_test(
norm_cont_car,
all_features,
'price',
5)
3726.6377183145073
Now use 2 best, 3 best, 4 best, and 5 best columns
multi={}
for i in range(2,6):
multi[str(i)+' best features']=knn_train_test(
norm_cont_car,
[univar[j][0] for j in range(i)],
'price',
5)
multi
{'2 best features': 2949.8817277180374, '3 best features': 3435.2601095401815, '4 best features': 3286.5597353252815, '5 best features': 3410.2170133901805}
Now adjust hyperparameters for these results and visualize
final=[]
for i in range(2,6):
for k in range(1,26):
final.append([i,k,knn_train_test(
norm_cont_car,
[univar[j][0] for j in range(i)],
'price',
k)])
final
[[2, 1, 2783.6204237227344], [2, 2, 2657.7963807419765], [2, 3, 2792.586573031673], [2, 4, 2891.5329686923255], [2, 5, 2949.8817277180374], [2, 6, 3096.402601694776], [2, 7, 3164.681969020496], [2, 8, 3413.228359192009], [2, 9, 3748.6716603306486], [2, 10, 4080.7125057341937], [2, 11, 4215.6372280600335], [2, 12, 4275.421524277872], [2, 13, 4373.901683035496], [2, 14, 4424.285137239815], [2, 15, 4539.505493095937], [2, 16, 4667.307671446768], [2, 17, 4729.605305844226], [2, 18, 4790.556632159094], [2, 19, 4824.3866193292615], [2, 20, 4840.850914693829], [2, 21, 4837.429062000271], [2, 22, 4831.16988267597], [2, 23, 4861.679492959275], [2, 24, 4903.346008862579], [2, 25, 4955.892429427362], [3, 1, 3131.798233957262], [3, 2, 3005.5737903777945], [3, 3, 3016.72346189447], [3, 4, 3144.445503247916], [3, 5, 3435.2601095401815], [3, 6, 3490.638342356411], [3, 7, 3734.328302612299], [3, 8, 3936.1203798667007], [3, 9, 3976.4878693193755], [3, 10, 4124.277723900439], [3, 11, 4213.73633336988], [3, 12, 4230.509703492906], [3, 13, 4176.851250534326], [3, 14, 4292.415585753593], [3, 15, 4382.50190463831], [3, 16, 4418.525629386618], [3, 17, 4427.787909307598], [3, 18, 4493.877120715579], [3, 19, 4591.965375700132], [3, 20, 4597.552075200799], [3, 21, 4674.306863451769], [3, 22, 4684.262850205994], [3, 23, 4718.175304198165], [3, 24, 4757.187227038854], [3, 25, 4783.665528691579], [4, 1, 2927.820116343107], [4, 2, 2600.431466008274], [4, 3, 2799.0983199667144], [4, 4, 3074.9999988931822], [4, 5, 3286.5597353252815], [4, 6, 3705.1702717384333], [4, 7, 3822.180444798032], [4, 8, 3986.7014080509525], [4, 9, 4210.340136384571], [4, 10, 4213.315457869819], [4, 11, 4182.323353796191], [4, 12, 4288.696189501458], [4, 13, 4249.894628724804], [4, 14, 4357.287937912011], [4, 15, 4451.408927529352], [4, 16, 4459.2244133241875], [4, 17, 4495.028598279714], [4, 18, 4546.50352887804], [4, 19, 4643.185310790349], [4, 20, 4668.634597012364], [4, 21, 4698.7862455275545], [4, 22, 4700.1491313974975], [4, 23, 4711.210423921079], [4, 24, 4739.97994062864], [4, 25, 4766.177179071813], [5, 1, 2824.7061233282866], [5, 2, 2915.6731645496975], [5, 3, 3012.4204546509704], [5, 4, 3202.8876051367483], [5, 5, 3410.2170133901805], [5, 6, 3618.4509432660384], [5, 7, 3622.6290209234803], [5, 8, 3848.635835654326], [5, 9, 3977.8149139381726], [5, 10, 3994.8132211260104], [5, 11, 4159.843526607947], [5, 12, 4294.3389473154875], [5, 13, 4380.848359486949], [5, 14, 4466.368754416089], [5, 15, 4522.420711094978], [5, 16, 4536.427578452413], [5, 17, 4587.098443664006], [5, 18, 4622.107837952761], [5, 19, 4612.890107622797], [5, 20, 4632.693976139521], [5, 21, 4712.917548435062], [5, 22, 4676.301064518744], [5, 23, 4691.189310956096], [5, 24, 4755.990767231825], [5, 25, 4804.323266214411]]
#Visualize
x=[]
y=[]
for i in range(len(final)):
x.append(final[i][1])
y.append(final[i][2])
if i%25==24:
label=str(final[i][0])+' best features'
plt.plot(x,y,label=label)
x=[]
y=[]
plt.ylabel('rmse')
plt.xlabel('n_neighbors')
plt.legend(loc='lower right')
<matplotlib.legend.Legend at 0x7fb8ee577d30>
Looks like 4 best features at k=2 performed the best.