import pandas as pd
import numpy as np
cols = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 
        'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 
        'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']
cars=pd.read_csv('imports-85.data',names=cols)

cars

cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized-losses    205 non-null object
make                 205 non-null object
fuel-type            205 non-null object
aspiration           205 non-null object
num-of-doors         205 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-type          205 non-null object
num-of-cylinders     205 non-null object
engine-size          205 non-null int64
fuel-system          205 non-null object
bore                 205 non-null object
stroke               205 non-null object
compression-rate     205 non-null float64
horsepower           205 non-null object
peak-rpm             205 non-null object
city-mpg             205 non-null int64
highway-mpg          205 non-null int64
price                205 non-null object
dtypes: float64(5), int64(5), object(16)
memory usage: 41.7+ KB

cont_car=cars[['normalized-losses', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']].copy()
cont_car.replace('?',np.nan,inplace=True)
cont_car=cont_car.astype(float)
cont_car.isnull().sum()

normalized-losses    41
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-size           0
bore                  4
stroke                4
compression-rate      0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

cont_car.dropna(subset=['price'],inplace=True)
cont_car=cont_car.apply(lambda x:x.fillna(x.mean()))
cont_car.isnull().sum()

normalized-losses    0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-size          0
bore                 0
stroke               0
compression-rate     0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

norm_cont_car=cont_car.copy()
norm_cont_car=(norm_cont_car-norm_cont_car.min())/(norm_cont_car.max()-norm_cont_car.min())
norm_cont_car['price']=cont_car['price']

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

def knn_train_test(df,features,target):
    
    np.random.seed(1)
    df=df.iloc[np.random.permutation(len(df))].copy()
    
    split=int(len(df)/2)
    train=df[:split].copy()
    test=df[split:].copy()
    
    knn=KNeighborsRegressor()
    knn.fit(train[features],train[target])
    p=knn.predict(test[features])
    
    return mean_squared_error(test[target],p)**.5

all_features=norm_cont_car.columns.drop('price')
univar=[[i,knn_train_test(norm_cont_car,[i],'price')] for i in all_features]
univar.sort(key=lambda x: x[1])
univar

[['engine-size', 3238.4628296477176],
 ['horsepower', 4037.0377131537603],
 ['curb-weight', 4401.118254793124],
 ['highway-mpg', 4630.026798588056],
 ['width', 4704.482589704386],
 ['city-mpg', 4766.422505090134],
 ['length', 5427.2009608367125],
 ['wheel-base', 5461.553997873057],
 ['compression-rate', 6610.812153159129],
 ['bore', 6780.627784685976],
 ['normalized-losses', 7330.197653434445],
 ['peak-rpm', 7697.4596964334805],
 ['stroke', 8006.529544647101],
 ['height', 8144.441042663747]]

def knn_train_test(df,features,target,k):
    
    np.random.seed(1)
    df=df.iloc[np.random.permutation(len(df))].copy()
    
    split=int(len(df)/2)
    train=df[:split].copy()
    test=df[split:].copy()
    
    knn=KNeighborsRegressor(n_neighbors=k)
    knn.fit(train[features],train[target])
    p=knn.predict(test[features])
    
    return mean_squared_error(test[target],p)**.5

results=[]
for i in all_features:
    for k in [1,3,5,7,9]:
        results.append([i,k,knn_train_test(norm_cont_car,[i],'price',k)])
results

[['normalized-losses', 1, 7846.750605148984],
 ['normalized-losses', 3, 7500.5698123109905],
 ['normalized-losses', 5, 7330.197653434445],
 ['normalized-losses', 7, 7756.421586234123],
 ['normalized-losses', 9, 7688.096096891432],
 ['wheel-base', 1, 4493.734068810494],
 ['wheel-base', 3, 5120.161506064513],
 ['wheel-base', 5, 5461.553997873057],
 ['wheel-base', 7, 5448.1070513823315],
 ['wheel-base', 9, 5738.405685192312],
 ['length', 1, 4628.45550121557],
 ['length', 3, 5129.8358210721635],
 ['length', 5, 5427.2009608367125],
 ['length', 7, 5313.427720847974],
 ['length', 9, 5383.054514833446],
 ['width', 1, 4559.257297950061],
 ['width', 3, 4606.413692169901],
 ['width', 5, 4704.482589704386],
 ['width', 7, 4571.485046194653],
 ['width', 9, 4652.914172067787],
 ['height', 1, 8904.04645636071],
 ['height', 3, 8277.609643045525],
 ['height', 5, 8144.441042663747],
 ['height', 7, 7679.598124393773],
 ['height', 9, 7811.03606291223],
 ['curb-weight', 1, 5264.290230758878],
 ['curb-weight', 3, 5022.318011757233],
 ['curb-weight', 5, 4401.118254793124],
 ['curb-weight', 7, 4330.608104418053],
 ['curb-weight', 9, 4632.044474454401],
 ['engine-size', 1, 3258.4861059962027],
 ['engine-size', 3, 2840.562805643501],
 ['engine-size', 5, 3238.4628296477176],
 ['engine-size', 7, 3563.086774256415],
 ['engine-size', 9, 3831.8244149840766],
 ['bore', 1, 8602.58848450066],
 ['bore', 3, 6984.239489480916],
 ['bore', 5, 6780.627784685976],
 ['bore', 7, 6878.097965921532],
 ['bore', 9, 6866.808502038413],
 ['stroke', 1, 9116.495955406906],
 ['stroke', 3, 7338.68466990294],
 ['stroke', 5, 8006.529544647101],
 ['stroke', 7, 7803.937796804327],
 ['stroke', 9, 7735.554366079291],
 ['compression-rate', 1, 8087.205346523092],
 ['compression-rate', 3, 7375.063685578359],
 ['compression-rate', 5, 6610.812153159129],
 ['compression-rate', 7, 6732.801282941515],
 ['compression-rate', 9, 7024.485525463435],
 ['horsepower', 1, 4170.054848037801],
 ['horsepower', 3, 4020.8492630885394],
 ['horsepower', 5, 4037.0377131537603],
 ['horsepower', 7, 4353.811860277134],
 ['horsepower', 9, 4515.135617419103],
 ['peak-rpm', 1, 9511.480067750124],
 ['peak-rpm', 3, 8537.550899973421],
 ['peak-rpm', 5, 7697.4596964334805],
 ['peak-rpm', 7, 7510.294160083481],
 ['peak-rpm', 9, 7340.041341263401],
 ['city-mpg', 1, 5901.143574354764],
 ['city-mpg', 3, 4646.746408727155],
 ['city-mpg', 5, 4766.422505090134],
 ['city-mpg', 7, 5232.523034167316],
 ['city-mpg', 9, 5465.209492527533],
 ['highway-mpg', 1, 6025.594966720739],
 ['highway-mpg', 3, 4617.305019788554],
 ['highway-mpg', 5, 4630.026798588056],
 ['highway-mpg', 7, 4796.061440186946],
 ['highway-mpg', 9, 5278.358056953987]]

import matplotlib.pyplot as plt
%matplotlib inline

x=[]
y=[]
for i in range(len(results)):
    x.append(results[i][1])
    y.append(results[i][2])
    if i%5==4:
        plt.plot(x,y)
        x=[]
        y=[]
    
plt.ylabel('rmse')
plt.xlabel('n_neighbors')

<matplotlib.text.Text at 0x7fb8ee593630>

knn_train_test(
    norm_cont_car,
    all_features,
    'price',
    5)

3726.6377183145073

multi={}
for i in range(2,6):
    
    multi[str(i)+' best features']=knn_train_test(
        norm_cont_car,
        [univar[j][0] for j in range(i)],
        'price',
        5)
multi

{'2 best features': 2949.8817277180374,
 '3 best features': 3435.2601095401815,
 '4 best features': 3286.5597353252815,
 '5 best features': 3410.2170133901805}

final=[]
for i in range(2,6):
    for k in range(1,26):
        final.append([i,k,knn_train_test(
                            norm_cont_car,
                            [univar[j][0] for j in range(i)],
                            'price',
                            k)])        
final

[[2, 1, 2783.6204237227344],
 [2, 2, 2657.7963807419765],
 [2, 3, 2792.586573031673],
 [2, 4, 2891.5329686923255],
 [2, 5, 2949.8817277180374],
 [2, 6, 3096.402601694776],
 [2, 7, 3164.681969020496],
 [2, 8, 3413.228359192009],
 [2, 9, 3748.6716603306486],
 [2, 10, 4080.7125057341937],
 [2, 11, 4215.6372280600335],
 [2, 12, 4275.421524277872],
 [2, 13, 4373.901683035496],
 [2, 14, 4424.285137239815],
 [2, 15, 4539.505493095937],
 [2, 16, 4667.307671446768],
 [2, 17, 4729.605305844226],
 [2, 18, 4790.556632159094],
 [2, 19, 4824.3866193292615],
 [2, 20, 4840.850914693829],
 [2, 21, 4837.429062000271],
 [2, 22, 4831.16988267597],
 [2, 23, 4861.679492959275],
 [2, 24, 4903.346008862579],
 [2, 25, 4955.892429427362],
 [3, 1, 3131.798233957262],
 [3, 2, 3005.5737903777945],
 [3, 3, 3016.72346189447],
 [3, 4, 3144.445503247916],
 [3, 5, 3435.2601095401815],
 [3, 6, 3490.638342356411],
 [3, 7, 3734.328302612299],
 [3, 8, 3936.1203798667007],
 [3, 9, 3976.4878693193755],
 [3, 10, 4124.277723900439],
 [3, 11, 4213.73633336988],
 [3, 12, 4230.509703492906],
 [3, 13, 4176.851250534326],
 [3, 14, 4292.415585753593],
 [3, 15, 4382.50190463831],
 [3, 16, 4418.525629386618],
 [3, 17, 4427.787909307598],
 [3, 18, 4493.877120715579],
 [3, 19, 4591.965375700132],
 [3, 20, 4597.552075200799],
 [3, 21, 4674.306863451769],
 [3, 22, 4684.262850205994],
 [3, 23, 4718.175304198165],
 [3, 24, 4757.187227038854],
 [3, 25, 4783.665528691579],
 [4, 1, 2927.820116343107],
 [4, 2, 2600.431466008274],
 [4, 3, 2799.0983199667144],
 [4, 4, 3074.9999988931822],
 [4, 5, 3286.5597353252815],
 [4, 6, 3705.1702717384333],
 [4, 7, 3822.180444798032],
 [4, 8, 3986.7014080509525],
 [4, 9, 4210.340136384571],
 [4, 10, 4213.315457869819],
 [4, 11, 4182.323353796191],
 [4, 12, 4288.696189501458],
 [4, 13, 4249.894628724804],
 [4, 14, 4357.287937912011],
 [4, 15, 4451.408927529352],
 [4, 16, 4459.2244133241875],
 [4, 17, 4495.028598279714],
 [4, 18, 4546.50352887804],
 [4, 19, 4643.185310790349],
 [4, 20, 4668.634597012364],
 [4, 21, 4698.7862455275545],
 [4, 22, 4700.1491313974975],
 [4, 23, 4711.210423921079],
 [4, 24, 4739.97994062864],
 [4, 25, 4766.177179071813],
 [5, 1, 2824.7061233282866],
 [5, 2, 2915.6731645496975],
 [5, 3, 3012.4204546509704],
 [5, 4, 3202.8876051367483],
 [5, 5, 3410.2170133901805],
 [5, 6, 3618.4509432660384],
 [5, 7, 3622.6290209234803],
 [5, 8, 3848.635835654326],
 [5, 9, 3977.8149139381726],
 [5, 10, 3994.8132211260104],
 [5, 11, 4159.843526607947],
 [5, 12, 4294.3389473154875],
 [5, 13, 4380.848359486949],
 [5, 14, 4466.368754416089],
 [5, 15, 4522.420711094978],
 [5, 16, 4536.427578452413],
 [5, 17, 4587.098443664006],
 [5, 18, 4622.107837952761],
 [5, 19, 4612.890107622797],
 [5, 20, 4632.693976139521],
 [5, 21, 4712.917548435062],
 [5, 22, 4676.301064518744],
 [5, 23, 4691.189310956096],
 [5, 24, 4755.990767231825],
 [5, 25, 4804.323266214411]]

#Visualize
x=[]
y=[]
for i in range(len(final)):
    x.append(final[i][1])
    y.append(final[i][2])
    if i%25==24:
        label=str(final[i][0])+' best features'
        plt.plot(x,y,label=label)
        x=[]
        y=[]
    
plt.ylabel('rmse')
plt.xlabel('n_neighbors')
plt.legend(loc='lower right')

<matplotlib.legend.Legend at 0x7fb8ee577d30>

	symboling	normalized-losses	make	fuel-type	aspiration	num-of-doors	body-style	drive-wheels	engine-location	wheel-base	...	engine-size	fuel-system	bore	stroke	compression-rate	horsepower	peak-rpm	city-mpg	highway-mpg	price
0	3	?	alfa-romero	gas	std	two	convertible	rwd	front	88.6	...	130	mpfi	3.47	2.68	9.00	111	5000	21	27	13495
1	3	?	alfa-romero	gas	std	two	convertible	rwd	front	88.6	...	130	mpfi	3.47	2.68	9.00	111	5000	21	27	16500
2	1	?	alfa-romero	gas	std	two	hatchback	rwd	front	94.5	...	152	mpfi	2.68	3.47	9.00	154	5000	19	26	16500
3	2	164	audi	gas	std	four	sedan	fwd	front	99.8	...	109	mpfi	3.19	3.40	10.00	102	5500	24	30	13950
4	2	164	audi	gas	std	four	sedan	4wd	front	99.4	...	136	mpfi	3.19	3.40	8.00	115	5500	18	22	17450
5	2	?	audi	gas	std	two	sedan	fwd	front	99.8	...	136	mpfi	3.19	3.40	8.50	110	5500	19	25	15250
6	1	158	audi	gas	std	four	sedan	fwd	front	105.8	...	136	mpfi	3.19	3.40	8.50	110	5500	19	25	17710
7	1	?	audi	gas	std	four	wagon	fwd	front	105.8	...	136	mpfi	3.19	3.40	8.50	110	5500	19	25	18920
8	1	158	audi	gas	turbo	four	sedan	fwd	front	105.8	...	131	mpfi	3.13	3.40	8.30	140	5500	17	20	23875
9	0	?	audi	gas	turbo	two	hatchback	4wd	front	99.5	...	131	mpfi	3.13	3.40	7.00	160	5500	16	22	?
10	2	192	bmw	gas	std	two	sedan	rwd	front	101.2	...	108	mpfi	3.50	2.80	8.80	101	5800	23	29	16430
11	0	192	bmw	gas	std	four	sedan	rwd	front	101.2	...	108	mpfi	3.50	2.80	8.80	101	5800	23	29	16925
12	0	188	bmw	gas	std	two	sedan	rwd	front	101.2	...	164	mpfi	3.31	3.19	9.00	121	4250	21	28	20970
13	0	188	bmw	gas	std	four	sedan	rwd	front	101.2	...	164	mpfi	3.31	3.19	9.00	121	4250	21	28	21105
14	1	?	bmw	gas	std	four	sedan	rwd	front	103.5	...	164	mpfi	3.31	3.19	9.00	121	4250	20	25	24565
15	0	?	bmw	gas	std	four	sedan	rwd	front	103.5	...	209	mpfi	3.62	3.39	8.00	182	5400	16	22	30760
16	0	?	bmw	gas	std	two	sedan	rwd	front	103.5	...	209	mpfi	3.62	3.39	8.00	182	5400	16	22	41315
17	0	?	bmw	gas	std	four	sedan	rwd	front	110.0	...	209	mpfi	3.62	3.39	8.00	182	5400	15	20	36880
18	2	121	chevrolet	gas	std	two	hatchback	fwd	front	88.4	...	61	2bbl	2.91	3.03	9.50	48	5100	47	53	5151
19	1	98	chevrolet	gas	std	two	hatchback	fwd	front	94.5	...	90	2bbl	3.03	3.11	9.60	70	5400	38	43	6295
20	0	81	chevrolet	gas	std	four	sedan	fwd	front	94.5	...	90	2bbl	3.03	3.11	9.60	70	5400	38	43	6575
21	1	118	dodge	gas	std	two	hatchback	fwd	front	93.7	...	90	2bbl	2.97	3.23	9.41	68	5500	37	41	5572
22	1	118	dodge	gas	std	two	hatchback	fwd	front	93.7	...	90	2bbl	2.97	3.23	9.40	68	5500	31	38	6377
23	1	118	dodge	gas	turbo	two	hatchback	fwd	front	93.7	...	98	mpfi	3.03	3.39	7.60	102	5500	24	30	7957
24	1	148	dodge	gas	std	four	hatchback	fwd	front	93.7	...	90	2bbl	2.97	3.23	9.40	68	5500	31	38	6229
25	1	148	dodge	gas	std	four	sedan	fwd	front	93.7	...	90	2bbl	2.97	3.23	9.40	68	5500	31	38	6692
26	1	148	dodge	gas	std	four	sedan	fwd	front	93.7	...	90	2bbl	2.97	3.23	9.40	68	5500	31	38	7609
27	1	148	dodge	gas	turbo	?	sedan	fwd	front	93.7	...	98	mpfi	3.03	3.39	7.60	102	5500	24	30	8558
28	-1	110	dodge	gas	std	four	wagon	fwd	front	103.3	...	122	2bbl	3.34	3.46	8.50	88	5000	24	30	8921
29	3	145	dodge	gas	turbo	two	hatchback	fwd	front	95.9	...	156	mfi	3.60	3.90	7.00	145	5000	19	24	12964
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
175	-1	65	toyota	gas	std	four	hatchback	fwd	front	102.4	...	122	mpfi	3.31	3.54	8.70	92	4200	27	32	9988
176	-1	65	toyota	gas	std	four	sedan	fwd	front	102.4	...	122	mpfi	3.31	3.54	8.70	92	4200	27	32	10898
177	-1	65	toyota	gas	std	four	hatchback	fwd	front	102.4	...	122	mpfi	3.31	3.54	8.70	92	4200	27	32	11248
178	3	197	toyota	gas	std	two	hatchback	rwd	front	102.9	...	171	mpfi	3.27	3.35	9.30	161	5200	20	24	16558
179	3	197	toyota	gas	std	two	hatchback	rwd	front	102.9	...	171	mpfi	3.27	3.35	9.30	161	5200	19	24	15998
180	-1	90	toyota	gas	std	four	sedan	rwd	front	104.5	...	171	mpfi	3.27	3.35	9.20	156	5200	20	24	15690
181	-1	?	toyota	gas	std	four	wagon	rwd	front	104.5	...	161	mpfi	3.27	3.35	9.20	156	5200	19	24	15750
182	2	122	volkswagen	diesel	std	two	sedan	fwd	front	97.3	...	97	idi	3.01	3.40	23.00	52	4800	37	46	7775
183	2	122	volkswagen	gas	std	two	sedan	fwd	front	97.3	...	109	mpfi	3.19	3.40	9.00	85	5250	27	34	7975
184	2	94	volkswagen	diesel	std	four	sedan	fwd	front	97.3	...	97	idi	3.01	3.40	23.00	52	4800	37	46	7995
185	2	94	volkswagen	gas	std	four	sedan	fwd	front	97.3	...	109	mpfi	3.19	3.40	9.00	85	5250	27	34	8195
186	2	94	volkswagen	gas	std	four	sedan	fwd	front	97.3	...	109	mpfi	3.19	3.40	9.00	85	5250	27	34	8495
187	2	94	volkswagen	diesel	turbo	four	sedan	fwd	front	97.3	...	97	idi	3.01	3.40	23.00	68	4500	37	42	9495
188	2	94	volkswagen	gas	std	four	sedan	fwd	front	97.3	...	109	mpfi	3.19	3.40	10.00	100	5500	26	32	9995
189	3	?	volkswagen	gas	std	two	convertible	fwd	front	94.5	...	109	mpfi	3.19	3.40	8.50	90	5500	24	29	11595
190	3	256	volkswagen	gas	std	two	hatchback	fwd	front	94.5	...	109	mpfi	3.19	3.40	8.50	90	5500	24	29	9980
191	0	?	volkswagen	gas	std	four	sedan	fwd	front	100.4	...	136	mpfi	3.19	3.40	8.50	110	5500	19	24	13295
192	0	?	volkswagen	diesel	turbo	four	sedan	fwd	front	100.4	...	97	idi	3.01	3.40	23.00	68	4500	33	38	13845
193	0	?	volkswagen	gas	std	four	wagon	fwd	front	100.4	...	109	mpfi	3.19	3.40	9.00	88	5500	25	31	12290
194	-2	103	volvo	gas	std	four	sedan	rwd	front	104.3	...	141	mpfi	3.78	3.15	9.50	114	5400	23	28	12940
195	-1	74	volvo	gas	std	four	wagon	rwd	front	104.3	...	141	mpfi	3.78	3.15	9.50	114	5400	23	28	13415
196	-2	103	volvo	gas	std	four	sedan	rwd	front	104.3	...	141	mpfi	3.78	3.15	9.50	114	5400	24	28	15985
197	-1	74	volvo	gas	std	four	wagon	rwd	front	104.3	...	141	mpfi	3.78	3.15	9.50	114	5400	24	28	16515
198	-2	103	volvo	gas	turbo	four	sedan	rwd	front	104.3	...	130	mpfi	3.62	3.15	7.50	162	5100	17	22	18420
199	-1	74	volvo	gas	turbo	four	wagon	rwd	front	104.3	...	130	mpfi	3.62	3.15	7.50	162	5100	17	22	18950
200	-1	95	volvo	gas	std	four	sedan	rwd	front	109.1	...	141	mpfi	3.78	3.15	9.50	114	5400	23	28	16845
201	-1	95	volvo	gas	turbo	four	sedan	rwd	front	109.1	...	141	mpfi	3.78	3.15	8.70	160	5300	19	25	19045
202	-1	95	volvo	gas	std	four	sedan	rwd	front	109.1	...	173	mpfi	3.58	2.87	8.80	134	5500	18	23	21485
203	-1	95	volvo	diesel	turbo	four	sedan	rwd	front	109.1	...	145	idi	3.01	3.40	23.00	106	4800	26	27	22470
204	-1	95	volvo	gas	turbo	four	sedan	rwd	front	109.1	...	141	mpfi	3.78	3.15	9.50	114	5400	19	25	22625

Predicting Car Prices¶

Univariate Model¶

Multivariate Model¶