# Diamonds price regression

## Using machine learning regressors

The dataset used is the [Diamonds dataset on Kaggle](https://www.kaggle.com/shivam2503/diamonds/data).

In [1]:
import pandas as pd
import numpy as np

# Load the diamonds table. The first CSV column is an unnamed row counter
# (1, 2, 3, ...), so it is used as the index instead of a feature.
DIAMONDS_CSV = "Datasets/Diamonds/diamonds.csv"
df = pd.read_csv(filepath_or_buffer=DIAMONDS_CSV, index_col=0)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [2]:
# Ordinal encodings for the three categorical columns: each label is mapped to
# an integer rank so the regressors can consume it as a numeric feature.
# (The former `df['cut'].unique()` line was removed: it was not the last
# expression of the cell, so its result was computed and silently discarded.)
cut_class_dict = {
    "Fair": 1,
    "Good": 2,
    "Very Good": 3,
    "Premium": 4,
    "Ideal": 5,
}
clarity_dict = {
    "I3": 1,
    "I2": 2,
    "I1": 3,
    "SI2": 4,
    "SI1": 5,
    "VS2": 6,
    "VS1": 7,
    "VVS2": 8,
    "VVS1": 9,
    "IF": 10,
    "FL": 11,
}
color_dict = {"J": 1, "I": 2, "H": 3, "G": 4, "F": 5, "E": 6, "D": 7}

In [3]:
# Replace the text labels of the ordinal columns with their integer ranks.
# The cell overwrites `df` in place, so each column is guarded: mapping an
# already-encoded (numeric) column a second time would look up integers in a
# dictionary keyed by strings and turn every value into NaN on a re-run.
ordinal_encodings = {"cut": cut_class_dict, "clarity": clarity_dict, "color": color_dict}
for column, encoding in ordinal_encodings.items():
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(encoding)
    # Series.map yields NaN for labels missing from the dictionary; fail here
    # rather than letting NaNs reach the estimators.
    assert df[column].notna().all(), f"unmapped or missing values in column {column!r}"
df.head()


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [4]:
import sklearn
from sklearn import svm, preprocessing
from sklearn.linear_model import SGDRegressor

# Shuffle the rows so that any ordering present in the CSV cannot bias the
# models. The fixed seed makes the permutation the same after every
# Restart & Run All instead of changing on each execution.
df = sklearn.utils.shuffle(df, random_state=0)

# Features: every column except the target, standardised column-wise to zero
# mean and unit variance.
# NOTE(review): scaling the full matrix before the train/test split lets
# test-set statistics influence the training features; fitting a scaler on the
# training rows only would avoid that leak.
X = df.drop("price", axis=1).values
X = preprocessing.scale(X)

# Target: the diamond price.
y = df["price"].values

In [5]:
# Report how many samples there are and how many a 20% hold-out amounts to.
total_rows = len(y)
print("Data len: ", total_rows, "\nAs test we used 20%: ", 20 / 100 * total_rows)

Data len:  53940 
As test we used 20%:  10788.0


In [6]:
# Import the splitter explicitly: `sklearn.model_selection` is only reachable
# as an attribute of `sklearn` when some other import happens to have loaded
# that submodule already.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# deterministic for a given X and y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## SGD Regressor

In [7]:
# Linear model fitted by stochastic gradient descent. The optimiser shuffles
# the training rows between epochs, so the seed is pinned to make the fitted
# model and its score repeatable for a given training set.
clf = SGDRegressor(max_iter=1000, random_state=0)
clf.fit(X_train, y_train)

# For a regressor, score() is the coefficient of determination (R^2).
print(clf.score(X_test, y_test))

0.907304269100812


In [8]:
# Print the first ten predictions next to the true prices.
# The loop variables deliberately avoid the names X and y: reusing them would
# overwrite the notebook-level feature matrix and target vector, breaking any
# later re-run of the cells that read them. The ten rows are predicted in a
# single call instead of one predict() per row.
preview_rows = 10
for predicted_price, actual_price in zip(clf.predict(X_test[:preview_rows]), y_test[:preview_rows]):
    print(predicted_price, actual_price)

2057.938907081024 1716
408.86668615612143 776
1877.1359526566246 1850
14413.45842840683 13317
6583.122199486379 5880
1472.7275999563244 2231
603.3124107228641 666
-175.58587579503273 705
732.6492505985616 552
2763.681359134369 3061


## SVR Regressor

In [9]:
from sklearn import svm

# Support vector regressor with scikit-learn's default settings. fit() returns
# the estimator itself, so construction and training are chained.
clf = svm.SVR().fit(X_train, y_train)

# R^2 on the held-out rows.
print(clf.score(X_test, y_test))



0.5235043972946711


In [10]:
# Print the first ten SVR predictions next to the true prices.
# Loop names other than X / y are used so the notebook-level feature matrix
# and target vector are not overwritten, and the ten rows are predicted in one
# call rather than one predict() per row.
preview_rows = 10
for predicted_price, actual_price in zip(clf.predict(X_test[:preview_rows]), y_test[:preview_rows]):
    print(predicted_price, actual_price)

1614.983425223527 1716
579.9398270257539 776
2254.067311348471 1850
5348.406983308956 13317
5642.754018835578 5880
2166.566950323401 2231
763.0511461708497 666
514.0293281929548 705
1103.8544288088133 552
3299.614546713646 3061


### Support Vector Regression (SVR) with linear kernel

In [14]:
# Support vector regressor restricted to a linear kernel.
clf = svm.SVR(kernel="linear")
clf.fit(X_train, y_train)

# R^2 on the held-out rows.
linear_svr_r2 = clf.score(X_test, y_test)
print(linear_svr_r2)


0.8630448690848345


### Support Vector Regression (SVR) with rbf kernel

In [31]:
# Support vector regressor with the RBF kernel requested explicitly, kept in
# its own variable (clf2) so the linear-kernel model in `clf` stays available.
# RBF is also SVR's default kernel, which matches the identical score printed
# by the default-SVR cell earlier in the notebook.
clf2 = svm.SVR(kernel="rbf").fit(X_train, y_train)

# R^2 on the held-out rows.
print(clf2.score(X_test, y_test))



0.5235043972946711


In [32]:
# Predict a price for every held-out row with the RBF SVR and display the
# resulting array as the cell output.
predictions = clf2.predict(X=X_test)
predictions

array([1614.98342522,  579.93982703, 2254.06731135, ..., 1600.90972976,
        863.31271817, 1313.2325644 ])

## Random Forest regression

In [34]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of ten trees; the seed fixes the bootstrap samples and feature
# choices so the fitted forest is repeatable for a given training set.
rf = RandomForestRegressor(n_estimators=10, random_state=0)
rf.fit(X_train, y_train)

# R^2 on the held-out rows.
print("SCORE: ",rf.score(X_test, y_test),"\n--------------")

# First ten predictions next to the true prices. Loop names other than X / y
# are used so the notebook-level feature matrix and target vector are not
# overwritten; the ten rows are predicted in a single call.
preview_rows = 10
for predicted_price, actual_price in zip(rf.predict(X_test[:preview_rows]), y_test[:preview_rows]):
    print(predicted_price, actual_price)

SCORE:  0.9795971375747344 
--------------
1651.2 1716
748.7833333333333 776
1823.3 1850
16492.9 13317
4931.6 5880
1935.2 2231
688.1 666
689.4 705
663.8 552
3171.0 3061


## Linear Regression

In [46]:
from sklearn import linear_model

# Ordinary least-squares linear regression on the standardised features.
linear = linear_model.LinearRegression()
linear.fit(X_train, y_train)

# R^2 on the held-out rows.
print("SCORE: ",linear.score(X_test, y_test),"\n--------------")

# First ten predictions next to the true prices. Loop names other than X / y
# are used so the notebook-level feature matrix and target vector are not
# overwritten; the ten rows are predicted in a single call.
preview_rows = 10
for predicted_price, actual_price in zip(linear.predict(X_test[:preview_rows]), y_test[:preview_rows]):
    print(predicted_price, actual_price)
    

SCORE:  0.9087525665852461 
--------------
2039.5860723087262 1716
315.57839741764747 776
1857.7588850776274 1850
14629.248347536268 13317
6689.10311583452 5880
1455.96854276338 2231
543.9652038627887 666
-295.936014145806 705
675.2908110759709 552
2831.5232911325866 3061


## Logistic regression (a classifier — its score is accuracy, not R²)

In [7]:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
# NOTE(review): LogisticRegression is a classifier, not a regressor. Fitting it
# on `price` treats every distinct price value as a separate class, and score()
# below reports classification accuracy (the fraction of exactly matched
# prices), so the number is not comparable with the R^2 scores printed for the
# regressors in the other sections.
logistic = linear_model.LogisticRegression(random_state=0)  # build the classifier with a fixed seed
logistic.fit(X_train, y_train) # fit with the training prices used as class labels
ac = logistic.score(X_test, y_test)  # mean accuracy on the held-out rows
print("SCORE: ",ac ,"\n--------------")



SCORE:  0.009547645532072673 
--------------


In [9]:
# Print the first ten predicted labels next to the true prices.
# Loop names other than X / y are used so the notebook-level feature matrix
# and target vector are not overwritten, and the ten rows are predicted in one
# call rather than one predict() per row.
preview_rows = 10
for predicted_price, actual_price in zip(logistic.predict(X_test[:preview_rows]), y_test[:preview_rows]):
    print(predicted_price, actual_price)
    

1662 3370
1882 3321
776 698
530 646
1882 4839
1076 1851
984 1624
802 665
872 596
394 1154
