# Diamonds price regression

## Using Machine Learning regressors

The dataset used is [here](https://www.kaggle.com/shivam2503/diamonds/data)

In [54]:
import pandas as pd
import numpy as np

# Read the diamonds CSV; its first column holds the row number, so it is used as the index.
df = pd.read_csv(
    "Datasets/Diamonds/diamonds.csv",
    index_col=0,
)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [55]:
# Evaluated but not displayed, because it is not the last expression of the cell.
df['cut'].unique()

# Ordinal lookup tables: every category label is assigned its 1-based position
# in the list below, so later entries receive larger codes.
cut_class_dict = {
    label: rank
    for rank, label in enumerate(
        ["Fair", "Good", "Very Good", "Premium", "Ideal"], start=1
    )
}
clarity_dict = {
    label: rank
    for rank, label in enumerate(
        ["I3", "I2", "I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF", "FL"],
        start=1,
    )
}
color_dict = {
    label: rank
    for rank, label in enumerate(["J", "I", "H", "G", "F", "E", "D"], start=1)
}

In [56]:
# Replace the three categorical columns with their ordinal codes.
#
# A plain `.map(lookup)` turns every value that is not a key of the lookup into
# NaN, so running this cell a second time (when the columns already hold the
# integer codes) would wipe the columns out.  Falling back to the original value
# keeps the cell idempotent: labels are translated, existing codes pass through.
for column, codes in (
    ("cut", cut_class_dict),
    ("clarity", clarity_dict),
    ("color", color_dict),
):
    # `codes=codes` binds the current lookup table at definition time.
    df[column] = df[column].map(lambda value, codes=codes: codes.get(value, value))

# Preview the encoded frame.
df.head()


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [59]:
import sklearn
from sklearn import svm, preprocessing
from sklearn.linear_model import SGDRegressor

# Always shuffle the rows so that any ordering in the file cannot bias the result.
# The fixed random_state makes the shuffle identical on every run, so the
# scores printed further down can be reproduced.
df = sklearn.utils.shuffle(df, random_state=42)

# Features: every column except the target, as a NumPy array, then scaled.
# NOTE(review): scaling happens before the train/test split, so the scaling
# statistics are computed over all rows, including those later held out for
# testing.  Consider fitting the scaler on the training rows only.
X = df.drop("price", axis=1).values
X = preprocessing.scale(X)

# Target: the price column as a NumPy array.
y = df["price"].values

In [67]:
# Report how many samples there are and how many of them 20% corresponds to.
total_rows = len(y)
test_rows = 20/100*total_rows
print("Data len: ", total_rows, "\nAs test we used 20%: ", test_rows)

Data len:  53940 
As test we used 20%:  10788.0


In [68]:
# Import the splitter explicitly: `import sklearn` on its own does not guarantee
# that the `sklearn.model_selection` submodule is loaded as an attribute.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## SGD Regressor

In [69]:
# Linear model trained with stochastic gradient descent, capped at 1000 epochs.
# The optimiser is stochastic, so a fixed seed is required for a repeatable score.
clf = SGDRegressor(max_iter=1000, random_state=42)
clf.fit(X_train, y_train)

# Score of the fitted model on the held-out split (R^2 for scikit-learn regressors).
print(clf.score(X_test, y_test))

0.9040861346309637


In [70]:
# Show predicted vs. actual price for the first ten test rows.
# The loop variables deliberately avoid the names `X` and `y`: reusing them
# would overwrite the full feature matrix and target vector defined earlier.
# Slicing before zipping also avoids materialising the whole test set as a list.
for predicted_price, actual_price in zip(clf.predict(X_test[:10]), y_test[:10]):
    print(predicted_price, actual_price)

15298.319296019743 13919
12283.622023001368 14386
5396.202925107901 3951
4034.4365255612984 2855
215.1080120323627 645
3533.2049908575455 2978
-624.3585716217572 654
3935.1728997587816 3170
-1127.3151816200148 450
4022.1708282842237 2956


## SVR Regressor

In [71]:
from sklearn import svm

# Support Vector Regressor built with the estimator's default hyper-parameters;
# `fit` returns the estimator itself, so construction and training are chained.
clf = svm.SVR().fit(X_train, y_train)

# Evaluate on the held-out split and print the score.
svr_score = clf.score(X_test, y_test)
print(svr_score)



0.5413237370675921


In [72]:
# Show predicted vs. actual price for the first ten test rows.
# The loop variables deliberately avoid the names `X` and `y`: reusing them
# would overwrite the full feature matrix and target vector defined earlier.
# Slicing before zipping also avoids materialising the whole test set as a list.
for predicted_price, actual_price in zip(clf.predict(X_test[:10]), y_test[:10]):
    print(predicted_price, actual_price)

5122.681145918745 13919
6632.724385241532 14386
4567.411354034963 3951
3261.084788402066 2855
529.2786025656524 645
3219.2301461725656 2978
1002.5617023863538 654
3440.406994396222 3170
685.2569483457883 450
3101.373161450196 2956


### Support Vector Regression (SVR) with linear kernel

In [None]:
from sklearn.metrics import mean_absolute_error

# Support Vector Regressor with a linear kernel.
clf = svm.SVR(kernel="linear")
clf.fit(X_train, y_train)

# Score of the fitted model on the held-out split.
print(clf.score(X_test, y_test))

# predict() accepts only the feature matrix; passing the targets as a second
# positional argument raises a TypeError.
predictions_lin = clf.predict(X_test)  # make predictions

# Estimators have no `accuracy_score` method, and classification accuracy is
# not meaningful for a continuous target, so report the mean absolute error
# (in the same units as the price) instead.
mae = mean_absolute_error(y_test, predictions_lin)

print("Mean absolute error: ", mae)

### Support Vector Regression (SVR) with rbf kernel

In [None]:
from sklearn.metrics import mean_absolute_error

# Support Vector Regressor with an RBF kernel.
clf = svm.SVR(kernel="rbf")
clf.fit(X_train, y_train)

# Score of the fitted model on the held-out split.
print(clf.score(X_test, y_test))

# predict() accepts only the feature matrix; passing the targets as a second
# positional argument raises a TypeError.
predictions = clf.predict(X_test)  # make predictions

# Estimators have no `accuracy_score` method, and classification accuracy is
# not meaningful for a continuous target, so report the mean absolute error
# (in the same units as the price) instead.
mae = mean_absolute_error(y_test, predictions)

print("Mean absolute error: ", mae)

## Random Forest regression

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest of 10 trees; random_state=0 makes the fitted forest reproducible.
rf = RandomForestRegressor(n_estimators=10, random_state=0)
rf.fit(X_train,y_train)
print("SCORE: ",rf.score(X_test, y_test))

# Show predicted vs. actual price for the first ten test rows.
# The loop variables deliberately avoid the names `X` and `y`: reusing them
# would overwrite the full feature matrix and target vector defined earlier.
for predicted_price, actual_price in zip(rf.predict(X_test[:10]), y_test[:10]):
    print(predicted_price, actual_price)

## Linear Regression

In [80]:
from sklearn import linear_model

# Linear regression fitted on the training split; `fit` returns the estimator,
# so construction and training are chained.
linear = linear_model.LinearRegression().fit(X_train, y_train)

# Evaluate on the held-out split and print the score.
linear_score = linear.score(X_test, y_test)
print(linear_score)

0.905111184064965


## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
# NOTE(review): LogisticRegression is a classifier.  Fitting it on `y_train`
# (prices) treats every distinct price as its own class, so the score below is
# exact-match classification accuracy rather than a regression metric.  Confirm
# this is intended, or bin the target into classes before fitting.
logistic = LogisticRegression(random_state=0)  # create the classifier object
logistic.fit(X_train, y_train)  # fit on the training split
ac = logistic.score(X_test, y_test)  # mean accuracy on the held-out split

# predict() accepts only the feature matrix; passing the targets as a second
# positional argument raises a TypeError.
Y_pred = logistic.predict(X_test)  # make predictions

print("Accuracy: ",ac)

