{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Diamonds price regression\n",
"\n",
"## Using Machine Lerning regressor\n",
"\n",
"The dataset used is [here](https://www.kaggle.com/shivam2503/diamonds/data)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" carat | \n",
" cut | \n",
" color | \n",
" clarity | \n",
" depth | \n",
" table | \n",
" price | \n",
" x | \n",
" y | \n",
" z | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 0.23 | \n",
" Ideal | \n",
" E | \n",
" SI2 | \n",
" 61.5 | \n",
" 55.0 | \n",
" 326 | \n",
" 3.95 | \n",
" 3.98 | \n",
" 2.43 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.21 | \n",
" Premium | \n",
" E | \n",
" SI1 | \n",
" 59.8 | \n",
" 61.0 | \n",
" 326 | \n",
" 3.89 | \n",
" 3.84 | \n",
" 2.31 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.23 | \n",
" Good | \n",
" E | \n",
" VS1 | \n",
" 56.9 | \n",
" 65.0 | \n",
" 327 | \n",
" 4.05 | \n",
" 4.07 | \n",
" 2.31 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.29 | \n",
" Premium | \n",
" I | \n",
" VS2 | \n",
" 62.4 | \n",
" 58.0 | \n",
" 334 | \n",
" 4.20 | \n",
" 4.23 | \n",
" 2.63 | \n",
"
\n",
" \n",
" 5 | \n",
" 0.31 | \n",
" Good | \n",
" J | \n",
" SI2 | \n",
" 63.3 | \n",
" 58.0 | \n",
" 335 | \n",
" 4.34 | \n",
" 4.35 | \n",
" 2.75 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" carat cut color clarity depth table price x y z\n",
"1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43\n",
"2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31\n",
"3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31\n",
"4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63\n",
"5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"df = pd.read_csv(\"Datasets/Diamonds/diamonds.csv\", index_col=0)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"df['cut'].unique()\n",
"cut_class_dict = {\"Fair\": 1, \"Good\": 2, \"Very Good\": 3, \"Premium\": 4, \"Ideal\": 5}\n",
"clarity_dict = {\"I3\": 1, \"I2\": 2, \"I1\": 3, \"SI2\": 4, \"SI1\": 5, \"VS2\": 6, \"VS1\": 7, \"VVS2\": 8, \"VVS1\": 9, \"IF\": 10, \"FL\": 11}\n",
"color_dict = {\"J\": 1,\"I\": 2,\"H\": 3,\"G\": 4,\"F\": 5,\"E\": 6,\"D\": 7}"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" carat | \n",
" cut | \n",
" color | \n",
" clarity | \n",
" depth | \n",
" table | \n",
" price | \n",
" x | \n",
" y | \n",
" z | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 0.23 | \n",
" 5 | \n",
" 6 | \n",
" 4 | \n",
" 61.5 | \n",
" 55.0 | \n",
" 326 | \n",
" 3.95 | \n",
" 3.98 | \n",
" 2.43 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.21 | \n",
" 4 | \n",
" 6 | \n",
" 5 | \n",
" 59.8 | \n",
" 61.0 | \n",
" 326 | \n",
" 3.89 | \n",
" 3.84 | \n",
" 2.31 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.23 | \n",
" 2 | \n",
" 6 | \n",
" 7 | \n",
" 56.9 | \n",
" 65.0 | \n",
" 327 | \n",
" 4.05 | \n",
" 4.07 | \n",
" 2.31 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.29 | \n",
" 4 | \n",
" 2 | \n",
" 6 | \n",
" 62.4 | \n",
" 58.0 | \n",
" 334 | \n",
" 4.20 | \n",
" 4.23 | \n",
" 2.63 | \n",
"
\n",
" \n",
" 5 | \n",
" 0.31 | \n",
" 2 | \n",
" 1 | \n",
" 4 | \n",
" 63.3 | \n",
" 58.0 | \n",
" 335 | \n",
" 4.34 | \n",
" 4.35 | \n",
" 2.75 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" carat cut color clarity depth table price x y z\n",
"1 0.23 5 6 4 61.5 55.0 326 3.95 3.98 2.43\n",
"2 0.21 4 6 5 59.8 61.0 326 3.89 3.84 2.31\n",
"3 0.23 2 6 7 56.9 65.0 327 4.05 4.07 2.31\n",
"4 0.29 4 2 6 62.4 58.0 334 4.20 4.23 2.63\n",
"5 0.31 2 1 4 63.3 58.0 335 4.34 4.35 2.75"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['cut'] = df['cut'].map(cut_class_dict)\n",
"df['clarity'] = df['clarity'].map(clarity_dict)\n",
"df['color'] = df['color'].map(color_dict)\n",
"df.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"import sklearn\n",
"from sklearn import svm, preprocessing\n",
"from sklearn.linear_model import SGDRegressor\n",
"\n",
"df = sklearn.utils.shuffle(df) # always shuffle your data to avoid any biases that may emerge b/c of some order.\n",
"\n",
"X = df.drop(\"price\", axis=1).values\n",
"X = preprocessing.scale(X)\n",
"y = df[\"price\"].values"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Data len: 53940 \n",
"As test we used 20%: 10788.0\n"
]
}
],
"source": [
"len(y)\n",
"print(\"Data len: \",len(y),\"\\nAs test we used 20%: \",20/100*len(y))"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## SGD Regressor"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9040861346309637\n"
]
}
],
"source": [
"clf = SGDRegressor(max_iter=1000)\n",
"clf.fit(X_train, y_train)\n",
"\n",
"print(clf.score(X_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"15298.319296019743 13919\n",
"12283.622023001368 14386\n",
"5396.202925107901 3951\n",
"4034.4365255612984 2855\n",
"215.1080120323627 645\n",
"3533.2049908575455 2978\n",
"-624.3585716217572 654\n",
"3935.1728997587816 3170\n",
"-1127.3151816200148 450\n",
"4022.1708282842237 2956\n"
]
}
],
"source": [
"for X,y in list(zip(X_test, y_test))[:10]:\n",
" print(clf.predict([X])[0], y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## SVR Regressor"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"E:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\svm\\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n",
" \"avoid this warning.\", FutureWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5413237370675921\n"
]
}
],
"source": [
"from sklearn import svm\n",
"\n",
"clf = svm.SVR()\n",
"\n",
"clf.fit(X_train, y_train)\n",
"print(clf.score(X_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5122.681145918745 13919\n",
"6632.724385241532 14386\n",
"4567.411354034963 3951\n",
"3261.084788402066 2855\n",
"529.2786025656524 645\n",
"3219.2301461725656 2978\n",
"1002.5617023863538 654\n",
"3440.406994396222 3170\n",
"685.2569483457883 450\n",
"3101.373161450196 2956\n"
]
}
],
"source": [
"for X,y in list(zip(X_test, y_test))[:10]:\n",
" print(clf.predict([X])[0], y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Support Vector Regression (SVR) with linear kernel"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clf = svm.SVR(kernel=\"linear\")\n",
"\n",
"clf.fit(X_train, y_train)\n",
"print(clf.score(X_test, y_test))\n",
"\n",
"predictions_lin = clf.predict(X_test, y_test) # make predictions\n",
"\n",
"acc = clf.accuracy_score(y_test, predictions_lin)\n",
"\n",
"print(\"Accuracy: \",acc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Support Vector Regression (SVR) with rbf kernel"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clf = svm.SVR(kernel=\"rbf\")\n",
"\n",
"clf.fit(X_train, y_train)\n",
"print(clf.score(X_test, y_test))\n",
"\n",
"predictions = clf.predict(X_test, y_test) # make predictions\n",
"\n",
"acc = clf.accuracy_score(y_test, predictions)\n",
"\n",
"print(\"Accuracy: \",acc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Random Forest regression"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestRegressor\n",
"rf = RandomForestRegressor(n_estimators=10, random_state=0)\n",
"rf.fit(X_train,y_train)\n",
"print(\"SCORE: \",rf.score(X_test, y_test))\n",
"\n",
"for X,y in list(zip(X_test, y_test))[:10]:\n",
" print(rf.predict([X])[0], y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Linear Regression"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.905111184064965\n"
]
}
],
"source": [
"from sklearn import linear_model\n",
"\n",
"linear = linear_model.LinearRegression()\n",
"\n",
"linear.fit(X_train, y_train)\n",
"print(linear.score(X_test, y_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Logistic regression"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"E:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
" FutureWarning)\n",
"E:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:469: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
" \"this warning.\", FutureWarning)\n"
]
}
],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"logistic = LogisticRegression(random_state=0) # create object for the class\n",
"logistic.fit(X_train, y_train) # perform logistic regression\n",
"ac = logistic.score(X_test, y_test)\n",
"Y_pred = logistic.predict(X_test, y_test) # make predictions\n",
"\n",
"print(\"Accuracy: \",ac)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}