Master-DataScience-Notes/1year/2trimester/Coding for Data Science - Python language/Python/Examples/Diamonds ML regression.ipynb
Andreaierardi d634470290 up
2020-03-04 17:29:48 +01:00

715 lines
18 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Diamonds price regression\n",
"\n",
"## Using Machine Lerning regressor\n",
"\n",
"The dataset used is [here](https://www.kaggle.com/shivam2503/diamonds/data)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>carat</th>\n",
" <th>cut</th>\n",
" <th>color</th>\n",
" <th>clarity</th>\n",
" <th>depth</th>\n",
" <th>table</th>\n",
" <th>price</th>\n",
" <th>x</th>\n",
" <th>y</th>\n",
" <th>z</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0.23</td>\n",
" <td>Ideal</td>\n",
" <td>E</td>\n",
" <td>SI2</td>\n",
" <td>61.5</td>\n",
" <td>55.0</td>\n",
" <td>326</td>\n",
" <td>3.95</td>\n",
" <td>3.98</td>\n",
" <td>2.43</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>0.21</td>\n",
" <td>Premium</td>\n",
" <td>E</td>\n",
" <td>SI1</td>\n",
" <td>59.8</td>\n",
" <td>61.0</td>\n",
" <td>326</td>\n",
" <td>3.89</td>\n",
" <td>3.84</td>\n",
" <td>2.31</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0.23</td>\n",
" <td>Good</td>\n",
" <td>E</td>\n",
" <td>VS1</td>\n",
" <td>56.9</td>\n",
" <td>65.0</td>\n",
" <td>327</td>\n",
" <td>4.05</td>\n",
" <td>4.07</td>\n",
" <td>2.31</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>0.29</td>\n",
" <td>Premium</td>\n",
" <td>I</td>\n",
" <td>VS2</td>\n",
" <td>62.4</td>\n",
" <td>58.0</td>\n",
" <td>334</td>\n",
" <td>4.20</td>\n",
" <td>4.23</td>\n",
" <td>2.63</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>0.31</td>\n",
" <td>Good</td>\n",
" <td>J</td>\n",
" <td>SI2</td>\n",
" <td>63.3</td>\n",
" <td>58.0</td>\n",
" <td>335</td>\n",
" <td>4.34</td>\n",
" <td>4.35</td>\n",
" <td>2.75</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" carat cut color clarity depth table price x y z\n",
"1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43\n",
"2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31\n",
"3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31\n",
"4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63\n",
"5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"df = pd.read_csv(\"Datasets/Diamonds/diamonds.csv\", index_col=0)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"df['cut'].unique()\n",
"cut_class_dict = {\"Fair\": 1, \"Good\": 2, \"Very Good\": 3, \"Premium\": 4, \"Ideal\": 5}\n",
"clarity_dict = {\"I3\": 1, \"I2\": 2, \"I1\": 3, \"SI2\": 4, \"SI1\": 5, \"VS2\": 6, \"VS1\": 7, \"VVS2\": 8, \"VVS1\": 9, \"IF\": 10, \"FL\": 11}\n",
"color_dict = {\"J\": 1,\"I\": 2,\"H\": 3,\"G\": 4,\"F\": 5,\"E\": 6,\"D\": 7}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>carat</th>\n",
" <th>cut</th>\n",
" <th>color</th>\n",
" <th>clarity</th>\n",
" <th>depth</th>\n",
" <th>table</th>\n",
" <th>price</th>\n",
" <th>x</th>\n",
" <th>y</th>\n",
" <th>z</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0.23</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>61.5</td>\n",
" <td>55.0</td>\n",
" <td>326</td>\n",
" <td>3.95</td>\n",
" <td>3.98</td>\n",
" <td>2.43</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>0.21</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>59.8</td>\n",
" <td>61.0</td>\n",
" <td>326</td>\n",
" <td>3.89</td>\n",
" <td>3.84</td>\n",
" <td>2.31</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0.23</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>7</td>\n",
" <td>56.9</td>\n",
" <td>65.0</td>\n",
" <td>327</td>\n",
" <td>4.05</td>\n",
" <td>4.07</td>\n",
" <td>2.31</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>0.29</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>62.4</td>\n",
" <td>58.0</td>\n",
" <td>334</td>\n",
" <td>4.20</td>\n",
" <td>4.23</td>\n",
" <td>2.63</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>0.31</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>63.3</td>\n",
" <td>58.0</td>\n",
" <td>335</td>\n",
" <td>4.34</td>\n",
" <td>4.35</td>\n",
" <td>2.75</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" carat cut color clarity depth table price x y z\n",
"1 0.23 5 6 4 61.5 55.0 326 3.95 3.98 2.43\n",
"2 0.21 4 6 5 59.8 61.0 326 3.89 3.84 2.31\n",
"3 0.23 2 6 7 56.9 65.0 327 4.05 4.07 2.31\n",
"4 0.29 4 2 6 62.4 58.0 334 4.20 4.23 2.63\n",
"5 0.31 2 1 4 63.3 58.0 335 4.34 4.35 2.75"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['cut'] = df['cut'].map(cut_class_dict)\n",
"df['clarity'] = df['clarity'].map(clarity_dict)\n",
"df['color'] = df['color'].map(color_dict)\n",
"df.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import sklearn\n",
"from sklearn import svm, preprocessing\n",
"from sklearn.linear_model import SGDRegressor\n",
"\n",
"df = sklearn.utils.shuffle(df) # always shuffle your data to avoid any biases that may emerge b/c of some order.\n",
"\n",
"X = df.drop(\"price\", axis=1).values\n",
"X = preprocessing.scale(X)\n",
"y = df[\"price\"].values"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Data len: 53940 \n",
"As test we used 20%: 10788.0\n"
]
}
],
"source": [
"len(y)\n",
"print(\"Data len: \",len(y),\"\\nAs test we used 20%: \",20/100*len(y))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## SGD Regressor"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.907304269100812\n"
]
}
],
"source": [
"clf = SGDRegressor(max_iter=1000)\n",
"clf.fit(X_train, y_train)\n",
"\n",
"print(clf.score(X_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2057.938907081024 1716\n",
"408.86668615612143 776\n",
"1877.1359526566246 1850\n",
"14413.45842840683 13317\n",
"6583.122199486379 5880\n",
"1472.7275999563244 2231\n",
"603.3124107228641 666\n",
"-175.58587579503273 705\n",
"732.6492505985616 552\n",
"2763.681359134369 3061\n"
]
}
],
"source": [
"for X,y in list(zip(X_test, y_test))[:10]:\n",
" print(clf.predict([X])[0], y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## SVR Regressor"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"E:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\svm\\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n",
" \"avoid this warning.\", FutureWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5235043972946711\n"
]
}
],
"source": [
"from sklearn import svm\n",
"\n",
"clf = svm.SVR()\n",
"\n",
"clf.fit(X_train, y_train)\n",
"print(clf.score(X_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1614.983425223527 1716\n",
"579.9398270257539 776\n",
"2254.067311348471 1850\n",
"5348.406983308956 13317\n",
"5642.754018835578 5880\n",
"2166.566950323401 2231\n",
"763.0511461708497 666\n",
"514.0293281929548 705\n",
"1103.8544288088133 552\n",
"3299.614546713646 3061\n"
]
}
],
"source": [
"for X,y in list(zip(X_test, y_test))[:10]:\n",
" print(clf.predict([X])[0], y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Support Vector Regression (SVR) with linear kernel"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.8630448690848345\n"
]
}
],
"source": [
"clf = svm.SVR(kernel=\"linear\")\n",
"\n",
"clf.fit(X_train, y_train)\n",
"print(clf.score(X_test, y_test))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Support Vector Regression (SVR) with rbf kernel"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"E:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\svm\\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n",
" \"avoid this warning.\", FutureWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5235043972946711\n"
]
}
],
"source": [
"clf2 = svm.SVR(kernel=\"rbf\")\n",
"\n",
"clf2.fit(X_train, y_train)\n",
"print(clf2.score(X_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1614.98342522, 579.93982703, 2254.06731135, ..., 1600.90972976,\n",
" 863.31271817, 1313.2325644 ])"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predictions = clf2.predict(X_test) # make predictions\n",
"predictions"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Random Forest regression"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SCORE: 0.9795971375747344 \n",
"--------------\n",
"1651.2 1716\n",
"748.7833333333333 776\n",
"1823.3 1850\n",
"16492.9 13317\n",
"4931.6 5880\n",
"1935.2 2231\n",
"688.1 666\n",
"689.4 705\n",
"663.8 552\n",
"3171.0 3061\n"
]
}
],
"source": [
"from sklearn.ensemble import RandomForestRegressor\n",
"rf = RandomForestRegressor(n_estimators=10, random_state=0)\n",
"rf.fit(X_train,y_train)\n",
"print(\"SCORE: \",rf.score(X_test, y_test),\"\\n--------------\")\n",
"\n",
"for X,y in list(zip(X_test, y_test))[:10]:\n",
" print(rf.predict([X])[0], y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Linear Regression"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SCORE: 0.9087525665852461 \n",
"--------------\n",
"2039.5860723087262 1716\n",
"315.57839741764747 776\n",
"1857.7588850776274 1850\n",
"14629.248347536268 13317\n",
"6689.10311583452 5880\n",
"1455.96854276338 2231\n",
"543.9652038627887 666\n",
"-295.936014145806 705\n",
"675.2908110759709 552\n",
"2831.5232911325866 3061\n"
]
}
],
"source": [
"from sklearn import linear_model\n",
"\n",
"linear = linear_model.LinearRegression()\n",
"\n",
"linear.fit(X_train, y_train)\n",
"\n",
"print(\"SCORE: \",linear.score(X_test, y_test),\"\\n--------------\")\n",
"for X,y in list(zip(X_test, y_test))[:10]:\n",
" print(linear.predict([X])[0], y)\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Logistic regression"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"E:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
" FutureWarning)\n",
"E:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:469: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
" \"this warning.\", FutureWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"SCORE: 0.009547645532072673 \n",
"--------------\n"
]
}
],
"source": [
"from sklearn import linear_model\n",
"from sklearn.linear_model import LogisticRegression\n",
"logistic = linear_model.LogisticRegression(random_state=0) # create object for the class\n",
"logistic.fit(X_train, y_train) # perform logistic regression\n",
"ac = logistic.score(X_test, y_test)\n",
"print(\"SCORE: \",ac ,\"\\n--------------\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1662 3370\n",
"1882 3321\n",
"776 698\n",
"530 646\n",
"1882 4839\n",
"1076 1851\n",
"984 1624\n",
"802 665\n",
"872 596\n",
"394 1154\n"
]
}
],
"source": [
"for X,y in list(zip(X_test, y_test))[:10]:\n",
" print(logistic.predict([X])[0], y)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}