"df = sklearn.utils.shuffle(df) # always shuffle your data to avoid any biases that may emerge b/c of some order.\n",
"\n",
"X = df.drop(\"price\", axis=1).values\n",
"X = preprocessing.scale(X)\n",
"y = df[\"price\"].values"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Data len: 53940 \n",
"As test we used 20%: 10788.0\n"
]
}
],
"source": [
"# Report the dataset size and how many rows a 20% hold-out corresponds to.\n",
"# True division yields a float, so the hold-out count prints with a trailing .0.\n",
"print(\"Data len: \", len(y), \"\\nAs test we used 20%: \", 20 / 100 * len(y))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Hold out 20% of the rows for evaluation.\n",
"# A fixed random_state makes the split deterministic for a given ordering of X and y,\n",
"# so every model below is scored on the same rows when the cell is rerun.\n",
"# NOTE(review): full run-to-run reproducibility also needs the upstream shuffle to be seeded.\n",
"X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n",
"    X, y, test_size=0.2, random_state=42\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## SGD Regressor"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.907304269100812\n"
]
}
],
"source": [
"# Fit a linear model with stochastic gradient descent on the training split.\n",
"# random_state pins the estimator's internal shuffling so the fit is repeatable on the same data.\n",
"clf = SGDRegressor(max_iter=1000, random_state=42)\n",
"clf.fit(X_train, y_train)\n",
"\n",
"# For regressors, score() reports the R^2 coefficient on the given data.\n",
"print(clf.score(X_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2057.938907081024 1716\n",
"408.86668615612143 776\n",
"1877.1359526566246 1850\n",
"14413.45842840683 13317\n",
"6583.122199486379 5880\n",
"1472.7275999563244 2231\n",
"603.3124107228641 666\n",
"-175.58587579503273 705\n",
"732.6492505985616 552\n",
"2763.681359134369 3061\n"
]
}
],
"source": [
"# Print predicted vs. actual price for the first 10 test rows.\n",
"# The loop variables must not be called X and y: that would rebind the notebook-level\n",
"# feature matrix and target to a single row / scalar and break any rerun of earlier cells.\n",
"# Slicing before zipping also avoids building the full list of (row, target) pairs.\n",
"for features, actual in zip(X_test[:10], y_test[:10]):\n",
"    print(clf.predict([features])[0], actual)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Support Vector Regression (SVR)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5235043972946711\n"
]
}
],
"source": [
"from sklearn import svm\n",
"\n",
"# Support Vector Regression with the default kernel.\n",
"# gamma is passed explicitly as 'auto' (the default at the time this was run) so the result\n",
"# does not shift when the library default changes to 'scale', and no FutureWarning is raised.\n",
"clf = svm.SVR(gamma=\"auto\")\n",
"\n",
"clf.fit(X_train, y_train)\n",
"# For regressors, score() reports the R^2 coefficient on the given data.\n",
"print(clf.score(X_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1614.983425223527 1716\n",
"579.9398270257539 776\n",
"2254.067311348471 1850\n",
"5348.406983308956 13317\n",
"5642.754018835578 5880\n",
"2166.566950323401 2231\n",
"763.0511461708497 666\n",
"514.0293281929548 705\n",
"1103.8544288088133 552\n",
"3299.614546713646 3061\n"
]
}
],
"source": [
"# Print predicted vs. actual price for the first 10 test rows using the current clf.\n",
"# The loop variables must not be called X and y: that would rebind the notebook-level\n",
"# feature matrix and target to a single row / scalar and break any rerun of earlier cells.\n",
"# Slicing before zipping also avoids building the full list of (row, target) pairs.\n",
"for features, actual in zip(X_test[:10], y_test[:10]):\n",
"    print(clf.predict([features])[0], actual)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Support Vector Regression (SVR) with linear kernel"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.8630448690848345\n"
]
}
],
"source": [
"# Linear-kernel SVR: fit on the training split, then print its score on the held-out split.\n",
"# fit() returns the estimator itself, so construction and fitting can be chained.\n",
"clf = svm.SVR(kernel=\"linear\").fit(X_train, y_train)\n",
"print(clf.score(X_test, y_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Support Vector Regression (SVR) with rbf kernel"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"E:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\svm\\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n",