{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.ensemble import BaggingClassifier\n",
"from sklearn.model_selection import train_test_split, cross_val_score, validation_curve, learning_curve"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>1x1</th>\n",
" <th>1x2</th>\n",
" <th>1x3</th>\n",
" <th>1x4</th>\n",
" <th>1x5</th>\n",
" <th>1x6</th>\n",
" <th>1x7</th>\n",
" <th>1x8</th>\n",
" <th>1x9</th>\n",
" <th>...</th>\n",
" <th>28x19</th>\n",
" <th>28x20</th>\n",
" <th>28x21</th>\n",
" <th>28x22</th>\n",
" <th>28x23</th>\n",
" <th>28x24</th>\n",
" <th>28x25</th>\n",
" <th>28x26</th>\n",
" <th>28x27</th>\n",
" <th>28x28</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 785 columns</p>\n",
"</div>"
],
"text/plain": [
" label 1x1 1x2 1x3 1x4 1x5 1x6 1x7 1x8 1x9 ... 28x19 28x20 \\\n",
"0 7 0 0 0 0 0 0 0 0 0 ... 0 0 \n",
"1 2 0 0 0 0 0 0 0 0 0 ... 0 0 \n",
"2 1 0 0 0 0 0 0 0 0 0 ... 0 0 \n",
"3 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n",
"4 4 0 0 0 0 0 0 0 0 0 ... 0 0 \n",
"\n",
" 28x21 28x22 28x23 28x24 28x25 28x26 28x27 28x28 \n",
"0 0 0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 0 0 \n",
"2 0 0 0 0 0 0 0 0 \n",
"3 0 0 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 0 0 0 \n",
"\n",
"[5 rows x 785 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mnist = pd.read_csv(\"Datasets/MNIST/mnist_test.csv\")\n",
"mnist.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(10000, 785)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mnist.shape"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 7\n",
"1 2\n",
"2 1\n",
"3 0\n",
"4 4\n",
" ..\n",
"9995 2\n",
"9996 3\n",
"9997 4\n",
"9998 5\n",
"9999 6\n",
"Name: label, Length: 10000, dtype: int64"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Split the frame into the 784 pixel columns (features) and the digit label (target).\n",
"# `columns=` replaces the positional axis argument of `drop(\"label\", 1)`,\n",
"# which newer pandas releases no longer accept.\n",
"mnist_X = mnist.drop(columns=\"label\")\n",
"mnist_y = mnist[\"label\"]\n",
"mnist_y"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([7, 2, 1, ..., 4, 5, 6], dtype=int64)"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mnist_y.values"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"X = mnist_X.values\n",
"y = mnist_y.values\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"X train : (8000, 784) -- X test : (2000, 784) \n",
"Y train : (8000,) -- Y test : (2000,)\n"
]
}
],
"source": [
"# Report the shapes of the train/test partitions: features first, labels second.\n",
"print(\n",
"    f\"X train : {X_train.shape} -- X test : {X_test.shape} \\n\"\n",
"    f\"Y train : {y_train.shape} -- Y test : {y_test.shape}\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Figure size 432x432 with 0 Axes>"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"<Figure size 432x432 with 0 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(6,6))"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAVQAAADrCAYAAAA2eW6hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nOyddVgU2xvHP7OgWKCiomKhci3s9trdgd1x7Y5rt9fuwLhid3djXOvaYICBoqIgKgKCAhLC+f2x11WkYXdn8Tef55mH3TlnznyZmX3n5PtKQggUFBQUFJKPSm4BCgoKCr8KikFVUFBQ0BKKQVVQUFDQEopBVVBQUNASikFVUFBQ0BKKQVVQUFDQEsaJyZw1a1ZhZWWlEyHu7u74+PhI2ihLlzoBHB0dfYQQ2bRRVkq5pqBo/YbyrGqfX+X+J8qgWllZcefOHe2o+ony5ctrrSxd6gSQJOmVtspKKdcUFK3fUJ5V7fOr3P//yyZ/s2YDUamMom3585eUW1qCufPiBdNXbJZbRoz4BwfTtedkVCojSpWqzUNPT7klKSjohf87g9qs2UBOnrQHoGDBMvQaMJ169boD8OrVQzmlJYobTg/JYZVDbhkx8vz9e3Zvn49KpcLZ+TIHDl+QW1KM5M5dKNa0rZcu88TLS49qEsf60+f4a+VWwiMi5Jai4ZWPD4NGL8DF0yPOfB8+fWL96XOEhofrSZn+SFSTP6Vz+sEDzpzZCMCD1+7kymxO5vTp+RIWRvkyz3j8+LrMChPOo2uPWL1knNwyouHh60u/tgPklpEgQkNDYk07t/Uc+7x3c+zYaj0qShheHz8yuWdf3r93Z2Sf9qQyMpJbEu8CAihTqAS//96K4rnzxJrvw6dPlC9RBT+/t5R54Ei5/Pn1qDIqPp8/M3LYAp4+usvlq4cwSZUq2WUmu4a6+vBJatRoT9t2o+nSYxJO7u7JFqUrPL28ESKSQoXKUzx3HjKnTw/A1HnrcHNzlFldwrn85Amb/54tt4wYad+sN/fvR62R3jp9nX03b8qkKGbCvn6NM71kjRI8e+aIf3CwnhQlnGNXb/H+vTvNmw8hvYmJ3HLw9POjdeOe+Pt7c/z4mjjzTpq8Eg+PJ0xaulpWYwpQumg5dmyZw+3bp/ANDNRKmcmuoc4d+ieenq6a70cPruG332LvtM2ZswBD5w6iUUn991f2aVSPKu4vMc+QIcr+47u2Ex4epnc9ScXlyQu+fNHOA6Btbt8+EW3f6dPrcXa+TMbju2lQooQMqqKz59p12vUYEmu6/4cAnj1z5POXL2RKl06PyuImODSUlRPnANB6eBtUktYGxpPMBWcXbtw4Gm++f58+ZZ3dZBo2/IN+HZvpQVnsuL59y5s3bkj/Xb8BPaaydutMcmbKlKxyk21Q5+1az+Mbj/mtrDXPnNzYsHgud++ew9KyIF5ezzX5LCzy4u39mrt3wTJ/HhqtlGcAyCZ37ijfJ8xby8sX9wEoXbquHJISzdppC8mV6ze5ZUSjVq1OREZGRtmXOXN20qXLiIfHExqVKk1kpPx9fpefPGFYy3a88Hwea55z+4/oUVHCufTkCS4ulwHoWbeWvGJQ95ueXn8agAWbdsea79+nT2lVtR4ADbs3jVap0Tfzp9tH+X78+GqK5NvNkElzmD22b9ILFkIkeCtXrpxIKqsPnxSSpBJFilQWXh8/Rkv/r+xE6UmuzuETlwhJUglJUole/aYn+H8B7uhb6zcWbtknQBLlyzeON682r2lcWp09XousWfMIIyNjYWRkLPLmLSb6D58rAoKDNXmyZcsrjIyMRbp0ZmLyovUiJCxMFq2rDp0Q6dNnFIUKVYj1uvUeNFNIkkpUqdIqmk5ta03s/S9btoEARKpUJgnKr+tn1dZ2pJAklahbt1uM10oIIT59+SIyZswmJEkVq0593X8hhLA/cUaAJIQQIjDki5i/cY/Il89GgCSAGO
1TQrXqbVBqSu8+CBHJsHlTk12t1gYNGvTi8uW9ALRuM5LFS0bLrChhuN5Sd6/0nzlKZiXfCQ3/ysePbwGoVKk5e46tJ7e5eZQ8gybPYNaovnz5EsjccQNo07YepfPl07vWk+uP8eVLIF2HDY01z5G9azEyMmbIwjFaGajQFocdHXFycgAgbVpTmdVE5cKFHdSs7s+g+VF/RzeP3+TGhXN8+uRLkyb9ZFIXldAvYZqmfnqTNIzt1Z6TWw7x+vVjhIA0ybjnejOovr5vMTPLSvEiBfR1yjhxdDxDWFgI5uY5mLxoqGaAypA56uTEvu3LsbGpSqfa1eSWEyPr9tpFM6YA7VrX48imGty/f1H/ov7jw6dP3L17DoApg7vFms/P7y3W1mXpXPV3fUlLEE6X72s+d+03VkYl3xkwoxfXrh3C2/s1t26d4FbtqH3oQggkSSJv3qLMWjVeJpVRObr2YLR9zs6XNJ+TYwv0Mg/10H8rFtY7HKd64cL6OGW8fPz4HoBWHQbKUlNKCpeO/ktAwAcKFixDepM0csuJQmRkJJGRkRTLlSvGdIEgMjISIdT5pg9fomeFEBwWxrt3L2nefHC8eQsVqqgHRYnjwSW1QTUzzcKwUV1lVqOmQYkS3HO9x45/r9Jv2GyyZLGk37DZmu3yk8cAlCvX0GB+Zy36twbgppsbqw6eoEWLoQQEfMDMLAsA1549S3LZeqmhntvzD1Wr2tKibFl9nC5e1p06C0Dlyi1YuGCkzGoSzuNbLoBEkz5N5ZYShb8X70SlivvdvP/geR4+vIokqVCpVExfrv8uC/MMGShWrCqurrd46+8fY9fTKx8fhBBUaGhYBvXA7dscPboSAFMzcwrnzCmzou/kyJiRTlWq0KlKFf5eFrUWev/1a4QQLLOfKJO66LSpW41JpuZU/q2QpulfrVoblm1dQLu6rVg1cxO/b52TpLJ1blA/h4Rw6dRRZm1ZZhD9UV4fP7J64jwAipUplyKa+gAvP3hz+/ZJChQoRf/mDeWWE4XzJ/fGme7h68uqmVM1383NLUltrP81JaZp0pAvXzFOnVpH09rt6D1lhCbt6Z2nvHrkzps3z5AkSfNDMxT8PnxECPUMimo1WsusJuHMHLUcSZLIkyWL3FI05MyUiVXHD9GtRnWEgJ59p7HSbhzpTdLQ0LYzR3dv4N6r/kmqUeu0yb/x7AUyZzDl9etHtCpXTpenSjD5c1hy//5Fnr17y/pVk+WWk2Amj16Oj88bajVMOT8mUE+lqlqmumbQqlatTnh7v4q1a0DXHD++hoiIryzcvgxJJWm25XNGcvjwcvLmLQrE3ceqb1zfvqV/s8YAFCpUgd0758msKGG06zCGAweWJqiLRd90rV7tv5H5SDbZT9N0oa1eNBYvLzeaVKnLW3//RJers2qC18ePTO89hIiICGrV6qyr0ySZd/7+mPxUS8pmqh45/fD5MwDvAwJYOmMDAEbGRtjZjccsbVr9Cv2P82d2AlC3i+HNlRVCaOafbj5/kfFdeuHt/fq/tEgk6ft7+8KF7bJo/Jm6NjbUtbGJtj9P0TxwWD1XtUaRIjIoi87Za46a2mn9lu1lVpNwLl7YQ9q0GRgy2zBG9xNKs2aDOH58NQsWb2HpzOGJOlYnBjU8IoJGNW3x8HhCvnzF+MvO8KYkVS9SNNq+pk37I0lGHD8e8/rtbHmysWha7KtrdMX+W7fw8TFcj01dhwxn7jj1+v0+DesDaPpUIyO/f+7S03D60WLlv/mEhmJMAfzfq2tKmTPnYPz4P2RWkzBm/70dHx9PsmbNRf3ixeWWkyhGzB3A2bObWT57FF16Nad8gYTPTNJJk//+q1c4O6tXc0z+e5nBjO4BcdaWT5xYqzGmxsapSJ3KhNSpTGjSpB/j5qymRnN5ptGc3uJAREQENjZVaVe5kiwa4qJjh4aYm1vGmm5ubsmd527Y2RnGtJk4McD+08uHzwNgaWmtaUUZOjtWrEaSJGrX7QTAoz
dvZFaUcOrY2DBqxhKEEEwZPI/PIbE70fkZrddQH3p60qpGIwAmzl/LHw0Nq4l65swGxs+twNcwtXOMp46uUWqko6Y
"text/plain": [
"<Figure size 432x288 with 64 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Preview the first 64 digits on an 8x8 grid.\n",
"# The figure is created in this cell: a figure opened in an earlier cell is shown\n",
"# (empty) by that cell, so its figsize never reaches the grid drawn here.\n",
"N_ROWS, N_COLS = 8, 8\n",
"fig, axes = plt.subplots(N_ROWS, N_COLS, figsize=(6, 6))\n",
"for digit_num, ax in enumerate(axes.ravel()):\n",
"    # every row of mnist_X is one flattened 28x28 image\n",
"    grid_data = mnist_X.iloc[digit_num].values.reshape(28, 28)\n",
"    ax.imshow(grid_data, interpolation=\"none\", cmap=\"bone_r\")\n",
"    ax.set_xticks([])\n",
"    ax.set_yticks([])"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "multiclass format is not supported",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-76-fdcd5b2118f0>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmetrics\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mroc_auc_score\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[0mroc_value\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mroc_auc_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrf_probs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 10\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[1;31m#sizes = range(1000, 6666, 1000)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mE:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\ranking.py\u001b[0m in \u001b[0;36mroc_auc_score\u001b[1;34m(y_true, y_score, average, sample_weight, max_fpr)\u001b[0m\n\u001b[0;32m 353\u001b[0m return _average_binary_score(\n\u001b[0;32m 354\u001b[0m \u001b[0m_binary_roc_auc_score\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_true\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_score\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maverage\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 355\u001b[1;33m sample_weight=sample_weight)\n\u001b[0m\u001b[0;32m 356\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 357\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mE:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\base.py\u001b[0m in \u001b[0;36m_average_binary_score\u001b[1;34m(binary_metric, y_true, y_score, average, sample_weight)\u001b[0m\n\u001b[0;32m 71\u001b[0m \u001b[0my_type\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtype_of_target\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 72\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0my_type\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;34m\"binary\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"multilabel-indicator\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 73\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"{0} format is not supported\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_type\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 74\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 75\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0my_type\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m\"binary\"\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mValueError\u001b[0m: multiclass format is not supported"
]
}
],
"source": [
"from sklearn.metrics import roc_auc_score\n",
"from sklearn.preprocessing import label_binarize\n",
"\n",
"# Fixed seed so the reported scores are reproducible across runs.\n",
"model = RandomForestClassifier(n_estimators=10, max_depth=10, random_state=42)\n",
"model.fit(X_train, y_train)\n",
"\n",
"pred = model.predict(X_test)\n",
"\n",
"# Keep the full (n_samples, n_classes) matrix: the labels are the ten digits, so\n",
"# the single column `[:, 1]` would only be the probability of the digit 1.\n",
"rf_probs = model.predict_proba(X_test)\n",
"\n",
"# roc_auc_score rejects a plain multiclass label vector, so score one-vs-rest on\n",
"# the binarized test labels (columns ordered by model.classes_, like predict_proba).\n",
"y_test_onehot = label_binarize(y_test, classes=model.classes_)\n",
"roc_value = roc_auc_score(y_test_onehot, rf_probs, average=\"macro\")\n",
"print(\"accuracy:\", accuracy_score(y_test, pred), \"-- macro OvR ROC AUC:\", roc_value)\n",
"\n",
"#sizes = range(1000, 6666, 1000)\n",
"#train_size, train_score, val_score = learning_curve(rf_lrn, X, y, train_sizes=sizes, cv=3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# MILAN POLLUTION RF REGRESSOR"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>stazione_id</th>\n",
" <th>data</th>\n",
" <th>inquinante</th>\n",
" <th>valore</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>2019/01/03</td>\n",
" <td>NO2</td>\n",
" <td>51.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>2019/01/03</td>\n",
" <td>CO_8h</td>\n",
" <td>1.2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>2019/01/03</td>\n",
" <td>PM10</td>\n",
" <td>29.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>2019/01/03</td>\n",
" <td>NO2</td>\n",
" <td>139.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>2019/01/03</td>\n",
" <td>CO_8h</td>\n",
" <td>1.3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" stazione_id data inquinante valore\n",
"0 3 2019/01/03 NO2 51.0\n",
"1 3 2019/01/03 CO_8h 1.2\n",
"2 4 2019/01/03 PM10 29.0\n",
"3 4 2019/01/03 NO2 139.0\n",
"4 4 2019/01/03 CO_8h 1.3"
]
},
"execution_count": 170,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset = pd.read_csv(\"Datasets/RilevazioneQA/qaria_2019.csv\")\n",
"dataset.head()"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(6162, 4)"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.shape"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 NO2\n",
"1 CO_8h\n",
"2 PM10\n",
"3 NO2\n",
"4 CO_8h\n",
" ... \n",
"6157 NO2\n",
"6158 O3\n",
"6159 NO2\n",
"6160 CO_8h\n",
"6161 C6H6\n",
"Name: inquinante, Length: 6162, dtype: object"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset[\"inquinante\"]"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"stazione_id int64\n",
"data object\n",
"inquinante object\n",
"valore float64\n",
"dtype: object"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.dtypes\n"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>stazione_id</th>\n",
" <th>valore</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>count</td>\n",
" <td>6162.000000</td>\n",
" <td>4488.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean</td>\n",
" <td>4.615385</td>\n",
" <td>35.406009</td>\n",
" </tr>\n",
" <tr>\n",
" <td>std</td>\n",
" <td>2.167715</td>\n",
" <td>39.452066</td>\n",
" </tr>\n",
" <tr>\n",
" <td>min</td>\n",
" <td>1.000000</td>\n",
" <td>0.250000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>25%</td>\n",
" <td>2.000000</td>\n",
" <td>2.100000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>50%</td>\n",
" <td>4.500000</td>\n",
" <td>21.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>75%</td>\n",
" <td>6.000000</td>\n",
" <td>60.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>max</td>\n",
" <td>8.000000</td>\n",
" <td>234.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" stazione_id valore\n",
"count 6162.000000 4488.000000\n",
"mean 4.615385 35.406009\n",
"std 2.167715 39.452066\n",
"min 1.000000 0.250000\n",
"25% 2.000000 2.100000\n",
"50% 4.500000 21.000000\n",
"75% 6.000000 60.000000\n",
"max 8.000000 234.000000"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.describe()"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>stazione_id</th>\n",
" <th>data</th>\n",
" <th>inquinante</th>\n",
" <th>valore</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>2019/01/03</td>\n",
" <td>PM10</td>\n",
" <td>29.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>2</td>\n",
" <td>2019/01/03</td>\n",
" <td>PM10</td>\n",
" <td>20.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15</td>\n",
" <td>6</td>\n",
" <td>2019/01/03</td>\n",
" <td>PM10</td>\n",
" <td>24.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>20</td>\n",
" <td>7</td>\n",
" <td>2019/01/03</td>\n",
" <td>PM10</td>\n",
" <td>32.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>29</td>\n",
" <td>4</td>\n",
" <td>2019/01/04</td>\n",
" <td>PM10</td>\n",
" <td>25.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6120</td>\n",
" <td>2</td>\n",
" <td>2019/12/30</td>\n",
" <td>PM10</td>\n",
" <td>59.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6125</td>\n",
" <td>6</td>\n",
" <td>2019/12/30</td>\n",
" <td>PM10</td>\n",
" <td>69.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6139</td>\n",
" <td>4</td>\n",
" <td>2019/12/31</td>\n",
" <td>PM10</td>\n",
" <td>57.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6146</td>\n",
" <td>2</td>\n",
" <td>2019/12/31</td>\n",
" <td>PM10</td>\n",
" <td>51.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6151</td>\n",
" <td>6</td>\n",
" <td>2019/12/31</td>\n",
" <td>PM10</td>\n",
" <td>59.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>900 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" stazione_id data inquinante valore\n",
"2 4 2019/01/03 PM10 29.0\n",
"9 2 2019/01/03 PM10 20.0\n",
"15 6 2019/01/03 PM10 24.0\n",
"20 7 2019/01/03 PM10 32.0\n",
"29 4 2019/01/04 PM10 25.0\n",
"... ... ... ... ...\n",
"6120 2 2019/12/30 PM10 59.0\n",
"6125 6 2019/12/30 PM10 69.0\n",
"6139 4 2019/12/31 PM10 57.0\n",
"6146 2 2019/12/31 PM10 51.0\n",
"6151 6 2019/12/31 PM10 59.0\n",
"\n",
"[900 rows x 4 columns]"
]
},
"execution_count": 171,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Restrict the table to PM10 readings, then discard rows with missing values.\n",
"not_pm10 = dataset[\"inquinante\"].ne(\"PM10\")\n",
"indexName = dataset.loc[not_pm10].index\n",
"dataset.drop(index=indexName, inplace=True)\n",
"dataset = dataset.dropna()\n",
"dataset"
]
},
{
"cell_type": "code",
"execution_count": 186,
"metadata": {},
"outputs": [],
"source": [
"# Target: the measured value; single feature: a running 1-based row counter \"n\".\n",
"y = dataset[\"valore\"].values\n",
"row_numbers = list(range(1, len(y) + 1))\n",
"dataset[\"n\"] = row_numbers\n",
"x = dataset[[\"n\"]]\n",
"X = x"
]
},
{
"cell_type": "code",
"execution_count": 187,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestRegressor\n",
"regressor = RandomForestRegressor(n_estimators=10, random_state=0)"
]
},
{
"cell_type": "code",
"execution_count": 188,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
" max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=10,\n",
" n_jobs=None, oob_score=False, random_state=0, verbose=0,\n",
" warm_start=False)"
]
},
"execution_count": 188,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"regressor.fit(x,y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 211,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The predicted value of PM10 at time 1000000 is [57.2]\n"
]
}
],
"source": [
"# Step 4 - Predict\n",
"# NOTE(review): the regressor was fit on row counters 1..len(y); an index this far\n",
"# outside that range presumably just repeats the prediction for the largest\n",
"# training index, so it is not a real forecast -- confirm the intent.\n",
"time = 1000000\n",
"y_pred = regressor.predict([[time]])\n",
"# f-string instead of comma-separated pieces, which doubled the spaces around the values\n",
"print(f'The predicted value of PM10 at time {time} is {y_pred}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}