|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 1, |
| 6 | + "id": "56c9e512", |
| 7 | + "metadata": {}, |
| 8 | + "outputs": [], |
| 9 | + "source": [ |
| 10 | + "import numpy as np\n", |
| 11 | + "import pandas as pd\n", |
| 12 | + "\n", |
| 13 | + "import matplotlib.pyplot as plt # for plotting graphs.\n", |
| 14 | + "\n", |
| 15 | + "from sklearn.model_selection import train_test_split # for splitting the data into training and testing data.\n", |
| 16 | + "from sklearn.naive_bayes import GaussianNB # importing the Gaussian Naive Bayes model.\n", |
| 17 | + "\n", |
| 18 | + "np.set_printoptions(suppress=True, precision=6) # set the printing options" |
| 19 | + ] |
| 20 | + }, |
| 21 | + { |
| 22 | + "cell_type": "code", |
| 23 | + "execution_count": 2, |
| 24 | + "id": "c1944b98", |
| 25 | + "metadata": {}, |
| 26 | + "outputs": [ |
| 27 | + { |
| 28 | + "data": { |
| 29 | + "text/html": [ |
| 30 | + "<div>\n", |
| 31 | + "<style scoped>\n", |
| 32 | + " .dataframe tbody tr th:only-of-type {\n", |
| 33 | + " vertical-align: middle;\n", |
| 34 | + " }\n", |
| 35 | + "\n", |
| 36 | + " .dataframe tbody tr th {\n", |
| 37 | + " vertical-align: top;\n", |
| 38 | + " }\n", |
| 39 | + "\n", |
| 40 | + " .dataframe thead th {\n", |
| 41 | + " text-align: right;\n", |
| 42 | + " }\n", |
| 43 | + "</style>\n", |
| 44 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 45 | + " <thead>\n", |
| 46 | + " <tr style=\"text-align: right;\">\n", |
| 47 | + " <th></th>\n", |
| 48 | + " <th>passenger_id</th>\n", |
| 49 | + " <th>name</th>\n", |
| 50 | + " <th>p_class</th>\n", |
| 51 | + " <th>gender</th>\n", |
| 52 | + " <th>age</th>\n", |
| 53 | + " <th>sib_sp</th>\n", |
| 54 | + " <th>parch</th>\n", |
| 55 | + " <th>ticket</th>\n", |
| 56 | + " <th>fare</th>\n", |
| 57 | + " <th>cabin</th>\n", |
| 58 | + " <th>embarked</th>\n", |
| 59 | + " <th>survived</th>\n", |
| 60 | + " </tr>\n", |
| 61 | + " </thead>\n", |
| 62 | + " <tbody>\n", |
| 63 | + " <tr>\n", |
| 64 | + " <th>0</th>\n", |
| 65 | + " <td>1</td>\n", |
| 66 | + " <td>Braund, Mr. Owen Harris</td>\n", |
| 67 | + " <td>3</td>\n", |
| 68 | + " <td>male</td>\n", |
| 69 | + " <td>22.0</td>\n", |
| 70 | + " <td>1</td>\n", |
| 71 | + " <td>0</td>\n", |
| 72 | + " <td>A/5 21171</td>\n", |
| 73 | + " <td>7.2500</td>\n", |
| 74 | + " <td>NaN</td>\n", |
| 75 | + " <td>S</td>\n", |
| 76 | + " <td>0</td>\n", |
| 77 | + " </tr>\n", |
| 78 | + " <tr>\n", |
| 79 | + " <th>1</th>\n", |
| 80 | + " <td>2</td>\n", |
| 81 | + " <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n", |
| 82 | + " <td>1</td>\n", |
| 83 | + " <td>female</td>\n", |
| 84 | + " <td>38.0</td>\n", |
| 85 | + " <td>1</td>\n", |
| 86 | + " <td>0</td>\n", |
| 87 | + " <td>PC 17599</td>\n", |
| 88 | + " <td>71.2833</td>\n", |
| 89 | + " <td>C85</td>\n", |
| 90 | + " <td>C</td>\n", |
| 91 | + " <td>1</td>\n", |
| 92 | + " </tr>\n", |
| 93 | + " <tr>\n", |
| 94 | + " <th>2</th>\n", |
| 95 | + " <td>3</td>\n", |
| 96 | + " <td>Heikkinen, Miss. Laina</td>\n", |
| 97 | + " <td>3</td>\n", |
| 98 | + " <td>female</td>\n", |
| 99 | + " <td>26.0</td>\n", |
| 100 | + " <td>0</td>\n", |
| 101 | + " <td>0</td>\n", |
| 102 | + " <td>STON/O2. 3101282</td>\n", |
| 103 | + " <td>7.9250</td>\n", |
| 104 | + " <td>NaN</td>\n", |
| 105 | + " <td>S</td>\n", |
| 106 | + " <td>1</td>\n", |
| 107 | + " </tr>\n", |
| 108 | + " <tr>\n", |
| 109 | + " <th>3</th>\n", |
| 110 | + " <td>4</td>\n", |
| 111 | + " <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n", |
| 112 | + " <td>1</td>\n", |
| 113 | + " <td>female</td>\n", |
| 114 | + " <td>35.0</td>\n", |
| 115 | + " <td>1</td>\n", |
| 116 | + " <td>0</td>\n", |
| 117 | + " <td>113803</td>\n", |
| 118 | + " <td>53.1000</td>\n", |
| 119 | + " <td>C123</td>\n", |
| 120 | + " <td>S</td>\n", |
| 121 | + " <td>1</td>\n", |
| 122 | + " </tr>\n", |
| 123 | + " <tr>\n", |
| 124 | + " <th>4</th>\n", |
| 125 | + " <td>5</td>\n", |
| 126 | + " <td>Allen, Mr. William Henry</td>\n", |
| 127 | + " <td>3</td>\n", |
| 128 | + " <td>male</td>\n", |
| 129 | + " <td>35.0</td>\n", |
| 130 | + " <td>0</td>\n", |
| 131 | + " <td>0</td>\n", |
| 132 | + " <td>373450</td>\n", |
| 133 | + " <td>8.0500</td>\n", |
| 134 | + " <td>NaN</td>\n", |
| 135 | + " <td>S</td>\n", |
| 136 | + " <td>0</td>\n", |
| 137 | + " </tr>\n", |
| 138 | + " </tbody>\n", |
| 139 | + "</table>\n", |
| 140 | + "</div>" |
| 141 | + ], |
| 142 | + "text/plain": [ |
| 143 | + " passenger_id name p_class \\\n", |
| 144 | + "0 1 Braund, Mr. Owen Harris 3 \n", |
| 145 | + "1 2 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 \n", |
| 146 | + "2 3 Heikkinen, Miss. Laina 3 \n", |
| 147 | + "3 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 \n", |
| 148 | + "4 5 Allen, Mr. William Henry 3 \n", |
| 149 | + "\n", |
| 150 | + " gender age sib_sp parch ticket fare cabin embarked \\\n", |
| 151 | + "0 male 22.0 1 0 A/5 21171 7.2500 NaN S \n", |
| 152 | + "1 female 38.0 1 0 PC 17599 71.2833 C85 C \n", |
| 153 | + "2 female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S \n", |
| 154 | + "3 female 35.0 1 0 113803 53.1000 C123 S \n", |
| 155 | + "4 male 35.0 0 0 373450 8.0500 NaN S \n", |
| 156 | + "\n", |
| 157 | + " survived \n", |
| 158 | + "0 0 \n", |
| 159 | + "1 1 \n", |
| 160 | + "2 1 \n", |
| 161 | + "3 1 \n", |
| 162 | + "4 0 " |
| 163 | + ] |
| 164 | + }, |
| 165 | + "execution_count": 2, |
| 166 | + "metadata": {}, |
| 167 | + "output_type": "execute_result" |
| 168 | + } |
| 169 | + ], |
| 170 | + "source": [ |
| 171 | + "df = pd.read_csv(\"data/titanic-data.csv\")\n", |
| 172 | + "df.head()" |
| 173 | + ] |
| 174 | + }, |
| 175 | + { |
| 176 | + "cell_type": "code", |
| 177 | + "execution_count": 3, |
| 178 | + "id": "d91d3988", |
| 179 | + "metadata": {}, |
| 180 | + "outputs": [ |
| 181 | + { |
| 182 | + "data": { |
| 183 | + "text/html": [ |
| 184 | + "<div>\n", |
| 185 | + "<style scoped>\n", |
| 186 | + " .dataframe tbody tr th:only-of-type {\n", |
| 187 | + " vertical-align: middle;\n", |
| 188 | + " }\n", |
| 189 | + "\n", |
| 190 | + " .dataframe tbody tr th {\n", |
| 191 | + " vertical-align: top;\n", |
| 192 | + " }\n", |
| 193 | + "\n", |
| 194 | + " .dataframe thead th {\n", |
| 195 | + " text-align: right;\n", |
| 196 | + " }\n", |
| 197 | + "</style>\n", |
| 198 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 199 | + " <thead>\n", |
| 200 | + " <tr style=\"text-align: right;\">\n", |
| 201 | + " <th></th>\n", |
| 202 | + " <th>p_class</th>\n", |
| 203 | + " <th>gender</th>\n", |
| 204 | + " <th>age</th>\n", |
| 205 | + " <th>fare</th>\n", |
| 206 | + " <th>survived</th>\n", |
| 207 | + " </tr>\n", |
| 208 | + " </thead>\n", |
| 209 | + " <tbody>\n", |
| 210 | + " <tr>\n", |
| 211 | + " <th>0</th>\n", |
| 212 | + " <td>3</td>\n", |
| 213 | + " <td>male</td>\n", |
| 214 | + " <td>22.0</td>\n", |
| 215 | + " <td>7.2500</td>\n", |
| 216 | + " <td>0</td>\n", |
| 217 | + " </tr>\n", |
| 218 | + " <tr>\n", |
| 219 | + " <th>1</th>\n", |
| 220 | + " <td>1</td>\n", |
| 221 | + " <td>female</td>\n", |
| 222 | + " <td>38.0</td>\n", |
| 223 | + " <td>71.2833</td>\n", |
| 224 | + " <td>1</td>\n", |
| 225 | + " </tr>\n", |
| 226 | + " <tr>\n", |
| 227 | + " <th>2</th>\n", |
| 228 | + " <td>3</td>\n", |
| 229 | + " <td>female</td>\n", |
| 230 | + " <td>26.0</td>\n", |
| 231 | + " <td>7.9250</td>\n", |
| 232 | + " <td>1</td>\n", |
| 233 | + " </tr>\n", |
| 234 | + " <tr>\n", |
| 235 | + " <th>3</th>\n", |
| 236 | + " <td>1</td>\n", |
| 237 | + " <td>female</td>\n", |
| 238 | + " <td>35.0</td>\n", |
| 239 | + " <td>53.1000</td>\n", |
| 240 | + " <td>1</td>\n", |
| 241 | + " </tr>\n", |
| 242 | + " <tr>\n", |
| 243 | + " <th>4</th>\n", |
| 244 | + " <td>3</td>\n", |
| 245 | + " <td>male</td>\n", |
| 246 | + " <td>35.0</td>\n", |
| 247 | + " <td>8.0500</td>\n", |
| 248 | + " <td>0</td>\n", |
| 249 | + " </tr>\n", |
| 250 | + " </tbody>\n", |
| 251 | + "</table>\n", |
| 252 | + "</div>" |
| 253 | + ], |
| 254 | + "text/plain": [ |
| 255 | + " p_class gender age fare survived\n", |
| 256 | + "0 3 male 22.0 7.2500 0\n", |
| 257 | + "1 1 female 38.0 71.2833 1\n", |
| 258 | + "2 3 female 26.0 7.9250 1\n", |
| 259 | + "3 1 female 35.0 53.1000 1\n", |
| 260 | + "4 3 male 35.0 8.0500 0" |
| 261 | + ] |
| 262 | + }, |
| 263 | + "execution_count": 3, |
| 264 | + "metadata": {}, |
| 265 | + "output_type": "execute_result" |
| 266 | + } |
| 267 | + ], |
| 268 | + "source": [ |
| 269 | + "# dropping certain columns that don't have any impact on the survival rate.\n", |
| 270 | + "df.drop([\"passenger_id\", \"name\", \"sib_sp\", \"parch\", \"ticket\", \"cabin\", \"embarked\"], axis=1, inplace=True)\n", |
| 271 | + "df.head()" |
| 272 | + ] |
| 273 | + }, |
| 274 | + { |
| 275 | + "cell_type": "code", |
| 276 | + "execution_count": 4, |
| 277 | + "id": "0a5a4824", |
| 278 | + "metadata": {}, |
| 279 | + "outputs": [ |
| 280 | + { |
| 281 | + "data": { |
| 282 | + "text/plain": [ |
| 283 | + "0 0\n", |
| 284 | + "1 1\n", |
| 285 | + "2 1\n", |
| 286 | + "3 1\n", |
| 287 | + "4 0\n", |
| 288 | + "Name: survived, dtype: int64" |
| 289 | + ] |
| 290 | + }, |
| 291 | + "metadata": {}, |
| 292 | + "output_type": "display_data" |
| 293 | + }, |
| 294 | + { |
| 295 | + "data": { |
| 296 | + "text/html": [ |
| 297 | + "<div>\n", |
| 298 | + "<style scoped>\n", |
| 299 | + " .dataframe tbody tr th:only-of-type {\n", |
| 300 | + " vertical-align: middle;\n", |
| 301 | + " }\n", |
| 302 | + "\n", |
| 303 | + " .dataframe tbody tr th {\n", |
| 304 | + " vertical-align: top;\n", |
| 305 | + " }\n", |
| 306 | + "\n", |
| 307 | + " .dataframe thead th {\n", |
| 308 | + " text-align: right;\n", |
| 309 | + " }\n", |
| 310 | + "</style>\n", |
| 311 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 312 | + " <thead>\n", |
| 313 | + " <tr style=\"text-align: right;\">\n", |
| 314 | + " <th></th>\n", |
| 315 | + " <th>p_class</th>\n", |
| 316 | + " <th>gender</th>\n", |
| 317 | + " <th>age</th>\n", |
| 318 | + " <th>fare</th>\n", |
| 319 | + " </tr>\n", |
| 320 | + " </thead>\n", |
| 321 | + " <tbody>\n", |
| 322 | + " <tr>\n", |
| 323 | + " <th>0</th>\n", |
| 324 | + " <td>3</td>\n", |
| 325 | + " <td>male</td>\n", |
| 326 | + " <td>22.0</td>\n", |
| 327 | + " <td>7.2500</td>\n", |
| 328 | + " </tr>\n", |
| 329 | + " <tr>\n", |
| 330 | + " <th>1</th>\n", |
| 331 | + " <td>1</td>\n", |
| 332 | + " <td>female</td>\n", |
| 333 | + " <td>38.0</td>\n", |
| 334 | + " <td>71.2833</td>\n", |
| 335 | + " </tr>\n", |
| 336 | + " <tr>\n", |
| 337 | + " <th>2</th>\n", |
| 338 | + " <td>3</td>\n", |
| 339 | + " <td>female</td>\n", |
| 340 | + " <td>26.0</td>\n", |
| 341 | + " <td>7.9250</td>\n", |
| 342 | + " </tr>\n", |
| 343 | + " <tr>\n", |
| 344 | + " <th>3</th>\n", |
| 345 | + " <td>1</td>\n", |
| 346 | + " <td>female</td>\n", |
| 347 | + " <td>35.0</td>\n", |
| 348 | + " <td>53.1000</td>\n", |
| 349 | + " </tr>\n", |
| 350 | + " <tr>\n", |
| 351 | + " <th>4</th>\n", |
| 352 | + " <td>3</td>\n", |
| 353 | + " <td>male</td>\n", |
| 354 | + " <td>35.0</td>\n", |
| 355 | + " <td>8.0500</td>\n", |
| 356 | + " </tr>\n", |
| 357 | + " </tbody>\n", |
| 358 | + "</table>\n", |
| 359 | + "</div>" |
| 360 | + ], |
| 361 | + "text/plain": [ |
| 362 | + " p_class gender age fare\n", |
| 363 | + "0 3 male 22.0 7.2500\n", |
| 364 | + "1 1 female 38.0 71.2833\n", |
| 365 | + "2 3 female 26.0 7.9250\n", |
| 366 | + "3 1 female 35.0 53.1000\n", |
| 367 | + "4 3 male 35.0 8.0500" |
| 368 | + ] |
| 369 | + }, |
| 370 | + "metadata": {}, |
| 371 | + "output_type": "display_data" |
| 372 | + } |
| 373 | + ], |
| 374 | + "source": [ |
| 375 | + "target = df[\"survived\"] # creating a series for the survived column.\n", |
| 376 | + "inputs = df.drop(\"survived\", axis=1) # creating a separate dataframe by removing the survived column.\n", |
| 377 | + "\n", |
| 378 | + "display(target.head())\n", |
| 379 | + "display(inputs.head())" |
| 380 | + ] |
| 381 | + }, |
| 382 | + { |
| 383 | + "cell_type": "code", |
| 384 | + "execution_count": 5, |
| 385 | + "id": "69849efe", |
| 386 | + "metadata": {}, |
| 387 | + "outputs": [ |
| 388 | + { |
| 389 | + "data": { |
| 390 | + "text/html": [ |
| 391 | + "<div>\n", |
| 392 | + "<style scoped>\n", |
| 393 | + " .dataframe tbody tr th:only-of-type {\n", |
| 394 | + " vertical-align: middle;\n", |
| 395 | + " }\n", |
| 396 | + "\n", |
| 397 | + " .dataframe tbody tr th {\n", |
| 398 | + " vertical-align: top;\n", |
| 399 | + " }\n", |
| 400 | + "\n", |
| 401 | + " .dataframe thead th {\n", |
| 402 | + " text-align: right;\n", |
| 403 | + " }\n", |
| 404 | + "</style>\n", |
| 405 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 406 | + " <thead>\n", |
| 407 | + " <tr style=\"text-align: right;\">\n", |
| 408 | + " <th></th>\n", |
| 409 | + " <th>female</th>\n", |
| 410 | + " <th>male</th>\n", |
| 411 | + " </tr>\n", |
| 412 | + " </thead>\n", |
| 413 | + " <tbody>\n", |
| 414 | + " <tr>\n", |
| 415 | + " <th>0</th>\n", |
| 416 | + " <td>0</td>\n", |
| 417 | + " <td>1</td>\n", |
| 418 | + " </tr>\n", |
| 419 | + " <tr>\n", |
| 420 | + " <th>1</th>\n", |
| 421 | + " <td>1</td>\n", |
| 422 | + " <td>0</td>\n", |
| 423 | + " </tr>\n", |
| 424 | + " <tr>\n", |
| 425 | + " <th>2</th>\n", |
| 426 | + " <td>1</td>\n", |
| 427 | + " <td>0</td>\n", |
| 428 | + " </tr>\n", |
| 429 | + " <tr>\n", |
| 430 | + " <th>3</th>\n", |
| 431 | + " <td>1</td>\n", |
| 432 | + " <td>0</td>\n", |
| 433 | + " </tr>\n", |
| 434 | + " <tr>\n", |
| 435 | + " <th>4</th>\n", |
| 436 | + " <td>0</td>\n", |
| 437 | + " <td>1</td>\n", |
| 438 | + " </tr>\n", |
| 439 | + " </tbody>\n", |
| 440 | + "</table>\n", |
| 441 | + "</div>" |
| 442 | + ], |
| 443 | + "text/plain": [ |
| 444 | + " female male\n", |
| 445 | + "0 0 1\n", |
| 446 | + "1 1 0\n", |
| 447 | + "2 1 0\n", |
| 448 | + "3 1 0\n", |
| 449 | + "4 0 1" |
| 450 | + ] |
| 451 | + }, |
| 452 | + "metadata": {}, |
| 453 | + "output_type": "display_data" |
| 454 | + }, |
| 455 | + { |
| 456 | + "name": "stdout", |
| 457 | + "output_type": "stream", |
| 458 | + "text": [ |
| 459 | + "female uint8\n", |
| 460 | + "male uint8\n", |
| 461 | + "dtype: object\n" |
| 462 | + ] |
| 463 | + } |
| 464 | + ], |
| 465 | + "source": [ |
| 466 | + "dummies = pd.get_dummies(inputs[\"gender\"]) # converting the gender column into dummy variables.\n", |
| 467 | + "\n", |
| 468 | + "display(dummies.head())\n", |
| 469 | + "print(dummies.dtypes)" |
| 470 | + ] |
| 471 | + }, |
| 472 | + { |
| 473 | + "cell_type": "code", |
| 474 | + "execution_count": 6, |
| 475 | + "id": "9bdf780c", |
| 476 | + "metadata": {}, |
| 477 | + "outputs": [ |
| 478 | + { |
| 479 | + "data": { |
| 480 | + "text/html": [ |
| 481 | + "<div>\n", |
| 482 | + "<style scoped>\n", |
| 483 | + " .dataframe tbody tr th:only-of-type {\n", |
| 484 | + " vertical-align: middle;\n", |
| 485 | + " }\n", |
| 486 | + "\n", |
| 487 | + " .dataframe tbody tr th {\n", |
| 488 | + " vertical-align: top;\n", |
| 489 | + " }\n", |
| 490 | + "\n", |
| 491 | + " .dataframe thead th {\n", |
| 492 | + " text-align: right;\n", |
| 493 | + " }\n", |
| 494 | + "</style>\n", |
| 495 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 496 | + " <thead>\n", |
| 497 | + " <tr style=\"text-align: right;\">\n", |
| 498 | + " <th></th>\n", |
| 499 | + " <th>p_class</th>\n", |
| 500 | + " <th>gender</th>\n", |
| 501 | + " <th>age</th>\n", |
| 502 | + " <th>fare</th>\n", |
| 503 | + " <th>female</th>\n", |
| 504 | + " <th>male</th>\n", |
| 505 | + " </tr>\n", |
| 506 | + " </thead>\n", |
| 507 | + " <tbody>\n", |
| 508 | + " <tr>\n", |
| 509 | + " <th>0</th>\n", |
| 510 | + " <td>3</td>\n", |
| 511 | + " <td>male</td>\n", |
| 512 | + " <td>22.0</td>\n", |
| 513 | + " <td>7.2500</td>\n", |
| 514 | + " <td>0</td>\n", |
| 515 | + " <td>1</td>\n", |
| 516 | + " </tr>\n", |
| 517 | + " <tr>\n", |
| 518 | + " <th>1</th>\n", |
| 519 | + " <td>1</td>\n", |
| 520 | + " <td>female</td>\n", |
| 521 | + " <td>38.0</td>\n", |
| 522 | + " <td>71.2833</td>\n", |
| 523 | + " <td>1</td>\n", |
| 524 | + " <td>0</td>\n", |
| 525 | + " </tr>\n", |
| 526 | + " <tr>\n", |
| 527 | + " <th>2</th>\n", |
| 528 | + " <td>3</td>\n", |
| 529 | + " <td>female</td>\n", |
| 530 | + " <td>26.0</td>\n", |
| 531 | + " <td>7.9250</td>\n", |
| 532 | + " <td>1</td>\n", |
| 533 | + " <td>0</td>\n", |
| 534 | + " </tr>\n", |
| 535 | + " <tr>\n", |
| 536 | + " <th>3</th>\n", |
| 537 | + " <td>1</td>\n", |
| 538 | + " <td>female</td>\n", |
| 539 | + " <td>35.0</td>\n", |
| 540 | + " <td>53.1000</td>\n", |
| 541 | + " <td>1</td>\n", |
| 542 | + " <td>0</td>\n", |
| 543 | + " </tr>\n", |
| 544 | + " <tr>\n", |
| 545 | + " <th>4</th>\n", |
| 546 | + " <td>3</td>\n", |
| 547 | + " <td>male</td>\n", |
| 548 | + " <td>35.0</td>\n", |
| 549 | + " <td>8.0500</td>\n", |
| 550 | + " <td>0</td>\n", |
| 551 | + " <td>1</td>\n", |
| 552 | + " </tr>\n", |
| 553 | + " </tbody>\n", |
| 554 | + "</table>\n", |
| 555 | + "</div>" |
| 556 | + ], |
| 557 | + "text/plain": [ |
| 558 | + " p_class gender age fare female male\n", |
| 559 | + "0 3 male 22.0 7.2500 0 1\n", |
| 560 | + "1 1 female 38.0 71.2833 1 0\n", |
| 561 | + "2 3 female 26.0 7.9250 1 0\n", |
| 562 | + "3 1 female 35.0 53.1000 1 0\n", |
| 563 | + "4 3 male 35.0 8.0500 0 1" |
| 564 | + ] |
| 565 | + }, |
| 566 | + "execution_count": 6, |
| 567 | + "metadata": {}, |
| 568 | + "output_type": "execute_result" |
| 569 | + } |
| 570 | + ], |
| 571 | + "source": [ |
| 572 | + "# concatenating the inputs dataframe with the dummies dataframe.\n", |
| 573 | + "inputs = pd.concat([inputs, dummies], axis=1)\n", |
| 574 | + "inputs.head()" |
| 575 | + ] |
| 576 | + }, |
| 577 | + { |
| 578 | + "cell_type": "code", |
| 579 | + "execution_count": 7, |
| 580 | + "id": "24fcccc2", |
| 581 | + "metadata": {}, |
| 582 | + "outputs": [ |
| 583 | + { |
| 584 | + "data": { |
| 585 | + "text/html": [ |
| 586 | + "<div>\n", |
| 587 | + "<style scoped>\n", |
| 588 | + " .dataframe tbody tr th:only-of-type {\n", |
| 589 | + " vertical-align: middle;\n", |
| 590 | + " }\n", |
| 591 | + "\n", |
| 592 | + " .dataframe tbody tr th {\n", |
| 593 | + " vertical-align: top;\n", |
| 594 | + " }\n", |
| 595 | + "\n", |
| 596 | + " .dataframe thead th {\n", |
| 597 | + " text-align: right;\n", |
| 598 | + " }\n", |
| 599 | + "</style>\n", |
| 600 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 601 | + " <thead>\n", |
| 602 | + " <tr style=\"text-align: right;\">\n", |
| 603 | + " <th></th>\n", |
| 604 | + " <th>p_class</th>\n", |
| 605 | + " <th>age</th>\n", |
| 606 | + " <th>fare</th>\n", |
| 607 | + " <th>female</th>\n", |
| 608 | + " <th>male</th>\n", |
| 609 | + " </tr>\n", |
| 610 | + " </thead>\n", |
| 611 | + " <tbody>\n", |
| 612 | + " <tr>\n", |
| 613 | + " <th>0</th>\n", |
| 614 | + " <td>3</td>\n", |
| 615 | + " <td>22.0</td>\n", |
| 616 | + " <td>7.2500</td>\n", |
| 617 | + " <td>0</td>\n", |
| 618 | + " <td>1</td>\n", |
| 619 | + " </tr>\n", |
| 620 | + " <tr>\n", |
| 621 | + " <th>1</th>\n", |
| 622 | + " <td>1</td>\n", |
| 623 | + " <td>38.0</td>\n", |
| 624 | + " <td>71.2833</td>\n", |
| 625 | + " <td>1</td>\n", |
| 626 | + " <td>0</td>\n", |
| 627 | + " </tr>\n", |
| 628 | + " <tr>\n", |
| 629 | + " <th>2</th>\n", |
| 630 | + " <td>3</td>\n", |
| 631 | + " <td>26.0</td>\n", |
| 632 | + " <td>7.9250</td>\n", |
| 633 | + " <td>1</td>\n", |
| 634 | + " <td>0</td>\n", |
| 635 | + " </tr>\n", |
| 636 | + " <tr>\n", |
| 637 | + " <th>3</th>\n", |
| 638 | + " <td>1</td>\n", |
| 639 | + " <td>35.0</td>\n", |
| 640 | + " <td>53.1000</td>\n", |
| 641 | + " <td>1</td>\n", |
| 642 | + " <td>0</td>\n", |
| 643 | + " </tr>\n", |
| 644 | + " <tr>\n", |
| 645 | + " <th>4</th>\n", |
| 646 | + " <td>3</td>\n", |
| 647 | + " <td>35.0</td>\n", |
| 648 | + " <td>8.0500</td>\n", |
| 649 | + " <td>0</td>\n", |
| 650 | + " <td>1</td>\n", |
| 651 | + " </tr>\n", |
| 652 | + " </tbody>\n", |
| 653 | + "</table>\n", |
| 654 | + "</div>" |
| 655 | + ], |
| 656 | + "text/plain": [ |
| 657 | + " p_class age fare female male\n", |
| 658 | + "0 3 22.0 7.2500 0 1\n", |
| 659 | + "1 1 38.0 71.2833 1 0\n", |
| 660 | + "2 3 26.0 7.9250 1 0\n", |
| 661 | + "3 1 35.0 53.1000 1 0\n", |
| 662 | + "4 3 35.0 8.0500 0 1" |
| 663 | + ] |
| 664 | + }, |
| 665 | + "execution_count": 7, |
| 666 | + "metadata": {}, |
| 667 | + "output_type": "execute_result" |
| 668 | + } |
| 669 | + ], |
| 670 | + "source": [ |
| 671 | + "# dropping the gender column because we now have the female and male columns.\n", |
| 672 | + "inputs.drop([\"gender\"], axis=1, inplace=True)\n", |
| 673 | + "inputs.head()" |
| 674 | + ] |
| 675 | + }, |
| 676 | + { |
| 677 | + "cell_type": "code", |
| 678 | + "execution_count": 8, |
| 679 | + "id": "c8f8c764", |
| 680 | + "metadata": {}, |
| 681 | + "outputs": [ |
| 682 | + { |
| 683 | + "data": { |
| 684 | + "text/plain": [ |
| 685 | + "0 22.0\n", |
| 686 | + "1 38.0\n", |
| 687 | + "2 26.0\n", |
| 688 | + "3 35.0\n", |
| 689 | + "4 35.0\n", |
| 690 | + "5 NaN\n", |
| 691 | + "6 54.0\n", |
| 692 | + "7 2.0\n", |
| 693 | + "8 27.0\n", |
| 694 | + "9 14.0\n", |
| 695 | + "Name: age, dtype: float64" |
| 696 | + ] |
| 697 | + }, |
| 698 | + "execution_count": 8, |
| 699 | + "metadata": {}, |
| 700 | + "output_type": "execute_result" |
| 701 | + } |
| 702 | + ], |
| 703 | + "source": [ |
| 704 | + "# it can be observed that some columns contains null values.\n", |
| 705 | + "inputs.age[:10]" |
| 706 | + ] |
| 707 | + }, |
| 708 | + { |
| 709 | + "cell_type": "code", |
| 710 | + "execution_count": 9, |
| 711 | + "id": "c9c01a43", |
| 712 | + "metadata": {}, |
| 713 | + "outputs": [ |
| 714 | + { |
| 715 | + "data": { |
| 716 | + "text/plain": [ |
| 717 | + "0 22.000000\n", |
| 718 | + "1 38.000000\n", |
| 719 | + "2 26.000000\n", |
| 720 | + "3 35.000000\n", |
| 721 | + "4 35.000000\n", |
| 722 | + "5 29.699118\n", |
| 723 | + "6 54.000000\n", |
| 724 | + "7 2.000000\n", |
| 725 | + "8 27.000000\n", |
| 726 | + "9 14.000000\n", |
| 727 | + "Name: age, dtype: float64" |
| 728 | + ] |
| 729 | + }, |
| 730 | + "execution_count": 9, |
| 731 | + "metadata": {}, |
| 732 | + "output_type": "execute_result" |
| 733 | + } |
| 734 | + ], |
| 735 | + "source": [ |
| 736 | + "# one way of handling these null values is to fill those values with the mean value of the whole column.\n", |
| 737 | + "# we can also make these values an integer type values.\n", |
| 738 | + "inputs[\"age\"] = inputs[\"age\"].fillna(inputs[\"age\"].mean())\n", |
| 739 | + "\n", |
| 740 | + "inputs.age[:10]\n", |
| 741 | + "# as we can see the 5th row in the earlier cell was null and now it is replaced with the mean value." |
| 742 | + ] |
| 743 | + }, |
| 744 | + { |
| 745 | + "cell_type": "code", |
| 746 | + "execution_count": 10, |
| 747 | + "id": "5bda4746", |
| 748 | + "metadata": {}, |
| 749 | + "outputs": [ |
| 750 | + { |
| 751 | + "name": "stdout", |
| 752 | + "output_type": "stream", |
| 753 | + "text": [ |
| 754 | + "712 179 891\n", |
| 755 | + "0.7991021324354658\n", |
| 756 | + "0.20089786756453423\n" |
| 757 | + ] |
| 758 | + } |
| 759 | + ], |
| 760 | + "source": [ |
| 761 | + "X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2)\n", |
| 762 | + "# 20% test size and 80% train size.\n", |
| 763 | + "print(len(X_train), len(X_test), len(inputs))\n", |
| 764 | + "\n", |
| 765 | + "# calculating training and testing data percentage.\n", |
| 766 | + "print(len(X_train) / len(inputs)) # training data %\n", |
| 767 | + "print(len(X_test) / len(inputs)) # testing data %" |
| 768 | + ] |
| 769 | + }, |
| 770 | + { |
| 771 | + "cell_type": "code", |
| 772 | + "execution_count": 11, |
| 773 | + "id": "42deb68a", |
| 774 | + "metadata": {}, |
| 775 | + "outputs": [ |
| 776 | + { |
| 777 | + "data": { |
| 778 | + "text/html": [ |
| 779 | + "<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 
div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>GaussianNB()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">GaussianNB</label><div class=\"sk-toggleable__content\"><pre>GaussianNB()</pre></div></div></div></div></div>" |
| 780 | + ], |
| 781 | + "text/plain": [ |
| 782 | + "GaussianNB()" |
| 783 | + ] |
| 784 | + }, |
| 785 | + "execution_count": 11, |
| 786 | + "metadata": {}, |
| 787 | + "output_type": "execute_result" |
| 788 | + } |
| 789 | + ], |
| 790 | + "source": [ |
| 791 | + "# we are using the Gaussian Naive Bayes model.\n", |
| 792 | + "model = GaussianNB()\n", |
| 793 | + "model.fit(X_train, y_train)" |
| 794 | + ] |
| 795 | + }, |
| 796 | + { |
| 797 | + "cell_type": "code", |
| 798 | + "execution_count": 12, |
| 799 | + "id": "422daf07", |
| 800 | + "metadata": {}, |
| 801 | + "outputs": [ |
| 802 | + { |
| 803 | + "data": { |
| 804 | + "text/plain": [ |
| 805 | + "0.7374301675977654" |
| 806 | + ] |
| 807 | + }, |
| 808 | + "execution_count": 12, |
| 809 | + "metadata": {}, |
| 810 | + "output_type": "execute_result" |
| 811 | + } |
| 812 | + ], |
| 813 | + "source": [ |
| 814 | + "model.score(X_test, y_test)" |
| 815 | + ] |
| 816 | + }, |
| 817 | + { |
| 818 | + "cell_type": "code", |
| 819 | + "execution_count": 13, |
| 820 | + "id": "ec58fa13", |
| 821 | + "metadata": {}, |
| 822 | + "outputs": [ |
| 823 | + { |
| 824 | + "name": "stdout", |
| 825 | + "output_type": "stream", |
| 826 | + "text": [ |
| 827 | + "[0 0 1 0 0]\n", |
| 828 | + "0.9931484209536514, 0.015823706167673823, 0.9932294253552363, 0.9833500719459427, 0.9919031866574911, " |
| 829 | + ] |
| 830 | + } |
| 831 | + ], |
| 832 | + "source": [ |
| 833 | + "pred = np.array(model.predict(X_test))\n", |
| 834 | + "pred_probability = np.array(model.predict_proba(X_test)) # calculating the probabilities\n", |
| 835 | + "\n", |
| 836 | + "print(pred[:5])\n", |
| 837 | + "for i in range(1, 6):\n", |
| 838 | + " print(pred_probability[i][0], end=\", \")" |
| 839 | + ] |
| 840 | + }, |
| 841 | + { |
| 842 | + "cell_type": "code", |
| 843 | + "execution_count": null, |
| 844 | + "id": "25eed843", |
| 845 | + "metadata": {}, |
| 846 | + "outputs": [], |
| 847 | + "source": [] |
| 848 | + } |
| 849 | + ], |
| 850 | + "metadata": { |
| 851 | + "kernelspec": { |
| 852 | + "display_name": "Python 3 (ipykernel)", |
| 853 | + "language": "python", |
| 854 | + "name": "python3" |
| 855 | + }, |
| 856 | + "language_info": { |
| 857 | + "codemirror_mode": { |
| 858 | + "name": "ipython", |
| 859 | + "version": 3 |
| 860 | + }, |
| 861 | + "file_extension": ".py", |
| 862 | + "mimetype": "text/x-python", |
| 863 | + "name": "python", |
| 864 | + "nbconvert_exporter": "python", |
| 865 | + "pygments_lexer": "ipython3", |
| 866 | + "version": "3.11.2" |
| 867 | + } |
| 868 | + }, |
| 869 | + "nbformat": 4, |
| 870 | + "nbformat_minor": 5 |
| 871 | +} |
0 commit comments