diff --git a/Pipfile b/Pipfile index 1970535..0e4e2d2 100644 --- a/Pipfile +++ b/Pipfile @@ -6,14 +6,14 @@ verify_ssl = true [dev-packages] [packages] -scipy = "*" scikit-learn = "*" pandas = "*" +numpy = "==1.15.*" +pytest = "==3.5.1" gunicorn = "*" peewee = "*" psycopg2 = "*" Flask = "*" -category_encoders = "==1.2.6" [requires] python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock index 16e67dd..8a45550 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "e29aaf9b98c5b4006a59c6ddef416a11d7fd40911aeec75aed59b3f35b980f1a" + "sha256": "99ac5b6af84e19a2a2eddff6215e1eacb5b23122b866d6fdf27e94e02f7cae03" }, "pipfile-spec": 6, "requires": { @@ -16,12 +16,12 @@ ] }, "default": { - "category-encoders": { + "attrs": { "hashes": [ - "sha256:99ccf0e451035d26dcfe9b21bea6835307b1ec090c289c5462001d3ad15f5bd0" + "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79", + "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399" ], - "index": "pypi", - "version": "==1.2.6" + "version": "==19.1.0" }, "click": { "hashes": [ @@ -93,33 +93,46 @@ ], "version": "==1.1.1" }, + "more-itertools": { + "hashes": [ + "sha256:0125e8f60e9e031347105eb1682cef932f5e97d7b9a1a28d9bf00c22a5daef40", + "sha256:590044e3942351a1bdb1de960b739ff4ce277960f2425ad4509446dbace8d9d1" + ], + "version": "==6.0.0" + }, "numpy": { "hashes": [ - "sha256:1980f8d84548d74921685f68096911585fee393975f53797614b34d4f409b6da", - "sha256:22752cd809272671b273bb86df0f505f505a12368a3a5fc0aa811c7ece4dfd5c", - "sha256:23cc40313036cffd5d1873ef3ce2e949bdee0646c5d6f375bf7ee4f368db2511", - "sha256:2b0b118ff547fecabc247a2668f48f48b3b1f7d63676ebc5be7352a5fd9e85a5", - "sha256:3a0bd1edf64f6a911427b608a894111f9fcdb25284f724016f34a84c9a3a6ea9", - "sha256:3f25f6c7b0d000017e5ac55977a3999b0b1a74491eacb3c1aa716f0e01f6dcd1", - "sha256:4061c79ac2230594a7419151028e808239450e676c39e58302ad296232e3c2e8", - 
"sha256:560ceaa24f971ab37dede7ba030fc5d8fa173305d94365f814d9523ffd5d5916", - "sha256:62be044cd58da2a947b7e7b2252a10b42920df9520fc3d39f5c4c70d5460b8ba", - "sha256:6c692e3879dde0b67a9dc78f9bfb6f61c666b4562fd8619632d7043fb5b691b0", - "sha256:6f65e37b5a331df950ef6ff03bd4136b3c0bbcf44d4b8e99135d68a537711b5a", - "sha256:7a78cc4ddb253a55971115f8320a7ce28fd23a065fc33166d601f51760eecfa9", - "sha256:80a41edf64a3626e729a62df7dd278474fc1726836552b67a8c6396fd7e86760", - "sha256:893f4d75255f25a7b8516feb5766c6b63c54780323b9bd4bc51cdd7efc943c73", - "sha256:972ea92f9c1b54cc1c1a3d8508e326c0114aaf0f34996772a30f3f52b73b942f", - "sha256:9f1d4865436f794accdabadc57a8395bd3faa755449b4f65b88b7df65ae05f89", - "sha256:9f4cd7832b35e736b739be03b55875706c8c3e5fe334a06210f1a61e5c2c8ca5", - "sha256:adab43bf657488300d3aeeb8030d7f024fcc86e3a9b8848741ea2ea903e56610", - "sha256:bd2834d496ba9b1bdda3a6cf3de4dc0d4a0e7be306335940402ec95132ad063d", - "sha256:d20c0360940f30003a23c0adae2fe50a0a04f3e48dc05c298493b51fd6280197", - "sha256:d3b3ed87061d2314ff3659bb73896e622252da52558f2380f12c421fbdee3d89", - "sha256:dc235bf29a406dfda5790d01b998a1c01d7d37f449128c0b1b7d1c89a84fae8b", - "sha256:fb3c83554f39f48f3fa3123b9c24aecf681b1c289f9334f8215c1d3c8e2f6e5b" + "sha256:0df89ca13c25eaa1621a3f09af4c8ba20da849692dcae184cb55e80952c453fb", + "sha256:154c35f195fd3e1fad2569930ca51907057ae35e03938f89a8aedae91dd1b7c7", + "sha256:18e84323cdb8de3325e741a7a8dd4a82db74fde363dce32b625324c7b32aa6d7", + "sha256:1e8956c37fc138d65ded2d96ab3949bd49038cc6e8a4494b1515b0ba88c91565", + "sha256:23557bdbca3ccbde3abaa12a6e82299bc92d2b9139011f8c16ca1bb8c75d1e95", + "sha256:24fd645a5e5d224aa6e39d93e4a722fafa9160154f296fd5ef9580191c755053", + "sha256:36e36b6868e4440760d4b9b44587ea1dc1f06532858d10abba98e851e154ca70", + "sha256:3d734559db35aa3697dadcea492a423118c5c55d176da2f3be9c98d4803fc2a7", + "sha256:416a2070acf3a2b5d586f9a6507bb97e33574df5bd7508ea970bbf4fc563fa52", + "sha256:4a22dc3f5221a644dfe4a63bf990052cc674ef12a157b1056969079985c92816", 
+ "sha256:4d8d3e5aa6087490912c14a3c10fbdd380b40b421c13920ff468163bc50e016f", + "sha256:4f41fd159fba1245e1958a99d349df49c616b133636e0cf668f169bce2aeac2d", + "sha256:561ef098c50f91fbac2cc9305b68c915e9eb915a74d9038ecf8af274d748f76f", + "sha256:56994e14b386b5c0a9b875a76d22d707b315fa037affc7819cda08b6d0489756", + "sha256:73a1f2a529604c50c262179fcca59c87a05ff4614fe8a15c186934d84d09d9a5", + "sha256:7da99445fd890206bfcc7419f79871ba8e73d9d9e6b82fe09980bc5bb4efc35f", + "sha256:99d59e0bcadac4aa3280616591fb7bcd560e2218f5e31d5223a2e12a1425d495", + "sha256:a4cc09489843c70b22e8373ca3dfa52b3fab778b57cf81462f1203b0852e95e3", + "sha256:a61dc29cfca9831a03442a21d4b5fd77e3067beca4b5f81f1a89a04a71cf93fa", + "sha256:b1853df739b32fa913cc59ad9137caa9cc3d97ff871e2bbd89c2a2a1d4a69451", + "sha256:b1f44c335532c0581b77491b7715a871d0dd72e97487ac0f57337ccf3ab3469b", + "sha256:b261e0cb0d6faa8fd6863af26d30351fd2ffdb15b82e51e81e96b9e9e2e7ba16", + "sha256:c857ae5dba375ea26a6228f98c195fec0898a0fd91bcf0e8a0cae6d9faf3eca7", + "sha256:cf5bb4a7d53a71bb6a0144d31df784a973b36d8687d615ef6a7e9b1809917a9b", + "sha256:db9814ff0457b46f2e1d494c1efa4111ca089e08c8b983635ebffb9c1573361f", + "sha256:df04f4bad8a359daa2ff74f8108ea051670cafbca533bb2636c58b16e962989e", + "sha256:ecf81720934a0e18526177e645cbd6a8a21bb0ddc887ff9738de07a1df5c6b61", + "sha256:edfa6fba9157e0e3be0f40168eb142511012683ac3dc82420bee4a3f3981b30e" ], - "version": "==1.16.2" + "index": "pypi", + "version": "==1.15.4" }, "pandas": { "hashes": [ @@ -147,13 +160,6 @@ "index": "pypi", "version": "==0.24.2" }, - "patsy": { - "hashes": [ - "sha256:5465be1c0e670c3a965355ec09e9a502bf2c4cbe4875e8528b0221190a8a5d40", - "sha256:f115cec4201e1465cd58b9866b0b0e7b941caafec129869057405bfe5b5e3991" - ], - "version": "==0.5.1" - }, "peewee": { "hashes": [ "sha256:074331625cf4335a27af3a8f644eabe2858cd3fc91fa95a7f18db16bd640f7cc" @@ -161,6 +167,14 @@ "index": "pypi", "version": "==3.9.2" }, + "pluggy": { + "hashes": [ + 
"sha256:7f8ae7f5bdf75671a718d2daf0a64b7885f74510bcd98b1a0bb420eb9a9d0cff", + "sha256:d345c8fe681115900d6da8d048ba67c25df42973bda370783cd58826442dcd7c", + "sha256:e160a7fcf25762bb60efc7e171d4497ff1d8d2d75a3d0df7a21b76821ecbf5c5" + ], + "version": "==0.6.0" + }, "psycopg2": { "hashes": [ "sha256:02445ebbb3a11a3fe8202c413d5e6faf38bb75b4e336203ee144ca2c46529f94", @@ -197,6 +211,21 @@ "index": "pypi", "version": "==2.7.7" }, + "py": { + "hashes": [ + "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", + "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53" + ], + "version": "==1.8.0" + }, + "pytest": { + "hashes": [ + "sha256:54713b26c97538db6ff0703a12b19aeaeb60b5e599de542e7fca0ec83b9038e8", + "sha256:829230122facf05a5f81a6d4dfe6454a04978ea3746853b2b84567ecf8e5c526" + ], + "index": "pypi", + "version": "==3.5.1" + }, "python-dateutil": { "hashes": [ "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb", @@ -276,7 +305,6 @@ "sha256:f31338ee269d201abe76083a990905473987371ff6f3fdb76a3f9073a361cf37", "sha256:f6b88c8d302c3dac8dff7766955e38d670c82e0d79edfc7eae47d6bb2c186594" ], - "index": "pypi", "version": "==1.2.1" }, "six": { @@ -286,39 +314,12 @@ ], "version": "==1.12.0" }, - "statsmodels": { - "hashes": [ - "sha256:0fd6af8db18b776c81c8fba54de20e9ec2f11b9310871b6b666d8805e3cf5ece", - "sha256:18844bbd95fcf62885d195571334762533ae16de182e1032ccc1595a98ffffb4", - "sha256:27e87cc6cd390fce8f44df225dadf589e1df6272f36b267ccdece2a9c4f52938", - "sha256:2902f5eef49fc38c112ffd8168dd76f7ae27f6cb5aa735cf55bc887b49aaec6e", - "sha256:31c2e26436a992e66355c0b3ef4b7c9714a0aa8375952d24f0593ac7c417b1e9", - "sha256:5d91ad30b8e20a45f583077ffeb4352be01955033f3dcd09bc06c30be1d29e8f", - "sha256:5de3d525b9a8679cd6c0f7f7c8cb8508275ab86cc3c1a140b2dc6b6390adb943", - "sha256:6461f93a842c649922c2c9a9bc9d9c4834110b89de8c4af196a791ab8f42ba3b", - "sha256:78d1b40c18d41f6c683c1c184be146264a782d409a89d8ed6c78acd1e1c11659", - 
"sha256:7c1a7cf557139f4bcbf97172268a8001156e42a7eeccca04d15c0cb7c3491ada", - "sha256:8532885c5778f94dae7ad83c4ac3f6916d4c8eb294f47ecefe2f0d3b967e6a16", - "sha256:95d35b33a301ded560662c733780ce58b37e218d122bb1b9c14e216aa9d42a2a", - "sha256:b48e283ba171698dca3989c0c03e6f25d3f431640383d926235d26ce48f3891c", - "sha256:b4b4b25c0e4228b1d33098894c3b29f4546e45afb29b333582cbaa5e16f38f3c", - "sha256:c06fd4af98f4c7ab61c9a79fd051ad4d7247991a691c3b4883c611029bac30a2", - "sha256:d2003c70c854f35a6446a465c61c994486039feb2fd47345a1e9984e95d55878", - "sha256:d7182803cdb09f1f17a335c0eae71d84905da9b0bc35c3d2c2379745f33096d9", - "sha256:d9b85bd98e90a02f2192084a85c857465e40e508629ac922242dba70731d0449", - "sha256:e2d9fd696e2d1523386d0f64f115352acbfaf59d5ca4c681c23ea064393a2ac4", - "sha256:ede078fdc9af857ed454d1e9e51831b2d577255c794d4044ecc332d40f3e3b36", - "sha256:f512afa7bc10b848aaacab5dfff6f61255142dd3a5581f82980c12745b0b6cd3", - "sha256:fbf789cc6d3fadca4350fa87e5f710ad2628e1fdff71bf8f853ecd49599ebe23" - ], - "version": "==0.9.0" - }, "werkzeug": { "hashes": [ - "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c", - "sha256:d5da73735293558eb1651ee2fddc4d0dedcfa06538b8813a2e20011583c9e49b" + "sha256:96da23fa8ccecbc3ae832a83df5c722c11547d021637faacb0bec4dd2f4666c8", + "sha256:ca5c2dcd367d6c0df87185b9082929d255358f5391923269335782b213d52655" ], - "version": "==0.14.1" + "version": "==0.15.1" } }, "develop": {} diff --git a/README.md b/README.md index ef3bd51..c9c9a5e 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,9 @@ This information is directly copied from the [UCI datasets repository for adult ```bash R -e rmarkdown::render"('eda_adult.Rmd', clean=TRUE, output_format='pdf_document')" + +# use this command to generate a markdown +R -e rmarkdown::render"('eda_adult.Rmd', clean=TRUE, output_format='github_document')" ``` The script `eda_adult.pdf` will contain the exploratory analysis report on the adult income dataset. 
diff --git a/eda_adult.Rmd b/eda_adult.Rmd index ba691a2..9892716 100644 --- a/eda_adult.Rmd +++ b/eda_adult.Rmd @@ -9,6 +9,8 @@ knitr::opts_chunk$set(echo = TRUE) ```{r} # importing the required libraries +# uncomment the below line if tidyverse not already installed +# install.packages("tidyverse") suppressPackageStartupMessages(library(tidyverse)) ``` diff --git a/eda_adult.html b/eda_adult.html new file mode 100644 index 0000000..3f03a46 --- /dev/null +++ b/eda_adult.html @@ -0,0 +1,548 @@ + + + + +
+ + + + + + + + + + + + + +# importing the required libraries
+# uncomment the below line if tidyverse not already installed
+# install.packages("tidyverse")
+suppressPackageStartupMessages(library(tidyverse))
In this R Markdown document, I will be doing a preliminary data analysis and exploration for the adult dataset from UCI ML datasets repository.
+# Load the raw UCI adult files; both are read without a header row.
+# One leading line of adult.test is skipped on read.
+test <- read.csv(file = "data/adult.test", skip = 1, header = FALSE)
+adult <- read.csv(file = "data/adult.data", header = FALSE)
+
+# Preview the first rows of the training data
+head(adult)
## V1 V2 V3 V4 V5 V6
+## 1 39 State-gov 77516 Bachelors 13 Never-married
+## 2 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse
+## 3 38 Private 215646 HS-grad 9 Divorced
+## 4 53 Private 234721 11th 7 Married-civ-spouse
+## 5 28 Private 338409 Bachelors 13 Married-civ-spouse
+## 6 37 Private 284582 Masters 14 Married-civ-spouse
+## V7 V8 V9 V10 V11 V12 V13
+## 1 Adm-clerical Not-in-family White Male 2174 0 40
+## 2 Exec-managerial Husband White Male 0 0 13
+## 3 Handlers-cleaners Not-in-family White Male 0 0 40
+## 4 Handlers-cleaners Husband Black Male 0 0 40
+## 5 Prof-specialty Wife Black Female 0 0 40
+## 6 Exec-managerial Wife White Female 0 0 40
+## V14 V15
+## 1 United-States <=50K
+## 2 United-States <=50K
+## 3 United-States <=50K
+## 4 United-States <=50K
+## 5 Cuba <=50K
+## 6 United-States <=50K
+We see that there are 14 features and 1 response which is binary (whether the annual income is <= or greater than 50k).
+# Give the positional columns descriptive names, in file order
+colnames(adult) <- c(
+  "age", "workclass", "fnlwgt", "education", "education_num",
+  "marital_status", "occupation", "relationship", "race",
+  "sex", "capital_gain", "capital_loss", "hours_per_week",
+  "native_country", "income"
+)
+head(adult)
## age workclass fnlwgt education education_num
+## 1 39 State-gov 77516 Bachelors 13
+## 2 50 Self-emp-not-inc 83311 Bachelors 13
+## 3 38 Private 215646 HS-grad 9
+## 4 53 Private 234721 11th 7
+## 5 28 Private 338409 Bachelors 13
+## 6 37 Private 284582 Masters 14
+## marital_status occupation relationship race sex
+## 1 Never-married Adm-clerical Not-in-family White Male
+## 2 Married-civ-spouse Exec-managerial Husband White Male
+## 3 Divorced Handlers-cleaners Not-in-family White Male
+## 4 Married-civ-spouse Handlers-cleaners Husband Black Male
+## 5 Married-civ-spouse Prof-specialty Wife Black Female
+## 6 Married-civ-spouse Exec-managerial Wife White Female
+## capital_gain capital_loss hours_per_week native_country income
+## 1 2174 0 40 United-States <=50K
+## 2 0 0 13 United-States <=50K
+## 3 0 0 40 United-States <=50K
+## 4 0 0 40 United-States <=50K
+## 5 0 0 40 Cuba <=50K
+## 6 0 0 40 United-States <=50K
+Now that we have imported the data in the required format, we will move on to looking at the dataset and cleaning it wherever required.
+# looking at the structure of the data
+# (prints the type of every column together with its first few values)
+str(adult)
## 'data.frame': 32561 obs. of 15 variables:
+## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
+## $ workclass : Factor w/ 9 levels " ?"," Federal-gov",..: 8 7 5 5 5 5 5 7 5 5 ...
+## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
+## $ education : Factor w/ 16 levels " 10th"," 11th",..: 10 10 12 2 10 13 7 12 13 10 ...
+## $ education_num : int 13 13 9 7 13 14 5 9 14 13 ...
+## $ marital_status: Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
+## $ occupation : Factor w/ 15 levels " ?"," Adm-clerical",..: 2 5 7 7 11 5 9 5 11 5 ...
+## $ relationship : Factor w/ 6 levels " Husband"," Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
+## $ race : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
+## $ sex : Factor w/ 2 levels " Female"," Male": 2 2 2 2 1 1 1 2 1 2 ...
+## $ capital_gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
+## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
+## $ hours_per_week: int 40 13 40 40 40 40 16 45 50 40 ...
+## $ native_country: Factor w/ 42 levels " ?"," Cambodia",..: 40 40 40 40 6 40 24 40 40 40 ...
+## $ income : Factor w/ 2 levels " <=50K"," >50K": 1 1 1 1 1 1 1 2 2 2 ...
+# per-column summary statistics of the raw data
+summary(adult)
## age workclass fnlwgt
+## Min. :17.00 Private :22696 Min. : 12285
+## 1st Qu.:28.00 Self-emp-not-inc: 2541 1st Qu.: 117827
+## Median :37.00 Local-gov : 2093 Median : 178356
+## Mean :38.58 ? : 1836 Mean : 189778
+## 3rd Qu.:48.00 State-gov : 1298 3rd Qu.: 237051
+## Max. :90.00 Self-emp-inc : 1116 Max. :1484705
+## (Other) : 981
+## education education_num marital_status
+## HS-grad :10501 Min. : 1.00 Divorced : 4443
+## Some-college: 7291 1st Qu.: 9.00 Married-AF-spouse : 23
+## Bachelors : 5355 Median :10.00 Married-civ-spouse :14976
+## Masters : 1723 Mean :10.08 Married-spouse-absent: 418
+## Assoc-voc : 1382 3rd Qu.:12.00 Never-married :10683
+## 11th : 1175 Max. :16.00 Separated : 1025
+## (Other) : 5134 Widowed : 993
+## occupation relationship
+## Prof-specialty :4140 Husband :13193
+## Craft-repair :4099 Not-in-family : 8305
+## Exec-managerial:4066 Other-relative: 981
+## Adm-clerical :3770 Own-child : 5068
+## Sales :3650 Unmarried : 3446
+## Other-service :3295 Wife : 1568
+## (Other) :9541
+## race sex capital_gain
+## Amer-Indian-Eskimo: 311 Female:10771 Min. : 0
+## Asian-Pac-Islander: 1039 Male :21790 1st Qu.: 0
+## Black : 3124 Median : 0
+## Other : 271 Mean : 1078
+## White :27816 3rd Qu.: 0
+## Max. :99999
+##
+## capital_loss hours_per_week native_country income
+## Min. : 0.0 Min. : 1.00 United-States:29170 <=50K:24720
+## 1st Qu.: 0.0 1st Qu.:40.00 Mexico : 643 >50K : 7841
+## Median : 0.0 Median :40.00 ? : 583
+## Mean : 87.3 Mean :40.44 Philippines : 198
+## 3rd Qu.: 0.0 3rd Qu.:45.00 Germany : 137
+## Max. :4356.0 Max. :99.00 Canada : 121
+## (Other) : 1709
+We see that there are missing values in 3 columns (in the form of ?
) looking at the structure of data. There also seems to be whitespace as a prefix in many of the categorical variables. We also don't need the final weight variable which was put up by the Census Board and hence, will remove it. Let us handle all these cases.
adult <- adult %>%
+  # Step 1: recode the " ?" placeholder as "unknown" in the three affected
+  # columns. The comparison is done on the raw value, leading space included.
+  mutate(
+    occupation = ifelse(occupation == " ?", "unknown",
+                        as.character(occupation)),
+    workclass = ifelse(workclass == " ?", "unknown",
+                       as.character(workclass)),
+    native_country = ifelse(native_country == " ?", "unknown",
+                            as.character(native_country))
+  ) %>%
+  # Step 2: strip leading whitespace from every categorical column.
+  mutate(
+    income = str_trim(income, side = "left"),
+    workclass = str_trim(workclass, side = "left"),
+    education = str_trim(education, side = "left"),
+    marital_status = str_trim(marital_status, side = "left"),
+    occupation = str_trim(occupation, side = "left"),
+    relationship = str_trim(relationship, side = "left"),
+    sex = str_trim(sex, side = "left"),
+    race = str_trim(race, side = "left"),
+    native_country = str_trim(native_country, side = "left")
+  )
+head(adult)
## age workclass fnlwgt education education_num marital_status
+## 1 39 State-gov 77516 Bachelors 13 Never-married
+## 2 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse
+## 3 38 Private 215646 HS-grad 9 Divorced
+## 4 53 Private 234721 11th 7 Married-civ-spouse
+## 5 28 Private 338409 Bachelors 13 Married-civ-spouse
+## 6 37 Private 284582 Masters 14 Married-civ-spouse
+## occupation relationship race sex capital_gain capital_loss
+## 1 Adm-clerical Not-in-family White Male 2174 0
+## 2 Exec-managerial Husband White Male 0 0
+## 3 Handlers-cleaners Not-in-family White Male 0 0
+## 4 Handlers-cleaners Husband Black Male 0 0
+## 5 Prof-specialty Wife Black Female 0 0
+## 6 Exec-managerial Wife White Female 0 0
+## hours_per_week native_country income
+## 1 40 United-States <=50K
+## 2 13 United-States <=50K
+## 3 40 United-States <=50K
+## 4 40 United-States <=50K
+## 5 40 Cuba <=50K
+## 6 40 United-States <=50K
+# checking missing value category
+# Missing entries were recoded from " ?" to "unknown" during cleaning, so
+# they have to be matched on that label; is.na() never matches them.
+adult %>%
+  filter(workclass == "unknown") %>%
+  group_by(income) %>%
+  count()
## # A tibble: 0 x 2
+## # Groups: income [0]
+## # ... with 2 variables: income <chr>, n <int>
+# Income split of the rows whose occupation is missing. Missing entries
+# were recoded from " ?" to "unknown" during cleaning, so is.na() never
+# matches them; match on the label instead.
+adult %>%
+  filter(occupation == "unknown") %>%
+  group_by(income) %>%
+  count()
## # A tibble: 0 x 2
+## # Groups: income [0]
+## # ... with 2 variables: income <chr>, n <int>
+# Income split of the rows whose native country is missing. Missing entries
+# were recoded from " ?" to "unknown" during cleaning, so is.na() never
+# matches them; match on the label instead.
+adult %>%
+  filter(native_country == "unknown") %>%
+  group_by(income) %>%
+  count()
## # A tibble: 0 x 2
+## # Groups: income [0]
+## # ... with 2 variables: income <chr>, n <int>
+We see that most of the people with missing values in the above 3 columns belong to the <=50K income category, and since we have a lot of data points for that category, we can safely ignore these cases in our analysis.
+# Keep only rows where workclass, occupation and native_country are not NA.
+# NOTE(review): the " ?" placeholders were recoded to "unknown" earlier, so
+# this filter presumably keeps every row - confirm whether the "unknown"
+# rows were meant to be dropped here.
+adult <- adult %>%
+ filter(!is.na(workclass), !is.na(occupation), !is.na(native_country))
# One histogram panel per numeric column
adult %>%
+  keep(is.numeric) %>%
+  gather() %>%  # long form: one (key, value) row per cell
+  ggplot(mapping = aes(x = value)) +
+  geom_histogram(bins = 25, fill = "lightblue", color = "darkblue") +
+  facet_wrap(~ key, scales = "free") +  # every variable gets its own axes
+  ggtitle("Histograms for numeric variables") +
+  theme_bw()
The age and final weight distributions seem to be right-skewed and a log/square-root transformation on these would be a good choice while building the model. Plotting capital_gain
and capital_loss
without the 0 value would give a better look at the distribution.
# Distribution of capital_gain over the non-zero records only
adult %>%
+  filter(capital_gain != 0) %>%
+  select(capital_gain) %>%
+  ggplot(aes(x = capital_gain)) +
+  geom_histogram(bins = 25, fill = "lightblue", color = "darkblue") +
+  ggtitle("Histogram for capital gain") +
+  theme_bw()
# Distribution of capital_loss over the non-zero records only
adult %>%
+  filter(capital_loss != 0) %>%
+  select(capital_loss) %>%
+  ggplot(aes(x = capital_loss)) +
+  geom_histogram(bins = 25, fill = "lightblue", color = "darkblue") +
+  ggtitle("Histogram for capital loss") +
+  theme_bw()
We see that there is a set of people with a very high capital gain (about 100,000), while the rest of the people form a right-skewed distribution. I suspect that the people with such a high capital gain fall into the >50K income bracket. Let's find out!
+# Income split of the records in the extreme capital-gain cluster
+adult %>%
+  group_by(income) %>%
+  filter(capital_gain > 90000) %>%
+  count()
## # A tibble: 1 x 2
+## # Groups: income [1]
+## income n
+## <chr> <int>
+## 1 >50K 159
+As expected, they are indeed a high-income group! Capital gain might be a good predictor of the income group. Let us now look at the distribution of some categorical variables.
+# One bar chart per character column, leaving out the response (income)
+adult %>%
+  keep(is.character) %>%
+  select(-income) %>%
+  gather() %>%
+  ggplot(mapping = aes(x = value)) +
+  geom_bar(fill = "lightblue", color = "darkblue") +
+  facet_wrap(~ key, ncol = 4, scales = "free") +
+  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Let us plot separately some figures which are not so clear in the picture above and break them down into actual numbers.
+# Frequency and percentage share of each native country, largest first
+adult %>%
+  group_by(native_country) %>%
+  summarise(freq = n()) %>%
+  mutate(prop = freq / sum(freq) * 100) %>%
+  arrange(-prop)
## # A tibble: 42 x 3
+## native_country freq prop
+## <chr> <int> <dbl>
+## 1 United-States 29170 89.6
+## 2 Mexico 643 1.97
+## 3 unknown 583 1.79
+## 4 Philippines 198 0.608
+## 5 Germany 137 0.421
+## 6 Canada 121 0.372
+## 7 Puerto-Rico 114 0.350
+## 8 El-Salvador 106 0.326
+## 9 India 100 0.307
+## 10 Cuba 95 0.292
+## # ... with 32 more rows
+Most of the people in the dataset are from the United-States.
+# Frequency and percentage share of each workclass, largest first
+adult %>%
+  group_by(workclass) %>%
+  summarise(freq = n()) %>%
+  mutate(prop = freq / sum(freq) * 100) %>%
+  arrange(-prop)
## # A tibble: 9 x 3
+## workclass freq prop
+## <chr> <int> <dbl>
+## 1 Private 22696 69.7
+## 2 Self-emp-not-inc 2541 7.80
+## 3 Local-gov 2093 6.43
+## 4 unknown 1836 5.64
+## 5 State-gov 1298 3.99
+## 6 Self-emp-inc 1116 3.43
+## 7 Federal-gov 960 2.95
+## 8 Without-pay 14 0.0430
+## 9 Never-worked 7 0.0215
+As we see, most of the people belong to the Private
workforce. Let us remake this plot so as to look at other classes clearly.
# Workclass counts with the "Private" rows filtered out
+adult %>%
+  filter(workclass != "Private") %>%
+  ggplot(aes(x = workclass)) +
+  geom_bar(fill = "lightblue", color = "darkblue") +
+  ggtitle("Bar plot for Workclass") +
+  theme_bw()
There are very few people who have never worked or who work without pay. We will now move on to multivariate analysis.
+In this section, we will be analyzing the relationship between different predictors and the response along with some relationships and patterns within the predictors.
+# Age distribution per income group: violin with a narrow box plot on top;
+# red reference lines are drawn at ages 35 and 50.
+ggplot(adult, aes(x = income, y = age)) +
+  geom_violin() +
+  geom_boxplot(width = 0.05) +
+  geom_hline(yintercept = c(35, 50), color = "red") +
+  theme_bw()
We see that most of the population under the age of 25 earns < 50K a year. This makes sense and most likely, this section of the population would not be earning much at all (as there will be a lot of students in this section). Most of the people who earn > 50K a year are in their 30s and 40s.
+# Share of >50K earners within each education level, as an ordered bar chart
+adult %>%
+  group_by(education, income) %>%
+  summarise(freq = n()) %>%
+  mutate(prop = freq / sum(freq) * 100) %>%
+  filter(income == ">50K") %>%
+  arrange(desc(prop)) %>%
+  ggplot(aes(x = reorder(education, prop), y = prop)) +
+  geom_col(fill = "lightblue", color = "darkblue") +
+  xlab("Education Level") +
+  ylab("Percentage") +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
No surprise here! Adults with a high level of education have a higher proportion of people who earn more than 50K a year. There also are some people with a low education level who are earning > 50K per year. Let us have a closer look at these people.
+# Workclass of the >50K earners whose education_num is below 8
+adult %>%
+  filter(education_num < 8) %>%
+  filter(income == ">50K") %>%
+  group_by(workclass) %>%
+  count() %>%
+  ggplot(aes(x = reorder(workclass, n), y = n)) +
+  geom_col(fill = "lightblue", color = "darkblue") +
+  xlab("Work class") +
+  ylab("Count") +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Education level of the >50K earners whose education_num is below 8
adult %>%
+  filter(education_num < 8) %>%
+  filter(income == ">50K") %>%
+  group_by(education) %>%
+  count() %>%
+  ggplot(aes(x = reorder(education, n), y = n)) +
+  geom_col(fill = "lightblue", color = "darkblue") +
+  xlab("Education level") +
+  ylab("Count") +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
These people are mostly high-school dropouts who worked for private companies or were self-employed.
+# Sex of the >50K earners whose education_num is below 8
+adult %>%
+  filter(education_num < 8) %>%
+  filter(income == ">50K") %>%
+  group_by(sex) %>%
+  count()
## # A tibble: 2 x 2
+## # Groups: sex [2]
+## sex n
+## <chr> <int>
+## 1 Female 19
+## 2 Male 192
+Not surprisingly, most of these people are males (10:1 male-female ratio). We don't see many cases of women dropping out from schools and earning really high amounts of money. This probably reflects that men are more prone to risk taking than most women.
+# Share of >50K earners within each occupation, as an ordered bar chart
+adult %>%
+  group_by(occupation, income) %>%
+  summarise(freq = n()) %>%
+  mutate(prop = freq / sum(freq) * 100) %>%
+  filter(income == ">50K") %>%
+  arrange(desc(prop)) %>%
+  ggplot(aes(x = reorder(occupation, prop), y = prop)) +
+  geom_col(fill = "lightblue", color = "darkblue") +
+  xlab("Occupation") +
+  ylab("Percentage") +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Weekly working hours against education_num, coloured by income group
+ggplot(adult, aes(x = hours_per_week, y = education_num, color = income)) +
+  geom_point() +
+  xlab("Working hours per week") +
+  ylab("Education level") +
+  theme_bw()
A lot of people who work > 50 hours per week have an income > 50K. Also, there is a bunch of people who don't spend a lot of time working, are really well-educated and earn a high income! They are probably the ones who earn fortunes for a few minutes of their lives.
+# Weekly hours per marital status, coloured by income; the green line marks
+# 40 hours per week.
+ggplot(adult, aes(x = marital_status, y = hours_per_week, color = income)) +
+  geom_jitter(alpha = 0.1) +
+  geom_hline(yintercept = 40, color = "darkgreen") +
+  xlab("Marital Status") +
+  ylab("Hours per week") +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
It seems that people who never married mostly have an income of at most 50K a year. Also, most of the people who earn > 50K a year are married civilians who work > 40 hours per week.
+# Record counts per race, with one dodged bar per income group
+ggplot(adult, aes(x = race)) +
+  geom_bar(aes(fill = income), position = "dodge2") +
+  xlab("Race") +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# distribution of income by race without whites
+adult %>%
+  group_by(race, income) %>%
+  summarise(freq = n()) %>%
+  # perc is computed before the White rows are filtered out
+  mutate(perc = freq / sum(freq) * 100) %>%
+  filter(race != "White") %>%
+  ggplot() +
+  geom_col(aes(x = race, y = freq, fill = income), position = "dodge2") +
+  geom_text(aes(x = race, y = freq, label = paste0(round(perc), "%")),
+            size = 4, position = position_dodge2(width = 0.8)) +
+  # the x aesthetic is race, so the axis is labelled accordingly
+  labs(x = "Race", y = "Count") +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
The ratio of people with an income >50K to the ones with an income <=50K seems to be nearly constant across all races.
+# Race counts, one panel per education level (each panel has its own y scale)
+ggplot(adult, aes(x = race)) +
+  geom_bar(fill = "lightblue", color = "darkblue") +
+  facet_wrap(~ education, scales = "free_y") +
+  ggtitle("Distribution of race by education level") +
+  xlab("Race") +
+  theme(axis.text.x = element_text(angle = 90, hjust = 1))
++It is surprising to see the reduced number of black people as compared to whites in the higher level of education categories like
+Doctorate, Prof-school and Masters. Let us look at the proportion of high earning people by native country.
# Share of >50K earners within each native country, as horizontal bars
+adult %>%
+  group_by(native_country, income) %>%
+  summarise(freq = n()) %>%
+  mutate(prop = freq / sum(freq) * 100) %>%
+  filter(income == ">50K") %>%
+  arrange(desc(prop)) %>%
+  ggplot(aes(x = reorder(native_country, prop), y = prop)) +
+  geom_col(fill = "lightblue", color = "darkblue") +
+  xlab("Native Country") +
+  ylab("Percentage") +
+  coord_flip()
The distribution we see above shows that most of the migrants to the United-States are high earning people and there are also many countries whose migrants don't earn a high income. The United-States lies somewhere near the middle of these two categories of countries.
+# Counts per sex and income group; each bar is annotated with a rounded
+# percentage.
+adult %>%
+  group_by(sex, income) %>%
+  summarise(freq = n()) %>%
+  mutate(perc = freq / sum(freq) * 100) %>%
+  ggplot(aes(x = sex, y = freq)) +
+  geom_col(aes(fill = income), position = "dodge2") +
+  geom_text(aes(label = paste0(round(perc), "%")),
+            size = 4, position = position_dodge2(width = 0.8)) +
+  xlab("Sex") +
+  ylab("Count") +
+  theme_bw()
We see that only 11% of females earn more than 50K a year whereas the percentage is around 3 times higher in males (31%).
+# combining train and test for cleaning and export
+
+# Name the test columns exactly like the training columns
+colnames(test) <- c(
+  "age", "workclass", "fnlwgt", "education", "education_num",
+  "marital_status", "occupation", "relationship", "race",
+  "sex", "capital_gain", "capital_loss", "hours_per_week",
+  "native_country", "income"
+)
+
+test <- test %>%
+  # Step 1: normalise the income label (the comparison value carries a
+  # trailing period; anything else becomes ">50K") and recode the " ?"
+  # placeholder as "unknown".
+  mutate(
+    income = str_trim(income, side = "left"),
+    income = ifelse(income == "<=50K.", "<=50K", ">50K"),
+    occupation = ifelse(occupation == " ?", "unknown",
+                        as.character(occupation)),
+    workclass = ifelse(workclass == " ?", "unknown",
+                       as.character(workclass)),
+    native_country = ifelse(native_country == " ?", "unknown",
+                            as.character(native_country))
+  ) %>%
+  # Step 2: strip leading whitespace from the categorical columns.
+  mutate(
+    workclass = str_trim(workclass, side = "left"),
+    education = str_trim(education, side = "left"),
+    marital_status = str_trim(marital_status, side = "left"),
+    occupation = str_trim(occupation, side = "left"),
+    relationship = str_trim(relationship, side = "left"),
+    sex = str_trim(sex, side = "left"),
+    race = str_trim(race, side = "left"),
+    native_country = str_trim(native_country, side = "left")
+  )
+
+# Stack the test rows underneath the training rows
+adult <- rbind(adult, test)
First, we will combine similar categories in different categorical variables into a smaller number of categories.
+# Collapse workclass into four codes (stored as strings, since the column is
+# character): "0" government, "1" self-employed / unpaid / never worked,
+# "2" private, "-1" unknown. Any other value is left untouched.
+adult <- adult %>%
+  mutate(workclass = case_when(
+    workclass %in% c("State-gov", "Federal-gov", "Local-gov") ~ "0",
+    workclass %in% c("Self-emp-not-inc", "Self-emp-inc",
+                     "Without-pay", "Never-worked") ~ "1",
+    workclass %in% c("Private") ~ "2",
+    workclass %in% c("unknown") ~ "-1",
+    TRUE ~ workclass
+  ))
+
+
+# Collapse marital_status into two string codes: "0" for the three married
+# categories, "1" for never-married / divorced / separated / widowed.
+adult <- adult %>%
+  mutate(marital_status = case_when(
+    marital_status %in% c("Married-civ-spouse", "Married-spouse-absent",
+                          "Married-AF-spouse") ~ "0",
+    marital_status %in% c("Never-married", "Divorced",
+                          "Separated", "Widowed") ~ "1",
+    TRUE ~ marital_status
+  ))
+# combining categories in education
+# education_num is dropped, then education is collapsed into three tiers
+# (stored as strings): 0 = up to HS-grad, 1 = college / associate /
+# bachelors, 2 = masters and above.
+# "Assoc-voc" belongs to tier 1 only: it is rewritten to "1" by the second
+# replace(), so listing it again under tier 2 could never match.
+adult <- adult %>%
+  select(-education_num) %>%
+  mutate(education = replace(education,
+                             education %in% c("HS-grad", "11th", "9th",
+                                              "7th-8th", "5th-6th", "10th",
+                                              "Preschool", "12th", "1st-4th"),
+                             0),
+         education = replace(education,
+                             education %in% c("Bachelors", "Some-college",
+                                              "Assoc-acdm", "Assoc-voc"),
+                             1),
+         education = replace(education,
+                             education %in% c("Masters", "Prof-school",
+                                              "Doctorate"),
+                             2))
+
+# Collapse occupation into the string codes "0", "1", "2" and "-1" (unknown).
+# Any value not listed below is left untouched.
+adult <- adult %>%
+  mutate(occupation = case_when(
+    occupation %in% c("Priv-house-serv", "Handlers-cleaners",
+                      "Other-service", "Armed-Forces",
+                      "Machine-op-inspct", "Farming-fishing",
+                      "Adm-clerical") ~ "0",
+    occupation %in% c("Tech-support", "Craft-repair", "Protective-serv",
+                      "Transport-moving", "Sales") ~ "1",
+    occupation %in% c("Exec-managerial", "Prof-specialty") ~ "2",
+    occupation %in% c("unknown") ~ "-1",
+    TRUE ~ occupation
+  ))
+
+# Collapse race into three string codes: "0" White, "1" Black, "2" the
+# remaining listed categories. Any other value is left untouched.
+adult <- adult %>%
+  mutate(race = case_when(
+    race %in% c("White") ~ "0",
+    race %in% c("Black") ~ "1",
+    race %in% c("Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other") ~ "2",
+    TRUE ~ race
+  ))
+
+# Binary 0/1 encodings: sex (1 = Male), native_country (1 = United-States)
+# and the target income (0 = "<=50K", 1 = anything else).
+adult <- adult %>%
+  mutate(
+    sex = as.numeric(sex == "Male"),
+    native_country = as.numeric(native_country == "United-States"),
+    income = as.numeric(income != "<=50K")
+  )
+
+# Drop relationship (its information is already carried by sex and
+# marital status) together with the fnlwgt column.
+adult <- adult %>%
+  select(-relationship, -fnlwgt)
+
+# Split the combined frame back into its two parts: the test rows were
+# appended last, so train is the leading block and test the trailing one.
+# head()/tail() return zero rows when a partition is empty, whereas an
+# `a:b` index would count backwards and select the wrong rows.
+train <- head(adult, nrow(adult) - nrow(test))
+test <- tail(adult, nrow(test))
Now that we have cleaned the dataset, let's export it so that it can be used in model building directly.
+# Write the encoded train and test partitions to CSV, header row included
+readr::write_csv(train, "data/train.csv", col_names = TRUE)
+readr::write_csv(test, "data/test.csv", col_names = TRUE)