diff --git a/Pipfile b/Pipfile index 1970535..0e4e2d2 100644 --- a/Pipfile +++ b/Pipfile @@ -6,14 +6,14 @@ verify_ssl = true [dev-packages] [packages] -scipy = "*" scikit-learn = "*" pandas = "*" +numpy = "==1.15.*" +pytest = "==3.5.1" gunicorn = "*" peewee = "*" psycopg2 = "*" Flask = "*" -category_encoders = "==1.2.6" [requires] python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock index 16e67dd..8a45550 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "e29aaf9b98c5b4006a59c6ddef416a11d7fd40911aeec75aed59b3f35b980f1a" + "sha256": "99ac5b6af84e19a2a2eddff6215e1eacb5b23122b866d6fdf27e94e02f7cae03" }, "pipfile-spec": 6, "requires": { @@ -16,12 +16,12 @@ ] }, "default": { - "category-encoders": { + "attrs": { "hashes": [ - "sha256:99ccf0e451035d26dcfe9b21bea6835307b1ec090c289c5462001d3ad15f5bd0" + "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79", + "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399" ], - "index": "pypi", - "version": "==1.2.6" + "version": "==19.1.0" }, "click": { "hashes": [ @@ -93,33 +93,46 @@ ], "version": "==1.1.1" }, + "more-itertools": { + "hashes": [ + "sha256:0125e8f60e9e031347105eb1682cef932f5e97d7b9a1a28d9bf00c22a5daef40", + "sha256:590044e3942351a1bdb1de960b739ff4ce277960f2425ad4509446dbace8d9d1" + ], + "version": "==6.0.0" + }, "numpy": { "hashes": [ - "sha256:1980f8d84548d74921685f68096911585fee393975f53797614b34d4f409b6da", - "sha256:22752cd809272671b273bb86df0f505f505a12368a3a5fc0aa811c7ece4dfd5c", - "sha256:23cc40313036cffd5d1873ef3ce2e949bdee0646c5d6f375bf7ee4f368db2511", - "sha256:2b0b118ff547fecabc247a2668f48f48b3b1f7d63676ebc5be7352a5fd9e85a5", - "sha256:3a0bd1edf64f6a911427b608a894111f9fcdb25284f724016f34a84c9a3a6ea9", - "sha256:3f25f6c7b0d000017e5ac55977a3999b0b1a74491eacb3c1aa716f0e01f6dcd1", - "sha256:4061c79ac2230594a7419151028e808239450e676c39e58302ad296232e3c2e8", - 
"sha256:560ceaa24f971ab37dede7ba030fc5d8fa173305d94365f814d9523ffd5d5916", - "sha256:62be044cd58da2a947b7e7b2252a10b42920df9520fc3d39f5c4c70d5460b8ba", - "sha256:6c692e3879dde0b67a9dc78f9bfb6f61c666b4562fd8619632d7043fb5b691b0", - "sha256:6f65e37b5a331df950ef6ff03bd4136b3c0bbcf44d4b8e99135d68a537711b5a", - "sha256:7a78cc4ddb253a55971115f8320a7ce28fd23a065fc33166d601f51760eecfa9", - "sha256:80a41edf64a3626e729a62df7dd278474fc1726836552b67a8c6396fd7e86760", - "sha256:893f4d75255f25a7b8516feb5766c6b63c54780323b9bd4bc51cdd7efc943c73", - "sha256:972ea92f9c1b54cc1c1a3d8508e326c0114aaf0f34996772a30f3f52b73b942f", - "sha256:9f1d4865436f794accdabadc57a8395bd3faa755449b4f65b88b7df65ae05f89", - "sha256:9f4cd7832b35e736b739be03b55875706c8c3e5fe334a06210f1a61e5c2c8ca5", - "sha256:adab43bf657488300d3aeeb8030d7f024fcc86e3a9b8848741ea2ea903e56610", - "sha256:bd2834d496ba9b1bdda3a6cf3de4dc0d4a0e7be306335940402ec95132ad063d", - "sha256:d20c0360940f30003a23c0adae2fe50a0a04f3e48dc05c298493b51fd6280197", - "sha256:d3b3ed87061d2314ff3659bb73896e622252da52558f2380f12c421fbdee3d89", - "sha256:dc235bf29a406dfda5790d01b998a1c01d7d37f449128c0b1b7d1c89a84fae8b", - "sha256:fb3c83554f39f48f3fa3123b9c24aecf681b1c289f9334f8215c1d3c8e2f6e5b" + "sha256:0df89ca13c25eaa1621a3f09af4c8ba20da849692dcae184cb55e80952c453fb", + "sha256:154c35f195fd3e1fad2569930ca51907057ae35e03938f89a8aedae91dd1b7c7", + "sha256:18e84323cdb8de3325e741a7a8dd4a82db74fde363dce32b625324c7b32aa6d7", + "sha256:1e8956c37fc138d65ded2d96ab3949bd49038cc6e8a4494b1515b0ba88c91565", + "sha256:23557bdbca3ccbde3abaa12a6e82299bc92d2b9139011f8c16ca1bb8c75d1e95", + "sha256:24fd645a5e5d224aa6e39d93e4a722fafa9160154f296fd5ef9580191c755053", + "sha256:36e36b6868e4440760d4b9b44587ea1dc1f06532858d10abba98e851e154ca70", + "sha256:3d734559db35aa3697dadcea492a423118c5c55d176da2f3be9c98d4803fc2a7", + "sha256:416a2070acf3a2b5d586f9a6507bb97e33574df5bd7508ea970bbf4fc563fa52", + "sha256:4a22dc3f5221a644dfe4a63bf990052cc674ef12a157b1056969079985c92816", 
+ "sha256:4d8d3e5aa6087490912c14a3c10fbdd380b40b421c13920ff468163bc50e016f", + "sha256:4f41fd159fba1245e1958a99d349df49c616b133636e0cf668f169bce2aeac2d", + "sha256:561ef098c50f91fbac2cc9305b68c915e9eb915a74d9038ecf8af274d748f76f", + "sha256:56994e14b386b5c0a9b875a76d22d707b315fa037affc7819cda08b6d0489756", + "sha256:73a1f2a529604c50c262179fcca59c87a05ff4614fe8a15c186934d84d09d9a5", + "sha256:7da99445fd890206bfcc7419f79871ba8e73d9d9e6b82fe09980bc5bb4efc35f", + "sha256:99d59e0bcadac4aa3280616591fb7bcd560e2218f5e31d5223a2e12a1425d495", + "sha256:a4cc09489843c70b22e8373ca3dfa52b3fab778b57cf81462f1203b0852e95e3", + "sha256:a61dc29cfca9831a03442a21d4b5fd77e3067beca4b5f81f1a89a04a71cf93fa", + "sha256:b1853df739b32fa913cc59ad9137caa9cc3d97ff871e2bbd89c2a2a1d4a69451", + "sha256:b1f44c335532c0581b77491b7715a871d0dd72e97487ac0f57337ccf3ab3469b", + "sha256:b261e0cb0d6faa8fd6863af26d30351fd2ffdb15b82e51e81e96b9e9e2e7ba16", + "sha256:c857ae5dba375ea26a6228f98c195fec0898a0fd91bcf0e8a0cae6d9faf3eca7", + "sha256:cf5bb4a7d53a71bb6a0144d31df784a973b36d8687d615ef6a7e9b1809917a9b", + "sha256:db9814ff0457b46f2e1d494c1efa4111ca089e08c8b983635ebffb9c1573361f", + "sha256:df04f4bad8a359daa2ff74f8108ea051670cafbca533bb2636c58b16e962989e", + "sha256:ecf81720934a0e18526177e645cbd6a8a21bb0ddc887ff9738de07a1df5c6b61", + "sha256:edfa6fba9157e0e3be0f40168eb142511012683ac3dc82420bee4a3f3981b30e" ], - "version": "==1.16.2" + "index": "pypi", + "version": "==1.15.4" }, "pandas": { "hashes": [ @@ -147,13 +160,6 @@ "index": "pypi", "version": "==0.24.2" }, - "patsy": { - "hashes": [ - "sha256:5465be1c0e670c3a965355ec09e9a502bf2c4cbe4875e8528b0221190a8a5d40", - "sha256:f115cec4201e1465cd58b9866b0b0e7b941caafec129869057405bfe5b5e3991" - ], - "version": "==0.5.1" - }, "peewee": { "hashes": [ "sha256:074331625cf4335a27af3a8f644eabe2858cd3fc91fa95a7f18db16bd640f7cc" @@ -161,6 +167,14 @@ "index": "pypi", "version": "==3.9.2" }, + "pluggy": { + "hashes": [ + 
"sha256:7f8ae7f5bdf75671a718d2daf0a64b7885f74510bcd98b1a0bb420eb9a9d0cff", + "sha256:d345c8fe681115900d6da8d048ba67c25df42973bda370783cd58826442dcd7c", + "sha256:e160a7fcf25762bb60efc7e171d4497ff1d8d2d75a3d0df7a21b76821ecbf5c5" + ], + "version": "==0.6.0" + }, "psycopg2": { "hashes": [ "sha256:02445ebbb3a11a3fe8202c413d5e6faf38bb75b4e336203ee144ca2c46529f94", @@ -197,6 +211,21 @@ "index": "pypi", "version": "==2.7.7" }, + "py": { + "hashes": [ + "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", + "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53" + ], + "version": "==1.8.0" + }, + "pytest": { + "hashes": [ + "sha256:54713b26c97538db6ff0703a12b19aeaeb60b5e599de542e7fca0ec83b9038e8", + "sha256:829230122facf05a5f81a6d4dfe6454a04978ea3746853b2b84567ecf8e5c526" + ], + "index": "pypi", + "version": "==3.5.1" + }, "python-dateutil": { "hashes": [ "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb", @@ -276,7 +305,6 @@ "sha256:f31338ee269d201abe76083a990905473987371ff6f3fdb76a3f9073a361cf37", "sha256:f6b88c8d302c3dac8dff7766955e38d670c82e0d79edfc7eae47d6bb2c186594" ], - "index": "pypi", "version": "==1.2.1" }, "six": { @@ -286,39 +314,12 @@ ], "version": "==1.12.0" }, - "statsmodels": { - "hashes": [ - "sha256:0fd6af8db18b776c81c8fba54de20e9ec2f11b9310871b6b666d8805e3cf5ece", - "sha256:18844bbd95fcf62885d195571334762533ae16de182e1032ccc1595a98ffffb4", - "sha256:27e87cc6cd390fce8f44df225dadf589e1df6272f36b267ccdece2a9c4f52938", - "sha256:2902f5eef49fc38c112ffd8168dd76f7ae27f6cb5aa735cf55bc887b49aaec6e", - "sha256:31c2e26436a992e66355c0b3ef4b7c9714a0aa8375952d24f0593ac7c417b1e9", - "sha256:5d91ad30b8e20a45f583077ffeb4352be01955033f3dcd09bc06c30be1d29e8f", - "sha256:5de3d525b9a8679cd6c0f7f7c8cb8508275ab86cc3c1a140b2dc6b6390adb943", - "sha256:6461f93a842c649922c2c9a9bc9d9c4834110b89de8c4af196a791ab8f42ba3b", - "sha256:78d1b40c18d41f6c683c1c184be146264a782d409a89d8ed6c78acd1e1c11659", - 
"sha256:7c1a7cf557139f4bcbf97172268a8001156e42a7eeccca04d15c0cb7c3491ada", - "sha256:8532885c5778f94dae7ad83c4ac3f6916d4c8eb294f47ecefe2f0d3b967e6a16", - "sha256:95d35b33a301ded560662c733780ce58b37e218d122bb1b9c14e216aa9d42a2a", - "sha256:b48e283ba171698dca3989c0c03e6f25d3f431640383d926235d26ce48f3891c", - "sha256:b4b4b25c0e4228b1d33098894c3b29f4546e45afb29b333582cbaa5e16f38f3c", - "sha256:c06fd4af98f4c7ab61c9a79fd051ad4d7247991a691c3b4883c611029bac30a2", - "sha256:d2003c70c854f35a6446a465c61c994486039feb2fd47345a1e9984e95d55878", - "sha256:d7182803cdb09f1f17a335c0eae71d84905da9b0bc35c3d2c2379745f33096d9", - "sha256:d9b85bd98e90a02f2192084a85c857465e40e508629ac922242dba70731d0449", - "sha256:e2d9fd696e2d1523386d0f64f115352acbfaf59d5ca4c681c23ea064393a2ac4", - "sha256:ede078fdc9af857ed454d1e9e51831b2d577255c794d4044ecc332d40f3e3b36", - "sha256:f512afa7bc10b848aaacab5dfff6f61255142dd3a5581f82980c12745b0b6cd3", - "sha256:fbf789cc6d3fadca4350fa87e5f710ad2628e1fdff71bf8f853ecd49599ebe23" - ], - "version": "==0.9.0" - }, "werkzeug": { "hashes": [ - "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c", - "sha256:d5da73735293558eb1651ee2fddc4d0dedcfa06538b8813a2e20011583c9e49b" + "sha256:96da23fa8ccecbc3ae832a83df5c722c11547d021637faacb0bec4dd2f4666c8", + "sha256:ca5c2dcd367d6c0df87185b9082929d255358f5391923269335782b213d52655" ], - "version": "==0.14.1" + "version": "==0.15.1" } }, "develop": {} diff --git a/README.md b/README.md index ef3bd51..c9c9a5e 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,9 @@ This information is directly copied from the [UCI datasets repository for adult ```bash R -e rmarkdown::render"('eda_adult.Rmd', clean=TRUE, output_format='pdf_document')" + +# use this command to generate a markdown +R -e rmarkdown::render"('eda_adult.Rmd', clean=TRUE, output_format='github_document')" ``` The script `eda_adult.pdf` will contain the exploratory analysis report on the adult income dataset. 
diff --git a/eda_adult.Rmd b/eda_adult.Rmd index ba691a2..9892716 100644 --- a/eda_adult.Rmd +++ b/eda_adult.Rmd @@ -9,6 +9,8 @@ knitr::opts_chunk$set(echo = TRUE) ```{r} # importing the required libraries +# uncomment the below line if tidyverse not already installed +# install.packages("tidyverse") suppressPackageStartupMessages(library(tidyverse)) ``` diff --git a/eda_adult.html b/eda_adult.html new file mode 100644 index 0000000..3f03a46 --- /dev/null +++ b/eda_adult.html @@ -0,0 +1,548 @@ + + + + + + + + + + + + + + + + + + +

Adult Dataset

+
# importing the required libraries
+# uncomment the below line if tidyverse not already installed
+# install.packages("tidyverse")
+suppressPackageStartupMessages(library(tidyverse))
+

In this R Markdown document, I will be doing a preliminary data analysis and exploration of the adult dataset from the UCI ML datasets repository.

+

Section 1: Reading and Cleaning up the dataset

+
# reading the dataset
+adult <- read.csv("data/adult.data", header = FALSE)
+test <- read.csv("data/adult.test", header = FALSE, skip = 1)
+
+head(adult)
+
##   V1                V2     V3         V4 V5                  V6
+## 1 39         State-gov  77516  Bachelors 13       Never-married
+## 2 50  Self-emp-not-inc  83311  Bachelors 13  Married-civ-spouse
+## 3 38           Private 215646    HS-grad  9            Divorced
+## 4 53           Private 234721       11th  7  Married-civ-spouse
+## 5 28           Private 338409  Bachelors 13  Married-civ-spouse
+## 6 37           Private 284582    Masters 14  Married-civ-spouse
+##                   V7             V8     V9     V10  V11 V12 V13
+## 1       Adm-clerical  Not-in-family  White    Male 2174   0  40
+## 2    Exec-managerial        Husband  White    Male    0   0  13
+## 3  Handlers-cleaners  Not-in-family  White    Male    0   0  40
+## 4  Handlers-cleaners        Husband  Black    Male    0   0  40
+## 5     Prof-specialty           Wife  Black  Female    0   0  40
+## 6    Exec-managerial           Wife  White  Female    0   0  40
+##              V14    V15
+## 1  United-States  <=50K
+## 2  United-States  <=50K
+## 3  United-States  <=50K
+## 4  United-States  <=50K
+## 5           Cuba  <=50K
+## 6  United-States  <=50K
+

We see that there are 14 features and 1 response which is binary (whether the annual income is <= or greater than 50k).

+
# assigning proper column names to variables
+adult <- adult %>%
+    magrittr::set_colnames(c("age", "workclass", "fnlwgt", "education", "education_num",
+                             "marital_status", "occupation", "relationship", "race",
+                             "sex", "capital_gain", "capital_loss", "hours_per_week",
+                             "native_country", "income"))
+head(adult)
+
##   age         workclass fnlwgt  education education_num
+## 1  39         State-gov  77516  Bachelors            13
+## 2  50  Self-emp-not-inc  83311  Bachelors            13
+## 3  38           Private 215646    HS-grad             9
+## 4  53           Private 234721       11th             7
+## 5  28           Private 338409  Bachelors            13
+## 6  37           Private 284582    Masters            14
+##        marital_status         occupation   relationship   race     sex
+## 1       Never-married       Adm-clerical  Not-in-family  White    Male
+## 2  Married-civ-spouse    Exec-managerial        Husband  White    Male
+## 3            Divorced  Handlers-cleaners  Not-in-family  White    Male
+## 4  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male
+## 5  Married-civ-spouse     Prof-specialty           Wife  Black  Female
+## 6  Married-civ-spouse    Exec-managerial           Wife  White  Female
+##   capital_gain capital_loss hours_per_week native_country income
+## 1         2174            0             40  United-States  <=50K
+## 2            0            0             13  United-States  <=50K
+## 3            0            0             40  United-States  <=50K
+## 4            0            0             40  United-States  <=50K
+## 5            0            0             40           Cuba  <=50K
+## 6            0            0             40  United-States  <=50K
+

Now that we have imported the data in the required format, we will move on to looking at the dataset and cleaning it wherever required.

+
# looking at the structure of the data
+str(adult)
+
## 'data.frame':    32561 obs. of  15 variables:
+##  $ age           : int  39 50 38 53 28 37 49 52 31 42 ...
+##  $ workclass     : Factor w/ 9 levels " ?"," Federal-gov",..: 8 7 5 5 5 5 5 7 5 5 ...
+##  $ fnlwgt        : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
+##  $ education     : Factor w/ 16 levels " 10th"," 11th",..: 10 10 12 2 10 13 7 12 13 10 ...
+##  $ education_num : int  13 13 9 7 13 14 5 9 14 13 ...
+##  $ marital_status: Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
+##  $ occupation    : Factor w/ 15 levels " ?"," Adm-clerical",..: 2 5 7 7 11 5 9 5 11 5 ...
+##  $ relationship  : Factor w/ 6 levels " Husband"," Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
+##  $ race          : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
+##  $ sex           : Factor w/ 2 levels " Female"," Male": 2 2 2 2 1 1 1 2 1 2 ...
+##  $ capital_gain  : int  2174 0 0 0 0 0 0 0 14084 5178 ...
+##  $ capital_loss  : int  0 0 0 0 0 0 0 0 0 0 ...
+##  $ hours_per_week: int  40 13 40 40 40 40 16 45 50 40 ...
+##  $ native_country: Factor w/ 42 levels " ?"," Cambodia",..: 40 40 40 40 6 40 24 40 40 40 ...
+##  $ income        : Factor w/ 2 levels " <=50K"," >50K": 1 1 1 1 1 1 1 2 2 2 ...
+
summary(adult)
+
##       age                    workclass         fnlwgt       
+##  Min.   :17.00    Private         :22696   Min.   :  12285  
+##  1st Qu.:28.00    Self-emp-not-inc: 2541   1st Qu.: 117827  
+##  Median :37.00    Local-gov       : 2093   Median : 178356  
+##  Mean   :38.58    ?               : 1836   Mean   : 189778  
+##  3rd Qu.:48.00    State-gov       : 1298   3rd Qu.: 237051  
+##  Max.   :90.00    Self-emp-inc    : 1116   Max.   :1484705  
+##                  (Other)          :  981                    
+##          education     education_num                  marital_status 
+##   HS-grad     :10501   Min.   : 1.00    Divorced             : 4443  
+##   Some-college: 7291   1st Qu.: 9.00    Married-AF-spouse    :   23  
+##   Bachelors   : 5355   Median :10.00    Married-civ-spouse   :14976  
+##   Masters     : 1723   Mean   :10.08    Married-spouse-absent:  418  
+##   Assoc-voc   : 1382   3rd Qu.:12.00    Never-married        :10683  
+##   11th        : 1175   Max.   :16.00    Separated            : 1025  
+##  (Other)      : 5134                    Widowed              :  993  
+##             occupation            relationship  
+##   Prof-specialty :4140    Husband       :13193  
+##   Craft-repair   :4099    Not-in-family : 8305  
+##   Exec-managerial:4066    Other-relative:  981  
+##   Adm-clerical   :3770    Own-child     : 5068  
+##   Sales          :3650    Unmarried     : 3446  
+##   Other-service  :3295    Wife          : 1568  
+##  (Other)         :9541                          
+##                   race            sex         capital_gain  
+##   Amer-Indian-Eskimo:  311    Female:10771   Min.   :    0  
+##   Asian-Pac-Islander: 1039    Male  :21790   1st Qu.:    0  
+##   Black             : 3124                   Median :    0  
+##   Other             :  271                   Mean   : 1078  
+##   White             :27816                   3rd Qu.:    0  
+##                                              Max.   :99999  
+##                                                             
+##   capital_loss    hours_per_week         native_country     income     
+##  Min.   :   0.0   Min.   : 1.00    United-States:29170    <=50K:24720  
+##  1st Qu.:   0.0   1st Qu.:40.00    Mexico       :  643    >50K : 7841  
+##  Median :   0.0   Median :40.00    ?            :  583                 
+##  Mean   :  87.3   Mean   :40.44    Philippines  :  198                 
+##  3rd Qu.:   0.0   3rd Qu.:45.00    Germany      :  137                 
+##  Max.   :4356.0   Max.   :99.00    Canada       :  121                 
+##                                   (Other)       : 1709
+

Looking at the structure of the data, we see that there are missing values in 3 columns (in the form of ?). There also seems to be leading whitespace in many of the categorical variables. We also don't need the final weight variable (fnlwgt), which was added by the Census Board, and hence will remove it. Let us handle all these cases.

+
adult <- adult %>%
+    mutate(income = str_trim(income, side = c("left")),
+           occupation = ifelse(occupation == " ?", "unknown", as.character(occupation)),
+           workclass = ifelse(workclass == " ?", "unknown", as.character(workclass)),
+           native_country = ifelse(native_country == " ?", "unknown", as.character(native_country)),
+           workclass = str_trim(workclass, side = c("left")),
+           education = str_trim(education, side = c("left")),
+           marital_status = str_trim(marital_status, side = c("left")),
+           occupation = str_trim(occupation, side = c("left")),
+           relationship = str_trim(relationship, side = c("left")),
+           sex = str_trim(sex, side = c("left")),
+           race = str_trim(race, side = c("left")),
+           native_country = str_trim(native_country, side = c("left")))
+head(adult)
+
##   age        workclass fnlwgt education education_num     marital_status
+## 1  39        State-gov  77516 Bachelors            13      Never-married
+## 2  50 Self-emp-not-inc  83311 Bachelors            13 Married-civ-spouse
+## 3  38          Private 215646   HS-grad             9           Divorced
+## 4  53          Private 234721      11th             7 Married-civ-spouse
+## 5  28          Private 338409 Bachelors            13 Married-civ-spouse
+## 6  37          Private 284582   Masters            14 Married-civ-spouse
+##          occupation  relationship  race    sex capital_gain capital_loss
+## 1      Adm-clerical Not-in-family White   Male         2174            0
+## 2   Exec-managerial       Husband White   Male            0            0
+## 3 Handlers-cleaners Not-in-family White   Male            0            0
+## 4 Handlers-cleaners       Husband Black   Male            0            0
+## 5    Prof-specialty          Wife Black Female            0            0
+## 6   Exec-managerial          Wife White Female            0            0
+##   hours_per_week native_country income
+## 1             40  United-States  <=50K
+## 2             13  United-States  <=50K
+## 3             40  United-States  <=50K
+## 4             40  United-States  <=50K
+## 5             40           Cuba  <=50K
+## 6             40  United-States  <=50K
+
# checking missing value category
+adult %>%
+    filter(is.na(workclass)) %>%
+    group_by(income) %>%
+    count()
+
## # A tibble: 0 x 2
+## # Groups:   income [0]
+## # ... with 2 variables: income <chr>, n <int>
+
adult %>%
+    filter(is.na(occupation)) %>%
+    group_by(income) %>%
+    count()
+
## # A tibble: 0 x 2
+## # Groups:   income [0]
+## # ... with 2 variables: income <chr>, n <int>
+
adult %>%
+    filter(is.na(native_country)) %>%
+    group_by(income) %>%
+    count()
+
## # A tibble: 0 x 2
+## # Groups:   income [0]
+## # ... with 2 variables: income <chr>, n <int>
+

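Note that the three checks above return no rows: the ? placeholders were already recoded to "unknown" (not NA) in the previous step, so is.na() finds nothing, and the distribution of these records across the income categories is not actually shown here. The intent is to ignore these cases on the assumption that most of them belong to the <=50K income category, for which we have a lot of data points; however, the is.na() filter below removes no rows, so these records remain in the data under the "unknown" category.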

+
adult <- adult %>%
+    filter(!is.na(workclass), !is.na(occupation), !is.na(native_country))
+

Section 2: Univariate Analysis

+
adult %>%
+    keep(is.numeric) %>%                     # Keep only numeric columns
+    gather() %>%                             # Convert to key-value pairs
+    ggplot(aes(value)) +                     # Plot the values
+    facet_wrap(~ key, scales = "free") +     # In separate panels
+    geom_histogram(color = "darkblue",
+                   fill = "lightblue", bins = 25) +
+    ggtitle("Histograms for numeric variables") +
+    theme_bw()
+

+

The age and final weight distributions seem to be right-skewed and a log/square-root transformation on these would be a good choice while building the model. Plotting capital_gain and capital_loss without the 0 value would give a better look at the distribution.

+
adult %>%
+    select(capital_gain) %>%
+    filter(capital_gain != 0) %>%
+    ggplot() +
+    geom_histogram(aes(x = capital_gain), color = "darkblue",
+                   fill = "lightblue", bins = 25) +
+    ggtitle("Histogram for capital gain") +
+    theme_bw()
+

+
adult %>%
+    select(capital_loss) %>%
+    filter(capital_loss != 0) %>%
+    ggplot() +
+    geom_histogram(aes(x = capital_loss), color = "darkblue",
+                   fill = "lightblue", bins = 25) +
+    ggtitle("Histogram for capital loss") +
+    theme_bw()
+

+

We see that there is a set of people with a very high capital gain (about 100,000) and the rest of the people form a right-skewed distribution. I suspect that these people with such a high capital gain would fall into the >50K income bracket. Let's find out!

+
adult %>%
+    filter(capital_gain > 90000) %>%
+    group_by(income) %>%
+    count()
+
## # A tibble: 1 x 2
+## # Groups:   income [1]
+##   income     n
+##   <chr>  <int>
+## 1 >50K     159
+

As expected, they are indeed a high-income group! Capital gain might be a good predictor of the income group. Let us now look at the distribution of some categorical variables.

+
adult %>%
+    select(-income) %>%
+    keep(is.character) %>%
+    gather() %>%
+    ggplot(aes(value)) +
+    facet_wrap(~ key, scales = "free", ncol = 4) +
+    geom_bar(color = "darkblue", fill = "lightblue") +
+    theme(axis.text.x = element_text(angle = 90, hjust = 1))
+

+

Let us plot separately some figures which are not so clear in the picture above and break them down into actual numbers.

+
# native country
+adult %>%
+    group_by(native_country) %>%
+    summarise(freq = n()) %>%
+    mutate(prop = freq / sum(freq) * 100) %>%
+    arrange(desc(prop))
+
## # A tibble: 42 x 3
+##    native_country  freq   prop
+##    <chr>          <int>  <dbl>
+##  1 United-States  29170 89.6  
+##  2 Mexico           643  1.97 
+##  3 unknown          583  1.79 
+##  4 Philippines      198  0.608
+##  5 Germany          137  0.421
+##  6 Canada           121  0.372
+##  7 Puerto-Rico      114  0.350
+##  8 El-Salvador      106  0.326
+##  9 India            100  0.307
+## 10 Cuba              95  0.292
+## # ... with 32 more rows
+

Most of the people in the dataset are from the United-States.

+
# workclass
+adult %>%
+    group_by(workclass) %>%
+    summarise(freq = n()) %>%
+    mutate(prop = freq / sum(freq) * 100) %>%
+    arrange(desc(prop))
+
## # A tibble: 9 x 3
+##   workclass         freq    prop
+##   <chr>            <int>   <dbl>
+## 1 Private          22696 69.7   
+## 2 Self-emp-not-inc  2541  7.80  
+## 3 Local-gov         2093  6.43  
+## 4 unknown           1836  5.64  
+## 5 State-gov         1298  3.99  
+## 6 Self-emp-inc      1116  3.43  
+## 7 Federal-gov        960  2.95  
+## 8 Without-pay         14  0.0430
+## 9 Never-worked         7  0.0215
+

As we see, most of the people belong to the Private workforce. Let us remake this plot so as to look at other classes clearly.

+
# distribution without private workclass
+adult %>%
+    filter(workclass != "Private") %>%
+    ggplot() +
+    geom_bar(aes(x = workclass), color = "darkblue",
+                   fill = "lightblue") +
+    ggtitle("Bar plot for Workclass") +
+    theme_bw()
+

+

There are very few people who have never worked or who work without pay. We will now move on to multivariate analysis.

+

Section 3: Multivariate Analysis

+

In this section, we will be analyzing the relationship between different predictors and the response along with some relationships and patterns within the predictors.

+
adult %>%
+    ggplot() +
+    geom_violin(aes(x = income, y = age)) +
+    geom_boxplot(aes(x = income, y = age), width=0.05) +
+    geom_hline(yintercept = 35, color = "red") +
+    geom_hline(yintercept = 50, color = "red") +
+    theme_bw()
+

+

We see that most of the population under the age of 25 earns < 50K a year. This makes sense and most likely, this section of the population would not be earning much at all (as there will be a lot of students in this section). Most of the people who earn > 50K a year are in their 30s and 40s.

+
adult %>%
+    group_by(education, income) %>%
+    summarise(freq = n()) %>%
+    mutate(prop = freq / sum(freq) * 100) %>%
+    filter(income == ">50K") %>%
+    arrange(desc(prop)) %>%
+    ggplot() +
+    geom_col(aes(x = reorder(education, prop), y = prop), color = "darkblue",
+                   fill = "lightblue") +
+    labs(x = "Education Level", y = "Percentage") +
+    theme(axis.text.x = element_text(angle = 45, hjust = 1))
+

+

No surprise here! Adults with a high level of education have a higher proportion of people who earn more than 50K a year. There also are some people with a low education level who are earning > 50K per year. Let us have a closer look at these people.

+
adult %>%
+    filter(education_num < 8, income == ">50K") %>%
+    group_by(workclass) %>%
+    count() %>%
+    ggplot() +
+    geom_col(aes(x = reorder(workclass, n), y = n), color = "darkblue",
+                   fill = "lightblue") +
+    labs(x = "Work class", y = "Count") +
+    theme(axis.text.x = element_text(angle = 45, hjust = 1))
+

+
adult %>%
+    filter(education_num < 8, income == ">50K") %>%
+    group_by(education) %>%
+    count() %>%
+    ggplot() +
+    geom_col(aes(x = reorder(education, n), y = n), color = "darkblue",
+                   fill = "lightblue") +
+    labs(x = "Education level", y = "Count") +
+    theme(axis.text.x = element_text(angle = 45, hjust = 1))
+

+

These people are mostly high-school dropouts who worked for private companies or were self-employed.

+
adult %>%
+    filter(education_num < 8, income == ">50K") %>%
+    group_by(sex) %>%
+    count()
+
## # A tibble: 2 x 2
+## # Groups:   sex [2]
+##   sex        n
+##   <chr>  <int>
+## 1 Female    19
+## 2 Male     192
+

Not surprisingly, most of these people are males (10:1 male-female ratio). We don't see many cases of women dropping out from schools and earning really high amounts of money. This probably reflects that men are more prone to risk taking than most women.

+
# proportion of high earning people by occupation
+adult %>%
+    group_by(occupation, income) %>%
+    summarise(freq = n()) %>%
+    mutate(prop = freq / sum(freq) * 100) %>%
+    filter(income == ">50K") %>%
+    arrange(desc(prop)) %>%
+    ggplot() +
+    geom_col(aes(x = reorder(occupation, prop), y = prop), color = "darkblue",
+                   fill = "lightblue") +
+    labs(x = "Occupation", y = "Percentage") +
+    theme(axis.text.x = element_text(angle = 45, hjust = 1))
+

+
adult %>%
+    ggplot() +
+    geom_point(aes(x = hours_per_week, y = education_num, color = income)) +
+    labs(x = "Working hours per week", y = "Education level") +
+    theme_bw()
+

+

A lot of people who work > 50 hours per week have an income > 50K. Also, there is a bunch of people who don't spend a lot of time working, are really well-educated and earn a high income! They are probably the ones who earn fortunes for a few minutes of their lives.

+
adult %>%
+    ggplot() +
+    geom_jitter(aes(x = marital_status, y = hours_per_week, color = income),
+                alpha = 0.1) +
+    labs(y = "Hours per week", x = "Marital Status") +
+    geom_hline(yintercept = 40, color = "darkgreen") +
+    theme(axis.text.x = element_text(angle = 45, hjust = 1))
+

+

It seems that people who never married mostly have an income of less than 50K a year. Also, most of the people who earn > 50K a year are married civilians (Married-civ-spouse) who work > 40 hours per week.

+
# distribution of income by race
+adult %>%
+    ggplot() +
+    geom_bar(aes(x = race, fill = income), position = "dodge2") +
+    labs(x = "Race") +
+    theme(axis.text.x = element_text(angle = 45, hjust = 1))
+

+
# distribution of income by race, excluding "White" so that the smaller groups are readable
+adult %>%
+    group_by(race, income) %>%
+    summarise(freq = n()) %>%
+    # summarise() drops the last grouping level (income), so sum(freq) below is the
+    # per-race total and perc is the share of each income class within a race
+    mutate(perc = freq / sum(freq) * 100) %>%
+    filter(race != "White") %>%
+    ggplot() +
+    geom_col(aes(x = race, y = freq, fill = income), position = "dodge2") +
+    geom_text(aes(x = race, y = freq, label = paste0(round(perc), "%")),
+              size = 4, position = position_dodge2(width = 0.8)) +
+    # the x axis shows race here (the label previously read "Sex")
+    labs(x = "Race", y = "Count") +
+    theme(axis.text.x = element_text(angle = 45, hjust = 1))
+

+

The ratio of people with an income >50K to the ones with an income <=50K seems to be nearly constant across all races.

+
adult %>%
+    ggplot() +
+    geom_bar(aes(x = race), color = "darkblue",
+                   fill = "lightblue") +
+    facet_wrap(~ education, scales = "free_y") +
+    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+    labs(x = "Race") +
+    ggtitle("Distribution of race by education level")
+

+
+

It is surprising to see the reduced number of black people as compared to whites in the higher level of education categories like Doctorate, Prof-school and Masters. Let us look at the proportion of high earning people by native country.

+
+
# percentage of high earning people by native-country
+adult %>%
+    group_by(native_country, income) %>%
+    summarise(freq = n()) %>%
+    mutate(prop = freq / sum(freq) * 100) %>%
+    filter(income == ">50K") %>%
+    arrange(desc(prop)) %>%
+    ggplot() +
+    geom_col(aes(x = reorder(native_country, prop), y = prop), color = "darkblue",
+                   fill = "lightblue") +
+    labs(x = "Native Country", y = "Percentage") +
+    coord_flip()
+

+

The distribution we see above shows that migrants to the United-States from some countries are mostly high earning people, while there are also many countries whose migrants don't earn a high income. The United-States lies somewhere near the middle of these two categories of countries.

+
# distribution of income by sex
+adult %>%
+    group_by(sex, income) %>%
+    summarise(freq = n()) %>%  # one row per (sex, income) pair with its row count
+    mutate(perc = freq / sum(freq) * 100) %>%  # share of each income class within its sex
+    ggplot() +
+    geom_col(aes(x = sex, y = freq, fill = income), position = "dodge2") +  # bar height is the count
+    labs(x = "Sex", y = "Count") +
+    geom_text(aes(x = sex, y = freq, label = paste0(round(perc), "%")),  # label is the rounded percentage
+              size = 4, position = position_dodge2(width = 0.8)) +
+    theme_bw()
+

+

We see that only 11% of females earn more than 50K a year whereas the percentage is around 3 times higher in males (31%).

+

Section 4: Data Cleaning

+
# combining train and test for cleaning and export
+
+test <- test %>%  # rename the test columns; presumably these match adult's columns, as rbind() below needs - verify
+    magrittr::set_colnames(c("age", "workclass", "fnlwgt", "education", "education_num",
+                             "marital_status", "occupation", "relationship", "race",
+                             "sex", "capital_gain", "capital_loss", "hours_per_week",
+                             "native_country", "income"))
+
+test <- test %>%
+    mutate(income = str_trim(income, side = c("left")),  # strip leading whitespace first
+           income = ifelse(income == "<=50K.", "<=50K", ">50K"),  # "<=50K." -> "<=50K"; any other non-missing value -> ">50K"
+           occupation = ifelse(occupation == " ?", "unknown", as.character(occupation)),  # " ?" is matched before trimming, hence the leading space
+           workclass = ifelse(workclass == " ?", "unknown", as.character(workclass)),
+           native_country = ifelse(native_country == " ?", "unknown", as.character(native_country)),
+           workclass = str_trim(workclass, side = c("left")),  # from here on: strip leading whitespace from the text columns
+           education = str_trim(education, side = c("left")),
+           marital_status = str_trim(marital_status, side = c("left")),
+           occupation = str_trim(occupation, side = c("left")),
+           relationship = str_trim(relationship, side = c("left")),
+           sex = str_trim(sex, side = c("left")),
+           race = str_trim(race, side = c("left")),
+           native_country = str_trim(native_country, side = c("left")))
+
+adult <- rbind(adult, test)  # test rows are appended after the existing adult rows
+
+

First, we will combine similar categories in different categorical variables into a smaller number of categories.

+
# combining categories in workclass
+adult <- adult %>%
+    mutate(workclass = replace(workclass, workclass %in% c('State-gov', 'Federal-gov',
+                                                 'Local-gov'), 0),  # government employers -> 0
+           workclass = replace(workclass, workclass %in% c('Self-emp-not-inc', 'Self-emp-inc',
+                                                 'Without-pay', 'Never-worked'), 1),  # self-employed, without pay, never worked -> 1
+           workclass = replace(workclass, workclass %in% c('Private'), 2),  # private -> 2
+           workclass = replace(workclass, workclass %in% c('unknown'), -1))  # unknown -> -1
+
+
+# combining categories in marital_status
+adult <- adult %>%
+    mutate(marital_status = replace(marital_status, marital_status %in% c('Married-civ-spouse',
+                                                           'Married-spouse-absent',
+                                                           'Married-AF-spouse'), 0),  # the three Married-* values -> 0
+           marital_status = replace(marital_status, marital_status %in% c('Never-married','Divorced',
+                                                           'Separated','Widowed'), 1))  # never married, divorced, separated, widowed -> 1
+# combining categories in education
+adult <- adult %>%
+    select(-education_num) %>%  # drop the education_num column
+    mutate(education = replace(education, education %in% c("HS-grad", "11th", "9th", "7th-8th",
+                                                           "5th-6th", "10th", "Preschool", "12th",
+                                                           "1st-4th"), 0),  # HS-grad and lower grades -> 0
+           education = replace(education, education %in% c("Bachelors", "Some-college", "Assoc-acdm",
+                                                           "Assoc-voc"), 1),  # Bachelors, Some-college, Assoc-* -> 1
+           education = replace(education, education %in% c("Masters", "Prof-school",
+                                                           "Doctorate"), 2))  # Masters, Prof-school, Doctorate -> 2
+
+
+# combining categories in occupation
+adult <- adult %>%
+    mutate(occupation = replace(occupation, occupation %in% c("Priv-house-serv", "Handlers-cleaners",
+                                                              "Other-service", "Armed-Forces",
+                                                              "Machine-op-inspct", "Farming-fishing",
+                                                              "Adm-clerical"), 0),  # first occupation group -> 0
+           occupation = replace(occupation, occupation %in% c("Tech-support", "Craft-repair",
+                                                              "Protective-serv", "Transport-moving",
+                                                              "Sales"), 1),  # second occupation group -> 1
+           occupation = replace(occupation, occupation %in% c("Exec-managerial", "Prof-specialty"), 2),  # managerial / professional -> 2
+           occupation = replace(occupation, occupation %in% c("unknown"), -1))  # unknown -> -1
+
+
+# combining categories in race
+adult <- adult %>%
+    mutate(race = replace(race, race %in% c("White"), 0),  # White -> 0
+           race = replace(race, race %in% c("Black"), 1),  # Black -> 1
+           race = replace(race, race %in% c("Asian-Pac-Islander",
+                                            "Amer-Indian-Eskimo", "Other"), 2))  # the remaining listed races -> 2
+
+
+# handling sex and native country and target
+adult <- adult %>%
+    mutate(sex = ifelse(sex == "Male", 1, 0),  # "Male" -> 1, any other value -> 0
+           native_country = ifelse(native_country == "United-States", 1, 0),  # "United-States" -> 1, any other value -> 0
+           income = ifelse(income == "<=50K", 0, 1))  # "<=50K" -> 0, any other value -> 1
+
+
+# removing unnecessary variables eg: relationship, as information
+# already encoded in sex and marital status
+adult <- adult %>%
+    select(-c(relationship, fnlwgt))  # fnlwgt is dropped here as well
+
+
+train <- adult[1:(nrow(adult) - nrow(test)), ]  # every row before the last nrow(test) rows
+test <- adult[(nrow(adult) - nrow(test) + 1):nrow(adult), ]  # last nrow(test) rows; nrow(test) is read before test is overwritten
+

Now that we have cleaned the dataset, let's export it so that it can be used in model building directly.

+
# exporting the datasets
+write_csv(train, "data/train.csv", col_names = TRUE)  # assumes a data/ directory exists under the working directory - TODO confirm
+write_csv(test, "data/test.csv", col_names = TRUE)  # header row is written for both files
+ + + diff --git a/eda_adult.md b/eda_adult.md index de0f969..89e3c4d 100644 --- a/eda_adult.md +++ b/eda_adult.md @@ -3,6 +3,8 @@ Adult Dataset ``` r # importing the required libraries +# uncomment the below line if tidyverse not already installed +# install.packages("tidyverse") suppressPackageStartupMessages(library(tidyverse)) ``` diff --git a/eda_adult.pdf b/eda_adult.pdf index 5c4cb07..79c3bcd 100644 Binary files a/eda_adult.pdf and b/eda_adult.pdf differ diff --git a/eda_adult_files/figure-markdown_github/unnamed-chunk-11-1.png b/eda_adult_files/figure-markdown_github/unnamed-chunk-11-1.png new file mode 100644 index 0000000..f35fd3e Binary files /dev/null and b/eda_adult_files/figure-markdown_github/unnamed-chunk-11-1.png differ diff --git a/eda_adult_files/figure-markdown_github/unnamed-chunk-14-1.png b/eda_adult_files/figure-markdown_github/unnamed-chunk-14-1.png new file mode 100644 index 0000000..0baa9b1 Binary files /dev/null and b/eda_adult_files/figure-markdown_github/unnamed-chunk-14-1.png differ diff --git a/eda_adult_files/figure-markdown_github/unnamed-chunk-15-1.png b/eda_adult_files/figure-markdown_github/unnamed-chunk-15-1.png new file mode 100644 index 0000000..e29ef56 Binary files /dev/null and b/eda_adult_files/figure-markdown_github/unnamed-chunk-15-1.png differ diff --git a/eda_adult_files/figure-markdown_github/unnamed-chunk-16-1.png b/eda_adult_files/figure-markdown_github/unnamed-chunk-16-1.png new file mode 100644 index 0000000..0a02097 Binary files /dev/null and b/eda_adult_files/figure-markdown_github/unnamed-chunk-16-1.png differ diff --git a/eda_adult_files/figure-markdown_github/unnamed-chunk-17-1.png b/eda_adult_files/figure-markdown_github/unnamed-chunk-17-1.png new file mode 100644 index 0000000..a3b008b Binary files /dev/null and b/eda_adult_files/figure-markdown_github/unnamed-chunk-17-1.png differ diff --git a/eda_adult_files/figure-markdown_github/unnamed-chunk-18-1.png 
b/eda_adult_files/figure-markdown_github/unnamed-chunk-18-1.png new file mode 100644 index 0000000..79e7c1e Binary files /dev/null and b/eda_adult_files/figure-markdown_github/unnamed-chunk-18-1.png differ diff --git a/eda_adult_files/figure-markdown_github/unnamed-chunk-20-1.png b/eda_adult_files/figure-markdown_github/unnamed-chunk-20-1.png new file mode 100644 index 0000000..f9d7d12 Binary files /dev/null and b/eda_adult_files/figure-markdown_github/unnamed-chunk-20-1.png differ diff --git a/eda_adult_files/figure-markdown_github/unnamed-chunk-21-1.png b/eda_adult_files/figure-markdown_github/unnamed-chunk-21-1.png new file mode 100644 index 0000000..bf8f0a0 Binary files /dev/null and b/eda_adult_files/figure-markdown_github/unnamed-chunk-21-1.png differ diff --git a/eda_adult_files/figure-markdown_github/unnamed-chunk-22-1.png b/eda_adult_files/figure-markdown_github/unnamed-chunk-22-1.png new file mode 100644 index 0000000..e6de770 Binary files /dev/null and b/eda_adult_files/figure-markdown_github/unnamed-chunk-22-1.png differ diff --git a/eda_adult_files/figure-markdown_github/unnamed-chunk-23-1.png b/eda_adult_files/figure-markdown_github/unnamed-chunk-23-1.png new file mode 100644 index 0000000..7ade373 Binary files /dev/null and b/eda_adult_files/figure-markdown_github/unnamed-chunk-23-1.png differ diff --git a/eda_adult_files/figure-markdown_github/unnamed-chunk-24-1.png b/eda_adult_files/figure-markdown_github/unnamed-chunk-24-1.png new file mode 100644 index 0000000..b3c22d0 Binary files /dev/null and b/eda_adult_files/figure-markdown_github/unnamed-chunk-24-1.png differ diff --git a/eda_adult_files/figure-markdown_github/unnamed-chunk-25-1.png b/eda_adult_files/figure-markdown_github/unnamed-chunk-25-1.png new file mode 100644 index 0000000..6befbd6 Binary files /dev/null and b/eda_adult_files/figure-markdown_github/unnamed-chunk-25-1.png differ diff --git a/eda_adult_files/figure-markdown_github/unnamed-chunk-26-1.png 
b/eda_adult_files/figure-markdown_github/unnamed-chunk-26-1.png new file mode 100644 index 0000000..70392d9 Binary files /dev/null and b/eda_adult_files/figure-markdown_github/unnamed-chunk-26-1.png differ diff --git a/eda_adult_files/figure-markdown_github/unnamed-chunk-27-1.png b/eda_adult_files/figure-markdown_github/unnamed-chunk-27-1.png new file mode 100644 index 0000000..17d332d Binary files /dev/null and b/eda_adult_files/figure-markdown_github/unnamed-chunk-27-1.png differ diff --git a/eda_adult_files/figure-markdown_github/unnamed-chunk-8-1.png b/eda_adult_files/figure-markdown_github/unnamed-chunk-8-1.png new file mode 100644 index 0000000..1d13193 Binary files /dev/null and b/eda_adult_files/figure-markdown_github/unnamed-chunk-8-1.png differ diff --git a/eda_adult_files/figure-markdown_github/unnamed-chunk-9-1.png b/eda_adult_files/figure-markdown_github/unnamed-chunk-9-1.png new file mode 100644 index 0000000..1f4af30 Binary files /dev/null and b/eda_adult_files/figure-markdown_github/unnamed-chunk-9-1.png differ diff --git a/eda_adult_files/figure-markdown_github/unnamed-chunk-9-2.png b/eda_adult_files/figure-markdown_github/unnamed-chunk-9-2.png new file mode 100644 index 0000000..990a739 Binary files /dev/null and b/eda_adult_files/figure-markdown_github/unnamed-chunk-9-2.png differ diff --git a/requirements.txt b/requirements.txt index 19a4d4b..e1c849d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,7 @@ scikit-learn pandas +numpy==1.15.* +pytest==3.5.1 flask gunicorn peewee