Skip to content

Commit b0ae13a

Browse files
committed
add data cleaning tutorial
1 parent de76901 commit b0ae13a

14 files changed

+440
-0
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepy
120120
- [How to Plot Weather Temperature in Python](https://www.thepythoncode.com/article/interactive-weather-plot-with-matplotlib-and-requests). ([code](general/interactive-weather-plot/))
121121
- [How to Generate SVG Country Maps in Python](https://www.thepythoncode.com/article/generate-svg-country-maps-python). ([code](general/generate-svg-country-map))
122122
- [How to Query the Ethereum Blockchain with Python](https://www.thepythoncode.com/article/query-ethereum-blockchain-with-python). ([code](general/query-ethereum))
123+
- [Data Cleaning with Pandas in Python](https://www.thepythoncode.com/article/data-cleaning-using-pandas-in-python). ([code](general/data-cleaning-pandas))
123124

124125

125126

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# [Data Cleaning with Pandas in Python](https://www.thepythoncode.com/article/data-cleaning-using-pandas-in-python)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import pandas as pd

# Display settings: show every column and cap printed rows at 12.
# The option keys must be fully qualified ('display.*'): the bare
# 'max_columns'/'max_rows' shorthands are ambiguous in pandas >= 1.4
# (they also match 'styler.render.*') and raise an OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)


def main():
    """Load the simulated donations CSV and preview the first 10 rows."""
    data_frames = pd.read_csv(r'simulated_data.csv')
    print(data_frames.head(10))


if __name__ == '__main__':
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import pandas as pd

# Display settings: show every column and cap printed rows at 12.
# The option keys must be fully qualified ('display.*'): the bare
# 'max_columns'/'max_rows' shorthands are ambiguous in pandas >= 1.4
# (they also match 'styler.render.*') and raise an OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)


def main():
    """Load the simulated donations CSV and print a column/dtype summary."""
    data_frames = pd.read_csv(r'simulated_data.csv')
    print(data_frames.info())


if __name__ == '__main__':
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import pandas as pd

# Display settings (fully qualified keys; bare 'max_rows'/'max_columns'
# are ambiguous in pandas >= 1.4 and raise an OptionError).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)


def convert_donations(df):
    """Convert the 'donation' column from '$x.yz' strings to float64.

    Mutates *df* in place and returns it for convenience.
    """
    # Remove the '$' sign so the strings parse as numbers.
    df['donation'] = df['donation'].str.strip('$')
    # Convert donation strings into a numerical data type.
    df['donation'] = df['donation'].astype('float64')
    return df


def main():
    """Load the simulated donations CSV, convert dtypes, and preview."""
    data_frames = pd.read_csv(r'simulated_data.csv')
    convert_donations(data_frames)
    print(data_frames.head(10))
    print(data_frames.info())


if __name__ == '__main__':
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import pandas as pd

# Display settings (fully qualified keys; bare 'max_rows'/'max_columns'
# are ambiguous in pandas >= 1.4 and raise an OptionError).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)


def convert_donations(df):
    """Convert the 'donation' column from '$x.yz' strings to float64 in place."""
    # Remove the '$' sign so the strings parse as numbers.
    df['donation'] = df['donation'].str.strip('$')
    df['donation'] = df['donation'].astype('float64')
    return df


def capitalize_street(address):
    """Capitalize every word of a street address except the first token.

    The first token (the house number) is deliberately left untouched,
    matching the original behavior. Non-string or empty values pass
    through unchanged.
    """
    if not isinstance(address, str):
        return address
    words = address.split()
    if not words:
        return address
    return ' '.join([words[0]] + [w.capitalize() for w in words[1:]])


def main():
    """Load the CSV, convert donation dtypes, capitalize street addresses."""
    data_frames = pd.read_csv(r'simulated_data.csv')
    convert_donations(data_frames)
    # Pure string -> string transform replaces the original fragile
    # split / mutate-in-place / join sequence — output is identical.
    data_frames['street_address'] = data_frames['street_address'].apply(capitalize_street)
    print(data_frames['street_address'])


if __name__ == '__main__':
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import pandas as pd

# Display settings (fully qualified keys; bare 'max_rows'/'max_columns'
# are ambiguous in pandas >= 1.4 and raise an OptionError).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)


def convert_donations(df):
    """Convert the 'donation' column from '$x.yz' strings to float64 in place."""
    # Remove the '$' sign so the strings parse as numbers.
    df['donation'] = df['donation'].str.strip('$')
    df['donation'] = df['donation'].astype('float64')
    return df


def normalize(word):
    """Expand common street abbreviations ('st', 'rd') and capitalize."""
    expansions = {'st': 'street', 'rd': 'road'}
    return expansions.get(word.lower(), word).capitalize()


def normalize_street(address):
    """Normalize a street address, leaving the leading house number as-is.

    Non-string or empty values pass through unchanged.
    """
    if not isinstance(address, str):
        return address
    words = address.split()
    if not words:
        return address
    return ' '.join([words[0]] + [normalize(w) for w in words[1:]])


def main():
    """Load the CSV, convert donation dtypes, normalize street addresses."""
    data_frames = pd.read_csv(r'simulated_data.csv')
    convert_donations(data_frames)
    # Pure string -> string transform replaces the original fragile
    # split / mutate-in-place / join sequence — output is identical.
    data_frames['street_address'] = data_frames['street_address'].apply(normalize_street)
    print(data_frames.head(10))


if __name__ == '__main__':
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import pandas as pd

# Display settings (fully qualified keys; bare 'max_rows'/'max_columns'
# are ambiguous in pandas >= 1.4 and raise an OptionError).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)


def convert_donations(df):
    """Convert the 'donation' column from '$x.yz' strings to float64 in place."""
    # Remove the '$' sign so the strings parse as numbers.
    df['donation'] = df['donation'].str.strip('$')
    df['donation'] = df['donation'].astype('float64')
    return df


def normalize(word):
    """Expand common street abbreviations ('st', 'rd') and capitalize."""
    expansions = {'st': 'street', 'rd': 'road'}
    return expansions.get(word.lower(), word).capitalize()


def normalize_street(address):
    """Normalize a street address, leaving the leading house number as-is."""
    if not isinstance(address, str):
        return address
    words = address.split()
    if not words:
        return address
    return ' '.join([words[0]] + [normalize(w) for w in words[1:]])


def mask_out_of_range(df):
    """Blank out-of-range (negative) donation amounts in place.

    NOTE: despite the original "keep only the rows" comment, the
    index-aligned reassignment only sets the offending values to NaN —
    the rows themselves remain. That behavior is preserved here.
    """
    df['donation'] = df['donation'].mask(df['donation'] < 0)
    return df


def main():
    """Load the CSV, convert dtypes, normalize addresses, mask bad donations."""
    data_frames = pd.read_csv(r'simulated_data.csv')
    convert_donations(data_frames)
    data_frames['street_address'] = data_frames['street_address'].apply(normalize_street)
    mask_out_of_range(data_frames)
    print(data_frames.head(10))


if __name__ == '__main__':
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import pandas as pd

# Display settings (fully qualified keys; bare 'max_rows'/'max_columns'
# are ambiguous in pandas >= 1.4 and raise an OptionError).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)


def convert_donations(df):
    """Convert the 'donation' column from '$x.yz' strings to float64 in place."""
    # Remove the '$' sign so the strings parse as numbers.
    df['donation'] = df['donation'].str.strip('$')
    df['donation'] = df['donation'].astype('float64')
    return df


def normalize(word):
    """Expand common street abbreviations ('st', 'rd') and capitalize."""
    expansions = {'st': 'street', 'rd': 'road'}
    return expansions.get(word.lower(), word).capitalize()


def normalize_street(address):
    """Normalize a street address, leaving the leading house number as-is."""
    if not isinstance(address, str):
        return address
    words = address.split()
    if not words:
        return address
    return ' '.join([words[0]] + [normalize(w) for w in words[1:]])


def mask_out_of_range(df):
    """Blank (NaN) negative donation amounts in place; rows are kept, not dropped."""
    df['donation'] = df['donation'].mask(df['donation'] < 0)
    return df


def drop_duplicate_donors(df):
    """Return *df* without repeat rows for the same donor name + address.

    The first occurrence is kept, matching the original keep='first'.
    """
    columns_to_check = ['first_name', 'last_name', 'street_address', 'city', 'state']
    return df.drop_duplicates(subset=columns_to_check, keep='first')


def main():
    """Run the cleaning steps and summarize the deduplicated frame."""
    data_frames = pd.read_csv(r'simulated_data.csv')
    convert_donations(data_frames)
    data_frames['street_address'] = data_frames['street_address'].apply(normalize_street)
    mask_out_of_range(data_frames)
    data_frames_no_dupes = drop_duplicate_donors(data_frames)
    print(data_frames_no_dupes.info())


if __name__ == '__main__':
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import pandas as pd

# Display settings (fully qualified keys; bare 'max_rows'/'max_columns'
# are ambiguous in pandas >= 1.4 and raise an OptionError).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)


def convert_donations(df):
    """Convert the 'donation' column from '$x.yz' strings to float64 in place."""
    # Remove the '$' sign so the strings parse as numbers.
    df['donation'] = df['donation'].str.strip('$')
    df['donation'] = df['donation'].astype('float64')
    return df


def normalize(word):
    """Expand common street abbreviations ('st', 'rd') and capitalize."""
    expansions = {'st': 'street', 'rd': 'road'}
    return expansions.get(word.lower(), word).capitalize()


def normalize_street(address):
    """Normalize a street address, leaving the leading house number as-is."""
    if not isinstance(address, str):
        return address
    words = address.split()
    if not words:
        return address
    return ' '.join([words[0]] + [normalize(w) for w in words[1:]])


def mask_out_of_range(df):
    """Blank (NaN) negative donation amounts in place; rows are kept, not dropped."""
    df['donation'] = df['donation'].mask(df['donation'] < 0)
    return df


def drop_duplicate_donors(df):
    """Return *df* without repeat rows for the same donor name + address."""
    columns_to_check = ['first_name', 'last_name', 'street_address', 'city', 'state']
    return df.drop_duplicates(subset=columns_to_check, keep='first')


def drop_missing(df):
    """Return *df* without rows missing a state or a donation amount.

    This is also the step that removes rows whose out-of-range donation
    was previously blanked to NaN.
    """
    return df.dropna(subset=['state', 'donation'])


def main():
    """Run the cleaning steps and preview the fully cleaned frame."""
    data_frames = pd.read_csv(r'simulated_data.csv')
    convert_donations(data_frames)
    data_frames['street_address'] = data_frames['street_address'].apply(normalize_street)
    mask_out_of_range(data_frames)
    data_frames_no_missing = drop_missing(drop_duplicate_donors(data_frames))
    print(data_frames_no_missing.head(20))


if __name__ == '__main__':
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import pandas as pd

# Display settings (fully qualified keys; bare 'max_rows'/'max_columns'
# are ambiguous in pandas >= 1.4 and raise an OptionError).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)


def convert_donations(df):
    """Convert the 'donation' column from '$x.yz' strings to float64 in place."""
    # Remove the '$' sign so the strings parse as numbers.
    df['donation'] = df['donation'].str.strip('$')
    df['donation'] = df['donation'].astype('float64')
    return df


def normalize(word):
    """Expand common street abbreviations ('st', 'rd') and capitalize."""
    expansions = {'st': 'street', 'rd': 'road'}
    return expansions.get(word.lower(), word).capitalize()


def normalize_street(address):
    """Normalize a street address, leaving the leading house number as-is."""
    if not isinstance(address, str):
        return address
    words = address.split()
    if not words:
        return address
    return ' '.join([words[0]] + [normalize(w) for w in words[1:]])


def mask_out_of_range(df):
    """Blank (NaN) negative donation amounts in place; rows are kept, not dropped."""
    df['donation'] = df['donation'].mask(df['donation'] < 0)
    return df


def drop_duplicate_donors(df):
    """Return *df* without repeat rows for the same donor name + address."""
    columns_to_check = ['first_name', 'last_name', 'street_address', 'city', 'state']
    return df.drop_duplicates(subset=columns_to_check, keep='first')


def drop_missing(df):
    """Return *df* without rows missing a state or a donation amount.

    This also removes rows whose out-of-range donation was blanked to NaN.
    """
    return df.dropna(subset=['state', 'donation'])


def main():
    """Run the full cleaning pipeline and write the cleaned CSV."""
    data_frames = pd.read_csv(r'simulated_data.csv')
    convert_donations(data_frames)
    data_frames['street_address'] = data_frames['street_address'].apply(normalize_street)
    mask_out_of_range(data_frames)
    data_frames_no_missing = drop_missing(drop_duplicate_donors(data_frames))
    print(data_frames_no_missing.head(20))
    data_frames_no_missing.to_csv(r'clean_donations_data.csv', index=False)


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)