Skip to content

Commit b0ae13a

Browse files
committed
add data cleaning tutorial
1 parent de76901 commit b0ae13a

14 files changed

+440
-0
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepy
120120
- [How to Plot Weather Temperature in Python](https://www.thepythoncode.com/article/interactive-weather-plot-with-matplotlib-and-requests). ([code](general/interactive-weather-plot/))
121121
- [How to Generate SVG Country Maps in Python](https://www.thepythoncode.com/article/generate-svg-country-maps-python). ([code](general/generate-svg-country-map))
122122
- [How to Query the Ethereum Blockchain with Python](https://www.thepythoncode.com/article/query-ethereum-blockchain-with-python). ([code](general/query-ethereum))
123+
- [Data Cleaning with Pandas in Python](https://www.thepythoncode.com/article/data-cleaning-using-pandas-in-python). ([code](general/data-cleaning-pandas))
123124

124125

125126

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# [Data Cleaning with Pandas in Python](https://www.thepythoncode.com/article/data-cleaning-using-pandas-in-python)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import pandas as pd

# Display settings: show every column and cap printed rows at 12.
# The option keys must be fully qualified ('display.*'): the bare
# 'max_columns'/'max_rows' shorthands are ambiguous in pandas >= 1.4
# (they also match 'styler.render.*') and raise an OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)


def main():
    """Load the simulated donations CSV and preview the first 10 rows."""
    data_frames = pd.read_csv(r'simulated_data.csv')
    print(data_frames.head(10))


if __name__ == '__main__':
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import pandas as pd

# Display settings: show every column and cap printed rows at 12.
# The option keys must be fully qualified ('display.*'): the bare
# 'max_columns'/'max_rows' shorthands are ambiguous in pandas >= 1.4
# (they also match 'styler.render.*') and raise an OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)


def main():
    """Load the simulated donations CSV and print a column/dtype summary."""
    data_frames = pd.read_csv(r'simulated_data.csv')
    print(data_frames.info())


if __name__ == '__main__':
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import pandas as pd

# Display settings (fully qualified keys; bare 'max_rows'/'max_columns'
# are ambiguous in pandas >= 1.4 and raise an OptionError).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)


def convert_donations(df):
    """Convert the 'donation' column from '$x.yz' strings to float64.

    Mutates *df* in place and returns it for convenience.
    """
    # Remove the '$' sign so the strings parse as numbers.
    df['donation'] = df['donation'].str.strip('$')
    # Convert donation strings into a numerical data type.
    df['donation'] = df['donation'].astype('float64')
    return df


def main():
    """Load the simulated donations CSV, convert dtypes, and preview."""
    data_frames = pd.read_csv(r'simulated_data.csv')
    convert_donations(data_frames)
    print(data_frames.head(10))
    print(data_frames.info())


if __name__ == '__main__':
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import pandas as pd

# Display settings (fully qualified keys; bare 'max_rows'/'max_columns'
# are ambiguous in pandas >= 1.4 and raise an OptionError).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)


def convert_donations(df):
    """Convert the 'donation' column from '$x.yz' strings to float64 in place."""
    # Remove the '$' sign so the strings parse as numbers.
    df['donation'] = df['donation'].str.strip('$')
    df['donation'] = df['donation'].astype('float64')
    return df


def capitalize_street(address):
    """Capitalize every word of a street address except the first token.

    The first token (the house number) is deliberately left untouched,
    matching the original behavior. Non-string or empty values pass
    through unchanged.
    """
    if not isinstance(address, str):
        return address
    words = address.split()
    if not words:
        return address
    return ' '.join([words[0]] + [w.capitalize() for w in words[1:]])


def main():
    """Load the CSV, convert donation dtypes, capitalize street addresses."""
    data_frames = pd.read_csv(r'simulated_data.csv')
    convert_donations(data_frames)
    # Pure string -> string transform replaces the original fragile
    # split / mutate-in-place / join sequence — output is identical.
    data_frames['street_address'] = data_frames['street_address'].apply(capitalize_street)
    print(data_frames['street_address'])


if __name__ == '__main__':
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import pandas as pd

# Display settings (fully qualified keys; bare 'max_rows'/'max_columns'
# are ambiguous in pandas >= 1.4 and raise an OptionError).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)


def convert_donations(df):
    """Convert the 'donation' column from '$x.yz' strings to float64 in place."""
    # Remove the '$' sign so the strings parse as numbers.
    df['donation'] = df['donation'].str.strip('$')
    df['donation'] = df['donation'].astype('float64')
    return df


def normalize(word):
    """Expand common street abbreviations ('st', 'rd') and capitalize."""
    expansions = {'st': 'street', 'rd': 'road'}
    return expansions.get(word.lower(), word).capitalize()


def normalize_street(address):
    """Normalize a street address, leaving the leading house number as-is.

    Non-string or empty values pass through unchanged.
    """
    if not isinstance(address, str):
        return address
    words = address.split()
    if not words:
        return address
    return ' '.join([words[0]] + [normalize(w) for w in words[1:]])


def main():
    """Load the CSV, convert donation dtypes, normalize street addresses."""
    data_frames = pd.read_csv(r'simulated_data.csv')
    convert_donations(data_frames)
    # Pure string -> string transform replaces the original fragile
    # split / mutate-in-place / join sequence — output is identical.
    data_frames['street_address'] = data_frames['street_address'].apply(normalize_street)
    print(data_frames.head(10))


if __name__ == '__main__':
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import pandas as pd

# Display settings (fully qualified keys; bare 'max_rows'/'max_columns'
# are ambiguous in pandas >= 1.4 and raise an OptionError).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)


def convert_donations(df):
    """Convert the 'donation' column from '$x.yz' strings to float64 in place."""
    # Remove the '$' sign so the strings parse as numbers.
    df['donation'] = df['donation'].str.strip('$')
    df['donation'] = df['donation'].astype('float64')
    return df


def normalize(word):
    """Expand common street abbreviations ('st', 'rd') and capitalize."""
    expansions = {'st': 'street', 'rd': 'road'}
    return expansions.get(word.lower(), word).capitalize()


def normalize_street(address):
    """Normalize a street address, leaving the leading house number as-is."""
    if not isinstance(address, str):
        return address
    words = address.split()
    if not words:
        return address
    return ' '.join([words[0]] + [normalize(w) for w in words[1:]])


def mask_out_of_range(df):
    """Blank out-of-range (negative) donation amounts in place.

    NOTE: despite the original "keep only the rows" comment, the
    index-aligned reassignment only sets the offending values to NaN —
    the rows themselves remain. That behavior is preserved here.
    """
    df['donation'] = df['donation'].mask(df['donation'] < 0)
    return df


def main():
    """Load the CSV, convert dtypes, normalize addresses, mask bad donations."""
    data_frames = pd.read_csv(r'simulated_data.csv')
    convert_donations(data_frames)
    data_frames['street_address'] = data_frames['street_address'].apply(normalize_street)
    mask_out_of_range(data_frames)
    print(data_frames.head(10))


if __name__ == '__main__':
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import pandas as pd

# Display settings (fully qualified keys; bare 'max_rows'/'max_columns'
# are ambiguous in pandas >= 1.4 and raise an OptionError).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)


def convert_donations(df):
    """Convert the 'donation' column from '$x.yz' strings to float64 in place."""
    # Remove the '$' sign so the strings parse as numbers.
    df['donation'] = df['donation'].str.strip('$')
    df['donation'] = df['donation'].astype('float64')
    return df


def normalize(word):
    """Expand common street abbreviations ('st', 'rd') and capitalize."""
    expansions = {'st': 'street', 'rd': 'road'}
    return expansions.get(word.lower(), word).capitalize()


def normalize_street(address):
    """Normalize a street address, leaving the leading house number as-is."""
    if not isinstance(address, str):
        return address
    words = address.split()
    if not words:
        return address
    return ' '.join([words[0]] + [normalize(w) for w in words[1:]])


def mask_out_of_range(df):
    """Blank (NaN) negative donation amounts in place; rows are kept, not dropped."""
    df['donation'] = df['donation'].mask(df['donation'] < 0)
    return df


def drop_duplicate_donors(df):
    """Return *df* without repeat rows for the same donor name + address.

    The first occurrence is kept, matching the original keep='first'.
    """
    columns_to_check = ['first_name', 'last_name', 'street_address', 'city', 'state']
    return df.drop_duplicates(subset=columns_to_check, keep='first')


def main():
    """Run the cleaning steps and summarize the deduplicated frame."""
    data_frames = pd.read_csv(r'simulated_data.csv')
    convert_donations(data_frames)
    data_frames['street_address'] = data_frames['street_address'].apply(normalize_street)
    mask_out_of_range(data_frames)
    data_frames_no_dupes = drop_duplicate_donors(data_frames)
    print(data_frames_no_dupes.info())


if __name__ == '__main__':
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import pandas as pd

# Display settings (fully qualified keys; bare 'max_rows'/'max_columns'
# are ambiguous in pandas >= 1.4 and raise an OptionError).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)


def convert_donations(df):
    """Convert the 'donation' column from '$x.yz' strings to float64 in place."""
    # Remove the '$' sign so the strings parse as numbers.
    df['donation'] = df['donation'].str.strip('$')
    df['donation'] = df['donation'].astype('float64')
    return df


def normalize(word):
    """Expand common street abbreviations ('st', 'rd') and capitalize."""
    expansions = {'st': 'street', 'rd': 'road'}
    return expansions.get(word.lower(), word).capitalize()


def normalize_street(address):
    """Normalize a street address, leaving the leading house number as-is."""
    if not isinstance(address, str):
        return address
    words = address.split()
    if not words:
        return address
    return ' '.join([words[0]] + [normalize(w) for w in words[1:]])


def mask_out_of_range(df):
    """Blank (NaN) negative donation amounts in place; rows are kept, not dropped."""
    df['donation'] = df['donation'].mask(df['donation'] < 0)
    return df


def drop_duplicate_donors(df):
    """Return *df* without repeat rows for the same donor name + address."""
    columns_to_check = ['first_name', 'last_name', 'street_address', 'city', 'state']
    return df.drop_duplicates(subset=columns_to_check, keep='first')


def drop_missing(df):
    """Return *df* without rows missing a state or a donation amount.

    This is also the step that removes rows whose out-of-range donation
    was previously blanked to NaN.
    """
    return df.dropna(subset=['state', 'donation'])


def main():
    """Run the cleaning steps and preview the fully cleaned frame."""
    data_frames = pd.read_csv(r'simulated_data.csv')
    convert_donations(data_frames)
    data_frames['street_address'] = data_frames['street_address'].apply(normalize_street)
    mask_out_of_range(data_frames)
    data_frames_no_missing = drop_missing(drop_duplicate_donors(data_frames))
    print(data_frames_no_missing.head(20))


if __name__ == '__main__':
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import pandas as pd

# Display settings (fully qualified keys; bare 'max_rows'/'max_columns'
# are ambiguous in pandas >= 1.4 and raise an OptionError).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)


def convert_donations(df):
    """Convert the 'donation' column from '$x.yz' strings to float64 in place."""
    # Remove the '$' sign so the strings parse as numbers.
    df['donation'] = df['donation'].str.strip('$')
    df['donation'] = df['donation'].astype('float64')
    return df


def normalize(word):
    """Expand common street abbreviations ('st', 'rd') and capitalize."""
    expansions = {'st': 'street', 'rd': 'road'}
    return expansions.get(word.lower(), word).capitalize()


def normalize_street(address):
    """Normalize a street address, leaving the leading house number as-is."""
    if not isinstance(address, str):
        return address
    words = address.split()
    if not words:
        return address
    return ' '.join([words[0]] + [normalize(w) for w in words[1:]])


def mask_out_of_range(df):
    """Blank (NaN) negative donation amounts in place; rows are kept, not dropped."""
    df['donation'] = df['donation'].mask(df['donation'] < 0)
    return df


def drop_duplicate_donors(df):
    """Return *df* without repeat rows for the same donor name + address."""
    columns_to_check = ['first_name', 'last_name', 'street_address', 'city', 'state']
    return df.drop_duplicates(subset=columns_to_check, keep='first')


def drop_missing(df):
    """Return *df* without rows missing a state or a donation amount.

    This also removes rows whose out-of-range donation was blanked to NaN.
    """
    return df.dropna(subset=['state', 'donation'])


def main():
    """Run the full cleaning pipeline and write the cleaned CSV."""
    data_frames = pd.read_csv(r'simulated_data.csv')
    convert_donations(data_frames)
    data_frames['street_address'] = data_frames['street_address'].apply(normalize_street)
    mask_out_of_range(data_frames)
    data_frames_no_missing = drop_missing(drop_duplicate_donors(data_frames))
    print(data_frames_no_missing.head(20))
    data_frames_no_missing.to_csv(r'clean_donations_data.csv', index=False)


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)