-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSQL_DataCleaning_HousingNewYorkUnitsByBuild.sql
213 lines (163 loc) · 6.91 KB
/
SQL_DataCleaning_HousingNewYorkUnitsByBuild.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
--This project is done by myself
--The following data is found at the following link:
https://data.cityofnewyork.us/Housing-Development/Housing-New-York-Units-by-Building/hg8x-zxpr
USE PortfolioProject
GO
select *
From dbo.HousingNewYorkUnitsByBuild
order by ProjectID
--Standardize Date Format
ALTER TABLE dbo.HousingNewYorkUnitsByBuild
ALTER COLUMN [ProjectStartDate] date
ALTER TABLE dbo.HousingNewYorkUnitsByBuild
ALTER COLUMN [ProjectCompletionDate] date
ALTER TABLE dbo.HousingNewYorkUnitsByBuild
ALTER COLUMN [BuildingCompletionDate] date
--Check to see if there is need to change Y and N to Yes and No or the other way around in "Extended Affordability Only" field.
Select ExtendedAffordabilityOnly, count(ExtendedAffordabilityOnly)
From dbo.HousingNewYorkUnitsByBuild
Group by ExtendedAffordabilityOnly
--It turns out this column is fine because it only has YES and NO as output. But if this is not the case and I also see Y and N in the ouputs, I can do the following:
UPDATE HousingNewYorkUnitsByBuild
SET ExtendedAffordabilityOnly = CASE WHEN ExtendedAffordabilityOnly = 'Y' Then 'YES'
WHEN ExtendedAffordabilityOnly = 'N' Then 'NO'
ELSE ExtendedAffordabilityOnly
END
FROM dbo.HousingNewYorkUnitsByBuild
--REMOVE DUPLICATES
With RowNumCTE as(
Select*,
ROW_NUMBER() OVER (
PARTITION BY ProjectName,
ProjectStartDate,
ProjectCompletionDate,
Street,
Postcode
ORDER BY
ProjectID
) as Row_Num
FROM dbo.HousingNewYorkUnitsByBuild
)
DELETE
FROM RowNumCTE
WHERE Row_Num > 1
-- DELETE UNUSED COLUMNS
ALTER TABLE dbo.HousingNewYorkUnitsByBuild
DROP COLUMN Latitude, Longitude, [Latitude (Internal)], [Longitude (Internal)]
-- All the credit of the following Data Cleaning goes toward Alex The Analyst on Youtube, I learned from him.
-- Here is the link for the video if you want to see: https://www.youtube.com/watch?v=8rO7ztF4NtU&list=PLUaB-1hjhk8H48Pj32z4GZgGWyylqv85f&index=6
-- I have PRACTICED everything in this Data Cleaning Work.
/****** Script for SelectTopNRows command from SSMS ******/
SELECT TOP (1000) [UniqueID ]
,[ParcelID]
,[LandUse]
,[PropertyAddress]
,[SaleDate]
,[SalePrice]
,[LegalReference]
,[SoldAsVacant]
,[OwnerName]
,[OwnerAddress]
,[Acreage]
,[TaxDistrict]
,[LandValue]
,[BuildingValue]
,[TotalValue]
,[YearBuilt]
,[Bedrooms]
,[FullBath]
,[HalfBath]
FROM [PortfolioProject].[dbo].[NashvilleHousing]
Select *
From PortfolioProject.dbo.NashvilleHousing
--Standardize Date Format
Alter TABLE NashvilleHousing
ALTER COLUMN [SaleDate] date
-- Populate Property Address Data (There are NULL values)
Select a.ParcelID, a.PropertyAddress, b.ParcelID, b.PropertyAddress, ISNULL(a.PropertyAddress,b.PropertyAddress)
From [PortfolioProject].[dbo].[NashvilleHousing] as a
JOIN [PortfolioProject].[dbo].[NashvilleHousing] as b
on a.ParcelID = b.ParcelID
AND a.[UniqueID ] <> b.[UniqueID ]
Where a.PropertyAddress is Null
Update a
SET PropertyAddress = ISNULL(a.PropertyAddress,b.PropertyAddress) --I can also do ISNULL(a.PropertyAddress,NO ADDRESS) , what this means is that when a.PropertyAddress IS NULL, the NULL box will be populated by the word 'NO ADDRESS'.
From [PortfolioProject].[dbo].[NashvilleHousing] as a
JOIN [PortfolioProject].[dbo].[NashvilleHousing] as b
on a.ParcelID = b.ParcelID
AND a.[UniqueID ] <> b.[UniqueID ]
Where a.PropertyAddress is Null
--Breaking out Address into Individual Columns (Address, City, State)
Select PropertyAddress
From PortfolioProject.dbo.NashvilleHousing
select
substring(PropertyAddress, 1, CHARINDEX(',', PropertyAddress)-1) as Address --In the PropertyAddress column, starts from the first character, then, looking for the number of position for comma from Property Address. When you put -1, it moves 1 position to the left away from the comma.
, substring(PropertyAddress, CHARINDEX(',', PropertyAddress)+1, len(PropertyAddress)) as City --It's referring to the right side of the comma, and len(PropertyAddress) means length of the PropertyAddress, so it goes from 1 position from the right side of the comma to the end of the property address.
From PortfolioProject.dbo.NashvilleHousing
Alter Table NashvilleHousing
Add PropertySpiltAddress Nvarchar(255);
Update NashvilleHousing
SET PropertySpiltAddress = substring(PropertyAddress, 1, CHARINDEX(',', PropertyAddress)-1)
From PortfolioProject.dbo.NashvilleHousing
Alter Table NashvilleHousing
Add PropertySpiltCity Nvarchar(255);
Update NashvilleHousing
SET PropertySpiltCity = substring(PropertyAddress, CHARINDEX(',', PropertyAddress)+1, len(PropertyAddress))
From PortfolioProject.dbo.NashvilleHousing
Select
PARSENAME(REPLACE(OwnerAddress,',','.'),3) as OwnerSplitAddress -- PARSENAME is only useful for period, and NOT comma. So you need to replace these comma with period.
,PARSENAME(REPLACE(OwnerAddress,',','.'),2) as OwnerSplitCity
,PARSENAME(REPLACE(OwnerAddress,',','.'),1) as OwnderSplitState
From PortfolioProject.dbo.NashvilleHousing
ALTER TABLE NashvilleHousing
ADD OwnerSplitAddress Nvarchar(225);
UPDATE NashvilleHousing
SET OwnerSplitAddress = PARSENAME(REPLACE(OwnerAddress,',','.'),3)
From PortfolioProject.dbo.NashvilleHousing
ALTER TABLE NashvilleHousing
ADD OwnerSplitCity Nvarchar(225);
UPDATE NashvilleHousing
SET OwnerSplitCity = PARSENAME(REPLACE(OwnerAddress,',','.'),2)
From PortfolioProject.dbo.NashvilleHousing
ALTER TABLE NashvilleHousing
ADD OwnerSplitState Nvarchar(225);
UPDATE NashvilleHousing
SET OwnerSplitState = PARSENAME(REPLACE(OwnerAddress,',','.'),1)
FROM PortfolioProject.dbo.NashvilleHousing
--Change Y and N to Yes and No in "Sold as Vacant" field
Select Distinct(SoldAsVacant), count(SoldAsVacant) as SoldAsVacantCount
From PortfolioProject.dbo.NashvilleHousing
Group by SoldAsVacant
order by 2
Select SoldAsVacant
,CASE WHEN SoldAsVacant = 'Y' Then 'YES'
WHEN SoldAsVacant = 'N' Then 'No'
ELSE SoldAsVacant
End
FROM PortfolioProject.dbo.NashvilleHousing
UPDATE NashvilleHousing
SET SoldAsVacant = CASE WHEN SoldAsVacant = 'Y' Then 'YES'
WHEN SoldAsVacant = 'N' Then 'No'
ELSE SoldAsVacant
End
FROM PortfolioProject.dbo.NashvilleHousing
-- Remove Duplicates
-- We need to partition on things that are unique to each row. In this case, I'm just pretending the UniqueID column doesn't exist.
With RowNumCTE as(
Select *,
ROW_NUMBER() OVER (
PARTITION BY ParcelID,
SalePrice,
SaleDate,
LegalReference
ORDER BY
UniqueID
) as Row_Num
FROM PortfolioProject.dbo.NashvilleHousing
)
DELETE
From RowNumCTE
Where Row_Num > 1
-- DELETE UNUSED COLUMNS : I know it's not a standard practice to delete data, this is just to show I can do this if needed.
ALTER TABLE PortfolioProject.dbo.NashvilleHousing
DROP COLUMN OwnerAddress, TaxDistrict, PropertyAddress