-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNashville-Housing-Dataset
186 lines (150 loc) · 3.71 KB
/
Nashville-Housing-Dataset
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
SELECT
*
FROM
nashville_housing_data.nashville_housing;
-- STANDARDIZED DATE
SELECT
SaleDate, CONVERT( SaleDate , DATE) Date
FROM
nashville_housing;
UPDATE nashville_housing
SET
SaleDate = CONVERT( SaleDate , DATE);
-- POPULATE PROPERTY ADDRESS
UPDATE nashville_housing
SET
PropertyAddress = NULLIF(propertyAddress, '''');
SELECT
*
FROM
nashville_housing
ORDER BY ParcelID;
SELECT
a.ParcelID,
a.PropertyAddress,
a.ParcelID,
b.PropertyAddress,
IFNULL(a.PropertyAddress, b.PropertyAddress)
FROM
nashville_housing a
JOIN
nashville_housing b ON a.ParcelID = b.ParcelID
AND a.UniqueID <> b.UniqueID
WHERE
a.PropertyAddress IS NULL;
UPDATE nashville_housing a
JOIN
nashville_housing b ON a.ParcelID = b.ParcelID
AND a.UniqueID <> b.UniqueID
SET
a.PropertyAddress = IFNULL(a.PropertyAddress, b.PropertyAddress)
WHERE
a.PropertyAddress IS NULL;
-- BREAKING OUT ADDRESSES INTO INDIVIDUAL COLUMNS (ADDRESS, CITY, STATE)
SELECT
PropertyAddress
FROM
nashville_housing
ORDER BY ParcelID;
SELECT
SUBSTRING(PropertyAddress,
1,
POSITION(',' IN PropertyAddress) - 1) AS Address,
SUBSTRING(PropertyAddress,
POSITION(',' IN PropertyAddress) + 1) State
FROM
nashville_housing;
ALTER TABLE nashville_housing
ADD Address VARCHAR(255);
UPDATE nashville_housing
SET
Address = SUBSTRING(PropertyAddress,
1,
POSITION(',' IN PropertyAddress) - 1);
ALTER TABLE nashville_housing
ADD City VARCHAR(255);
UPDATE nashville_housing
SET
City = SUBSTRING(PropertyAddress,
POSITION(',' IN PropertyAddress) + 1);
UPDATE nashville_housing
SET
OwnerAddress = NULLIF(OwnerAddress, '''');
SELECT
OwnerAddress
FROM
nashville_housing;
SELECT
OwnerAddress,
SUBSTRING_INDEX(OwnerAddress, ',', 1) AS OwnerAA,
SUBSTRING_INDEX(SUBSTRING_INDEX(OwnerAddress, ',', 2),
',',
- 1),
SUBSTRING_INDEX(OwnerAddress, ',', - 1)
FROM
nashville_housing;
ALTER TABLE nashville_housing
ADD OwnerADress VARCHAR(255);
UPDATE nashville_housing
SET
OwnerADress = SUBSTRING_INDEX(OwnerAddress, ',', 1);
ALTER TABLE nashville_housing
ADD OwnerCity VARCHAR(255);
UPDATE nashville_housing
SET
OwnerCity = SUBSTRING_INDEX(SUBSTRING_INDEX(OwnerAddress, ',', 2),
',',
- 1);
ALTER TABLE nashville_housing
ADD OwnerState VARCHAR(255);
UPDATE nashville_housing
SET
OwnerState = SUBSTRING_INDEX(OwnerAddress, ',', - 1);
-- CHANGE Y AND N TO YES AND NO IN "SOLD AS VACANT FIELD"
SELECT DISTINCT
SoldAsVacant, COUNT(SoldAsVacant)
FROM
nashville_housing
GROUP BY SoldAsVacant;
SELECT
SoldAsVacant,
CASE
WHEN SoldAsVacant = 'Y' THEN 'Yes'
WHEN SoldAsVacant = 'N' THEN 'No'
WHEN SoldAsVacant = 'No' THEN 'No'
WHEN SoldAsVacant = 'Yes' THEN 'Yes'
ELSE 'SoldAsVacant'
END
FROM
nashville_housing;
UPDATE nashville_housing
SET
SoldAsVacant = CASE
WHEN SoldAsVacant = 'Y' THEN 'Yes'
WHEN SoldAsVacant = 'N' THEN 'No'
WHEN SoldAsVacant = 'No' THEN 'No'
WHEN SoldAsVacant = 'Yes' THEN 'Yes'
ELSE 'SoldAsVacant'
END;
-- REMOVE DUPLICATES
CREATE TEMPORARY TABLE Duplicate_values AS(
SELECT *, ROW_NUMBER() OVER(PARTITION BY
ParcelID, PropertyAddress,SaleDate,SalePrice,LegalReference ORDER BY UniqueID) RN
FROM nashville_housing
ORDER BY ParcelID);
DELETE FROM Duplicate_values
WHERE
RN > 1;
SELECT
*
FROM
Duplicate_values
WHERE
RN > 1;
-- DELETE UNUSED COLUMNS
SELECT
*
FROM
nashville_housing;
ALTER TABLE nashville_housing
DROP COLUMN PropertyAddress, DROP COLUMN OwnerAddress, DROP COLUMN TaxDistrict;