forked from Smartitect/dataprep
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnotebook02_01_manipulatePeople.py
184 lines (144 loc) · 6.14 KB
/
notebook02_01_manipulatePeople.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#%% [markdown]
#---
# # Stage 2 : Manipulate PEOPLE
# Let's pick up with the **PEOPLE** package we prepared earlier in stage 1.
# ## Common
# So this is where we are trying to do all the common stuff to ingest all of the files. Key is recognition that there are common patterns we can exploit across the files.
# NOTE - still to figure out how to do this from a single file and import it successfully.
#%%
# Import all of the libraries we need to use...
import pandas as pd
import azureml.dataprep as dprep
import seaborn as sns
import os as os
import re as re
import collections
from azureml.dataprep import value
from azureml.dataprep import col
from azureml.dataprep import Package
# Let's also set up global variables and common functions...
# Path to the source data
dataPath = "./data"
# Path to the location where the dataprep packags that are created
packagePath = "./packages"
# Name of package file
packageFileSuffix = "_package.dprep"
# A helper function to create full package path
def createFullPackagePath(packageName, stage, qualityFlag):
return packagePath + '/' + packageName + '_' + stage + '_' + qualityFlag + packageFileSuffix
# A save package helper function
def savePackage(dataFlowToPackage, packageName, stage, qualityFlag):
dataFlowToPackage = dataFlowToPackage.set_name(packageName)
packageToSave = dprep.Package(dataFlowToPackage)
fullPackagePath = createFullPackagePath(packageName, stage, qualityFlag)
packageToSave = packageToSave.save(fullPackagePath)
return fullPackagePath
# An open package helper function
def openPackage(packageName, stage, qualityFlag):
fullPackagePath = createFullPackagePath(packageName, stage, qualityFlag)
packageToOpen = Package.open(fullPackagePath)
dataFlow = packageToOpen[packageName]
return dataFlow
#%% [markdown]
# Load the A class PEOPLE data from stage 1 and inspect the top 100 rows:
#%%
peopleDataFlow = openPackage('PEOPLE', '1', 'A')
peopleDataFlow.head(100)
#%% [markdown]
# ## ADDRESS
# There is a need to replace the CR/LF `\0d0a` string with a comma in the address, and replace empty values with an empty string:
#%%
peopleDataFlow = peopleDataFlow.str_replace('ADDRESS', r'\0d0a', ', ')
peopleDataFlow = peopleDataFlow.replace('ADDRESS', r', , , ', None)
peopleDataFlow.head(5)
#%%
peopleDataFlow.get_profile()
#%% [markdown]
# Inspect the progile -so good things to see:
# - No errors in any of the date columns having applied this type to the column
# - One missing value for DOB : date of birth
# - Lots of missing values for DATEDIED which is intuitive
# - Only 203 people with married date, doesn't seem right : MARRDATE
#
### Data Types
# A few datatypes that haven't been picked up automtically
#%%
peopleDataFlow = peopleDataFlow.to_datetime(columns = ['MINRETIREMENTDATE'], date_time_formats = ['%d/%m/%Y'])
peopleDataFlow = peopleDataFlow.to_long(['ID'])
#%% [markdown]
# ## SEX
# Find distinct values:
#%%
peopleDataFlow.get_profile().columns['SEX'].value_counts
#%% [markdown]
# Looks good, apart from one record, all either an M or an F, no anomolies or missing values.
# Let's quarantine the row which is missing this field:
#%%
quarantinedPeopleDataFlow = peopleDataFlow.filter(peopleDataFlow['SEX'] == None)
#%% [markdown]
# ## TITLE
# Find distinct values:
#%%
peopleDataFlow.get_profile().columns['TITLE'].value_counts
#%% [markdown]
# Here we can see that there are a number of different ways of representing the likes of "Mrs", "Mrs.", "mrs"
# Lets see what fuzzy grouping can do:
#%%
builder = peopleDataFlow.builders.fuzzy_group_column(source_column='TITLE',
new_column_name='TITLE_grouped',
similarity_threshold=0.8,
similarity_score_column_name='TITLE_grouped_score')
builder.learn()
builder.groups
#%% [markdown]
# First pass with a threashold of 0.8 doesn't do so well, so let's try with a threshold of 0.9:
#%%
builder.similarity_threshold = 0.9
builder.learn()
builder.groups
#%% [markdown]
# So this cleans up the instances of "Mrs", "Mrs." and "mrs" + "Rev" and "REV"
#%%
peopleDataFlow = builder.to_dataflow()
peopleDataFlow.get_profile().columns['TITLE_grouped'].value_counts
#%% [markdown]
# Tidy up the loose ends now
#%%
peopleDataFlow = peopleDataFlow.replace('TITLE_grouped', 'Sist', 'Sister')
peopleDataFlow = peopleDataFlow.replace('TITLE_grouped', 'Sis', 'Sister')
peopleDataFlow = peopleDataFlow.replace('TITLE_grouped', 'M', None)
peopleDataFlow.get_profile().columns['TITLE_grouped'].value_counts
#%% [markdown]
# ## MSTA - Marital Status
# Find distinct values:
#%%
peopleDataFlow.get_profile().columns['MSTA'].value_counts
#%% [markdown]
# Here we can see that there are a number of different ways of representing the likes of "Mrs", "Mrs.", "mrs"
# Let's see what fuzzy grouping can do:
#%%
builder = peopleDataFlow.builders.fuzzy_group_column(source_column='MSTA',
new_column_name='MSTA_grouped',
similarity_threshold=0.1,
similarity_score_column_name='MSTA_grouped_score')
builder.learn()
builder.groups
#%% [markdown]
# So this cleans up the instances of "Serarated" and "Seperated", but despite placing the threashold as low as 0.1 it has not picked up the other example such as those around "Widowed" or "Civil Partner"
#%%
peopleDataFlow = builder.to_dataflow()
peopleDataFlow.get_profile().columns['MSTA_grouped'].value_counts
#%%
peopleDataFlow = peopleDataFlow.replace('MSTA_grouped', 'Unknown', None)
peopleDataFlow = peopleDataFlow.replace('MSTA_grouped', 'Widow/Wido', 'Widowed')
peopleDataFlow = peopleDataFlow.replace('MSTA_grouped', '0', None)
peopleDataFlow = peopleDataFlow.replace('MSTA_grouped', 'Civil Part', 'Civil Partner')
peopleDataFlow = peopleDataFlow.replace('MSTA_grouped', '1', None)
#%%
peopleDataFlow.get_profile().columns['MSTA_grouped'].value_counts
#%% [markdown]
# ## Save PEOPLE data
# Finally we'll save away what we've created so that it can be picked up later on in the process.
#%%
fullPackagePath = savePackage(peopleDataFlow, 'PEOPLE', '2', 'A')
print('Saved package to file {0}'.format(fullPackagePath))