-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathplot.py
139 lines (127 loc) · 6.04 KB
/
plot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from imports import *
from prepare import prep_data, basic_clean, lemmatize, remove_stopwords, split, clean, clean_skills
def overall_distribution(train):
    '''
    Plot the overall candidate education distribution across all jobs as a pie chart.

    The slice sizes are the column sums of ``train.edu_bachelor``,
    ``train.edu_master``, ``train.edu_phd`` and ``train.edu_other``. The
    "Bachelor" slice is offset from the rest of the pie. The figure is shown
    immediately and nothing is returned.
    '''
    # Slice label -> total of the matching education column
    # (insertion order is the order the slices are drawn in).
    totals = {
        "Bachelor": train.edu_bachelor.sum(),
        "Master": train.edu_master.sum(),
        'PhD': train.edu_phd.sum(),
        'Other': train.edu_other.sum(),
    }

    plt.figure(figsize=(16, 12))
    plt.pie(
        list(totals.values()),
        labels=list(totals),
        explode=[0.2, 0, 0, 0],  # only the first (Bachelor) slice is pulled out
        colors=['#45818e', '#783f04', '#e7dbcd', '#5b5b5b'],
        textprops={"fontsize": 15},
        autopct='%.1f%%',
    )
    plt.legend()
    plt.title('Candidate Education Distribution', fontsize=18)
    plt.show()
def role(train):
    '''
    Plot the education distribution per role as a grouped bar chart.

    NOTE: the percentages are hard-coded in this function; the ``train``
    argument is accepted but not read.
    '''
    degrees = ('bachelor', 'master', 'phd', 'other')
    # Role -> percentage for each entry of `degrees`, in the same order.
    share_by_role = {
        'Data Scientist': (22.333333, 58.888889, 15.022222, 3.266667),
        'Data Analyst': (26.685714, 64.257143, 5.942857, 3.914286),
        'Data Engineer': (28.342105, 62.421053, 6.421053, 2.710526),
        'Managerial Roles': (13.882353, 59.294118, 22.117647, 4.529412),
    }
    # Flatten to one [role, degree, percentage] row per bar.
    rows = [
        [role_name, degree, share]
        for role_name, shares in share_by_role.items()
        for degree, share in zip(degrees, shares)
    ]
    roledf = pd.DataFrame(rows, columns=['Role', 'Highest Education Level', 'Percentage'])

    plt.figure(figsize=(20, 8))
    sns.barplot(
        data=roledf,
        x='Role',
        y='Percentage',
        hue='Highest Education Level',
        palette=['#45818e', '#783f04', '#e7dbcd', '#5b5b5b'],
    )
    plt.xlabel("Role", fontsize=16)
    plt.ylabel("Percentage", fontsize=16)
    plt.title("Education Distribution per Role", fontsize=20)
def level(train):
    '''
    Plot the education distribution per job level as a grouped count plot.

    Rows of ``train`` are counted per value of the ``level`` column, with a
    separate bar for each value of the ``label`` column within a level.
    '''
    bar_colors = ['#783f04', '#45818e']

    plt.figure(figsize=(16, 8))
    sns.countplot(data=train, x='level', hue='label', palette=bar_colors)
    plt.xlabel('Job Level', fontsize=16)
    plt.ylabel('Count', fontsize=16)
    plt.title('Education Distribution per Level', fontsize=20)
def requirements(train):
    '''
    Visualize the 20 most frequent words in the requirements as a stacked
    horizontal bar chart.

    The ``requirements`` texts are joined and passed through ``clean`` three
    times: for all rows, for rows with ``label == 'u'`` and for rows with
    ``label == 'g'``. Word frequencies are counted for each, the 20 words
    with the highest overall count are kept, and their 'u' and 'g' counts are
    plotted as stacked bars ordered by the 'g' count.

    For the detailed steps, see explore.ipynb.
    '''
    all_text = clean(' '.join(train.requirements))
    u_text = clean(' '.join(train[train.label == 'u'].requirements))
    g_text = clean(' '.join(train[train.label == 'g'].requirements))

    all_freq = pd.Series(all_text.split()).value_counts()
    u_freq = pd.Series(u_text.split()).value_counts()
    g_freq = pd.Series(g_text.split()).value_counts()

    # One row per word, one column per corpus; a word missing from a corpus
    # comes out of the concat as NaN, which is turned into a count of 0.
    word_counts = pd.concat([all_freq, u_freq, g_freq], sort=True, axis=1)
    word_counts.columns = ['all', 'u', 'g']
    word_counts = word_counts.fillna(0).astype(int)

    # Pick the 20 words with the highest overall count, then order the bars
    # by the 'g' count. Only the 'u' and 'g' columns are drawn, so no
    # proportion columns are computed.
    top_words = word_counts.sort_values(by='all').tail(20)
    top_words[['u', 'g']].sort_values(by='g').plot.barh(
        stacked=True,
        figsize=(16, 10),
        color=['#45818e', '#783f04'],
    )
def g_bigrams(train):
    '''
    Plot a word cloud of the top 20 bigrams in the requirements of the
    graduate degree category (rows with ``label == 'g'``).
    '''
    graduate_rows = train[train.label == 'g']
    words = clean(' '.join(graduate_rows.requirements)).split()

    # Count every pair of adjacent words and keep the 20 most common pairs.
    bigram_counts = pd.Series(nltk.ngrams(words, 2)).value_counts()
    top_20_g_bigrams = bigram_counts.head(20)

    # WordCloud wants string keys, so each (w1, w2) tuple becomes "w1 w2".
    frequencies = {}
    for pair, count in top_20_g_bigrams.to_dict().items():
        frequencies[' '.join(pair)] = count

    cloud = WordCloud(
        background_color='white',
        width=800,
        height=400,
        colormap='YlOrBr_r',
    )
    img = cloud.generate_from_frequencies(frequencies)

    plt.figure(figsize=(8, 4))
    plt.imshow(img)
    plt.axis('off')
    plt.show()
def u_bigrams(train):
    '''
    Plot a word cloud of the top 20 bigrams in the requirements of the
    undergraduate degree category (rows with ``label == 'u'``).

    The requirement texts of those rows are joined, passed through ``clean``
    and split into words; adjacent word pairs are counted and the 20 most
    frequent pairs are drawn, sized by their count. The figure is shown
    immediately and nothing is returned.
    '''
    u_text = ' '.join(train[train.label == 'u'].requirements)
    u_text = clean(u_text)
    top_20_u_bigrams = (pd.Series(nltk.ngrams(u_text.split(), 2))
                        .value_counts()
                        .head(20))
    # WordCloud wants string keys, so each (w1, w2) tuple becomes "w1 w2".
    data = {k[0] + ' ' + k[1]: v for k, v in top_20_u_bigrams.to_dict().items()}
    # Colouring comes from the 'Blues_r' colormap; no explicit colour list is needed.
    img = WordCloud(background_color='white', width=800, height=400,
                    colormap='Blues_r').generate_from_frequencies(data)
    plt.figure(figsize=(8, 4))
    plt.imshow(img)
    plt.axis('off')
    plt.show()
def g_skills(train):
    '''
    Plot a word cloud of the top 20 skill bigrams for the graduate degree
    category (rows with ``label == 'g'``).
    '''
    # Join the skills of all graduate rows into one string and clean it.
    graduate_rows = train[train.label == 'g']
    skills_text = clean_skills(' '.join(graduate_rows.skills))

    # Count every pair of adjacent words and keep the 20 most common pairs.
    pair_counts = pd.Series(nltk.ngrams(skills_text.split(), 2)).value_counts()
    top_20_g_skill_bigrams = pair_counts.head(20)

    # WordCloud wants string keys, so each (w1, w2) tuple becomes "w1 w2".
    frequencies = {}
    for pair, count in top_20_g_skill_bigrams.to_dict().items():
        frequencies[' '.join(pair)] = count

    cloud = WordCloud(
        background_color='white',
        width=800,
        height=400,
        colormap='YlOrBr_r',
    )
    img = cloud.generate_from_frequencies(frequencies)

    plt.figure(figsize=(8, 4))
    plt.imshow(img)
    plt.axis('off')
    plt.show()
def u_skills(train):
    '''
    Plot a word cloud of the top 20 skill bigrams for the undergraduate
    degree category (rows with ``label == 'u'``).
    '''
    # Join the skills of all undergraduate rows into one string and clean it.
    undergraduate_rows = train[train.label == 'u']
    skills_text = clean_skills(' '.join(undergraduate_rows.skills))

    # Count every pair of adjacent words and keep the 20 most common pairs.
    pair_counts = pd.Series(nltk.ngrams(skills_text.split(), 2)).value_counts()
    top_20_u_skill_bigrams = pair_counts.head(20)

    # WordCloud wants string keys, so each (w1, w2) tuple becomes "w1 w2".
    frequencies = {}
    for pair, count in top_20_u_skill_bigrams.to_dict().items():
        frequencies[' '.join(pair)] = count

    cloud = WordCloud(
        background_color='white',
        width=800,
        height=400,
        colormap='Blues_r',
    )
    img = cloud.generate_from_frequencies(frequencies)

    plt.figure(figsize=(8, 4))
    plt.imshow(img)
    plt.axis('off')
    plt.show()