Commit f7ace84

Author: MarvinXN
Commit message: add files
1 parent b39322a commit f7ace84

File tree

123 files changed: 5491 lines added, 0 removed


misc/MLinPython/01/chapter01.txt

+1
@@ -0,0 +1 @@
Chapter 1 of Machine Learning in Python has no code associated with it.

+23
@@ -0,0 +1,23 @@
__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plot

target_url = ("http://archive.ics.uci.edu/ml/machine-"
              "learning-databases/abalone/abalone.data")
#read abalone data
abalone = pd.read_csv(target_url, header=None, prefix="V")
abalone.columns = ['Sex', 'Length', 'Diameter', 'Height',
                   'Whole weight', 'Shucked weight',
                   'Viscera weight', 'Shell weight', 'Rings']

#calculate correlation matrix
corMat = DataFrame(abalone.iloc[:, 1:9].corr())
#print correlation matrix
print(corMat)

#visualize correlations using heatmap
plot.pcolor(corMat)
plot.show()

misc/MLinPython/02/abaloneCorrHeat.py

+22
@@ -0,0 +1,22 @@
__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plot

target_url = ("http://archive.ics.uci.edu/ml/machine-"
              "learning-databases/abalone/abalone.data")
#read abalone data
abalone = pd.read_csv(target_url, header=None, prefix="V")
abalone.columns = ['Sex', 'Length', 'Diameter', 'Height',
                   'Whole weight', 'Shucked weight',
                   'Viscera weight', 'Shell weight', 'Rings']

#calculate correlation matrix
corMat = DataFrame(abalone.iloc[:, 1:9].corr())
#print correlation matrix
print(corMat)

#visualize correlations using heatmap
plot.pcolor(corMat)
plot.show()
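A labeled color scale makes the heatmap above easier to read. The lines below are a sketch, not part of the commit; they assume corMat and plot from abaloneCorrHeat.py, and the tick placement is an assumption about how pcolor lays out its cells.

#sketch (not in the original file): add a colorbar and attribute names to the heatmap
plot.pcolor(corMat)
plot.colorbar()                                              #scale for the correlation values
tickMarks = [i + 0.5 for i in range(len(corMat.columns))]    #center labels on each cell
plot.xticks(tickMarks, corMat.columns, rotation=90)
plot.yticks(tickMarks, corMat.columns)
plot.show()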

misc/MLinPython/02/abaloneCorrMat.txt

+19
@@ -0,0 +1,19 @@
                  Length  Diameter    Height  Whole Wt  Shucked Wt
Length          1.000000  0.986812  0.827554  0.925261    0.897914
Diameter        0.986812  1.000000  0.833684  0.925452    0.893162
Height          0.827554  0.833684  1.000000  0.819221    0.774972
Whole weight    0.925261  0.925452  0.819221  1.000000    0.969405
Shucked weight  0.897914  0.893162  0.774972  0.969405    1.000000
Viscera weight  0.903018  0.899724  0.798319  0.966375    0.931961
Shell weight    0.897706  0.905330  0.817338  0.955355    0.882617
Rings           0.556720  0.574660  0.557467  0.540390    0.420884

                Viscera weight  Shell weight     Rings
Length                0.903018      0.897706  0.556720
Diameter              0.899724      0.905330  0.574660
Height                0.798319      0.817338  0.557467
Whole weight          0.966375      0.955355  0.540390
Shucked weight        0.931961      0.882617  0.420884
Viscera weight        1.000000      0.907656  0.503819
Shell weight          0.907656      1.000000  0.627574
Rings                 0.503819      0.627574  1.000000

+44
@@ -0,0 +1,44 @@
__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plot
from math import exp
target_url = ("http://archive.ics.uci.edu/ml/machine-"
              "learning-databases/abalone/abalone.data")
#read abalone data
abalone = pd.read_csv(target_url, header=None, prefix="V")
abalone.columns = ['Sex', 'Length', 'Diameter', 'Height',
                   'Whole Wt', 'Shucked Wt',
                   'Viscera Wt', 'Shell Wt', 'Rings']
#get summary to use for scaling
summary = abalone.describe()
minRings = summary.iloc[3, 7]
maxRings = summary.iloc[7, 7]
nrows = len(abalone.index)

for i in range(nrows):
    #plot rows of data as if they were series data
    dataRow = abalone.iloc[i, 1:8]
    labelColor = (abalone.iloc[i, 8] - minRings) / (maxRings - minRings)
    dataRow.plot(color=plot.cm.RdYlBu(labelColor), alpha=0.5)

plot.xlabel("Attribute Index")
plot.ylabel("Attribute Values")
plot.show()

#renormalize using mean and standard deviation, then compress
# with logit function

meanRings = summary.iloc[1, 7]
sdRings = summary.iloc[2, 7]

for i in range(nrows):
    #plot rows of data as if they were series data
    dataRow = abalone.iloc[i, 1:8]
    normTarget = (abalone.iloc[i, 8] - meanRings) / sdRings
    labelColor = 1.0 / (1.0 + exp(-normTarget))
    dataRow.plot(color=plot.cm.RdYlBu(labelColor), alpha=0.5)

plot.xlabel("Attribute Index")
plot.ylabel("Attribute Values")
plot.show()

misc/MLinPython/02/abaloneSummary.py

+56
@@ -0,0 +1,56 @@
__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
from pylab import *
import matplotlib.pyplot as plot

target_url = ("http://archive.ics.uci.edu/ml/machine-"
              "learning-databases/abalone/abalone.data")
#read abalone data
abalone = pd.read_csv(target_url, header=None, prefix="V")
abalone.columns = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight',
                   'Shucked weight', 'Viscera weight', 'Shell weight',
                   'Rings']

print(abalone.head())
print(abalone.tail())

#print summary of data frame
summary = abalone.describe()
print(summary)

#box plot the real-valued attributes
#convert to array for plot routine
array = abalone.iloc[:, 1:9].values
boxplot(array)
plot.xlabel("Attribute Index")
plot.ylabel("Quartile Ranges")
show()

#the last column (rings) is out of scale with the rest
# - remove and replot
array2 = abalone.iloc[:, 1:8].values
boxplot(array2)
plot.xlabel("Attribute Index")
plot.ylabel("Quartile Ranges")
show()

#removing is okay but renormalizing the variables generalizes better.
#renormalize columns to zero mean and unit standard deviation
#this is a common normalization and desirable for other operations
# (like k-means clustering or k-nearest neighbors)
abaloneNormalized = abalone.iloc[:, 1:9]

for i in range(8):
    mean = summary.iloc[1, i]
    sd = summary.iloc[2, i]
    abaloneNormalized.iloc[:, i:(i + 1)] = (
        abaloneNormalized.iloc[:, i:(i + 1)] - mean) / sd

array3 = abaloneNormalized.values
boxplot(array3)
plot.xlabel("Attribute Index")
plot.ylabel("Quartile Ranges - Normalized")
show()
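The column-by-column loop in abaloneSummary.py can also be written with whole-frame arithmetic. The lines below are a sketch, not part of the commit; they assume the abalone frame and the pylab imports from the file above.

#sketch (not in the original file): vectorized zero-mean, unit-variance normalization
numericCols = abalone.iloc[:, 1:9]
abaloneNormalized = (numericCols - numericCols.mean()) / numericCols.std()
boxplot(abaloneNormalized.values)
plot.xlabel("Attribute Index")
plot.ylabel("Quartile Ranges - Normalized")
show()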
@@ -0,0 +1,45 @@
  Sex  Length  Diameter  Height  Whole wt  Shucked wt  Viscera wt
0   M   0.455     0.365   0.095    0.5140      0.2245      0.1010
1   M   0.350     0.265   0.090    0.2255      0.0995      0.0485
2   F   0.530     0.420   0.135    0.6770      0.2565      0.1415
3   M   0.440     0.365   0.125    0.5160      0.2155      0.1140
4   I   0.330     0.255   0.080    0.2050      0.0895      0.0395

   Shell weight  Rings
0         0.150     15
1         0.070      7
2         0.210      9
3         0.155     10
4         0.055      7

     Sex  Length  Diameter  Height  Whole weight  Shucked weight
4172   F   0.565     0.450   0.165        0.8870          0.3700
4173   M   0.590     0.440   0.135        0.9660          0.4390
4174   M   0.600     0.475   0.205        1.1760          0.5255
4175   F   0.625     0.485   0.150        1.0945          0.5310
4176   M   0.710     0.555   0.195        1.9485          0.9455

      Viscera weight  Shell weight  Rings
4172          0.2390        0.2490     11
4173          0.2145        0.2605     10
4174          0.2875        0.3080      9
4175          0.2610        0.2960     10
4176          0.3765        0.4950     12

            Length     Diameter       Height     Whole wt   Shucked wt
count  4177.000000  4177.000000  4177.000000  4177.000000  4177.000000
mean      0.523992     0.407881     0.139516     0.828742     0.359367
std       0.120093     0.099240     0.041827     0.490389     0.221963
min       0.075000     0.055000     0.000000     0.002000     0.001000
25%       0.450000     0.350000     0.115000     0.441500     0.186000
50%       0.545000     0.425000     0.140000     0.799500     0.336000
75%       0.615000     0.480000     0.165000     1.153000     0.502000
max       0.815000     0.650000     1.130000     2.825500     1.488000

       Viscera weight  Shell weight        Rings
count     4177.000000   4177.000000  4177.000000
mean         0.180594      0.238831     9.933684
std          0.109614      0.139203     3.224169
min          0.000500      0.001500     1.000000
25%          0.093500      0.130000     8.000000
50%          0.171000      0.234000     9.000000
75%          0.253000      0.329000    11.000000
max          0.760000      1.005000    29.000000

misc/MLinPython/02/chapter02.zip

24 KB
Binary file not shown.

misc/MLinPython/02/corrCalc.py

+51
@@ -0,0 +1,51 @@
__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
from math import sqrt
import sys
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/sonar.all-data")

#read rocks versus mines data into pandas data frame
rocksVMines = pd.read_csv(target_url, header=None, prefix="V")

#calculate correlations between real-valued attributes
dataRow2 = rocksVMines.iloc[1, 0:60]
dataRow3 = rocksVMines.iloc[2, 0:60]
dataRow21 = rocksVMines.iloc[20, 0:60]

mean2 = 0.0; mean3 = 0.0; mean21 = 0.0
numElt = len(dataRow2)
for i in range(numElt):
    mean2 += dataRow2[i]/numElt
    mean3 += dataRow3[i]/numElt
    mean21 += dataRow21[i]/numElt

var2 = 0.0; var3 = 0.0; var21 = 0.0
for i in range(numElt):
    var2 += (dataRow2[i] - mean2) * (dataRow2[i] - mean2)/numElt
    var3 += (dataRow3[i] - mean3) * (dataRow3[i] - mean3)/numElt
    var21 += (dataRow21[i] - mean21) * (dataRow21[i] - mean21)/numElt

corr23 = 0.0; corr221 = 0.0
for i in range(numElt):
    corr23 += (dataRow2[i] - mean2) * \
              (dataRow3[i] - mean3) / (sqrt(var2*var3) * numElt)
    corr221 += (dataRow2[i] - mean2) * \
               (dataRow21[i] - mean21) / (sqrt(var2*var21) * numElt)

sys.stdout.write("Correlation between attribute 2 and 3 \n")
print(corr23)
sys.stdout.write(" \n")

sys.stdout.write("Correlation between attribute 2 and 21 \n")
print(corr221)
sys.stdout.write(" \n")


# Output:
# Correlation between attribute 2 and 3
# 0.770938121191
#
# Correlation between attribute 2 and 21
# 0.466548080789
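As a cross-check on the loop-based calculation, numpy computes the same Pearson correlations directly. The lines below are a sketch, not part of the commit, and assume the dataRow variables from corrCalc.py above.

#sketch (not in the original file): verify the hand-rolled correlations with numpy
import numpy as np
print(np.corrcoef(dataRow2, dataRow3)[0, 1])     #should match corr23
print(np.corrcoef(dataRow2, dataRow21)[0, 1])    #should match corr221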

misc/MLinPython/02/corrPlot.py

+29
@@ -0,0 +1,29 @@
__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plot
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/sonar.all-data")

#read rocks versus mines data into pandas data frame
rocksVMines = pd.read_csv(target_url, header=None, prefix="V")

#calculate correlations between real-valued attributes
dataRow2 = rocksVMines.iloc[1, 0:60]
dataRow3 = rocksVMines.iloc[2, 0:60]

plot.scatter(dataRow2, dataRow3)

plot.xlabel("2nd Attribute")
plot.ylabel("3rd Attribute")
plot.show()

dataRow21 = rocksVMines.iloc[20, 0:60]

plot.scatter(dataRow2, dataRow21)

plot.xlabel("2nd Attribute")
plot.ylabel("21st Attribute")
plot.show()

+20
@@ -0,0 +1,20 @@
__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
from pylab import *
import matplotlib.pyplot as plot
from math import exp

target_url = ("https://archive.ics.uci.edu/ml/machine-"
              "learning-databases/glass/glass.data")
glass = pd.read_csv(target_url, header=None, prefix="V")
glass.columns = ['Id', 'RI', 'Na', 'Mg', 'Al', 'Si',
                 'K', 'Ca', 'Ba', 'Fe', 'Type']
ncols = len(glass.columns)

#calculate correlation matrix
corMat = DataFrame(glass.iloc[:, 1:(ncols - 1)].corr())

#visualize correlations using heatmap
plot.pcolor(corMat)
plot.show()

+36
@@ -0,0 +1,36 @@
__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
from pylab import *
import matplotlib.pyplot as plot

target_url = ("https://archive.ics.uci.edu/ml/machine-"
              "learning-databases/glass/glass.data")
glass = pd.read_csv(target_url, header=None, prefix="V")
glass.columns = ['Id', 'RI', 'Na', 'Mg', 'Al', 'Si',
                 'K', 'Ca', 'Ba', 'Fe', 'Type']

glassNormalized = glass
ncols = len(glassNormalized.columns)
nrows = len(glassNormalized.index)
summary = glassNormalized.describe()
nDataCol = ncols - 1

#normalize except for labels
for i in range(ncols - 1):
    mean = summary.iloc[1, i]
    sd = summary.iloc[2, i]
    glassNormalized.iloc[:, i:(i + 1)] = \
        (glassNormalized.iloc[:, i:(i + 1)] - mean) / sd

#Plot Parallel Coordinate Graph with normalized values
for i in range(nrows):
    #plot rows of data as if they were series data
    dataRow = glassNormalized.iloc[i, 1:nDataCol]
    labelColor = glassNormalized.iloc[i, nDataCol]/7.0
    dataRow.plot(color=plot.cm.RdYlBu(labelColor), alpha=0.5)

plot.xlabel("Attribute Index")
plot.ylabel("Attribute Values")
plot.show()
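pandas also ships a parallel-coordinates helper that draws a similar plot without the explicit row loop. The lines below are a sketch, not part of the commit; they assume the glassNormalized frame from the file above and a pandas version where the helper lives in pandas.plotting, and the choice of columns passed to it is an assumption.

#sketch (not in the original file): same idea via pandas' built-in helper
from pandas.plotting import parallel_coordinates
plotCols = glassNormalized.columns[1:]    #drop Id, keep Type as the class column
parallel_coordinates(glassNormalized[plotCols], 'Type',
                     colormap=plot.cm.RdYlBu, alpha=0.5)
plot.show()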

misc/MLinPython/02/glassSummary.py

+35
@@ -0,0 +1,35 @@
__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
from pylab import *
import matplotlib.pyplot as plot

target_url = ("https://archive.ics.uci.edu/ml/machine-"
              "learning-databases/glass/glass.data")

glass = pd.read_csv(target_url, header=None, prefix="V")
glass.columns = ['Id', 'RI', 'Na', 'Mg', 'Al', 'Si',
                 'K', 'Ca', 'Ba', 'Fe', 'Type']

print(glass.head())

#generate statistical summaries
summary = glass.describe()
print(summary)
ncol1 = len(glass.columns)

glassNormalized = glass.iloc[:, 1:ncol1]
ncol2 = len(glassNormalized.columns)
summary2 = glassNormalized.describe()

for i in range(ncol2):
    mean = summary2.iloc[1, i]
    sd = summary2.iloc[2, i]
    glassNormalized.iloc[:, i:(i + 1)] = \
        (glassNormalized.iloc[:, i:(i + 1)] - mean) / sd

array = glassNormalized.values
boxplot(array)
plot.xlabel("Attribute Index")
plot.ylabel("Quartile Ranges - Normalized")
show()
