Commit f7ace84

Author: MarvinXN
Commit message: add files
1 parent b39322a commit f7ace84

File tree

123 files changed: 5491 lines added, 0 removed


misc/MLinPython/01/chapter01.txt

+1
@@ -0,0 +1 @@
Chapter 1 of Machine Learning in Python has no code associated with it.

+23
@@ -0,0 +1,23 @@
__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plot

target_url = ("http://archive.ics.uci.edu/ml/machine-"
              "learning-databases/abalone/abalone.data")
#read abalone data
abalone = pd.read_csv(target_url, header=None, prefix="V")
abalone.columns = ['Sex', 'Length', 'Diameter', 'Height',
                   'Whole weight', 'Shucked weight',
                   'Viscera weight', 'Shell weight', 'Rings']

#calculate correlation matrix
corMat = DataFrame(abalone.iloc[:, 1:9].corr())
#print correlation matrix
print(corMat)

#visualize correlations using heatmap
plot.pcolor(corMat)
plot.show()

misc/MLinPython/02/abaloneCorrHeat.py

+22
@@ -0,0 +1,22 @@
__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plot

target_url = ("http://archive.ics.uci.edu/ml/machine-"
              "learning-databases/abalone/abalone.data")
#read abalone data
abalone = pd.read_csv(target_url, header=None, prefix="V")
abalone.columns = ['Sex', 'Length', 'Diameter', 'Height',
                   'Whole weight', 'Shucked weight',
                   'Viscera weight', 'Shell weight', 'Rings']

#calculate correlation matrix
corMat = DataFrame(abalone.iloc[:, 1:9].corr())
#print correlation matrix
print(corMat)

#visualize correlations using heatmap
plot.pcolor(corMat)
plot.show()
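A labeled color scale makes the heatmap above easier to read. The lines below are a sketch, not part of the commit; they assume corMat and plot from abaloneCorrHeat.py, and the tick placement is an assumption about how pcolor lays out its cells.

#sketch (not in the original file): add a colorbar and attribute names to the heatmap
plot.pcolor(corMat)
plot.colorbar()                                              #scale for the correlation values
tickMarks = [i + 0.5 for i in range(len(corMat.columns))]    #center labels on each cell
plot.xticks(tickMarks, corMat.columns, rotation=90)
plot.yticks(tickMarks, corMat.columns)
plot.show()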

misc/MLinPython/02/abaloneCorrMat.txt

+19
@@ -0,0 +1,19 @@
                  Length  Diameter    Height  Whole Wt  Shucked Wt
Length          1.000000  0.986812  0.827554  0.925261    0.897914
Diameter        0.986812  1.000000  0.833684  0.925452    0.893162
Height          0.827554  0.833684  1.000000  0.819221    0.774972
Whole weight    0.925261  0.925452  0.819221  1.000000    0.969405
Shucked weight  0.897914  0.893162  0.774972  0.969405    1.000000
Viscera weight  0.903018  0.899724  0.798319  0.966375    0.931961
Shell weight    0.897706  0.905330  0.817338  0.955355    0.882617
Rings           0.556720  0.574660  0.557467  0.540390    0.420884

                Viscera weight  Shell weight     Rings
Length                0.903018      0.897706  0.556720
Diameter              0.899724      0.905330  0.574660
Height                0.798319      0.817338  0.557467
Whole weight          0.966375      0.955355  0.540390
Shucked weight        0.931961      0.882617  0.420884
Viscera weight        1.000000      0.907656  0.503819
Shell weight          0.907656      1.000000  0.627574
Rings                 0.503819      0.627574  1.000000

+44
@@ -0,0 +1,44 @@
__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plot
from math import exp
target_url = ("http://archive.ics.uci.edu/ml/machine-"
              "learning-databases/abalone/abalone.data")
#read abalone data
abalone = pd.read_csv(target_url, header=None, prefix="V")
abalone.columns = ['Sex', 'Length', 'Diameter', 'Height',
                   'Whole Wt', 'Shucked Wt',
                   'Viscera Wt', 'Shell Wt', 'Rings']
#get summary to use for scaling
summary = abalone.describe()
minRings = summary.iloc[3, 7]
maxRings = summary.iloc[7, 7]
nrows = len(abalone.index)

for i in range(nrows):
    #plot rows of data as if they were series data
    dataRow = abalone.iloc[i, 1:8]
    labelColor = (abalone.iloc[i, 8] - minRings) / (maxRings - minRings)
    dataRow.plot(color=plot.cm.RdYlBu(labelColor), alpha=0.5)

plot.xlabel("Attribute Index")
plot.ylabel("Attribute Values")
plot.show()

#renormalize using mean and standard deviation, then compress
# with logit function

meanRings = summary.iloc[1, 7]
sdRings = summary.iloc[2, 7]

for i in range(nrows):
    #plot rows of data as if they were series data
    dataRow = abalone.iloc[i, 1:8]
    normTarget = (abalone.iloc[i, 8] - meanRings) / sdRings
    labelColor = 1.0 / (1.0 + exp(-normTarget))
    dataRow.plot(color=plot.cm.RdYlBu(labelColor), alpha=0.5)

plot.xlabel("Attribute Index")
plot.ylabel("Attribute Values")
plot.show()

misc/MLinPython/02/abaloneSummary.py

+56
@@ -0,0 +1,56 @@
__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
from pylab import *
import matplotlib.pyplot as plot

target_url = ("http://archive.ics.uci.edu/ml/machine-"
              "learning-databases/abalone/abalone.data")
#read abalone data
abalone = pd.read_csv(target_url, header=None, prefix="V")
abalone.columns = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight',
                   'Shucked weight', 'Viscera weight', 'Shell weight',
                   'Rings']

print(abalone.head())
print(abalone.tail())

#print summary of data frame
summary = abalone.describe()
print(summary)

#box plot the real-valued attributes
#convert to array for plot routine
array = abalone.iloc[:, 1:9].values
boxplot(array)
plot.xlabel("Attribute Index")
plot.ylabel("Quartile Ranges")
show()

#the last column (rings) is out of scale with the rest
# - remove and replot
array2 = abalone.iloc[:, 1:8].values
boxplot(array2)
plot.xlabel("Attribute Index")
plot.ylabel("Quartile Ranges")
show()

#removing is okay but renormalizing the variables generalizes better.
#renormalize columns to zero mean and unit standard deviation
#this is a common normalization and desirable for other operations
# (like k-means clustering or k-nearest neighbors)
abaloneNormalized = abalone.iloc[:, 1:9]

for i in range(8):
    mean = summary.iloc[1, i]
    sd = summary.iloc[2, i]
    abaloneNormalized.iloc[:, i:(i + 1)] = (
        abaloneNormalized.iloc[:, i:(i + 1)] - mean) / sd

array3 = abaloneNormalized.values
boxplot(array3)
plot.xlabel("Attribute Index")
plot.ylabel("Quartile Ranges - Normalized")
show()
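The column-by-column loop in abaloneSummary.py can also be written with whole-frame arithmetic. The lines below are a sketch, not part of the commit; they assume the abalone frame and the pylab imports from the file above.

#sketch (not in the original file): vectorized zero-mean, unit-variance normalization
numericCols = abalone.iloc[:, 1:9]
abaloneNormalized = (numericCols - numericCols.mean()) / numericCols.std()
boxplot(abaloneNormalized.values)
plot.xlabel("Attribute Index")
plot.ylabel("Quartile Ranges - Normalized")
show()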
@@ -0,0 +1,45 @@
  Sex  Length  Diameter  Height  Whole wt  Shucked wt  Viscera wt
0   M   0.455     0.365   0.095    0.5140      0.2245      0.1010
1   M   0.350     0.265   0.090    0.2255      0.0995      0.0485
2   F   0.530     0.420   0.135    0.6770      0.2565      0.1415
3   M   0.440     0.365   0.125    0.5160      0.2155      0.1140
4   I   0.330     0.255   0.080    0.2050      0.0895      0.0395

   Shell weight  Rings
0         0.150     15
1         0.070      7
2         0.210      9
3         0.155     10
4         0.055      7

     Sex  Length  Diameter  Height  Whole weight  Shucked weight
4172   F   0.565     0.450   0.165        0.8870          0.3700
4173   M   0.590     0.440   0.135        0.9660          0.4390
4174   M   0.600     0.475   0.205        1.1760          0.5255
4175   F   0.625     0.485   0.150        1.0945          0.5310
4176   M   0.710     0.555   0.195        1.9485          0.9455

      Viscera weight  Shell weight  Rings
4172          0.2390        0.2490     11
4173          0.2145        0.2605     10
4174          0.2875        0.3080      9
4175          0.2610        0.2960     10
4176          0.3765        0.4950     12

            Length     Diameter       Height     Whole wt   Shucked wt
count  4177.000000  4177.000000  4177.000000  4177.000000  4177.000000
mean      0.523992     0.407881     0.139516     0.828742     0.359367
std       0.120093     0.099240     0.041827     0.490389     0.221963
min       0.075000     0.055000     0.000000     0.002000     0.001000
25%       0.450000     0.350000     0.115000     0.441500     0.186000
50%       0.545000     0.425000     0.140000     0.799500     0.336000
75%       0.615000     0.480000     0.165000     1.153000     0.502000
max       0.815000     0.650000     1.130000     2.825500     1.488000

       Viscera weight  Shell weight        Rings
count     4177.000000   4177.000000  4177.000000
mean         0.180594      0.238831     9.933684
std          0.109614      0.139203     3.224169
min          0.000500      0.001500     1.000000
25%          0.093500      0.130000     8.000000
50%          0.171000      0.234000     9.000000
75%          0.253000      0.329000    11.000000
max          0.760000      1.005000    29.000000

misc/MLinPython/02/chapter02.zip

24 KB
Binary file not shown.

misc/MLinPython/02/corrCalc.py

+51
@@ -0,0 +1,51 @@
__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
from math import sqrt
import sys
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/sonar.all-data")

#read rocks versus mines data into pandas data frame
rocksVMines = pd.read_csv(target_url, header=None, prefix="V")

#calculate correlations between real-valued attributes
dataRow2 = rocksVMines.iloc[1, 0:60]
dataRow3 = rocksVMines.iloc[2, 0:60]
dataRow21 = rocksVMines.iloc[20, 0:60]

mean2 = 0.0; mean3 = 0.0; mean21 = 0.0
numElt = len(dataRow2)
for i in range(numElt):
    mean2 += dataRow2[i]/numElt
    mean3 += dataRow3[i]/numElt
    mean21 += dataRow21[i]/numElt

var2 = 0.0; var3 = 0.0; var21 = 0.0
for i in range(numElt):
    var2 += (dataRow2[i] - mean2) * (dataRow2[i] - mean2)/numElt
    var3 += (dataRow3[i] - mean3) * (dataRow3[i] - mean3)/numElt
    var21 += (dataRow21[i] - mean21) * (dataRow21[i] - mean21)/numElt

corr23 = 0.0; corr221 = 0.0
for i in range(numElt):
    corr23 += (dataRow2[i] - mean2) * \
              (dataRow3[i] - mean3) / (sqrt(var2*var3) * numElt)
    corr221 += (dataRow2[i] - mean2) * \
               (dataRow21[i] - mean21) / (sqrt(var2*var21) * numElt)

sys.stdout.write("Correlation between attribute 2 and 3 \n")
print(corr23)
sys.stdout.write(" \n")

sys.stdout.write("Correlation between attribute 2 and 21 \n")
print(corr221)
sys.stdout.write(" \n")


# Output:
# Correlation between attribute 2 and 3
# 0.770938121191
#
# Correlation between attribute 2 and 21
# 0.466548080789
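As a cross-check on the loop-based calculation, numpy computes the same Pearson correlations directly. The lines below are a sketch, not part of the commit, and assume the dataRow variables from corrCalc.py above.

#sketch (not in the original file): verify the hand-rolled correlations with numpy
import numpy as np
print(np.corrcoef(dataRow2, dataRow3)[0, 1])     #should match corr23
print(np.corrcoef(dataRow2, dataRow21)[0, 1])    #should match corr221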

misc/MLinPython/02/corrPlot.py

+29
@@ -0,0 +1,29 @@
__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plot
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/sonar.all-data")

#read rocks versus mines data into pandas data frame
rocksVMines = pd.read_csv(target_url, header=None, prefix="V")

#calculate correlations between real-valued attributes
dataRow2 = rocksVMines.iloc[1, 0:60]
dataRow3 = rocksVMines.iloc[2, 0:60]

plot.scatter(dataRow2, dataRow3)

plot.xlabel("2nd Attribute")
plot.ylabel("3rd Attribute")
plot.show()

dataRow21 = rocksVMines.iloc[20, 0:60]

plot.scatter(dataRow2, dataRow21)

plot.xlabel("2nd Attribute")
plot.ylabel("21st Attribute")
plot.show()

+20
@@ -0,0 +1,20 @@
__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
from pylab import *
import matplotlib.pyplot as plot
from math import exp

target_url = ("https://archive.ics.uci.edu/ml/machine-"
              "learning-databases/glass/glass.data")
glass = pd.read_csv(target_url, header=None, prefix="V")
glass.columns = ['Id', 'RI', 'Na', 'Mg', 'Al', 'Si',
                 'K', 'Ca', 'Ba', 'Fe', 'Type']
ncols = len(glass.columns)

#calculate correlation matrix
corMat = DataFrame(glass.iloc[:, 1:(ncols - 1)].corr())

#visualize correlations using heatmap
plot.pcolor(corMat)
plot.show()

+36
@@ -0,0 +1,36 @@
__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
from pylab import *
import matplotlib.pyplot as plot

target_url = ("https://archive.ics.uci.edu/ml/machine-"
              "learning-databases/glass/glass.data")
glass = pd.read_csv(target_url, header=None, prefix="V")
glass.columns = ['Id', 'RI', 'Na', 'Mg', 'Al', 'Si',
                 'K', 'Ca', 'Ba', 'Fe', 'Type']

glassNormalized = glass
ncols = len(glassNormalized.columns)
nrows = len(glassNormalized.index)
summary = glassNormalized.describe()
nDataCol = ncols - 1

#normalize except for labels
for i in range(ncols - 1):
    mean = summary.iloc[1, i]
    sd = summary.iloc[2, i]
    glassNormalized.iloc[:, i:(i + 1)] = \
        (glassNormalized.iloc[:, i:(i + 1)] - mean) / sd

#Plot Parallel Coordinate Graph with normalized values
for i in range(nrows):
    #plot rows of data as if they were series data
    dataRow = glassNormalized.iloc[i, 1:nDataCol]
    labelColor = glassNormalized.iloc[i, nDataCol]/7.0
    dataRow.plot(color=plot.cm.RdYlBu(labelColor), alpha=0.5)

plot.xlabel("Attribute Index")
plot.ylabel("Attribute Values")
plot.show()
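pandas also ships a parallel-coordinates helper that draws a similar plot without the explicit row loop. The lines below are a sketch, not part of the commit; they assume the glassNormalized frame from the file above and a pandas version where the helper lives in pandas.plotting, and the choice of columns passed to it is an assumption.

#sketch (not in the original file): same idea via pandas' built-in helper
from pandas.plotting import parallel_coordinates
plotCols = glassNormalized.columns[1:]    #drop Id, keep Type as the class column
parallel_coordinates(glassNormalized[plotCols], 'Type',
                     colormap=plot.cm.RdYlBu, alpha=0.5)
plot.show()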

misc/MLinPython/02/glassSummary.py

+35
@@ -0,0 +1,35 @@
__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
from pylab import *
import matplotlib.pyplot as plot

target_url = ("https://archive.ics.uci.edu/ml/machine-"
              "learning-databases/glass/glass.data")

glass = pd.read_csv(target_url, header=None, prefix="V")
glass.columns = ['Id', 'RI', 'Na', 'Mg', 'Al', 'Si',
                 'K', 'Ca', 'Ba', 'Fe', 'Type']

print(glass.head())

#generate statistical summaries
summary = glass.describe()
print(summary)
ncol1 = len(glass.columns)

glassNormalized = glass.iloc[:, 1:ncol1]
ncol2 = len(glassNormalized.columns)
summary2 = glassNormalized.describe()

for i in range(ncol2):
    mean = summary2.iloc[1, i]
    sd = summary2.iloc[2, i]
    glassNormalized.iloc[:, i:(i + 1)] = \
        (glassNormalized.iloc[:, i:(i + 1)] - mean) / sd

array = glassNormalized.values
boxplot(array)
plot.xlabel("Attribute Index")
plot.ylabel("Quartile Ranges - Normalized")
show()
