Eclectic Data Mining

So far I'm still working on the K-Means implementation, this is the first working result that I get, still I need to do the following improvements to the code:

1. Improve the readability.
2. Include the maximization function (done, it is used to compare the candidate models)
3. Include the z-score normalization (done)
4. Work on the stability of the clustering by performing the relocation of observations to clusters only if that would reflect a gain on the overall maximization function
5. Manage labeled data
6. Take a sample of the data to improve the scalability
7. Order the results by count of cluster frequency (done)
8. Multiple candidate models (done)
---------------------------------------
9. Other implementations
9.1. LINQ (still C#)
9.2. MatLab
9.3. Java...
9.4. F#...
9.5. C / C++...

10. Execution trace

11. Range based standardization for not normal distributed variables

The task that the system needs to perform is dividing n observations in k clusters where each observation belongs to the cluster with the nearest mean.

To invoke the static procedures of the class:

Clustering.WriteCSV("c:\\temp\\output.csv",
Clustering.KMeans(10,Clustering.ReadCSV("C:\\temp\\set.csv"),true,true,10));

Class:

using System;
//using System.Collections.Generic;
using System.Linq;
//using System.Text;
using System.IO;
using System.Globalization;

namespace DM_Foundation
{
class Clustering
{

public static bool Compare(int[] oldCluster,int[] newCluster)
{
bool compare = true;

for (int r = 0; r <= oldCluster.GetUpperBound(0); r++)
{

if (oldCluster[r] != newCluster[r])
{
compare = false;
}

}

return compare;

}

public static int [] KMeans(int clusterCount,double[,] data,bool orderFrequency,bool standadScore,int candidateModels)
{

if (standadScore)
{
data = ZScore(data);
}

int[] selectedClusterRelation = new int[data.GetUpperBound(0)+1];

double minCost = 0;
//Candidate Model Iteration

for (int model = 1; model <= candidateModels; model++)
{

int[] clusterRelation = RandomCluster(clusterCount, data);

int[] oldClusterRelation = new int[clusterRelation.GetUpperBound(0) + 1];

Array.Copy(clusterRelation, oldClusterRelation, clusterRelation.Count());

int iteration = 0;
bool convergence = false;

while (iteration < 1000 & convergence == false)
{
clusterRelation = AssignCluster(clusterCount, clusterRelation, data);
convergence = Compare(oldClusterRelation, clusterRelation);
Array.Copy(clusterRelation, oldClusterRelation, clusterRelation.Count());
iteration++;
}

if (orderFrequency == true)
{
clusterRelation = Clustering.OrderCluster(clusterCount, clusterRelation);
}

double cost = Cost(clusterCount, clusterRelation, data);

if (model == 1)
{
Array.Copy(clusterRelation, selectedClusterRelation, clusterRelation.Count());
minCost = cost;
}

else
{
if (cost < minCost)
{

Array.Copy(clusterRelation, selectedClusterRelation, clusterRelation.Count());
minCost = cost;
}

}

}

return selectedClusterRelation;

}

public static double Cost(int clusterCount, int[] clusterRelation, double[,] data)
{

double[,] clusterCenter = ClusterCenter(clusterCount, clusterRelation, data);

double cost = 0;

for (int row = 0; row <= data.GetUpperBound(0); row++)
{
cost += Math.Pow(Distance(GetRow(row, data), GetRow(clusterRelation[row], clusterCenter)), 2);

}
return cost;
}

public static double[,] GetRow(int row, double[,] data)
{
int columnCount = data.GetUpperBound(1) + 1;
double [,] getRow = new double[1, columnCount];

for (int column = 0; column <= columnCount - 1; column++)
{
getRow[0, column] = data[row, column];
}
return getRow;
}

public static double Distance(double[,] pointA, double[,] pointB)
{
double distance = 0;

int columnCount = pointA.GetUpperBound(1) + 1;

for (int column = 0; column <= columnCount - 1; column++)
{
distance += (pointA[0,column]-pointB[0,column])*(pointA[0,column]-pointB[0,column]);
}

distance = Math.Sqrt(distance);
return distance;

}

public static int[] AssignCluster(int clusterCount, int[] clusterRelation, double[,] data)
{

int[] countByCluster = CountByCluster(clusterCount, clusterRelation);
double[,] clusterCenter = ClusterCenter(clusterCount, clusterRelation, data);

for (int row = 0; row <= clusterRelation.GetUpperBound(0); row++)
{

double maxDistance = Distance( GetRow(row,data),GetRow(clusterRelation[row],clusterCenter));

for (int cluster = 0;cluster <= clusterCount-1;cluster++)

{

if (countByCluster[cluster] != 0)
{
if (maxDistance>=Distance( GetRow(row,data),GetRow(cluster,clusterCenter)))
{
maxDistance=Distance( GetRow(row,data),GetRow(cluster,clusterCenter));
clusterRelation[row]=cluster;
}
}

}

}

return clusterRelation;
}

public static void WriteCSV (string filePath, double [,] data)

{
using (StreamWriter writer = new StreamWriter(filePath))
{
for (int row = 0; row <= data.GetUpperBound(0); row++)
{

string csvRow = "";

int cols = data.GetUpperBound(1) + 1;

for (int col = 0; col <= cols - 1; col++)
{

csvRow += System.Convert.ToString (data[row, col]);

if (col< cols-1 )
{
csvRow += ",";
}
}
writer.WriteLine(csvRow);
}
}
}

public static void WriteCSV(string filePath, int[] data)
{
using (StreamWriter writer = new StreamWriter(filePath))
{
for (int row = 0; row <= data.GetUpperBound(0); row++)
{

string csvRow = "";
csvRow += System.Convert.ToString(data[row]);
writer.WriteLine(csvRow);
}

}
}

public static double [,] ReadCSV(string filePath)
{
// Returns the CSV files data as a double [rows,columns] array
// Takes the number of columns on the first row as the column count

String[] fileContent = File.ReadAllLines(filePath);
int row = 0;
int columnCount = fileContent[0].Split(',').Length;
double[,] readCSV = new double[fileContent.Count(), columnCount];
foreach (string Line in fileContent)
{
string[] split = Line.Split(new[] { ',' });
for (int column=0; column <= columnCount -1;column++)
{
readCSV[row, column] = double.Parse(split[column], CultureInfo.InvariantCulture);
}

row++;
}
return readCSV;
}

public static int[] RandomCluster(int clusterCount, double[,] source)
{

int[] randomCluster = new int[source.GetUpperBound(0)+1];
Random random = new Random();
for (int row = 0; row <= randomCluster.GetUpperBound(0); row++)

{

randomCluster[row] = random.Next(clusterCount );
}

return randomCluster;

}

public static int[] CountByCluster(int clusterCount, int [] obsCluster)
{

int[] countByCluster = new int[clusterCount];
foreach (int obs in obsCluster)
{

countByCluster[obs]++;
}
return countByCluster;
}

public static int[] OrderCluster(int clusterCount, int[] obsCluster)

{

int[] countByCluster = CountByCluster(clusterCount,obsCluster);

int[] countByClusterX = new int[clusterCount];

for (int c = 0; c <= countByCluster.GetUpperBound(0); c++)
{

countByClusterX[c] = c;

}

Array.Sort(countByCluster,countByClusterX);

Array.Reverse(countByClusterX);

for (int c = 0; c <= countByCluster.GetUpperBound(0); c++)
{

int newC = c;
int oldC = countByClusterX[c];

for (int row = 0; row <= obsCluster.GetUpperBound(0); row++)
{

if (obsCluster[row] == newC) { obsCluster[row] = -1; }
}

for (int row = 0; row <= obsCluster.GetUpperBound(0); row++)
{

if (obsCluster[row] == oldC) { obsCluster[row] = newC ; }
}

for (int row = 0; row <= obsCluster.GetUpperBound(0); row++)
{

if (obsCluster[row] == -1) { obsCluster[row] = oldC; }
}

for (int row = 0; row <= countByClusterX.GetUpperBound(0); row++)
{

if (countByClusterX[row] == newC) { countByClusterX[row] = oldC; }
}

}

return obsCluster;
}

public static double[,] ZScore(double[,] data)
{

for (int dim = 0; dim <= data.GetUpperBound(1); dim++)
{

// Mean

double mean = 0;

for (int r = 0; r <= data.GetUpperBound(0); r++)
{
mean += data[r, dim];
}

mean = mean / data.GetUpperBound(0);

// Variance

double variance = 0;

for (int r = 0; r <= data.GetUpperBound(0); r++)
{
variance += Math.Pow((data[r, dim]-mean),2);
}

variance = variance / data.GetUpperBound(0);
double stdv = Math.Sqrt(variance);

// Z-Score - Standard Score

for (int r = 0; r <= data.GetUpperBound(0); r++)
{
data[r, dim] = (data[r, dim] - mean) / stdv;
}

}

return data;

}

public static double[,] ClusterCenter(int clusterCount, int[] obsCluster, double[,] source)
{

int cols = source.GetUpperBound(1)+1;

int[] countByCluster = Clustering.CountByCluster(clusterCount, obsCluster);
double[,] clusterSum = new double[clusterCount, cols];
double[,] clusterCenter = new double[clusterCount, cols];

for (int row = 0;row<=source.GetUpperBound(0);row++)

{

for (int col = 0; col<=cols-1; col++)
{
clusterSum[obsCluster[row], col] += source[row, col];
}

}

for (int row = 0; row <= clusterSum.GetUpperBound(0); row++)

{
if (countByCluster[row] != 0)
{
for (int col = 0; col <= cols - 1; col++)
{
clusterCenter[row, col] = clusterSum[row, col] / (double)countByCluster[row];
}
}
}

return clusterCenter;
}

}
}

To connect to SQL Server from Matlab can be a very tricky task, specially because the ODBC connectivity is implemented in Matlab by using a JDBC bridge to ODBC technology, I spent almost 3 hours trying to use ODBC connectivity, after that I moved to the JDBC driver, the installation process is a little bit tricky, the process is the following:

1. Download the JDBC driver from Microsoft from:

http://www.microsoft.com/downloads/details.aspx?FamilyID=99b21b65-e98f-4a61-b811-19912601fdc9&displaylang=en

2. Execute it, in will be decompressed to the selected location

3. Start the Notepad application as administrator and open the file:

C:\Program Files\MATLAB\R2010a\toolbox\local\classpath.txt

4. Add a reference like this to the JDBC driver after the last line of that file:

c:/SQLJDBC/sqljdbc_2.0/enu/sqljdbc4.jar

5. After that you have to restart MATLAB and then you can test the connection:

dbConn = database('master', 'user', 'password', 'com.microsoft.sqlserver.jdbc.SQLServerDriver', 'jdbc:sqlserver://localhost:1433;databaseName=master;');

6. To test the connection you can execute:

ping(dbConn);

7. Please not that the local instance of SQL Server require to have the TCP/IP protocol network connections enabled.

Eclectic Data Mining

Sunday, August 26, 2012

What do we need to implement the EM (Expectation Maximization) algorithm ?

Sunday, February 19, 2012

Implementing the K-Means algorithm on C#

Saturday, April 17, 2010

The problems of Data Mining

Friday, April 16, 2010

Which tool chose to learn Data Mining?

Thursday, April 15, 2010

Connect to SQL Server from Matlab

Wednesday, April 14, 2010

SQL Server 2008 Data Mining Functionalities

Followers

Blog Archive

About Me