So far I'm still working on the K-Means implementation, this is the first working result that I get, still I need to do the following improvements to the code:
1. Improve the readability.
2. Include the maximization function (done, it is used to compare the candidate models)
3. Include the z-score normalization (done)
4. Work on the stability of the clustering by performing the relocation of observations to clusters only if that would reflect a gain on the overall maximization function
5. Manage labeled data
6. Take a sample of the data to improve the scalability
7. Order the results by count of cluster frequency (done)
8. Multiple candidate models (done)
---------------------------------------
9. Other implementations
9.1. LINQ (still C#)
9.2. MatLab
9.3. Java...
9.4. F#...
9.5. C / C++...
10. Execution trace
11. Range based standardization for not normal distributed variables
To invoke the static procedures of the class:
Clustering.WriteCSV("c:\\temp\\output.csv",
Clustering.KMeans(10,Clustering.ReadCSV("C:\\temp\\set.csv"),true,true,10));
Class:
using System;
//using System.Collections.Generic;
using System.Linq;
//using System.Text;
using System.IO;
using System.Globalization;
namespace DM_Foundation
{
class Clustering
{
public static bool Compare(int[] oldCluster,int[] newCluster)
{
bool compare = true;
for (int r = 0; r <= oldCluster.GetUpperBound(0); r++)
{
if (oldCluster[r] != newCluster[r])
{
compare = false;
}
}
return compare;
}
public static int [] KMeans(int clusterCount,double[,] data,bool orderFrequency,bool standadScore,int candidateModels)
{
if (standadScore)
{
data = ZScore(data);
}
int[] selectedClusterRelation = new int[data.GetUpperBound(0)+1];
double minCost = 0;
//Candidate Model Iteration
for (int model = 1; model <= candidateModels; model++)
{
int[] clusterRelation = RandomCluster(clusterCount, data);
int[] oldClusterRelation = new int[clusterRelation.GetUpperBound(0) + 1];
Array.Copy(clusterRelation, oldClusterRelation, clusterRelation.Count());
int iteration = 0;
bool convergence = false;
while (iteration < 1000 & convergence == false)
{
clusterRelation = AssignCluster(clusterCount, clusterRelation, data);
convergence = Compare(oldClusterRelation, clusterRelation);
Array.Copy(clusterRelation, oldClusterRelation, clusterRelation.Count());
iteration++;
}
if (orderFrequency == true)
{
clusterRelation = Clustering.OrderCluster(clusterCount, clusterRelation);
}
double cost = Cost(clusterCount, clusterRelation, data);
if (model == 1)
{
Array.Copy(clusterRelation, selectedClusterRelation, clusterRelation.Count());
minCost = cost;
}
else
{
if (cost < minCost)
{
Array.Copy(clusterRelation, selectedClusterRelation, clusterRelation.Count());
minCost = cost;
}
}
}
return selectedClusterRelation;
}
public static double Cost(int clusterCount, int[] clusterRelation, double[,] data)
{
double[,] clusterCenter = ClusterCenter(clusterCount, clusterRelation, data);
double cost = 0;
for (int row = 0; row <= data.GetUpperBound(0); row++)
{
cost += Math.Pow(Distance(GetRow(row, data), GetRow(clusterRelation[row], clusterCenter)), 2);
}
return cost;
}
public static double[,] GetRow(int row, double[,] data)
{
int columnCount = data.GetUpperBound(1) + 1;
double [,] getRow = new double[1, columnCount];
for (int column = 0; column <= columnCount - 1; column++)
{
getRow[0, column] = data[row, column];
}
return getRow;
}
public static double Distance(double[,] pointA, double[,] pointB)
{
double distance = 0;
int columnCount = pointA.GetUpperBound(1) + 1;
for (int column = 0; column <= columnCount - 1; column++)
{
distance += (pointA[0,column]-pointB[0,column])*(pointA[0,column]-pointB[0,column]);
}
distance = Math.Sqrt(distance);
return distance;
}
public static int[] AssignCluster(int clusterCount, int[] clusterRelation, double[,] data)
{
int[] countByCluster = CountByCluster(clusterCount, clusterRelation);
double[,] clusterCenter = ClusterCenter(clusterCount, clusterRelation, data);
for (int row = 0; row <= clusterRelation.GetUpperBound(0); row++)
{
double maxDistance = Distance( GetRow(row,data),GetRow(clusterRelation[row],clusterCenter));
for (int cluster = 0;cluster <= clusterCount-1;cluster++)
{
if (countByCluster[cluster] != 0)
{
if (maxDistance>=Distance( GetRow(row,data),GetRow(cluster,clusterCenter)))
{
maxDistance=Distance( GetRow(row,data),GetRow(cluster,clusterCenter));
clusterRelation[row]=cluster;
}
}
}
}
return clusterRelation;
}
public static void WriteCSV (string filePath, double [,] data)
{
using (StreamWriter writer = new StreamWriter(filePath))
{
for (int row = 0; row <= data.GetUpperBound(0); row++)
{
string csvRow = "";
int cols = data.GetUpperBound(1) + 1;
for (int col = 0; col <= cols - 1; col++)
{
csvRow += System.Convert.ToString (data[row, col]);
if (col< cols-1 )
{
csvRow += ",";
}
}
writer.WriteLine(csvRow);
}
}
}
public static void WriteCSV(string filePath, int[] data)
{
using (StreamWriter writer = new StreamWriter(filePath))
{
for (int row = 0; row <= data.GetUpperBound(0); row++)
{
string csvRow = "";
csvRow += System.Convert.ToString(data[row]);
writer.WriteLine(csvRow);
}
}
}
public static double [,] ReadCSV(string filePath)
{
// Returns the CSV files data as a double [rows,columns] array
// Takes the number of columns on the first row as the column count
String[] fileContent = File.ReadAllLines(filePath);
int row = 0;
int columnCount = fileContent[0].Split(',').Length;
double[,] readCSV = new double[fileContent.Count(), columnCount];
foreach (string Line in fileContent)
{
string[] split = Line.Split(new[] { ',' });
for (int column=0; column <= columnCount -1;column++)
{
readCSV[row, column] = double.Parse(split[column], CultureInfo.InvariantCulture);
}
row++;
}
return readCSV;
}
public static int[] RandomCluster(int clusterCount, double[,] source)
{
int[] randomCluster = new int[source.GetUpperBound(0)+1];
Random random = new Random();
for (int row = 0; row <= randomCluster.GetUpperBound(0); row++)
{
randomCluster[row] = random.Next(clusterCount );
}
return randomCluster;
}
public static int[] CountByCluster(int clusterCount, int [] obsCluster)
{
int[] countByCluster = new int[clusterCount];
foreach (int obs in obsCluster)
{
countByCluster[obs]++;
}
return countByCluster;
}
public static int[] OrderCluster(int clusterCount, int[] obsCluster)
{
int[] countByCluster = CountByCluster(clusterCount,obsCluster);
int[] countByClusterX = new int[clusterCount];
for (int c = 0; c <= countByCluster.GetUpperBound(0); c++)
{
countByClusterX[c] = c;
}
Array.Sort(countByCluster,countByClusterX);
Array.Reverse(countByClusterX);
for (int c = 0; c <= countByCluster.GetUpperBound(0); c++)
{
int newC = c;
int oldC = countByClusterX[c];
for (int row = 0; row <= obsCluster.GetUpperBound(0); row++)
{
if (obsCluster[row] == newC) { obsCluster[row] = -1; }
}
for (int row = 0; row <= obsCluster.GetUpperBound(0); row++)
{
if (obsCluster[row] == oldC) { obsCluster[row] = newC ; }
}
for (int row = 0; row <= obsCluster.GetUpperBound(0); row++)
{
if (obsCluster[row] == -1) { obsCluster[row] = oldC; }
}
for (int row = 0; row <= countByClusterX.GetUpperBound(0); row++)
{
if (countByClusterX[row] == newC) { countByClusterX[row] = oldC; }
}
}
return obsCluster;
}
public static double[,] ZScore(double[,] data)
{
for (int dim = 0; dim <= data.GetUpperBound(1); dim++)
{
// Mean
double mean = 0;
for (int r = 0; r <= data.GetUpperBound(0); r++)
{
mean += data[r, dim];
}
mean = mean / data.GetUpperBound(0);
// Variance
double variance = 0;
for (int r = 0; r <= data.GetUpperBound(0); r++)
{
variance += Math.Pow((data[r, dim]-mean),2);
}
variance = variance / data.GetUpperBound(0);
double stdv = Math.Sqrt(variance);
// Z-Score - Standard Score
for (int r = 0; r <= data.GetUpperBound(0); r++)
{
data[r, dim] = (data[r, dim] - mean) / stdv;
}
}
return data;
}
public static double[,] ClusterCenter(int clusterCount, int[] obsCluster, double[,] source)
{
int cols = source.GetUpperBound(1)+1;
int[] countByCluster = Clustering.CountByCluster(clusterCount, obsCluster);
double[,] clusterSum = new double[clusterCount, cols];
double[,] clusterCenter = new double[clusterCount, cols];
for (int row = 0;row<=source.GetUpperBound(0);row++)
{
for (int col = 0; col<=cols-1; col++)
{
clusterSum[obsCluster[row], col] += source[row, col];
}
}
for (int row = 0; row <= clusterSum.GetUpperBound(0); row++)
{
if (countByCluster[row] != 0)
{
for (int col = 0; col <= cols - 1; col++)
{
clusterCenter[row, col] = clusterSum[row, col] / (double)countByCluster[row];
}
}
}
return clusterCenter;
}
}
}
1. Improve the readability.
2. Include the maximization function (done, it is used to compare the candidate models)
3. Include the z-score normalization (done)
4. Work on the stability of the clustering by performing the relocation of observations to clusters only if that would reflect a gain on the overall maximization function
5. Manage labeled data
6. Take a sample of the data to improve the scalability
7. Order the results by count of cluster frequency (done)
8. Multiple candidate models (done)
---------------------------------------
9. Other implementations
9.1. LINQ (still C#)
9.2. MatLab
9.3. Java...
9.4. F#...
9.5. C / C++...
10. Execution trace
11. Range based standardization for not normal distributed variables
The task that the system needs to perform is dividing n observations in k clusters where each observation belongs to the cluster with the nearest mean.
To invoke the static procedures of the class:
Clustering.WriteCSV("c:\\temp\\output.csv",
Clustering.KMeans(10,Clustering.ReadCSV("C:\\temp\\set.csv"),true,true,10));
Class:
using System;
//using System.Collections.Generic;
using System.Linq;
//using System.Text;
using System.IO;
using System.Globalization;
namespace DM_Foundation
{
class Clustering
{
public static bool Compare(int[] oldCluster,int[] newCluster)
{
bool compare = true;
for (int r = 0; r <= oldCluster.GetUpperBound(0); r++)
{
if (oldCluster[r] != newCluster[r])
{
compare = false;
}
}
return compare;
}
public static int [] KMeans(int clusterCount,double[,] data,bool orderFrequency,bool standadScore,int candidateModels)
{
if (standadScore)
{
data = ZScore(data);
}
int[] selectedClusterRelation = new int[data.GetUpperBound(0)+1];
double minCost = 0;
//Candidate Model Iteration
for (int model = 1; model <= candidateModels; model++)
{
int[] clusterRelation = RandomCluster(clusterCount, data);
int[] oldClusterRelation = new int[clusterRelation.GetUpperBound(0) + 1];
Array.Copy(clusterRelation, oldClusterRelation, clusterRelation.Count());
int iteration = 0;
bool convergence = false;
while (iteration < 1000 & convergence == false)
{
clusterRelation = AssignCluster(clusterCount, clusterRelation, data);
convergence = Compare(oldClusterRelation, clusterRelation);
Array.Copy(clusterRelation, oldClusterRelation, clusterRelation.Count());
iteration++;
}
if (orderFrequency == true)
{
clusterRelation = Clustering.OrderCluster(clusterCount, clusterRelation);
}
double cost = Cost(clusterCount, clusterRelation, data);
if (model == 1)
{
Array.Copy(clusterRelation, selectedClusterRelation, clusterRelation.Count());
minCost = cost;
}
else
{
if (cost < minCost)
{
Array.Copy(clusterRelation, selectedClusterRelation, clusterRelation.Count());
minCost = cost;
}
}
}
return selectedClusterRelation;
}
public static double Cost(int clusterCount, int[] clusterRelation, double[,] data)
{
double[,] clusterCenter = ClusterCenter(clusterCount, clusterRelation, data);
double cost = 0;
for (int row = 0; row <= data.GetUpperBound(0); row++)
{
cost += Math.Pow(Distance(GetRow(row, data), GetRow(clusterRelation[row], clusterCenter)), 2);
}
return cost;
}
public static double[,] GetRow(int row, double[,] data)
{
int columnCount = data.GetUpperBound(1) + 1;
double [,] getRow = new double[1, columnCount];
for (int column = 0; column <= columnCount - 1; column++)
{
getRow[0, column] = data[row, column];
}
return getRow;
}
public static double Distance(double[,] pointA, double[,] pointB)
{
double distance = 0;
int columnCount = pointA.GetUpperBound(1) + 1;
for (int column = 0; column <= columnCount - 1; column++)
{
distance += (pointA[0,column]-pointB[0,column])*(pointA[0,column]-pointB[0,column]);
}
distance = Math.Sqrt(distance);
return distance;
}
public static int[] AssignCluster(int clusterCount, int[] clusterRelation, double[,] data)
{
int[] countByCluster = CountByCluster(clusterCount, clusterRelation);
double[,] clusterCenter = ClusterCenter(clusterCount, clusterRelation, data);
for (int row = 0; row <= clusterRelation.GetUpperBound(0); row++)
{
double maxDistance = Distance( GetRow(row,data),GetRow(clusterRelation[row],clusterCenter));
for (int cluster = 0;cluster <= clusterCount-1;cluster++)
{
if (countByCluster[cluster] != 0)
{
if (maxDistance>=Distance( GetRow(row,data),GetRow(cluster,clusterCenter)))
{
maxDistance=Distance( GetRow(row,data),GetRow(cluster,clusterCenter));
clusterRelation[row]=cluster;
}
}
}
}
return clusterRelation;
}
public static void WriteCSV (string filePath, double [,] data)
{
using (StreamWriter writer = new StreamWriter(filePath))
{
for (int row = 0; row <= data.GetUpperBound(0); row++)
{
string csvRow = "";
int cols = data.GetUpperBound(1) + 1;
for (int col = 0; col <= cols - 1; col++)
{
csvRow += System.Convert.ToString (data[row, col]);
if (col< cols-1 )
{
csvRow += ",";
}
}
writer.WriteLine(csvRow);
}
}
}
public static void WriteCSV(string filePath, int[] data)
{
using (StreamWriter writer = new StreamWriter(filePath))
{
for (int row = 0; row <= data.GetUpperBound(0); row++)
{
string csvRow = "";
csvRow += System.Convert.ToString(data[row]);
writer.WriteLine(csvRow);
}
}
}
public static double [,] ReadCSV(string filePath)
{
// Returns the CSV files data as a double [rows,columns] array
// Takes the number of columns on the first row as the column count
String[] fileContent = File.ReadAllLines(filePath);
int row = 0;
int columnCount = fileContent[0].Split(',').Length;
double[,] readCSV = new double[fileContent.Count(), columnCount];
foreach (string Line in fileContent)
{
string[] split = Line.Split(new[] { ',' });
for (int column=0; column <= columnCount -1;column++)
{
readCSV[row, column] = double.Parse(split[column], CultureInfo.InvariantCulture);
}
row++;
}
return readCSV;
}
public static int[] RandomCluster(int clusterCount, double[,] source)
{
int[] randomCluster = new int[source.GetUpperBound(0)+1];
Random random = new Random();
for (int row = 0; row <= randomCluster.GetUpperBound(0); row++)
{
randomCluster[row] = random.Next(clusterCount );
}
return randomCluster;
}
public static int[] CountByCluster(int clusterCount, int [] obsCluster)
{
int[] countByCluster = new int[clusterCount];
foreach (int obs in obsCluster)
{
countByCluster[obs]++;
}
return countByCluster;
}
public static int[] OrderCluster(int clusterCount, int[] obsCluster)
{
int[] countByCluster = CountByCluster(clusterCount,obsCluster);
int[] countByClusterX = new int[clusterCount];
for (int c = 0; c <= countByCluster.GetUpperBound(0); c++)
{
countByClusterX[c] = c;
}
Array.Sort(countByCluster,countByClusterX);
Array.Reverse(countByClusterX);
for (int c = 0; c <= countByCluster.GetUpperBound(0); c++)
{
int newC = c;
int oldC = countByClusterX[c];
for (int row = 0; row <= obsCluster.GetUpperBound(0); row++)
{
if (obsCluster[row] == newC) { obsCluster[row] = -1; }
}
for (int row = 0; row <= obsCluster.GetUpperBound(0); row++)
{
if (obsCluster[row] == oldC) { obsCluster[row] = newC ; }
}
for (int row = 0; row <= obsCluster.GetUpperBound(0); row++)
{
if (obsCluster[row] == -1) { obsCluster[row] = oldC; }
}
for (int row = 0; row <= countByClusterX.GetUpperBound(0); row++)
{
if (countByClusterX[row] == newC) { countByClusterX[row] = oldC; }
}
}
return obsCluster;
}
public static double[,] ZScore(double[,] data)
{
for (int dim = 0; dim <= data.GetUpperBound(1); dim++)
{
// Mean
double mean = 0;
for (int r = 0; r <= data.GetUpperBound(0); r++)
{
mean += data[r, dim];
}
mean = mean / data.GetUpperBound(0);
// Variance
double variance = 0;
for (int r = 0; r <= data.GetUpperBound(0); r++)
{
variance += Math.Pow((data[r, dim]-mean),2);
}
variance = variance / data.GetUpperBound(0);
double stdv = Math.Sqrt(variance);
// Z-Score - Standard Score
for (int r = 0; r <= data.GetUpperBound(0); r++)
{
data[r, dim] = (data[r, dim] - mean) / stdv;
}
}
return data;
}
public static double[,] ClusterCenter(int clusterCount, int[] obsCluster, double[,] source)
{
int cols = source.GetUpperBound(1)+1;
int[] countByCluster = Clustering.CountByCluster(clusterCount, obsCluster);
double[,] clusterSum = new double[clusterCount, cols];
double[,] clusterCenter = new double[clusterCount, cols];
for (int row = 0;row<=source.GetUpperBound(0);row++)
{
for (int col = 0; col<=cols-1; col++)
{
clusterSum[obsCluster[row], col] += source[row, col];
}
}
for (int row = 0; row <= clusterSum.GetUpperBound(0); row++)
{
if (countByCluster[row] != 0)
{
for (int col = 0; col <= cols - 1; col++)
{
clusterCenter[row, col] = clusterSum[row, col] / (double)countByCluster[row];
}
}
}
return clusterCenter;
}
}
}