Sunday, August 26, 2012

What do we need to implement the EM (Expectation Maximization) algorithm ?

The following components are required:

0. Matrix calculations foundation

0.1. Product
0.2. Transposition
0.3. Escalar product
0.4. Determinant
0.5. Sum

Hopefully no matrix inversion would be needed!!!!

1. Multivariate normal probability function (Multinomial) 

2. Mean and standard deviation estimation function (Maximum likelihood)

3. Covariance calculation function (which is the standard deviation source)

4. Some nice plotting can help to show the result

5. Looks like the most complex component would be the Acumulated Multivariate, it involves multiple integrals, basically a numeric method need to be used, to do that we need to research on:

5.1. Montecarlo integration methods

5.2. Cholesky matrix decomposition





Sunday, February 19, 2012

Implementing the K-Means algorithm on C#

So far I'm still working on the K-Means implementation, this is the first working result that I get, still I need to do the following improvements to the code:

1. Improve the readability.
2. Include the maximization function (done, it is used to compare the candidate models)
3. Include the z-score normalization (done)
4. Work on the stability of the clustering by performing the relocation of observations to clusters only if that would reflect a gain on the overall maximization function
5. Manage labeled data
6. Take a sample of the data to improve the scalability
7. Order the results by count of cluster frequency (done)
8. Multiple candidate models (done)
---------------------------------------
9. Other implementations 
9.1. LINQ (still C#)
9.2. MatLab
9.3. Java...
9.4. F#...
9.5. C / C++...


10. Execution trace


11. Range based standardization for not normal distributed variables 


The task that the system needs to perform is dividing n observations in k clusters where each observation belongs to the cluster with the nearest mean.


To invoke the static procedures of the class:


Clustering.WriteCSV("c:\\temp\\output.csv",
            Clustering.KMeans(10,Clustering.ReadCSV("C:\\temp\\set.csv"),true,true,10));


Class:





using System;
//using System.Collections.Generic;
using System.Linq;
//using System.Text;
using System.IO;
using System.Globalization;


namespace DM_Foundation
{
    class Clustering
    {






        public static bool Compare(int[] oldCluster,int[] newCluster)
    {
        bool compare = true;

        for (int r = 0; r <= oldCluster.GetUpperBound(0); r++)
        {

            if (oldCluster[r] != newCluster[r])
            {
                compare = false;        
            }

        }

        return compare;

    }

        public static int [] KMeans(int clusterCount,double[,] data,bool orderFrequency,bool standadScore,int candidateModels)
        {


            if (standadScore)
            {
                data = ZScore(data);
            }


            int[] selectedClusterRelation = new int[data.GetUpperBound(0)+1];

            double minCost = 0;
            //Candidate Model Iteration

            for (int model = 1; model <= candidateModels; model++)
            {

                int[] clusterRelation = RandomCluster(clusterCount, data);

                int[] oldClusterRelation = new int[clusterRelation.GetUpperBound(0) + 1];

                Array.Copy(clusterRelation, oldClusterRelation, clusterRelation.Count());

                int iteration = 0;
                bool convergence = false;

                while (iteration < 1000 & convergence == false)
                {
                    clusterRelation = AssignCluster(clusterCount, clusterRelation, data);
                    convergence = Compare(oldClusterRelation, clusterRelation);
                    Array.Copy(clusterRelation, oldClusterRelation, clusterRelation.Count());
                    iteration++;
                }

                if (orderFrequency == true)
                {
                    clusterRelation = Clustering.OrderCluster(clusterCount, clusterRelation);
                }


                double cost = Cost(clusterCount, clusterRelation, data);


                if (model == 1)
                {
                    Array.Copy(clusterRelation, selectedClusterRelation, clusterRelation.Count());
                    minCost = cost;
                }

                else
                {
                    if (cost < minCost)
                    {

                        Array.Copy(clusterRelation, selectedClusterRelation, clusterRelation.Count());
                        minCost = cost;
                    }

                }
             

            }


            return selectedClusterRelation;


        }



        public static double Cost(int clusterCount, int[] clusterRelation, double[,] data)
        {

            double[,] clusterCenter = ClusterCenter(clusterCount, clusterRelation, data);

            double cost = 0;

            for (int row = 0; row <= data.GetUpperBound(0); row++)
            {
                cost += Math.Pow(Distance(GetRow(row, data), GetRow(clusterRelation[row], clusterCenter)), 2);

            }
            return cost;
        }

        public static double[,] GetRow(int row, double[,] data)
        {
            int columnCount = data.GetUpperBound(1) + 1;
            double [,] getRow = new double[1, columnCount];

            for (int column = 0; column <= columnCount - 1; column++)
            {
                getRow[0, column] = data[row, column];
            }
            return getRow;
        }

        public static double Distance(double[,] pointA, double[,] pointB)
        {
            double distance = 0;


             int columnCount = pointA.GetUpperBound(1) + 1;
         
            for (int column = 0; column <= columnCount - 1; column++)
            {
                distance += (pointA[0,column]-pointB[0,column])*(pointA[0,column]-pointB[0,column]);
            }

            distance = Math.Sqrt(distance);
            return distance;

        }


        public static int[] AssignCluster(int clusterCount, int[] clusterRelation, double[,] data)
        {

            int[] countByCluster = CountByCluster(clusterCount, clusterRelation);
            double[,] clusterCenter = ClusterCenter(clusterCount, clusterRelation, data);

            for (int row = 0; row <= clusterRelation.GetUpperBound(0); row++)
            {

               double maxDistance = Distance( GetRow(row,data),GetRow(clusterRelation[row],clusterCenter));

                for (int cluster = 0;cluster <= clusterCount-1;cluster++)

                {
                 
                    if (countByCluster[cluster] != 0)
                    {
                    if (maxDistance>=Distance( GetRow(row,data),GetRow(cluster,clusterCenter)))
                    {
                        maxDistance=Distance( GetRow(row,data),GetRow(cluster,clusterCenter));
                        clusterRelation[row]=cluster;
                    }
                    }

                }

            }

            return clusterRelation;
        }


        public static void WriteCSV (string filePath, double [,] data)
             
        {
            using (StreamWriter writer = new StreamWriter(filePath))
            {
                for (int row = 0; row <= data.GetUpperBound(0); row++)
                {

                    string csvRow = "";

                    int cols = data.GetUpperBound(1) + 1;

                    for (int col = 0; col <= cols - 1; col++)
                    {
                     
                     
                        csvRow += System.Convert.ToString (data[row, col]);
                     
                        if (col< cols-1 )
                        {
                            csvRow += ",";
                        }
                    }
                    writer.WriteLine(csvRow);
                }
            }
        }

        public static void WriteCSV(string filePath, int[] data)
        {
            using (StreamWriter writer = new StreamWriter(filePath))
            {
                for (int row = 0; row <= data.GetUpperBound(0); row++)
                {

                    string csvRow = "";
                    csvRow += System.Convert.ToString(data[row]);
                    writer.WriteLine(csvRow);
                 }
                 
             }
          }
     

        public static double [,] ReadCSV(string filePath)
        {
            // Returns the CSV files data as a double [rows,columns] array
            // Takes the number of columns on the first row as the column count
         
            String[] fileContent = File.ReadAllLines(filePath);
            int row = 0;
            int columnCount = fileContent[0].Split(',').Length;
            double[,] readCSV = new double[fileContent.Count(), columnCount];
            foreach (string Line in fileContent)
            {
                string[] split = Line.Split(new[] { ',' });
                for (int column=0; column <= columnCount -1;column++)
                {
                    readCSV[row, column] = double.Parse(split[column], CultureInfo.InvariantCulture);
                }
             
                row++;
            }
            return readCSV;
        }

        public static int[] RandomCluster(int clusterCount, double[,] source)
    {

     
         
            int[] randomCluster = new int[source.GetUpperBound(0)+1];
            Random random = new Random();
            for (int row = 0; row <= randomCluster.GetUpperBound(0); row++)
         
            {
             
                randomCluster[row] = random.Next(clusterCount );
            }

            return randomCluster;


    }


        public static int[] CountByCluster(int clusterCount, int [] obsCluster)
        {

            int[] countByCluster = new int[clusterCount];
            foreach (int obs in obsCluster)
            {

                countByCluster[obs]++;
            }
            return countByCluster;
        }



        public static int[] OrderCluster(int clusterCount, int[] obsCluster)

    {





        int[] countByCluster = CountByCluster(clusterCount,obsCluster);

        int[] countByClusterX = new int[clusterCount];

        for (int c = 0; c <= countByCluster.GetUpperBound(0); c++)
        {

            countByClusterX[c] = c;
         

        }

        Array.Sort(countByCluster,countByClusterX);

        Array.Reverse(countByClusterX);

        for (int c = 0; c <= countByCluster.GetUpperBound(0); c++)
        {

            int newC = c;
            int oldC = countByClusterX[c];

            for (int row = 0; row <= obsCluster.GetUpperBound(0); row++)
            {

                if (obsCluster[row] == newC) { obsCluster[row] = -1; }
            }

            for (int row = 0; row <= obsCluster.GetUpperBound(0); row++)
            {

                if (obsCluster[row] == oldC) { obsCluster[row] = newC ; }
            }

            for (int row = 0; row <= obsCluster.GetUpperBound(0); row++)
            {

                if (obsCluster[row] == -1) { obsCluster[row] = oldC; }
            }

            for (int row = 0; row <= countByClusterX.GetUpperBound(0); row++)
            {

                if (countByClusterX[row] == newC) { countByClusterX[row] = oldC; }
            }
         
        }


        return obsCluster;
    }



        public static double[,] ZScore(double[,] data)
        {


            for (int dim = 0; dim <= data.GetUpperBound(1); dim++)
            {

                // Mean

                double mean = 0;

                for (int r = 0; r <= data.GetUpperBound(0); r++)
                {
                    mean += data[r, dim];
                }

                mean = mean / data.GetUpperBound(0);

                // Variance

                double variance = 0;

                for (int r = 0; r <= data.GetUpperBound(0); r++)
                {
                    variance += Math.Pow((data[r, dim]-mean),2);
                }

                variance = variance / data.GetUpperBound(0);
                double stdv = Math.Sqrt(variance);



                // Z-Score - Standard Score

             

                for (int r = 0; r <= data.GetUpperBound(0); r++)
                {
                    data[r, dim] = (data[r, dim] - mean) / stdv;
                }


             
            }

            return data;

        }

        public static double[,] ClusterCenter(int clusterCount, int[] obsCluster, double[,] source)
        {

            int cols = source.GetUpperBound(1)+1;

            int[] countByCluster = Clustering.CountByCluster(clusterCount, obsCluster);
            double[,] clusterSum = new double[clusterCount, cols];
            double[,] clusterCenter = new double[clusterCount, cols];

            for (int row = 0;row<=source.GetUpperBound(0);row++)

            {
             
                for (int col = 0; col<=cols-1; col++)
                {
                    clusterSum[obsCluster[row], col] += source[row, col];
                }
             
            }


            for (int row = 0; row <= clusterSum.GetUpperBound(0); row++)

            {
                if (countByCluster[row] != 0)
                {
                    for (int col = 0; col <= cols - 1; col++)
                    {
                        clusterCenter[row, col] = clusterSum[row, col] / (double)countByCluster[row];
                    }
                }
            }

            return clusterCenter;
        }




    }
}