Rash thoughts about .NET, C#, F# and Dynamics NAV.


"Every solution will only lead to new problems."

Saturday, 31. January 2009


n-dimensional k-means clustering with F# – part II

Filed under: BioInformatik,English posts,F#,Informatik — Steffen Forkmann at 11:28 Uhr

In the last post I show how we can calculate clusters of n-dimensional Objects with the K-means-Algorithm in F#. This time I will simplify the code by using the built-in type vector.

First of all we redefine the interface for “clusterable” objects:

type IClusterable =
  abstract DimValues: vector with get

This time we do not need any dimension information – the vector type will care about this. Now we can reduce our distance function to:

let calcDist (p1:vector) (p2:vector) = (p1 - p2).Norm

The function Norm calculates the 2-Norm of a vector.

2-Norm ==> Eulidean distance 

So calcDist will still compute the n-dimensional Euclidean distance.

The next step is the definition of a n-dimensional Centroid type. This time we will only use a type alias of vector:

type Centroid = vector

The calculation of the centroid and the centroid error can be written as:

let calcCentroid (items:'a list when 'a :> IClusterable)
     (oldCentroid:Centroid) =
  let count = items.Length
  if count = 0 then oldCentroid else  
  let sum =
    items
      |> List.fold_left 
        (fun vec item -> vec + item.DimValues) // vector addition
        (nullVector oldCentroid)
   
  sum * (1./(count |> float))  // multiply with scalar
     

let calcError centroid (items:'a list when 'a :> IClusterable) =
  let sq x = x * x

  items
    |> List.sum_by (fun i -> calcDist centroid i.DimValues |> sq)

The calcError function is straight forward but not perfect in terms of performance. As we use the calcDist function and hence the 2-Norm the function will first calculate a square root and then squares the result.

The rest of the code is pretty much same as in the last post:

/// creates n-dimensional 0 - vector
let nullVector (v:vector) = v * 0.
    
type Cluster<'a> when 'a :> IClusterable =
  {Items: 'a list;
   Count: int;
   Centroid: Centroid;
   Error: float}

  member x.Print =
     printfn "(Items: %d, Centroid: %A, Error: %.2f)"
       x.Count x.Centroid x.Error

  static member CreateCluster (item:'a) =
    let items = [item]
    let empty = nullVector item.DimValues

    let centroid = calcCentroid items empty
    {new Cluster<'a> with
      Items = items and
      Count = 1 and
      Centroid = centroid and
      Error = calcError centroid items}

  member x.EvolveCluster (items:'a list) =
    let l = items.Length
    let centroid = calcCentroid items x.Centroid
    {x with
      Items = items;
      Count = l;
      Centroid = centroid;
      Error = calcError centroid items} 

let rand = new Random()  

let InitClusters (k:int)
      (items:'a array when 'a :> IClusterable)  =
  let length = items.Length
  let getInitItem() = items.[rand.Next length]
  Array.init k (fun x ->
                  getInitItem()
                    |> Cluster<'a>.CreateCluster)
                    
                    
let AssignItemsToClusters k (clusters:Cluster<'a> array)
      (items:'a seq when 'a :> IClusterable) =
  if k <= 0 then failwith "KMeans needs k > 0"
  let findNearestCluster (item:'a) =
    let minDist,nearest,lastPos =
      clusters
        |> Array.fold_left
            (fun (minDist,nearest,pos) cluster ->
               let distance = calcDist item.DimValues (cluster.Centroid)
               if distance < minDist then
                 (distance,pos,pos+1)
               else
                 (minDist,nearest,pos+1))
            (Double.PositiveInfinity,0,0)
    nearest

  let assigned =
    items
      |> Seq.map (fun item -> item,findNearestCluster item)

  let newClusters = Array.create k []
  for item,nearest in assigned do
    newClusters.[nearest] <- item::(newClusters.[nearest])

  clusters
    |> Array.mapi (fun i c -> c.EvolveCluster newClusters.[i])  

    
let calcClusterError (clusters:Cluster<'a> array) =
  clusters
    |> Array.fold_left (fun acc cluster -> acc + cluster.Error) 0.  
                      
let kMeansClustering K epsilon
       (items:'a array when 'a :> IClusterable) =
  let k = if K <= 0 then 1 else K
  let rec clustering lastError (clusters:Cluster<'a> array) =
    let newClusters =
      AssignItemsToClusters k clusters items
    let newError = calcClusterError newClusters

    if abs(lastError - newError) > epsilon then
      clustering newError newClusters
    else
      newClusters,newError    

  InitClusters k items
    |> clustering Double.PositiveInfinity

This vector approach shortens the code for the kMeans algorithm, but has a little longer runtime. I am interested in any suggestions.

Tags: , , , ,

Thursday, 29. January 2009


n-dimensional k-means clustering with F#

Filed under: BioInformatik,English posts,F#,Informatik — Steffen Forkmann at 11:38 Uhr

The K-means-Algorithm is one of the simplest unsupervised learning algorithms that solve the well known clustering problem. In this article I will show how we can implement this in F#.

First of all we define an interface for “clusterable” objects:

type IClusterable =

  abstract DimValues: float array with get

  abstract Dimensions: int

The next step is to define a distance function. We will use n-dimensional Euclidean distance here.

Euclidean distance

let calcDist (p1:IClusterable) (p2:IClusterable) =

  let sq x = x * x

  if p1.Dimensions <> p2.Dimensions then

    failwith "Cluster dimensions aren’t equal."

 

  p2.DimValues

    |> Array.fold2

        (fun acc x y -> x – y |> sq |> (+) acc)

        0. p1.DimValues

   |> sqrt 

Now we define a n-dimensional Centroid type. Our kMeans-Algorithm will minimize the squared distances to k centroids (or “means”).

type Centroid =

  {Values: float array;

   dimensions: int}

  member x.Print = printfn "%A" x.Values

  interface IClusterable with

    member x.DimValues = x.Values   

    member x.Dimensions = x.dimensions   

The centroid of a finite set of n-dimensional points x1, x2, …, xn is calculated as

image

let calcCentroid (items:’a list when ‘a :> IClusterable)

     (oldCentroid:Centroid) =

  let count = items.Length

  if count = 0 then oldCentroid else

  let mValues =    

    [|for d in 0..oldCentroid.dimensions-1 do

        let sum =

          items

            |> List.sumBy (fun item -> item.DimValues.[d])

 

        yield sum / (count |> float)|]

 

  { Values = mValues;

    dimensions = oldCentroid.dimensions}

We made a small modification – if we don’t have any items assigned to a cluster the centroid won’t change. This is important for the robustness of the algorithm.

Now we can calculate the squared errors to a centroid:

let calcError centroid (items:’a list when ‘a :> IClusterable) =

  let calcDiffToCentroid (item:’a) =

    centroid.Values

      |> Array.fold2

          (fun acc c i -> acc + (c-i)*(c-i))

          0. item.DimValues

 

  items

    |> List.sumBy calcDiffToCentroid 

For storing cluster information we create a cluster type:

type Cluster<‘a> when ‘a :> IClusterable =

  {Items: ‘a list;

   Count: int;

   Centroid: Centroid;

   Error: float}

 

  member x.Print =

     printfn "(Items: %d, Centroid: %A, Error: %.2f)"

       x.Count x.Centroid x.Error

 

  static member CreateCluster dimensions (item:’a) =

    let items = [item]

    let empty =

      { Values = [||];

        dimensions = dimensions}

 

    let centroid = calcCentroid items empty

    { Items = items;

      Count = 1;

      Centroid = centroid;

      Error = calcError centroid items}

 

  member x.EvolveCluster (items:’a list) =

    let l = items.Length

    let centroid = calcCentroid items x.Centroid

    {x with

      Items = items;

      Count = l;

      Centroid = centroid;

      Error = calcError centroid items} 

Now we need a function to initialize the clusters and a function to assign a items to the nearest cluster:

open System

 

let rand = new Random() 

 

let InitClusters (k:int) dimensions

      (items:’a array when ‘a :> IClusterable)  =

  let length = items.Length

  let getInitItem() = items.[rand.Next length]

  Array.init k (fun _ ->

                  getInitItem()

                    |> Cluster<‘a>.CreateCluster dimensions) 

 

let AssignItemsToClusters k dimensions (clusters:Cluster<‘a> array)

      (items:’a seq when ‘a :> IClusterable) =

  if k <= 0 then failwith "KMeans needs k > 0"  

  let findNearestCluster item =

    let minDist,nearest,lastPos =

      clusters

        |> Array.fold

            (fun (minDist,nearest,pos) cluster ->

               let distance = calcDist item (cluster.Centroid)

               if distance < minDist then

                 (distance,pos,pos+1)

               else

                 (minDist,nearest,pos+1))

            (Double.PositiveInfinity,0,0) 

    nearest

 

  let assigned =

    items

      |> Seq.map (fun item -> item,findNearestCluster item)

 

  let newClusters = Array.create k []  

  for item,nearest in assigned do        

    newClusters.[nearest] <- item::(newClusters.[nearest])

 

  clusters

    |> Array.mapi (fun i c -> c.EvolveCluster newClusters.[i])   

The last step is a function which calculates the squared error over all clusters:

let calcClusterError (clusters:Cluster<‘a> array) =

  clusters

    |> Array.sumBy (fun cluster -> cluster.Error)

Now it is an easy task to write the kMeans algorithm:

let kMeansClustering K dimensions epsilon

       (items:’a array when ‘a :> IClusterable) =

  let k = if K <= 0 then 1 else K

  let rec clustering lastError (clusters:Cluster<‘a> array) =

    let newClusters =

      AssignItemsToClusters k dimensions clusters items

    let newError = calcClusterError newClusters

 

    if abs(lastError – newError) > epsilon then       

      clustering newError newClusters

    else

      newClusters,newError   

 

  InitClusters k dimensions items

    |> clustering Double.PositiveInfinity 

We can test this algorithm with random 2-dimensional points:

type Point =

  {X:float; Y: float} 

  member x.Print = printfn "(%.2f,%.2f)" x.X x.Y

  interface IClusterable with

    member x.DimValues = [| x.X; x.Y |]

    member x.Dimensions = 2  

 

let point x y = {X = x; Y = y}

let getRandomCoordinate() = rand.NextDouble() – 0.5 |> (*) 10. 

let randPoint _ = point (getRandomCoordinate()) (getRandomCoordinate())    

let points n = Array.init n randPoint

let items = points 1000

 

printfn "Items:"

items |> Seq.iter (fun i -> i.Print)

 

let clusters,error = kMeansClustering 3 2 0.0001 items

printfn "\n"

clusters |> Seq.iter (fun c -> c.Print)   

printfn "Error: %A" error   

Next time I will show how we can simplify the code by using F#’s built-in type vector instead of array.

Tags: , , , ,