Rash thoughts about .NET, C#, F# and Dynamics NAV.

"Every solution will only lead to new problems."

Saturday, 31. January 2009

n-dimensional k-means clustering with F# – part II

Filed under: BioInformatik,English posts,F#,Informatik — Steffen Forkmann at 11:28 Uhr

In the last post I show how we can calculate clusters of n-dimensional Objects with the K-means-Algorithm in F#. This time I will simplify the code by using the built-in type vector.

First of all we redefine the interface for “clusterable” objects:

type IClusterable =
  abstract DimValues: vector with get

This time we do not need any dimension information – the vector type will care about this. Now we can reduce our distance function to:

let calcDist (p1:vector) (p2:vector) = (p1 - p2).Norm

The function Norm calculates the 2-Norm of a vector.

2-Norm ==> Eulidean distance 

So calcDist will still compute the n-dimensional Euclidean distance.

The next step is the definition of a n-dimensional Centroid type. This time we will only use a type alias of vector:

type Centroid = vector

The calculation of the centroid and the centroid error can be written as:

let calcCentroid (items:'a list when 'a :> IClusterable)
     (oldCentroid:Centroid) =
  let count = items.Length
  if count = 0 then oldCentroid else  
  let sum =
      |> List.fold_left 
        (fun vec item -> vec + item.DimValues) // vector addition
        (nullVector oldCentroid)
  sum * (1./(count |> float))  // multiply with scalar

let calcError centroid (items:'a list when 'a :> IClusterable) =
  let sq x = x * x

    |> List.sum_by (fun i -> calcDist centroid i.DimValues |> sq)

The calcError function is straight forward but not perfect in terms of performance. As we use the calcDist function and hence the 2-Norm the function will first calculate a square root and then squares the result.

The rest of the code is pretty much same as in the last post:

/// creates n-dimensional 0 - vector
let nullVector (v:vector) = v * 0.
type Cluster<'a> when 'a :> IClusterable =
  {Items: 'a list;
   Count: int;
   Centroid: Centroid;
   Error: float}

  member x.Print =
     printfn "(Items: %d, Centroid: %A, Error: %.2f)"
       x.Count x.Centroid x.Error

  static member CreateCluster (item:'a) =
    let items = [item]
    let empty = nullVector item.DimValues

    let centroid = calcCentroid items empty
    {new Cluster<'a> with
      Items = items and
      Count = 1 and
      Centroid = centroid and
      Error = calcError centroid items}

  member x.EvolveCluster (items:'a list) =
    let l = items.Length
    let centroid = calcCentroid items x.Centroid
    {x with
      Items = items;
      Count = l;
      Centroid = centroid;
      Error = calcError centroid items} 

let rand = new Random()  

let InitClusters (k:int)
      (items:'a array when 'a :> IClusterable)  =
  let length = items.Length
  let getInitItem() = items.[rand.Next length]
  Array.init k (fun x ->
                    |> Cluster<'a>.CreateCluster)
let AssignItemsToClusters k (clusters:Cluster<'a> array)
      (items:'a seq when 'a :> IClusterable) =
  if k <= 0 then failwith "KMeans needs k > 0"
  let findNearestCluster (item:'a) =
    let minDist,nearest,lastPos =
        |> Array.fold_left
            (fun (minDist,nearest,pos) cluster ->
               let distance = calcDist item.DimValues (cluster.Centroid)
               if distance < minDist then

  let assigned =
      |> Seq.map (fun item -> item,findNearestCluster item)

  let newClusters = Array.create k []
  for item,nearest in assigned do
    newClusters.[nearest] <- item::(newClusters.[nearest])

    |> Array.mapi (fun i c -> c.EvolveCluster newClusters.[i])  

let calcClusterError (clusters:Cluster<'a> array) =
    |> Array.fold_left (fun acc cluster -> acc + cluster.Error) 0.  
let kMeansClustering K epsilon
       (items:'a array when 'a :> IClusterable) =
  let k = if K <= 0 then 1 else K
  let rec clustering lastError (clusters:Cluster<'a> array) =
    let newClusters =
      AssignItemsToClusters k clusters items
    let newError = calcClusterError newClusters

    if abs(lastError - newError) > epsilon then
      clustering newError newClusters

  InitClusters k items
    |> clustering Double.PositiveInfinity

This vector approach shortens the code for the kMeans algorithm, but has a little longer runtime. I am interested in any suggestions.

Tags: , , , ,

No Comments »

No comments yet.

RSS feed for comments on this post. | TrackBack URI

Leave a comment

XHTML ( You can use these tags): <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <s> <strike> <strong> .