In the last post I showed how we can calculate clusters of n-dimensional objects with the k-means algorithm in F#. This time I will simplify the code by using the built-in vector type.
First of all we redefine the interface for “clusterable” objects:
type IClusterable =
abstract DimValues: vector with get
This time we do not need any dimension information; the vector type takes care of this. Now we can reduce our distance function to:
let calcDist (p1:vector) (p2:vector) = (p1 - p2).Norm
Norm calculates the 2-norm of a vector, so calcDist still computes the n-dimensional Euclidean distance.
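As a quick sanity check (just an illustration, assuming the vector construction function from the F# math library is in scope), the classic 3-4-5 triangle gives:
let a = vector [0.; 0.]
let b = vector [3.; 4.]
calcDist a b // evaluates to 5.0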
The next step is the definition of an n-dimensional Centroid type. This time we simply use a type alias for vector:
type Centroid = vector
The calculation of the centroid and the centroid error can be written as:
let calcCentroid (items:'a list when 'a :> IClusterable)
(oldCentroid:Centroid) =
let count = items.Length
if count = 0 then oldCentroid else
let sum =
items
|> List.fold
(fun vec item -> vec + item.DimValues) // vector addition
(nullVector oldCentroid)
sum * (1./(count |> float)) // multiply with scalar
let calcError centroid (items:'a list when 'a :> IClusterable) =
let sq x = x * x
items
|> List.sumBy (fun i -> calcDist centroid i.DimValues |> sq)
The calcError function is straightforward but not perfect in terms of performance: because it uses calcDist and hence the 2-norm, it first calculates a square root and then squares the result again.
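If this ever becomes a bottleneck, one possible tweak is to compute the squared distance directly and sum that in calcError instead (just a sketch: calcSquaredDist is an illustrative name and I assume the math library's Vector.dot function is available):
/// squared Euclidean distance without the intermediate square root
let calcSquaredDist (p1:vector) (p2:vector) =
    let d = p1 - p2
    Vector.dot d d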
The rest of the code is pretty much the same as in the last post:
/// creates an n-dimensional zero vector
let nullVector (v:vector) = v * 0.
type Cluster<'a> when 'a :> IClusterable =
{Items: 'a list;
Count: int;
Centroid: Centroid;
Error: float}
member x.Print =
printfn "(Items: %d, Centroid: %A, Error: %.2f)"
x.Count x.Centroid x.Error
static member CreateCluster (item:'a) =
let items = [item]
let empty = nullVector item.DimValues
let centroid = calcCentroid items empty
{ Items = items;
Count = 1;
Centroid = centroid;
Error = calcError centroid items}
member x.EvolveCluster (items:'a list) =
let l = items.Length
let centroid = calcCentroid items x.Centroid
{x with
Items = items;
Count = l;
Centroid = centroid;
Error = calcError centroid items}
open System
let rand = new Random()
let InitClusters (k:int)
(items:'a array when 'a :> IClusterable) =
let length = items.Length
let getInitItem() = items.[rand.Next length]
Array.init k (fun x ->
getInitItem()
|> Cluster<'a>.CreateCluster)
let AssignItemsToClusters k (clusters:Cluster<'a> array)
(items:'a seq when 'a :> IClusterable) =
if k <= 0 then failwith "KMeans needs k > 0"
let findNearestCluster (item:'a) =
let minDist,nearest,lastPos =
clusters
|> Array.fold
(fun (minDist,nearest,pos) cluster ->
let distance = calcDist item.DimValues (cluster.Centroid)
if distance < minDist then
(distance,pos,pos+1)
else
(minDist,nearest,pos+1))
(Double.PositiveInfinity,0,0)
nearest
let assigned =
items
|> Seq.map (fun item -> item,findNearestCluster item)
let newClusters = Array.create k []
for item,nearest in assigned do
newClusters.[nearest] <- item::(newClusters.[nearest])
clusters
|> Array.mapi (fun i c -> c.EvolveCluster newClusters.[i])
let calcClusterError (clusters:Cluster<'a> array) =
clusters
|> Array.fold (fun acc cluster -> acc + cluster.Error) 0.
let kMeansClustering K epsilon
(items:'a array when 'a :> IClusterable) =
let k = if K <= 0 then 1 else K
let rec clustering lastError (clusters:Cluster<'a> array) =
let newClusters =
AssignItemsToClusters k clusters items
let newError = calcClusterError newClusters
if abs(lastError - newError) > epsilon then
clustering newError newClusters
else
newClusters,newError
InitClusters k items
|> clustering Double.PositiveInfinity
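To test this vector-based version, the 2-dimensional Point type from the last post only needs a small change in the interface implementation. A minimal sketch (Point and randPoint are just illustrations, mirroring the test code from the last post):
type Point =
    {X:float; Y:float}
    interface IClusterable with
        member p.DimValues = vector [p.X; p.Y]

let randPoint _ =
    {X = rand.NextDouble() * 10. - 5.; Y = rand.NextDouble() * 10. - 5.}

let items = Array.init 1000 randPoint
let clusters,error = kMeansClustering 3 0.0001 items
clusters |> Seq.iter (fun c -> c.Print)
printfn "Error: %A" error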
This vector approach shortens the code for the k-means algorithm, but the runtime is a little longer. I am interested in any suggestions.
Tags: Centroid, Clustering, k-means, KMeans, vector
The k-means algorithm is one of the simplest unsupervised learning algorithms for solving the well-known clustering problem. In this article I will show how we can implement it in F#.
First of all we define an interface for “clusterable” objects:
type IClusterable =
abstract DimValues: float array with get
abstract Dimensions: int
The next step is to define a distance function. We will use the n-dimensional Euclidean distance, d(p, q) = sqrt((p1 - q1)² + … + (pn - qn)²):
let calcDist (p1:IClusterable) (p2:IClusterable) =
let sq x = x * x
if p1.Dimensions <> p2.Dimensions then
failwith "Cluster dimensions aren’t equal."
p2.DimValues
|> Array.fold2
(fun acc x y -> x - y |> sq |> (+) acc)
0. p1.DimValues
|> sqrt
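To try it out in F# Interactive, a small helper can wrap a plain float array in the interface (toClusterable is not part of the algorithm, it is just for illustration):
let toClusterable (values:float array) =
    { new IClusterable with
        member x.DimValues = values
        member x.Dimensions = values.Length }

calcDist (toClusterable [|0.; 0.|]) (toClusterable [|3.; 4.|]) // 5.0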
Now we define an n-dimensional Centroid type. Our k-means algorithm will minimize the squared distances to k centroids (or “means”).
type Centroid =
{Values: float array;
dimensions: int}
member x.Print = printfn "%A" x.Values
interface IClusterable with
member x.DimValues = x.Values
member x.Dimensions = x.dimensions
The centroid of a finite set of n-dimensional points x1, x2, …, xn is their arithmetic mean, C = (x1 + x2 + … + xn) / n:
let calcCentroid (items:'a list when 'a :> IClusterable)
(oldCentroid:Centroid) =
let count = items.Length
if count = 0 then oldCentroid else
let mValues =
[|for d in 0..oldCentroid.dimensions-1 do
let sum =
items
|> List.sumBy (fun item -> item.DimValues.[d])
yield sum / (count |> float)|]
{ Values = mValues;
dimensions = oldCentroid.dimensions}
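For example, the centroid of the points (0,0), (2,0) and (4,6) is their mean (2,2). With the toClusterable helper from above (again, just for illustration):
let sample =
    [ toClusterable [|0.; 0.|]
      toClusterable [|2.; 0.|]
      toClusterable [|4.; 6.|] ]
// dummy previous centroid; only its dimensions field is used here
let start = {Values = [|0.; 0.|]; dimensions = 2}
(calcCentroid sample start).Print // prints [|2.0; 2.0|]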
We made a small modification: if no items are assigned to a cluster, its centroid won't change. This is important for the robustness of the algorithm.
Now we can calculate the squared errors to a centroid:
let calcError centroid (items:'a list when 'a :> IClusterable) =
let calcDiffToCentroid (item:'a) =
centroid.Values
|> Array.fold2
(fun acc c i -> acc + (c-i)*(c-i))
0. item.DimValues
items
|> List.sumBy calcDiffToCentroid
For storing cluster information we create a cluster type:
type Cluster<'a> when 'a :> IClusterable =
{Items: 'a list;
Count: int;
Centroid: Centroid;
Error: float}
member x.Print =
printfn "(Items: %d, Centroid: %A, Error: %.2f)"
x.Count x.Centroid x.Error
static member CreateCluster dimensions (item:'a) =
let items = [item]
let empty =
{ Values = [||];
dimensions = dimensions}
let centroid = calcCentroid items empty
{ Items = items;
Count = 1;
Centroid = centroid;
Error = calcError centroid items}
member x.EvolveCluster (items:'a list) =
let l = items.Length
let centroid = calcCentroid items x.Centroid
{x with
Items = items;
Count = l;
Centroid = centroid;
Error = calcError centroid items}
Now we need a function to initialize the clusters and a function to assign items to their nearest cluster:
open System
let rand = new Random()
let InitClusters (k:int) dimensions
(items:'a array when 'a :> IClusterable) =
let length = items.Length
let getInitItem() = items.[rand.Next length]
Array.init k (fun _ ->
getInitItem()
|> Cluster<'a>.CreateCluster dimensions)
let AssignItemsToClusters k dimensions (clusters:Cluster<'a> array)
(items:'a seq when 'a :> IClusterable) =
if k <= 0 then failwith "KMeans needs k > 0"
let findNearestCluster item =
let minDist,nearest,lastPos =
clusters
|> Array.fold
(fun (minDist,nearest,pos) cluster ->
let distance = calcDist item (cluster.Centroid)
if distance < minDist then
(distance,pos,pos+1)
else
(minDist,nearest,pos+1))
(Double.PositiveInfinity,0,0)
nearest
let assigned =
items
|> Seq.map (fun item -> item,findNearestCluster item)
let newClusters = Array.create k []
for item,nearest in assigned do
newClusters.[nearest] <- item::(newClusters.[nearest])
clusters
|> Array.mapi (fun i c -> c.EvolveCluster newClusters.[i])
The last step is a function which calculates the squared error over all clusters:
let calcClusterError (clusters:Cluster<'a> array) =
clusters
|> Array.sumBy (fun cluster -> cluster.Error)
Now it is an easy task to write the k-means algorithm:
let kMeansClustering K dimensions epsilon
(items:'a array when 'a :> IClusterable) =
let k = if K <= 0 then 1 else K
let rec clustering lastError (clusters:Cluster<'a> array) =
let newClusters =
AssignItemsToClusters k dimensions clusters items
let newError = calcClusterError newClusters
if abs(lastError - newError) > epsilon then
clustering newError newClusters
else
newClusters,newError
InitClusters k dimensions items
|> clustering Double.PositiveInfinity
We can test this algorithm with random 2-dimensional points:
type Point =
{X:float; Y: float}
member x.Print = printfn "(%.2f,%.2f)" x.X x.Y
interface IClusterable with
member x.DimValues = [| x.X; x.Y |]
member x.Dimensions = 2
let point x y = {X = x; Y = y}
let getRandomCoordinate() = rand.NextDouble() - 0.5 |> (*) 10.
let randPoint _ = point (getRandomCoordinate()) (getRandomCoordinate())
let points n = Array.init n randPoint
let items = points 1000
printfn "Items:"
items |> Seq.iter (fun i -> i.Print)
let clusters,error = kMeansClustering 3 2 0.0001 items
printfn "\n"
clusters |> Seq.iter (fun c -> c.Print)
printfn "Error: %A" error
Next time I will show how we can simplify the code by using F#’s built-in type vector instead of array.
Tags: Centroid, Clustering, F#, k-means, KMeans