diff --git a/README.md b/README.md
index 4d88a064..11be805a 100644
--- a/README.md
+++ b/README.md
@@ -147,6 +147,7 @@ a set of rules that precisely define a sequence of operations.
 * **Machine Learning**
   * `B` [NanoNeuron](https://github.com/trekhleb/nano-neuron) - 7 simple JS functions that illustrate how machines can actually learn (forward/backward propagation)
   * `B` [k-NN](src/algorithms/ml/knn) - k-nearest neighbors classification algorithm
+  * `B` [k-Means](src/algorithms/ml/kmeans) - k-Means clustering algorithm
 * **Uncategorized**
   * `B` [Tower of Hanoi](src/algorithms/uncategorized/hanoi-tower)
   * `B` [Square Matrix Rotation](src/algorithms/uncategorized/square-matrix-rotation) - in-place algorithm
diff --git a/src/algorithms/ml/kmeans/README.md b/src/algorithms/ml/kmeans/README.md
new file mode 100644
index 00000000..1c0d53ee
--- /dev/null
+++ b/src/algorithms/ml/kmeans/README.md
@@ -0,0 +1,45 @@
+# k-Means Algorithm
+
+The **k-Means algorithm** is an unsupervised Machine Learning algorithm. It is a clustering algorithm that groups the sample data based on the similarity between data vectors.
+
+In k-Means clustering, the output is a set of cluster labels assigned to each vector. The cluster locations are continuously optimized so that each cluster center accurately represents its group of vectors.
+
+The idea is to calculate the similarity between each cluster location and each data vector, and to reassign vectors to clusters based on it. [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance) is most commonly used for this task.
+
+![Euclidean distance between two points](https://upload.wikimedia.org/wikipedia/commons/5/55/Euclidean_distance_2d.svg)
+
+_Image source: [Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)_
+
+The algorithm is as follows:
+
+1. Check for errors like invalid/inconsistent data
+2. Initialize the k cluster locations with the initial/random k points
+3. Calculate the distance of each data point from each cluster center
+4. Assign to each data point the label of the cluster at the minimum distance from it
+5. Recalculate the centroid of each cluster based on the data points it contains
+6. Repeat steps 3-5 until the centroid locations stop changing
+
+Here is a visualization of k-Means clustering for better understanding:
+
+![k-Means Visualization](https://upload.wikimedia.org/wikipedia/commons/e/ea/K-means_convergence.gif)
+
+_Image source: [Wikipedia](https://en.wikipedia.org/wiki/K-means_clustering)_
+
+The centroids keep moving in order to create a better distinction between the different sets of data points. As we can see, after a few iterations the difference between the centroid positions of consecutive iterations becomes quite small. For example, between iterations `13` and `14` the difference is tiny, because there the optimizer is only tuning boundary cases.
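+
+Here is a short usage sketch of the implementation from this repository; the sample data and the expected output mirror the accompanying tests, and the import path is illustrative:
+
+```js
+import kMeans from 'src/algorithms/ml/kmeans/kmeans';
+
+// Each row of the data set is one vector; k is the number of clusters to form.
+const dataSet = [[1, 1], [6, 2], [3, 3], [4, 5], [9, 2], [2, 4], [8, 7]];
+const k = 2;
+
+// The cluster index assigned to each data point, i.e. [0, 1, 0, 1, 1, 0, 1].
+const clusterLabels = kMeans(dataSet, k);
+```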
+
+## References
+
+- [k-Means algorithm on Wikipedia](https://en.wikipedia.org/wiki/K-means_clustering)
diff --git a/src/algorithms/ml/kmeans/__test__/kmeans.test.js b/src/algorithms/ml/kmeans/__test__/kmeans.test.js
new file mode 100644
index 00000000..ed90e9f7
--- /dev/null
+++ b/src/algorithms/ml/kmeans/__test__/kmeans.test.js
@@ -0,0 +1,36 @@
+import kMeans from '../kmeans';
+
+describe('kMeans', () => {
+  it('should throw an error on invalid data', () => {
+    expect(() => {
+      kMeans();
+    }).toThrowError('The dataSet was not set');
+  });
+
+  it('should throw an error on inconsistent data', () => {
+    expect(() => {
+      kMeans([[1, 2], [1]], 2);
+    }).toThrowError('Inconsistent vector lengths');
+  });
+
+  it('should cluster the data points', () => {
+    const dataSet = [[1, 1], [6, 2], [3, 3], [4, 5], [9, 2], [2, 4], [8, 7]];
+    const k = 2;
+    const expectedCluster = [0, 1, 0, 1, 1, 0, 1];
+    expect(kMeans(dataSet, k)).toEqual(expectedCluster);
+  });
+
+  it('should find the clusters with equal distances', () => {
+    const dataSet = [[0, 0], [1, 1], [2, 2]];
+    const k = 3;
+    const expectedCluster = [0, 1, 2];
+    expect(kMeans(dataSet, k)).toEqual(expectedCluster);
+  });
+
+  it('should cluster the data points in 3D space', () => {
+    const dataSet = [[0, 0, 0], [0, 1, 0], [2, 0, 2]];
+    const k = 2;
+    const expectedCluster = [1, 1, 0];
+    expect(kMeans(dataSet, k)).toEqual(expectedCluster);
+  });
+});
diff --git a/src/algorithms/ml/kmeans/kmeans.js b/src/algorithms/ml/kmeans/kmeans.js
new file mode 100644
index 00000000..099ddab5
--- /dev/null
+++ b/src/algorithms/ml/kmeans/kmeans.js
@@ -0,0 +1,101 @@
+/**
+ * Calculates the Euclidean distance between 2 vectors.
+ *
+ * @param {number[]} x1
+ * @param {number[]} x2
+ * @returns {number}
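+ * @example
+ * // Illustrative values (a 3-4-5 right triangle):
+ * euclideanDistance([0, 0], [3, 4]); // -> 5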
+ */
+function euclideanDistance(x1, x2) {
+  // Both vectors must have the same number of dimensions.
+  if (x1.length !== x2.length) {
+    throw new Error('Inconsistent vector lengths');
+  }
+  // Sum the squared differences over all dimensions and take the square root.
+  let squaresTotal = 0;
+  for (let i = 0; i < x1.length; i += 1) {
+    squaresTotal += (x1[i] - x2[i]) ** 2;
+  }
+  return Number(Math.sqrt(squaresTotal).toFixed(2));
+}
+
+/**
+ * Clusters the data points using the k-Means algorithm.
+ *
+ * @param {number[][]} dataSetm - array of data points, i.e. [[0, 1], [3, 4], [5, 7]]
+ * @param {number} k - the number of clusters to form
+ * @returns {number[]} - the cluster number assigned to each data point
+ */
+export default function kMeans(
+  dataSetm,
+  k = 1,
+) {
+  if (!dataSetm || !dataSetm.length) {
+    throw new Error('The dataSet was not set');
+  }
+
+  // Work on a copy of each vector so that the caller's data is not mutated
+  // when distances and cluster labels are appended below.
+  const dataSet = dataSetm.map((vector) => Array.from(vector));
+
+  // Assign the k initial cluster centers to the locations of the first k points.
+  const clusterCenters = [];
+  const nDim = dataSet[0].length;
+  for (let i = 0; i < k; i += 1) {
+    clusterCenters[clusterCenters.length] = Array.from(dataSet[i]);
+  }
+
+  // Continue the optimization until convergence:
+  // the centroids stop moving once the cluster assignments stop changing.
+  let assignmentsChanged = true;
+  while (assignmentsChanged) {
+    assignmentsChanged = false;
+    // Calculate and store the distance of each dataSet point from each cluster center.
+    for (let i = 0; i < dataSet.length; i += 1) {
+      for (let n = 0; n < k; n += 1) {
+        dataSet[i][nDim + n] = euclideanDistance(clusterCenters[n], dataSet[i].slice(0, nDim));
+      }
+
+      // Assign to each dataSet point the number of the cluster at the minimum distance.
+      const distances = dataSet[i].slice(nDim, nDim + k);
+      const nearestCluster = distances.indexOf(Math.min(...distances));
+
+      if (dataSet[i].length !== nDim + k + 1) {
+        // First iteration: the cluster label slot does not exist yet.
+        assignmentsChanged = true;
+        dataSet[i][nDim + k] = nearestCluster;
+      } else if (dataSet[i][nDim + k] !== nearestCluster) {
+        assignmentsChanged = true;
+        dataSet[i][nDim + k] = nearestCluster;
+      }
+    }
+    // Recalculate each cluster centroid as the mean of all the points assigned to it.
+    for (let i = 0; i < k; i += 1) {
+      const centerSum = Array(nDim).fill(0);
+      let classCount = 0;
+      for (let j = 0; j < dataSet.length; j += 1) {
+        if (dataSet[j][dataSet[j].length - 1] === i) {
+          classCount += 1;
+          for (let n = 0; n < nDim; n += 1) {
+            centerSum[n] += dataSet[j][n];
+          }
+        }
+      }
+      // Guard against empty clusters: keep the previous center to avoid
+      // a division by zero that would produce NaN coordinates.
+      if (classCount > 0) {
+        for (let n = 0; n < nDim; n += 1) {
+          clusterCenters[i][n] = Number((centerSum[n] / classCount).toFixed(2));
+        }
+      }
+    }
+  }
+  // Return the cluster number assigned to each point.
+  const solution = [];
+  for (let i = 0; i < dataSet.length; i += 1) {
+    solution.push(dataSet[i][dataSet[i].length - 1]);
+  }
+  return solution;
+}