Clustering in Machine Learning: k-means Algorithm

Prototyp-based clustering pretends various clusters could be discribed by a list of prototypes.

Prototype is a data instance that is representative of all the data.

This kind of algorithms usually initialize prototypes and update them iteratively. On account of not the same methods to present prototypes and ways to calculate expressions, there exists different algorithms, including k-means, Learning Vector Quantization, Mixture of Gaussian.

The C++ implementation code is as following,

#include <bits/stdc++.h>

using namespace std;

#define MAX_OPERATION 10000

// Author: eclipse

class KMeans{
private:
    vector<vector<double>> data;   // Loaded data from text file.
    vector<vector<int>> clusters;  // Vector stores each cluster which contains positions of each instance.
    vector<vector<double>> means;  // Centroids variable stores mean var among each cluster.
    vector<int> labels;            // Labels that contain the cluster number each instance belongs to.
    
private:
    // Find the minimal distance between x and all centroids.
    int findMinDistance(vector<double> & x) {
        vector<double> distances;
        for (const auto & mean : means)
            distances.push_back(pow(abs(x[0] - mean[0]), (double )2) + pow(abs(x[1] - mean[1]), 2));
        double minDis = distances[0];
        int ret = 0;
        for (int i = 0; i < distances.size(); i++)
            if (distances[i] < minDis) { minDis = distances[i]; ret = i; }
        return ret;
    }

    // Calculate new centroids.
    vector<double> calcNewMean(int i) {
        double sumX = 0, sumY = 0;
        for (const auto & instanceId : clusters[i]) {
            sumX += data[instanceId][0];
            sumY += data[instanceId][1];
        }
        return vector<double>({ sumX / (double )clusters[i].size(), sumY / (double )clusters[i].size() });
    }

    // Deepcopy clusters.
    void deepcopy(vector<vector<int>> & newClusters) {
        for (const auto & cluster : clusters) {
            vector<int> tmp;
            for (const auto & j : cluster)
                tmp.push_back(j);
            newClusters.push_back(tmp);
        }
    }

    // Compare clusters in this instance with newClusters.
    // If they are the same, return true; otherwise, return false.
    bool cmpCluster(vector<vector<int>> & newClusters) {
        for (int i = 0; i < clusters.size(); i++) {
            if (clusters[i].size() != newClusters[i].size()) return false;
            for (int j = 0; j < clusters[i].size(); j++)
                if (clusters[i][j] != newClusters[i][j]) return false;
        }
        return true;
    }

public:
    // Load data from input.txt into vector.
    void Load(const string & fileName) {
        ifstream ifs(fileName);
        string buf;
        while (getline(ifs, buf)) {
            stringstream ss(buf);
            string subStr;
            vector<double> instance;
            while (getline(ss, subStr, '\t'))
                if (!subStr.empty()) instance.push_back(stof(subStr));
            data.push_back(instance);
        }
        ifs.close();
    }

    // Parameter k means the number of clusters intended to get.
    // Run k-means algorithm to get the labels vector stored in instance of this class.
    void Fit(int k) {
        if (data.empty()) {
            cout << "Properly load data first!!!" << endl;
            return;
        }
        for (int i = 0; i < k; i++) {
            clusters.emplace_back();  // Initialize clusters.
            means.push_back(data[(i)]); // Initialize means.
        }
        labels = vector<int>(data.size());  // Initialize labels.

        for (int ii = 0; ii < MAX_OPERATION; ii++) {
            vector<vector<int>> newClusters;
            deepcopy(newClusters);
            for (auto &cluster: clusters)  // Clear clusters.
                cluster.clear();
            for (int i = 0; i < data.size(); i++) {
                int label = findMinDistance(data[i]);
                clusters[label].push_back(i);
                labels[i] = label;
            }
            for (int i = 0; i < k; i++) {
                vector<double> newMean = calcNewMean(i);
                if (newMean[0] != means[i][0] || newMean[1] != means[i][1]) means[i] = newMean;
            }
            if (cmpCluster(newClusters)) break;
        }
    }

    void PrintLabels() {
        cout << "Labels:" << endl <<  "[";
        for (const auto & label : labels)
            cout << label << ",";
        cout << "]" << endl;
    }

    void PrintCentroids() {
        cout << "Centroids:" << endl << "[" << endl;
        for (const auto & mean : means)
            cout << "[" << mean[0] << ", " << mean[1] << "],"<< endl;
        cout << "]" << endl;
    }
};

int main() {
    KMeans kMeans = KMeans();
    kMeans.Load("../Clustering/k-means/input2.txt");
    kMeans.Fit(4);
    kMeans.PrintLabels();
    kMeans.PrintCentroids();
    return 0;
}

Creative Commons License
This work is licensed under a Creative Commons Attribution-ShareAlike 2.0 Generic License.