psf/black code formatting (#1277)

2025-07-05 01:09:40 +08:00 · 2019-10-05 01:14:13 -04:00
parent 07f04a2e55
commit 9eac17a408
291 changed files with 6014 additions and 4571 deletions
--- a/machine_learning/k_means_clust.py
+++ b/machine_learning/k_means_clust.py
@@ -1,4 +1,4 @@
-'''README, Author - Anurag Kumar(mailto:anuragkumarak95@gmail.com)
+"""README, Author - Anurag Kumar(mailto:anuragkumarak95@gmail.com)

 Requirements:
  - sklearn
@@ -45,17 +45,18 @@ Usage:

  5. Have fun..

-'''
+"""
 from sklearn.metrics import pairwise_distances
 import numpy as np

-TAG = 'K-MEANS-CLUST/ '
+TAG = "K-MEANS-CLUST/ "
+

 def get_initial_centroids(data, k, seed=None):
-    '''Randomly choose k data points as initial centroids'''
-    if seed is not None: # useful for obtaining consistent results
+    """Randomly choose k data points as initial centroids"""
+    if seed is not None:  # useful for obtaining consistent results
        np.random.seed(seed)
-    n = data.shape[0] # number of data points
+    n = data.shape[0]  # number of data points

    # Pick K indices from range [0, N).
    rand_indices = np.random.randint(0, n, k)
@@ -63,30 +64,33 @@ def get_initial_centroids(data, k, seed=None):
    # Keep centroids as dense format, as many entries will be nonzero due to averaging.
    # As long as at least one document in a cluster contains a word,
    # it will carry a nonzero weight in the TF-IDF vector of the centroid.
-    centroids = data[rand_indices,:]
+    centroids = data[rand_indices, :]

    return centroids

-def centroid_pairwise_dist(X,centroids):
-    return pairwise_distances(X,centroids,metric='euclidean')
+
+def centroid_pairwise_dist(X, centroids):
+    return pairwise_distances(X, centroids, metric="euclidean")
+

 def assign_clusters(data, centroids):

    # Compute distances between each data point and the set of centroids:
    # Fill in the blank (RHS only)
-    distances_from_centroids = centroid_pairwise_dist(data,centroids)
+    distances_from_centroids = centroid_pairwise_dist(data, centroids)

    # Compute cluster assignments for each data point:
    # Fill in the blank (RHS only)
-    cluster_assignment = np.argmin(distances_from_centroids,axis=1)
+    cluster_assignment = np.argmin(distances_from_centroids, axis=1)

    return cluster_assignment

+
 def revise_centroids(data, k, cluster_assignment):
    new_centroids = []
    for i in range(k):
        # Select all data points that belong to cluster i. Fill in the blank (RHS only)
-        member_data_points = data[cluster_assignment==i]
+        member_data_points = data[cluster_assignment == i]
        # Compute the mean of the data points. Fill in the blank (RHS only)
        centroid = member_data_points.mean(axis=0)
        new_centroids.append(centroid)
@@ -94,79 +98,102 @@ def revise_centroids(data, k, cluster_assignment):

    return new_centroids

+
 def compute_heterogeneity(data, k, centroids, cluster_assignment):

    heterogeneity = 0.0
    for i in range(k):

        # Select all data points that belong to cluster i. Fill in the blank (RHS only)
-        member_data_points = data[cluster_assignment==i, :]
+        member_data_points = data[cluster_assignment == i, :]

-        if member_data_points.shape[0] > 0: # check if i-th cluster is non-empty
+        if member_data_points.shape[0] > 0:  # check if i-th cluster is non-empty
            # Compute distances from centroid to data points (RHS only)
-            distances = pairwise_distances(member_data_points, [centroids[i]], metric='euclidean')
-            squared_distances = distances**2
+            distances = pairwise_distances(
+                member_data_points, [centroids[i]], metric="euclidean"
+            )
+            squared_distances = distances ** 2
            heterogeneity += np.sum(squared_distances)

    return heterogeneity

+
 from matplotlib import pyplot as plt
+
+
 def plot_heterogeneity(heterogeneity, k):
-    plt.figure(figsize=(7,4))
+    plt.figure(figsize=(7, 4))
    plt.plot(heterogeneity, linewidth=4)
-    plt.xlabel('# Iterations')
-    plt.ylabel('Heterogeneity')
-    plt.title('Heterogeneity of clustering over time, K={0:d}'.format(k))
-    plt.rcParams.update({'font.size': 16})
+    plt.xlabel("# Iterations")
+    plt.ylabel("Heterogeneity")
+    plt.title("Heterogeneity of clustering over time, K={0:d}".format(k))
+    plt.rcParams.update({"font.size": 16})
    plt.show()

-def kmeans(data, k, initial_centroids, maxiter=500, record_heterogeneity=None, verbose=False):
-    '''This function runs k-means on given data and initial set of centroids.
+
+def kmeans(
+    data, k, initial_centroids, maxiter=500, record_heterogeneity=None, verbose=False
+):
+    """This function runs k-means on given data and initial set of centroids.
       maxiter: maximum number of iterations to run.(default=500)
       record_heterogeneity: (optional) a list, to store the history of heterogeneity as function of iterations
                             if None, do not store the history.
-       verbose: if True, print how many data points changed their cluster labels in each iteration'''
+       verbose: if True, print how many data points changed their cluster labels in each iteration"""
    centroids = initial_centroids[:]
    prev_cluster_assignment = None

    for itr in range(maxiter):
        if verbose:
-            print(itr, end='')
+            print(itr, end="")

        # 1. Make cluster assignments using nearest centroids
-        cluster_assignment = assign_clusters(data,centroids)
+        cluster_assignment = assign_clusters(data, centroids)

        # 2. Compute a new centroid for each of the k clusters, averaging all data points assigned to that cluster.
-        centroids = revise_centroids(data,k, cluster_assignment)
+        centroids = revise_centroids(data, k, cluster_assignment)

        # Check for convergence: if none of the assignments changed, stop
-        if prev_cluster_assignment is not None and \
-          (prev_cluster_assignment==cluster_assignment).all():
+        if (
+            prev_cluster_assignment is not None
+            and (prev_cluster_assignment == cluster_assignment).all()
+        ):
            break

        # Print number of new assignments
        if prev_cluster_assignment is not None:
-            num_changed = np.sum(prev_cluster_assignment!=cluster_assignment)
+            num_changed = np.sum(prev_cluster_assignment != cluster_assignment)
            if verbose:
-                print('    {0:5d} elements changed their cluster assignment.'.format(num_changed))
+                print(
+                    "    {0:5d} elements changed their cluster assignment.".format(
+                        num_changed
+                    )
+                )

        # Record heterogeneity convergence metric
        if record_heterogeneity is not None:
            # YOUR CODE HERE
-            score = compute_heterogeneity(data,k,centroids,cluster_assignment)
+            score = compute_heterogeneity(data, k, centroids, cluster_assignment)
            record_heterogeneity.append(score)

        prev_cluster_assignment = cluster_assignment[:]

    return centroids, cluster_assignment

+
 # Mock test below
-if False: # change to true to run this test case.
+if False:  # change to true to run this test case.
    import sklearn.datasets as ds
+
    dataset = ds.load_iris()
    k = 3
    heterogeneity = []
-    initial_centroids = get_initial_centroids(dataset['data'], k, seed=0)
-    centroids, cluster_assignment = kmeans(dataset['data'], k, initial_centroids, maxiter=400,
-                                        record_heterogeneity=heterogeneity, verbose=True)
+    initial_centroids = get_initial_centroids(dataset["data"], k, seed=0)
+    centroids, cluster_assignment = kmeans(
+        dataset["data"],
+        k,
+        initial_centroids,
+        maxiter=400,
+        record_heterogeneity=heterogeneity,
+        verbose=True,
+    )
    plot_heterogeneity(heterogeneity, k)