Simplify code by dropping support for legacy Python (#1143)

* Simplify code by dropping support for legacy Python
* sort() --> sorted()
@@ -1,10 +1,8 @@
 """
 Implementation of a basic regression decision tree.
 Input data set: The input data set must be 1-dimensional with continuous labels.
 Output: The decision tree maps a real number input to a real number output.
 """
-from __future__ import print_function
-
 import numpy as np

 class Decision_Tree:
@@ -19,7 +17,7 @@ class Decision_Tree:
     def mean_squared_error(self, labels, prediction):
         """
         mean_squared_error:
         @param labels: a one dimensional numpy array
         @param prediction: a floating point value
         return value: mean_squared_error calculates the error if prediction is used to estimate the labels
         """
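The error metric named in this docstring is a one-liner in numpy; a minimal sketch of the idea (the method body itself is elided from this hunk):

    import numpy as np

    def mean_squared_error(labels, prediction):
        # Mean of squared deviations between each label and the scalar prediction
        return np.mean((labels - prediction) ** 2)

    print(mean_squared_error(np.array([1.0, 2.0, 3.0]), 2.0))  # 0.666...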
@@ -32,7 +30,7 @@ class Decision_Tree:
         """
         train:
         @param X: a one dimensional numpy array
         @param y: a one dimensional numpy array.
         The contents of y are the labels for the corresponding X values

         train does not have a return value
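Taken together, the two docstrings suggest a usage pattern along these lines; a hedged sketch, assuming constructor arguments and a predict method that this excerpt does not show:

    import numpy as np

    tree = Decision_Tree(depth=5, min_leaf_size=5)  # constructor signature assumed, not shown here
    X = np.arange(-1.0, 1.0, 0.005)
    y = np.sin(X)  # 1-dimensional continuous labels, as the module docstring requires
    tree.train(X, y)
    test_cases = (np.random.rand(10) * 2) - 1
    predictions = np.array([tree.predict(x) for x in test_cases])  # predict() assumed
    avg_error = np.mean((predictions - np.sin(test_cases)) ** 2)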
@@ -135,6 +133,6 @@ def main():
     print("Predictions: " + str(predictions))
     print("Average error: " + str(avg_error))


 if __name__ == '__main__':
     main()
@@ -1,7 +1,6 @@
 """
 Implementation of gradient descent algorithm for minimizing cost of a linear hypothesis function.
 """
-from __future__ import print_function, division
 import numpy

 # List of input, output pairs
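For orientation, here is one way such a descent loop can look for a linear hypothesis h(x) = theta0 + theta1 * x with a mean-squared-error cost; the data and names are illustrative, not the file's own:

    import numpy

    # Hypothetical (input, output) pairs for a linear hypothesis
    data = numpy.array([(1.0, 2.1), (2.0, 4.2), (3.0, 5.9)])
    theta = numpy.zeros(2)
    learning_rate = 0.01

    for _ in range(1000):
        x, y = data[:, 0], data[:, 1]
        predictions = theta[0] + theta[1] * x
        error = predictions - y
        # Gradient of the mean-squared-error cost with respect to each parameter
        theta[0] -= learning_rate * error.mean()
        theta[1] -= learning_rate * (error * x).mean()

    print(theta)  # approaches the least-squares fit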
@@ -17,36 +17,35 @@ Inputs:

 Usage:
 1. define 'k' value, 'X' features array and 'heterogeneity' empty list

 2. create initial_centroids,
        initial_centroids = get_initial_centroids(
            X,
            k,
            seed=0 # seed value for initial centroid generation, None for randomness(default=None)
            )

 3. find centroids and clusters using kmeans function.

        centroids, cluster_assignment = kmeans(
            X,
            k,
            initial_centroids,
            maxiter=400,
            record_heterogeneity=heterogeneity,
            verbose=True # whether to print logs in console or not.(default=False)
            )

 4. Plot the loss function, the heterogeneity values for every iteration saved in the heterogeneity list.
        plot_heterogeneity(
            heterogeneity,
            k
            )

 5. Have fun..

 '''
-from __future__ import print_function
 from sklearn.metrics import pairwise_distances
 import numpy as np
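Steps 1 through 4 of the usage notes above amount to a short driver script; a sketch with a toy X array (the data and k are illustrative, and plot_heterogeneity is assumed to be defined later in the same module):

    import numpy as np

    X = np.array([[1.0, 2.0], [1.5, 1.8], [5.0, 8.0], [8.0, 8.0], [1.0, 0.6], [9.0, 11.0]])
    k = 2
    heterogeneity = []

    initial_centroids = get_initial_centroids(X, k, seed=0)
    centroids, cluster_assignment = kmeans(
        X, k, initial_centroids,
        maxiter=400,
        record_heterogeneity=heterogeneity,
        verbose=True,
    )
    plot_heterogeneity(heterogeneity, k)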
@@ -57,30 +56,30 @@ def get_initial_centroids(data, k, seed=None):
     if seed is not None:  # useful for obtaining consistent results
         np.random.seed(seed)
     n = data.shape[0]  # number of data points

     # Pick K indices from range [0, N).
     rand_indices = np.random.randint(0, n, k)

     # Keep centroids as dense format, as many entries will be nonzero due to averaging.
     # As long as at least one document in a cluster contains a word,
     # it will carry a nonzero weight in the TF-IDF vector of the centroid.
     centroids = data[rand_indices, :]

     return centroids


 def centroid_pairwise_dist(X, centroids):
     return pairwise_distances(X, centroids, metric='euclidean')


 def assign_clusters(data, centroids):

     # Compute distances between each data point and the set of centroids:
     # Fill in the blank (RHS only)
     distances_from_centroids = centroid_pairwise_dist(data, centroids)

     # Compute cluster assignments for each data point:
     # Fill in the blank (RHS only)
     cluster_assignment = np.argmin(distances_from_centroids, axis=1)

     return cluster_assignment
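As a worked illustration of the argmin step (toy numbers, not from the repo): given three points and two centroids, pairwise_distances yields a 3x2 matrix, and argmin along axis 1 picks each row's nearest centroid.

    import numpy as np
    from sklearn.metrics import pairwise_distances

    points = np.array([[0.0, 0.0], [1.0, 0.0], [9.0, 9.0]])
    centroids = np.array([[0.0, 0.0], [10.0, 10.0]])
    d = pairwise_distances(points, centroids, metric='euclidean')  # shape (3, 2)
    print(np.argmin(d, axis=1))  # [0 0 1]: first two points join centroid 0, the last joins centroid 1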

 def revise_centroids(data, k, cluster_assignment):
@@ -92,23 +91,23 @@ def revise_centroids(data, k, cluster_assignment):
         centroid = member_data_points.mean(axis=0)
         new_centroids.append(centroid)
     new_centroids = np.array(new_centroids)

     return new_centroids


 def compute_heterogeneity(data, k, centroids, cluster_assignment):

     heterogeneity = 0.0
     for i in range(k):

         # Select all data points that belong to cluster i. Fill in the blank (RHS only)
         member_data_points = data[cluster_assignment == i, :]

         if member_data_points.shape[0] > 0:  # check if i-th cluster is non-empty
             # Compute distances from centroid to data points (RHS only)
             distances = pairwise_distances(member_data_points, [centroids[i]], metric='euclidean')
             squared_distances = distances ** 2
             heterogeneity += np.sum(squared_distances)

     return heterogeneity
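In equation form, compute_heterogeneity accumulates the usual k-means objective, writing C_i for the set of points assigned to cluster i and mu_i for its centroid:

    \text{heterogeneity} = \sum_{i=1}^{k} \sum_{x \in C_i} \lVert x - \mu_i \rVert^2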
 from matplotlib import pyplot as plt

@@ -129,36 +128,36 @@ def kmeans(data, k, initial_centroids, maxiter=500, record_heterogeneity=None, verbose=False):
     verbose: if True, print how many data points changed their cluster labels in each iteration'''
     centroids = initial_centroids[:]
     prev_cluster_assignment = None

     for itr in range(maxiter):
         if verbose:
             print(itr, end='')

         # 1. Make cluster assignments using nearest centroids
         cluster_assignment = assign_clusters(data, centroids)

         # 2. Compute a new centroid for each of the k clusters, averaging all data points assigned to that cluster.
         centroids = revise_centroids(data, k, cluster_assignment)

         # Check for convergence: if none of the assignments changed, stop
         if prev_cluster_assignment is not None and \
            (prev_cluster_assignment == cluster_assignment).all():
             break

         # Print number of new assignments
         if prev_cluster_assignment is not None:
             num_changed = np.sum(prev_cluster_assignment != cluster_assignment)
             if verbose:
                 print(' {0:5d} elements changed their cluster assignment.'.format(num_changed))

         # Record heterogeneity convergence metric
         if record_heterogeneity is not None:
             # YOUR CODE HERE
             score = compute_heterogeneity(data, k, centroids, cluster_assignment)
             record_heterogeneity.append(score)

         prev_cluster_assignment = cluster_assignment[:]

     return centroids, cluster_assignment


 # Mock test below
@@ -7,8 +7,6 @@ We try to set these Feature weights, over many iterations, so that they best
 fit our dataset. In this particular code, I used a CSGO dataset (ADR vs
 Rating). We try to best fit a line through the dataset and estimate the parameters.
 """
-from __future__ import print_function
-
 import requests
 import numpy as np
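Since this module's goal is fitting a line to (ADR, Rating) pairs, here is a hedged sketch of the closed-form least-squares baseline that a gradient method should approach; the numbers are toy stand-ins, while the repo file fetches the real CSV over HTTP with requests:

    import numpy as np

    # Toy stand-ins for the CSGO columns
    adr = np.array([60.0, 75.0, 90.0, 105.0])
    rating = np.array([0.8, 1.0, 1.2, 1.4])

    # Closed-form least squares for rating ~ slope * adr + intercept
    slope, intercept = np.polyfit(adr, rating, deg=1)
    print(slope, intercept)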