Simplify code by dropping support for legacy Python (#1143)

* Simplify code by dropping support for legacy Python
* sort() --> sorted()
@@ -1,10 +1,8 @@
 """
 Implementation of a basic regression decision tree.
 Input data set: The input data set must be 1-dimensional with continuous labels.
 Output: The decision tree maps a real number input to a real number output.
 """
-from __future__ import print_function
-
 import numpy as np

 class Decision_Tree:
@@ -19,7 +17,7 @@ class Decision_Tree:
     def mean_squared_error(self, labels, prediction):
         """
         mean_squared_error:
         @param labels: a one dimensional numpy array
         @param prediction: a floating point value
         return value: mean_squared_error calculates the error if prediction is used to estimate the labels
         """
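The error metric named in this docstring is a one-liner in numpy; a minimal sketch of the idea (the method body itself is elided from this hunk):

    import numpy as np

    def mean_squared_error(labels, prediction):
        # Mean of squared deviations between each label and the scalar prediction
        return np.mean((labels - prediction) ** 2)

    print(mean_squared_error(np.array([1.0, 2.0, 3.0]), 2.0))  # 0.666...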
@@ -32,7 +30,7 @@ class Decision_Tree:
         """
         train:
         @param X: a one dimensional numpy array
         @param y: a one dimensional numpy array.
         The contents of y are the labels for the corresponding X values

         train does not have a return value
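Taken together, the two docstrings suggest a usage pattern along these lines; a hedged sketch, assuming constructor arguments and a predict method that this excerpt does not show:

    import numpy as np

    tree = Decision_Tree(depth=5, min_leaf_size=5)  # constructor signature assumed, not shown here
    X = np.arange(-1.0, 1.0, 0.005)
    y = np.sin(X)  # 1-dimensional continuous labels, as the module docstring requires
    tree.train(X, y)
    test_cases = (np.random.rand(10) * 2) - 1
    predictions = np.array([tree.predict(x) for x in test_cases])  # predict() assumed
    avg_error = np.mean((predictions - np.sin(test_cases)) ** 2)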
@@ -135,6 +133,6 @@ def main():
     print("Predictions: " + str(predictions))
     print("Average error: " + str(avg_error))


 if __name__ == '__main__':
     main()
@@ -1,7 +1,6 @@
 """
 Implementation of gradient descent algorithm for minimizing cost of a linear hypothesis function.
 """
-from __future__ import print_function, division
 import numpy

 # List of input, output pairs
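For orientation, here is one way such a descent loop can look for a linear hypothesis h(x) = theta0 + theta1 * x with a mean-squared-error cost; the data and names are illustrative, not the file's own:

    import numpy

    # Hypothetical (input, output) pairs for a linear hypothesis
    data = numpy.array([(1.0, 2.1), (2.0, 4.2), (3.0, 5.9)])
    theta = numpy.zeros(2)
    learning_rate = 0.01

    for _ in range(1000):
        x, y = data[:, 0], data[:, 1]
        predictions = theta[0] + theta[1] * x
        error = predictions - y
        # Gradient of the mean-squared-error cost with respect to each parameter
        theta[0] -= learning_rate * error.mean()
        theta[1] -= learning_rate * (error * x).mean()

    print(theta)  # approaches the least-squares fit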
@@ -17,36 +17,35 @@ Inputs:

 Usage:
 1. define 'k' value, 'X' features array and 'heterogeneity' empty list

 2. create initial_centroids,
        initial_centroids = get_initial_centroids(
            X,
            k,
            seed=0 # seed value for initial centroid generation, None for randomness(default=None)
            )

 3. find centroids and clusters using kmeans function.

        centroids, cluster_assignment = kmeans(
            X,
            k,
            initial_centroids,
            maxiter=400,
            record_heterogeneity=heterogeneity,
            verbose=True # whether to print logs in console or not.(default=False)
            )

 4. Plot the loss function, the heterogeneity values for every iteration saved in the heterogeneity list.
        plot_heterogeneity(
            heterogeneity,
            k
            )

 5. Have fun..

 '''
-from __future__ import print_function
 from sklearn.metrics import pairwise_distances
 import numpy as np
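Steps 1 through 4 of the usage notes above amount to a short driver script; a sketch with a toy X array (the data and k are illustrative, and plot_heterogeneity is assumed to be defined later in the same module):

    import numpy as np

    X = np.array([[1.0, 2.0], [1.5, 1.8], [5.0, 8.0], [8.0, 8.0], [1.0, 0.6], [9.0, 11.0]])
    k = 2
    heterogeneity = []

    initial_centroids = get_initial_centroids(X, k, seed=0)
    centroids, cluster_assignment = kmeans(
        X, k, initial_centroids,
        maxiter=400,
        record_heterogeneity=heterogeneity,
        verbose=True,
    )
    plot_heterogeneity(heterogeneity, k)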
@@ -57,30 +56,30 @@ def get_initial_centroids(data, k, seed=None):
     if seed is not None:  # useful for obtaining consistent results
         np.random.seed(seed)
     n = data.shape[0]  # number of data points

     # Pick K indices from range [0, N).
     rand_indices = np.random.randint(0, n, k)

     # Keep centroids as dense format, as many entries will be nonzero due to averaging.
     # As long as at least one document in a cluster contains a word,
     # it will carry a nonzero weight in the TF-IDF vector of the centroid.
     centroids = data[rand_indices, :]

     return centroids


 def centroid_pairwise_dist(X, centroids):
     return pairwise_distances(X, centroids, metric='euclidean')


 def assign_clusters(data, centroids):

     # Compute distances between each data point and the set of centroids:
     # Fill in the blank (RHS only)
     distances_from_centroids = centroid_pairwise_dist(data, centroids)

     # Compute cluster assignments for each data point:
     # Fill in the blank (RHS only)
     cluster_assignment = np.argmin(distances_from_centroids, axis=1)

     return cluster_assignment
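As a worked illustration of the argmin step (toy numbers, not from the repo): given three points and two centroids, pairwise_distances yields a 3x2 matrix, and argmin along axis 1 picks each row's nearest centroid.

    import numpy as np
    from sklearn.metrics import pairwise_distances

    points = np.array([[0.0, 0.0], [1.0, 0.0], [9.0, 9.0]])
    centroids = np.array([[0.0, 0.0], [10.0, 10.0]])
    d = pairwise_distances(points, centroids, metric='euclidean')  # shape (3, 2)
    print(np.argmin(d, axis=1))  # [0 0 1]: first two points join centroid 0, the last joins centroid 1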

 def revise_centroids(data, k, cluster_assignment):
@@ -92,23 +91,23 @@ def revise_centroids(data, k, cluster_assignment):
         centroid = member_data_points.mean(axis=0)
         new_centroids.append(centroid)
     new_centroids = np.array(new_centroids)

     return new_centroids


 def compute_heterogeneity(data, k, centroids, cluster_assignment):

     heterogeneity = 0.0
     for i in range(k):

         # Select all data points that belong to cluster i. Fill in the blank (RHS only)
         member_data_points = data[cluster_assignment == i, :]

         if member_data_points.shape[0] > 0:  # check if i-th cluster is non-empty
             # Compute distances from centroid to data points (RHS only)
             distances = pairwise_distances(member_data_points, [centroids[i]], metric='euclidean')
             squared_distances = distances ** 2
             heterogeneity += np.sum(squared_distances)

     return heterogeneity
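In equation form, compute_heterogeneity accumulates the usual k-means objective, writing C_i for the set of points assigned to cluster i and mu_i for its centroid:

    \text{heterogeneity} = \sum_{i=1}^{k} \sum_{x \in C_i} \lVert x - \mu_i \rVert^2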
 from matplotlib import pyplot as plt

@@ -129,36 +128,36 @@ def kmeans(data, k, initial_centroids, maxiter=500, record_heterogeneity=None, verbose=False):
     verbose: if True, print how many data points changed their cluster labels in each iteration'''
     centroids = initial_centroids[:]
     prev_cluster_assignment = None

     for itr in range(maxiter):
         if verbose:
             print(itr, end='')

         # 1. Make cluster assignments using nearest centroids
         cluster_assignment = assign_clusters(data, centroids)

         # 2. Compute a new centroid for each of the k clusters, averaging all data points assigned to that cluster.
         centroids = revise_centroids(data, k, cluster_assignment)

         # Check for convergence: if none of the assignments changed, stop
         if prev_cluster_assignment is not None and \
            (prev_cluster_assignment == cluster_assignment).all():
             break

         # Print number of new assignments
         if prev_cluster_assignment is not None:
             num_changed = np.sum(prev_cluster_assignment != cluster_assignment)
             if verbose:
                 print(' {0:5d} elements changed their cluster assignment.'.format(num_changed))

         # Record heterogeneity convergence metric
         if record_heterogeneity is not None:
             # YOUR CODE HERE
             score = compute_heterogeneity(data, k, centroids, cluster_assignment)
             record_heterogeneity.append(score)

         prev_cluster_assignment = cluster_assignment[:]

     return centroids, cluster_assignment


 # Mock test below
@@ -7,8 +7,6 @@ We try to set these Feature weights, over many iterations, so that they best
 fit our dataset. In this particular code, I used a CSGO dataset (ADR vs
 Rating). We try to best fit a line through the dataset and estimate the parameters.
 """
-from __future__ import print_function
-
 import requests
 import numpy as np
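Since this module's goal is fitting a line to (ADR, Rating) pairs, here is a hedged sketch of the closed-form least-squares baseline that a gradient method should approach; the numbers are toy stand-ins, while the repo file fetches the real CSV over HTTP with requests:

    import numpy as np

    # Toy stand-ins for the CSGO columns
    adr = np.array([60.0, 75.0, 90.0, 105.0])
    rating = np.array([0.8, 1.0, 1.2, 1.4])

    # Closed-form least squares for rating ~ slope * adr + intercept
    slope, intercept = np.polyfit(adr, rating, deg=1)
    print(slope, intercept)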