Added Dequeue in Python

This commit is contained in:
97arushisharma
2017-10-25 01:37:11 +05:30
commit 9bc80eac2d
105 changed files with 295341 additions and 0 deletions

View File

@ -0,0 +1,139 @@
"""
Implementation of a basic regression decision tree.
Input data set: The input data set must be 1-dimensional with continuous labels.
Output: The decision tree maps a real number input to a real number output.
"""
import numpy as np
class Decision_Tree:
def __init__(self, depth = 5, min_leaf_size = 5):
self.depth = depth
self.decision_boundary = 0
self.left = None
self.right = None
self.min_leaf_size = min_leaf_size
self.prediction = None
def mean_squared_error(self, labels, prediction):
"""
mean_squared_error:
@param labels: a one dimensional numpy array
@param prediction: a floating point value
return value: mean_squared_error calculates the error if prediction is used to estimate the labels
"""
if labels.ndim != 1:
print("Error: Input labels must be one dimensional")
return np.mean((labels - prediction) ** 2)
def train(self, X, y):
"""
train:
@param X: a one dimensional numpy array
@param y: a one dimensional numpy array.
The contents of y are the labels for the corresponding X values
train does not have a return value
"""
"""
this section is to check that the inputs conform to our dimensionality constraints
"""
if X.ndim != 1:
print("Error: Input data set must be one dimensional")
return
if len(X) != len(y):
print("Error: X and y have different lengths")
return
if y.ndim != 1:
print("Error: Data set labels must be one dimensional")
return
if len(X) < 2 * self.min_leaf_size:
self.prediction = np.mean(y)
return
if self.depth == 1:
self.prediction = np.mean(y)
return
best_split = 0
min_error = self.mean_squared_error(X,np.mean(y)) * 2
"""
loop over all possible splits for the decision tree. find the best split.
if no split exists that is less than 2 * error for the entire array
then the data set is not split and the average for the entire array is used as the predictor
"""
for i in range(len(X)):
if len(X[:i]) < self.min_leaf_size:
continue
elif len(X[i:]) < self.min_leaf_size:
continue
else:
error_left = self.mean_squared_error(X[:i], np.mean(y[:i]))
error_right = self.mean_squared_error(X[i:], np.mean(y[i:]))
error = error_left + error_right
if error < min_error:
best_split = i
min_error = error
if best_split != 0:
left_X = X[:best_split]
left_y = y[:best_split]
right_X = X[best_split:]
right_y = y[best_split:]
self.decision_boundary = X[best_split]
self.left = Decision_Tree(depth = self.depth - 1, min_leaf_size = self.min_leaf_size)
self.right = Decision_Tree(depth = self.depth - 1, min_leaf_size = self.min_leaf_size)
self.left.train(left_X, left_y)
self.right.train(right_X, right_y)
else:
self.prediction = np.mean(y)
return
def predict(self, x):
"""
predict:
@param x: a floating point value to predict the label of
the prediction function works by recursively calling the predict function
of the appropriate subtrees based on the tree's decision boundary
"""
if self.prediction is not None:
return self.prediction
elif self.left or self.right is not None:
if x >= self.decision_boundary:
return self.right.predict(x)
else:
return self.left.predict(x)
else:
print("Error: Decision tree not yet trained")
return None
def main():
"""
In this demonstration we're generating a sample data set from the sin function in numpy.
We then train a decision tree on the data set and use the decision tree to predict the
label of 10 different test values. Then the mean squared error over this test is displayed.
"""
X = np.arange(-1., 1., 0.005)
y = np.sin(X)
tree = Decision_Tree(depth = 10, min_leaf_size = 10)
tree.train(X,y)
test_cases = (np.random.rand(10) * 2) - 1
predictions = np.array([tree.predict(x) for x in test_cases])
avg_error = np.mean((predictions - test_cases) ** 2)
print("Test values: " + str(test_cases))
print("Predictions: " + str(predictions))
print("Average error: " + str(avg_error))
if __name__ == '__main__':
main()

View File

@ -0,0 +1,121 @@
"""
Implementation of gradient descent algorithm for minimizing cost of a linear hypothesis function.
"""
import numpy
# List of input, output pairs
train_data = (((5, 2, 3), 15), ((6, 5, 9), 25),
((11, 12, 13), 41), ((1, 1, 1), 8), ((11, 12, 13), 41))
test_data = (((515, 22, 13), 555), ((61, 35, 49), 150))
parameter_vector = [2, 4, 1, 5]
m = len(train_data)
LEARNING_RATE = 0.009
def _error(example_no, data_set='train'):
"""
:param data_set: train data or test data
:param example_no: example number whose error has to be checked
:return: error in example pointed by example number.
"""
return calculate_hypothesis_value(example_no, data_set) - output(example_no, data_set)
def _hypothesis_value(data_input_tuple):
"""
Calculates hypothesis function value for a given input
:param data_input_tuple: Input tuple of a particular example
:return: Value of hypothesis function at that point.
Note that there is an 'biased input' whose value is fixed as 1.
It is not explicitly mentioned in input data.. But, ML hypothesis functions use it.
So, we have to take care of it separately. Line 36 takes care of it.
"""
hyp_val = 0
for i in range(len(parameter_vector) - 1):
hyp_val += data_input_tuple[i]*parameter_vector[i+1]
hyp_val += parameter_vector[0]
return hyp_val
def output(example_no, data_set):
"""
:param data_set: test data or train data
:param example_no: example whose output is to be fetched
:return: output for that example
"""
if data_set == 'train':
return train_data[example_no][1]
elif data_set == 'test':
return test_data[example_no][1]
def calculate_hypothesis_value(example_no, data_set):
"""
Calculates hypothesis value for a given example
:param data_set: test data or train_data
:param example_no: example whose hypothesis value is to be calculated
:return: hypothesis value for that example
"""
if data_set == "train":
return _hypothesis_value(train_data[example_no][0])
elif data_set == "test":
return _hypothesis_value(test_data[example_no][0])
def summation_of_cost_derivative(index, end=m):
"""
Calculates the sum of cost function derivative
:param index: index wrt derivative is being calculated
:param end: value where summation ends, default is m, number of examples
:return: Returns the summation of cost derivative
Note: If index is -1, this means we are calculating summation wrt to biased parameter.
"""
summation_value = 0
for i in range(end):
if index == -1:
summation_value += _error(i)
else:
summation_value += _error(i)*train_data[i][0][index]
return summation_value
def get_cost_derivative(index):
"""
:param index: index of the parameter vector wrt to derivative is to be calculated
:return: derivative wrt to that index
Note: If index is -1, this means we are calculating summation wrt to biased parameter.
"""
cost_derivative_value = summation_of_cost_derivative(index, m)/m
return cost_derivative_value
def run_gradient_descent():
global parameter_vector
# Tune these values to set a tolerance value for predicted output
absolute_error_limit = 0.000002
relative_error_limit = 0
j = 0
while True:
j += 1
temp_parameter_vector = [0, 0, 0, 0]
for i in range(0, len(parameter_vector)):
cost_derivative = get_cost_derivative(i-1)
temp_parameter_vector[i] = parameter_vector[i] - \
LEARNING_RATE*cost_derivative
if numpy.allclose(parameter_vector, temp_parameter_vector,
atol=absolute_error_limit, rtol=relative_error_limit):
break
parameter_vector = temp_parameter_vector
print("Number of iterations:", j)
def test_gradient_descent():
for i in range(len(test_data)):
print("Actual output value:", output(i, 'test'))
print("Hypothesis output:", calculate_hypothesis_value(i, 'test'))
if __name__ == '__main__':
run_gradient_descent()
print("\nTesting gradient descent for a linear hypothesis function.\n")
test_gradient_descent()

View File

@ -0,0 +1,172 @@
'''README, Author - Anurag Kumar(mailto:anuragkumarak95@gmail.com)
Requirements:
- sklearn
- numpy
- matplotlib
Python:
- 3.5
Inputs:
- X , a 2D numpy array of features.
- k , number of clusters to create.
- initial_centroids , initial centroid values generated by utility function(mentioned in usage).
- maxiter , maximum number of iterations to process.
- heterogeneity , empty list that will be filled with hetrogeneity values if passed to kmeans func.
Usage:
1. define 'k' value, 'X' features array and 'hetrogeneity' empty list
2. create initial_centroids,
initial_centroids = get_initial_centroids(
X,
k,
seed=0 # seed value for initial centroid generation, None for randomness(default=None)
)
3. find centroids and clusters using kmeans function.
centroids, cluster_assignment = kmeans(
X,
k,
initial_centroids,
maxiter=400,
record_heterogeneity=heterogeneity,
verbose=True # whether to print logs in console or not.(default=False)
)
4. Plot the loss function, hetrogeneity values for every iteration saved in hetrogeneity list.
plot_heterogeneity(
heterogeneity,
k
)
5. Have fun..
'''
from sklearn.metrics import pairwise_distances
import numpy as np
TAG = 'K-MEANS-CLUST/ '
def get_initial_centroids(data, k, seed=None):
'''Randomly choose k data points as initial centroids'''
if seed is not None: # useful for obtaining consistent results
np.random.seed(seed)
n = data.shape[0] # number of data points
# Pick K indices from range [0, N).
rand_indices = np.random.randint(0, n, k)
# Keep centroids as dense format, as many entries will be nonzero due to averaging.
# As long as at least one document in a cluster contains a word,
# it will carry a nonzero weight in the TF-IDF vector of the centroid.
centroids = data[rand_indices,:]
return centroids
def centroid_pairwise_dist(X,centroids):
return pairwise_distances(X,centroids,metric='euclidean')
def assign_clusters(data, centroids):
# Compute distances between each data point and the set of centroids:
# Fill in the blank (RHS only)
distances_from_centroids = centroid_pairwise_dist(data,centroids)
# Compute cluster assignments for each data point:
# Fill in the blank (RHS only)
cluster_assignment = np.argmin(distances_from_centroids,axis=1)
return cluster_assignment
def revise_centroids(data, k, cluster_assignment):
new_centroids = []
for i in range(k):
# Select all data points that belong to cluster i. Fill in the blank (RHS only)
member_data_points = data[cluster_assignment==i]
# Compute the mean of the data points. Fill in the blank (RHS only)
centroid = member_data_points.mean(axis=0)
new_centroids.append(centroid)
new_centroids = np.array(new_centroids)
return new_centroids
def compute_heterogeneity(data, k, centroids, cluster_assignment):
heterogeneity = 0.0
for i in range(k):
# Select all data points that belong to cluster i. Fill in the blank (RHS only)
member_data_points = data[cluster_assignment==i, :]
if member_data_points.shape[0] > 0: # check if i-th cluster is non-empty
# Compute distances from centroid to data points (RHS only)
distances = pairwise_distances(member_data_points, [centroids[i]], metric='euclidean')
squared_distances = distances**2
heterogeneity += np.sum(squared_distances)
return heterogeneity
from matplotlib import pyplot as plt
def plot_heterogeneity(heterogeneity, k):
plt.figure(figsize=(7,4))
plt.plot(heterogeneity, linewidth=4)
plt.xlabel('# Iterations')
plt.ylabel('Heterogeneity')
plt.title('Heterogeneity of clustering over time, K={0:d}'.format(k))
plt.rcParams.update({'font.size': 16})
plt.show()
def kmeans(data, k, initial_centroids, maxiter=500, record_heterogeneity=None, verbose=False):
'''This function runs k-means on given data and initial set of centroids.
maxiter: maximum number of iterations to run.(default=500)
record_heterogeneity: (optional) a list, to store the history of heterogeneity as function of iterations
if None, do not store the history.
verbose: if True, print how many data points changed their cluster labels in each iteration'''
centroids = initial_centroids[:]
prev_cluster_assignment = None
for itr in range(maxiter):
if verbose:
print(itr, end='')
# 1. Make cluster assignments using nearest centroids
cluster_assignment = assign_clusters(data,centroids)
# 2. Compute a new centroid for each of the k clusters, averaging all data points assigned to that cluster.
centroids = revise_centroids(data,k, cluster_assignment)
# Check for convergence: if none of the assignments changed, stop
if prev_cluster_assignment is not None and \
(prev_cluster_assignment==cluster_assignment).all():
break
# Print number of new assignments
if prev_cluster_assignment is not None:
num_changed = np.sum(prev_cluster_assignment!=cluster_assignment)
if verbose:
print(' {0:5d} elements changed their cluster assignment.'.format(num_changed))
# Record heterogeneity convergence metric
if record_heterogeneity is not None:
# YOUR CODE HERE
score = compute_heterogeneity(data,k,centroids,cluster_assignment)
record_heterogeneity.append(score)
prev_cluster_assignment = cluster_assignment[:]
return centroids, cluster_assignment
# Mock test below
if False: # change to true to run this test case.
import sklearn.datasets as ds
dataset = ds.load_iris()
k = 3
heterogeneity = []
initial_centroids = get_initial_centroids(dataset['data'], k, seed=0)
centroids, cluster_assignment = kmeans(dataset['data'], k, initial_centroids, maxiter=400,
record_heterogeneity=heterogeneity, verbose=True)
plot_heterogeneity(heterogeneity, k)

View File

@ -0,0 +1,108 @@
"""
Linear regression is the most basic type of regression commonly used for
predictive analysis. The idea is preety simple, we have a dataset and we have
a feature's associated with it. The Features should be choose very cautiously
as they determine, how much our model will be able to make future predictions.
We try to set these Feature weights, over many iterations, so that they best
fits our dataset. In this particular code, i had used a CSGO dataset (ADR vs
Rating). We try to best fit a line through dataset and estimate the parameters.
"""
import requests
import numpy as np
def collect_dataset():
""" Collect dataset of CSGO
The dataset contains ADR vs Rating of a Player
:return : dataset obtained from the link, as matrix
"""
response = requests.get('https://raw.githubusercontent.com/yashLadha/' +
'The_Math_of_Intelligence/master/Week1/ADRvs' +
'Rating.csv')
lines = response.text.splitlines()
data = []
for item in lines:
item = item.split(',')
data.append(item)
data.pop(0) # This is for removing the labels from the list
dataset = np.matrix(data)
return dataset
def run_steep_gradient_descent(data_x, data_y,
len_data, alpha, theta):
""" Run steep gradient descent and updates the Feature vector accordingly_
:param data_x : contains the dataset
:param data_y : contains the output associated with each data-entry
:param len_data : length of the data_
:param alpha : Learning rate of the model
:param theta : Feature vector (weight's for our model)
;param return : Updated Feature's, using
curr_features - alpha_ * gradient(w.r.t. feature)
"""
n = len_data
prod = np.dot(theta, data_x.transpose())
prod -= data_y.transpose()
sum_grad = np.dot(prod, data_x)
theta = theta - (alpha / n) * sum_grad
return theta
def sum_of_square_error(data_x, data_y, len_data, theta):
""" Return sum of square error for error calculation
:param data_x : contains our dataset
:param data_y : contains the output (result vector)
:param len_data : len of the dataset
:param theta : contains the feature vector
:return : sum of square error computed from given feature's
"""
error = 0.0
prod = np.dot(theta, data_x.transpose())
prod -= data_y.transpose()
sum_elem = np.sum(np.square(prod))
error = sum_elem / (2 * len_data)
return error
def run_linear_regression(data_x, data_y):
""" Implement Linear regression over the dataset
:param data_x : contains our dataset
:param data_y : contains the output (result vector)
:return : feature for line of best fit (Feature vector)
"""
iterations = 100000
alpha = 0.0001550
no_features = data_x.shape[1]
len_data = data_x.shape[0] - 1
theta = np.zeros((1, no_features))
for i in range(0, iterations):
theta = run_steep_gradient_descent(data_x, data_y,
len_data, alpha, theta)
error = sum_of_square_error(data_x, data_y, len_data, theta)
print('At Iteration %d - Error is %.5f ' % (i + 1, error))
return theta
def main():
""" Driver function """
data = collect_dataset()
len_data = data.shape[0]
data_x = np.c_[np.ones(len_data), data[:, :-1]].astype(float)
data_y = data[:, -1].astype(float)
theta = run_linear_regression(data_x, data_y)
len_result = theta.shape[1]
print('Resultant Feature vector : ')
for i in range(0, len_result):
print('%.5f' % (theta[0, i]))
if __name__ == '__main__':
main()

View File

@ -0,0 +1,123 @@
'''
Perceptron
w = w + N * (d(k) - y) * x(k)
Using perceptron network for oil analysis,
with Measuring of 3 parameters that represent chemical characteristics we can classify the oil, in p1 or p2
p1 = -1
p2 = 1
'''
import random
class Perceptron:
def __init__(self, sample, exit, learn_rate=0.01, epoch_number=1000, bias=-1):
self.sample = sample
self.exit = exit
self.learn_rate = learn_rate
self.epoch_number = epoch_number
self.bias = bias
self.number_sample = len(sample)
self.col_sample = len(sample[0])
self.weight = []
def trannig(self):
for sample in self.sample:
sample.insert(0, self.bias)
for i in range(self.col_sample):
self.weight.append(random.random())
self.weight.insert(0, self.bias)
epoch_count = 0
while True:
erro = False
for i in range(self.number_sample):
u = 0
for j in range(self.col_sample + 1):
u = u + self.weight[j] * self.sample[i][j]
y = self.sign(u)
if y != self.exit[i]:
for j in range(self.col_sample + 1):
self.weight[j] = self.weight[j] + self.learn_rate * (self.exit[i] - y) * self.sample[i][j]
erro = True
#print('Epoch: \n',epoch_count)
epoch_count = epoch_count + 1
# if you want controle the epoch or just by erro
if erro == False:
print('\nEpoch:\n',epoch_count)
print('------------------------\n')
#if epoch_count > self.epoch_number or not erro:
break
def sort(self, sample):
sample.insert(0, self.bias)
u = 0
for i in range(self.col_sample + 1):
u = u + self.weight[i] * sample[i]
y = self.sign(u)
if y == -1:
print('Sample: ', sample)
print('classification: P1')
else:
print('Sample: ', sample)
print('classification: P2')
def sign(self, u):
return 1 if u >= 0 else -1
samples = [
[-0.6508, 0.1097, 4.0009],
[-1.4492, 0.8896, 4.4005],
[2.0850, 0.6876, 12.0710],
[0.2626, 1.1476, 7.7985],
[0.6418, 1.0234, 7.0427],
[0.2569, 0.6730, 8.3265],
[1.1155, 0.6043, 7.4446],
[0.0914, 0.3399, 7.0677],
[0.0121, 0.5256, 4.6316],
[-0.0429, 0.4660, 5.4323],
[0.4340, 0.6870, 8.2287],
[0.2735, 1.0287, 7.1934],
[0.4839, 0.4851, 7.4850],
[0.4089, -0.1267, 5.5019],
[1.4391, 0.1614, 8.5843],
[-0.9115, -0.1973, 2.1962],
[0.3654, 1.0475, 7.4858],
[0.2144, 0.7515, 7.1699],
[0.2013, 1.0014, 6.5489],
[0.6483, 0.2183, 5.8991],
[-0.1147, 0.2242, 7.2435],
[-0.7970, 0.8795, 3.8762],
[-1.0625, 0.6366, 2.4707],
[0.5307, 0.1285, 5.6883],
[-1.2200, 0.7777, 1.7252],
[0.3957, 0.1076, 5.6623],
[-0.1013, 0.5989, 7.1812],
[2.4482, 0.9455, 11.2095],
[2.0149, 0.6192, 10.9263],
[0.2012, 0.2611, 5.4631]
]
exit = [-1, -1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1]
network = Perceptron(sample=samples, exit = exit, learn_rate=0.01, epoch_number=1000, bias=-1)
network.trannig()
while True:
sample = []
for i in range(3):
sample.insert(i, float(input('value: ')))
network.sort(sample)

View File

@ -0,0 +1,63 @@
import numpy
""" Here I implemented the scoring functions.
MAE, MSE, RMSE, RMSLE are included.
Those are used for calculating differences between
predicted values and actual values.
Metrics are slightly differentiated. Sometimes squared, rooted,
even log is used.
Using log and roots can be perceived as tools for penalizing big
erors. However, using appropriate metrics depends on the situations,
and types of data
"""
#Mean Absolute Error
def mae(predict, actual):
predict = np.array(predict)
actual = np.array(actual)
difference = abs(predict - actual)
score = difference.mean()
return score
#Mean Squared Error
def mse(predict, actual):
predict = np.array(predict)
actual = np.array(actual)
difference = predict - actual
square_diff = np.square(difference)
score = square_diff.mean()
return score
#Root Mean Squared Error
def rmse(predict, actual):
predict = np.array(predict)
actual = np.array(actual)
difference = predict - actual
square_diff = np.square(dfference)
mean_square_diff = square_diff.mean()
score = np.sqrt(mean_square_diff)
return score
#Root Mean Square Logarithmic Error
def rmsle(predict, actual):
predict = np.array(predict)
actual = np.array(actual)
log_predict = np.log(predict+1)
log_actual = np.log(actual+1)
difference = log_predict - log_actual
square_diff = np.square(difference)
mean_square_diff = square_diff.mean()
score = np.sqrt(mean_square_diff)
return score