Add pep8-naming to pre-commit hooks and fix incorrect naming conventions (#7062)

* ci(pre-commit): Add pep8-naming to `pre-commit` hooks (#7038)

* refactor: Fix naming conventions (#7038)

* Update arithmetic_analysis/lu_decomposition.py

Co-authored-by: Christian Clauss <cclauss@me.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* refactor(lu_decomposition): Replace `NDArray` with `ArrayLike` (#7038)

* chore: Fix naming conventions in doctests (#7038)

* fix: Temporarily disable project euler problem 104 (#7069)

* chore: Fix naming conventions in doctests (#7038)

Co-authored-by: Christian Clauss <cclauss@me.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Author: Caeden
Date: 2022-10-12 23:54:20 +01:00
Committed by: GitHub
Parent: e2cd982b11
Commit: 07e991d553
140 changed files with 1552 additions and 1536 deletions
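For context, pep8-naming is a flake8 plugin (in a pre-commit setup it is typically listed under the flake8 hook's additional_dependencies in .pre-commit-config.yaml), and the renames in the diffs below follow its checks. A minimal illustrative sketch of those rules, not taken from the commit itself:

# pep8-naming reports, among others:
#   N801  class name should use CapWords convention
#   N803  argument name should be lowercase
#   N806  variable in function should be lowercase
# Before (flagged): class Decision_Tree, def train(self, X, y), NB_model = ...
class DecisionTree:                                 # CapWords class name (N801)
    def train(self, x: list, y: list) -> None:      # lowercase arguments (N803)
        sample_count = len(x)                       # lowercase local variable (N806)
        if sample_count != len(y):
            raise ValueError("x and y must have the same length")

DecisionTree().train([0.0, 1.0], [0.0, 1.0])  # should pass flake8 with pep8-naming enabled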

View File

@@ -6,7 +6,7 @@ Output: The decision tree maps a real number input to a real number output.
import numpy as np
class Decision_Tree:
class DecisionTree:
def __init__(self, depth=5, min_leaf_size=5):
self.depth = depth
self.decision_boundary = 0
@@ -22,17 +22,17 @@ class Decision_Tree:
@param prediction: a floating point value
return value: mean_squared_error calculates the error if prediction is used to
estimate the labels
>>> tester = Decision_Tree()
>>> tester = DecisionTree()
>>> test_labels = np.array([1,2,3,4,5,6,7,8,9,10])
>>> test_prediction = np.float(6)
>>> tester.mean_squared_error(test_labels, test_prediction) == (
... Test_Decision_Tree.helper_mean_squared_error_test(test_labels,
... TestDecisionTree.helper_mean_squared_error_test(test_labels,
... test_prediction))
True
>>> test_labels = np.array([1,2,3])
>>> test_prediction = np.float(2)
>>> tester.mean_squared_error(test_labels, test_prediction) == (
... Test_Decision_Tree.helper_mean_squared_error_test(test_labels,
... TestDecisionTree.helper_mean_squared_error_test(test_labels,
... test_prediction))
True
"""
@@ -41,10 +41,10 @@ class Decision_Tree:
return np.mean((labels - prediction) ** 2)
def train(self, X, y):
def train(self, x, y):
"""
train:
@param X: a one dimensional numpy array
@param x: a one dimensional numpy array
@param y: a one dimensional numpy array.
The contents of y are the labels for the corresponding X values
@@ -55,17 +55,17 @@ class Decision_Tree:
this section is to check that the inputs conform to our dimensionality
constraints
"""
if X.ndim != 1:
if x.ndim != 1:
print("Error: Input data set must be one dimensional")
return
if len(X) != len(y):
if len(x) != len(y):
print("Error: X and y have different lengths")
return
if y.ndim != 1:
print("Error: Data set labels must be one dimensional")
return
if len(X) < 2 * self.min_leaf_size:
if len(x) < 2 * self.min_leaf_size:
self.prediction = np.mean(y)
return
@@ -74,7 +74,7 @@ class Decision_Tree:
return
best_split = 0
min_error = self.mean_squared_error(X, np.mean(y)) * 2
min_error = self.mean_squared_error(x, np.mean(y)) * 2
"""
loop over all possible splits for the decision tree. find the best split.
@@ -82,34 +82,34 @@ class Decision_Tree:
then the data set is not split and the average for the entire array is used as
the predictor
"""
for i in range(len(X)):
if len(X[:i]) < self.min_leaf_size:
for i in range(len(x)):
if len(x[:i]) < self.min_leaf_size:
continue
elif len(X[i:]) < self.min_leaf_size:
elif len(x[i:]) < self.min_leaf_size:
continue
else:
error_left = self.mean_squared_error(X[:i], np.mean(y[:i]))
error_right = self.mean_squared_error(X[i:], np.mean(y[i:]))
error_left = self.mean_squared_error(x[:i], np.mean(y[:i]))
error_right = self.mean_squared_error(x[i:], np.mean(y[i:]))
error = error_left + error_right
if error < min_error:
best_split = i
min_error = error
if best_split != 0:
left_X = X[:best_split]
left_x = x[:best_split]
left_y = y[:best_split]
right_X = X[best_split:]
right_x = x[best_split:]
right_y = y[best_split:]
self.decision_boundary = X[best_split]
self.left = Decision_Tree(
self.decision_boundary = x[best_split]
self.left = DecisionTree(
depth=self.depth - 1, min_leaf_size=self.min_leaf_size
)
self.right = Decision_Tree(
self.right = DecisionTree(
depth=self.depth - 1, min_leaf_size=self.min_leaf_size
)
self.left.train(left_X, left_y)
self.right.train(right_X, right_y)
self.left.train(left_x, left_y)
self.right.train(right_x, right_y)
else:
self.prediction = np.mean(y)
@@ -134,7 +134,7 @@ class Decision_Tree:
return None
class Test_Decision_Tree:
class TestDecisionTree:
"""Decision Tres test class"""
@staticmethod
@@ -159,11 +159,11 @@ def main():
predict the label of 10 different test values. Then the mean squared error over
this test is displayed.
"""
X = np.arange(-1.0, 1.0, 0.005)
y = np.sin(X)
x = np.arange(-1.0, 1.0, 0.005)
y = np.sin(x)
tree = Decision_Tree(depth=10, min_leaf_size=10)
tree.train(X, y)
tree = DecisionTree(depth=10, min_leaf_size=10)
tree.train(x, y)
test_cases = (np.random.rand(10) * 2) - 1
predictions = np.array([tree.predict(x) for x in test_cases])

View File

@@ -17,19 +17,19 @@ def main():
iris = load_iris()
# Split dataset into train and test data
X = iris["data"] # features
Y = iris["target"]
x = iris["data"] # features
y = iris["target"]
x_train, x_test, y_train, y_test = train_test_split(
X, Y, test_size=0.3, random_state=1
x, y, test_size=0.3, random_state=1
)
# Gaussian Naive Bayes
NB_model = GaussianNB()
NB_model.fit(x_train, y_train)
nb_model = GaussianNB()
nb_model.fit(x_train, y_train)
# Display Confusion Matrix
plot_confusion_matrix(
NB_model,
nb_model,
x_test,
y_test,
display_labels=iris["target_names"],

View File

@@ -26,25 +26,25 @@ def main():
print(df_boston.describe().T)
# Feature selection
X = df_boston.iloc[:, :-1]
x = df_boston.iloc[:, :-1]
y = df_boston.iloc[:, -1] # target variable
# split the data with 75% train and 25% test sets.
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=0, test_size=0.25
x_train, x_test, y_train, y_test = train_test_split(
x, y, random_state=0, test_size=0.25
)
model = GradientBoostingRegressor(
n_estimators=500, max_depth=5, min_samples_split=4, learning_rate=0.01
)
# training the model
model.fit(X_train, y_train)
model.fit(x_train, y_train)
# to see how good the model fit the data
training_score = model.score(X_train, y_train).round(3)
test_score = model.score(X_test, y_test).round(3)
training_score = model.score(x_train, y_train).round(3)
test_score = model.score(x_test, y_test).round(3)
print("Training score of GradientBoosting is :", training_score)
print("The test score of GradientBoosting is :", test_score)
# Let us evaluate the model by finding the errors
y_pred = model.predict(X_test)
y_pred = model.predict(x_test)
# The mean squared error
print(f"Mean squared error: {mean_squared_error(y_test, y_pred):.2f}")

View File

@@ -69,8 +69,8 @@ def get_initial_centroids(data, k, seed=None):
return centroids
def centroid_pairwise_dist(X, centroids):
return pairwise_distances(X, centroids, metric="euclidean")
def centroid_pairwise_dist(x, centroids):
return pairwise_distances(x, centroids, metric="euclidean")
def assign_clusters(data, centroids):
@@ -197,8 +197,8 @@ if False: # change to true to run this test case.
plot_heterogeneity(heterogeneity, k)
def ReportGenerator(
df: pd.DataFrame, ClusteringVariables: np.ndarray, FillMissingReport=None
def report_generator(
df: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None
) -> pd.DataFrame:
"""
Function generates easy-reading clustering report. It takes 2 arguments as an input:
@@ -214,7 +214,7 @@ def ReportGenerator(
>>> data['col2'] = [100, 200, 300]
>>> data['col3'] = [10, 20, 30]
>>> data['Cluster'] = [1, 1, 2]
>>> ReportGenerator(data, ['col1', 'col2'], 0)
>>> report_generator(data, ['col1', 'col2'], 0)
Features Type Mark 1 2
0 # of Customers ClusterSize False 2.000000 1.000000
1 % of Customers ClusterProportion False 0.666667 0.333333
@@ -231,8 +231,8 @@ def ReportGenerator(
[104 rows x 5 columns]
"""
# Fill missing values with given rules
if FillMissingReport:
df.fillna(value=FillMissingReport, inplace=True)
if fill_missing_report:
df.fillna(value=fill_missing_report, inplace=True)
df["dummy"] = 1
numeric_cols = df.select_dtypes(np.number).columns
report = (
@@ -313,7 +313,7 @@ def ReportGenerator(
report = pd.concat(
[report, a, clustersize, clusterproportion], axis=0
) # concat report with cluster size and nan values
report["Mark"] = report["Features"].isin(ClusteringVariables)
report["Mark"] = report["Features"].isin(clustering_variables)
cols = report.columns.tolist()
cols = cols[0:2] + cols[-1:] + cols[2:-1]
report = report[cols]

View File

@@ -41,11 +41,11 @@ def local_weight(
[0.08272556]])
"""
weight = weighted_matrix(point, training_data_x, bandwidth)
W = (training_data_x.T * (weight * training_data_x)).I * (
w = (training_data_x.T * (weight * training_data_x)).I * (
training_data_x.T * weight * training_data_y.T
)
return W
return w
def local_weight_regression(

View File

@@ -35,25 +35,25 @@ def cost_function(h, y):
return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
def log_likelihood(X, Y, weights):
scores = np.dot(X, weights)
return np.sum(Y * scores - np.log(1 + np.exp(scores)))
def log_likelihood(x, y, weights):
scores = np.dot(x, weights)
return np.sum(y * scores - np.log(1 + np.exp(scores)))
# here alpha is the learning rate, X is the feature matrix, y is the target matrix
def logistic_reg(alpha, X, y, max_iterations=70000):
theta = np.zeros(X.shape[1])
def logistic_reg(alpha, x, y, max_iterations=70000):
theta = np.zeros(x.shape[1])
for iterations in range(max_iterations):
z = np.dot(X, theta)
z = np.dot(x, theta)
h = sigmoid_function(z)
gradient = np.dot(X.T, h - y) / y.size
gradient = np.dot(x.T, h - y) / y.size
theta = theta - alpha * gradient # updating the weights
z = np.dot(X, theta)
z = np.dot(x, theta)
h = sigmoid_function(z)
J = cost_function(h, y)
j = cost_function(h, y)
if iterations % 100 == 0:
print(f"loss: {J} \t") # printing the loss after every 100 iterations
print(f"loss: {j} \t") # printing the loss after every 100 iterations
return theta
@@ -61,23 +61,23 @@ def logistic_reg(alpha, X, y, max_iterations=70000):
if __name__ == "__main__":
iris = datasets.load_iris()
X = iris.data[:, :2]
x = iris.data[:, :2]
y = (iris.target != 0) * 1
alpha = 0.1
theta = logistic_reg(alpha, X, y, max_iterations=70000)
theta = logistic_reg(alpha, x, y, max_iterations=70000)
print("theta: ", theta) # printing the theta i.e our weights vector
def predict_prob(X):
def predict_prob(x):
return sigmoid_function(
np.dot(X, theta)
np.dot(x, theta)
) # predicting the value of probability from the logistic regression algorithm
plt.figure(figsize=(10, 6))
plt.scatter(X[y == 0][:, 0], X[y == 0][:, 1], color="b", label="0")
plt.scatter(X[y == 1][:, 0], X[y == 1][:, 1], color="r", label="1")
(x1_min, x1_max) = (X[:, 0].min(), X[:, 0].max())
(x2_min, x2_max) = (X[:, 1].min(), X[:, 1].max())
plt.scatter(x[y == 0][:, 0], x[y == 0][:, 1], color="b", label="0")
plt.scatter(x[y == 1][:, 0], x[y == 1][:, 1], color="r", label="1")
(x1_min, x1_max) = (x[:, 0].min(), x[:, 0].max())
(x2_min, x2_max) = (x[:, 1].min(), x[:, 1].max())
(xx1, xx2) = np.meshgrid(np.linspace(x1_min, x1_max), np.linspace(x2_min, x2_max))
grid = np.c_[xx1.ravel(), xx2.ravel()]
probs = predict_prob(grid).reshape(xx1.shape)

View File

@@ -15,12 +15,12 @@ test = [[0.0, 0.0], [0.0, 1.0], [1.0, 1.0]]
Y = clf.predict(test)
def wrapper(Y):
def wrapper(y):
"""
>>> wrapper(Y)
[0, 0, 1]
"""
return list(Y)
return list(y)
if __name__ == "__main__":

View File

@@ -17,10 +17,10 @@ def main():
iris = load_iris()
# Split dataset into train and test data
X = iris["data"] # features
Y = iris["target"]
x = iris["data"] # features
y = iris["target"]
x_train, x_test, y_train, y_test = train_test_split(
X, Y, test_size=0.3, random_state=1
x, y, test_size=0.3, random_state=1
)
# Random Forest Classifier

View File

@@ -17,10 +17,10 @@ def main():
print(boston.keys())
# Split dataset into train and test data
X = boston["data"] # features
Y = boston["target"]
x = boston["data"] # features
y = boston["target"]
x_train, x_test, y_train, y_test = train_test_split(
X, Y, test_size=0.3, random_state=1
x, y, test_size=0.3, random_state=1
)
# Random Forest Regressor

View File

@@ -80,7 +80,7 @@ class SmoSVM:
# Calculate alphas using SMO algorithm
def fit(self):
K = self._k
k = self._k
state = None
while True:
@@ -106,14 +106,14 @@ class SmoSVM:
# 3: update threshold(b)
b1_new = np.float64(
-e1
- y1 * K(i1, i1) * (a1_new - a1)
- y2 * K(i2, i1) * (a2_new - a2)
- y1 * k(i1, i1) * (a1_new - a1)
- y2 * k(i2, i1) * (a2_new - a2)
+ self._b
)
b2_new = np.float64(
-e2
- y2 * K(i2, i2) * (a2_new - a2)
- y1 * K(i1, i2) * (a1_new - a1)
- y2 * k(i2, i2) * (a2_new - a2)
- y1 * k(i1, i2) * (a1_new - a1)
+ self._b
)
if 0.0 < a1_new < self._c:
@@ -134,8 +134,8 @@ class SmoSVM:
if s == i1 or s == i2:
continue
self._error[s] += (
y1 * (a1_new - a1) * K(i1, s)
+ y2 * (a2_new - a2) * K(i2, s)
y1 * (a1_new - a1) * k(i1, s)
+ y2 * (a2_new - a2) * k(i2, s)
+ (self._b - b_old)
)
@@ -305,56 +305,56 @@ class SmoSVM:
# Get the new alpha2 and new alpha1
def _get_new_alpha(self, i1, i2, a1, a2, e1, e2, y1, y2):
K = self._k
k = self._k
if i1 == i2:
return None, None
# calculate L and H which bound the new alpha2
s = y1 * y2
if s == -1:
L, H = max(0.0, a2 - a1), min(self._c, self._c + a2 - a1)
l, h = max(0.0, a2 - a1), min(self._c, self._c + a2 - a1)
else:
L, H = max(0.0, a2 + a1 - self._c), min(self._c, a2 + a1)
if L == H:
l, h = max(0.0, a2 + a1 - self._c), min(self._c, a2 + a1)
if l == h: # noqa: E741
return None, None
# calculate eta
k11 = K(i1, i1)
k22 = K(i2, i2)
k12 = K(i1, i2)
k11 = k(i1, i1)
k22 = k(i2, i2)
k12 = k(i1, i2)
eta = k11 + k22 - 2.0 * k12
# select the new alpha2 which could get the minimal objectives
if eta > 0.0:
a2_new_unc = a2 + (y2 * (e1 - e2)) / eta
# a2_new has a boundary
if a2_new_unc >= H:
a2_new = H
elif a2_new_unc <= L:
a2_new = L
if a2_new_unc >= h:
a2_new = h
elif a2_new_unc <= l:
a2_new = l
else:
a2_new = a2_new_unc
else:
b = self._b
l1 = a1 + s * (a2 - L)
h1 = a1 + s * (a2 - H)
l1 = a1 + s * (a2 - l)
h1 = a1 + s * (a2 - h)
# way 1
f1 = y1 * (e1 + b) - a1 * K(i1, i1) - s * a2 * K(i1, i2)
f2 = y2 * (e2 + b) - a2 * K(i2, i2) - s * a1 * K(i1, i2)
f1 = y1 * (e1 + b) - a1 * k(i1, i1) - s * a2 * k(i1, i2)
f2 = y2 * (e2 + b) - a2 * k(i2, i2) - s * a1 * k(i1, i2)
ol = (
l1 * f1
+ L * f2
+ 1 / 2 * l1**2 * K(i1, i1)
+ 1 / 2 * L**2 * K(i2, i2)
+ s * L * l1 * K(i1, i2)
+ l * f2
+ 1 / 2 * l1**2 * k(i1, i1)
+ 1 / 2 * l**2 * k(i2, i2)
+ s * l * l1 * k(i1, i2)
)
oh = (
h1 * f1
+ H * f2
+ 1 / 2 * h1**2 * K(i1, i1)
+ 1 / 2 * H**2 * K(i2, i2)
+ s * H * h1 * K(i1, i2)
+ h * f2
+ 1 / 2 * h1**2 * k(i1, i1)
+ 1 / 2 * h**2 * k(i2, i2)
+ s * h * h1 * k(i1, i2)
)
"""
# way 2
@@ -362,9 +362,9 @@ class SmoSVM:
objectives
"""
if ol < (oh - self._eps):
a2_new = L
a2_new = l
elif ol > oh + self._eps:
a2_new = H
a2_new = h
else:
a2_new = a2
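# Aside (illustrative sketch, not part of this commit): pycodestyle's E741 flags
# the ambiguous single-letter names l, O and I, which is why the lowercased
# bound above carries a `# noqa: E741` marker. Renaming instead of suppressing
# also works; `low` here is a hypothetical stand-in for that bound:
l = max(0.0, 0.3 - 0.1)    # noqa: E741  -- keeps the short name, silences E741
low = max(0.0, 0.3 - 0.1)  # renaming removes the need for any noqa comment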

View File

@@ -83,7 +83,7 @@ the third document in the corpus.")
return (len([doc for doc in docs if term in doc]), len(docs))
def inverse_document_frequency(df: int, N: int, smoothing=False) -> float:
def inverse_document_frequency(df: int, n: int, smoothing=False) -> float:
"""
Return an integer denoting the importance
of a word. This measure of importance is
@@ -109,15 +109,15 @@ def inverse_document_frequency(df: int, N: int, smoothing=False) -> float:
1.477
"""
if smoothing:
if N == 0:
if n == 0:
raise ValueError("log10(0) is undefined.")
return round(1 + log10(N / (1 + df)), 3)
return round(1 + log10(n / (1 + df)), 3)
if df == 0:
raise ZeroDivisionError("df must be > 0")
elif N == 0:
elif n == 0:
raise ValueError("log10(0) is undefined.")
return round(log10(N / df), 3)
return round(log10(n / df), 3)
def tf_idf(tf: int, idf: int) -> float:
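# Aside (illustrative numeric check, not part of this commit): with a corpus of
# n = 3 documents and a term appearing in df = 1 of them, the two branches of
# inverse_document_frequency above reduce to:
from math import log10
assert round(log10(3 / 1), 3) == 0.477             # unsmoothed: log10(n / df)
assert round(1 + log10(3 / (1 + 1)), 3) == 1.176   # smoothed: 1 + log10(n / (1 + df))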