Add pep8-naming to pre-commit hooks and fix incorrect naming conventions (#7062)

* ci(pre-commit): Add pep8-naming to `pre-commit` hooks (#7038)
* refactor: Fix naming conventions (#7038)
* Update arithmetic_analysis/lu_decomposition.py
  Co-authored-by: Christian Clauss <cclauss@me.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* refactor(lu_decomposition): Replace `NDArray` with `ArrayLike` (#7038)
* chore: Fix naming conventions in doctests (#7038)
* fix: Temporarily disable project euler problem 104 (#7069)
* chore: Fix naming conventions in doctests (#7038)

Co-authored-by: Christian Clauss <cclauss@me.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-07-04 16:57:32 +08:00 · 2022-10-12 23:54:20 +01:00
parent e2cd982b11
commit 07e991d553
140 changed files with 1552 additions and 1536 deletions
--- a/machine_learning/decision_tree.py
+++ b/machine_learning/decision_tree.py
@ -6,7 +6,7 @@ Output: The decision tree maps a real number input to a real number output.
 import numpy as np


-class Decision_Tree:
+class DecisionTree:
    def __init__(self, depth=5, min_leaf_size=5):
        self.depth = depth
        self.decision_boundary = 0
@ -22,17 +22,17 @@ class Decision_Tree:
        @param prediction: a floating point value
        return value: mean_squared_error calculates the error if prediction is used to
            estimate the labels
-        >>> tester = Decision_Tree()
+        >>> tester = DecisionTree()
        >>> test_labels = np.array([1,2,3,4,5,6,7,8,9,10])
        >>> test_prediction = np.float(6)
        >>> tester.mean_squared_error(test_labels, test_prediction) == (
-        ...     Test_Decision_Tree.helper_mean_squared_error_test(test_labels,
+        ...     TestDecisionTree.helper_mean_squared_error_test(test_labels,
        ...         test_prediction))
        True
        >>> test_labels = np.array([1,2,3])
        >>> test_prediction = np.float(2)
        >>> tester.mean_squared_error(test_labels, test_prediction) == (
-        ...     Test_Decision_Tree.helper_mean_squared_error_test(test_labels,
+        ...     TestDecisionTree.helper_mean_squared_error_test(test_labels,
        ...         test_prediction))
        True
        """
@ -41,10 +41,10 @@ class Decision_Tree:

        return np.mean((labels - prediction) ** 2)

-    def train(self, X, y):
+    def train(self, x, y):
        """
        train:
-        @param X: a one dimensional numpy array
+        @param x: a one dimensional numpy array
        @param y: a one dimensional numpy array.
        The contents of y are the labels for the corresponding x values

@ -55,17 +55,17 @@ class Decision_Tree:
        this section is to check that the inputs conform to our dimensionality
        constraints
        """
-        if X.ndim != 1:
+        if x.ndim != 1:
            print("Error: Input data set must be one dimensional")
            return
-        if len(X) != len(y):
+        if len(x) != len(y):
            print("Error: X and y have different lengths")
            return
        if y.ndim != 1:
            print("Error: Data set labels must be one dimensional")
            return

-        if len(X) < 2 * self.min_leaf_size:
+        if len(x) < 2 * self.min_leaf_size:
            self.prediction = np.mean(y)
            return

@ -74,7 +74,7 @@ class Decision_Tree:
            return

        best_split = 0
-        min_error = self.mean_squared_error(X, np.mean(y)) * 2
+        min_error = self.mean_squared_error(x, np.mean(y)) * 2

        """
        loop over all possible splits for the decision tree. find the best split.
@ -82,34 +82,34 @@ class Decision_Tree:
        then the data set is not split and the average for the entire array is used as
        the predictor
        """
-        for i in range(len(X)):
-            if len(X[:i]) < self.min_leaf_size:
+        for i in range(len(x)):
+            if len(x[:i]) < self.min_leaf_size:
                continue
-            elif len(X[i:]) < self.min_leaf_size:
+            elif len(x[i:]) < self.min_leaf_size:
                continue
            else:
-                error_left = self.mean_squared_error(X[:i], np.mean(y[:i]))
-                error_right = self.mean_squared_error(X[i:], np.mean(y[i:]))
+                error_left = self.mean_squared_error(x[:i], np.mean(y[:i]))
+                error_right = self.mean_squared_error(x[i:], np.mean(y[i:]))
                error = error_left + error_right
                if error < min_error:
                    best_split = i
                    min_error = error

        if best_split != 0:
-            left_X = X[:best_split]
+            left_x = x[:best_split]
            left_y = y[:best_split]
-            right_X = X[best_split:]
+            right_x = x[best_split:]
            right_y = y[best_split:]

-            self.decision_boundary = X[best_split]
-            self.left = Decision_Tree(
+            self.decision_boundary = x[best_split]
+            self.left = DecisionTree(
                depth=self.depth - 1, min_leaf_size=self.min_leaf_size
            )
-            self.right = Decision_Tree(
+            self.right = DecisionTree(
                depth=self.depth - 1, min_leaf_size=self.min_leaf_size
            )
-            self.left.train(left_X, left_y)
-            self.right.train(right_X, right_y)
+            self.left.train(left_x, left_y)
+            self.right.train(right_x, right_y)
        else:
            self.prediction = np.mean(y)

@ -134,7 +134,7 @@ class Decision_Tree:
            return None


-class Test_Decision_Tree:
+class TestDecisionTree:
    """Decision Tree test class"""

    @staticmethod
@ -159,11 +159,11 @@ def main():
    predict the label of 10 different test values. Then the mean squared error over
    this test is displayed.
    """
-    X = np.arange(-1.0, 1.0, 0.005)
-    y = np.sin(X)
+    x = np.arange(-1.0, 1.0, 0.005)
+    y = np.sin(x)

-    tree = Decision_Tree(depth=10, min_leaf_size=10)
-    tree.train(X, y)
+    tree = DecisionTree(depth=10, min_leaf_size=10)
+    tree.train(x, y)

    test_cases = (np.random.rand(10) * 2) - 1
    predictions = np.array([tree.predict(x) for x in test_cases])
--- a/machine_learning/gaussian_naive_bayes.py
+++ b/machine_learning/gaussian_naive_bayes.py
@ -17,19 +17,19 @@ def main():
    iris = load_iris()

    # Split dataset into train and test data
-    X = iris["data"]  # features
-    Y = iris["target"]
+    x = iris["data"]  # features
+    y = iris["target"]
    x_train, x_test, y_train, y_test = train_test_split(
-        X, Y, test_size=0.3, random_state=1
+        x, y, test_size=0.3, random_state=1
    )

    # Gaussian Naive Bayes
-    NB_model = GaussianNB()
-    NB_model.fit(x_train, y_train)
+    nb_model = GaussianNB()
+    nb_model.fit(x_train, y_train)

    # Display Confusion Matrix
    plot_confusion_matrix(
-        NB_model,
+        nb_model,
        x_test,
        y_test,
        display_labels=iris["target_names"],
--- a/machine_learning/gradient_boosting_regressor.py
+++ b/machine_learning/gradient_boosting_regressor.py
@ -26,25 +26,25 @@ def main():
    print(df_boston.describe().T)
    # Feature selection

-    X = df_boston.iloc[:, :-1]
+    x = df_boston.iloc[:, :-1]
    y = df_boston.iloc[:, -1]  # target variable
    # split the data with 75% train and 25% test sets.
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, random_state=0, test_size=0.25
+    x_train, x_test, y_train, y_test = train_test_split(
+        x, y, random_state=0, test_size=0.25
    )

    model = GradientBoostingRegressor(
        n_estimators=500, max_depth=5, min_samples_split=4, learning_rate=0.01
    )
    # training the model
-    model.fit(X_train, y_train)
+    model.fit(x_train, y_train)
    # to see how well the model fits the data
-    training_score = model.score(X_train, y_train).round(3)
-    test_score = model.score(X_test, y_test).round(3)
+    training_score = model.score(x_train, y_train).round(3)
+    test_score = model.score(x_test, y_test).round(3)
    print("Training score of GradientBoosting is :", training_score)
    print("The test score of GradientBoosting is :", test_score)
    # Let us evaluate the model by finding the errors
-    y_pred = model.predict(X_test)
+    y_pred = model.predict(x_test)

    # The mean squared error
    print(f"Mean squared error: {mean_squared_error(y_test, y_pred):.2f}")
--- a/machine_learning/k_means_clust.py
+++ b/machine_learning/k_means_clust.py
@ -69,8 +69,8 @@ def get_initial_centroids(data, k, seed=None):
    return centroids


-def centroid_pairwise_dist(X, centroids):
-    return pairwise_distances(X, centroids, metric="euclidean")
+def centroid_pairwise_dist(x, centroids):
+    return pairwise_distances(x, centroids, metric="euclidean")


 def assign_clusters(data, centroids):
@ -197,8 +197,8 @@ if False:  # change to true to run this test case.
    plot_heterogeneity(heterogeneity, k)


-def ReportGenerator(
-    df: pd.DataFrame, ClusteringVariables: np.ndarray, FillMissingReport=None
+def report_generator(
+    df: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None
 ) -> pd.DataFrame:
    """
    Function generates easy-reading clustering report. It takes 2 arguments as an input:
@ -214,7 +214,7 @@ def ReportGenerator(
    >>> data['col2'] = [100, 200, 300]
    >>> data['col3'] = [10, 20, 30]
    >>> data['Cluster'] = [1, 1, 2]
-    >>> ReportGenerator(data, ['col1', 'col2'], 0)
+    >>> report_generator(data, ['col1', 'col2'], 0)
               Features               Type   Mark           1           2
    0    # of Customers        ClusterSize  False    2.000000    1.000000
    1    % of Customers  ClusterProportion  False    0.666667    0.333333
@ -231,8 +231,8 @@ def ReportGenerator(
    [104 rows x 5 columns]
    """
    # Fill missing values with given rules
-    if FillMissingReport:
-        df.fillna(value=FillMissingReport, inplace=True)
+    if fill_missing_report:
+        df.fillna(value=fill_missing_report, inplace=True)
    df["dummy"] = 1
    numeric_cols = df.select_dtypes(np.number).columns
    report = (
@ -313,7 +313,7 @@ def ReportGenerator(
    report = pd.concat(
        [report, a, clustersize, clusterproportion], axis=0
    )  # concat report with cluster size and nan values
-    report["Mark"] = report["Features"].isin(ClusteringVariables)
+    report["Mark"] = report["Features"].isin(clustering_variables)
    cols = report.columns.tolist()
    cols = cols[0:2] + cols[-1:] + cols[2:-1]
    report = report[cols]
--- a/machine_learning/local_weighted_learning/local_weighted_learning.py
+++ b/machine_learning/local_weighted_learning/local_weighted_learning.py
@ -41,11 +41,11 @@ def local_weight(
            [0.08272556]])
    """
    weight = weighted_matrix(point, training_data_x, bandwidth)
-    W = (training_data_x.T * (weight * training_data_x)).I * (
+    w = (training_data_x.T * (weight * training_data_x)).I * (
        training_data_x.T * weight * training_data_y.T
    )

-    return W
+    return w


 def local_weight_regression(
--- a/machine_learning/logistic_regression.py
+++ b/machine_learning/logistic_regression.py
@ -35,25 +35,25 @@ def cost_function(h, y):
    return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()


-def log_likelihood(X, Y, weights):
-    scores = np.dot(X, weights)
-    return np.sum(Y * scores - np.log(1 + np.exp(scores)))
+def log_likelihood(x, y, weights):
+    scores = np.dot(x, weights)
+    return np.sum(y * scores - np.log(1 + np.exp(scores)))


 # here alpha is the learning rate, x is the feature matrix, y is the target matrix
-def logistic_reg(alpha, X, y, max_iterations=70000):
-    theta = np.zeros(X.shape[1])
+def logistic_reg(alpha, x, y, max_iterations=70000):
+    theta = np.zeros(x.shape[1])

    for iterations in range(max_iterations):
-        z = np.dot(X, theta)
+        z = np.dot(x, theta)
        h = sigmoid_function(z)
-        gradient = np.dot(X.T, h - y) / y.size
+        gradient = np.dot(x.T, h - y) / y.size
        theta = theta - alpha * gradient  # updating the weights
-        z = np.dot(X, theta)
+        z = np.dot(x, theta)
        h = sigmoid_function(z)
-        J = cost_function(h, y)
+        j = cost_function(h, y)
        if iterations % 100 == 0:
-            print(f"loss: {J} \t")  # printing the loss after every 100 iterations
+            print(f"loss: {j} \t")  # printing the loss after every 100 iterations
    return theta


@ -61,23 +61,23 @@ def logistic_reg(alpha, X, y, max_iterations=70000):

 if __name__ == "__main__":
    iris = datasets.load_iris()
-    X = iris.data[:, :2]
+    x = iris.data[:, :2]
    y = (iris.target != 0) * 1

    alpha = 0.1
-    theta = logistic_reg(alpha, X, y, max_iterations=70000)
+    theta = logistic_reg(alpha, x, y, max_iterations=70000)
    print("theta: ", theta)  # printing the theta i.e our weights vector

-    def predict_prob(X):
+    def predict_prob(x):
        return sigmoid_function(
-            np.dot(X, theta)
+            np.dot(x, theta)
        )  # predicting the value of probability from the logistic regression algorithm

    plt.figure(figsize=(10, 6))
-    plt.scatter(X[y == 0][:, 0], X[y == 0][:, 1], color="b", label="0")
-    plt.scatter(X[y == 1][:, 0], X[y == 1][:, 1], color="r", label="1")
-    (x1_min, x1_max) = (X[:, 0].min(), X[:, 0].max())
-    (x2_min, x2_max) = (X[:, 1].min(), X[:, 1].max())
+    plt.scatter(x[y == 0][:, 0], x[y == 0][:, 1], color="b", label="0")
+    plt.scatter(x[y == 1][:, 0], x[y == 1][:, 1], color="r", label="1")
+    (x1_min, x1_max) = (x[:, 0].min(), x[:, 0].max())
+    (x2_min, x2_max) = (x[:, 1].min(), x[:, 1].max())
    (xx1, xx2) = np.meshgrid(np.linspace(x1_min, x1_max), np.linspace(x2_min, x2_max))
    grid = np.c_[xx1.ravel(), xx2.ravel()]
    probs = predict_prob(grid).reshape(xx1.shape)
--- a/machine_learning/multilayer_perceptron_classifier.py
+++ b/machine_learning/multilayer_perceptron_classifier.py
@ -15,12 +15,12 @@ test = [[0.0, 0.0], [0.0, 1.0], [1.0, 1.0]]
 Y = clf.predict(test)


-def wrapper(Y):
+def wrapper(y):
    """
    >>> wrapper(Y)
    [0, 0, 1]
    """
-    return list(Y)
+    return list(y)


 if __name__ == "__main__":
--- a/machine_learning/random_forest_classifier.py
+++ b/machine_learning/random_forest_classifier.py
@ -17,10 +17,10 @@ def main():
    iris = load_iris()

    # Split dataset into train and test data
-    X = iris["data"]  # features
-    Y = iris["target"]
+    x = iris["data"]  # features
+    y = iris["target"]
    x_train, x_test, y_train, y_test = train_test_split(
-        X, Y, test_size=0.3, random_state=1
+        x, y, test_size=0.3, random_state=1
    )

    # Random Forest Classifier
--- a/machine_learning/random_forest_regressor.py
+++ b/machine_learning/random_forest_regressor.py
@ -17,10 +17,10 @@ def main():
    print(boston.keys())

    # Split dataset into train and test data
-    X = boston["data"]  # features
-    Y = boston["target"]
+    x = boston["data"]  # features
+    y = boston["target"]
    x_train, x_test, y_train, y_test = train_test_split(
-        X, Y, test_size=0.3, random_state=1
+        x, y, test_size=0.3, random_state=1
    )

    # Random Forest Regressor
--- a/machine_learning/sequential_minimum_optimization.py
+++ b/machine_learning/sequential_minimum_optimization.py
@ -80,7 +80,7 @@ class SmoSVM:

    # Calculate alphas using SMO algorithm
    def fit(self):
-        K = self._k
+        k = self._k
        state = None
        while True:

@ -106,14 +106,14 @@ class SmoSVM:
            # 3: update threshold(b)
            b1_new = np.float64(
                -e1
-                - y1 * K(i1, i1) * (a1_new - a1)
-                - y2 * K(i2, i1) * (a2_new - a2)
+                - y1 * k(i1, i1) * (a1_new - a1)
+                - y2 * k(i2, i1) * (a2_new - a2)
                + self._b
            )
            b2_new = np.float64(
                -e2
-                - y2 * K(i2, i2) * (a2_new - a2)
-                - y1 * K(i1, i2) * (a1_new - a1)
+                - y2 * k(i2, i2) * (a2_new - a2)
+                - y1 * k(i1, i2) * (a1_new - a1)
                + self._b
            )
            if 0.0 < a1_new < self._c:
@ -134,8 +134,8 @@ class SmoSVM:
                if s == i1 or s == i2:
                    continue
                self._error[s] += (
-                    y1 * (a1_new - a1) * K(i1, s)
-                    + y2 * (a2_new - a2) * K(i2, s)
+                    y1 * (a1_new - a1) * k(i1, s)
+                    + y2 * (a2_new - a2) * k(i2, s)
                    + (self._b - b_old)
                )

@ -305,56 +305,56 @@ class SmoSVM:

    # Get the new alpha2 and new alpha1
    def _get_new_alpha(self, i1, i2, a1, a2, e1, e2, y1, y2):
-        K = self._k
+        k = self._k
        if i1 == i2:
            return None, None

        # calculate L and H  which bound the new alpha2
        s = y1 * y2
        if s == -1:
-            L, H = max(0.0, a2 - a1), min(self._c, self._c + a2 - a1)
+            l, h = max(0.0, a2 - a1), min(self._c, self._c + a2 - a1)
        else:
-            L, H = max(0.0, a2 + a1 - self._c), min(self._c, a2 + a1)
-        if L == H:
+            l, h = max(0.0, a2 + a1 - self._c), min(self._c, a2 + a1)
+        if l == h:  # noqa: E741
            return None, None

        # calculate eta
-        k11 = K(i1, i1)
-        k22 = K(i2, i2)
-        k12 = K(i1, i2)
+        k11 = k(i1, i1)
+        k22 = k(i2, i2)
+        k12 = k(i1, i2)
        eta = k11 + k22 - 2.0 * k12

        # select the new alpha2 which could get the minimal objectives
        if eta > 0.0:
            a2_new_unc = a2 + (y2 * (e1 - e2)) / eta
            # a2_new has a boundary
-            if a2_new_unc >= H:
-                a2_new = H
-            elif a2_new_unc <= L:
-                a2_new = L
+            if a2_new_unc >= h:
+                a2_new = h
+            elif a2_new_unc <= l:
+                a2_new = l
            else:
                a2_new = a2_new_unc
        else:
            b = self._b
-            l1 = a1 + s * (a2 - L)
-            h1 = a1 + s * (a2 - H)
+            l1 = a1 + s * (a2 - l)
+            h1 = a1 + s * (a2 - h)

            # way 1
-            f1 = y1 * (e1 + b) - a1 * K(i1, i1) - s * a2 * K(i1, i2)
-            f2 = y2 * (e2 + b) - a2 * K(i2, i2) - s * a1 * K(i1, i2)
+            f1 = y1 * (e1 + b) - a1 * k(i1, i1) - s * a2 * k(i1, i2)
+            f2 = y2 * (e2 + b) - a2 * k(i2, i2) - s * a1 * k(i1, i2)
            ol = (
                l1 * f1
-                + L * f2
-                + 1 / 2 * l1**2 * K(i1, i1)
-                + 1 / 2 * L**2 * K(i2, i2)
-                + s * L * l1 * K(i1, i2)
+                + l * f2
+                + 1 / 2 * l1**2 * k(i1, i1)
+                + 1 / 2 * l**2 * k(i2, i2)
+                + s * l * l1 * k(i1, i2)
            )
            oh = (
                h1 * f1
-                + H * f2
-                + 1 / 2 * h1**2 * K(i1, i1)
-                + 1 / 2 * H**2 * K(i2, i2)
-                + s * H * h1 * K(i1, i2)
+                + h * f2
+                + 1 / 2 * h1**2 * k(i1, i1)
+                + 1 / 2 * h**2 * k(i2, i2)
+                + s * h * h1 * k(i1, i2)
            )
            """
            # way 2
@ -362,9 +362,9 @@ class SmoSVM:
            objectives
            """
            if ol < (oh - self._eps):
-                a2_new = L
+                a2_new = l
            elif ol > oh + self._eps:
-                a2_new = H
+                a2_new = h
            else:
                a2_new = a2

--- a/machine_learning/word_frequency_functions.py
+++ b/machine_learning/word_frequency_functions.py
@ -83,7 +83,7 @@ the third document in the corpus.")
    return (len([doc for doc in docs if term in doc]), len(docs))


-def inverse_document_frequency(df: int, N: int, smoothing=False) -> float:
+def inverse_document_frequency(df: int, n: int, smoothing=False) -> float:
    """
    Return a float denoting the importance
    of a word. This measure of importance is
@ -109,15 +109,15 @@ def inverse_document_frequency(df: int, N: int, smoothing=False) -> float:
    1.477
    """
    if smoothing:
-        if N == 0:
+        if n == 0:
            raise ValueError("log10(0) is undefined.")
-        return round(1 + log10(N / (1 + df)), 3)
+        return round(1 + log10(n / (1 + df)), 3)

    if df == 0:
        raise ZeroDivisionError("df must be > 0")
-    elif N == 0:
+    elif n == 0:
        raise ValueError("log10(0) is undefined.")
-    return round(log10(N / df), 3)
+    return round(log10(n / df), 3)


 def tf_idf(tf: int, idf: int) -> float: