psf/black code formatting (#1277)

2025-07-05 09:21:13 +08:00 · 2019-10-05 01:14:13 -04:00
parent 07f04a2e55
commit 9eac17a408
291 changed files with 6014 additions and 4571 deletions
--- a/machine_learning/sequential_minimum_optimization.py
+++ b/machine_learning/sequential_minimum_optimization.py
@ -41,11 +41,20 @@ import pandas as pd
 from sklearn.datasets import make_blobs, make_circles
 from sklearn.preprocessing import StandardScaler

-CANCER_DATASET_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
+CANCER_DATASET_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"


 class SmoSVM(object):
-    def __init__(self, train, kernel_func, alpha_list=None, cost=0.4, b=0.0, tolerance=0.001, auto_norm=True):
+    def __init__(
+        self,
+        train,
+        kernel_func,
+        alpha_list=None,
+        cost=0.4,
+        b=0.0,
+        tolerance=0.001,
+        auto_norm=True,
+    ):
        self._init = True
        self._auto_norm = auto_norm
        self._c = np.float64(cost)
@ -91,13 +100,25 @@ class SmoSVM(object):
            self.alphas[i1], self.alphas[i2] = a1_new, a2_new

            # 3: update threshold(b)
-            b1_new = np.float64(-e1 - y1 * K(i1, i1) * (a1_new - a1) - y2 * K(i2, i1) * (a2_new - a2) + self._b)
-            b2_new = np.float64(-e2 - y2 * K(i2, i2) * (a2_new - a2) - y1 * K(i1, i2) * (a1_new - a1) + self._b)
+            b1_new = np.float64(
+                -e1
+                - y1 * K(i1, i1) * (a1_new - a1)
+                - y2 * K(i2, i1) * (a2_new - a2)
+                + self._b
+            )
+            b2_new = np.float64(
+                -e2
+                - y2 * K(i2, i2) * (a2_new - a2)
+                - y1 * K(i1, i2) * (a1_new - a1)
+                + self._b
+            )
            if 0.0 < a1_new < self._c:
                b = b1_new
            if 0.0 < a2_new < self._c:
                b = b2_new
-            if not (np.float64(0) < a2_new < self._c) and not (np.float64(0) < a1_new < self._c):
+            if not (np.float64(0) < a2_new < self._c) and not (
+                np.float64(0) < a1_new < self._c
+            ):
                b = (b1_new + b2_new) / 2.0
            b_old = self._b
            self._b = b
@ -107,7 +128,11 @@ class SmoSVM(object):
            for s in self.unbound:
                if s == i1 or s == i2:
                    continue
-                self._error[s] += y1 * (a1_new - a1) * K(i1, s) + y2 * (a2_new - a2) * K(i2, s) + (self._b - b_old)
+                self._error[s] += (
+                    y1 * (a1_new - a1) * K(i1, s)
+                    + y2 * (a2_new - a2) * K(i2, s)
+                    + (self._b - b_old)
+                )

            # if i1 or i2 is non-bound, update their error value to zero
            if self._is_unbound(i1):
@ -119,7 +144,9 @@ class SmoSVM(object):
    def predict(self, test_samples, classify=True):

        if test_samples.shape[1] > self.samples.shape[1]:
-            raise ValueError("Test samples' feature length does not equal to that of train samples")
+            raise ValueError(
+                "Test samples' feature length does not equal to that of train samples"
+            )

        if self._auto_norm:
            test_samples = self._norm(test_samples)
@ -173,14 +200,23 @@ class SmoSVM(object):
        k_matrix = np.zeros([self.length, self.length])
        for i in self._all_samples:
            for j in self._all_samples:
-                k_matrix[i, j] = np.float64(self.Kernel(self.samples[i, :], self.samples[j, :]))
+                k_matrix[i, j] = np.float64(
+                    self.Kernel(self.samples[i, :], self.samples[j, :])
+                )
        return k_matrix

    # Predict test sample's tag
    def _predict(self, sample):
        k = self._k
-        predicted_value = np.sum(
-            [self.alphas[i1] * self.tags[i1] * k(i1, sample) for i1 in self._all_samples]) + self._b
+        predicted_value = (
+            np.sum(
+                [
+                    self.alphas[i1] * self.tags[i1] * k(i1, sample)
+                    for i1 in self._all_samples
+                ]
+            )
+            + self._b
+        )
        return predicted_value

    # Choose alpha1 and alpha2
@ -200,23 +236,27 @@ class SmoSVM(object):
        while True:
            all_not_obey = True
            # all sample
-            print('scanning all sample!')
+            print("scanning all sample!")
            for i1 in [i for i in self._all_samples if self._check_obey_kkt(i)]:
                all_not_obey = False
                yield from self._choose_a2(i1)

            # non-bound sample
-            print('scanning non-bound sample!')
+            print("scanning non-bound sample!")
            while True:
                not_obey = True
-                for i1 in [i for i in self._all_samples if self._check_obey_kkt(i) and self._is_unbound(i)]:
+                for i1 in [
+                    i
+                    for i in self._all_samples
+                    if self._check_obey_kkt(i) and self._is_unbound(i)
+                ]:
                    not_obey = False
                    yield from self._choose_a2(i1)
                if not_obey:
-                    print('all non-bound samples fit the KKT condition!')
+                    print("all non-bound samples fit the KKT condition!")
                    break
            if all_not_obey:
-                print('all samples fit the KKT condition! Optimization done!')
+                print("all samples fit the KKT condition! Optimization done!")
                break
        return False

@ -231,7 +271,11 @@ class SmoSVM(object):

        if len(self.unbound) > 0:
            tmp_error = self._error.copy().tolist()
-            tmp_error_dict = {index: value for index, value in enumerate(tmp_error) if self._is_unbound(index)}
+            tmp_error_dict = {
+                index: value
+                for index, value in enumerate(tmp_error)
+                if self._is_unbound(index)
+            }
            if self._e(i1) >= 0:
                i2 = min(tmp_error_dict, key=lambda index: tmp_error_dict[index])
            else:
@ -289,8 +333,20 @@ class SmoSVM(object):
            # way 1
            f1 = y1 * (e1 + b) - a1 * K(i1, i1) - s * a2 * K(i1, i2)
            f2 = y2 * (e2 + b) - a2 * K(i2, i2) - s * a1 * K(i1, i2)
-            ol = l1 * f1 + L * f2 + 1 / 2 * l1 ** 2 * K(i1, i1) + 1 / 2 * L ** 2 * K(i2, i2) + s * L * l1 * K(i1, i2)
-            oh = h1 * f1 + H * f2 + 1 / 2 * h1 ** 2 * K(i1, i1) + 1 / 2 * H ** 2 * K(i2, i2) + s * H * h1 * K(i1, i2)
+            ol = (
+                l1 * f1
+                + L * f2
+                + 1 / 2 * l1 ** 2 * K(i1, i1)
+                + 1 / 2 * L ** 2 * K(i2, i2)
+                + s * L * l1 * K(i1, i2)
+            )
+            oh = (
+                h1 * f1
+                + H * f2
+                + 1 / 2 * h1 ** 2 * K(i1, i1)
+                + 1 / 2 * H ** 2 * K(i2, i2)
+                + s * H * h1 * K(i1, i2)
+            )
            """
            # way 2
            Use objective function check which alpha2 new could get the minimal objectives
@ -370,14 +426,10 @@ class Kernel(object):
    def _check(self):
        if self._kernel == self._rbf:
            if self.gamma < 0:
-                raise ValueError('gamma value must greater than 0')
+                raise ValueError("gamma value must greater than 0")

    def _get_kernel(self, kernel_name):
-        maps = {
-            'linear': self._linear,
-            'poly': self._polynomial,
-            'rbf': self._rbf
-        }
+        maps = {"linear": self._linear, "poly": self._polynomial, "rbf": self._rbf}
        return maps[kernel_name]

    def __call__(self, v1, v2):
@ -390,34 +442,35 @@ class Kernel(object):
 def count_time(func):
    def call_func(*args, **kwargs):
        import time
+
        start_time = time.time()
        func(*args, **kwargs)
        end_time = time.time()
-        print('smo algorithm cost {} seconds'.format(end_time - start_time))
+        print("smo algorithm cost {} seconds".format(end_time - start_time))

    return call_func


@count_time
 def test_cancel_data():
-    print('Hello!\r\nStart test svm by smo algorithm!')
+    print("Hello!\r\nStart test svm by smo algorithm!")
    # 0: download dataset and load into pandas' dataframe
-    if not os.path.exists(r'cancel_data.csv'):
+    if not os.path.exists(r"cancel_data.csv"):
        request = urllib.request.Request(
            CANCER_DATASET_URL,
-            headers={'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
+            headers={"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"},
        )
        response = urllib.request.urlopen(request)
-        content = response.read().decode('utf-8')
-        with open(r'cancel_data.csv', 'w') as f:
+        content = response.read().decode("utf-8")
+        with open(r"cancel_data.csv", "w") as f:
            f.write(content)

-    data = pd.read_csv(r'cancel_data.csv', header=None)
+    data = pd.read_csv(r"cancel_data.csv", header=None)

    # 1: pre-processing data
    del data[data.columns.tolist()[0]]
    data = data.dropna(axis=0)
-    data = data.replace({'M': np.float64(1), 'B': np.float64(-1)})
+    data = data.replace({"M": np.float64(1), "B": np.float64(-1)})
    samples = np.array(data)[:, :]

    # 2: dividing data into train_data and test_data
@ -425,11 +478,18 @@ def test_cancel_data():
    test_tags, test_samples = test_data[:, 0], test_data[:, 1:]

    # 3: choose kernel function,and set initial alphas to zero(optional)
-    mykernel = Kernel(kernel='rbf', degree=5, coef0=1, gamma=0.5)
+    mykernel = Kernel(kernel="rbf", degree=5, coef0=1, gamma=0.5)
    al = np.zeros(train_data.shape[0])

    # 4: calculating best alphas using SMO algorithm and predict test_data samples
-    mysvm = SmoSVM(train=train_data, kernel_func=mykernel, alpha_list=al, cost=0.4, b=0.0, tolerance=0.001)
+    mysvm = SmoSVM(
+        train=train_data,
+        kernel_func=mykernel,
+        alpha_list=al,
+        cost=0.4,
+        b=0.0,
+        tolerance=0.001,
+    )
    mysvm.fit()
    predict = mysvm.predict(test_samples)

@ -439,14 +499,18 @@ def test_cancel_data():
    for i in range(test_tags.shape[0]):
        if test_tags[i] == predict[i]:
            score += 1
-    print('\r\nall: {}\r\nright: {}\r\nfalse: {}'.format(test_num, score, test_num - score))
+    print(
+        "\r\nall: {}\r\nright: {}\r\nfalse: {}".format(
+            test_num, score, test_num - score
+        )
+    )
    print("Rough Accuracy: {}".format(score / test_tags.shape[0]))


 def test_demonstration():
    # change stdout
-    print('\r\nStart plot,please wait!!!')
-    sys.stdout = open(os.devnull, 'w')
+    print("\r\nStart plot,please wait!!!")
+    sys.stdout = open(os.devnull, "w")

    ax1 = plt.subplot2grid((2, 2), (0, 0))
    ax2 = plt.subplot2grid((2, 2), (0, 1))
@ -464,32 +528,50 @@ def test_demonstration():
    sys.stdout = sys.__stdout__
    print("Plot done!!!")

+
 def test_linear_kernel(ax, cost):
-    train_x, train_y = make_blobs(n_samples=500, centers=2,
-                                  n_features=2, random_state=1)
+    train_x, train_y = make_blobs(
+        n_samples=500, centers=2, n_features=2, random_state=1
+    )
    train_y[train_y == 0] = -1
    scaler = StandardScaler()
    train_x_scaled = scaler.fit_transform(train_x, train_y)
    train_data = np.hstack((train_y.reshape(500, 1), train_x_scaled))
-    mykernel = Kernel(kernel='linear', degree=5, coef0=1, gamma=0.5)
-    mysvm = SmoSVM(train=train_data, kernel_func=mykernel, cost=cost, tolerance=0.001, auto_norm=False)
+    mykernel = Kernel(kernel="linear", degree=5, coef0=1, gamma=0.5)
+    mysvm = SmoSVM(
+        train=train_data,
+        kernel_func=mykernel,
+        cost=cost,
+        tolerance=0.001,
+        auto_norm=False,
+    )
    mysvm.fit()
    plot_partition_boundary(mysvm, train_data, ax=ax)


 def test_rbf_kernel(ax, cost):
-    train_x, train_y = make_circles(n_samples=500, noise=0.1, factor=0.1, random_state=1)
+    train_x, train_y = make_circles(
+        n_samples=500, noise=0.1, factor=0.1, random_state=1
+    )
    train_y[train_y == 0] = -1
    scaler = StandardScaler()
    train_x_scaled = scaler.fit_transform(train_x, train_y)
    train_data = np.hstack((train_y.reshape(500, 1), train_x_scaled))
-    mykernel = Kernel(kernel='rbf', degree=5, coef0=1, gamma=0.5)
-    mysvm = SmoSVM(train=train_data, kernel_func=mykernel, cost=cost, tolerance=0.001, auto_norm=False)
+    mykernel = Kernel(kernel="rbf", degree=5, coef0=1, gamma=0.5)
+    mysvm = SmoSVM(
+        train=train_data,
+        kernel_func=mykernel,
+        cost=cost,
+        tolerance=0.001,
+        auto_norm=False,
+    )
    mysvm.fit()
    plot_partition_boundary(mysvm, train_data, ax=ax)


-def plot_partition_boundary(model, train_data, ax, resolution=100, colors=('b', 'k', 'r')):
+def plot_partition_boundary(
+    model, train_data, ax, resolution=100, colors=("b", "k", "r")
+):
    """
    We can not get the optimum w of our kernel svm model which is different from linear svm.
    For this reason, we generate randomly distributed points with high density and predicted values of these points are
@ -502,25 +584,44 @@ def plot_partition_boundary(model, train_data, ax, resolution=100, colors=('b',
    train_data_tags = train_data[:, 0]
    xrange = np.linspace(train_data_x.min(), train_data_x.max(), resolution)
    yrange = np.linspace(train_data_y.min(), train_data_y.max(), resolution)
-    test_samples = np.array([(x, y) for x in xrange for y in yrange]).reshape(resolution * resolution, 2)
+    test_samples = np.array([(x, y) for x in xrange for y in yrange]).reshape(
+        resolution * resolution, 2
+    )

    test_tags = model.predict(test_samples, classify=False)
    grid = test_tags.reshape((len(xrange), len(yrange)))

    # Plot contour map which represents the partition boundary
-    ax.contour(xrange, yrange, np.mat(grid).T, levels=(-1, 0, 1), linestyles=('--', '-', '--'),
-               linewidths=(1, 1, 1),
-               colors=colors)
+    ax.contour(
+        xrange,
+        yrange,
+        np.mat(grid).T,
+        levels=(-1, 0, 1),
+        linestyles=("--", "-", "--"),
+        linewidths=(1, 1, 1),
+        colors=colors,
+    )
    # Plot all train samples
-    ax.scatter(train_data_x, train_data_y, c=train_data_tags, cmap=plt.cm.Dark2, lw=0, alpha=0.5)
+    ax.scatter(
+        train_data_x,
+        train_data_y,
+        c=train_data_tags,
+        cmap=plt.cm.Dark2,
+        lw=0,
+        alpha=0.5,
+    )

    # Plot support vectors
    support = model.support
-    ax.scatter(train_data_x[support], train_data_y[support], c=train_data_tags[support], cmap=plt.cm.Dark2)
+    ax.scatter(
+        train_data_x[support],
+        train_data_y[support],
+        c=train_data_tags[support],
+        cmap=plt.cm.Dark2,
+    )


-if __name__ == '__main__':
+if __name__ == "__main__":
    test_cancel_data()
    test_demonstration()
    plt.show()
-