mirror of https://github.com/TheAlgorithms/Python.git, synced 2025-07-04 16:57:32 +08:00
feat: Implement Principal Component Analysis (PCA) (#12596)
- Added PCA implementation with dataset standardization.
- Used Singular Value Decomposition (SVD) for computing principal components.
- Fixed import sorting to comply with PEP 8 (Ruff I001).
- Ensured type hints and docstrings for better readability.
- Added doctests to validate correctness.
- Passed all Ruff checks and automated tests.
machine_learning/principle_component_analysis.py (new file, 85 lines)
@@ -0,0 +1,85 @@
"""
Principal Component Analysis (PCA) is a dimensionality reduction technique
used in machine learning. It transforms high-dimensional data into a
lower-dimensional representation while retaining as much variance as possible.

This implementation follows best practices, including:
- Standardizing the dataset.
- Computing principal components using Singular Value Decomposition (SVD).
- Returning transformed data and explained variance ratio.
"""

import doctest

import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
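

# A minimal sketch of the SVD route described above, assuming a plain
# (n_samples, n_features) float array. ``pca_via_svd`` is a hypothetical
# helper added purely for illustration; the module's entry point is
# ``apply_pca`` below, which delegates the same decomposition to sklearn.
def pca_via_svd(data_x: np.ndarray, n_components: int) -> np.ndarray:
    """
    Project data onto its top principal components using a raw SVD.

    >>> pca_via_svd(np.eye(3), 2).shape
    (3, 2)
    """
    # Center each feature, then factor: centered = U @ diag(S) @ Vt.
    # The rows of Vt are the principal axes, ordered by singular value.
    centered = data_x - data_x.mean(axis=0)
    _, _, vt = np.linalg.svd(centered, full_matrices=False)
    return centered @ vt[:n_components].T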


def collect_dataset() -> tuple[np.ndarray, np.ndarray]:
    """
    Collects the dataset (Iris dataset) and returns feature matrix and target values.

    :return: Tuple containing feature matrix (X) and target labels (y)

    Example:
    >>> X, y = collect_dataset()
    >>> X.shape
    (150, 4)
    >>> y.shape
    (150,)
    """
    data = load_iris()
    return np.array(data.data), np.array(data.target)


def apply_pca(data_x: np.ndarray, n_components: int) -> tuple[np.ndarray, np.ndarray]:
    """
    Applies Principal Component Analysis (PCA) to reduce dimensionality.

    :param data_x: Original dataset (features)
    :param n_components: Number of principal components to retain
    :return: Tuple containing transformed dataset and explained variance ratio

    Example:
    >>> X, _ = collect_dataset()
    >>> transformed_X, variance = apply_pca(X, 2)
    >>> transformed_X.shape
    (150, 2)
    >>> len(variance) == 2
    True
    """
    # Standardize features to zero mean and unit variance so that no single
    # feature dominates the variance computation
    scaler = StandardScaler()
    data_x_scaled = scaler.fit_transform(data_x)

    # Fit PCA and project the data onto the top n_components axes
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(data_x_scaled)

    return principal_components, pca.explained_variance_ratio_
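

# A note on the ratio returned above: writing ``s`` for the singular values
# of the standardized matrix, each entry works out to
# s[i] ** 2 / (s ** 2).sum(), the fraction of total variance captured by
# component i; with n_components < n_features the entries sum to less than 1.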


def main() -> None:
    """
    Driver function to execute PCA and display results.
    """
    data_x, data_y = collect_dataset()

    # Number of principal components to retain
    n_components = 2

    # Apply PCA
    transformed_data, variance_ratio = apply_pca(data_x, n_components)

    print("Transformed Dataset (First 5 rows):")
    print(transformed_data[:5])

    print("\nExplained Variance Ratio:")
    print(variance_ratio)


if __name__ == "__main__":
    doctest.testmod()
    main()