mirror of https://github.com/TheAlgorithms/Python.git, synced 2025-07-04 16:57:32 +08:00
feat: Implement Principal Component Analysis (PCA) (#12596)
- Added PCA implementation with dataset standardization.
- Used Singular Value Decomposition (SVD) for computing principal components.
- Fixed import sorting to comply with PEP 8 (Ruff I001).
- Ensured type hints and docstrings for better readability.
- Added doctests to validate correctness.
- Passed all Ruff checks and automated tests.
machine_learning/principle_component_analysis.py (new file, 85 lines)
@@ -0,0 +1,85 @@
"""
Principal Component Analysis (PCA) is a dimensionality reduction technique
used in machine learning. It transforms high-dimensional data into a
lower-dimensional representation while retaining as much variance as possible.

This implementation follows best practices, including:
- Standardizing the dataset.
- Computing principal components using Singular Value Decomposition (SVD).
- Returning transformed data and explained variance ratio.
"""

import doctest

import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
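

# A minimal sketch of the SVD route described above, assuming a plain
# (n_samples, n_features) float array. ``pca_via_svd`` is a hypothetical
# helper added purely for illustration; the module's entry point is
# ``apply_pca`` below, which delegates the same decomposition to sklearn.
def pca_via_svd(data_x: np.ndarray, n_components: int) -> np.ndarray:
    """
    Project data onto its top principal components using a raw SVD.

    >>> pca_via_svd(np.eye(3), 2).shape
    (3, 2)
    """
    # Center each feature, then factor: centered = U @ diag(S) @ Vt.
    # The rows of Vt are the principal axes, ordered by singular value.
    centered = data_x - data_x.mean(axis=0)
    _, _, vt = np.linalg.svd(centered, full_matrices=False)
    return centered @ vt[:n_components].T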


def collect_dataset() -> tuple[np.ndarray, np.ndarray]:
    """
    Collects the dataset (Iris dataset) and returns feature matrix and target values.

    :return: Tuple containing feature matrix (X) and target labels (y)

    Example:
    >>> X, y = collect_dataset()
    >>> X.shape
    (150, 4)
    >>> y.shape
    (150,)
    """
    data = load_iris()
    return np.array(data.data), np.array(data.target)


def apply_pca(data_x: np.ndarray, n_components: int) -> tuple[np.ndarray, np.ndarray]:
    """
    Applies Principal Component Analysis (PCA) to reduce dimensionality.

    :param data_x: Original dataset (features)
    :param n_components: Number of principal components to retain
    :return: Tuple containing transformed dataset and explained variance ratio

    Example:
    >>> X, _ = collect_dataset()
    >>> transformed_X, variance = apply_pca(X, 2)
    >>> transformed_X.shape
    (150, 2)
    >>> len(variance) == 2
    True
    """
    # Standardize features to zero mean and unit variance so that no single
    # feature dominates the variance computation
    scaler = StandardScaler()
    data_x_scaled = scaler.fit_transform(data_x)

    # Fit PCA and project the data onto the top n_components axes
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(data_x_scaled)

    return principal_components, pca.explained_variance_ratio_
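

# A note on the ratio returned above: writing ``s`` for the singular values
# of the standardized matrix, each entry works out to
# s[i] ** 2 / (s ** 2).sum(), the fraction of total variance captured by
# component i; with n_components < n_features the entries sum to less than 1.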


def main() -> None:
    """
    Driver function to execute PCA and display results.
    """
    data_x, data_y = collect_dataset()

    # Number of principal components to retain
    n_components = 2

    # Apply PCA
    transformed_data, variance_ratio = apply_pca(data_x, n_components)

    print("Transformed Dataset (First 5 rows):")
    print(transformed_data[:5])

    print("\nExplained Variance Ratio:")
    print(variance_ratio)


if __name__ == "__main__":
    doctest.testmod()
    main()