import gradio as gr
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import load_breast_cancer, load_digits, load_iris, load_wine
from sklearn.preprocessing import StandardScaler

High-dimensional data often contains redundancy, noise, and dependencies between features, making it challenging to analyze efficiently. Dimension reduction techniques aim to transform data from a high-dimensional space \(\mathbb{R}^d\) to a lower-dimensional representation \(\mathbb{R}^s\) while preserving its essential characteristics.
Two common approaches to dimension reduction are:

- Principal Component Analysis (PCA), a linear method that projects the data onto the directions of maximal variance.
- Kernel PCA, a non-linear extension that performs PCA in a kernel-induced feature space.
This notebook explores both methods and provides an interactive tool to analyze real-world datasets.
Given a dataset \(X = (x_1, \dots, x_n) \in \mathbb{R}^{d \times n}\), we assume zero mean \(\bar{x} = 0\). The goal of PCA is to find a new orthonormal basis that maximizes the variance of the projected data.
1. Compute the covariance matrix:
   \[ C = \frac{1}{n} X X^T \in \mathbb{R}^{d \times d} \]
2. Solve the eigenvalue problem
   \[ C v_k = \lambda_k v_k, \]
   where \(\lambda_k\) and \(v_k\) are the eigenvalues and eigenvectors of \(C\).
3. Select the top \(s\) eigenvectors \(V = (v_1, \dots, v_s)\) to form the projection matrix.
4. Compute the dimension-reduced data:
   \[ Z = V^T X \in \mathbb{R}^{s \times n} \]
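As a concrete illustration, here is a minimal NumPy sketch of these four steps on a tiny zero-mean toy matrix (the numbers are arbitrary; the reusable implementation used in this notebook is compute_pca, defined below):

```python
import numpy as np

# Toy data: d = 2 features, n = 4 samples, already centered (rows sum to zero)
X = np.array([[2.0, 0.0, -2.0, 0.0],
              [1.0, 1.5, -1.0, -1.5]])
C = X @ X.T / X.shape[1]                        # step 1: covariance matrix
eigvals, eigvecs = np.linalg.eigh(C)            # step 2: eigendecomposition
V = eigvecs[:, eigvals.argsort()[::-1]][:, :1]  # step 3: top s = 1 eigenvector
Z = V.T @ X                                     # step 4: reduced data, shape (1, n)
```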
We apply PCA to datasets such as Iris, Digits, Breast Cancer, and Wine, and visualize the resulting low-dimensional representation.
PCA is limited to linear transformations, which can fail when the data is not linearly separable. Kernel PCA overcomes this by implicitly mapping the data into a high-dimensional feature space \(\mathbb{R}^D\) whose inner products are given by a kernel function \(k(x, y)\), and then applying PCA in that space.
The steps parallel those of linear PCA:

1. Implicitly map the data into the feature space:
   \[ \phi_i = \Phi(x_i) \in \mathbb{R}^D \]
2. Compute the kernel matrix of pairwise inner products:
   \[ K_{ij} = k(x_i, x_j) = \langle \Phi(x_i), \Phi(x_j) \rangle \]
3. Center the kernel matrix, where \(1_n\) denotes the \(n \times n\) matrix of ones:
   \[ \tilde{K} = K - \frac{1}{n} K 1_n - \frac{1}{n} 1_n K + \frac{1}{n^2} 1_n K 1_n \]
4. Solve the eigenvalue problem:
   \[ n \lambda_k \alpha_k = \tilde{K} \alpha_k \]
5. Project onto the top \(s\) eigenvectors to obtain the reduced data:
   \[ Z = (\alpha_1, \dots, \alpha_s)^T \tilde{K} \]
Common kernels:

- Linear: \(k(x, y) = x^T y\)
- Polynomial: \(k(x, y) = (x^T y + 1)^p\) with degree \(p\)
- RBF (Gaussian): \(k(x, y) = \exp(-\gamma \|x - y\|^2)\)
We apply Kernel PCA to the same datasets to reveal non-linear structure. The interactive tool allows users to compare PCA and Kernel PCA with different kernels.
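To make the procedure concrete, here is a minimal, self-contained sketch of the steps above on synthetic two-circle data with an RBF kernel (the dataset and the value of \(\gamma\) are illustrative choices, not part of the app below):

```python
import numpy as np

# Synthetic data: two concentric circles, shape (d = 2, n = 100)
rng = np.random.default_rng(0)
n = 100
radii = np.repeat([1.0, 3.0], n // 2)
angles = rng.uniform(0, 2 * np.pi, n)
X = np.vstack([radii * np.cos(angles), radii * np.sin(angles)])

# RBF kernel matrix K_ij = exp(-gamma * ||x_i - x_j||^2)
gamma = 0.5
sq_norms = (X**2).sum(axis=0)
K = np.exp(-gamma * (sq_norms[:, None] + sq_norms[None, :] - 2 * X.T @ X))

# Centering: K~ = K - JK - KJ + JKJ with J = ones(n, n) / n
J = np.ones((n, n)) / n
K_tilde = K - J @ K - K @ J + J @ K @ J

# Eigendecomposition (eigh returns ascending order) and 1-D projection
eigvals, eigvecs = np.linalg.eigh(K_tilde)
alpha = eigvecs[:, -1] / np.sqrt(eigvals[-1])  # normalized top eigenvector
z = alpha @ K_tilde

# The two circles, which no linear projection can separate, should now
# separate along this single component
print(z[: n // 2].mean(), z[n // 2 :].mean())
```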
def svd_low_rank_approximation(A: np.ndarray, p: int) -> np.ndarray:
"""Compute pseudo-inverse using low-rank SVD approximation"""
U, s, Vt = np.linalg.svd(A, full_matrices=False)
S_inv = np.diag(1 / s[:p])
return Vt[:p].T @ S_inv @ U[:, :p].T
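As a quick sanity check (illustrative, not used by the app): when p equals the full rank of A, the result should coincide with NumPy's pseudo-inverse.

```python
# Illustrative check: for p equal to A's full rank, the truncated-SVD
# pseudo-inverse matches np.linalg.pinv (A is full rank almost surely)
A = np.random.default_rng(1).normal(size=(5, 3))
assert np.allclose(svd_low_rank_approximation(A, p=3), np.linalg.pinv(A))
```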
def compute_pca(X: np.ndarray, s: int) -> tuple[np.ndarray, dict]:
"""PCA implementation with variance explained metrics"""
X_centered = X - X.mean(axis=1, keepdims=True)
C = X_centered @ X_centered.T / X.shape[1]
eigvals, eigvecs = np.linalg.eigh(C)
idx = eigvals.argsort()[::-1]
V = eigvecs[:, idx][:, :s]
Z = V.T @ X_centered
metrics = {
"explained_variance": eigvals[idx][:s].tolist(),
"total_variance_ratio": eigvals[idx][:s].sum() / eigvals.sum(),
}
return Z, metrics
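A quick cross-check (illustrative): the variance ratio reported by compute_pca should agree with scikit-learn's PCA on the same standardized data, since the ratio is invariant to the \(1/n\) versus \(1/(n-1)\) normalization.

```python
# Illustrative cross-check against scikit-learn (not part of the app)
from sklearn.decomposition import PCA

X_iris = StandardScaler().fit_transform(load_iris().data).T  # (d, n) layout
Z_iris, pca_metrics = compute_pca(X_iris, s=2)
sk_ratio = PCA(n_components=2).fit(X_iris.T).explained_variance_ratio_.sum()
print(pca_metrics["total_variance_ratio"], sk_ratio)  # should match closely
```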
def kernel_matrix(
X: np.ndarray, kernel: str, gamma: float = 1.0, degree: int = 2
) -> np.ndarray:
"""Compute centered kernel matrix with RBF/poly/linear kernels"""
n = X.shape[1]
K = np.zeros((n, n))
# Kernel computations
for i in range(n):
for j in range(n):
if kernel == "rbf":
K[i, j] = np.exp(-gamma * np.linalg.norm(X[:, i] - X[:, j]) ** 2)
elif kernel == "poly":
K[i, j] = (X[:, i] @ X[:, j] + 1) ** degree
else: # linear kernel
K[i, j] = X[:, i] @ X[:, j]
# Center kernel matrix
J = np.ones((n, n)) / n
K_centered = K - J @ K - K @ J + J @ K @ J
return K_centered
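One property worth verifying (illustrative): centering forces every row and column of the kernel matrix to have numerically zero mean.

```python
# Illustrative check on random data: row/column means vanish after centering
X_demo = np.random.default_rng(2).normal(size=(4, 30))  # (d, n) layout
K_c = kernel_matrix(X_demo, "rbf", gamma=0.1)
print(np.abs(K_c.mean(axis=0)).max(), np.abs(K_c.mean(axis=1)).max())  # both ~0
```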
def compute_kpca(
X: np.ndarray, kernel: str, s: int, **kernel_params
) -> tuple[np.ndarray, dict]:
"""Kernel PCA implementation with automatic centering"""
K = kernel_matrix(X, kernel, **kernel_params)
eigvals, eigvecs = np.linalg.eigh(K)
idx = eigvals.argsort()[::-1]
alpha = eigvecs[:, idx][:, :s]
    for i in range(s):
        # Normalize the coefficient vectors; clip tiny non-positive eigenvalues
        # caused by floating-point round-off to avoid division by zero
        alpha[:, i] /= np.sqrt(max(eigvals[idx][i], 1e-12))
Z = alpha.T @ K
metrics = {
"top_eigenvalues": eigvals[idx][:s].tolist(),
"kernel_matrix_sample": K[:3, :3].tolist(),
}
    return Z, metrics

def create_projection_plot(Z: np.ndarray, labels: np.ndarray) -> go.Figure:
"""Create interactive 3D/2D visualization of projected data"""
dim = Z.shape[0]
if dim == 1:
Z = np.vstack([Z, np.zeros_like(Z)])
df = {
"PC1": Z[0],
"PC2": Z[1],
"PC3": Z[2] if dim > 2 else np.zeros_like(Z[0]),
"Class": labels.astype(str),
}
fig = (
px.scatter_3d(df, x="PC1", y="PC2", z="PC3", color="Class")
if dim > 2
else px.scatter(df, x="PC1", y="PC2", color="Class")
)
fig.update_layout(height=600, scene_camera=dict(up=dict(x=0, y=0, z=1)))
    return fig

# Map dataset display names to sklearn loaders; the names also populate
# the Dataset dropdown in the UI below
dataset_map = {
    "Iris": load_iris,
    "Digits": load_digits,
    "Breast Cancer": load_breast_cancer,
    "Wine": load_wine,
}
dataset_names = list(dataset_map)

def analyze_data(
dataset_name: str,
method: str,
n_components: int,
kernel: str,
gamma: float,
degree: int,
) -> tuple[go.Figure, dict]:
"""Main analysis function handling dataset loading and processing"""
# Load dataset
data = dataset_map[dataset_name]()
X = StandardScaler().fit_transform(data.data).T
labels = data.target
# Compute projections
if method == "PCA":
Z, metrics = compute_pca(X, n_components)
metric_key = "PCA Metrics"
else:
Z, metrics = compute_kpca(X, kernel, n_components, gamma=gamma, degree=degree)
metric_key = "Kernel PCA Metrics"
# Generate visualization
fig = create_projection_plot(Z, labels)
return fig, {metric_key: metrics}
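The function can also be called directly, outside the UI; the argument values below are arbitrary examples.

```python
# Direct call outside the UI (argument values are arbitrary examples)
fig_demo, result = analyze_data("Iris", "Kernel PCA", 2, "rbf", gamma=0.1, degree=2)
print(result["Kernel PCA Metrics"]["top_eigenvalues"])
```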
with gr.Blocks(
css="""gradio-app {background: #222222 !important}""",
title="Dimension Reduction Explorer",
) as demo:
with gr.Row():
dataset = gr.Dropdown(dataset_names, label="Dataset", value=dataset_names[0])
method = gr.Radio(["PCA", "Kernel PCA"], label="Method", value="PCA")
components = gr.Slider(1, 3, value=3, step=1, label="Components")
with gr.Accordion("Kernel Parameters", open=False):
kernel = gr.Dropdown(["linear", "rbf", "poly"], value="linear", label="Kernel")
        gamma = gr.Slider(0.01, 1.0, value=0.1, label="RBF Gamma", visible=False)
degree = gr.Slider(
1, 5, value=2, step=1, label="Polynomial Degree", visible=False
)
analyze_btn = gr.Button("Analyze", variant="primary")
with gr.Tabs():
with gr.Tab("Projection"):
plot = gr.Plot(label="Data Projection")
with gr.Tab("Metrics"):
metrics = gr.JSON(label="Analysis Metrics")
kernel.change(
lambda k: (gr.update(visible=k == "rbf"), gr.update(visible=k == "poly")),
inputs=kernel,
outputs=[gamma, degree],
)
kwargs = dict(
fn=analyze_data,
inputs=[dataset, method, components, kernel, gamma, degree],
outputs=[plot, metrics],
)
analyze_btn.click(**kwargs)
components.change(**kwargs)
demo.load(**kwargs)

demo.launch()