Dimension Reduction

Code
import gradio as gr
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import load_breast_cancer, load_digits, load_iris, load_wine
from sklearn.preprocessing import StandardScaler

1 Introduction

High-dimensional data often contains redundancy, noise, and dependencies between features, making it challenging to analyze efficiently. Dimension reduction techniques aim to transform data from a high-dimensional space \(\mathbb{R}^d\) to a lower-dimensional representation \(\mathbb{R}^s\) while preserving its essential characteristics.

Two common approaches to dimension reduction are:

  • Linear methods: Principal Component Analysis (PCA)
  • Non-linear methods: Kernel Principal Component Analysis (Kernel PCA)

This notebook explores both methods and provides an interactive tool to analyze real-world datasets.

1.1 Principal Component Analysis (PCA)

Given a dataset \(X = (x_1, \dots, x_n) \in \mathbb{R}^{d \times n}\), we assume the data has zero mean, \(\bar{x} = 0\) (in practice, the sample mean is subtracted first). The goal of PCA is to find an orthonormal basis that maximizes the variance of the projected data.

  1. Compute the covariance matrix: \[ C = \frac{1}{n} X X^T \in \mathbb{R}^{d \times d} \]

  2. Solve the eigenvalue problem:

\[ C v_k = \lambda_k v_k \]

where \(\lambda_k\) and \(v_k\) are the eigenvalues and eigenvectors of \(C\).

  3. Select the top \(s\) eigenvectors \(V = (v_1, \dots, v_s)\) to form the projection matrix.

  4. Compute the dimension-reduced data:

\[ Z = V^T X \in \mathbb{R}^{s \times n} \]

1.2 Properties

  • The variance of the projected data is maximized.
  • The total variance explained by the top \(s\) components is: \[ \text{Var}(V^T X) = \lambda_1 + \dots + \lambda_s \]
  • The approximation error of reconstructing \(X\) from \(Z\) is minimized.
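These properties are easy to verify numerically. The following minimal sketch (NumPy only, synthetic zero-mean data) carries out the four steps above and checks that the variance of each projected coordinate equals the corresponding eigenvalue:

Code
import numpy as np

rng = np.random.default_rng(0)
d, n, s = 5, 1_000, 2
X = rng.standard_normal((d, n))        # columns are samples, as above
X = X - X.mean(axis=1, keepdims=True)  # enforce zero mean

C = X @ X.T / n                        # covariance matrix, d x d
eigvals, eigvecs = np.linalg.eigh(C)   # eigenvalues in ascending order
order = eigvals.argsort()[::-1]
V = eigvecs[:, order[:s]]              # top-s eigenvectors
Z = V.T @ X                            # reduced data, s x n

# Per-component variance of Z equals the corresponding eigenvalue
assert np.allclose(Z.var(axis=1), eigvals[order[:s]])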

We apply PCA to datasets such as Iris, Digits, Breast Cancer, and Wine, and visualize the low-dimensional representation.

1.3 Kernel Principal Component Analysis (Kernel PCA)

PCA is limited to linear transformations, which can fail when the relevant structure in the data is non-linear. Kernel PCA overcomes this by implicitly mapping the data into a high-dimensional feature space \(\mathbb{R}^D\) through a kernel function \(k(x, y)\), and then applying PCA in that space.

  1. Define a feature map \(\Phi: \mathbb{R}^d \to \mathbb{R}^D\) that implicitly transforms the data:

\[ \phi_i = \Phi(x_i) \in \mathbb{R}^D \]

  2. Compute the kernel matrix:

\[ K_{ij} = k(x_i, x_j) = \langle \Phi(x_i), \Phi(x_j) \rangle \]

  3. Center the kernel matrix, where \(1_n\) denotes the \(n \times n\) matrix of ones (a numeric check of this identity follows the list):

\[ \tilde{K} = K - \frac{1}{n} K 1_n - \frac{1}{n} 1_n K + \frac{1}{n^2} 1_n K 1_n \]

  4. Solve the eigenvalue problem:

\[ n \lambda_k \alpha_k = \tilde{K} \alpha_k \]

  5. Compute the principal components:

\[ Z = (\alpha_1, \dots, \alpha_s)^T \tilde{K} \]
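The centering in step 3 is the standard double-centering: writing \(J = \frac{1}{n} 1_n\), the four-term formula equals \(H K H\) with the centering matrix \(H = I - J\). A minimal numeric check of this identity:

Code
import numpy as np

rng = np.random.default_rng(1)
A = rng.standard_normal((6, 4))
K = A @ A.T                                # a PSD "kernel" matrix

n = K.shape[0]
J = np.ones((n, n)) / n                    # (1/n) 1_n
K_formula = K - J @ K - K @ J + J @ K @ J  # step 3, written with J

H = np.eye(n) - J                          # centering matrix I - (1/n) 1 1^T
assert np.allclose(K_formula, H @ K @ H)   # same double-centering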

Common Kernels:

  • Linear kernel: \(k(x, y) = \langle x, y \rangle\)
  • Polynomial kernel of degree \(p\): \(k(x, y) = (\langle x, y \rangle + 1)^p\)
  • RBF (Gaussian) kernel: \(k(x, y) = e^{-\gamma \|x - y\|^2}\)
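All three kernels can be evaluated for every pair of points at once with NumPy broadcasting. The sketch below is a vectorized (uncentered) alternative to the explicit double loop used by kernel_matrix in Section 2; it assumes the same samples-as-columns convention:

Code
import numpy as np

def kernel_matrix_vectorized(X: np.ndarray, kernel: str,
                             gamma: float = 1.0, degree: int = 2) -> np.ndarray:
    """All-pairs kernel values for the columns of X (uncentered)."""
    G = X.T @ X                                # inner products <x_i, x_j>
    if kernel == "rbf":
        sq = np.diag(G)                        # squared column norms
        dists = np.maximum(sq[:, None] + sq[None, :] - 2 * G, 0.0)
        return np.exp(-gamma * dists)          # exp(-gamma ||x_i - x_j||^2)
    if kernel == "poly":
        return (G + 1) ** degree               # (<x_i, x_j> + 1)^degree
    return G                                   # linear kernel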

Applied to real datasets, Kernel PCA can reveal non-linear structure that linear PCA misses. The interactive tool lets users compare PCA and Kernel PCA with different kernels.
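As a concrete illustration, consider scikit-learn's make_circles data: two concentric rings that no linear projection can separate. The sketch below uses scikit-learn's reference PCA and KernelPCA (rather than the implementation in Section 2) to compare the two embeddings:

Code
import numpy as np
from sklearn.datasets import make_circles
from sklearn.decomposition import PCA, KernelPCA

X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)

Z_pca = PCA(n_components=2).fit_transform(X)
Z_kpca = KernelPCA(n_components=2, kernel="rbf", gamma=10.0).fit_transform(X)

# Correlation between each embedding's first coordinate and the ring label:
# near zero for linear PCA, typically large for the RBF kernel
for name, Z in [("PCA", Z_pca), ("Kernel PCA (RBF)", Z_kpca)]:
    print(f"{name}: {abs(np.corrcoef(Z[:, 0], y)[0, 1]):.2f}")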

2 Python Implementation

Code
def svd_low_rank_approximation(A: np.ndarray, p: int) -> np.ndarray:
    """Compute pseudo-inverse using low-rank SVD approximation"""
    U, s, Vt = np.linalg.svd(A, full_matrices=False)
    S_inv = np.diag(1 / s[:p])
    return Vt[:p].T @ S_inv @ U[:, :p].T


def compute_pca(X: np.ndarray, s: int) -> tuple[np.ndarray, dict]:
    """PCA implementation with variance explained metrics"""
    X_centered = X - X.mean(axis=1, keepdims=True)
    C = X_centered @ X_centered.T / X.shape[1]
    eigvals, eigvecs = np.linalg.eigh(C)
    idx = eigvals.argsort()[::-1]

    V = eigvecs[:, idx][:, :s]
    Z = V.T @ X_centered

    metrics = {
        "explained_variance": eigvals[idx][:s].tolist(),
        "total_variance_ratio": eigvals[idx][:s].sum() / eigvals.sum(),
    }
    return Z, metrics


def kernel_matrix(
    X: np.ndarray, kernel: str, gamma: float = 1.0, degree: int = 2
) -> np.ndarray:
    """Compute centered kernel matrix with RBF/poly/linear kernels"""
    n = X.shape[1]
    K = np.zeros((n, n))

    # Kernel computations
    for i in range(n):
        for j in range(n):
            if kernel == "rbf":
                K[i, j] = np.exp(-gamma * np.linalg.norm(X[:, i] - X[:, j]) ** 2)
            elif kernel == "poly":
                K[i, j] = (X[:, i] @ X[:, j] + 1) ** degree
            else:  # linear kernel
                K[i, j] = X[:, i] @ X[:, j]

    # Center kernel matrix
    J = np.ones((n, n)) / n
    K_centered = K - J @ K - K @ J + J @ K @ J
    return K_centered


def compute_kpca(
    X: np.ndarray, kernel: str, s: int, **kernel_params
) -> tuple[np.ndarray, dict]:
    """Kernel PCA implementation with automatic centering"""
    K = kernel_matrix(X, kernel, **kernel_params)
    eigvals, eigvecs = np.linalg.eigh(K)
    idx = eigvals.argsort()[::-1]

    alpha = eigvecs[:, idx][:, :s]
    for i in range(s):
        # Normalize eigenvectors; clip to guard against (numerically)
        # non-positive eigenvalues of the centered kernel matrix
        alpha[:, i] /= np.sqrt(max(eigvals[idx][i], 1e-12))

    Z = alpha.T @ K
    metrics = {
        "top_eigenvalues": eigvals[idx][:s].tolist(),
        "kernel_matrix_sample": K[:3, :3].tolist(),
    }
    return Z, metrics
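Note the shape convention: both functions expect samples as columns, i.e. X has shape (d, n), the transpose of scikit-learn's usual layout. A quick usage sketch on Iris, assuming the imports from the first cell and the functions above:

Code
data = load_iris()
X = StandardScaler().fit_transform(data.data).T  # shape (4, 150): samples as columns

Z_pca, pca_metrics = compute_pca(X, s=2)
Z_kpca, kpca_metrics = compute_kpca(X, kernel="rbf", s=2, gamma=0.1)

print(Z_pca.shape, Z_kpca.shape)                 # (2, 150) for both
print(pca_metrics["total_variance_ratio"])       # fraction of variance retained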
Code
def create_projection_plot(Z: np.ndarray, labels: np.ndarray) -> go.Figure:
    """Create interactive 3D/2D visualization of projected data"""
    dim = Z.shape[0]
    if dim == 1:
        Z = np.vstack([Z, np.zeros_like(Z)])

    df = {
        "PC1": Z[0],
        "PC2": Z[1],
        "PC3": Z[2] if dim > 2 else np.zeros_like(Z[0]),
        "Class": labels.astype(str),
    }

    fig = (
        px.scatter_3d(df, x="PC1", y="PC2", z="PC3", color="Class")
        if dim > 2
        else px.scatter(df, x="PC1", y="PC2", color="Class")
    )

    fig.update_layout(height=600, scene_camera=dict(up=dict(x=0, y=0, z=1)))
    return fig

3 Interactive Dashboard

Code
dataset_map = {
    "Breast Cancer": load_breast_cancer,
    "Digits": load_digits,
    "Iris": load_iris,
    "Wine": load_wine,
}
dataset_names = sorted(dataset_map)
Code
def analyze_data(
    dataset_name: str,
    method: str,
    n_components: int,
    kernel: str,
    gamma: float,
    degree: int,
) -> tuple[go.Figure, dict]:
    """Main analysis function handling dataset loading and processing"""
    # Load dataset
    data = dataset_map[dataset_name]()
    X = StandardScaler().fit_transform(data.data).T
    labels = data.target

    # Compute projections
    if method == "PCA":
        Z, metrics = compute_pca(X, n_components)
        metric_key = "PCA Metrics"
    else:
        Z, metrics = compute_kpca(X, kernel, n_components, gamma=gamma, degree=degree)
        metric_key = "Kernel PCA Metrics"

    # Generate visualization
    fig = create_projection_plot(Z, labels)
    return fig, {metric_key: metrics}


with gr.Blocks(
    css="""gradio-app {background: #222222 !important}""",
    title="Dimension Reduction Explorer",
) as demo:
    with gr.Row():
        dataset = gr.Dropdown(dataset_names, label="Dataset", value=dataset_names[0])
        method = gr.Radio(["PCA", "Kernel PCA"], label="Method", value="PCA")
        components = gr.Slider(1, 3, value=3, step=1, label="Components")

    with gr.Accordion("Kernel Parameters", open=False):
        kernel = gr.Dropdown(["linear", "rbf", "poly"], value="linear", label="Kernel")
        gamma = gr.Slider(0.01, 1.0, value=0.01, label="RBF Gamma", visible=False)
        degree = gr.Slider(
            1, 5, value=2, step=1, label="Polynomial Degree", visible=False
        )

    analyze_btn = gr.Button("Analyze", variant="primary")

    with gr.Tabs():
        with gr.Tab("Projection"):
            plot = gr.Plot(label="Data Projection")
        with gr.Tab("Metrics"):
            metrics = gr.JSON(label="Analysis Metrics")

    kernel.change(
        lambda k: (gr.update(visible=k == "rbf"), gr.update(visible=k == "poly")),
        inputs=kernel,
        outputs=[gamma, degree],
    )

    kwargs = dict(
        fn=analyze_data,
        inputs=[dataset, method, components, kernel, gamma, degree],
        outputs=[plot, metrics],
    )
    analyze_btn.click(**kwargs)
    components.change(**kwargs)
    demo.load(**kwargs)
Code
demo.launch(pwa=True, show_api=False, show_error=True)
Code
# Output of this cell set dynamically in Quarto filter step
import micropip
await micropip.install('plotly==5.24.1')