import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import gradio as gr

High-dimensional data analysis is a common requirement in modern statistics and machine learning. It involves understanding data where the number of features (dimensions) greatly exceeds the number of samples. This notebook explores several fundamental concepts and theoretical tools to navigate and analyze high-dimensional datasets effectively.
Chebyshev’s Inequality
A distribution-free bound on the probability that a random variable deviates from its mean by at least a given amount, valid for any distribution with finite variance. Formally:
\[ P(|X - EX| \geq t) \leq \frac{\text{Var}(X)}{t^2} \]
We demonstrate this using a uniformly distributed random variable to show how bounds hold against actual probability calculations.
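For example, with t = 0.4 the bound is Var(X)/t^2 = (1/12)/0.16 ≈ 0.52, while the exact probability P(|X - 0.5| ≥ 0.4) = 0.2, so the bound holds but is far from tight.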
Weak Law of Large Numbers (WLLN)
This principle asserts that, as the number of independent samples n increases, the sample mean S_n converges in probability to the expected value μ:
\[ \lim_{n \to \infty} P(|S_n - \mu| \geq \epsilon) = 0, \quad \forall \epsilon > 0 \]
We illustrate this with Bernoulli trials to observe convergence behavior.
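Since Var(S_n) = p(1 - p)/n = 1/(4n) for fair Bernoulli trials (p = 0.5), Chebyshev's inequality gives P(|S_n - 0.5| ≥ ε) ≤ 1/(4nε²); this is exactly the bound reported for ε = 0.05 in the simulation below.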
High-dimensional Geometry
As the dimension d increases, random vectors tend toward orthogonality (they become nearly perpendicular). For independent vectors X and Y in d dimensions with standard normal entries:
\[ P\left(\frac{| \langle X, Y \rangle |}{||X|| \cdot ||Y||} \geq t \right) \leq \frac{1}{dt^2} \]
As d grows, the probability that |cos θ| exceeds any fixed threshold t shrinks like 1/d, so a randomly chosen pair of vectors is nearly orthogonal with high probability.
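In fact, cos θ is distributed like a single coordinate of a uniform random unit vector, which has mean 0 and variance exactly 1/d, so the bound above is simply Chebyshev's inequality applied to cos θ.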
Johnson-Lindenstrauss Lemma
An important dimensionality reduction result, ensuring that high-dimensional data can be projected into lower-dimensional spaces with minimal distortion of pairwise distances.
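Concretely, for any 0 < ε < 1 and any n points, a suitable random linear map f into k = O(ε⁻² log n) dimensions satisfies, with high probability and for every pair of points u, v:
\[ (1 - \epsilon)\,\|u - v\|^2 \leq \|f(u) - f(v)\|^2 \leq (1 + \epsilon)\,\|u - v\|^2 \]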
Implementing Chebyshev’s inequality for a uniformly distributed random variable, we compare actual and theoretical probabilities.
def chebyshev_uniform_demo(t: float) -> tuple[float, float, dict]:
    """
    Demonstrates Chebyshev's inequality for X ~ Uniform[0, 1].
    Returns (actual_prob, bound_prob, stats).
    """
    # Exact tail probability: P(|X - 0.5| >= t) = 1 - 2t for 0 < t < 0.5, else 0
    actual_prob = 1 - 2 * t if 0 < t < 0.5 else 0.0
    var = 1 / 12  # variance of Uniform[0, 1]
    # Chebyshev bound Var(X)/t^2, capped at 1 since it is a probability
    bound_prob = min(var / t**2, 1.0) if t > 0 else 1.0
    stats = {
        "mean": 0.5,
        "variance": var,
        "threshold": t,
        "actual_probability": actual_prob,
        "chebyshev_bound": bound_prob,
    }
    return actual_prob, bound_prob, stats

We simulate the Weak Law of Large Numbers (WLLN) using Bernoulli trials and observe how sample means approach the expected value.
def wlln_simulation(n: int, num_samples: int = 1000) -> dict:
    """Simulates the Weak Law of Large Numbers for fair Bernoulli trials."""
    # num_samples independent experiments, each averaging n Bernoulli(0.5) draws
    samples = np.random.binomial(1, 0.5, (num_samples, n))
    sample_means = samples.mean(axis=1)
    stats = {
        "expected_mean": 0.5,
        "sample_means_mean": sample_means.mean(),
        "sample_means_var": sample_means.var(),
        "chebyshev_bound": 1 / (4 * n * 0.05**2),  # P(|S_n - 0.5| >= 0.05) <= 1/(4n * 0.05^2)
    }
    return stats

The high_dim_orthogonality function helps us understand the geometry of high dimensions by checking the orthogonality between random vectors:
def high_dim_orthogonality(d: int, num_pairs: int = 1000) -> dict:
    """Calculates inner-product statistics for random vector pairs in d dimensions."""
    X = np.random.normal(0, 1, (num_pairs, d))
    Y = np.random.normal(0, 1, (num_pairs, d))
    norms_X = np.linalg.norm(X, axis=1)
    norms_Y = np.linalg.norm(Y, axis=1)
    # cos(theta) between each pair of random vectors
    cos_theta = np.sum(X * Y, axis=1) / (norms_X * norms_Y)
    stats = {
        "mean_angle": np.mean(np.arccos(cos_theta)),
        "prob_above_0.1": np.mean(np.abs(cos_theta) > 0.1),
        "chebyshev_bound": 1 / (d * 0.1**2),  # P(|cos(theta)| >= 0.1) <= 1/(d * 0.1^2)
    }
    return stats

With johnson_lindenstrauss_project we perform a random projection, as per the Johnson-Lindenstrauss Lemma, to reduce dimensionality while approximately preserving pairwise distances.
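The definition of johnson_lindenstrauss_project is not shown here. The following is a minimal sketch of what such a function might look like, assuming it takes a data matrix and a target dimension k, applies a Gaussian random projection scaled by 1/sqrt(k), and reports how well pairwise distances are preserved; the signature and returned fields are assumptions, not the original implementation.

def johnson_lindenstrauss_project(X: np.ndarray, k: int) -> dict:
    """Sketch: random Gaussian projection of X (num_points x d) into k dimensions."""
    num_points, d = X.shape
    # Random projection matrix; the 1/sqrt(k) scaling keeps squared distances unbiased in expectation
    R = np.random.normal(0, 1 / np.sqrt(k), (d, k))
    X_proj = X @ R

    def pairwise_dists(A: np.ndarray) -> np.ndarray:
        # Upper-triangular pairwise Euclidean distances (assumes distinct points)
        diffs = A[:, None, :] - A[None, :, :]
        return np.linalg.norm(diffs, axis=-1)[np.triu_indices(num_points, k=1)]

    orig = pairwise_dists(X)
    proj = pairwise_dists(X_proj)
    ratios = proj / orig
    return {
        "original_dim": d,
        "projected_dim": k,
        "mean_distance_ratio": float(ratios.mean()),
        "max_distortion": float(np.max(np.abs(ratios - 1))),
    }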
with gr.Blocks(
    css="""gradio-app {background: #222222 !important}""",
    title="High-Dimensional Data Behavior",
) as demo:
    with gr.Tab("Chebyshev Inequality"):
        t_input = gr.Slider(0.01, 0.49, value=0.2, label="Threshold t")
        cheb_plot = gr.Plot()
        cheb_json = gr.JSON()

        def update_cheb(t):
            actual, bound, stats = chebyshev_uniform_demo(t)
            fig = go.Figure()
            fig.add_trace(
                go.Scatter(x=np.linspace(0, 1, 100), y=[0.5] * 100, name="Mean")
            )
            # Shade the interval [0.5 - t, 0.5 + t] around the mean
            fig.add_vrect(
                x0=0.5 - t,
                x1=0.5 + t,
                fillcolor="green",
                opacity=0.2,
                name="Acceptance",
            )
            fig.update_layout(title="Probability Concentration: Actual vs Bound")
            # Return the figure and the stats dict; Gradio renders the dict in the JSON output
            return fig, stats

        t_input.change(update_cheb, t_input, [cheb_plot, cheb_json])
        demo.load(update_cheb, t_input, [cheb_plot, cheb_json])
with gr.Tab("Weak Law of Large Numbers"):
n_input = gr.Slider(10, 1000, value=100, step=10, label="Sample size n")
wlln_plot = gr.Plot()
def update_wlln(n):
means = [wlln_simulation(int(n))["sample_means_mean"] for _ in range(100)]
fig = px.line(
x=range(100),
y=means,
labels={"x": "Trial", "y": "Sample Mean"},
title="Convergence of Sample Means",
)
fig.add_hline(y=0.5, line_dash="dash")
return fig
n_input.change(update_wlln, n_input, wlln_plot)
demo.load(update_wlln, n_input, wlln_plot)
with gr.Tab("High-D Orthogonality"):
dim_input = gr.Slider(2, 1000, value=100, label="Dimension d")
angle_plot = gr.Plot()
angle_stats = gr.JSON()
def update_angles(d):
stats = high_dim_orthogonality(int(d))
angles = np.random.normal(0, 1 / np.sqrt(d), 1000)
fig = px.histogram(
angles, nbins=50, title="Distribution of cosθ in High Dimensions"
)
angle_stats = stats
return fig, angle_stats
dim_input.change(update_angles, dim_input, [angle_plot, angle_stats])
demo.load(update_angles, dim_input, [angle_plot, angle_stats])