import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import gradio as gr

High-dimensional data analysis is a common requirement in modern statistics and machine learning. It involves understanding data where the number of features (dimensions) greatly exceeds the number of samples. This notebook explores several fundamental concepts and theoretical tools to navigate and analyze high-dimensional datasets effectively.
Chebyshev’s Inequality
A distribution-free bound on the probability that a random variable deviates from its mean by at least a given amount, valid for any distribution with finite variance. Formally:
\[ P(|X - EX| \geq t) \leq \frac{\text{Var}(X)}{t^2} \]
We demonstrate this using a uniformly distributed random variable to show how bounds hold against actual probability calculations.
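For example, with t = 0.4 the bound is Var(X)/t^2 = (1/12)/0.16 ≈ 0.52, while the exact probability P(|X - 0.5| ≥ 0.4) = 0.2, so the bound holds but is far from tight.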
Weak Law of Large Numbers (WLLN)
This principle asserts that, as the number of independent samples n increases, the sample mean S_n converges in probability to the expected value μ:
\[ \lim_{n \to \infty} P(|S_n - \mu| \geq \epsilon) = 0, \quad \forall \epsilon > 0 \]
We illustrate this with Bernoulli trials to observe convergence behavior.
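Since Var(S_n) = p(1 - p)/n = 1/(4n) for fair Bernoulli trials (p = 0.5), Chebyshev's inequality gives P(|S_n - 0.5| ≥ ε) ≤ 1/(4nε²); this is exactly the bound reported for ε = 0.05 in the simulation below.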
High-dimensional Geometry
As the dimension d increases, random vectors tend toward orthogonality (they become nearly perpendicular). For independent vectors X and Y in d dimensions with standard normal entries:
\[ P\left(\frac{| \langle X, Y \rangle |}{||X|| \cdot ||Y||} \geq t \right) \leq \frac{1}{dt^2} \]
As d grows, the probability that |cos θ| exceeds any fixed threshold t shrinks like 1/d, so a randomly chosen pair of vectors is nearly orthogonal with high probability.
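In fact, cos θ is distributed like a single coordinate of a uniform random unit vector, which has mean 0 and variance exactly 1/d, so the bound above is simply Chebyshev's inequality applied to cos θ.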
Johnson-Lindenstrauss Lemma
An important dimensionality reduction result, ensuring that high-dimensional data can be projected into lower-dimensional spaces with minimal distortion of pairwise distances.
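Concretely, for any 0 < ε < 1 and any n points, a suitable random linear map f into k = O(ε⁻² log n) dimensions satisfies, with high probability and for every pair of points u, v:
\[ (1 - \epsilon)\,\|u - v\|^2 \leq \|f(u) - f(v)\|^2 \leq (1 + \epsilon)\,\|u - v\|^2 \]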
Implementing Chebyshev’s inequality for a uniformly distributed random variable, we compare actual and theoretical probabilities.
def chebyshev_uniform_demo(t: float) -> tuple[float, float, dict]:
    """
    Demonstrates Chebyshev's inequality for X ~ Uniform[0, 1].
    Returns (actual_prob, bound_prob, stats).
    """
    # Exact tail probability: P(|X - 0.5| >= t) = 1 - 2t for 0 < t < 0.5, else 0
    actual_prob = 1 - 2 * t if 0 < t < 0.5 else 0.0
    var = 1 / 12  # variance of Uniform[0, 1]
    # Chebyshev bound Var(X)/t^2, capped at 1 since it is a probability
    bound_prob = min(var / t**2, 1.0) if t > 0 else 1.0
    stats = {
        "mean": 0.5,
        "variance": var,
        "threshold": t,
        "actual_probability": actual_prob,
        "chebyshev_bound": bound_prob,
    }
    return actual_prob, bound_prob, stats

We simulate the Weak Law of Large Numbers (WLLN) using Bernoulli trials and observe how sample means approach the expected value.
def wlln_simulation(n: int, num_samples: int = 1000) -> dict:
    """Simulates the Weak Law of Large Numbers for fair Bernoulli trials."""
    # num_samples independent experiments, each averaging n Bernoulli(0.5) draws
    samples = np.random.binomial(1, 0.5, (num_samples, n))
    sample_means = samples.mean(axis=1)
    stats = {
        "expected_mean": 0.5,
        "sample_means_mean": sample_means.mean(),
        "sample_means_var": sample_means.var(),
        "chebyshev_bound": 1 / (4 * n * 0.05**2),  # P(|S_n - 0.5| >= 0.05) <= 1/(4n * 0.05^2)
    }
    return stats

The high_dim_orthogonality function helps us understand the geometry of high dimensions by checking the orthogonality between random vectors:
def high_dim_orthogonality(d: int, num_pairs: int = 1000) -> dict:
    """Calculates inner-product statistics for random vector pairs in d dimensions."""
    X = np.random.normal(0, 1, (num_pairs, d))
    Y = np.random.normal(0, 1, (num_pairs, d))
    norms_X = np.linalg.norm(X, axis=1)
    norms_Y = np.linalg.norm(Y, axis=1)
    # cos(theta) between each pair of random vectors
    cos_theta = np.sum(X * Y, axis=1) / (norms_X * norms_Y)
    stats = {
        "mean_angle": np.mean(np.arccos(cos_theta)),
        "prob_above_0.1": np.mean(np.abs(cos_theta) > 0.1),
        "chebyshev_bound": 1 / (d * 0.1**2),  # P(|cos(theta)| >= 0.1) <= 1/(d * 0.1^2)
    }
    return stats

With johnson_lindenstrauss_project we perform a random projection, as per the Johnson-Lindenstrauss Lemma, to reduce dimensionality while approximately preserving pairwise distances.
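The definition of johnson_lindenstrauss_project is not shown here. The following is a minimal sketch of what such a function might look like, assuming it takes a data matrix and a target dimension k, applies a Gaussian random projection scaled by 1/sqrt(k), and reports how well pairwise distances are preserved; the signature and returned fields are assumptions, not the original implementation.

def johnson_lindenstrauss_project(X: np.ndarray, k: int) -> dict:
    """Sketch: random Gaussian projection of X (num_points x d) into k dimensions."""
    num_points, d = X.shape
    # Random projection matrix; the 1/sqrt(k) scaling keeps squared distances unbiased in expectation
    R = np.random.normal(0, 1 / np.sqrt(k), (d, k))
    X_proj = X @ R

    def pairwise_dists(A: np.ndarray) -> np.ndarray:
        # Upper-triangular pairwise Euclidean distances (assumes distinct points)
        diffs = A[:, None, :] - A[None, :, :]
        return np.linalg.norm(diffs, axis=-1)[np.triu_indices(num_points, k=1)]

    orig = pairwise_dists(X)
    proj = pairwise_dists(X_proj)
    ratios = proj / orig
    return {
        "original_dim": d,
        "projected_dim": k,
        "mean_distance_ratio": float(ratios.mean()),
        "max_distortion": float(np.max(np.abs(ratios - 1))),
    }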
with gr.Blocks(
    css="""gradio-app {background: #222222 !important}""",
    title="High-Dimensional Data Behavior",
) as demo:
    with gr.Tab("Chebyshev Inequality"):
        t_input = gr.Slider(0.01, 0.49, value=0.2, label="Threshold t")
        cheb_plot = gr.Plot()
        cheb_json = gr.JSON()

        def update_cheb(t):
            actual, bound, stats = chebyshev_uniform_demo(t)
            fig = go.Figure()
            fig.add_trace(
                go.Scatter(x=np.linspace(0, 1, 100), y=[0.5] * 100, name="Mean")
            )
            # Shade the interval [0.5 - t, 0.5 + t] around the mean
            fig.add_vrect(
                x0=0.5 - t,
                x1=0.5 + t,
                fillcolor="green",
                opacity=0.2,
                name="Acceptance",
            )
            fig.update_layout(title="Probability Concentration: Actual vs Bound")
            # Return the figure and the stats dict; Gradio renders the dict in the JSON output
            return fig, stats

        t_input.change(update_cheb, t_input, [cheb_plot, cheb_json])
        demo.load(update_cheb, t_input, [cheb_plot, cheb_json])
with gr.Tab("Weak Law of Large Numbers"):
n_input = gr.Slider(10, 1000, value=100, step=10, label="Sample size n")
wlln_plot = gr.Plot()
def update_wlln(n):
means = [wlln_simulation(int(n))["sample_means_mean"] for _ in range(100)]
fig = px.line(
x=range(100),
y=means,
labels={"x": "Trial", "y": "Sample Mean"},
title="Convergence of Sample Means",
)
fig.add_hline(y=0.5, line_dash="dash")
return fig
n_input.change(update_wlln, n_input, wlln_plot)
demo.load(update_wlln, n_input, wlln_plot)
with gr.Tab("High-D Orthogonality"):
dim_input = gr.Slider(2, 1000, value=100, label="Dimension d")
angle_plot = gr.Plot()
angle_stats = gr.JSON()
def update_angles(d):
stats = high_dim_orthogonality(int(d))
angles = np.random.normal(0, 1 / np.sqrt(d), 1000)
fig = px.histogram(
angles, nbins=50, title="Distribution of cosθ in High Dimensions"
)
angle_stats = stats
return fig, angle_stats
dim_input.change(update_angles, dim_input, [angle_plot, angle_stats])
demo.load(update_angles, dim_input, [angle_plot, angle_stats])