
API Reference

Main Interface

compute(data, method='participation_ratio', **kwargs)

Computes effective dimension using the specified method.

Parameters:

    data (Union[ndarray, Any]): Input data. Required.
    method (str): Method name. Default: 'participation_ratio'.
    **kwargs: Arguments passed to the estimator. Default: {}.

Returns:

    float: Estimated effective dimension.

Source code in src/effdim/api.py
def compute(data: Union[np.ndarray, Any], method: str = 'participation_ratio', **kwargs) -> float:
    """
    Computes effective dimension using the specified method.

    Args:
        data: Input data.
        method: Method name.
        **kwargs: Arguments passed to the estimator.

    Returns:
        float: Estimated effective dimension.
    """
    method = method.lower()

    config = METHOD_CONFIG.get(method)
    if not config:
        raise ValueError(f"Unknown method '{method}'. Available: {list(METHOD_CONFIG.keys())}")

    input_type = config['input_type']

    # Dispatch on the kind of input the method expects.
    if input_type == 'geometric':
        # Geometric methods operate on raw (N, D) points directly
        # (geometry.py does not yet accept a distance matrix).
        return config['func'](data, **kwargs)

    # Spectral methods need singular values
    s = adapters.get_singular_values(data)

    if input_type == 'variance':
        spectrum = s**2
    else: # 'singular'
        spectrum = s

    return config['func'](spectrum, **kwargs)
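
Example: a minimal usage sketch. The import path follows the source location shown above (src/effdim/api.py); the random data is made up for illustration.

import numpy as np
from effdim.api import compute

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 20))  # (N, D) data matrix

# Spectral methods obtain singular values via adapters.get_singular_values.
print(compute(X, method='participation_ratio'))  # roughly 20 for isotropic data
print(compute(X, method='effective_rank'))

# An unrecognized name raises ValueError listing the available methods.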

analyze(data, methods=None, **kwargs)

Computes multiple effective dimension metrics.

Parameters:

    data (Union[ndarray, Any]): Input data. Required.
    methods (Optional[List[str]]): List of methods to compute. Defaults to a generic set. Default: None.
    **kwargs: Kwargs shared by every method (e.g. threshold=0.95). Per-method kwargs are not supported by this simple API. Default: {}.

Returns:

    Dict[str, float]: Dictionary of results.

Source code in src/effdim/api.py
def analyze(data: Union[np.ndarray, Any], methods: Optional[List[str]] = None, **kwargs) -> Dict[str, float]:
    """
    Computes multiple effective dimension metrics.

    Args:
        data: Input data.
        methods: List of methods to compute. Defaults to a generic set.
        **kwargs: Kwargs shared by every method (e.g. threshold=0.95).
                  Per-method kwargs are not supported by this simple API.

    Returns:
        Dict[str, float]: Dictionary of results, keyed by the requested names.
    """
    if methods is None:
        methods = ['participation_ratio', 'shannon', 'effective_rank']

    results = {}

    # Resolve common aliases once so every pass below agrees on the name.
    aliases = {'pr': 'participation_ratio', 'entropy': 'shannon'}
    resolved = [aliases.get(m.lower(), m.lower()) for m in methods]

    # Cache the singular values so the SVD runs at most once, rather than
    # once per spectral method as a naive loop over compute() would do.
    s = None
    s_sq = None
    needs_spectral = any(
        METHOD_CONFIG.get(m, {}).get('input_type') in ('variance', 'singular')
        for m in resolved
    )
    if needs_spectral:
        s = adapters.get_singular_values(data)
        s_sq = s**2

    for orig_name, method_name in zip(methods, resolved):
        config = METHOD_CONFIG.get(method_name)
        if not config:
            # Unknown method: record NaN rather than aborting the whole run.
            results[orig_name] = np.nan
            continue

        input_type = config['input_type']

        if input_type == 'geometric':
            val = config['func'](data, **kwargs)
        elif input_type == 'variance':
            val = config['func'](s_sq, **kwargs)
        else:  # 'singular'
            val = config['func'](s, **kwargs)

        results[orig_name] = val

    return results
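
Example: a sketch of the batch interface. The SVD runs at most once and is shared across the spectral methods; the inputs are made up.

import numpy as np
from effdim.api import analyze

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 20))

results = analyze(X, methods=['participation_ratio', 'shannon', 'effective_rank'])
print(results)  # dict keyed by the requested method names

# Unknown names come back as NaN entries rather than raising:
print(analyze(X, methods=['participation_ratio', 'not_a_method']))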

Metrics (Spectral)

effective_rank(spectrum)

Computes Effective Rank (Roy & Vetterli, 2007). It equals the Shannon Effective Dimension of the normalized spectrum; alias for shannon_effective_dimension.

Source code in src/effdim/metrics.py
def effective_rank(spectrum: np.ndarray) -> float:
    """
    Computes Effective Rank (Roy & Vetterli, 2007).
    It equals the Shannon Effective Dimension of the normalized spectrum.
    Alias for shannon_effective_dimension.
    """
    return shannon_effective_dimension(spectrum)

geometric_mean_dimension(spectrum)

Computes a dimension proxy from the ratio of the arithmetic mean to the geometric mean of the spectrum.

Source code in src/effdim/metrics.py
def geometric_mean_dimension(spectrum: np.ndarray) -> float:
    """
    Computes a dimension proxy from the ratio of the arithmetic mean to the
    geometric mean of the spectrum.

    The ratio equals 1 when all values are equal (maximally spread spectrum)
    and grows as the spectrum concentrates in a few modes. Unlike the other
    estimators it is a ratio, not a component count.
    """
    # Keep strictly positive values; log is undefined at 0.
    s = spectrum[spectrum > 0]
    if len(s) == 0:
        return 0.0

    arithmetic = np.mean(s)
    # Geometric mean computed via logs to avoid under/overflow in the product.
    geometric = np.exp(np.mean(np.log(s)))

    return arithmetic / geometric if geometric > 0 else 0.0
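
Example: a quick sanity check of the ratio's behavior on made-up spectra.

import numpy as np
from effdim.metrics import geometric_mean_dimension

print(geometric_mean_dimension(np.ones(8)))  # 1.0: flat spectrum, the means coincide
print(geometric_mean_dimension(np.array([1.0, 0.01, 0.01, 0.01])))  # ~8.1: concentrated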

participation_ratio(spectrum)

Computes the Participation Ratio (PR): PR = (sum_i lambda_i)^2 / (sum_i lambda_i^2).

Ref: Recanatesi et al.

Source code in src/effdim/metrics.py
def participation_ratio(spectrum: np.ndarray) -> float:
    """
    Computes the Participation Ratio (PR).
    PR = (sum_i lambda_i)^2 / (sum_i lambda_i^2)

    Ref: Recanatesi et al.
    """
    # PR is defined on the eigenvalues of the covariance matrix; `spectrum`
    # is assumed to hold those eigenvalues (variances).
    s_sum = np.sum(spectrum)
    s_sq_sum = np.sum(spectrum**2)
    if s_sq_sum == 0:
        return 0.0
    return (s_sum**2) / s_sq_sum
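
Example: worked values for the formula above, on made-up spectra.

import numpy as np
from effdim.metrics import participation_ratio

print(participation_ratio(np.ones(10)))             # 10.0: all modes equal
print(participation_ratio(np.array([1., 0., 0.])))  # 1.0: one dominant mode
print(participation_ratio(np.array([4., 1., 1.])))  # (4+1+1)^2 / (16+1+1) = 2.0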

pca_explained_variance(spectrum, threshold=0.95)

Returns the number of components needed to explain threshold fraction of variance. Note: For singular values s, variance is proportional to s^2.

Source code in src/effdim/metrics.py
def pca_explained_variance(spectrum: np.ndarray, threshold: float = 0.95) -> float:
    """
    Returns the number of components needed to explain `threshold` fraction of variance.
    Note: For singular values s, variance is proportional to s^2.
    """
    # `spectrum` is assumed to already be a variance/energy distribution,
    # i.e. eigenvalues of the covariance matrix. Callers holding singular
    # values s must square them first; the API layer (compute/analyze) does
    # this automatically for methods registered with input_type='variance'.

    total_var = np.sum(spectrum)
    if total_var == 0:
        return 0.0

    cumsum = np.cumsum(spectrum)
    # Find index where cumsum >= threshold * total_var
    idx = np.searchsorted(cumsum, threshold * total_var)
    return float(idx + 1)
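
Example: a worked case with made-up variances summing to 10.

import numpy as np
from effdim.metrics import pca_explained_variance

var = np.array([5., 3., 1., 1.])  # eigenvalues (variances), descending
print(pca_explained_variance(var, threshold=0.8))   # 2.0: the first two modes carry 8/10
print(pca_explained_variance(var, threshold=0.95))  # 4.0: three modes reach only 9/10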

renyi_effective_dimension(spectrum, alpha=2.0)

Computes the Rényi Effective Dimension (generalized). For alpha -> 1 it reduces to the Shannon effective dimension. For alpha = 2 it equals the Participation Ratio: R_2 = 1/(1-2) * log(sum p_i^2) = -log(sum p_i^2), so exp(R_2) = 1 / sum p_i^2, and PR = (sum lambda_i)^2 / sum lambda_i^2 = 1 / sum (lambda_i / sum lambda)^2 = 1 / sum p_i^2.

Source code in src/effdim/metrics.py
def renyi_effective_dimension(spectrum: np.ndarray, alpha: float = 2.0) -> float:
    """
    Computes the Rényi Effective Dimension (generalized).

    For alpha -> 1 this reduces to the Shannon effective dimension.
    For alpha = 2 it coincides with the Participation Ratio:
      R_2 = 1/(1-2) * log(sum p_i^2) = -log(sum p_i^2)
      exp(R_2) = 1 / sum p_i^2
      PR = (sum lambda_i)^2 / sum lambda_i^2 = 1 / sum (lambda_i / sum lambda)^2 = 1 / sum p_i^2
    so exp(R_2) is exactly the Participation Ratio.
    """
    if alpha == 1:
        return shannon_effective_dimension(spectrum)

    p = _normalize_spectrum(spectrum)
    p_alpha = np.sum(p**alpha)
    if p_alpha == 0:
        return 0.0

    entropy = (1 / (1 - alpha)) * np.log(p_alpha)
    return np.exp(entropy)
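
Example: a numerical check of the alpha = 2 identity derived in the docstring, on a made-up spectrum.

import numpy as np
from effdim.metrics import participation_ratio, renyi_effective_dimension

spectrum = np.array([4., 2., 1., 0.5])
print(renyi_effective_dimension(spectrum, alpha=2.0))  # ~2.647
print(participation_ratio(spectrum))                   # same value: exp(R_2) == PR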

shannon_effective_dimension(spectrum)

Computes the Shannon Effective Dimension exp(H), where H = -sum_i p_i log p_i and p_i = lambda_i / sum(lambda).

Source code in src/effdim/metrics.py
def shannon_effective_dimension(spectrum: np.ndarray) -> float:
    """
    Computes Shannon Effective Dimension: exp(Entropy).
    H = - sum p_i log p_i
    where p_i = lambda_i / sum(lambda)
    """
    p = _normalize_spectrum(spectrum)
    # Filter zeros for log
    p = p[p > 0]
    entropy = -np.sum(p * np.log(p))
    return np.exp(entropy)
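
Example: boundary cases. A flat spectrum of n modes gives exactly n; a single mode gives 1.

import numpy as np
from effdim.metrics import shannon_effective_dimension

print(shannon_effective_dimension(np.ones(16)))             # 16.0
print(shannon_effective_dimension(np.array([1., 0., 0.])))  # 1.0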

Geometry (Spatial)

knn_intrinsic_dimension(data, k=5)

Computes Intrinsic Dimension using the Levina-Bickel MLE.

Parameters:

    data (ndarray): (N, D) array of points. Required.
    k (int): Number of neighbors. Default: 5.

Returns:

    float: Estimated dimension.

Source code in src/effdim/geometry.py
def knn_intrinsic_dimension(data: np.ndarray, k: int = 5) -> float:
    """
    Computes Intrinsic Dimension using the Levina-Bickel MLE.

    Args:
        data: (N, D) array of points.
        k: Number of neighbors.

    Returns:
        float: Estimated dimension.
    """
    data = np.asarray(data)
    if data.ndim != 2:
        raise ValueError("Data must be 2D array (N, D).")

    N = data.shape[0]
    if N < k + 1:
        raise ValueError(f"Not enough samples ({N}) for k={k}.")

    # Query k+1 neighbors because the point itself is returned at distance 0.
    tree = cKDTree(data)
    dists, _ = tree.query(data, k=k+1)

    # Levina-Bickel (2005) MLE, pooled over all points with a single global
    # inverse (the averaging-of-inverses form discussed by MacKay & Ghahramani):
    #   m_hat_k = [ 1/(N(k-1)) * sum_{i=1}^{N} sum_{j=1}^{k-1} ln(T_k(x_i) / T_j(x_i)) ]^{-1}
    # where T_j(x_i) is the distance from x_i to its j-th nearest neighbor.

    # Drop the self-distance in column 0; keep neighbors 1..k.
    neighbors_dists = dists[:, 1:]  # (N, k)

    # T_k: distance to the k-th neighbor (last column).
    T_k = neighbors_dists[:, -1]  # (N,)

    # T_j: distances to neighbors 1..k-1 (all columns except the last).
    T_j = neighbors_dists[:, :-1]  # (N, k-1)

    # Guard against log(0) from duplicate points by flooring the distances.
    epsilon = 1e-10
    T_k = np.maximum(T_k, epsilon)
    T_j = np.maximum(T_j, epsilon)

    # Sum log(T_k / T_j) = log(T_k) - log(T_j) over all i, j;
    # T_k broadcasts as (N, 1) against (N, k-1).
    log_sum = np.sum(np.log(T_k[:, None]) - np.log(T_j))

    # m_hat = 1 / (mean of the log ratios) = N*(k-1) / log_sum
    estimator = (N * (k - 1)) / log_sum
    return float(estimator)
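
Example: a sketch on synthetic data. Points confined to a 2-plane inside a 10-dimensional ambient space should come out near 2; sampling noise and the estimator's known finite-k bias mean it will not be exact.

import numpy as np
from effdim.geometry import knn_intrinsic_dimension

rng = np.random.default_rng(0)
points = np.zeros((1000, 10))
points[:, :2] = rng.uniform(size=(1000, 2))  # a 2D manifold embedded in 10D

print(knn_intrinsic_dimension(points, k=10))  # approximately 2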

two_nn_intrinsic_dimension(data)

Computes ID using the Two-NN method (Facco et al., 2017), based on the ratio of 2nd to 1st nearest-neighbor distances.

Parameters:

    data (ndarray): (N, D) array. Required.

Returns:

    float: Estimated dimension.

Source code in src/effdim/geometry.py
def two_nn_intrinsic_dimension(data: np.ndarray) -> float:
    """
    Computes ID using the Two-NN method (Facco et al., 2017),
    based on the ratio of 2nd to 1st nearest-neighbor distances.

    Args:
        data: (N, D) array.

    Returns:
        float: Estimated dimension.
    """
    data = np.asarray(data)
    N = data.shape[0]
    if N < 3:
        raise ValueError("Need at least 3 points for Two-NN.")

    tree = cKDTree(data)
    dists, _ = tree.query(data, k=3) # Self, 1st, 2nd

    # r1 is dists[:, 1], r2 is dists[:, 2]
    r1 = dists[:, 1]
    r2 = dists[:, 2]

    # Duplicate points give r1 = 0; drop them so the ratio is defined.
    mask = r1 > 1e-10
    r1 = r1[mask]
    r2 = r2[mask]

    # Under the Two-NN model the ratios mu_i = r2/r1 are Pareto-distributed
    # with shape parameter d, which yields the closed-form maximum-likelihood
    # estimator d_hat = n / sum_{i=1}^{n} ln(mu_i), where n is the number of
    # points surviving the duplicate filter (Facco et al., 2017, "Estimating
    # the intrinsic dimension of datasets by a minimal neighborhood
    # information").
    mu = r2 / r1

    if len(mu) == 0:
        return 0.0

    log_mu_sum = np.sum(np.log(mu))
    if log_mu_sum == 0:
        return 0.0

    d_hat = len(mu) / log_mu_sum
    return float(d_hat)
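
Example: the same kind of sanity check for Two-NN, on a made-up uniform sample from a 3D cube.

import numpy as np
from effdim.geometry import two_nn_intrinsic_dimension

rng = np.random.default_rng(1)
cube = rng.uniform(size=(2000, 3))
print(two_nn_intrinsic_dimension(cube))  # approximately 3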

Adapters

get_singular_values(data)

Standardizes input data into Singular Values.

Parameters:

    data (Union[ndarray, spmatrix]): Input data.
        - (N, D) array: interpreted as raw data; returns its singular values.
        - (N, N) symmetric matrix: interpreted as Covariance; returns sqrt(abs(eigenvalues)).

Returns:

    np.ndarray: 1D array of singular values, sorted descending.

Source code in src/effdim/adapters.py
def get_singular_values(data: Union[np.ndarray, sparse.spmatrix]) -> np.ndarray:
    """
    Standardizes input data into Singular Values.

    Args:
        data: Input data.
            - (N, D) array: Interpreted as raw data. Returns singular values.
            - (N, N) symmetric matrix: Interpreted as Covariance. Returns sqrt(abs(eigenvalues)).

    Returns:
        np.ndarray: 1D array of singular values, sorted descending.
    """
    data = np.asarray(data)

    if data.ndim != 2:
        raise ValueError("Input data must be 2-dimensional.")

    N, D = data.shape

    # Heuristic: a square symmetric matrix is treated as a covariance/kernel matrix.
    if N == D and np.allclose(data, data.T):
        vals = linalg.eigvalsh(data)  # ascending order
        # Covariance eigenvalues are variances (s^2), so s = sqrt(vals).
        # abs() guards against tiny negative eigenvalues from numerical
        # noise; a valid covariance is positive semi-definite.
        return np.sqrt(np.abs(vals))[::-1]  # reverse to descending order

    # (N, D) Data Matrix -> SVD
    # For large matrices, this is slow. v0.2 will add randomized SVD.
    _, s, _ = linalg.svd(data, full_matrices=False)
    return s
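
Example: a sketch showing the two input forms agree. For a data matrix X, the eigenvalues of the scatter matrix X.T @ X are exactly the squared singular values of X, so the symmetric branch recovers the same values. (A true covariance matrix carries a 1/(N-1) factor, which only rescales the output; the normalized spectral metrics above are scale-invariant.)

import numpy as np
from effdim.adapters import get_singular_values

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 8))

s_data = get_singular_values(X)        # SVD branch on the (N, D) matrix
s_sym = get_singular_values(X.T @ X)   # symmetric branch: sqrt of eigenvalues

print(np.allclose(s_data, s_sym))      # True; both are sorted descending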