Source code for glyphx.regplot

"""
GlyphX regplot -- regression plot with multiple fit types.

Extends the basic lmplot stub to support polynomial, logistic, and LOWESS
regression with confidence intervals, closing the last major gap vs Seaborn's
regression plotting.

    from glyphx.regplot import regplot

    # Ordinary least squares (default)
    fig = regplot(df, x="bill_length", y="body_mass")

    # Polynomial degree-2
    fig = regplot(df, x="age", y="income", order=2)

    # LOWESS (locally weighted, no parametric assumption)
    fig = regplot(df, x="gdp", y="life_exp", lowess=True)

    # Logistic (binary Y)
    fig = regplot(df, x="score", y="pass_fail", logistic=True)
"""
from __future__ import annotations

import math
import numpy as np

from .figure  import Figure
from .series  import ScatterSeries, LineSeries
from .fill_between import FillBetweenSeries


def _lowess(x: np.ndarray, y: np.ndarray, frac: float = 0.3) -> tuple:
    """
    LOWESS (locally weighted scatterplot smoothing) -- pure NumPy.

    Args:
        x:    X values (sorted).
        y:    Y values.
        frac: Smoothing fraction (0-1).  Larger = smoother.

    Returns:
        ``(x_sorted, y_smooth)``
    """
    n   = len(x)
    r   = max(1, int(frac * n))
    order = np.argsort(x)
    xs, ys = x[order], y[order]
    yhat  = np.zeros(n)

    for i in range(n):
        dists    = np.abs(xs - xs[i])
        sorted_d = np.sort(dists)
        h        = sorted_d[min(r, n - 1)]
        if h == 0:
            h = 1e-10
        u       = np.clip(dists / h, 0, 1)
        w       = (1 - u ** 3) ** 3
        X       = np.vstack([np.ones(n), xs]).T
        WX      = w[:, None] * X
        WY      = w * ys
        try:
            beta  = np.linalg.lstsq(WX, WY, rcond=None)[0]
            yhat[i] = beta[0] + beta[1] * xs[i]
        except np.linalg.LinAlgError:
            yhat[i] = ys[i]

    return xs, yhat


def _logistic_fit(x: np.ndarray, y: np.ndarray, n_iter: int = 200
                  ) -> tuple[float, float]:
    """
    Fit logistic regression via gradient descent -- pure NumPy.

    Returns ``(intercept, slope)`` for σ(intercept + slope * x).
    """
    # Normalise x for numerical stability
    x_mu, x_std = x.mean(), x.std() + 1e-10
    xn = (x - x_mu) / x_std

    b0, b1 = 0.0, 0.0
    lr     = 0.1

    for _ in range(n_iter):
        p   = 1 / (1 + np.exp(-(b0 + b1 * xn)))
        err = p - y
        b0 -= lr * err.mean()
        b1 -= lr * (err * xn).mean()

    # Un-normalise
    b1_raw = b1 / x_std
    b0_raw = b0 - b1_raw * x_mu
    return b0_raw, b1_raw


def _bootstrap_ci(
    x: np.ndarray,
    y: np.ndarray,
    x_eval: np.ndarray,
    order:    int   = 1,
    n_boot:   int   = 100,
    ci:       float = 95,
) -> tuple[np.ndarray, np.ndarray]:
    """Bootstrap confidence interval for a polynomial regression line."""
    rng    = np.random.default_rng(42)
    n      = len(x)
    preds  = np.zeros((n_boot, len(x_eval)))

    for b in range(n_boot):
        idx    = rng.integers(0, n, n)
        coeffs = np.polyfit(x[idx], y[idx], order)
        preds[b] = np.polyval(coeffs, x_eval)

    lo  = np.percentile(preds, (100 - ci) / 2,      axis=0)
    hi  = np.percentile(preds, 100 - (100 - ci) / 2, axis=0)
    return lo, hi



[docs]
def regplot(
    data,
    x:         str | None    = None,
    y:         str | None    = None,
    x_vals:    list | None   = None,
    y_vals:    list | None   = None,
    order:     int           = 1,
    lowess:    bool          = False,
    logistic:  bool          = False,
    ci:        int           = 95,
    n_boot:    int           = 100,
    scatter_kw: dict | None  = None,
    line_kw:    dict | None  = None,
    color:     str           = "#2563eb",
    alpha:     float         = 0.20,
    title:     str           = "",
    theme:     str           = "default",
    auto_display: bool       = False,
) -> Figure:
    """
    Regression plot: scatter + fitted curve + optional CI band.

    Supports OLS, polynomial, logistic, and LOWESS regression, all in
    pure NumPy.  Seaborn's ``regplot`` requires statsmodels for some modes;
    GlyphX has zero extra dependencies.

    Args:
        data:       DataFrame (use with ``x=`` and ``y=`` column names) or
                    ``None`` (use ``x_vals``/``y_vals`` directly).
        x, y:       Column names when ``data`` is a DataFrame.
        x_vals, y_vals: Raw numeric arrays when ``data`` is None.
        order:      Polynomial degree (1 = linear, 2 = quadratic, etc.)
        lowess:     Use LOWESS instead of polynomial.
        logistic:   Fit logistic curve (binary Y assumed).
        ci:         Confidence interval level (0-100).  0 disables CI band.
        n_boot:     Bootstrap samples for the CI band.
        scatter_kw: Extra kwargs for :class:`~glyphx.series.ScatterSeries`.
        line_kw:    Extra kwargs for :class:`~glyphx.series.LineSeries`.
        color:      Shared colour for scatter and fit line.
        alpha:      CI band fill opacity.
        title:      Chart title.
        theme:      GlyphX theme name.
        auto_display: Open immediately after creation.

    Returns:
        :class:`~glyphx.Figure` ready to ``.show()`` or ``.save()``.

    Examples::

        from glyphx.regplot import regplot

        # OLS
        regplot(df, x="height", y="weight")

        # Quadratic
        regplot(df, x="age", y="income", order=2, color="#dc2626")

        # LOWESS
        regplot(df, x="gdp", y="life_exp", lowess=True)

        # Logistic (binary outcome)
        regplot(df, x="dose", y="response", logistic=True)

        # No data frame
        import numpy as np
        x = np.random.randn(200)
        y = 2*x + np.random.randn(200)
        regplot(None, x_vals=x, y_vals=y, title="Correlation")
    """
    import pandas as _pd

    scatter_kw = scatter_kw or {}
    line_kw    = line_kw    or {}

    # Extract arrays
    if data is not None and x and y:
        arr_x = np.asarray(data[x], dtype=float)
        arr_y = np.asarray(data[y], dtype=float)
        xlabel, ylabel = x, y
    elif x_vals is not None and y_vals is not None:
        arr_x = np.asarray(x_vals, dtype=float)
        arr_y = np.asarray(y_vals, dtype=float)
        xlabel = ylabel = ""
    else:
        raise ValueError("Provide either (data, x=, y=) or (x_vals=, y_vals=).")

    # Remove NaN pairs
    mask  = np.isfinite(arr_x) & np.isfinite(arr_y)
    arr_x = arr_x[mask]
    arr_y = arr_y[mask]

    fig = Figure(width=700, height=480, auto_display=auto_display, theme=theme)
    if title:
        fig.set_title(title)
    fig.set_xlabel(xlabel).set_ylabel(ylabel)

    # -- Scatter ------------------------------------------------------------
    fig.add(ScatterSeries(
        arr_x.tolist(), arr_y.tolist(),
        color=color, size=4,
        label="Observations",
        **scatter_kw,
    ))

    # -- Regression line ----------------------------------------------------
    x_eval = np.linspace(arr_x.min(), arr_x.max(), 200)

    if lowess:
        xs, ys = _lowess(arr_x, arr_y, frac=0.3)
        fig.add(LineSeries(
            xs.tolist(), ys.tolist(),
            color=color, width=2, label="LOWESS",
            **line_kw,
        ))

    elif logistic:
        b0, b1 = _logistic_fit(arr_x, arr_y)
        y_eval = 1 / (1 + np.exp(-(b0 + b1 * x_eval)))
        fig.add(LineSeries(
            x_eval.tolist(), y_eval.tolist(),
            color=color, width=2, label="Logistic fit",
            **line_kw,
        ))

    else:
        # Polynomial OLS
        coeffs = np.polyfit(arr_x, arr_y, order)
        y_eval = np.polyval(coeffs, x_eval)

        # Fit annotation
        if order == 1:
            slope, intercept = coeffs
            corr = float(np.corrcoef(arr_x, arr_y)[0, 1])
            lbl  = f"y = {slope:.2g}x + {intercept:.2g}  (r={corr:.2f})"
        else:
            lbl = f"Degree-{order} polynomial"

        fig.add(LineSeries(
            x_eval.tolist(), y_eval.tolist(),
            color=color, width=2, label=lbl,
            **line_kw,
        ))

        # CI band
        if ci > 0 and len(arr_x) > 5:
            lo, hi = _bootstrap_ci(arr_x, arr_y, x_eval,
                                   order=order, n_boot=n_boot, ci=ci)
            fig.add(FillBetweenSeries(
                list(range(len(x_eval))), lo.tolist(), hi.tolist(),
                color=color, alpha=alpha,
                label=f"{ci}% CI",
            ))

    return fig