Source code for glyphx.suggest

"""
GlyphX glyphx.suggest(df) -- AI-powered chart recommendation.

Inspects a DataFrame's column types, cardinality, and distribution
shape, then returns ranked chart suggestions with mini SVG previews.
No external dependencies -- the entire analysis runs in pure Python/NumPy.

    from glyphx import suggest
    import pandas as pd

    df = pd.read_csv("sales.csv")
    recs = suggest(df)          # list of Recommendation objects
    for rec in recs[:3]:
        print(rec.kind, rec.reason)
        rec.preview.show()      # render the mini preview
"""
from __future__ import annotations

import math
from dataclasses import dataclass, field
from typing import Any

import numpy as np
import pandas as pd


# ---------------------------------------------------------------------------
# Recommendation dataclass
# ---------------------------------------------------------------------------

@dataclass
class Recommendation:
    """
    A single chart recommendation.

    Attributes:
        kind:     GlyphX chart kind string (e.g. ``"line"``, ``"bar"``).
        score:    Confidence score 0-100.
        reason:   Human-readable explanation of why this chart suits the data.
        x_col:    Suggested X-axis column (or None).
        y_col:    Suggested Y-axis column (or None for univariate charts).
        hue_col:  Suggested hue/group column (or None).
        extra:    Additional kwargs to pass to the chart constructor.
        preview:  A :class:`~glyphx.Figure` rendered at 340x220 with
                  representative sample data.  Rendered lazily on first access.

    Two recommendations compare equal when their public fields match; the
    private ``_df`` / ``_fig`` plumbing is ignored by ``==``.
    """

    kind: str
    score: float
    reason: str
    x_col: str | None = None
    y_col: str | None = None
    hue_col: str | None = None
    extra: dict[str, Any] = field(default_factory=dict)
    # Private plumbing: the data the preview is drawn from and the cached
    # figure.  Both are excluded from comparison because ``DataFrame ==
    # DataFrame`` is element-wise, which makes the generated ``__eq__`` raise
    # "truth value of a DataFrame is ambiguous" for distinct frames.
    _df: Any = field(default=None, repr=False, compare=False)
    _fig: Any = field(default=None, repr=False, compare=False)

    @property
    def preview(self):
        """Render and cache a 340x220 mini preview figure."""
        if self._fig is None:
            # Rendered at most once; later accesses reuse the cached figure.
            self._fig = _render_preview(self)
        return self._fig

    def __repr__(self) -> str:
        return (f"<Recommendation kind={self.kind!r} score={self.score:.0f} "
                f"x={self.x_col!r} y={self.y_col!r}>")


# ---------------------------------------------------------------------------
# Column profiling helpers
# ---------------------------------------------------------------------------

def _is_text_col(col: pd.Series) -> bool:
    """True for object-dtype columns and pandas string-dtype columns."""
    return col.dtype == object or pd.api.types.is_string_dtype(col)


def _is_datetime_col(col: pd.Series) -> bool:
    """
    True if *col* has a datetime dtype, or is a text column whose values
    mostly parse as datetimes.

    For text columns the first 20 non-null values are probed with
    ``pd.to_datetime``; the column qualifies when more than 80% of them parse.
    """
    if pd.api.types.is_datetime64_any_dtype(col):
        return True
    if not _is_text_col(col):
        return False

    probe = col.dropna().head(20)
    hits = 0
    for value in probe:
        try:
            pd.to_datetime(str(value))
        except Exception:
            # Best-effort probe on arbitrary user data: any parse failure
            # simply means "this value is not a date".
            continue
        hits += 1
    # Strict ">" so an empty probe (0 > 0) is never treated as datetime.
    return hits > len(probe) * 0.8


def _cardinality(col: pd.Series) -> int:
    """Number of distinct non-null values in *col*."""
    return col.nunique()


def _is_numeric(col: pd.Series) -> bool:
    """True if *col* has a numeric dtype according to pandas."""
    return pd.api.types.is_numeric_dtype(col)


def _is_categorical(col: pd.Series, max_card: int = 20) -> bool:
    """True if *col* is non-numeric with at most *max_card* distinct values."""
    return not _is_numeric(col) and _cardinality(col) <= max_card


def _is_high_card_str(col: pd.Series) -> bool:
    """True if *col* is a text column with more than 50 distinct values."""
    return _is_text_col(col) and _cardinality(col) > 50


def _distribution_shape(col: pd.Series) -> str:
    """
    Rough shape classifier.

    Returns one of:
        ``"unknown"``   fewer than 10 non-null values,
        ``"constant"``  zero standard deviation,
        ``"normal"``    ``|mean - median| / std`` below 0.2,
        ``"skewed"``    everything else.
    """
    vals = col.dropna().values.astype(float)
    if len(vals) < 10:
        return "unknown"
    std = vals.std()
    if std == 0:
        return "constant"
    # Mean-vs-median offset in units of std, used as a cheap skewness proxy.
    skew = (vals.mean() - np.percentile(vals, 50)) / std
    return "normal" if abs(skew) < 0.2 else "skewed"


def _has_trend(col: pd.Series) -> bool:
    """True if a numeric column has a monotone trend > 60% of the time."""
    vals = col.dropna().values.astype(float)
    if len(vals) < 5:
        return False
    diffs = np.diff(vals)
    up = (diffs > 0).sum()
    down = (diffs < 0).sum()
    # len(diffs) >= 4 here, so the division is safe.
    return bool(max(up, down) / len(diffs) > 0.6)


# ---------------------------------------------------------------------------
# Main recommendation engine
# ---------------------------------------------------------------------------

def suggest(
    df: pd.DataFrame,
    max_rows: int = 500,
    top_n: int = 5,
) -> list[Recommendation]:
    """
    Analyse a DataFrame and return ranked chart recommendations.

    The engine scores candidate chart types against the column profiles
    and returns the top ``top_n`` recommendations, each with a mini preview
    figure generated from a sample of the data.

    Args:
        df:       Input DataFrame.
        max_rows: Cap for the internal sample used for analysis (default 500).
                  Sampled rows keep their original relative order.
        top_n:    Maximum number of recommendations to return (default 5).
                  Values below zero are treated as zero.

    Returns:
        List of :class:`Recommendation` objects sorted by descending score.
        Empty when *df* is empty.

    Example::

        from glyphx import suggest
        recs = suggest(df)
        for rec in recs:
            print(f"{rec.kind:15s} score={rec.score:.0f}  {rec.reason}")
            rec.preview.show()
    """
    if df.empty:
        return []

    # Sample for performance.  Positions (not labels) are sampled and then
    # sorted so the sample keeps the frame's row order; a shuffled sample
    # would make trend detection and line previews meaningless.
    if len(df) > max_rows:
        positions = (
            pd.Series(np.arange(len(df)))
            .sample(max_rows, random_state=42)
            .sort_values()
            .to_numpy()
        )
        sample = df.iloc[positions]
    else:
        sample = df.copy()

    # Column profiles
    num_cols = [c for c in sample.columns if _is_numeric(sample[c])]
    cat_cols = [c for c in sample.columns if _is_categorical(sample[c])]
    dt_cols = [c for c in sample.columns if _is_datetime_col(sample[c])]
    n_rows = len(sample)

    recs: list[Recommendation] = []

    # -- Line chart -------------------------------------------------
    if dt_cols and num_cols:
        x, y = dt_cols[0], num_cols[0]
        # A low-cardinality date column is also "categorical"; never offer
        # the x axis itself as the grouping column.
        hue_pool = [c for c in cat_cols if c != x]
        hue = (hue_pool[0]
               if hue_pool and _cardinality(sample[hue_pool[0]]) <= 6
               else None)
        if _has_trend(sample[y]):
            score = 92
            reason = f"'{x}' is a datetime axis and '{y}' shows a trend over time."
        else:
            score = 85
            reason = (f"'{x}' is a datetime axis -- a line chart shows how "
                      f"'{y}' changes over time.")
        recs.append(Recommendation(
            kind="line", score=score,
            x_col=x, y_col=y, hue_col=hue,
            reason=reason,
            _df=sample,
        ))
    elif num_cols and n_rows >= 10 and _has_trend(sample[num_cols[0]]):
        y = num_cols[0]
        recs.append(Recommendation(
            kind="line", score=70,
            x_col=None, y_col=y,
            reason=f"'{y}' has a monotone trend -- a line chart shows it clearly.",
            _df=sample,
        ))

    # -- Bar chart --------------------------------------------------
    if cat_cols and num_cols:
        x, y = cat_cols[0], num_cols[0]
        card = _cardinality(sample[x])
        score = max(30, 90 - card * 2)
        hue = (cat_cols[1]
               if len(cat_cols) > 1 and _cardinality(sample[cat_cols[1]]) <= 6
               else None)
        recs.append(Recommendation(
            kind="bar", score=score,
            x_col=x, y_col=y, hue_col=hue,
            reason=(f"'{x}' has {card} categories and '{y}' is numeric -- "
                    "a bar chart compares groups."),
            _df=sample,
        ))

    # -- Scatter chart ----------------------------------------------
    if len(num_cols) >= 2:
        x, y = num_cols[0], num_cols[1]
        corr = float(abs(sample[[x, y]].dropna().corr().iloc[0, 1]))
        if math.isnan(corr):
            # Correlation is undefined for constant columns or fewer than two
            # complete rows; treat it as "no linear relationship" instead of
            # letting int(nan) raise below.
            corr = 0.0
        score = 60 + int(corr * 30)
        c = num_cols[2] if len(num_cols) > 2 else None
        hue = (cat_cols[0]
               if cat_cols and _cardinality(sample[cat_cols[0]]) <= 8
               else None)
        recs.append(Recommendation(
            kind="scatter", score=score,
            x_col=x, y_col=y, hue_col=hue,
            reason=(f"'{x}' and '{y}' are both numeric (r={corr:.2f}) -- "
                    "scatter reveals their relationship."),
            extra={"c": c} if c is not None else {},
            _df=sample,
        ))

    # -- Histogram --------------------------------------------------
    if num_cols:
        col = num_cols[0]
        shape = _distribution_shape(sample[col])
        score = 70 if shape in ("normal", "skewed") else 55
        recs.append(Recommendation(
            kind="hist", score=score,
            x_col=col, y_col=None,
            reason=(f"'{col}' is continuous ({shape} distribution) -- "
                    "a histogram shows its shape."),
            _df=sample,
        ))

    # -- Box plot ---------------------------------------------------
    if cat_cols and num_cols:
        x, y = cat_cols[0], num_cols[0]
        card = _cardinality(sample[x])
        score = 65 if card <= 10 else 45
        recs.append(Recommendation(
            kind="box", score=score,
            x_col=x, y_col=y,
            reason=(f"Comparing '{y}' distribution across {card} groups of '{x}'."),
            _df=sample,
        ))

    # -- Heatmap ----------------------------------------------------
    if len(num_cols) >= 3 and n_rows <= 200:
        recs.append(Recommendation(
            kind="heatmap", score=60,
            x_col=None, y_col=None,
            reason=(f"Multiple numeric columns ({len(num_cols)}) with ≤200 rows -- "
                    "a correlation heatmap reveals relationships."),
            _df=sample,
        ))

    # -- Pie / donut ------------------------------------------------
    if cat_cols and num_cols:
        x, y = cat_cols[0], num_cols[0]
        card = _cardinality(sample[x])
        if 2 <= card <= 7:
            recs.append(Recommendation(
                kind="donut", score=60,
                x_col=x, y_col=y,
                reason=(f"'{x}' has {card} categories -- a donut shows part-to-whole."),
                _df=sample,
            ))

    # -- Bubble -----------------------------------------------------
    if len(num_cols) >= 3:
        x, y, s = num_cols[0], num_cols[1], num_cols[2]
        recs.append(Recommendation(
            kind="bubble", score=58,
            x_col=x, y_col=y,
            reason=(f"Three numeric dimensions -- bubble encodes '{s}' as size."),
            extra={"size": s},
            _df=sample,
        ))

    # -- ECDF -------------------------------------------------------
    if num_cols and n_rows >= 30:
        recs.append(Recommendation(
            kind="ecdf", score=52,
            x_col=num_cols[0], y_col=None,
            reason=(f"ECDF shows the full cumulative distribution of '{num_cols[0]}' "
                    "with no bin-width choice needed."),
            _df=sample,
        ))

    # -- Parallel coordinates ----------------------------------------
    if len(num_cols) >= 4:
        score = 65 if len(num_cols) <= 8 else 50
        hue = cat_cols[0] if cat_cols else None
        recs.append(Recommendation(
            kind="parallel", score=score,
            reason=(f"{len(num_cols)} numeric columns -- parallel coordinates "
                    "reveals multi-dimensional patterns."),
            extra={"axes": num_cols[:8]},
            hue_col=hue,
            _df=sample,
        ))

    # Sort and truncate.  The sort is stable, so equal scores keep the order
    # in which the candidates were generated above.
    recs.sort(key=lambda r: r.score, reverse=True)
    return recs[:max(top_n, 0)]


# ---------------------------------------------------------------------------
# Preview renderer
# ---------------------------------------------------------------------------

def _render_preview(rec: Recommendation):
    """
    Build a 340x220 mini Figure from the recommendation.

    The figure is always returned.  If building the series for ``rec.kind``
    fails for any reason, the failure is logged at DEBUG level and the figure
    comes back with only its title set, so a broken preview never crashes the
    caller.

    Args:
        rec: Recommendation whose ``_df`` holds the data to draw from.

    Returns:
        The ``Figure`` instance (possibly without any series).
    """
    from .figure import Figure
    from .series import (LineSeries, BarSeries, ScatterSeries,
                         HistogramSeries, BoxPlotSeries, HeatmapSeries,
                         PieSeries, DonutSeries)

    df = rec._df
    kind = rec.kind
    x_col = rec.x_col
    y_col = rec.y_col
    hue = rec.hue_col

    fig = Figure(width=340, height=220, auto_display=False)
    title = kind.upper()
    # "is not None" rather than truthiness: a column may be named 0 or "".
    if x_col is not None:
        title += f" -- {x_col}"
    if y_col is not None:
        title += f" x {y_col}"
    fig.set_title(title)

    try:
        SAMPLE = 80  # keep preview fast

        if kind == "line":
            # x_col is None for the "monotone trend" recommendation, which is
            # plotted against the row position instead of a column.
            line_cols = [c for c in (x_col, y_col) if c is not None]
            if hue:
                colors = ["#2563eb", "#dc2626", "#16a34a", "#d97706", "#7c3aed"]
                for i, (g, gdf) in enumerate(df.groupby(hue)):
                    s = gdf[line_cols].dropna().head(SAMPLE)
                    fig.add(LineSeries(list(range(len(s))), s[y_col].tolist(),
                                       color=colors[i % len(colors)],
                                       label=str(g), width=1.5))
            else:
                sample = df[line_cols].dropna().head(SAMPLE)
                if x_col is None or _is_datetime_col(df[x_col]):
                    x_vals = list(range(len(sample)))
                else:
                    x_vals = sample[x_col].tolist()
                fig.add(LineSeries(x_vals, sample[y_col].tolist(),
                                   color="#2563eb", width=1.5))

        elif kind == "bar":
            agg = df.groupby(x_col)[y_col].mean().reset_index().head(12)
            fig.add(BarSeries(agg[x_col].tolist(), agg[y_col].tolist(),
                              color="#2563eb", bar_width=0.7))

        elif kind == "scatter":
            # Keep whole rows so the colour values stay aligned with the
            # points that survive the x/y null filter.
            s = df.dropna(subset=[x_col, y_col]).head(SAMPLE)
            c_col = rec.extra.get("c")
            c_vals = (s[c_col].tolist()
                      if c_col is not None and c_col in s.columns
                      else None)
            fig.add(ScatterSeries(s[x_col].tolist(), s[y_col].tolist(),
                                  c=c_vals, cmap="viridis", size=4))

        elif kind == "hist":
            vals = df[x_col].dropna().tolist()
            fig.add(HistogramSeries(vals, bins=20, color="#2563eb"))

        elif kind == "box":
            # Drop the NaN "category": ``df[x_col] == nan`` never matches, so
            # it would only contribute an empty dataset.
            groups = df[x_col].dropna().unique()[:6]
            datasets = [df[df[x_col] == g][y_col].dropna().tolist() for g in groups]
            fig.add(BoxPlotSeries(datasets, categories=[str(g) for g in groups],
                                  color="#7c3aed", box_width=18))

        elif kind == "heatmap":
            nums = [c for c in df.columns if _is_numeric(df[c])][:8]
            corr = df[nums].corr().values.tolist()
            fig.add(HeatmapSeries(corr, row_labels=nums, col_labels=nums,
                                  show_values=True,
                                  cmap=["#1e40af", "#93c5fd", "#f0f0f0",
                                        "#fca5a5", "#b91c1c"]))

        elif kind in ("pie", "donut"):
            agg = df.groupby(x_col)[y_col].sum().reset_index().head(7)
            cls = DonutSeries if kind == "donut" else PieSeries
            from .colormaps import colormap_colors
            fig.add(cls(agg[y_col].tolist(), labels=agg[x_col].tolist(),
                        colors=colormap_colors("viridis", len(agg))))

        elif kind == "bubble":
            s_col = rec.extra.get("size", y_col)
            # De-duplicate: when the size column falls back to y_col,
            # selecting it twice would make ``s[s_col]`` a DataFrame.
            bubble_cols = list(dict.fromkeys(
                c for c in (x_col, y_col, s_col) if c is not None))
            s = df[bubble_cols].dropna().head(SAMPLE)
            from .bubble import BubbleSeries
            fig.add(BubbleSeries(s[x_col].tolist(), s[y_col].tolist(),
                                 size=s[s_col].tolist() if s_col in s.columns else 10,
                                 color="#2563eb", alpha=0.65,
                                 min_radius=4, max_radius=28))

        elif kind == "ecdf":
            from .ecdf import ECDFSeries
            fig.add(ECDFSeries(df[x_col].dropna().head(200).tolist(),
                               color="#2563eb"))

        elif kind == "parallel":
            from .parallel_coords import ParallelCoordinatesSeries
            axes = rec.extra.get("axes", [])[:6]
            s = df[axes].dropna().head(SAMPLE)
            fig.add(ParallelCoordinatesSeries(s.values.tolist(), axes=axes,
                                              alpha=0.35, cmap="viridis"))

    except Exception:
        # Preview failure should never crash the caller, but leave a trace so
        # a blank preview can be diagnosed.
        import logging
        logging.getLogger(__name__).debug(
            "preview rendering failed for kind=%r", kind, exc_info=True)

    return fig