"""
GlyphX ``glyphx.suggest(df)`` -- AI-powered chart recommendation.

Inspects a DataFrame's column types, cardinality, and distribution
shape, then returns ranked chart suggestions with mini SVG previews.
No additional dependencies -- the entire analysis runs locally using
only NumPy and pandas.

Example::

    from glyphx import suggest
    import pandas as pd

    df = pd.read_csv("sales.csv")
    recs = suggest(df)              # list of Recommendation objects
    for rec in recs[:3]:
        print(rec.kind, rec.reason)
        rec.preview.show()          # render the mini preview
"""
from __future__ import annotations
import math
from dataclasses import dataclass, field
from typing import Any
import numpy as np
import pandas as pd
# ---------------------------------------------------------------------------
# Recommendation dataclass
# ---------------------------------------------------------------------------
@dataclass
class Recommendation:
    """
    A single chart recommendation.

    Attributes:
        kind: GlyphX chart kind string (e.g. ``"line"``, ``"bar"``).
        score: Confidence score 0-100.
        reason: Human-readable explanation of why this chart suits the data.
        x_col: Suggested X-axis column (or None).
        y_col: Suggested Y-axis column (or None for univariate charts).
        hue_col: Suggested hue/group column (or None).
        extra: Additional kwargs to pass to the chart constructor.
        preview: A :class:`~glyphx.Figure` rendered at 340x220 with
            representative sample data. Rendered lazily on first access.

    The private ``_df`` (data used to draw the preview) and ``_fig``
    (cached preview figure) fields are excluded from both ``repr`` and
    equality comparison.
    """

    kind: str
    score: float
    reason: str
    x_col: str | None = None
    y_col: str | None = None
    hue_col: str | None = None
    extra: dict[str, Any] = field(default_factory=dict)
    # compare=False: the generated __eq__ compares field tuples, and a
    # DataFrame ``==`` yields a frame whose truth value is ambiguous, so
    # including these fields made ``rec_a == rec_b`` raise ValueError.
    _df: Any = field(default=None, repr=False, compare=False)
    _fig: Any = field(default=None, repr=False, compare=False)

    @property
    def preview(self):
        """Render and cache a 340x220 mini preview figure."""
        if self._fig is None:
            self._fig = _render_preview(self)
        return self._fig

    def __repr__(self) -> str:
        # Compact form: only the fields a user needs to tell suggestions apart.
        return (f"<Recommendation kind={self.kind!r} score={self.score:.0f} "
                f"x={self.x_col!r} y={self.y_col!r}>")
# ---------------------------------------------------------------------------
# Column profiling helpers
# ---------------------------------------------------------------------------
def _is_datetime_col(col: pd.Series) -> bool:
if pd.api.types.is_datetime64_any_dtype(col):
return True
if col.dtype == object:
sample = col.dropna().head(20)
hit = 0
for v in sample:
try:
pd.to_datetime(str(v))
hit += 1
except Exception:
pass
return hit > len(sample) * 0.8
return False
def _cardinality(col: pd.Series) -> int:
return col.nunique()
def _is_numeric(col: pd.Series) -> bool:
return pd.api.types.is_numeric_dtype(col)
def _is_categorical(col: pd.Series, max_card: int = 20) -> bool:
    """Return True for a non-numeric column with at most *max_card* distinct values."""
    if _is_numeric(col):
        return False
    return _cardinality(col) <= max_card
def _is_high_card_str(col: pd.Series) -> bool:
    """Return True for an object-dtype column with more than 50 distinct values."""
    if col.dtype != object:
        return False
    return _cardinality(col) > 50
def _distribution_shape(col: pd.Series) -> str:
"""Rough shape classifier: 'normal', 'skewed', 'bimodal', 'uniform'."""
vals = col.dropna().values.astype(float)
if len(vals) < 10:
return "unknown"
q1, med, q3 = np.percentile(vals, [25, 50, 75])
mean = vals.mean()
std = vals.std()
if std == 0:
return "constant"
skew = (mean - med) / std
if abs(skew) < 0.2:
return "normal"
if abs(skew) < 0.8:
return "skewed"
return "skewed"
def _has_trend(col: pd.Series) -> bool:
"""True if a numeric column has a monotone trend > 60% of the time."""
vals = col.dropna().values.astype(float)
if len(vals) < 5:
return False
diffs = np.diff(vals)
up = (diffs > 0).sum()
down = (diffs < 0).sum()
n = len(diffs)
return max(up, down) / n > 0.6
# ---------------------------------------------------------------------------
# Main recommendation engine
# ---------------------------------------------------------------------------
def suggest(
    df: pd.DataFrame,
    max_rows: int = 500,
    top_n: int = 5,
) -> list[Recommendation]:
    """
    Analyse a DataFrame and return ranked chart recommendations.

    The engine scores candidate chart types against the column profiles
    and returns the top ``top_n`` recommendations, each with a mini
    preview figure generated from a sample of the data.

    When ``df`` has more than ``max_rows`` rows, a reproducible random
    subset of ``max_rows`` rows is analysed instead; the subset keeps the
    rows in their original order so trend detection stays meaningful.

    Args:
        df: Input DataFrame.
        max_rows: Cap for the internal sample used for analysis (default 500).
        top_n: Maximum number of recommendations to return (default 5).

    Returns:
        List of :class:`Recommendation` objects sorted by descending score.
        An empty DataFrame yields an empty list.

    Example::

        from glyphx import suggest
        recs = suggest(df)
        for rec in recs:
            print(f"{rec.kind:15s} score={rec.score:.0f} {rec.reason}")
            rec.preview.show()
    """
    if df.empty:
        return []

    # Sample for performance
    if len(df) > max_rows:
        # Random sampling shuffles rows, which would defeat the
        # order-sensitive trend check and scramble line previews. Draw
        # row positions at random, then restore their original order.
        picked = pd.Series(np.arange(len(df))).sample(n=max_rows, random_state=42)
        sample = df.iloc[np.sort(picked.to_numpy())]
    else:
        # Copy so later mutation of the caller's frame cannot affect the
        # lazily rendered previews.
        sample = df.copy()

    # Column profiles
    num_cols = [c for c in sample.columns if _is_numeric(sample[c])]
    cat_cols = [c for c in sample.columns if _is_categorical(sample[c])]
    dt_cols = [c for c in sample.columns if _is_datetime_col(sample[c])]
    n_rows = len(sample)

    # Cardinality of the first categorical column is reused by several rules.
    cat_card = _cardinality(sample[cat_cols[0]]) if cat_cols else 0

    recs: list[Recommendation] = []

    # -- Line chart -------------------------------------------------
    if dt_cols and num_cols:
        x, y = dt_cols[0], num_cols[0]
        score = 92 if _has_trend(sample[y]) else 85
        hue = cat_cols[0] if cat_cols and cat_card <= 6 else None
        recs.append(Recommendation(
            kind="line", score=score, x_col=x, y_col=y, hue_col=hue,
            reason=f"'{x}' is a datetime axis and '{y}' shows a trend over time.",
            _df=sample,
        ))
    elif num_cols and n_rows >= 10 and _has_trend(sample[num_cols[0]]):
        y = num_cols[0]
        recs.append(Recommendation(
            kind="line", score=70, x_col=None, y_col=y,
            reason=f"'{y}' has a monotone trend -- a line chart shows it clearly.",
            _df=sample,
        ))

    # -- Bar chart --------------------------------------------------
    if cat_cols and num_cols:
        x, y = cat_cols[0], num_cols[0]
        # More categories -> more crowded bars -> lower score (floor of 30).
        score = max(30, 90 - cat_card * 2)
        hue = None
        if len(cat_cols) > 1 and _cardinality(sample[cat_cols[1]]) <= 6:
            hue = cat_cols[1]
        recs.append(Recommendation(
            kind="bar", score=score, x_col=x, y_col=y, hue_col=hue,
            reason=(f"'{x}' has {cat_card} categories and '{y}' is numeric -- "
                    "a bar chart compares groups."),
            _df=sample,
        ))

    # -- Scatter chart ----------------------------------------------
    if len(num_cols) >= 2:
        x, y = num_cols[0], num_cols[1]
        corr = abs(sample[[x, y]].dropna().corr().iloc[0, 1])
        if np.isnan(corr):
            # Correlation is undefined for a constant column or too few
            # rows; treat it as "no correlation" instead of letting
            # int(nan) raise below.
            corr = 0.0
        score = 60 + int(corr * 30)
        c = num_cols[2] if len(num_cols) > 2 else None
        hue = cat_cols[0] if cat_cols and cat_card <= 8 else None
        recs.append(Recommendation(
            kind="scatter", score=score, x_col=x, y_col=y,
            hue_col=hue,
            reason=(f"'{x}' and '{y}' are both numeric (r={corr:.2f}) -- "
                    "scatter reveals their relationship."),
            # ``is not None`` so a falsy column label such as 0 is kept.
            extra={"c": c} if c is not None else {},
            _df=sample,
        ))

    # -- Histogram --------------------------------------------------
    if num_cols:
        col = num_cols[0]
        shape = _distribution_shape(sample[col])
        score = 70 if shape in ("normal", "skewed") else 55
        recs.append(Recommendation(
            kind="hist", score=score, x_col=col, y_col=None,
            reason=(f"'{col}' is continuous ({shape} distribution) -- "
                    "a histogram shows its shape."),
            _df=sample,
        ))

    # -- Box plot ---------------------------------------------------
    if cat_cols and num_cols:
        x, y = cat_cols[0], num_cols[0]
        score = 65 if cat_card <= 10 else 45
        recs.append(Recommendation(
            kind="box", score=score, x_col=x, y_col=y,
            reason=(f"Comparing '{y}' distribution across {cat_card} groups of '{x}'."),
            _df=sample,
        ))

    # -- Heatmap ----------------------------------------------------
    if len(num_cols) >= 3 and n_rows <= 200:
        recs.append(Recommendation(
            kind="heatmap", score=60, x_col=None, y_col=None,
            reason=(f"Multiple numeric columns ({len(num_cols)}) with ≤200 rows -- "
                    "a correlation heatmap reveals relationships."),
            _df=sample,
        ))

    # -- Pie / donut ------------------------------------------------
    if cat_cols and num_cols and 2 <= cat_card <= 7:
        x, y = cat_cols[0], num_cols[0]
        recs.append(Recommendation(
            kind="donut", score=60, x_col=x, y_col=y,
            reason=(f"'{x}' has {cat_card} categories -- a donut shows part-to-whole."),
            _df=sample,
        ))

    # -- Bubble -----------------------------------------------------
    if len(num_cols) >= 3:
        x, y, s = num_cols[0], num_cols[1], num_cols[2]
        recs.append(Recommendation(
            kind="bubble", score=58, x_col=x, y_col=y,
            reason=(f"Three numeric dimensions -- bubble encodes '{s}' as size."),
            extra={"size": s},
            _df=sample,
        ))

    # -- ECDF -------------------------------------------------------
    if num_cols and n_rows >= 30:
        recs.append(Recommendation(
            kind="ecdf", score=52, x_col=num_cols[0], y_col=None,
            reason=(f"ECDF shows the full cumulative distribution of '{num_cols[0]}' "
                    "with no bin-width choice needed."),
            _df=sample,
        ))

    # -- Parallel coordinates ----------------------------------------
    if len(num_cols) >= 4:
        score = 65 if len(num_cols) <= 8 else 50
        hue = cat_cols[0] if cat_cols else None
        recs.append(Recommendation(
            kind="parallel", score=score,
            reason=(f"{len(num_cols)} numeric columns -- parallel coordinates "
                    "reveals multi-dimensional patterns."),
            extra={"axes": num_cols[:8]},
            hue_col=hue,
            _df=sample,
        ))

    # Sort and truncate (the sort is stable, so ties keep insertion order)
    recs.sort(key=lambda r: r.score, reverse=True)
    return recs[:top_n]
# ---------------------------------------------------------------------------
# Preview renderer
# ---------------------------------------------------------------------------
def _render_preview(rec: Recommendation):
    """Build a 340x220 mini Figure from the recommendation.

    The figure is titled with the chart kind and the suggested columns,
    then populated with one series chosen by ``rec.kind`` using the data
    stored in ``rec._df``. Rendering is best-effort: if building the
    series fails for any reason, the failure is logged at debug level and
    the figure created so far (title only) is returned.

    Args:
        rec: The recommendation to visualise.

    Returns:
        The preview ``Figure``.
    """
    import logging

    from .figure import Figure
    from .series import (LineSeries, BarSeries, ScatterSeries,
                         HistogramSeries, BoxPlotSeries, HeatmapSeries,
                         PieSeries, DonutSeries)

    df = rec._df
    kind = rec.kind
    x_col = rec.x_col
    y_col = rec.y_col
    hue = rec.hue_col
    max_points = 80  # keep preview fast

    fig = Figure(width=340, height=220, auto_display=False)
    title = kind.upper()
    if x_col:
        title += f" -- {x_col}"
    if y_col:
        title += f" x {y_col}"
    fig.set_title(title)

    try:
        if kind == "line":
            # A trend-only line recommendation carries no x column; plot
            # against row position instead of indexing with ``None``.
            line_cols = [y_col] if x_col is None else [x_col, y_col]
            use_positions = x_col is None or _is_datetime_col(df[x_col])
            if hue:
                colors = ["#2563eb", "#dc2626", "#16a34a", "#d97706", "#7c3aed"]
                for i, (g, gdf) in enumerate(df.groupby(hue)):
                    s = gdf[line_cols].dropna().head(max_points)
                    fig.add(LineSeries(list(range(len(s))), s[y_col].tolist(),
                                       color=colors[i % len(colors)], label=str(g), width=1.5))
            else:
                sample = df[line_cols].dropna().head(max_points)
                x_vals = list(range(len(sample))) if use_positions else sample[x_col].tolist()
                fig.add(LineSeries(x_vals, sample[y_col].tolist(), color="#2563eb", width=1.5))

        elif kind == "bar":
            agg = df.groupby(x_col)[y_col].mean().reset_index().head(12)
            fig.add(BarSeries(agg[x_col].tolist(), agg[y_col].tolist(), color="#2563eb", bar_width=0.7))

        elif kind == "scatter":
            # Filter rows once so the colour values come from exactly the
            # same rows as the plotted points.
            complete = df[[x_col, y_col]].notna().all(axis=1).to_numpy()
            s = df.loc[complete].head(max_points)
            c_col = rec.extra.get("c")
            c_vals = s[c_col].tolist() if c_col is not None and c_col in s else None
            fig.add(ScatterSeries(s[x_col].tolist(), s[y_col].tolist(),
                                  c=c_vals, cmap="viridis", size=4))

        elif kind == "hist":
            vals = df[x_col].dropna().tolist()
            fig.add(HistogramSeries(vals, bins=20, color="#2563eb"))

        elif kind == "box":
            groups = df[x_col].unique()[:6]
            datasets = [df[df[x_col] == g][y_col].dropna().tolist() for g in groups]
            fig.add(BoxPlotSeries(datasets, categories=[str(g) for g in groups],
                                  color="#7c3aed", box_width=18))

        elif kind == "heatmap":
            nums = [c for c in df.columns if _is_numeric(df[c])][:8]
            corr = df[nums].corr().values.tolist()
            fig.add(HeatmapSeries(corr, row_labels=nums, col_labels=nums,
                                  show_values=True,
                                  cmap=["#1e40af", "#93c5fd", "#f0f0f0", "#fca5a5", "#b91c1c"]))

        elif kind in ("pie", "donut"):
            agg = df.groupby(x_col)[y_col].sum().reset_index().head(7)
            cls = DonutSeries if kind == "donut" else PieSeries
            from .colormaps import colormap_colors
            fig.add(cls(agg[y_col].tolist(),
                        labels=agg[x_col].tolist(),
                        colors=colormap_colors("viridis", len(agg))))

        elif kind == "bubble":
            s_col = rec.extra.get("size", y_col)
            if s_col:
                s = df[[x_col, y_col, s_col]].dropna().head(max_points)
            else:
                s = df[[x_col, y_col]].dropna().head(max_points)
            from .bubble import BubbleSeries
            fig.add(BubbleSeries(s[x_col].tolist(), s[y_col].tolist(),
                                 size=s[s_col].tolist() if s_col in s else 10,
                                 color="#2563eb", alpha=0.65, min_radius=4, max_radius=28))

        elif kind == "ecdf":
            from .ecdf import ECDFSeries
            fig.add(ECDFSeries(df[x_col].dropna().head(200).tolist(), color="#2563eb"))

        elif kind == "parallel":
            from .parallel_coords import ParallelCoordinatesSeries
            axes = rec.extra.get("axes", [])[:6]
            s = df[axes].dropna().head(max_points)
            fig.add(ParallelCoordinatesSeries(s.values.tolist(), axes=axes,
                                              alpha=0.35, cmap="viridis"))
    except Exception:
        # Preview failure should never crash the caller, but leave a
        # trace so a blank preview can be diagnosed.
        logging.getLogger(__name__).debug(
            "Preview rendering failed for kind=%r", kind, exc_info=True
        )
    return fig