Dataset Profiler

Profile a dataset visually using manifest aggregates and matplotlib—without reading raw tar data

Manifests record per-shard statistics and per-sample metadata at write time. That means you can profile an entire dataset — class distributions, numeric ranges, tag frequencies — without ever cracking open a tar file.

This example generates a synthetic NLP sentiment corpus, writes it with manifests, then builds a visual profile using only the sidecar files.

1 — Define a sentiment sample type

import numpy as np
from numpy.typing import NDArray
from typing import Annotated
import atdata
from atdata import ManifestField


@atdata.packable
class SentimentSample:
    """A text-embedding sample with rich metadata for profiling."""

    embedding: NDArray
    sentiment: Annotated[str, ManifestField("categorical")]
    confidence: Annotated[float, ManifestField("numeric")]
    language: Annotated[str, ManifestField("categorical")]
    tags: Annotated[list[str], ManifestField("set")]

The embedding field (NDArray) is automatically excluded from the manifest. The other four fields each get an appropriate aggregate collector.
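
You can double-check which fields will be tracked before writing anything, using only standard-library introspection. This is a minimal sketch that assumes the @atdata.packable decorator preserves class annotations (as dataclass-style decorators do) and that ManifestField entries are the only Annotated metadata on this class:

from typing import get_type_hints

# Annotated metadata survives in __metadata__ when include_extras=True
hints = get_type_hints(SentimentSample, include_extras=True)
for name, hint in hints.items():
    tracked = bool(getattr(hint, "__metadata__", ()))
    print(f"{name}: {'in manifest' if tracked else 'excluded'}")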

2 — Generate 1,000 synthetic samples

We use weighted distributions to make the profile interesting: sentiment is skewed positive, confidence follows a beta distribution, and language has realistic frequency skew.

rng = np.random.default_rng(2024)

sentiments = rng.choice(
    ["positive", "negative", "neutral"],
    size=1_000,
    p=[0.50, 0.20, 0.30],
)

confidences = rng.beta(5, 2, size=1_000)  # mass concentrated toward 1.0 (left-skewed)

languages = rng.choice(
    ["en", "es", "fr", "de", "ja"],
    size=1_000,
    p=[0.55, 0.15, 0.12, 0.10, 0.08],
)

tag_pool = ["formal", "informal", "slang", "technical", "conversational",
            "academic", "casual", "literary"]

samples = [
    SentimentSample(
        embedding=rng.standard_normal(64).astype(np.float32),
        sentiment=str(sentiments[i]),
        confidence=round(float(confidences[i]), 4),
        language=str(languages[i]),
        tags=list(rng.choice(tag_pool, size=rng.integers(1, 4), replace=False)),
    )
    for i in range(1_000)
]

print(f"Generated {len(samples)} samples")
print(f"Embedding shape: {samples[0].embedding.shape}")
print(f"Example: sentiment={samples[0].sentiment!r}, confidence={samples[0].confidence}")
Generated 1000 samples
Embedding shape: (64,)
Example: sentiment='negative', confidence=0.7578
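
As a quick sanity check that the weights took effect, compare the empirical frequencies against the target probabilities; a minimal numpy-only sketch:

# empirical frequencies should land near the target p = [0.50, 0.20, 0.30]
values, counts = np.unique(sentiments, return_counts=True)
for value, count in zip(values, counts):
    print(f"{value:>8}: {count / sentiments.size:.3f}")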

3 — Write shards with manifests

import tempfile
from pathlib import Path

tmpdir = Path(tempfile.mkdtemp(prefix="atdata_profiler_"))

ds = atdata.write_samples(
    samples,
    tmpdir / "sentiment.tar",
    maxcount=250,
    manifest=True,
)

shards = ds.list_shards()
manifests = sorted(tmpdir.glob("*.manifest.*"))
print(f"Wrote {len(shards)} shards with {len(manifests)} manifest files")
for m in manifests:
    print(f"  {m.name}")
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_profiler_qx6hkqfm/sentiment-000000.tar 0 0.0 GB 0
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_profiler_qx6hkqfm/sentiment-000001.tar 250 0.0 GB 250
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_profiler_qx6hkqfm/sentiment-000002.tar 250 0.0 GB 500
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_profiler_qx6hkqfm/sentiment-000003.tar 250 0.0 GB 750
Wrote 4 shards with 8 manifest files
  sentiment-000000.manifest.json
  sentiment-000000.manifest.parquet
  sentiment-000001.manifest.json
  sentiment-000001.manifest.parquet
  sentiment-000002.manifest.json
  sentiment-000002.manifest.parquet
  sentiment-000003.manifest.json
  sentiment-000003.manifest.parquet
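
The sidecars are ordinary files. As a quick check that the parquet half is readable on its own, open one with pandas alone, no atdata required (this assumes a parquet engine such as pyarrow is installed):

import pandas as pd

# each .manifest.parquet is a plain per-sample metadata table
df0 = pd.read_parquet(tmpdir / "sentiment-000000.manifest.parquet")
print(df0.shape)
print(list(df0.columns))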

4 — Load manifests and preview

Note

Everything from here on uses only the manifest sidecar files — the .manifest.json headers and .manifest.parquet per-sample tables. The raw tar data is never read.

import pandas as pd
from atdata import QueryExecutor

executor = QueryExecutor.from_directory(tmpdir)
combined = pd.concat(
    [m.samples for m in executor._manifests if not m.samples.empty],
    ignore_index=True,
)

print(f"Total samples in manifests: {len(combined)}")
print(f"Columns: {list(combined.columns)}")
print()
print(combined[["sentiment", "confidence", "language"]].head(8).to_string(index=False))
Total samples in manifests: 750
Columns: ['__key__', '__offset__', '__size__', 'sentiment', 'confidence', 'language', 'tags']

sentiment  confidence language
 negative      0.7578       en
 positive      0.8524       en
 positive      0.7038       en
  neutral      0.9045       de
  neutral      0.8305       es
 positive      0.5754       en
 positive      0.8700       en
 positive      0.7343       de

5 — Class distribution (bar chart)

import matplotlib
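# Agg renders figures off-screen, so plt.show() opens no window; when running
# as a script, save with fig.savefig(...) instead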
matplotlib.use("Agg")
import matplotlib.pyplot as plt

sentiment_counts = combined["sentiment"].value_counts()
colors = {"positive": "#4CAF50", "neutral": "#9E9E9E", "negative": "#F44336"}

fig, ax = plt.subplots(figsize=(6, 3.5))
bars = ax.bar(
    sentiment_counts.index,
    sentiment_counts.values,
    color=[colors.get(s, "#2196F3") for s in sentiment_counts.index],
    edgecolor="white",
    linewidth=0.8,
)
for bar, count in zip(bars, sentiment_counts.values):
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 8,
            str(count), ha="center", va="bottom", fontweight="bold", fontsize=11)

ax.set_ylabel("Count")
ax.set_title("Sentiment Distribution", fontweight="bold")
ax.spines[["top", "right"]].set_visible(False)
plt.tight_layout()
plt.show()

6 — Confidence score distribution (histogram)

The numeric aggregate in each manifest JSON header gives us min, max, and mean without touching parquet. Below we read shard 0's aggregate, then compute the same statistics dataset-wide and overlay them on the full histogram.

import json

manifest_json = sorted(tmpdir.glob("*.manifest.json"))[0]
with open(manifest_json) as f:
    header = json.load(f)

conf_agg = header["aggregates"]["confidence"]
print(f"Aggregate from shard 0: {conf_agg}")
Aggregate from shard 0: {'type': 'numeric', 'min': 0.201, 'max': 0.9863, 'mean': 0.7155908000000004, 'count': 250}
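
Because each header stores count alongside min, max, and mean, the per-shard aggregates compose into exact dataset-level statistics using nothing but the JSON sidecars. A minimal sketch, assuming every shard header carries the same aggregate shape shown above:

total = 0
weighted_sum = 0.0
lo, hi = float("inf"), float("-inf")
for path in sorted(tmpdir.glob("*.manifest.json")):
    with open(path) as f:
        agg = json.load(f)["aggregates"]["confidence"]
    total += agg["count"]
    weighted_sum += agg["mean"] * agg["count"]
    lo, hi = min(lo, agg["min"]), max(hi, agg["max"])

# the count-weighted mean of shard means is the exact global mean
print(f"From headers only: min={lo:.4f} max={hi:.4f} mean={weighted_sum / total:.4f}")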
fig, ax = plt.subplots(figsize=(6, 3.5))
ax.hist(combined["confidence"], bins=30, color="#42A5F5", edgecolor="white",
        linewidth=0.5, alpha=0.85)

overall_mean = combined["confidence"].mean()
overall_min = combined["confidence"].min()
overall_max = combined["confidence"].max()

ax.axvline(overall_mean, color="#E65100", linewidth=2, linestyle="--",
           label=f"mean = {overall_mean:.3f}")
ax.axvline(overall_min, color="#78909C", linewidth=1, linestyle=":",
           label=f"min = {overall_min:.3f}")
ax.axvline(overall_max, color="#78909C", linewidth=1, linestyle=":",
           label=f"max = {overall_max:.3f}")

ax.set_xlabel("Confidence")
ax.set_ylabel("Count")
ax.set_title("Confidence Score Distribution", fontweight="bold")
ax.legend(fontsize=9)
ax.spines[["top", "right"]].set_visible(False)
plt.tight_layout()
plt.show()

7 — Language breakdown (horizontal bar)

lang_counts = combined["language"].value_counts().sort_values()
total = lang_counts.sum()

fig, ax = plt.subplots(figsize=(6, 3))
bars = ax.barh(lang_counts.index, lang_counts.values, color="#7E57C2",
               edgecolor="white", linewidth=0.5)

for bar, count in zip(bars, lang_counts.values):
    pct = count / total * 100
    ax.text(bar.get_width() + 5, bar.get_y() + bar.get_height() / 2,
            f"{count}  ({pct:.0f}%)", va="center", fontsize=9)

ax.set_xlabel("Count")
ax.set_title("Language Breakdown", fontweight="bold")
ax.spines[["top", "right"]].set_visible(False)
plt.tight_layout()
plt.show()

8 — Tag frequency (lollipop chart)

Tags are stored as lists per sample. Exploding and counting gives us a frequency profile.

tag_series = combined["tags"].explode()
tag_counts = tag_series.value_counts().sort_values()

fig, ax = plt.subplots(figsize=(6, 3.5))
ax.hlines(y=tag_counts.index, xmin=0, xmax=tag_counts.values,
          color="#26A69A", linewidth=2)
ax.plot(tag_counts.values, tag_counts.index, "o", color="#00796B", markersize=7)

ax.set_xlabel("Occurrences")
ax.set_title("Tag Frequency", fontweight="bold")
ax.spines[["top", "right"]].set_visible(False)
plt.tight_layout()
plt.show()

9 — Cross-tabulation: sentiment × language

A styled heatmap reveals how sentiment distributes across languages.

ct = pd.crosstab(combined["sentiment"], combined["language"])
print(ct.to_string())
language   de   en  es  fr  ja
sentiment                     
negative   19   87  22  16  12
neutral    16  111  42  32  22
positive   38  208  66  35  24
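
Raw counts mostly mirror the language sizes. To see the sentiment mix within each language, normalize the columns; pd.crosstab supports this directly:

# each column sums to 100: the sentiment mix within one language
ct_pct = pd.crosstab(combined["sentiment"], combined["language"], normalize="columns")
print((ct_pct * 100).round(1).to_string())
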
fig, ax = plt.subplots(figsize=(6, 3.5))
im = ax.imshow(ct.values, cmap="YlOrRd", aspect="auto")

ax.set_xticks(range(len(ct.columns)))
ax.set_xticklabels(ct.columns)
ax.set_yticks(range(len(ct.index)))
ax.set_yticklabels(ct.index)

for i in range(len(ct.index)):
    for j in range(len(ct.columns)):
        ax.text(j, i, str(ct.values[i, j]), ha="center", va="center",
                fontweight="bold", fontsize=10,
                color="white" if ct.values[i, j] > ct.values.max() * 0.6 else "black")

ax.set_title("Sentiment x Language", fontweight="bold")
fig.colorbar(im, ax=ax, shrink=0.8)
plt.tight_layout()
plt.show()

10 — Query-driven profiling

Let’s profile just the low-confidence samples to see if certain sentiments or languages are harder to classify.

low_conf = executor.query(where=lambda df: df["confidence"] < 0.5)
print(f"Low-confidence samples (<0.5): {len(low_conf)}")

low_keys = {loc.key for loc in low_conf}
low_df = combined[combined["__key__"].isin(low_keys)]

print("\nSentiment breakdown of low-confidence samples:")
print(low_df["sentiment"].value_counts().to_string())

print("\nLanguage breakdown of low-confidence samples:")
print(low_df["language"].value_counts().to_string())
Low-confidence samples (<0.5): 94

Sentiment breakdown of low-confidence samples:
sentiment
positive    50
neutral     23
negative    21

Language breakdown of low-confidence samples:
language
en    39
es    17
de    16
fr    11
ja    11
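
Counts alone track corpus size more than difficulty. To judge whether a language is genuinely harder, compare each group's low-confidence rate; a short sketch reusing the frames already built:

# share of each language's samples that fall below the 0.5 threshold
low_rate = (
    low_df["language"].value_counts()
    / combined["language"].value_counts()
    * 100
).round(1)
print("Low-confidence rate by language (%):")
print(low_rate.sort_values(ascending=False).to_string())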

11 — Clean up

import shutil

shutil.rmtree(tmpdir, ignore_errors=True)

Key takeaways

| Concept | API |
| --- | --- |
| Annotate manifest fields | `Annotated[T, ManifestField("categorical")]` |
| Write with manifests | `write_samples(samples, path, manifest=True)` |
| Load sidecar manifests | `QueryExecutor.from_directory(path)` |
| Combine per-sample metadata | `pd.concat([m.samples for m in executor._manifests])` |
| Profile without tar access | Aggregates + parquet give the full statistical profile |
| Query-driven subsets | `executor.query(where=...)` returns a SampleLocation list |