Manifest-Powered Queries

Build per-shard manifests and query samples by metadata without scanning raw data

Large datasets become unwieldy when you need to find specific samples. atdata’s manifest system records per-shard metadata at write time, then lets you query across shards using pandas predicates—without ever opening the tar files themselves.

This example builds a labeled image dataset with manifests, writes them to JSON + Parquet sidecar files, and queries for specific samples.

1 — Define a sample type with manifest-aware fields

Primitive fields (str, int, float, bool, list) are automatically included in manifests with inferred aggregate types. You can also use Annotated[..., ManifestField(...)] for explicit control.

import numpy as np
from numpy.typing import NDArray
from typing import Annotated
import atdata
from atdata import ManifestField


@atdata.packable
class LabeledImage:
    """Image sample with queryable metadata.

    Each ``ManifestField`` annotation opts the field into the per-shard
    manifest with an explicit aggregate type (categorical / numeric / set).
    """

    # Raw image data; carries no ManifestField annotation and does not
    # appear in the resolved manifest fields (see step 2 output).
    pixels: NDArray
    # Tracks the set of distinct label values (cardinality + value counts).
    label: Annotated[str, ManifestField("categorical")]
    # Tracks min / max / mean / count across samples.
    confidence: Annotated[float, ManifestField("numeric")]
    # Tracks the union of all tag values across samples.
    tags: Annotated[list[str], ManifestField("set")]

The ManifestField annotations tell atdata which aggregate statistics to collect:

  • categorical — tracks the set of distinct values
  • numeric — tracks min, max, mean, count
  • set — tracks the union of all values across list fields

2 — See which fields are manifest-included

from atdata.manifest import resolve_manifest_fields

# Ask atdata which fields of LabeledImage are manifest-included and how
# each one will be aggregated at write time.
fields = resolve_manifest_fields(LabeledImage)
for name in fields:
    mf = fields[name]
    print(f"  {name:15s} -> {mf.aggregate}")
  label           -> categorical
  confidence      -> numeric
  tags            -> set

3 — Generate data and write shards with manifests

Pass manifest=True to write_samples and atdata builds the sidecar manifest files automatically—no manual ManifestBuilder plumbing needed.

import tempfile
from pathlib import Path

tmpdir = Path(tempfile.mkdtemp(prefix="atdata_manifest_"))
rng = np.random.default_rng(7)  # fixed seed -> reproducible example output

categories = ["cat", "dog", "bird", "fish", "horse"]
all_tags = ["outdoor", "indoor", "closeup", "wide", "blurry", "sharp"]
num_samples = 300

# Build the samples one at a time. The RNG calls occur in the same order
# per sample (pixels, label index, confidence, tag count, tag choice), so
# the generated dataset is identical to a comprehension-based build.
samples = []
for _ in range(num_samples):
    pixels = rng.integers(0, 255, (32, 32, 3), dtype=np.uint8)
    label = categories[rng.integers(0, len(categories))]
    confidence = round(float(rng.uniform(0.5, 1.0)), 3)
    tags = list(rng.choice(all_tags, size=rng.integers(1, 4), replace=False))
    samples.append(
        LabeledImage(pixels=pixels, label=label, confidence=confidence, tags=tags)
    )

# Shard the sample stream into tar files of at most `maxcount` samples.
# manifest=True additionally emits a JSON + Parquet manifest sidecar
# alongside every shard.
shard_path = tmpdir / "images.tar"
ds = atdata.write_samples(
    samples,
    shard_path,
    maxcount=100,       # 300 samples / 100 per shard -> 3 shards
    manifest=True,
)

print(f"Wrote {num_samples} samples across {len(ds.list_shards())} shards")
print(f"Manifest files: {sorted(p.name for p in tmpdir.glob('*.manifest.*'))}")
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_manifest_001lx4pe/images-000000.tar 0 0.0 GB 0
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_manifest_001lx4pe/images-000001.tar 100 0.0 GB 100
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_manifest_001lx4pe/images-000002.tar 100 0.0 GB 200
Wrote 300 samples across 3 shards
Manifest files: ['images-000000.manifest.json', 'images-000000.manifest.parquet', 'images-000001.manifest.json', 'images-000001.manifest.parquet', 'images-000002.manifest.json', 'images-000002.manifest.parquet']

4 — Inspect a manifest

Each shard gets a JSON header (aggregates + metadata) and a Parquet file (per-sample columns).

import json

# Read the JSON header of the first shard's manifest. The header holds the
# shard-level aggregates; the sibling .parquet file holds per-sample rows.
manifest_json = min(tmpdir.glob("*.manifest.json"))
header = json.loads(manifest_json.read_text())

print("Shard:", header["shard_id"])
print("Samples:", header["num_samples"])
print()
print("Aggregates:")
for field_name in header["aggregates"]:
    agg = header["aggregates"][field_name]
    print(f"  {field_name}: {agg}")
Shard: images-000000
Samples: 100

Aggregates:
  label: {'type': 'categorical', 'cardinality': 5, 'value_counts': {'horse': 22, 'bird': 26, 'fish': 25, 'dog': 12, 'cat': 15}}
  confidence: {'type': 'numeric', 'min': 0.509, 'max': 0.996, 'mean': 0.7610199999999999, 'count': 100}
  tags: {'type': 'set', 'all_values': ['blurry', 'closeup', 'indoor', 'outdoor', 'sharp', 'wide']}

5 — Query across shards

QueryExecutor loads all manifests from a directory and runs a pandas predicate over the per-sample Parquet data.

from atdata import QueryExecutor

executor = QueryExecutor.from_directory(tmpdir)


# The predicate is an ordinary pandas boolean mask over the per-sample
# manifest columns — a named function works just as well as a lambda.
def _is_confident_dog(df):
    return (df["confidence"] > 0.9) & (df["label"] == "dog")


results = executor.query(where=_is_confident_dog)

print(f"Found {len(results)} high-confidence dog samples")
for loc in results[:5]:
    print(f"  shard={loc.shard}  key={loc.key}")
Found 6 high-confidence dog samples
  shard=images-000000  key=69518926-0179-11f1-8000-000000000000
  shard=images-000000  key=6951fb04-0179-11f1-8000-000000000000
  shard=images-000000  key=69520b4e-0179-11f1-8000-000000000000
  shard=images-000000  key=69520ffe-0179-11f1-8000-000000000000
  shard=images-000000  key=69524e6a-0179-11f1-8000-000000000000

6 — Compound queries

The predicate receives a full DataFrame, so you can compose arbitrarily complex filters.

# Samples tagged "outdoor" with confidence below 0.7.
def _has_outdoor_tag(tags):
    # Non-list cells are treated as "no tags", matching the original guard.
    return isinstance(tags, list) and "outdoor" in tags


results = executor.query(
    where=lambda df: (df["confidence"] < 0.7)
    & (df["tags"].apply(_has_outdoor_tag))
)

print(f"Found {len(results)} low-confidence outdoor samples")
Found 22 low-confidence outdoor samples
# Count samples per label across all shards.
# NOTE(review): `executor._manifests` is a private attribute — confirm
# whether QueryExecutor exposes a public accessor for the per-shard data.
import pandas as pd

shard_frames = (m.samples for m in executor._manifests)
all_dfs = [df for df in shard_frames if not df.empty]
combined = pd.concat(all_dfs, ignore_index=True)
print(combined["label"].value_counts().to_string())
label
fish     50
bird     45
horse    38
dog      36
cat      31

7 — Clean up

import shutil

# Best-effort removal of the temporary dataset directory created in step 3;
# ignore_errors avoids raising if files were already removed.
shutil.rmtree(tmpdir, ignore_errors=True)

Key takeaways

Concept API
Mark fields for manifests Annotated[T, ManifestField("categorical")]
Resolve manifest fields resolve_manifest_fields(SampleType)
Write with manifests write_samples(samples, path, manifest=True)
Query across shards QueryExecutor.from_directory(path).query(where=...)
Locate matching samples Returns list[SampleLocation] with shard + key