# Manifest-Powered Queries

Large datasets become unwieldy when you need to find specific samples. atdata's manifest system records per-shard metadata at write time, then lets you query across shards using pandas predicates, without ever opening the tar files themselves.

This example builds a labeled image dataset with manifests, writes them to JSON + Parquet sidecar files, and queries for specific samples.

## 1 — Define a sample type with manifest-aware fields

Primitive fields (str, int, float, bool, list) are automatically included in manifests with inferred aggregate types. You can also use Annotated[..., ManifestField(...)] for explicit control.

```python
import numpy as np
from numpy.typing import NDArray
from typing import Annotated

import atdata
from atdata import ManifestField


@atdata.packable
class LabeledImage:
    """Image sample with queryable metadata."""

    pixels: NDArray
    label: Annotated[str, ManifestField("categorical")]
    confidence: Annotated[float, ManifestField("numeric")]
    tags: Annotated[list[str], ManifestField("set")]
```
The ManifestField annotations tell atdata which aggregate statistics to collect:
- categorical — tracks the set of distinct values
- numeric — tracks min, max, mean, count
- set — tracks the union of all values across list fields
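For fields left unannotated, atdata picks the aggregate type itself. As a point of comparison, here is a minimal sketch of a type that relies entirely on inference; the mapping suggested in the comments (str to categorical, float to numeric, list to set) is an assumption for illustration, not behavior confirmed by this example:

```python
# Sketch only: no ManifestField annotations, relying on inferred aggregates.
# The mappings noted below are assumptions, not documented behavior.
@atdata.packable
class PlainSample:
    pixels: NDArray      # not a primitive field, so not expected in the manifest
    label: str           # assumed to infer a categorical aggregate
    confidence: float    # assumed to infer a numeric aggregate
    tags: list[str]      # assumed to infer a set aggregate
```

Step 2's resolve_manifest_fields shows what was actually inferred for any given type.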
## 2 — See which fields are manifest-included

```python
from atdata.manifest import resolve_manifest_fields

fields = resolve_manifest_fields(LabeledImage)
for name, mf in fields.items():
    print(f" {name:15s} -> {mf.aggregate}")
```

```
 label           -> categorical
 confidence      -> numeric
 tags            -> set
```
## 3 — Generate data and write shards with manifests
Pass manifest=True to write_samples and atdata builds the sidecar manifest files automatically—no manual ManifestBuilder plumbing needed.
```python
import tempfile
from pathlib import Path

tmpdir = Path(tempfile.mkdtemp(prefix="atdata_manifest_"))

rng = np.random.default_rng(7)
categories = ["cat", "dog", "bird", "fish", "horse"]
all_tags = ["outdoor", "indoor", "closeup", "wide", "blurry", "sharp"]
num_samples = 300

samples = [
    LabeledImage(
        pixels=rng.integers(0, 255, (32, 32, 3), dtype=np.uint8),
        label=categories[rng.integers(0, len(categories))],
        confidence=round(float(rng.uniform(0.5, 1.0)), 3),
        tags=list(rng.choice(all_tags, size=rng.integers(1, 4), replace=False)),
    )
    for _ in range(num_samples)
]

ds = atdata.write_samples(
    samples,
    tmpdir / "images.tar",
    maxcount=100,   # 3 shards of 100 samples each
    manifest=True,  # auto-build JSON + Parquet sidecar manifests
)

print(f"Wrote {num_samples} samples across {len(ds.list_shards())} shards")
print(f"Manifest files: {sorted(p.name for p in tmpdir.glob('*.manifest.*'))}")
```

```
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_manifest_001lx4pe/images-000000.tar 0 0.0 GB 0
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_manifest_001lx4pe/images-000001.tar 100 0.0 GB 100
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_manifest_001lx4pe/images-000002.tar 100 0.0 GB 200
Wrote 300 samples across 3 shards
Manifest files: ['images-000000.manifest.json', 'images-000000.manifest.parquet', 'images-000001.manifest.json', 'images-000001.manifest.parquet', 'images-000002.manifest.json', 'images-000002.manifest.parquet']
```
## 4 — Inspect a manifest
Each shard gets a JSON header (aggregates + metadata) and a Parquet file (per-sample columns).
```python
import json

manifest_json = sorted(tmpdir.glob("*.manifest.json"))[0]
with open(manifest_json) as f:
    header = json.load(f)

print("Shard:", header["shard_id"])
print("Samples:", header["num_samples"])
print()
print("Aggregates:")
for field_name, agg in header["aggregates"].items():
    print(f" {field_name}: {agg}")
```

```
Shard: images
Samples: 100

Aggregates:
 label: {'type': 'categorical', 'cardinality': 5, 'value_counts': {'horse': 22, 'bird': 26, 'fish': 25, 'dog': 12, 'cat': 15}}
 confidence: {'type': 'numeric', 'min': 0.509, 'max': 0.996, 'mean': 0.7610199999999999, 'count': 100}
 tags: {'type': 'set', 'all_values': ['blurry', 'closeup', 'indoor', 'outdoor', 'sharp', 'wide']}
```
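The Parquet half of the sidecar holds the per-sample rows and can be read directly with pandas. A small sketch; the exact column set is whatever atdata recorded, with the manifest fields (label, confidence, tags) expected among the columns:

```python
import pandas as pd

# Peek at the per-sample rows in one Parquet sidecar.
manifest_parquet = sorted(tmpdir.glob("*.manifest.parquet"))[0]
per_sample = pd.read_parquet(manifest_parquet)

print(per_sample.columns.tolist())
print(per_sample.head())
```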
## 5 — Query across shards
QueryExecutor loads all manifests from a directory and runs a pandas predicate over the per-sample Parquet data.
```python
from atdata import QueryExecutor

executor = QueryExecutor.from_directory(tmpdir)

# Find high-confidence dog samples
results = executor.query(
    where=lambda df: (df["confidence"] > 0.9) & (df["label"] == "dog")
)

print(f"Found {len(results)} high-confidence dog samples")
for loc in results[:5]:
    print(f" shard={loc.shard} key={loc.key}")
```

```
Found 6 high-confidence dog samples
 shard=images key=69518926-0179-11f1-8000-000000000000
 shard=images-000000 key=6951fb04-0179-11f1-8000-000000000000
 shard=images-000000 key=69520b4e-0179-11f1-8000-000000000000
 shard=images-000000 key=69520ffe-0179-11f1-8000-000000000000
 shard=images-000000 key=69524e6a-0179-11f1-8000-000000000000
```
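Each result exposes shard and key attributes (as printed above), so ordinary Python is enough for post-processing. For example, a quick tally of matches per shard:

```python
from collections import Counter

# Count how many matching samples each shard contributed.
per_shard = Counter(loc.shard for loc in results)
for shard, count in per_shard.most_common():
    print(f" {shard}: {count} matches")
```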
## 6 — Compound queries
The predicate receives a full DataFrame, so you can compose arbitrarily complex filters.
```python
# Samples tagged "outdoor" with confidence below 0.7
results = executor.query(
    where=lambda df: (df["confidence"] < 0.7)
    & (df["tags"].apply(lambda t: "outdoor" in t if isinstance(t, list) else False))
)

print(f"Found {len(results)} low-confidence outdoor samples")
```

```
Found 22 low-confidence outdoor samples
```

```python
# Count samples per label across all shards
import pandas as pd

all_dfs = [m.samples for m in executor._manifests if not m.samples.empty]
combined = pd.concat(all_dfs, ignore_index=True)
print(combined["label"].value_counts().to_string())
```

```
label
fish     50
bird     45
horse    38
dog      36
cat      31
```
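The per-label counts can also be derived without touching the per-sample Parquet data, by summing the categorical value_counts recorded in each shard's JSON header (the structure shown in step 4). A small sketch, assuming every header carries that aggregate for label:

```python
import json
from collections import Counter

# Sum the per-shard value_counts aggregates from the JSON headers.
totals = Counter()
for path in sorted(tmpdir.glob("*.manifest.json")):
    header = json.loads(path.read_text())
    totals.update(header["aggregates"]["label"]["value_counts"])

for label, count in totals.most_common():
    print(f"{label:8s} {count}")
```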
## 7 — Clean up

```python
import shutil

shutil.rmtree(tmpdir, ignore_errors=True)
```

## Key takeaways
| Concept | API |
|---|---|
| Mark fields for manifests | Annotated[T, ManifestField("categorical")] |
| Resolve manifest fields | resolve_manifest_fields(SampleType) |
| Write with manifests | write_samples(samples, path, manifest=True) |
| Query across shards | QueryExecutor.from_directory(path).query(where=...) |
| Locate matching samples | Returns list[SampleLocation] with shard + key |