import numpy as np
from numpy.typing import NDArray
from typing import Annotated
import atdata
from atdata import ManifestField
@atdata.packable
class Observation:
"""A wildlife observation with queryable metadata."""
embedding: NDArray
species: Annotated[str, ManifestField("categorical")]
weight_kg: Annotated[float, ManifestField("numeric")]
is_adult: Annotated[bool, ManifestField("categorical")]
habitats: Annotated[list[str], ManifestField("set")]Query Cookbook
This cookbook collects practical query patterns for manifest-powered datasets. Each recipe shows both the lambda syntax (pandas-native) and the typed proxy DSL (F expressions) side by side, so you can pick whichever reads best for your use case.
1 — Build the example dataset
We model a wildlife observation log with species, weight, maturity, and habitat tags. Weight distributions vary by species to make queries interesting.
import tempfile
from pathlib import Path

# Scratch directory for the generated shards; cleaned up in the last section.
tmpdir = Path(tempfile.mkdtemp(prefix="atdata_cookbook_"))

rng = np.random.default_rng(2024)
# (mean, std) of a normal weight distribution per species, in kg.
species_weights = {
    "elephant": (3500, 800),
    "bear": (250, 80),
    "wolf": (40, 12),
    "eagle": (5, 1.5),
    "salmon": (8, 3),
    "mouse": (0.03, 0.01),
}
habitat_pool = ["forest", "desert", "ocean", "mountain", "tundra", "grassland"]

samples = []
for _ in range(800):
    sp = rng.choice(list(species_weights.keys()))
    mean, std = species_weights[sp]
    samples.append(Observation(
        embedding=rng.standard_normal(32).astype(np.float32),
        species=sp,
        # Clamp away non-positive draws, then round to 2 decimals.
        weight_kg=round(max(0.01, float(rng.normal(mean, std))), 2),
        # ~70% of observations are adults.
        is_adult=bool(rng.random() > 0.3),
        # 1-3 distinct habitat tags per observation.
        habitats=list(rng.choice(habitat_pool, size=rng.integers(1, 4), replace=False)),
    ))

# maxcount=200 splits 800 samples into 4 shards; manifest=True writes the
# queryable per-shard metadata alongside the tars.
ds = atdata.write_samples(samples, tmpdir / "wildlife.tar", maxcount=200, manifest=True)
print(f"Wrote {len(samples)} observations across {len(ds.list_shards())} shards")
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_cookbook_yfgbs7x7/wildlife-000000.tar 0 0.0 GB 0
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_cookbook_yfgbs7x7/wildlife-000001.tar 200 0.0 GB 200
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_cookbook_yfgbs7x7/wildlife-000002.tar 200 0.0 GB 400
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_cookbook_yfgbs7x7/wildlife-000003.tar 200 0.0 GB 600
Wrote 800 observations across 4 shards
2 — Load manifests and preview
import pandas as pd
from atdata import QueryExecutor
from atdata.manifest import F
executor = QueryExecutor.from_directory(tmpdir)
combined = pd.concat(
[m.samples for m in executor._manifests if not m.samples.empty],
ignore_index=True,
)
print(combined[["species", "weight_kg", "is_adult"]].describe().to_string())
print()
print(f"Species: {sorted(combined['species'].unique())}") weight_kg
count 600.000000
mean 616.406983
std 1322.182118
min 0.010000
25% 4.195000
50% 11.790000
75% 257.692500
max 5921.580000
Species: ['bear', 'eagle', 'elephant', 'mouse', 'salmon', 'wolf']
3 — Recipe: threshold filter
Find heavy animals above a weight threshold.
results_lambda = executor.query(
where=lambda df: df["weight_kg"] > 500
)
print(f"Matches (lambda): {len(results_lambda)}")Matches (lambda): 95
results_dsl = executor.query(where=F.weight_kg > 500)
print(f"Matches (DSL): {len(results_dsl)}")Matches (DSL): 95
heavy_keys = {loc.key for loc in results_dsl}
heavy_df = combined[combined["__key__"].isin(heavy_keys)]
print("\nSpecies breakdown of heavy animals (>500 kg):")
print(heavy_df["species"].value_counts().to_string())
Species breakdown of heavy animals (>500 kg):
species
elephant 95
4 — Recipe: categorical selection
bears = executor.query(where=F.species == "bear")
print(f"Bears: {len(bears)}")Bears: 112
large_animals = executor.query(where=F.species.isin(["elephant", "bear"]))
print(f"Elephants + bears: {len(large_animals)}")Elephants + bears: 207
not_small = executor.query(where=~F.species.isin(["mouse", "salmon"]))
print(f"Excluding mouse & salmon: {len(not_small)}")Excluding mouse & salmon: 398
5 — Recipe: range filter
The .between() method creates a closed-interval filter.
mid_range = executor.query(
where=lambda df: df["weight_kg"].between(10, 100)
)
print(f"Weight 10-100 kg (lambda): {len(mid_range)}")Weight 10-100 kg (lambda): 112
mid_range_dsl = executor.query(where=F.weight_kg.between(10, 100))
print(f"Weight 10-100 kg (DSL): {len(mid_range_dsl)}")Weight 10-100 kg (DSL): 112
7 — Recipe: compound queries
Combine predicates with & (AND), | (OR), and ~ (NOT).
adult_bears = executor.query(
where=(F.species == "bear") & (F.is_adult == True)
)
print(f"Adult bears: {len(adult_bears)}")Adult bears: 73
extreme = executor.query(
where=(F.species == "elephant") | (F.weight_kg < 0.1)
)
print(f"Elephants or very light (<0.1 kg): {len(extreme)}")Elephants or very light (<0.1 kg): 202
juveniles = executor.query(where=~(F.is_adult == True))
print(f"Juveniles: {len(juveniles)}")Juveniles: 181
8 — Scatter plot: query results highlighted
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
query_pred = (F.species.isin(["elephant", "bear"])) & (F.is_adult == True)
match_locs = executor.query(where=query_pred)
match_keys = {loc.key for loc in match_locs}
species_list = sorted(combined["species"].unique())
cmap = plt.cm.Set2
species_colors = {sp: cmap(i / len(species_list)) for i, sp in enumerate(species_list)}
fig, ax = plt.subplots(figsize=(8, 4))
for sp in species_list:
mask = combined["species"] == sp
subset = combined[mask]
ax.scatter(
subset.index, subset["weight_kg"],
c=[species_colors[sp]], label=sp,
s=15, alpha=0.6, linewidths=0,
)
# Highlight matches
match_mask = combined["__key__"].isin(match_keys)
ax.scatter(
combined[match_mask].index,
combined[match_mask]["weight_kg"],
facecolors="none", edgecolors="#D32F2F", s=50, linewidths=1.5,
label="query match", zorder=5,
)
ax.set_xlabel("Sample index")
ax.set_ylabel("Weight (kg)")
ax.set_yscale("log")
ax.set_title("Adult elephants & bears highlighted", fontweight="bold")
ax.legend(fontsize=8, ncol=4, loc="upper center", bbox_to_anchor=(0.5, -0.15))
ax.spines[["top", "right"]].set_visible(False)
plt.tight_layout()
plt.show()/var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/ipykernel_23348/4089515773.py:40: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
plt.show()
9 — Query hit rate per shard
This chart shows how matches distribute across shards. When aggregate pruning is active, shards with zero hits can be skipped entirely.
from collections import Counter
shard_hits = Counter(loc.shard for loc in match_locs)
shards_all = sorted({m.shard_id for m in executor._manifests})
hit_counts = [shard_hits.get(s, 0) for s in shards_all]
shard_labels = [Path(s).stem for s in shards_all]
fig, ax = plt.subplots(figsize=(6, 3))
colors = ["#43A047" if c > 0 else "#BDBDBD" for c in hit_counts]
ax.bar(shard_labels, hit_counts, color=colors, edgecolor="white")
for i, count in enumerate(hit_counts):
if count > 0:
ax.text(i, count + 0.5, str(count), ha="center", fontweight="bold", fontsize=10)
ax.set_ylabel("Matching samples")
ax.set_title("Query Hits per Shard", fontweight="bold")
ax.spines[["top", "right"]].set_visible(False)
plt.tight_layout()
plt.show()/var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/ipykernel_23348/1217293230.py:21: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
plt.show()
In production, QueryExecutor checks each shard’s aggregate statistics before loading its parquet file. If the aggregates prove no samples can match (e.g., a shard’s max weight is below your threshold), the parquet is never read—saving I/O on large datasets.
10 — Building a filtered subset
Query results are SampleLocation objects with shard path, key, and byte offset. Group them by shard for downstream processing.
from itertools import groupby
from operator import attrgetter
sorted_locs = sorted(match_locs, key=attrgetter("shard"))
for shard, group in groupby(sorted_locs, key=attrgetter("shard")):
keys = [loc.key for loc in group]
print(f" {Path(shard).stem}: {len(keys)} matches (first 3: {keys[:3]})") wildlife: 49 matches (first 3: ['fa3119bc-0173-11f1-8000-000000000000', 'fa312560-0173-11f1-8000-000000000000', 'fa312af6-0173-11f1-8000-000000000000'])
wildlife-000000: 39 matches (first 3: ['fa328b4e-0173-11f1-8000-000000000000', 'fa328ca2-0173-11f1-8000-000000000000', 'fa328df6-0173-11f1-8000-000000000000'])
wildlife-000001: 52 matches (first 3: ['fa33c22a-0173-11f1-8000-000000000000', 'fa33cfc2-0173-11f1-8000-000000000000', 'fa33d814-0173-11f1-8000-000000000000'])
11 — Clean up
import shutil
shutil.rmtree(tmpdir, ignore_errors=True)Key takeaways
| Pattern | Lambda | Typed DSL |
|---|---|---|
| Threshold | `df["weight"] > 50` | `F.weight > 50` |
| Equality | `df["species"] == "bear"` | `F.species == "bear"` |
| Set membership | `df["species"].isin([...])` | `F.species.isin([...])` |
| Range | `df["x"].between(a, b)` | `F.x.between(a, b)` |
| AND | `(cond1) & (cond2)` | `(cond1) & (cond2)` |
| OR | `(cond1) \| (cond2)` | `(cond1) \| (cond2)` |
| NOT | `~(condition)` | `~(condition)` |
| Tag contains | `df["tags"].apply(...)` | (use lambda) |