import numpy as np
from numpy.typing import NDArray
import atdata
@atdata.packable
class TextSample:
    """Simple text document."""

    text: str  # raw document body
    language: str  # language tag, e.g. "en" or "es"
    word_count: int  # number of whitespace-separated words in `text`
@atdata.packable
class EmbeddingSample:
"""Dense embedding with label."""
vector: NDArray
label: str
source: strIndex-Managed Datasets
Writing raw tar files is fine for quick experiments, but as your dataset collection grows you need discovery, versioning, and metadata. The Index class provides this layer—backed by SQLite by default—and pairs with LocalDiskStore for persistent file storage.
This example creates an index, writes two datasets into it, lists and retrieves them, and shows how load_dataset resolves from the index.
1 — Define sample types
2 — Create an Index with local disk storage
By default Index() uses an in-memory SQLite database. We pass a custom path to persist across the example, and a LocalDiskStore rooted in a temp directory.
# Scratch locations for this example: the SQLite catalog lives at index.db,
# shard files under data/.  Everything is removed in the cleanup step.
import tempfile
from pathlib import Path
tmpdir = Path(tempfile.mkdtemp(prefix="atdata_index_"))
# Index() would default to an in-memory SQLite DB; passing `path` persists it,
# and LocalDiskStore keeps shard files on local disk rooted at `root`.
index = atdata.Index(
path=tmpdir / "index.db",
data_store=atdata.LocalDiskStore(root=tmpdir / "data"),
)
print(f"Index DB : {tmpdir / 'index.db'}")
print(f"Data root : {tmpdir / 'data'}")
Index DB : /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_index_fvd0vvxj/index.db
Data root : /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_index_fvd0vvxj/data
3 — Write datasets through the index
index.write() serializes samples to sharded tars via the data store and creates a tracked entry with a content-addressed CID.
rng = np.random.default_rng(99)

# --- Dataset 1: text documents ---
def _make_doc(i: int) -> TextSample:
    """Build one synthetic document for index ``i``.

    The word count is computed from the same string stored in ``text``,
    so the two fields cannot drift apart (the original built the
    f-string twice).
    """
    text = f"Document number {i} about topic {i % 5}."
    return TextSample(
        text=text,
        language="en" if i % 3 != 0 else "es",  # every third doc is Spanish
        word_count=len(text.split()),
    )

text_samples = [_make_doc(i) for i in range(500)]
# index.write() shards the samples into tar files via the data store and
# registers a tracked entry keyed by a content-addressed CID.
text_entry = index.write(
text_samples,
name="docs-v1",
description="500 synthetic text documents",
tags=["text", "multilingual"],
# maxcount caps how many samples go into each shard tar
maxcount=250,
)
print(f"Text dataset CID : {text_entry.cid[:16]}...")
print(f"Text dataset shards: {len(text_entry.data_urls)}")
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/tmp0rm6ej8c/data-000000.tar 0 0.0 GB 0
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/tmp0rm6ej8c/data-000001.tar 250 0.0 GB 250
# writing /private/var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_index_fvd0vvxj/data/docs-v1/data--078fef77--000000.tar 0 0.0 GB 0
Text dataset CID : bafyreiexgbzh45k...
Text dataset shards: 1
# --- Dataset 2: embeddings ---
# 300 random 128-d float32 vectors, labelled round-robin over 10 classes.
embedding_samples = [
    EmbeddingSample(
        label=f"class_{idx % 10}",
        source="synthetic",
        vector=rng.standard_normal(128).astype(np.float32),
    )
    for idx in range(300)
]
# Register the embeddings the same way: sharded tars plus a tracked,
# content-addressed entry (300 samples, at most 150 per shard).
emb_entry = index.write(
embedding_samples,
name="embeddings-v1",
description="300 synthetic 128-d embeddings",
tags=["embeddings", "synthetic"],
maxcount=150,
)
print(f"Embedding CID : {emb_entry.cid[:16]}...")
print(f"Embedding shards : {len(emb_entry.data_urls)}")
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/tmp9mw37qd0/data-000000.tar 0 0.0 GB 0
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/tmp9mw37qd0/data-000001.tar 150 0.0 GB 150
# writing /private/var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_index_fvd0vvxj/data/embeddings-v1/data--3bd5dbe5--000000.tar 0 0.0 GB 0
Embedding CID : bafyreid45dgeyhf...
Embedding shards : 1
4 — List and discover datasets
print("Datasets in index:")
for entry in index.list_datasets():
print(f" {entry.name:20s} shards={len(entry.data_urls)} cid={entry.cid[:12]}...")Datasets in index:
test-ly shards=1 cid=bafyreiawvxv...
proto-ly shards=1 cid=bafyreidtr6v...
proto-ly shards=1 cid=bafyreihelhd...
proto-ly shards=1 cid=bafyreihly5j...
docs-v1 shards=1 cid=bafyreicek7z...
embeddings-v1 shards=1 cid=bafyreidc3by...
docs-v1 shards=1 cid=bafyreia4iqz...
embeddings-v1 shards=1 cid=bafyreihsz4n...
docs-v1 shards=1 cid=bafyreiejgub...
embeddings-v1 shards=1 cid=bafyreifwtav...
docs-v1 shards=1 cid=bafyreiaor62...
embeddings-v1 shards=1 cid=bafyreidjt2i...
docs-v1 shards=1 cid=bafyreidphej...
embeddings-v1 shards=1 cid=bafyreid5c7s...
docs-v1 shards=1 cid=bafyreia6rt3...
embeddings-v1 shards=1 cid=bafyreifwj4u...
docs-v1 shards=1 cid=bafyreig5yro...
embeddings-v1 shards=1 cid=bafyreiey4ua...
docs-v1 shards=1 cid=bafyreihvtxe...
embeddings-v1 shards=1 cid=bafyreic2r24...
docs-v1 shards=1 cid=bafyreiexgbz...
embeddings-v1 shards=1 cid=bafyreid45dg...
5 — Load a dataset from the index
load_dataset can resolve dataset names through the default index. Because the index stores the schema for each dataset, you don’t need to pass the sample type explicitly—it’s reconstructed automatically.
atdata.set_default_index(index)
# No sample_type argument needed -- schema is resolved from the index
ds = atdata.load_dataset("@local/docs-v1", split="train")
for batch in ds.ordered(batch_size=10):
print(f"Texts : {batch.text[:2]}...")
print(f"Languages : {batch.language[:5]}...")
print(f"Counts : {batch.word_count[:5]}...")
breakTexts : ['Document number 0 about topic 0.', 'Document number 1 about topic 1.']...
Languages : ['es', 'en', 'en', 'es', 'en']...
Counts : [6, 6, 6, 6, 6]...
# Same for embeddings -- type is auto-resolved
emb_ds = atdata.load_dataset("@local/embeddings-v1", split="train")
for batch in emb_ds.ordered(batch_size=8):
print(f"Vector shape : {batch.vector.shape}")
print(f"Labels : {batch.label}")
breakVector shape : (8, 128)
Labels : ['class_0', 'class_1', 'class_2', 'class_3', 'class_4', 'class_5', 'class_6', 'class_7']
6 — Schema tracking
The index records the schema for each dataset, enabling type reconstruction at load time. Schemas are persisted automatically by index.write().
for schema in index.schemas:
print(f" {schema.name:20s} fields={[f.name for f in schema.fields]}") TextSample fields=['text', 'language', 'word_count']
EmbeddingSample fields=['vector', 'label', 'source']
7 — Inspect the repository
The index stores all backends in a uniform repos dict. The "local" repository is always present.
for name, repo in index.repos.items():
has_store = "yes" if repo.data_store else "no"
print(f" {name:10s} provider={type(repo.provider).__name__} data_store={has_store}") local provider=SqliteProvider data_store=yes
8 — Clean up
import shutil
atdata.set_default_index(None)
shutil.rmtree(tmpdir, ignore_errors=True)Key takeaways
| Concept | API |
|---|---|
| Create an index | atdata.Index(path=..., data_store=...) |
| Write with tracking | index.write(samples, name=..., tags=...) |
| List datasets | index.list_datasets() |
| Schema discovery | index.schemas (iterated in step 6) |
| Inspect repositories | index.repos (dict including "local") |
| Set global index | atdata.set_default_index(index) |
| Load by name (auto-typed) | atdata.load_dataset("@local/name", split=...) |