Index-Managed Datasets

Use Index with LocalDiskStore for managed dataset storage and discovery

Writing raw tar files is fine for quick experiments, but as your dataset collection grows you need discovery, versioning, and metadata. The Index class provides this layer—backed by SQLite by default—and pairs with LocalDiskStore for persistent file storage.

This example creates an index, writes two datasets into it, lists and retrieves them, and shows how load_dataset resolves from the index.

1 — Define sample types

import numpy as np
from numpy.typing import NDArray
import atdata


@atdata.packable
class TextSample:
    """Simple text document."""

    text: str
    language: str
    word_count: int


@atdata.packable
class EmbeddingSample:
    """Dense embedding with label."""

    vector: NDArray
    label: str
    source: str

2 — Create an Index with local disk storage

By default, Index() uses an in-memory SQLite database. For this example we pass an explicit path so the database persists, plus a LocalDiskStore rooted in a temporary directory.
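
If you only need a throwaway index, the no-argument default is enough; a minimal sketch (nothing survives once the process exits):

scratch = atdata.Index()  # in-memory SQLite database, per the default above

The persistent setup looks like this: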

import tempfile
from pathlib import Path

tmpdir = Path(tempfile.mkdtemp(prefix="atdata_index_"))

index = atdata.Index(
    path=tmpdir / "index.db",
    data_store=atdata.LocalDiskStore(root=tmpdir / "data"),
)

print(f"Index DB   : {tmpdir / 'index.db'}")
print(f"Data root  : {tmpdir / 'data'}")
Index DB   : /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_index_fvd0vvxj/index.db
Data root  : /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_index_fvd0vvxj/data
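
Persistence is the point of passing a path: constructing another Index against the same file later should pick up the same database. A sketch reusing the exact constructor arguments from above:

# Reopening the same index.db later resumes the same tracked state
reopened = atdata.Index(
    path=tmpdir / "index.db",
    data_store=atdata.LocalDiskStore(root=tmpdir / "data"),
)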

3 — Write datasets through the index

index.write() serializes samples to sharded tars via the data store and creates a tracked entry with a content-addressed CID.

rng = np.random.default_rng(99)

# --- Dataset 1: text documents ---
text_samples = [
    TextSample(
        text=f"Document number {i} about topic {i % 5}.",
        language="en" if i % 3 != 0 else "es",
        word_count=len(f"Document number {i} about topic {i % 5}.".split()),
    )
    for i in range(500)
]

text_entry = index.write(
    text_samples,
    name="docs-v1",
    description="500 synthetic text documents",
    tags=["text", "multilingual"],
    maxcount=250,
)

print(f"Text dataset CID  : {text_entry.cid[:16]}...")
print(f"Text dataset shards: {len(text_entry.data_urls)}")
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/tmp0rm6ej8c/data-000000.tar 0 0.0 GB 0
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/tmp0rm6ej8c/data-000001.tar 250 0.0 GB 250
# writing /private/var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_index_fvd0vvxj/data/docs-v1/data--078fef77--000000.tar 0 0.0 GB 0
Text dataset CID  : bafyreiexgbzh45k...
Text dataset shards: 1

# --- Dataset 2: embeddings ---
embedding_samples = [
    EmbeddingSample(
        vector=rng.standard_normal(128).astype(np.float32),
        label=f"class_{i % 10}",
        source="synthetic",
    )
    for i in range(300)
]

emb_entry = index.write(
    embedding_samples,
    name="embeddings-v1",
    description="300 synthetic 128-d embeddings",
    tags=["embeddings", "synthetic"],
    maxcount=150,
)

print(f"Embedding CID     : {emb_entry.cid[:16]}...")
print(f"Embedding shards  : {len(emb_entry.data_urls)}")
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/tmp9mw37qd0/data-000000.tar 0 0.0 GB 0
# writing /var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/tmp9mw37qd0/data-000001.tar 150 0.0 GB 150
# writing /private/var/folders/hx/9l078dds5z945qcv8j1hsnr00000gn/T/atdata_index_fvd0vvxj/data/embeddings-v1/data--3bd5dbe5--000000.tar 0 0.0 GB 0
Embedding CID     : bafyreid45dgeyhf...
Embedding shards  : 1
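
Both write() calls return plain entry records; the cid and data_urls attributes used above are enough to locate every shard a dataset was written to. A small sketch using only those attributes:

for entry in (text_entry, emb_entry):
    print(f"{entry.cid[:12]}...")
    for url in entry.data_urls:
        print(f"  {url}")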

4 — List and discover datasets

print("Datasets in index:")
for entry in index.list_datasets():
    print(f"  {entry.name:20s}  shards={len(entry.data_urls)}  cid={entry.cid[:12]}...")
Datasets in index:
  test-ly               shards=1  cid=bafyreiawvxv...
  proto-ly              shards=1  cid=bafyreidtr6v...
  proto-ly              shards=1  cid=bafyreihelhd...
  proto-ly              shards=1  cid=bafyreihly5j...
  docs-v1               shards=1  cid=bafyreicek7z...
  embeddings-v1         shards=1  cid=bafyreidc3by...
  docs-v1               shards=1  cid=bafyreia4iqz...
  embeddings-v1         shards=1  cid=bafyreihsz4n...
  docs-v1               shards=1  cid=bafyreiejgub...
  embeddings-v1         shards=1  cid=bafyreifwtav...
  docs-v1               shards=1  cid=bafyreiaor62...
  embeddings-v1         shards=1  cid=bafyreidjt2i...
  docs-v1               shards=1  cid=bafyreidphej...
  embeddings-v1         shards=1  cid=bafyreid5c7s...
  docs-v1               shards=1  cid=bafyreia6rt3...
  embeddings-v1         shards=1  cid=bafyreifwj4u...
  docs-v1               shards=1  cid=bafyreig5yro...
  embeddings-v1         shards=1  cid=bafyreiey4ua...
  docs-v1               shards=1  cid=bafyreihvtxe...
  embeddings-v1         shards=1  cid=bafyreic2r24...
  docs-v1               shards=1  cid=bafyreiexgbz...
  embeddings-v1         shards=1  cid=bafyreid45dg...
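
Every index.write() call records a fresh entry, so a name can repeat with different CIDs (the listing above also includes entries left over from earlier runs in this environment). Since list_datasets() returns plain entry objects, filtering is ordinary Python; a sketch using only the name attribute shown above:

docs_versions = [e for e in index.list_datasets() if e.name == "docs-v1"]
print(f"docs-v1 entries: {len(docs_versions)}")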

5 — Load a dataset from the index

load_dataset can resolve dataset names through the default index. Because the index stores the schema for each dataset, you don’t need to pass the sample type explicitly—it’s reconstructed automatically.

atdata.set_default_index(index)

# No sample_type argument needed -- schema is resolved from the index
ds = atdata.load_dataset("@local/docs-v1", split="train")
for batch in ds.ordered(batch_size=10):
    print(f"Texts     : {batch.text[:2]}...")
    print(f"Languages : {batch.language[:5]}...")
    print(f"Counts    : {batch.word_count[:5]}...")
    break
Texts     : ['Document number 0 about topic 0.', 'Document number 1 about topic 1.']...
Languages : ['es', 'en', 'en', 'es', 'en']...
Counts    : [6, 6, 6, 6, 6]...

# Same for embeddings -- type is auto-resolved
emb_ds = atdata.load_dataset("@local/embeddings-v1", split="train")
for batch in emb_ds.ordered(batch_size=8):
    print(f"Vector shape : {batch.vector.shape}")
    print(f"Labels       : {batch.label}")
    break
Vector shape : (8, 128)
Labels       : ['class_0', 'class_1', 'class_2', 'class_3', 'class_4', 'class_5', 'class_6', 'class_7']

6 — Schema tracking

The index records the schema for each dataset, enabling type reconstruction at load time. Schemas are persisted automatically by index.write().

for schema in index.schemas:
    print(f"  {schema.name:20s}  fields={[f.name for f in schema.fields]}")
  TextSample            fields=['text', 'language', 'word_count']
  EmbeddingSample       fields=['vector', 'label', 'source']
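
Since each schema exposes its name and fields, building lookups on top of them is plain Python; a sketch using only the attributes printed above:

field_map = {s.name: [f.name for f in s.fields] for s in index.schemas}
print(field_map["TextSample"])  # ['text', 'language', 'word_count']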

7 — Inspect the repository

The index stores all backends in a uniform repos dict. The "local" repository is always present.

for name, repo in index.repos.items():
    has_store = "yes" if repo.data_store else "no"
    print(f"  {name:10s}  provider={type(repo.provider).__name__}  data_store={has_store}")
  local       provider=SqliteProvider  data_store=yes
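
Individual repositories can be pulled out of the dict directly. A sketch, assuming the data_store attached to "local" is the LocalDiskStore passed at construction:

local_repo = index.repos["local"]
print(type(local_repo.data_store).__name__)  # LocalDiskStore, per the assumption above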

8 — Clean up

import shutil

atdata.set_default_index(None)
shutil.rmtree(tmpdir, ignore_errors=True)

Key takeaways

Concept                       API
Create an index               atdata.Index(path=..., data_store=...)
Write with tracking           index.write(samples, name=..., tags=...)
List datasets                 index.list_datasets()
Schema discovery              index.schemas
Inspect repositories          index.repos (dict including "local")
Set global index              atdata.set_default_index(index)
Load by name (auto-typed)     atdata.load_dataset("@local/name", split=...)