sdmxabs

Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API.

 1"""Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API."""
 2
 3from importlib.metadata import PackageNotFoundError, version
 4
 5from .download_cache import (
 6    CacheError,
 7    GetFileKwargs,
 8    HttpError,
 9    ModalityType,
10)
11from .fetch import fetch
12from .fetch_multi import fetch_multi
13from .fetch_selection import MatchCriteria, MatchItem, MatchType, fetch_selection, make_wanted, match_item
14from .flow_metadata import code_lists, data_dimensions, data_flows
15
16# --- version and author
17try:
18    __version__ = version(__name__)
19except PackageNotFoundError:
20    __version__ = "0.0.0"  # Fallback for development mode
21__author__ = "Bryan Palmer"
22
23# --- establish the package contents
24__all__ = [
25    "CacheError",
26    "GetFileKwargs",
27    "HttpError",
28    "MatchCriteria",
29    "MatchItem",
30    "MatchType",
31    "ModalityType",
32    "__author__",
33    "__version__",
34    "code_lists",
35    "data_dimensions",
36    "data_flows",
37    "fetch",
38    "fetch_multi",
39    "fetch_selection",
40    "make_wanted",
41    "match_item",
42]
class CacheError(Exception):
    """A problem retrieving data from the cache."""

A problem retrieving data from the cache.

class GetFileKwargs(TypedDict):
    """TypedDict for acquire_url function arguments."""

    verbose: NotRequired[bool]
    """If True, print information about the data retrieval process."""
    modality: NotRequired[ModalityType]
    """Kind of retrieval: "prefer-cache", "prefer-url"."""

TypedDict for acquire_url function arguments.

verbose: NotRequired[bool]

If True, print information about the data retrieval process.

modality: NotRequired[Literal['prefer-cache', 'prefer-url']]

Kind of retrieval: "prefer-cache", "prefer-url".

class HttpError(Exception):
    """A problem retrieving data using HTTP."""

A problem retrieving data using HTTP.

# A sequence of match criteria tuples: (pattern, dimension-name, MatchType).
MatchCriteria = collections.abc.Sequence[tuple[str, str, MatchType]]
# A single match criterion: (pattern, dimension-name, MatchType).
MatchItem = tuple[str, str, MatchType]
class MatchType(Enum):
    """Enumeration of the supported match types (exact, partial, or regex)."""

    EXACT = 1
    PARTIAL = 2
    REGEX = 3

Enumeration for match types.

EXACT = <MatchType.EXACT: 1>
PARTIAL = <MatchType.PARTIAL: 2>
REGEX = <MatchType.REGEX: 3>
# Kind of retrieval: prefer the cache or prefer a fresh URL download.
ModalityType = typing.Literal['prefer-cache', 'prefer-url']
__author__ = 'Bryan Palmer'
__version__ = '0.1.0'
@cache
def code_lists(cl_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Retrieve code-list metadata from the ABS SDMX API.

    Args:
        cl_id (str): The ID of the code list to retrieve.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        FlowMetaDict: Maps each code ID to its key=value metadata. Every
            entry carries a "name" key; a "parent" key is present only when
            the code has a parent reference.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Note:
        A CacheError is also raised when the codelist is not found on the
        ABS SDMX API (this package tries the website first, then the cache).

    """
    tree = acquire_xml(f"{URL_STEM}/codelist/ABS/{cl_id}", **kwargs)

    result: FlowMetaDict = {}
    for node in tree.findall(".//str:Code", NAME_SPACES):
        key = node.get("id")
        if key is None:
            # codes without an id cannot be keyed - skip them
            continue
        name_node = node.find("com:Name", NAME_SPACES)
        entry: dict[str, str] = {
            "name": name_node.text if name_node is not None and name_node.text else "(missing)"
        }
        # Record the parent code id, when one is referenced.
        parent_ref = ""
        parent_node = node.find("str:Parent", NAME_SPACES)
        if parent_node is not None and (ref := parent_node.find("Ref", NAME_SPACES)) is not None:
            parent_ref = str(ref.get("id", ""))
        if parent_ref:  # only add when non-empty
            entry["parent"] = parent_ref
        result[key] = entry

    return result

Get the code list metadata from the ABS SDMX API.

Args: cl_id (str): The ID of the code list to retrieve. **kwargs: Additional keyword arguments passed to acquire_url().

Returns: FlowMetaDict: A dictionary containing the codes and their associated key=value pairs. A "name" key should always be present. A "parent" key may also be present.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.

Note: You will get a CacheError if the codelist is not found on the ABS SDMX API. (This package tries the website first, then the cache.)

@cache
def data_dimensions(flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Retrieve the data-dimensions metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve dimensions for.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: Maps each dimension ID to its metadata
            in key=value pairs (always includes "position"; enumeration
            reference attributes are merged in when present).

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/datastructure/ABS/{flow_id}", **kwargs)

    result: FlowMetaDict = {}
    for dimension in tree.findall(".//str:Dimension", NAME_SPACES):
        ident = dimension.get("id")
        position = dimension.get("position")
        if ident is None or position is None:
            # dimensions must have both an id and a position to be usable
            continue
        details = {"position": position}
        local_rep = dimension.find("str:LocalRepresentation", NAME_SPACES)
        if local_rep is not None:
            enumeration = local_rep.find("str:Enumeration/Ref", NAME_SPACES)
            if enumeration is not None:
                # merge in the codelist reference attributes (agencyID, id, etc.)
                details = details | enumeration.attrib
        result[ident] = details
    return result

Get the data dimensions metadata from the ABS SDMX API.

Args: flow_id (str): The ID of the dataflow to retrieve dimensions for. **kwargs: Additional keyword arguments passed to acquire_url().

Returns: dict[str, dict[str, str]]: A dictionary containing the dimensions and their metadata in key=value pairs.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.

@cache
def data_flows(flow_id: str = "all", **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the toplevel metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve. Defaults to "all".
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary containing the dataflow IDs
            and their metadata in key=value pairs.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/dataflow/ABS/{flow_id}", **kwargs)

    d_flows: FlowMetaDict = {}
    for dataflow in tree.findall(".//str:Dataflow", NAME_SPACES):
        attributes: dict[str, str] = dataflow.attrib.copy()
        if "id" not in attributes:
            continue
        df_id = attributes.pop("id")
        name_elem = dataflow.find("com:Name", NAME_SPACES)
        # Treat a missing element OR empty/None text as a missing name
        # (previously a present-but-empty <com:Name> was stored as "None").
        attributes["name"] = (
            name_elem.text if name_elem is not None and name_elem.text else "(missing name)"
        )
        d_flows[df_id] = attributes
    return d_flows

Get the toplevel metadata from the ABS SDMX API.

Args: flow_id (str): The ID of the dataflow to retrieve. Defaults to "all". **kwargs: Additional keyword arguments passed to acquire_url().

Returns: dict[str, dict[str, str]]: A dictionary containing the dataflow IDs and their metadata in key=value pairs.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.

def fetch(
    flow_id: str,
    dims: dict[str, str] | None = None,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the data flow from which to retrieve data items.
        dims (dict[str, str], optional): A dictionary of dimensions to select the
            data items. If None, the ABS fetch request will be for all data items,
            which can be slow.
        parameters (dict[str, str], optional): A dictionary of SDMX parameters to apply
            to the data request. Supported parameters include:
            - 'startPeriod': Start period for data filtering (e.g., '2020-Q1')
            - 'endPeriod': End period for data filtering (e.g., '2023-Q4')
            - 'detail': Level of detail ('full', 'dataonly', 'serieskeysonly', 'nodata')
            If None, no parameters are applied. Unsupported keys are ignored.
        validate (bool): If True, print validation diagnostics for the proposed
            dimensions against the metadata requirements. Defaults to False.
        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

    Returns: a tuple of two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains the metadata.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.
        ValueError: If invalid parameter values are provided.

    Notes:
        If the `dims` argument is not valid you should get a CacheError or HttpError.
        If the `flow_id` is not valid, you should get a ValueError.

    """
    # --- debugging output
    if kwargs.get("verbose", False):
        print(f"fetch(): {flow_id=} {dims=} {parameters=} {validate=} {kwargs=}")

    # --- validate parameters
    valid_detail_values = {"full", "dataonly", "serieskeysonly", "nodata"}
    if parameters:
        detail_value = parameters.get("detail")
        if detail_value and detail_value not in valid_detail_values:
            raise ValueError(f"Invalid detail value '{detail_value}'. Must be one of: {valid_detail_values}")

    # --- prepare to get the XML root from the ABS SDMX API
    # prefer fresh data every time (unless the caller overrides the modality)
    kwargs["modality"] = kwargs.get("modality", "prefer-url")
    key = build_key(
        flow_id,
        dims,
        validate=validate,
    )

    # --- build URL; append only the supported query parameters, in a fixed order
    url = f"{URL_STEM}/data/{flow_id}/{key}"
    if parameters:
        query = "&".join(
            f"{name}={parameters[name]}"
            for name in ("startPeriod", "endPeriod", "detail")
            if name in parameters
        )
        if query:
            url = f"{url}?{query}"

    xml_root = acquire_xml(url, **kwargs)
    return _extract(flow_id, xml_root)

Fetch data from the ABS SDMX API.

Args: flow_id (str): The ID of the data flow from which to retrieve data items. dims (dict[str, str], optional): A dictionary of dimensions to select the data items. If None, the ABS fetch request will be for all data items, which can be slow. parameters (dict[str, str], optional): A dictionary of SDMX parameters to apply to the data request. Supported parameters include: - 'startPeriod': Start period for data filtering (e.g., '2020-Q1') - 'endPeriod': End period for data filtering (e.g., '2023-Q4') - 'detail': Level of detail ('full', 'dataonly', 'serieskeysonly', 'nodata') If None, no parameters are applied. validate (bool): If True, print validation diagnostics for the proposed dimensions against the metadata requirements. Defaults to False. **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

Returns: a tuple of two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains the metadata.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response. ValueError: If invalid parameter values are provided.

Notes: If the dims argument is not valid you should get a CacheError or HttpError. If the flow_id is not valid, you should get a ValueError.

def fetch_multi(
    wanted: pd.DataFrame,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch multiple SDMX datasets based on a DataFrame of desired datasets.

    Args:
        wanted: A DataFrame with one row per desired data set (of one or more
                series). Each row should hold the identifiers needed to fetch
                that dataset. Columns are 'flow_id' plus the ABS dimensions
                relevant to the flow; 'flow_id' is mandatory, the rest are
                optional. The DataFrame index is not used when fetching.
        parameters: A dictionary of additional parameters passed to the fetch function.
        validate: If True, validate dimensions and values against the ABS SDMX
                  API codelists. Defaults to False.
        **kwargs: Additional keyword arguments passed to the underlying data fetching function.

    Returns:
        A tuple of two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains metadata about the fetched datasets.

    Raises:
        ValueError: If the 'flow_id' column is missing from the `wanted` DataFrame.

    Note:
        CacheError and HttpError are raised by the fetch function.
        These will be caught and reported to standard output.

    Note:
        The function validates that all datasets have compatible index types.
        A ValueError will be raised if incompatible index types are detected
        (e.g., mixing quarterly and monthly data).

    """
    # --- debugging output
    if kwargs.get("verbose", False):
        print(f"fetch_multi(): {wanted=}, {parameters=}, {validate=}, {kwargs=}")

    # --- quick sanity checks (empty frame first: nothing to do)
    if wanted.empty:
        print("wanted DataFrame is empty, returning empty DataFrames.")
        return pd.DataFrame(), pd.DataFrame()
    if "flow_id" not in wanted.columns:
        raise ValueError("The 'flow_id' column is required in the 'wanted' DataFrame.")

    # --- do the work
    return _extract(wanted, parameters, validate=validate, **kwargs)

Fetch multiple SDMX datasets based on a DataFrame of desired datasets.

Args: wanted: A DataFrame with rows for each desired data set (of one or more series). Each row should contain the necessary identifiers to fetch the dataset. The columns will be 'flow_id', plus the ABS dimensions relevant to the flow. The 'flow_id' column is mandatory, and the rest are optional. Note: the DataFrame index is not used in the fetching process. parameters: A dictionary of additional parameters to pass to the fetch function. validate: If True, the function will validate dimensions and values against the ABS SDMX API codelists. Defaults to False. **kwargs: Additional keyword arguments passed to the underlying data fetching function.

Returns: A tuple containing two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains metadata about the fetched datasets.

Raises: ValueError: If the 'flow_id' column is missing from the wanted DataFrame.

Note: CacheError and HttpError are raised by the fetch function. These will be caught and reported to standard output.

Note: The function validates that all datasets have compatible index types. A ValueError will be raised if incompatible index types are detected (e.g., mixing quarterly and monthly data).

def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data based on a selection criteria for items.

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        parameters (dict[str, str] | None, optional): Additional parameters for the fetch.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions. Defaults to False.
        **kwargs: Additional keyword arguments for the fetch_multi function.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    if kwargs.get("verbose", False):
        print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}")
    # Build the one-row "wanted" frame from the criteria, then delegate.
    wanted = make_wanted(flow_id, criteria)
    return fetch_multi(wanted, parameters, validate=validate, **kwargs)

Fetch data based on a selection criteria for items.

Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data. parameters (dict[str, str] | None, optional): Additional parameters for the fetch. validate (bool, optional): If True, validate the selection against the flow's required dimensions. Defaults to False. **kwargs: Additional keyword arguments for the fetch_multi function.

Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build a `wanted` Dataframe for use by fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of tuples containing the pattern,
            dimension name, and match-type (exact, partial, or regex).

    Returns:
        pd.DataFrame: A DataFrame containing the selected items, which can be dropped
            into the call of the function fetch_multi().

    Raises:
        ValueError: If the flow_id is not valid or if no items match the criteria.

    Notes:
    -   Builds a one-line DataFrame. That frame may still select multiple data
        series when passed to fetch_multi, and it can be concatenated with
        other DataFrames to build a larger selection.
    -   If two match elements refer to the same dimension, only the `intersection`
        of the matches will be returned.

    """
    flow_dimensions = _validate_flow_and_dimensions(flow_id)
    selection = _process_match_criteria(criteria, flow_id, flow_dimensions)

    # Tag the row with its flow and return a single-row, all-string DataFrame.
    selection["flow_id"] = flow_id
    return pd.DataFrame([selection]).astype(str)

Build a wanted Dataframe for use by fetch_multi() by matching flow metadata.

Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the pattern, dimension name, and match-type (exact, partial, or regex).

Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().

Raises: ValueError: If the flow_id is not valid or if no items match the criteria.

Notes:

  • Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
  • If two match elements refer to the same dimension, only the intersection of the matches will be returned.
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new MatchItem for use in select_items() and fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: A tuple representing the match element.

    """
    return (pattern, dimension, match_type)

Create a new MatchItem for use in select_items() and fetch_selection().

Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.

Returns: MatchItem: A tuple representing the match element.