sdmxabs.fetch_multi
Fetch multiple datasets from the SDMX API.
"""Fetch multiple datasets from the SDMX API."""

from io import StringIO
from typing import Unpack

import pandas as pd

from sdmxabs.download_cache import CacheError, GetFileKwargs, HttpError
from sdmxabs.fetch import fetch

# --- private function
IndexInformation = tuple[type, str | None]  # (Index type, frequency if PeriodIndex)


def _validate_index_compatibility(
    data: pd.DataFrame, reference_index_info: IndexInformation | None
) -> IndexInformation:
    """Validate that the index of the current DataFrame is compatible with the reference index.

    Args:
        data (pd.DataFrame): The most recently fetched DataFrame.
        reference_index_info (IndexInformation | None): Index information
            established by the first fetched DataFrame, or None if no
            DataFrame has been seen yet.

    Returns:
        IndexInformation: The reference index information (established from
            `data` when the incoming reference was None).

    Raises:
        ValueError: If `data` has a different index type (or, for a
            PeriodIndex, a different frequency) from the reference.

    """
    # establish the index information for the current DataFrame
    if isinstance(data.index, pd.PeriodIndex):
        current_index_info: IndexInformation = (type(data.index), data.index.freqstr)
    else:
        current_index_info = (type(data.index), None)

    # if this is the first DataFrame, set the reference index info
    if reference_index_info is None:
        reference_index_info = current_index_info

    # if this is not the first DataFrame, check for index compatibility
    elif current_index_info != reference_index_info:
        raise ValueError(
            f"Index mismatch: cannot mix {reference_index_info} "
            f"with {current_index_info}. "
            f"All datasets must have the same index type (e.g., all quarterly or all monthly data)."
        )

    return reference_index_info


def _extract(
    wanted: pd.DataFrame,
    parameters: dict[str, str] | None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:  # data / metadata
    """Extract the data and metadata for each row in the dimensions DataFrame.

    Args:
        wanted (pd.DataFrame): DataFrame containing the dimensions to fetch.
            DataFrame cells with NAN values will be ignored.
            The DataFrame must have a populated 'flow_id' column.
        parameters (dict[str, str] | None): Additional query parameters
            (e.g. startPeriod/endPeriod) passed through to the fetch function.
        validate (bool): If True, the function will validate the dimensions and values
            against the ABS SDMX API codelists. Defaults to False.
        **kwargs: Additional keyword arguments passed to the underlying data fetching function.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A DataFrame with the fetched data and
        a DataFrame with the metadata.

    Raises:
        ValueError: if any input data is not as expected, or if incompatible
            index types are detected (e.g., mixing quarterly and monthly data).

    Note: CacheError and HttpError are raised by the fetch function.
        These will be caught and reported to standard output.

    """
    # --- initial setup - empty return results
    return_meta: dict[str, pd.Series] = {}
    return_data: dict[str, pd.Series] = {}
    counter = 0
    reference_index_info: IndexInformation | None = None

    # --- loop over the rows of the wanted DataFrame
    for _index, row in wanted.iterrows():
        # --- get the arguments for the fetch (ignoring NaN values)
        row_dict: dict[str, str] = row.dropna().to_dict()
        flow_id = row_dict.pop("flow_id", "")
        if not flow_id:
            # --- if there is no flow_id, we will skip this row
            print(f"Skipping row with no flow_id: {row_dict}")
            continue

        # --- fetch the data and meta data for each row of the selection table
        try:
            data, meta = fetch(flow_id, dims=row_dict, parameters=parameters, validate=validate, **kwargs)
        except (CacheError, HttpError, ValueError) as e:
            # --- if there is an error, we will skip this row (best-effort semantics)
            print(f"Error fetching {flow_id} with dimensions {row_dict}: {e}")
            continue
        if data.empty or meta.empty:
            # --- this should not happen, but if it does, we will skip this row
            print(f"No data for {flow_id} with dimensions {row_dict}")
            continue

        # --- validate index compatibility - including frequency compatibility for PeriodIndex
        reference_index_info = _validate_index_compatibility(data, reference_index_info)

        # --- manage duplicates: a repeated column name gets a unique numeric suffix
        for col in data.columns:
            # counter advances for every column seen, so suffixes are unique per call
            counter += 1
            save_name = col
            if save_name in return_data:
                save_name += f"_{counter:03d}"
            return_data[save_name] = data[col]
            return_meta[save_name] = meta.loc[col]

    return pd.DataFrame(return_data), pd.DataFrame(return_meta).T


# --- public function
def fetch_multi(
    wanted: pd.DataFrame,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch multiple SDMX datasets based on a DataFrame of desired datasets.

    Args:
        wanted: A DataFrame with rows for each desired data set (of one or more series).
            Each row should contain the necessary identifiers to fetch the dataset.
            The columns will be 'flow_id', plus the ABS dimensions relevant to the flow.
            The 'flow_id' column is mandatory, and the rest are optional.
            Note: the DataFrame index is not used in the fetching process.
        parameters: A dictionary of additional parameters to pass to the fetch function.
        validate: If True, the function will validate dimensions and values against
            the ABS SDMX API codelists. Defaults to False.
        **kwargs: Additional keyword arguments passed to the underlying data fetching function.

    Returns:
        A tuple containing two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains metadata about the fetched datasets.

    Raises:
        ValueError: If the 'flow_id' column is missing from the `wanted` DataFrame.

    Note:
        CacheError and HttpError are raised by the fetch function.
        These will be caught and reported to standard output.

    Note:
        The function validates that all datasets have compatible index types.
        A ValueError will be raised if incompatible index types are detected
        (e.g., mixing quarterly and monthly data).

    """
    # --- debugging output
    verbose = kwargs.get("verbose", False)
    if verbose:
        print(f"fetch_multi(): {wanted=}, {parameters=}, {validate=}, {kwargs=}")

    # --- quick sanity checks
    if wanted.empty:
        print("wanted DataFrame is empty, returning empty DataFrames.")
        return pd.DataFrame(), pd.DataFrame()
    if "flow_id" not in wanted.columns:
        raise ValueError("The 'flow_id' column is required in the 'wanted' DataFrame.")

    # --- do the work
    return _extract(wanted, parameters, validate=validate, **kwargs)


if __name__ == "__main__":

    def module_test() -> None:
        """Run a simple test of the module."""
        wanted_text = """
        flow_id, MEASURE, INDEX, TSEST, REGION, DATA_ITEM, SECTOR, FREQ
        CPI, 3, 10001, 10, 50, -, -, Q
        CPI, 3, 999902, 20, 50, -, -, Q
        CPI, 3, 999903, 20, 50, -, -, Q
        ANA_EXP, DCH, -, 20, AUS, FCE, PHS, Q
        ANA_EXP, PCT_DCH, -, 20, AUS, FCE, PHS, Q
        """
        wanted = pd.read_csv(StringIO(wanted_text), dtype=str, skipinitialspace=True)
        parameters = {"startPeriod": "2020-Q1", "endPeriod": "2020-Q4", "detail": "full"}
        fetched_data, _fetched_meta = fetch_multi(
            wanted,
            parameters=parameters,
            validate=False,
            modality="prefer-url",
        )
        expected = (4, 5)
        if fetched_data.shape == expected:
            print(f"Test passed: {fetched_data.shape=}.")
        else:
            print(f"Test FAILED: data shape {fetched_data.shape} is unexpected {expected=}.")

    module_test()
def fetch_multi(
    wanted: pd.DataFrame,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch several SDMX datasets, one per row of the `wanted` DataFrame.

    Args:
        wanted: One row per desired data set (each yielding one or more series).
            Columns are 'flow_id' (mandatory) plus any ABS dimensions relevant
            to that flow; other columns are optional. The DataFrame index
            plays no part in the fetching process.
        parameters: Extra parameters forwarded to the fetch function.
        validate: When True, dimensions and values are checked against the
            ABS SDMX API codelists. Defaults to False.
        **kwargs: Passed straight through to the underlying data fetcher.

    Returns:
        A (data, metadata) pair of DataFrames: the fetched series, and
        descriptive metadata for those series.

    Raises:
        ValueError: When `wanted` lacks a 'flow_id' column, or when fetched
            datasets have incompatible index types (e.g., quarterly mixed
            with monthly data).

    Note:
        CacheError and HttpError raised by the fetch function are caught
        per-row and reported to standard output rather than propagated.

    """
    # --- debugging output
    if kwargs.get("verbose", False):
        print(f"fetch_multi(): {wanted=}, {parameters=}, {validate=}, {kwargs=}")

    # --- guard clause: nothing requested
    if wanted.empty:
        print("wanted DataFrame is empty, returning empty DataFrames.")
        return pd.DataFrame(), pd.DataFrame()

    # --- guard clause: the one mandatory column must be present
    if "flow_id" not in wanted.columns:
        raise ValueError("The 'flow_id' column is required in the 'wanted' DataFrame.")

    # --- delegate the row-by-row fetching
    return _extract(wanted, parameters, validate=validate, **kwargs)
Fetch multiple SDMX datasets based on a DataFrame of desired datasets.
Args:
    wanted: A DataFrame with rows for each desired data set (of one or more series).
        Each row should contain the necessary identifiers to fetch the dataset.
        The columns will be 'flow_id', plus the ABS dimensions relevant to the flow.
        The 'flow_id' column is mandatory, and the rest are optional.
        Note: the DataFrame index is not used in the fetching process.
    parameters: A dictionary of additional parameters to pass to the fetch function.
    validate: If True, the function will validate dimensions and values against
        the ABS SDMX API codelists. Defaults to False.
    **kwargs: Additional keyword arguments passed to the underlying data fetching function.
Returns: A tuple containing two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains metadata about the fetched datasets.
Raises:
ValueError: If the 'flow_id' column is missing from the wanted DataFrame.
Note: CacheError and HttpError are raised by the fetch function. These will be caught and reported to standard output.
Note: The function validates that all datasets have compatible index types. A ValueError will be raised if incompatible index types are detected (e.g., mixing quarterly and monthly data).