Interactive online version: Binder badge

Aggregations

Import the LArray library:

[1]:
from larray import *

Load the population array and related axes from the demography_eurostat dataset:

[2]:
# load the 'demography_eurostat' dataset
demography_eurostat = load_example_data('demography_eurostat')

# extract the 'country', 'gender' and 'time' axes
country = demography_eurostat.country
gender = demography_eurostat.gender
time = demography_eurostat.time

# extract the 'population_5_countries' array as 'population'
population = demography_eurostat.population_5_countries

# show the 'population' array
population
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[2], line 2
      1 # load the 'demography_eurostat' dataset
----> 2 demography_eurostat = load_example_data('demography_eurostat')
      4 # extract the 'country', 'gender' and 'time' axes
      5 country = demography_eurostat.country

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/example.py:97, in load_example_data(name)
     95     available_datasets = list(AVAILABLE_EXAMPLE_DATA.keys())
     96     raise ValueError(f"example_data must be chosen from list {available_datasets}")
---> 97 return la.Session(AVAILABLE_EXAMPLE_DATA[name])

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/core/session.py:98, in Session.__init__(self, meta, *args, **kwargs)
     94     elements = {a.name: a for a in args}
     96 if isinstance(elements, (str, Path)):
     97     # assume elements is a filename
---> 98     self.load(elements)
     99     self.update(**kwargs)
    100 else:
    101     # iterable of tuple or dict-like

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/core/session.py:438, in Session.load(self, fname, names, engine, display, **kwargs)
    436 else:
    437     handler = handler_cls(fname)
--> 438 metadata, objects = handler.read(names, display=display, **kwargs)
    439 self._update_from_iterable(objects.items())
    440 self.meta = metadata

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/inout/common.py:139, in FileHandler.read(self, keys, display, ignore_exceptions, *args, **kwargs)
    114 def read(self, keys, *args, display=False, ignore_exceptions=False, **kwargs) -> Tuple[Metadata, dict]:
    115     r"""
    116     Read file content (HDF, Excel, CSV, ...) and returns a dictionary containing loaded objects.
    117
   (...)
    137         Dictionary containing the loaded objects.
    138     """
--> 139     self._open_for_read()
    140     metadata = self._read_metadata()
    141     item_types = self.item_types()

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/inout/hdf.py:138, in PandasHDFHandler._open_for_read(self)
    137 def _open_for_read(self):
--> 138     self.handle = HDFStore(self.fname, mode='r')

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/pandas/io/pytables.py:566, in HDFStore.__init__(self, path, mode, complevel, complib, fletcher32, **kwargs)
    563 if "format" in kwargs:
    564     raise ValueError("format is not a defined argument for HDFStore")
--> 566 tables = import_optional_dependency("tables")
    568 if complib is not None and complib not in tables.filters.all_complibs:
    569     raise ValueError(
    570         f"complib only supports {tables.filters.all_complibs} compression."
    571     )

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/pandas/compat/_optional.py:135, in import_optional_dependency(name, extra, errors, min_version)
    130 msg = (
    131     f"Missing optional dependency '{install_name}'. {extra} "
    132     f"Use pip or conda to install {install_name}."
    133 )
    134 try:
--> 135     module = importlib.import_module(name)
    136 except ImportError:
    137     if errors == "raise":

File ~/.asdf/installs/python/3.11.9/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
    124             break
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1147, in _find_and_load_unlocked(name, import_)

File <frozen importlib._bootstrap>:690, in _load_unlocked(spec)

File <frozen importlib._bootstrap_external>:940, in exec_module(self, module)

File <frozen importlib._bootstrap>:241, in _call_with_frames_removed(f, *args, **kwds)

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/tables/__init__.py:44
     40     raise RuntimeError("Blosc2 library not found. "
     41                        f"I looked for \"{', '.join(blosc2_search_paths)}\"")
     43 # Necessary imports to get versions stored on the cython extension
---> 44 from .utilsextension import get_hdf5_version as _get_hdf5_version
     46 from ._version import __version__
     48 hdf5_version = _get_hdf5_version()

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/tables/utilsextension.pyx:1, in init tables.utilsextension()

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

The LArray library provides many aggregation functions. The list is given in the Aggregation Functions subsection of the API Reference page.

Aggregation operations can be performed on axes or groups. Axes and groups can be mixed.

The main rules are:

  • Axes are separated by commas ,

  • Groups belonging to the same axis are grouped inside parentheses ()

Calculate the sum along an axis:

[3]:
population.sum(gender)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 1
----> 1 population.sum(gender)

NameError: name 'population' is not defined

or several axes (axes are separated by commas ,):

[4]:
population.sum(country, gender)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 1
----> 1 population.sum(country, gender)

NameError: name 'population' is not defined

Calculate the sum along all axes except one by appending _by to the aggregation function:

[5]:
population.sum_by(time)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 1
----> 1 population.sum_by(time)

NameError: name 'population' is not defined

Calculate the sum along groups (the groups belonging to the same axis must be grouped inside parentheses ()):

[6]:
benelux = population.country['Belgium', 'Netherlands', 'Luxembourg'] >> 'benelux'
fr_de = population.country['France', 'Germany'] >> 'FR+DE'

population.sum((benelux, fr_de))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 1
----> 1 benelux = population.country['Belgium', 'Netherlands', 'Luxembourg'] >> 'benelux'
      2 fr_de = population.country['France', 'Germany'] >> 'FR+DE'
      4 population.sum((benelux, fr_de))

NameError: name 'population' is not defined

Mixing axes and groups in aggregations:

[7]:
population.sum(gender, (benelux, fr_de))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[7], line 1
----> 1 population.sum(gender, (benelux, fr_de))

NameError: name 'population' is not defined

Warning: Mixing slices and individual labels inside the [ ] will generate several groups (a tuple of groups) instead of a single group. If you want to create a single group using both slices and individual labels, you need to use the .union() method (see below).

[8]:
# mixing slices and individual labels leads to the creation of several groups (a tuple of groups)
except_2016 = time[:2015, 2017]
except_2016
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 2
      1 # mixing slices and individual labels leads to the creation of several groups (a tuple of groups)
----> 2 except_2016 = time[:2015, 2017]
      3 except_2016

NameError: name 'time' is not defined
[9]:
# leading to potentially unexpected results
population.sum(except_2016)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[9], line 2
      1 # leading to potentially unexpected results
----> 2 population.sum(except_2016)

NameError: name 'population' is not defined
[10]:
# the union() method allows to mix slices and individual labels to create a single group
except_2016 = time[:2015].union(time[2017])
except_2016
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[10], line 2
      1 # the union() method allows to mix slices and individual labels to create a single group
----> 2 except_2016 = time[:2015].union(time[2017])
      3 except_2016

NameError: name 'time' is not defined
[11]:
population.sum(except_2016)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[11], line 1
----> 1 population.sum(except_2016)

NameError: name 'population' is not defined