Interactive online version: Binder badge

Combining arrays

Import the LArray library:

[1]:
from larray import *
[2]:
# load the 'demography_eurostat' dataset
demography_eurostat = load_example_data('demography_eurostat')

# load 'gender' and 'time' axes
gender = demography_eurostat.gender
time = demography_eurostat.time
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[2], line 2
      1 # load the 'demography_eurostat' dataset
----> 2 demography_eurostat = load_example_data('demography_eurostat')
      4 # load 'gender' and 'time' axes
      5 gender = demography_eurostat.gender

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/example.py:97, in load_example_data(name)
     95     available_datasets = list(AVAILABLE_EXAMPLE_DATA.keys())
     96     raise ValueError(f"example_data must be chosen from list {available_datasets}")
---> 97 return la.Session(AVAILABLE_EXAMPLE_DATA[name])

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/core/session.py:98, in Session.__init__(self, meta, *args, **kwargs)
     94     elements = {a.name: a for a in args}
     96 if isinstance(elements, (str, Path)):
     97     # assume elements is a filename
---> 98     self.load(elements)
     99     self.update(**kwargs)
    100 else:
    101     # iterable of tuple or dict-like

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/core/session.py:438, in Session.load(self, fname, names, engine, display, **kwargs)
    436 else:
    437     handler = handler_cls(fname)
--> 438 metadata, objects = handler.read(names, display=display, **kwargs)
    439 self._update_from_iterable(objects.items())
    440 self.meta = metadata

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/inout/common.py:139, in FileHandler.read(self, keys, display, ignore_exceptions, *args, **kwargs)
    114 def read(self, keys, *args, display=False, ignore_exceptions=False, **kwargs) -> Tuple[Metadata, dict]:
    115     r"""
    116     Read file content (HDF, Excel, CSV, ...) and returns a dictionary containing loaded objects.
    117
   (...)
    137         Dictionary containing the loaded objects.
    138     """
--> 139     self._open_for_read()
    140     metadata = self._read_metadata()
    141     item_types = self.item_types()

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/inout/hdf.py:138, in PandasHDFHandler._open_for_read(self)
    137 def _open_for_read(self):
--> 138     self.handle = HDFStore(self.fname, mode='r')

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/pandas/io/pytables.py:566, in HDFStore.__init__(self, path, mode, complevel, complib, fletcher32, **kwargs)
    563 if "format" in kwargs:
    564     raise ValueError("format is not a defined argument for HDFStore")
--> 566 tables = import_optional_dependency("tables")
    568 if complib is not None and complib not in tables.filters.all_complibs:
    569     raise ValueError(
    570         f"complib only supports {tables.filters.all_complibs} compression."
    571     )

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/pandas/compat/_optional.py:135, in import_optional_dependency(name, extra, errors, min_version)
    130 msg = (
    131     f"Missing optional dependency '{install_name}'. {extra} "
    132     f"Use pip or conda to install {install_name}."
    133 )
    134 try:
--> 135     module = importlib.import_module(name)
    136 except ImportError:
    137     if errors == "raise":

File ~/.asdf/installs/python/3.11.9/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
    124             break
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1147, in _find_and_load_unlocked(name, import_)

File <frozen importlib._bootstrap>:690, in _load_unlocked(spec)

File <frozen importlib._bootstrap_external>:940, in exec_module(self, module)

File <frozen importlib._bootstrap>:241, in _call_with_frames_removed(f, *args, **kwds)

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/tables/__init__.py:44
     40     raise RuntimeError("Blosc2 library not found. "
     41                        f"I looked for \"{', '.join(blosc2_search_paths)}\"")
     43 # Necessary imports to get versions stored on the cython extension
---> 44 from .utilsextension import get_hdf5_version as _get_hdf5_version
     46 from ._version import __version__
     48 hdf5_version = _get_hdf5_version()

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/tables/utilsextension.pyx:1, in init tables.utilsextension()

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject
[3]:
# load the 'population' array from the 'demography_eurostat' dataset
population = demography_eurostat.population

# show 'population' array
population
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 2
      1 # load the 'population' array from the 'demography_eurostat' dataset
----> 2 population = demography_eurostat.population
      4 # show 'population' array 
      5 population

NameError: name 'demography_eurostat' is not defined
[4]:
# load the 'population_benelux' array from the 'demography_eurostat' dataset
population_benelux = demography_eurostat.population_benelux

# show 'population_benelux' array
population_benelux
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 2
      1 # load the 'population_benelux' array from the 'demography_eurostat' dataset
----> 2 population_benelux = demography_eurostat.population_benelux
      4 # show 'population_benelux' array 
      5 population_benelux

NameError: name 'demography_eurostat' is not defined

The LArray library offers several methods and functions to combine arrays:

  • insert: inserts an array into another array along an axis.

  • append: adds an array at the end of an axis.

  • prepend: adds an array at the beginning of an axis.

  • extend: extends an array along an axis.

  • stack: combines several arrays along a new axis.

Insert

[5]:
other_countries = zeros((Axis('country=Luxembourg,Netherlands'), gender, time), dtype=int)

# insert new countries before 'France'
population_new_countries = population.insert(other_countries, before='France')
population_new_countries
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 1
----> 1 other_countries = zeros((Axis('country=Luxembourg,Netherlands'), gender, time), dtype=int)
      3 # insert new countries before 'France'
      4 population_new_countries = population.insert(other_countries, before='France')

NameError: name 'gender' is not defined
[6]:
# insert new countries after 'France'
population_new_countries = population.insert(other_countries, after='France')
population_new_countries
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 2
      1 # insert new countries after 'France'
----> 2 population_new_countries = population.insert(other_countries, after='France')
      3 population_new_countries

NameError: name 'population' is not defined

See insert for more details and examples.

Append

Append one element to an axis of an array:

[7]:
# append data for 'Luxembourg'
population_new = population.append('country', population_benelux['Luxembourg'], 'Luxembourg')
population_new
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[7], line 2
      1 # append data for 'Luxembourg'
----> 2 population_new = population.append('country', population_benelux['Luxembourg'], 'Luxembourg')
      3 population_new

NameError: name 'population' is not defined

The value being appended can have missing (or even extra) axes as long as common axes are compatible:

[8]:
population_lux = stack({'Male': -1, 'Female': 1}, gender)
population_lux
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 1
----> 1 population_lux = stack({'Male': -1, 'Female': 1}, gender)
      2 population_lux

NameError: name 'gender' is not defined
[9]:
population_new = population.append('country', population_lux, 'Luxembourg')
population_new
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[9], line 1
----> 1 population_new = population.append('country', population_lux, 'Luxembourg')
      2 population_new

NameError: name 'population' is not defined

The value being appended can also have the axis along which we are appending:

[10]:
population_nelux = population_benelux[['Netherlands', 'Luxembourg']]
population_nelux
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[10], line 1
----> 1 population_nelux = population_benelux[['Netherlands', 'Luxembourg']]
      2 population_nelux

NameError: name 'population_benelux' is not defined
[11]:
population_extended = population.append('country', population_nelux)
population_extended
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[11], line 1
----> 1 population_extended = population.append('country', population_nelux)
      2 population_extended

NameError: name 'population' is not defined

See append for more details and examples.

Prepend

Prepend one element to an axis of an array:

[12]:
# prepend data for 'Luxembourg'
population_new = population.prepend('country', population_benelux['Luxembourg'], 'Luxembourg')
population_new
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[12], line 2
      1 # prepend data for 'Luxembourg'
----> 2 population_new = population.prepend('country', population_benelux['Luxembourg'], 'Luxembourg')
      3 population_new

NameError: name 'population' is not defined

See prepend for more details and examples.

Stack

Stack several arrays together to create an entirely new dimension:

[13]:
# imagine you have loaded data for each country in different arrays
# (e.g. loaded from different Excel sheets)
population_be = population['Belgium']
population_fr = population['France']
population_de = population['Germany']

print(population_be)
print(population_fr)
print(population_de)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[13], line 3
      1 # imagine you have loaded data for each country in different arrays 
      2 # (e.g. loaded from different Excel sheets)
----> 3 population_be = population['Belgium']
      4 population_fr = population['France']
      5 population_de = population['Germany']

NameError: name 'population' is not defined
[14]:
# create a new array with an extra axis 'country' by stacking the three arrays population_be/fr/de
population_stacked = stack({'Belgium': population_be, 'France': population_fr, 'Germany': population_de}, 'country')
population_stacked
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[14], line 2
      1 # create a new array with an extra axis 'country' by stacking the three arrays population_be/fr/de
----> 2 population_stacked = stack({'Belgium': population_be, 'France': population_fr, 'Germany': population_de}, 'country')
      3 population_stacked

NameError: name 'population_be' is not defined

See stack for more details and examples.