Interactive online version: Binder badge

Import the LArray library:

[1]:
from larray import *
[2]:
# load 'demography_eurostat' dataset
demography_eurostat = load_example_data('demography_eurostat')

# extract the 'population' array from the dataset
population = demography_eurostat.population
population
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[2], line 2
      1 # load 'demography_eurostat' dataset
----> 2 demography_eurostat = load_example_data('demography_eurostat')
      4 # extract the 'population' array from the dataset 
      5 population = demography_eurostat.population

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/example.py:97, in load_example_data(name)
     95     available_datasets = list(AVAILABLE_EXAMPLE_DATA.keys())
     96     raise ValueError(f"example_data must be chosen from list {available_datasets}")
---> 97 return la.Session(AVAILABLE_EXAMPLE_DATA[name])

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/core/session.py:98, in Session.__init__(self, meta, *args, **kwargs)
     94     elements = {a.name: a for a in args}
     96 if isinstance(elements, (str, Path)):
     97     # assume elements is a filename
---> 98     self.load(elements)
     99     self.update(**kwargs)
    100 else:
    101     # iterable of tuple or dict-like

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/core/session.py:438, in Session.load(self, fname, names, engine, display, **kwargs)
    436 else:
    437     handler = handler_cls(fname)
--> 438 metadata, objects = handler.read(names, display=display, **kwargs)
    439 self._update_from_iterable(objects.items())
    440 self.meta = metadata

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/inout/common.py:139, in FileHandler.read(self, keys, display, ignore_exceptions, *args, **kwargs)
    114 def read(self, keys, *args, display=False, ignore_exceptions=False, **kwargs) -> Tuple[Metadata, dict]:
    115     r"""
    116     Read file content (HDF, Excel, CSV, ...) and returns a dictionary containing loaded objects.
    117
   (...)
    137         Dictionary containing the loaded objects.
    138     """
--> 139     self._open_for_read()
    140     metadata = self._read_metadata()
    141     item_types = self.item_types()

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/inout/hdf.py:138, in PandasHDFHandler._open_for_read(self)
    137 def _open_for_read(self):
--> 138     self.handle = HDFStore(self.fname, mode='r')

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/pandas/io/pytables.py:566, in HDFStore.__init__(self, path, mode, complevel, complib, fletcher32, **kwargs)
    563 if "format" in kwargs:
    564     raise ValueError("format is not a defined argument for HDFStore")
--> 566 tables = import_optional_dependency("tables")
    568 if complib is not None and complib not in tables.filters.all_complibs:
    569     raise ValueError(
    570         f"complib only supports {tables.filters.all_complibs} compression."
    571     )

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/pandas/compat/_optional.py:135, in import_optional_dependency(name, extra, errors, min_version)
    130 msg = (
    131     f"Missing optional dependency '{install_name}'. {extra} "
    132     f"Use pip or conda to install {install_name}."
    133 )
    134 try:
--> 135     module = importlib.import_module(name)
    136 except ImportError:
    137     if errors == "raise":

File ~/.asdf/installs/python/3.11.9/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
    124             break
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1147, in _find_and_load_unlocked(name, import_)

File <frozen importlib._bootstrap>:690, in _load_unlocked(spec)

File <frozen importlib._bootstrap_external>:940, in exec_module(self, module)

File <frozen importlib._bootstrap>:241, in _call_with_frames_removed(f, *args, **kwds)

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/tables/__init__.py:44
     40     raise RuntimeError("Blosc2 library not found. "
     41                        f"I looked for \"{', '.join(blosc2_search_paths)}\"")
     43 # Necessary imports to get versions stored on the cython extension
---> 44 from .utilsextension import get_hdf5_version as _get_hdf5_version
     46 from ._version import __version__
     48 hdf5_version = _get_hdf5_version()

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/tables/utilsextension.pyx:1, in init tables.utilsextension()

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

Inspecting Array objects

Get the array summary: metadata + dimensions + description of axes + dtype + size in memory

[3]:
# Array summary: metadata + dimensions + description of axes
population.info
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 2
      1 # Array summary: metadata + dimensions + description of axes
----> 2 population.info

NameError: name 'population' is not defined

Get axes

[4]:
population.axes
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 1
----> 1 population.axes

NameError: name 'population' is not defined

Get axis names

[5]:
population.axes.names
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 1
----> 1 population.axes.names

NameError: name 'population' is not defined

Get number of dimensions

[6]:
population.ndim
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 1
----> 1 population.ndim

NameError: name 'population' is not defined

Get length of each dimension

[7]:
population.shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[7], line 1
----> 1 population.shape

NameError: name 'population' is not defined

Get total number of elements of the array

[8]:
population.size
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 1
----> 1 population.size

NameError: name 'population' is not defined

Get type of internal data (int, float, …)

[9]:
population.dtype
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[9], line 1
----> 1 population.dtype

NameError: name 'population' is not defined

Get size in memory

[10]:
population.memory_used
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[10], line 1
----> 1 population.memory_used

NameError: name 'population' is not defined

Some Useful Functions

with total

Add totals to one or several axes:

[11]:
population.with_total('gender', label='Total')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[11], line 1
----> 1 population.with_total('gender', label='Total')

NameError: name 'population' is not defined

See with_total for more details and examples.

where

The where function can be used to apply some computation depending on a condition:

[12]:
# where(condition, value if true, value if false)
where(population < population.mean('time'), -population, population)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[12], line 2
      1 # where(condition, value if true, value if false)
----> 2 where(population < population.mean('time'), -population, population)

NameError: name 'population' is not defined

See where for more details and examples.

clip

Restrict all values to a given range (values outside the range are set to the nearest bound):

[13]:
# values below 10 millions are set to 10 millions
population.clip(minval=10**7)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[13], line 2
      1 # values below 10 millions are set to 10 millions
----> 2 population.clip(minval=10**7)

NameError: name 'population' is not defined
[14]:
# values above 40 millions are set to 40 millions
population.clip(maxval=4*10**7)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[14], line 2
      1 # values above 40 millions are set to 40 millions
----> 2 population.clip(maxval=4*10**7)

NameError: name 'population' is not defined
[15]:
# values below 10 millions are set to 10 millions and
# values above 40 millions are set to 40 millions
population.clip(10**7, 4*10**7)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[15], line 3
      1 # values below 10 millions are set to 10 millions and 
      2 # values above 40 millions are set to 40 millions
----> 3 population.clip(10**7, 4*10**7)

NameError: name 'population' is not defined
[16]:
# Using vectors to define the lower and upper bounds
lower_bound = sequence(population.time, initial=5_500_000, inc=50_000)
upper_bound = sequence(population.time, 41_000_000, inc=100_000)

print(lower_bound, '\n')
print(upper_bound, '\n')

population.clip(lower_bound, upper_bound)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[16], line 2
      1 # Using vectors to define the lower and upper bounds
----> 2 lower_bound = sequence(population.time, initial=5_500_000, inc=50_000)
      3 upper_bound = sequence(population.time, 41_000_000, inc=100_000)
      5 print(lower_bound, '\n')

NameError: name 'population' is not defined

See clip for more details and examples.

divnot0

Divide one array by another, replacing the result of any division by 0 with 0:

[17]:
divisor = ones(population.axes, dtype=int)
divisor['Male'] = 0
divisor
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[17], line 1
----> 1 divisor = ones(population.axes, dtype=int)
      2 divisor['Male'] = 0
      3 divisor

NameError: name 'population' is not defined
[18]:
population / divisor
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[18], line 1
----> 1 population / divisor

NameError: name 'population' is not defined
[19]:
# we use astype(int) since the divnot0 method
# returns a float array in this case while
# we want an integer array
population.divnot0(divisor).astype(int)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[19], line 4
      1 # we use astype(int) since the divnot0 method 
      2 # returns a float array in this case while 
      3 # we want an integer array
----> 4 population.divnot0(divisor).astype(int)

NameError: name 'population' is not defined

See divnot0 for more details and examples.

ratio

The ratio (rationot0) method returns an array with all values divided by the sum of values along given axes:

[20]:
population.ratio('gender')

# which is equivalent to
population / population.sum('gender')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[20], line 1
----> 1 population.ratio('gender')
      3 # which is equivalent to
      4 population / population.sum('gender')

NameError: name 'population' is not defined

See ratio and rationot0 for more details and examples.

percents

[21]:
# or, if you want the previous ratios in percents
population.percent('gender')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[21], line 2
      1 # or, if you want the previous ratios in percents
----> 2 population.percent('gender')

NameError: name 'population' is not defined

See percent for more details and examples.

diff

The diff method calculates the n-th order discrete difference along a given axis.

The first order difference is given by out[n+1] = in[n+1] - in[n] along the given axis.

[22]:
# calculates 'diff[year+1] = population[year+1] - population[year]'
population.diff('time')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[22], line 2
      1 # calculates 'diff[year+1] = population[year+1] - population[year]'
----> 2 population.diff('time')

NameError: name 'population' is not defined
[23]:
# calculates 'diff[year+2] = population[year+2] - population[year]'
population.diff('time', d=2)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[23], line 2
      1 # calculates 'diff[year+2] = population[year+2] - population[year]'
----> 2 population.diff('time', d=2)

NameError: name 'population' is not defined
[24]:
# calculates 'diff[year] = population[year+1] - population[year]'
population.diff('time', label='lower')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[24], line 2
      1 # calculates 'diff[year] = population[year+1] - population[year]'
----> 2 population.diff('time', label='lower')

NameError: name 'population' is not defined

See diff for more details and examples.

growth_rate

The growth_rate method calculates the growth along a given axis.

It is roughly equivalent to a.diff(axis, d, label) / a[axis.i[:-d]]:

[25]:
population.growth_rate('time')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[25], line 1
----> 1 population.growth_rate('time')

NameError: name 'population' is not defined

See growth_rate for more details and examples.

shift

The shift method drops the first label of an axis and shifts all subsequent labels:

[26]:
population.shift('time')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[26], line 1
----> 1 population.shift('time')

NameError: name 'population' is not defined
[27]:
# when shift is applied on an (increasing) time axis,
# it effectively brings "past" data into the future
population_shifted = population.shift('time')
stack({'population_shifted_2014': population_shifted[2014], 'population_2013': population[2013]}, 'array')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[27], line 3
      1 # when shift is applied on an (increasing) time axis,
      2 # it effectively brings "past" data into the future
----> 3 population_shifted = population.shift('time')
      4 stack({'population_shifted_2014': population_shifted[2014], 'population_2013': population[2013]}, 'array')

NameError: name 'population' is not defined

See shift for more details and examples.

Other interesting functions

Many more useful functions are described in the API reference, in the sections Aggregation Functions, Miscellaneous and Utility Functions.