Interactive online version: Binder badge

Arithmetic Operations

Import the LArray library:

[1]:
from larray import *

Load the population array from the demography_eurostat dataset:

[2]:
# load the 'demography_eurostat' dataset
demography_eurostat = load_example_data('demography_eurostat')

# extract the 'country', 'gender' and 'time' axes
country = demography_eurostat.country
gender = demography_eurostat.gender
time = demography_eurostat.time

# extract the 'population' array
population = demography_eurostat.population

# show the 'population' array
population
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[2], line 2
      1 # load the 'demography_eurostat' dataset
----> 2 demography_eurostat = load_example_data('demography_eurostat')
      4 # extract the 'country', 'gender' and 'time' axes
      5 country = demography_eurostat.country

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/example.py:97, in load_example_data(name)
     95     available_datasets = list(AVAILABLE_EXAMPLE_DATA.keys())
     96     raise ValueError(f"example_data must be chosen from list {available_datasets}")
---> 97 return la.Session(AVAILABLE_EXAMPLE_DATA[name])

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/core/session.py:98, in Session.__init__(self, meta, *args, **kwargs)
     94     elements = {a.name: a for a in args}
     96 if isinstance(elements, (str, Path)):
     97     # assume elements is a filename
---> 98     self.load(elements)
     99     self.update(**kwargs)
    100 else:
    101     # iterable of tuple or dict-like

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/core/session.py:438, in Session.load(self, fname, names, engine, display, **kwargs)
    436 else:
    437     handler = handler_cls(fname)
--> 438 metadata, objects = handler.read(names, display=display, **kwargs)
    439 self._update_from_iterable(objects.items())
    440 self.meta = metadata

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/inout/common.py:139, in FileHandler.read(self, keys, display, ignore_exceptions, *args, **kwargs)
    114 def read(self, keys, *args, display=False, ignore_exceptions=False, **kwargs) -> Tuple[Metadata, dict]:
    115     r"""
    116     Read file content (HDF, Excel, CSV, ...) and returns a dictionary containing loaded objects.
    117
   (...)
    137         Dictionary containing the loaded objects.
    138     """
--> 139     self._open_for_read()
    140     metadata = self._read_metadata()
    141     item_types = self.item_types()

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/inout/hdf.py:138, in PandasHDFHandler._open_for_read(self)
    137 def _open_for_read(self):
--> 138     self.handle = HDFStore(self.fname, mode='r')

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/pandas/io/pytables.py:566, in HDFStore.__init__(self, path, mode, complevel, complib, fletcher32, **kwargs)
    563 if "format" in kwargs:
    564     raise ValueError("format is not a defined argument for HDFStore")
--> 566 tables = import_optional_dependency("tables")
    568 if complib is not None and complib not in tables.filters.all_complibs:
    569     raise ValueError(
    570         f"complib only supports {tables.filters.all_complibs} compression."
    571     )

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/pandas/compat/_optional.py:135, in import_optional_dependency(name, extra, errors, min_version)
    130 msg = (
    131     f"Missing optional dependency '{install_name}'. {extra} "
    132     f"Use pip or conda to install {install_name}."
    133 )
    134 try:
--> 135     module = importlib.import_module(name)
    136 except ImportError:
    137     if errors == "raise":

File ~/.asdf/installs/python/3.11.9/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
    124             break
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1147, in _find_and_load_unlocked(name, import_)

File <frozen importlib._bootstrap>:690, in _load_unlocked(spec)

File <frozen importlib._bootstrap_external>:940, in exec_module(self, module)

File <frozen importlib._bootstrap>:241, in _call_with_frames_removed(f, *args, **kwds)

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/tables/__init__.py:44
     40     raise RuntimeError("Blosc2 library not found. "
     41                        f"I looked for \"{', '.join(blosc2_search_paths)}\"")
     43 # Necessary imports to get versions stored on the cython extension
---> 44 from .utilsextension import get_hdf5_version as _get_hdf5_version
     46 from ._version import __version__
     48 hdf5_version = _get_hdf5_version()

File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/tables/utilsextension.pyx:1, in init tables.utilsextension()

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

Basics

One can do all the usual arithmetic operations on an array; the operation is applied to each element individually.

[3]:
# 'true' division
population_in_millions = population / 1_000_000
population_in_millions
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 2
      1 # 'true' division
----> 2 population_in_millions = population / 1_000_000
      3 population_in_millions

NameError: name 'population' is not defined
[4]:
# 'floor' division
population_in_millions = population // 1_000_000
population_in_millions
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 2
      1 # 'floor' division
----> 2 population_in_millions = population // 1_000_000
      3 population_in_millions

NameError: name 'population' is not defined

Warning: Python has two different division operators:

  • the ‘true’ division (/) always returns a float.

  • the ‘floor’ division (//) returns an integer result (discarding any fractional result).

[5]:
# % means modulo (aka remainder of division)
population % 1_000_000
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 2
      1 # % means modulo (aka remainder of division)
----> 2 population % 1_000_000

NameError: name 'population' is not defined
[6]:
# ** means raising to the power
print(ndtest(4))
ndtest(4) ** 3
a  a0  a1  a2  a3
    0   1   2   3
[6]:
a  a0  a1  a2  a3
    0   1   8  27

More interestingly, binary operators such as those above also work between two arrays.

Let us imagine a rate of population growth which is constant over time but different by gender and country:

[7]:
growth_rate = Array(data=[[1.011, 1.010], [1.013, 1.011], [1.010, 1.009]], axes=[country, gender])
growth_rate
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[7], line 1
----> 1 growth_rate = Array(data=[[1.011, 1.010], [1.013, 1.011], [1.010, 1.009]], axes=[country, gender])
      2 growth_rate

NameError: name 'country' is not defined
[8]:
# we store the population of the year 2017 in a new variable
population_2017 = population[2017]
population_2017
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 2
      1 # we store the population of the year 2017 in a new variable
----> 2 population_2017 = population[2017]
      3 population_2017

NameError: name 'population' is not defined
[9]:
# perform an arithmetic operation between two arrays
population_2018 = population_2017 * growth_rate
population_2018
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[9], line 2
      1 # perform an arithmetic operation between two arrays
----> 2 population_2018 = population_2017 * growth_rate
      3 population_2018

NameError: name 'population_2017' is not defined

Note: Be careful when mixing different data types. You can use the method astype to change the data type of an array.

[10]:
# force the resulting matrix to be an integer matrix
population_2018 = (population_2017 * growth_rate).astype(int)
population_2018
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[10], line 2
      1 # force the resulting matrix to be an integer matrix
----> 2 population_2018 = (population_2017 * growth_rate).astype(int)
      3 population_2018

NameError: name 'population_2017' is not defined

Axis order does not matter much (except for output)

You can do operations between arrays whose axes are in a different order. The axis order of the result is the same as that of the left array.

[11]:
# let's change the order of axes of the 'constant_growth_rate' array
transposed_growth_rate = growth_rate.transpose()

# look at the order of the new 'transposed_growth_rate' array:
# 'gender' is the first axis while 'country' is the second
transposed_growth_rate
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[11], line 2
      1 # let's change the order of axes of the 'constant_growth_rate' array
----> 2 transposed_growth_rate = growth_rate.transpose()
      4 # look at the order of the new 'transposed_growth_rate' array:
      5 # 'gender' is the first axis while 'country' is the second
      6 transposed_growth_rate

NameError: name 'growth_rate' is not defined
[12]:
# look at the order of the 'population_2017' array:
# 'country' is the first axis while 'gender' is the second
population_2017
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[12], line 3
      1 # look at the order of the 'population_2017' array:
      2 # 'country' is the first axis while 'gender' is the second
----> 3 population_2017

NameError: name 'population_2017' is not defined
[13]:
# LArray doesn't care of axes order when performing
# arithmetic operations between arrays
population_2018 = population_2017 * transposed_growth_rate
population_2018
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[13], line 3
      1 # LArray doesn't care of axes order when performing 
      2 # arithmetic operations between arrays
----> 3 population_2018 = population_2017 * transposed_growth_rate
      4 population_2018

NameError: name 'population_2017' is not defined

Axes must be compatible

Arithmetic operations between two arrays only work when they have compatible axes (i.e. same list of labels in the same order).

[14]:
# show 'population_2017'
population_2017
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[14], line 2
      1 # show 'population_2017'
----> 2 population_2017

NameError: name 'population_2017' is not defined

Order of labels matters

[15]:
# let us imagine that the labels of the 'country' axis
# of the 'constant_growth_rate' array are in a different order
# than in the 'population_2017' array
reordered_growth_rate = growth_rate.reindex('country', ['Germany', 'Belgium', 'France'])
reordered_growth_rate
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[15], line 4
      1 # let us imagine that the labels of the 'country' axis 
      2 # of the 'constant_growth_rate' array are in a different order
      3 # than in the 'population_2017' array
----> 4 reordered_growth_rate = growth_rate.reindex('country', ['Germany', 'Belgium', 'France'])
      5 reordered_growth_rate

NameError: name 'growth_rate' is not defined
[16]:
# when doing arithmetic operations,
# the order of labels counts
try:
    population_2018 = population_2017 * reordered_growth_rate
except Exception as e:
    print(type(e).__name__, e)
NameError name 'population_2017' is not defined

No extra or missing labels are permitted

[17]:
# let us imagine that the 'country' axis of
# the 'constant_growth_rate' array has an extra
# label 'Netherlands' compared to the same axis of
# the 'population_2017' array
growth_rate_netherlands = Array([1.012, 1.], population.gender)
growth_rate_extra_country = growth_rate.append('country', growth_rate_netherlands, label='Netherlands')
growth_rate_extra_country
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[17], line 5
      1 # let us imagine that the 'country' axis of 
      2 # the 'constant_growth_rate' array has an extra 
      3 # label 'Netherlands' compared to the same axis of 
      4 # the 'population_2017' array
----> 5 growth_rate_netherlands = Array([1.012, 1.], population.gender)
      6 growth_rate_extra_country = growth_rate.append('country', growth_rate_netherlands, label='Netherlands')
      7 growth_rate_extra_country

NameError: name 'population' is not defined
[18]:
# when doing arithmetic operations,
# no extra or missing labels are permitted
try:
    population_2018 = population_2017 * growth_rate_extra_country
except Exception as e:
    print(type(e).__name__, e)
NameError name 'population_2017' is not defined

Ignoring labels (risky)

Warning: Operations between two arrays only work when they have compatible axes (i.e. same labels), but this behavior can be overridden via the ignore_labels method. In that case only the position on the axis is used and not the labels.

Using this method is done at your own risk and it SHOULD NEVER BE USED IN A MODEL. Use this method only for quick tests or rapid data exploration.

[19]:
# let us imagine that the labels of the 'country' axis
# of the 'constant_growth_rate' array are the
# country codes instead of the country full names
growth_rate_country_codes = growth_rate.set_labels('country', ['BE', 'FR', 'DE'])
growth_rate_country_codes
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[19], line 4
      1 # let us imagine that the labels of the 'country' axis 
      2 # of the 'constant_growth_rate' array are the 
      3 # country codes instead of the country full names
----> 4 growth_rate_country_codes = growth_rate.set_labels('country', ['BE', 'FR', 'DE'])
      5 growth_rate_country_codes

NameError: name 'growth_rate' is not defined
[20]:
# use the .ignore_labels() method on axis 'country'
# to avoid the incompatible axes error (risky)
population_2018 = population_2017 * growth_rate_country_codes.ignore_labels('country')
population_2018
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[20], line 3
      1 # use the .ignore_labels() method on axis 'country'
      2 # to avoid the incompatible axes error (risky)
----> 3 population_2018 = population_2017 * growth_rate_country_codes.ignore_labels('country')
      4 population_2018

NameError: name 'population_2017' is not defined

Extra Or Missing Axes (Broadcasting)

The condition that axes must be compatible only applies to common axes. Performing arithmetic operations between two arrays having the same axes is intuitive. However, arithmetic operations between two arrays can be performed even if the second array has extra and/or missing axes compared to the first one. This mechanism is called broadcasting. It makes it possible to perform many arithmetic operations without using any loop. This is a great advantage since using loops in Python can be highly time-consuming (especially nested loops) and should be avoided as much as possible.

To understand how broadcasting works, let us start with a simple example. We assume we have the population of both men and women cumulated for each country:

[21]:
population_by_country = population_2017['Male'] + population_2017['Female']
population_by_country
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[21], line 1
----> 1 population_by_country = population_2017['Male'] + population_2017['Female']
      2 population_by_country

NameError: name 'population_2017' is not defined

We also assume we have the proportion of each gender in the population and that proportion is supposed to be the same for all countries:

[22]:
gender_proportion = Array([0.49, 0.51], gender)
gender_proportion
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[22], line 1
----> 1 gender_proportion = Array([0.49, 0.51], gender)
      2 gender_proportion

NameError: name 'gender' is not defined

Using the two 1D arrays above, we can naively compute the population by country and gender as follows:

[23]:
# define a new variable with both 'country' and 'gender' axes to store the result
population_by_country_and_gender = zeros([country, gender], dtype=int)

# loop over the 'country' and 'gender' axes
for c in country:
    for g in gender:
        population_by_country_and_gender[c, g] = population_by_country[c] * gender_proportion[g]

# display the result
population_by_country_and_gender
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[23], line 2
      1 # define a new variable with both 'country' and 'gender' axes to store the result
----> 2 population_by_country_and_gender = zeros([country, gender], dtype=int)
      4 # loop over the 'country' and 'gender' axes 
      5 for c in country:

NameError: name 'country' is not defined

Relying on the broadcasting mechanism, the calculation above becomes:

[24]:
# the outer product is done automatically.
# No need to use any loop -> saves a lot of computation time
population_by_country_and_gender = population_by_country * gender_proportion

# display the result
population_by_country_and_gender.astype(int)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[24], line 3
      1 # the outer product is done automatically.
      2 # No need to use any loop -> saves a lot of computation time
----> 3 population_by_country_and_gender = population_by_country * gender_proportion
      5 # display the result
      6 population_by_country_and_gender.astype(int)

NameError: name 'population_by_country' is not defined

In the calculation above, LArray automatically creates a resulting array with axes given by the union of the axes of the two arrays involved in the arithmetic operation.

Let us do the same calculation, but this time with a common time axis added to both arrays:

[25]:
population_by_country_and_year = population['Male'] + population['Female']
population_by_country_and_year
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[25], line 1
----> 1 population_by_country_and_year = population['Male'] + population['Female']
      2 population_by_country_and_year

NameError: name 'population' is not defined
[26]:
gender_proportion_by_year = Array([[0.49, 0.485, 0.495, 0.492, 0.498],
                                   [0.51, 0.515, 0.505, 0.508, 0.502]], [gender, time])
gender_proportion_by_year
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[26], line 2
      1 gender_proportion_by_year = Array([[0.49, 0.485, 0.495, 0.492, 0.498],
----> 2                                    [0.51, 0.515, 0.505, 0.508, 0.502]], [gender, time])
      3 gender_proportion_by_year

NameError: name 'gender' is not defined

Without the broadcasting mechanism, the computation of the population by country, gender and year would have been:

[27]:
# define a new variable to store the result.
# Its axes is the union of the axes of the two arrays
# involved in the arithmetic operation
population_by_country_gender_year = zeros([country, gender, time], dtype=int)

# loop over axes which are not present in both arrays
# involved in the arithmetic operation
for c in country:
    for g in gender:
        # all subsets below have the same 'time' axis
        population_by_country_gender_year[c, g] = population_by_country_and_year[c] * gender_proportion_by_year[g]

population_by_country_gender_year
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[27], line 4
      1 # define a new variable to store the result.
      2 # Its axes is the union of the axes of the two arrays 
      3 # involved in the arithmetic operation
----> 4 population_by_country_gender_year = zeros([country, gender, time], dtype=int)
      6 # loop over axes which are not present in both arrays
      7 # involved in the arithmetic operation
      8 for c in country:

NameError: name 'country' is not defined

Once again, the above calculation can be simplified as:

[28]:
# No need to use any loop -> saves a lot of computation time
population_by_country_gender_year = population_by_country_and_year * gender_proportion_by_year

# display the result
population_by_country_gender_year.astype(int)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[28], line 2
      1 # No need to use any loop -> saves a lot of computation time
----> 2 population_by_country_gender_year = population_by_country_and_year * gender_proportion_by_year
      4 # display the result
      5 population_by_country_gender_year.astype(int)

NameError: name 'population_by_country_and_year' is not defined

Warning: Broadcasting is a powerful mechanism but can be confusing at first. It can lead to unexpected results. In particular, if axes which are supposed to be common are not, you will get a resulting array with extra axes you didn’t want.

For example, imagine that the name of the time axis is time for the first array but period for the second:

[29]:
gender_proportion_by_year = gender_proportion_by_year.rename('time', 'period')
gender_proportion_by_year
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[29], line 1
----> 1 gender_proportion_by_year = gender_proportion_by_year.rename('time', 'period')
      2 gender_proportion_by_year

NameError: name 'gender_proportion_by_year' is not defined
[30]:
population_by_country_and_year
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[30], line 1
----> 1 population_by_country_and_year

NameError: name 'population_by_country_and_year' is not defined
[31]:
# the two arrays below have a "time" axis with two different names: 'time' and 'period'.
# LArray will treat the "time" axis of the two arrays as two different "time" axes
population_by_country_gender_year = population_by_country_and_year * gender_proportion_by_year

# as a consequence, the result of the multiplication of the two arrays is not what we expected
population_by_country_gender_year.astype(int)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[31], line 3
      1 # the two arrays below have a "time" axis with two different names: 'time' and 'period'.
      2 # LArray will treat the "time" axis of the two arrays as two different "time" axes
----> 3 population_by_country_gender_year = population_by_country_and_year * gender_proportion_by_year
      5 # as a consequence, the result of the multiplication of the two arrays is not what we expected
      6 population_by_country_gender_year.astype(int)

NameError: name 'population_by_country_and_year' is not defined

Boolean Operations

Python comparison operators are:

Operator

Meaning

==

equal

!=

not equal

>

greater than

>=

greater than or equal

<

less than

<=

less than or equal

Applying a comparison operator on an array returns a boolean array:

[32]:
# test which values are greater than 10 millions
population > 10e6
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[32], line 2
      1 # test which values are greater than 10 millions
----> 2 population > 10e6

NameError: name 'population' is not defined

Comparison operations can be combined using Python bitwise operators:

Operator

Meaning

&

and

|

or

~

not

[33]:
# test which values are greater than 10 millions and less than 40 millions
(population > 10e6) & (population < 40e6)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[33], line 2
      1 # test which values are greater than 10 millions and less than 40 millions
----> 2 (population > 10e6) & (population < 40e6)

NameError: name 'population' is not defined
[34]:
# test which values are less than 10 millions or greater than 40 millions
(population < 10e6) | (population > 40e6)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[34], line 2
      1 # test which values are less than 10 millions or greater than 40 millions
----> 2 (population < 10e6) | (population > 40e6)

NameError: name 'population' is not defined
[35]:
# test which values are not less than 10 millions
~(population < 10e6)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[35], line 2
      1 # test which values are not less than 10 millions
----> 2 ~(population < 10e6)

NameError: name 'population' is not defined

The returned boolean array can then be used in selections and assignments:

[36]:
population_copy = population.copy()

# set all values greater than 40 millions to 40 millions
population_copy[population_copy > 40e6] = 40e6
population_copy
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[36], line 1
----> 1 population_copy = population.copy()
      3 # set all values greater than 40 millions to 40 millions
      4 population_copy[population_copy > 40e6] = 40e6

NameError: name 'population' is not defined

Boolean operations can be made between arrays:

[37]:
# test where the two arrays have the same values
population == population_copy
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[37], line 2
      1 # test where the two arrays have the same values
----> 2 population == population_copy

NameError: name 'population' is not defined

To test whether all values of two arrays are equal, use the equals method:

[38]:
population.equals(population_copy)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[38], line 1
----> 1 population.equals(population_copy)

NameError: name 'population' is not defined