Arithmetic Operations
Import the LArray library:
[1]:
from larray import *
Load the `population` array from the `demography_eurostat` dataset:
[2]:
# load the 'demography_eurostat' dataset
demography_eurostat = load_example_data('demography_eurostat')
# extract the 'country', 'gender' and 'time' axes
country = demography_eurostat.country
gender = demography_eurostat.gender
time = demography_eurostat.time
# extract the 'population' array
population = demography_eurostat.population
# show the 'population' array
population
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[2], line 2
1 # load the 'demography_eurostat' dataset
----> 2 demography_eurostat = load_example_data('demography_eurostat')
4 # extract the 'country', 'gender' and 'time' axes
5 country = demography_eurostat.country
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/example.py:97, in load_example_data(name)
95 available_datasets = list(AVAILABLE_EXAMPLE_DATA.keys())
96 raise ValueError(f"example_data must be chosen from list {available_datasets}")
---> 97 return la.Session(AVAILABLE_EXAMPLE_DATA[name])
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/core/session.py:98, in Session.__init__(self, meta, *args, **kwargs)
94 elements = {a.name: a for a in args}
96 if isinstance(elements, (str, Path)):
97 # assume elements is a filename
---> 98 self.load(elements)
99 self.update(**kwargs)
100 else:
101 # iterable of tuple or dict-like
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/core/session.py:438, in Session.load(self, fname, names, engine, display, **kwargs)
436 else:
437 handler = handler_cls(fname)
--> 438 metadata, objects = handler.read(names, display=display, **kwargs)
439 self._update_from_iterable(objects.items())
440 self.meta = metadata
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/inout/common.py:139, in FileHandler.read(self, keys, display, ignore_exceptions, *args, **kwargs)
114 def read(self, keys, *args, display=False, ignore_exceptions=False, **kwargs) -> Tuple[Metadata, dict]:
115 r"""
116 Read file content (HDF, Excel, CSV, ...) and returns a dictionary containing loaded objects.
117
(...)
137 Dictionary containing the loaded objects.
138 """
--> 139 self._open_for_read()
140 metadata = self._read_metadata()
141 item_types = self.item_types()
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/inout/hdf.py:138, in PandasHDFHandler._open_for_read(self)
137 def _open_for_read(self):
--> 138 self.handle = HDFStore(self.fname, mode='r')
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/pandas/io/pytables.py:566, in HDFStore.__init__(self, path, mode, complevel, complib, fletcher32, **kwargs)
563 if "format" in kwargs:
564 raise ValueError("format is not a defined argument for HDFStore")
--> 566 tables = import_optional_dependency("tables")
568 if complib is not None and complib not in tables.filters.all_complibs:
569 raise ValueError(
570 f"complib only supports {tables.filters.all_complibs} compression."
571 )
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/pandas/compat/_optional.py:135, in import_optional_dependency(name, extra, errors, min_version)
130 msg = (
131 f"Missing optional dependency '{install_name}'. {extra} "
132 f"Use pip or conda to install {install_name}."
133 )
134 try:
--> 135 module = importlib.import_module(name)
136 except ImportError:
137 if errors == "raise":
File ~/.asdf/installs/python/3.11.9/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
124 break
125 level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)
File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)
File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)
File <frozen importlib._bootstrap>:1147, in _find_and_load_unlocked(name, import_)
File <frozen importlib._bootstrap>:690, in _load_unlocked(spec)
File <frozen importlib._bootstrap_external>:940, in exec_module(self, module)
File <frozen importlib._bootstrap>:241, in _call_with_frames_removed(f, *args, **kwds)
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/tables/__init__.py:44
40 raise RuntimeError("Blosc2 library not found. "
41 f"I looked for \"{', '.join(blosc2_search_paths)}\"")
43 # Necessary imports to get versions stored on the cython extension
---> 44 from .utilsextension import get_hdf5_version as _get_hdf5_version
46 from ._version import __version__
48 hdf5_version = _get_hdf5_version()
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/tables/utilsextension.pyx:1, in init tables.utilsextension()
ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject
Basics
One can do all the usual arithmetic operations on an array; the operation is applied to each element individually:
[3]:
# 'true' division
population_in_millions = population / 1_000_000
population_in_millions
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[3], line 2
1 # 'true' division
----> 2 population_in_millions = population / 1_000_000
3 population_in_millions
NameError: name 'population' is not defined
[4]:
# 'floor' division
population_in_millions = population // 1_000_000
population_in_millions
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[4], line 2
1 # 'floor' division
----> 2 population_in_millions = population // 1_000_000
3 population_in_millions
NameError: name 'population' is not defined
Warning: Python has two different division operators:
the ‘true’ division (`/`) always returns a float.
the ‘floor’ division (`//`) rounds the quotient down to the nearest whole number (the result is an integer when both operands are integers).
[5]:
# % means modulo (aka remainder of division)
population % 1_000_000
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[5], line 2
1 # % means modulo (aka remainder of division)
----> 2 population % 1_000_000
NameError: name 'population' is not defined
[6]:
# ** means raising to the power
print(ndtest(4))
ndtest(4) ** 3
a a0 a1 a2 a3
0 1 2 3
[6]:
a a0 a1 a2 a3
0 1 8 27
More interestingly, binary operators such as those above also work between two arrays.
Let us imagine a rate of population growth which is constant over time but different by gender and country:
[7]:
growth_rate = Array(data=[[1.011, 1.010], [1.013, 1.011], [1.010, 1.009]], axes=[country, gender])
growth_rate
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[7], line 1
----> 1 growth_rate = Array(data=[[1.011, 1.010], [1.013, 1.011], [1.010, 1.009]], axes=[country, gender])
2 growth_rate
NameError: name 'country' is not defined
[8]:
# we store the population of the year 2017 in a new variable
population_2017 = population[2017]
population_2017
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[8], line 2
1 # we store the population of the year 2017 in a new variable
----> 2 population_2017 = population[2017]
3 population_2017
NameError: name 'population' is not defined
[9]:
# perform an arithmetic operation between two arrays
population_2018 = population_2017 * growth_rate
population_2018
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[9], line 2
1 # perform an arithmetic operation between two arrays
----> 2 population_2018 = population_2017 * growth_rate
3 population_2018
NameError: name 'population_2017' is not defined
Note: Be careful when mixing different data types. You can use the method astype to change the data type of an array.
[10]:
# force the resulting matrix to be an integer matrix
population_2018 = (population_2017 * growth_rate).astype(int)
population_2018
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[10], line 2
1 # force the resulting matrix to be an integer matrix
----> 2 population_2018 = (population_2017 * growth_rate).astype(int)
3 population_2018
NameError: name 'population_2017' is not defined
Axis order does not matter much (except for output)
You can do operations between arrays whose axes are in a different order. The axis order of the result is that of the left array.
[11]:
# let's change the order of axes of the 'constant_growth_rate' array
transposed_growth_rate = growth_rate.transpose()
# look at the order of the new 'transposed_growth_rate' array:
# 'gender' is the first axis while 'country' is the second
transposed_growth_rate
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[11], line 2
1 # let's change the order of axes of the 'constant_growth_rate' array
----> 2 transposed_growth_rate = growth_rate.transpose()
4 # look at the order of the new 'transposed_growth_rate' array:
5 # 'gender' is the first axis while 'country' is the second
6 transposed_growth_rate
NameError: name 'growth_rate' is not defined
[12]:
# look at the order of the 'population_2017' array:
# 'country' is the first axis while 'gender' is the second
population_2017
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[12], line 3
1 # look at the order of the 'population_2017' array:
2 # 'country' is the first axis while 'gender' is the second
----> 3 population_2017
NameError: name 'population_2017' is not defined
[13]:
# LArray doesn't care of axes order when performing
# arithmetic operations between arrays
population_2018 = population_2017 * transposed_growth_rate
population_2018
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[13], line 3
1 # LArray doesn't care of axes order when performing
2 # arithmetic operations between arrays
----> 3 population_2018 = population_2017 * transposed_growth_rate
4 population_2018
NameError: name 'population_2017' is not defined
Axes must be compatible
Arithmetic operations between two arrays only work when they have compatible axes (i.e. the same list of labels in the same order).
[14]:
# show 'population_2017'
population_2017
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[14], line 2
1 # show 'population_2017'
----> 2 population_2017
NameError: name 'population_2017' is not defined
Order of labels matters
[15]:
# let us imagine that the labels of the 'country' axis
# of the 'constant_growth_rate' array are in a different order
# than in the 'population_2017' array
reordered_growth_rate = growth_rate.reindex('country', ['Germany', 'Belgium', 'France'])
reordered_growth_rate
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[15], line 4
1 # let us imagine that the labels of the 'country' axis
2 # of the 'constant_growth_rate' array are in a different order
3 # than in the 'population_2017' array
----> 4 reordered_growth_rate = growth_rate.reindex('country', ['Germany', 'Belgium', 'France'])
5 reordered_growth_rate
NameError: name 'growth_rate' is not defined
[16]:
# when doing arithmetic operations,
# the order of labels counts
try:
population_2018 = population_2017 * reordered_growth_rate
except Exception as e:
print(type(e).__name__, e)
NameError name 'population_2017' is not defined
No extra or missing labels are permitted
[17]:
# let us imagine that the 'country' axis of
# the 'constant_growth_rate' array has an extra
# label 'Netherlands' compared to the same axis of
# the 'population_2017' array
growth_rate_netherlands = Array([1.012, 1.], population.gender)
growth_rate_extra_country = growth_rate.append('country', growth_rate_netherlands, label='Netherlands')
growth_rate_extra_country
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[17], line 5
1 # let us imagine that the 'country' axis of
2 # the 'constant_growth_rate' array has an extra
3 # label 'Netherlands' compared to the same axis of
4 # the 'population_2017' array
----> 5 growth_rate_netherlands = Array([1.012, 1.], population.gender)
6 growth_rate_extra_country = growth_rate.append('country', growth_rate_netherlands, label='Netherlands')
7 growth_rate_extra_country
NameError: name 'population' is not defined
[18]:
# when doing arithmetic operations,
# no extra or missing labels are permitted
try:
population_2018 = population_2017 * growth_rate_extra_country
except Exception as e:
print(type(e).__name__, e)
NameError name 'population_2017' is not defined
Ignoring labels (risky)
Warning: Operations between two arrays only work when they have compatible axes (i.e. the same labels), but this behavior can be overridden via the `ignore_labels` method. In that case only the position on the axis is used and not the labels.
Using this method is done at your own risk and it SHOULD NEVER BE USED IN A MODEL. Use this method only for quick tests or rapid data exploration.
[19]:
# let us imagine that the labels of the 'country' axis
# of the 'constant_growth_rate' array are the
# country codes instead of the country full names
growth_rate_country_codes = growth_rate.set_labels('country', ['BE', 'FR', 'DE'])
growth_rate_country_codes
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[19], line 4
1 # let us imagine that the labels of the 'country' axis
2 # of the 'constant_growth_rate' array are the
3 # country codes instead of the country full names
----> 4 growth_rate_country_codes = growth_rate.set_labels('country', ['BE', 'FR', 'DE'])
5 growth_rate_country_codes
NameError: name 'growth_rate' is not defined
[20]:
# use the .ignore_labels() method on axis 'country'
# to avoid the incompatible axes error (risky)
population_2018 = population_2017 * growth_rate_country_codes.ignore_labels('country')
population_2018
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[20], line 3
1 # use the .ignore_labels() method on axis 'country'
2 # to avoid the incompatible axes error (risky)
----> 3 population_2018 = population_2017 * growth_rate_country_codes.ignore_labels('country')
4 population_2018
NameError: name 'population_2017' is not defined
Extra Or Missing Axes (Broadcasting)
The condition that axes must be compatible only applies to common axes. Performing arithmetic operations between two arrays having the same axes is intuitive. However, arithmetic operations between two arrays can be performed even if the second array has extra and/or missing axes compared to the first one. This mechanism is called broadcasting. It makes it possible to perform many arithmetic operations without using any loop. This is a great advantage since loops in Python can be very time-consuming (especially nested loops) and should be avoided as much as possible.
To understand how broadcasting works, let us start with a simple example. We assume we have the population of both men and women cumulated for each country:
[21]:
population_by_country = population_2017['Male'] + population_2017['Female']
population_by_country
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[21], line 1
----> 1 population_by_country = population_2017['Male'] + population_2017['Female']
2 population_by_country
NameError: name 'population_2017' is not defined
We also assume we have the proportion of each gender in the population and that proportion is supposed to be the same for all countries:
[22]:
gender_proportion = Array([0.49, 0.51], gender)
gender_proportion
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[22], line 1
----> 1 gender_proportion = Array([0.49, 0.51], gender)
2 gender_proportion
NameError: name 'gender' is not defined
Using the two 1D arrays above, we can naively compute the population by country and gender as follows:
[23]:
# define a new variable with both 'country' and 'gender' axes to store the result
population_by_country_and_gender = zeros([country, gender], dtype=int)
# loop over the 'country' and 'gender' axes
for c in country:
for g in gender:
population_by_country_and_gender[c, g] = population_by_country[c] * gender_proportion[g]
# display the result
population_by_country_and_gender
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[23], line 2
1 # define a new variable with both 'country' and 'gender' axes to store the result
----> 2 population_by_country_and_gender = zeros([country, gender], dtype=int)
4 # loop over the 'country' and 'gender' axes
5 for c in country:
NameError: name 'country' is not defined
Relying on the broadcasting mechanism, the calculation above becomes:
[24]:
# the outer product is done automatically.
# No need to use any loop -> saves a lot of computation time
population_by_country_and_gender = population_by_country * gender_proportion
# display the result
population_by_country_and_gender.astype(int)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[24], line 3
1 # the outer product is done automatically.
2 # No need to use any loop -> saves a lot of computation time
----> 3 population_by_country_and_gender = population_by_country * gender_proportion
5 # display the result
6 population_by_country_and_gender.astype(int)
NameError: name 'population_by_country' is not defined
In the calculation above, LArray automatically creates a resulting array whose axes are the union of the axes of the two arrays involved in the arithmetic operation.
Let us do the same calculation, but this time with a common `time` axis:
[25]:
population_by_country_and_year = population['Male'] + population['Female']
population_by_country_and_year
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[25], line 1
----> 1 population_by_country_and_year = population['Male'] + population['Female']
2 population_by_country_and_year
NameError: name 'population' is not defined
[26]:
gender_proportion_by_year = Array([[0.49, 0.485, 0.495, 0.492, 0.498],
[0.51, 0.515, 0.505, 0.508, 0.502]], [gender, time])
gender_proportion_by_year
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[26], line 2
1 gender_proportion_by_year = Array([[0.49, 0.485, 0.495, 0.492, 0.498],
----> 2 [0.51, 0.515, 0.505, 0.508, 0.502]], [gender, time])
3 gender_proportion_by_year
NameError: name 'gender' is not defined
Without the broadcasting mechanism, the computation of the population by country, gender and year would have been:
[27]:
# define a new variable to store the result.
# Its axes is the union of the axes of the two arrays
# involved in the arithmetic operation
population_by_country_gender_year = zeros([country, gender, time], dtype=int)
# loop over axes which are not present in both arrays
# involved in the arithmetic operation
for c in country:
for g in gender:
# all subsets below have the same 'time' axis
population_by_country_gender_year[c, g] = population_by_country_and_year[c] * gender_proportion_by_year[g]
population_by_country_gender_year
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[27], line 4
1 # define a new variable to store the result.
2 # Its axes is the union of the axes of the two arrays
3 # involved in the arithmetic operation
----> 4 population_by_country_gender_year = zeros([country, gender, time], dtype=int)
6 # loop over axes which are not present in both arrays
7 # involved in the arithmetic operation
8 for c in country:
NameError: name 'country' is not defined
Once again, the above calculation can be simplified as:
[28]:
# No need to use any loop -> saves a lot of computation time
population_by_country_gender_year = population_by_country_and_year * gender_proportion_by_year
# display the result
population_by_country_gender_year.astype(int)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[28], line 2
1 # No need to use any loop -> saves a lot of computation time
----> 2 population_by_country_gender_year = population_by_country_and_year * gender_proportion_by_year
4 # display the result
5 population_by_country_gender_year.astype(int)
NameError: name 'population_by_country_and_year' is not defined
Warning: Broadcasting is a powerful mechanism but can be confusing at first. It can lead to unexpected results. In particular, if axes which are supposed to be common are not, you will get a resulting array with extra axes you didn’t want.
For example, imagine that the name of the time axis is `time` for the first array but `period` for the second:
[29]:
gender_proportion_by_year = gender_proportion_by_year.rename('time', 'period')
gender_proportion_by_year
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[29], line 1
----> 1 gender_proportion_by_year = gender_proportion_by_year.rename('time', 'period')
2 gender_proportion_by_year
NameError: name 'gender_proportion_by_year' is not defined
[30]:
population_by_country_and_year
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[30], line 1
----> 1 population_by_country_and_year
NameError: name 'population_by_country_and_year' is not defined
[31]:
# the two arrays below have a "time" axis with two different names: 'time' and 'period'.
# LArray will treat the "time" axis of the two arrays as two different "time" axes
population_by_country_gender_year = population_by_country_and_year * gender_proportion_by_year
# as a consequence, the result of the multiplication of the two arrays is not what we expected
population_by_country_gender_year.astype(int)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[31], line 3
1 # the two arrays below have a "time" axis with two different names: 'time' and 'period'.
2 # LArray will treat the "time" axis of the two arrays as two different "time" axes
----> 3 population_by_country_gender_year = population_by_country_and_year * gender_proportion_by_year
5 # as a consequence, the result of the multiplication of the two arrays is not what we expected
6 population_by_country_gender_year.astype(int)
NameError: name 'population_by_country_and_year' is not defined
Boolean Operations
Python comparison operators are:
| Operator | Meaning |
|---|---|
| `==` | equal |
| `!=` | not equal |
| `>` | greater than |
| `>=` | greater than or equal |
| `<` | less than |
| `<=` | less than or equal |
Applying a comparison operator on an array returns a boolean array:
[32]:
# test which values are greater than 10 millions
population > 10e6
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[32], line 2
1 # test which values are greater than 10 millions
----> 2 population > 10e6
NameError: name 'population' is not defined
Comparison operations can be combined using Python bitwise operators:
| Operator | Meaning |
|---|---|
| `&` | and |
| `\|` | or |
| `~` | not |
[33]:
# test which values are greater than 10 millions and less than 40 millions
(population > 10e6) & (population < 40e6)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[33], line 2
1 # test which values are greater than 10 millions and less than 40 millions
----> 2 (population > 10e6) & (population < 40e6)
NameError: name 'population' is not defined
[34]:
# test which values are less than 10 millions or greater than 40 millions
(population < 10e6) | (population > 40e6)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[34], line 2
1 # test which values are less than 10 millions or greater than 40 millions
----> 2 (population < 10e6) | (population > 40e6)
NameError: name 'population' is not defined
[35]:
# test which values are not less than 10 millions
~(population < 10e6)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[35], line 2
1 # test which values are not less than 10 millions
----> 2 ~(population < 10e6)
NameError: name 'population' is not defined
The returned boolean array can then be used in selections and assignments:
[36]:
population_copy = population.copy()
# set all values greater than 40 millions to 40 millions
population_copy[population_copy > 40e6] = 40e6
population_copy
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[36], line 1
----> 1 population_copy = population.copy()
3 # set all values greater than 40 millions to 40 millions
4 population_copy[population_copy > 40e6] = 40e6
NameError: name 'population' is not defined
Boolean operations can be made between arrays:
[37]:
# test where the two arrays have the same values
population == population_copy
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[37], line 2
1 # test where the two arrays have the same values
----> 2 population == population_copy
NameError: name 'population' is not defined
To test whether all values of two arrays are equal, use the `equals` method:
[38]:
population.equals(population_copy)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[38], line 1
----> 1 population.equals(population_copy)
NameError: name 'population' is not defined