Transforming Arrays (Relabeling, Renaming, Reordering, Sorting, …)
Import the LArray library:
[1]:
from larray import *
Import the population array from the demography_eurostat dataset:
[2]:
demography_eurostat = load_example_data('demography_eurostat')
population = demography_eurostat.population
# display the 'population' array
population
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[2], line 1
----> 1 demography_eurostat = load_example_data('demography_eurostat')
2 population = demography_eurostat.population
4 # display the 'population' array
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/example.py:97, in load_example_data(name)
95 available_datasets = list(AVAILABLE_EXAMPLE_DATA.keys())
96 raise ValueError(f"example_data must be chosen from list {available_datasets}")
---> 97 return la.Session(AVAILABLE_EXAMPLE_DATA[name])
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/core/session.py:98, in Session.__init__(self, meta, *args, **kwargs)
94 elements = {a.name: a for a in args}
96 if isinstance(elements, (str, Path)):
97 # assume elements is a filename
---> 98 self.load(elements)
99 self.update(**kwargs)
100 else:
101 # iterable of tuple or dict-like
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/core/session.py:438, in Session.load(self, fname, names, engine, display, **kwargs)
436 else:
437 handler = handler_cls(fname)
--> 438 metadata, objects = handler.read(names, display=display, **kwargs)
439 self._update_from_iterable(objects.items())
440 self.meta = metadata
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/inout/common.py:139, in FileHandler.read(self, keys, display, ignore_exceptions, *args, **kwargs)
114 def read(self, keys, *args, display=False, ignore_exceptions=False, **kwargs) -> Tuple[Metadata, dict]:
115 r"""
116 Read file content (HDF, Excel, CSV, ...) and returns a dictionary containing loaded objects.
117
(...)
137 Dictionary containing the loaded objects.
138 """
--> 139 self._open_for_read()
140 metadata = self._read_metadata()
141 item_types = self.item_types()
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/inout/hdf.py:138, in PandasHDFHandler._open_for_read(self)
137 def _open_for_read(self):
--> 138 self.handle = HDFStore(self.fname, mode='r')
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/pandas/io/pytables.py:566, in HDFStore.__init__(self, path, mode, complevel, complib, fletcher32, **kwargs)
563 if "format" in kwargs:
564 raise ValueError("format is not a defined argument for HDFStore")
--> 566 tables = import_optional_dependency("tables")
568 if complib is not None and complib not in tables.filters.all_complibs:
569 raise ValueError(
570 f"complib only supports {tables.filters.all_complibs} compression."
571 )
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/pandas/compat/_optional.py:135, in import_optional_dependency(name, extra, errors, min_version)
130 msg = (
131 f"Missing optional dependency '{install_name}'. {extra} "
132 f"Use pip or conda to install {install_name}."
133 )
134 try:
--> 135 module = importlib.import_module(name)
136 except ImportError:
137 if errors == "raise":
File ~/.asdf/installs/python/3.11.9/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
124 break
125 level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)
File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)
File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)
File <frozen importlib._bootstrap>:1147, in _find_and_load_unlocked(name, import_)
File <frozen importlib._bootstrap>:690, in _load_unlocked(spec)
File <frozen importlib._bootstrap_external>:940, in exec_module(self, module)
File <frozen importlib._bootstrap>:241, in _call_with_frames_removed(f, *args, **kwds)
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/tables/__init__.py:44
40 raise RuntimeError("Blosc2 library not found. "
41 f"I looked for \"{', '.join(blosc2_search_paths)}\"")
43 # Necessary imports to get versions stored on the cython extension
---> 44 from .utilsextension import get_hdf5_version as _get_hdf5_version
46 from ._version import __version__
48 hdf5_version = _get_hdf5_version()
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/tables/utilsextension.pyx:1, in init tables.utilsextension()
ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject
Manipulating axes
The Array class offers several methods to manipulate the axes and labels of an array:
set_labels: to replace all or some labels of one or several axes.
rename: to replace one or several axis names.
set_axes: to replace one or several axes.
transpose: to modify the order of axes.
drop: to remove one or several labels.
combine_axes: to combine axes.
split_axes: to split one or several axes by splitting their labels and names.
reindex: to reorder, add and remove labels of one or several axes.
insert: to insert a label at a given position.
Relabeling
Replace some labels of an axis:
[3]:
# replace only one label of the 'gender' axis by passing a dict
population_new_labels = population.set_labels('gender', {'Male': 'Men'})
population_new_labels
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[3], line 2
1 # replace only one label of the 'gender' axis by passing a dict
----> 2 population_new_labels = population.set_labels('gender', {'Male': 'Men'})
3 population_new_labels
NameError: name 'population' is not defined
[4]:
# set all labels of the 'country' axis to uppercase by passing the function str.upper()
population_new_labels = population.set_labels('country', str.upper)
population_new_labels
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[4], line 2
1 # set all labels of the 'country' axis to uppercase by passing the function str.upper()
----> 2 population_new_labels = population.set_labels('country', str.upper)
3 population_new_labels
NameError: name 'population' is not defined
See set_labels for more details and examples.
Renaming axes
Rename one axis:
[5]:
# 'rename' returns a copy of the array
population_new_names = population.rename('time', 'year')
population_new_names
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[5], line 2
1 # 'rename' returns a copy of the array
----> 2 population_new_names = population.rename('time', 'year')
3 population_new_names
NameError: name 'population' is not defined
Rename several axes at once:
[6]:
population_new_names = population.rename({'gender': 'sex', 'time': 'year'})
population_new_names
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[6], line 1
----> 1 population_new_names = population.rename({'gender': 'sex', 'time': 'year'})
2 population_new_names
NameError: name 'population' is not defined
See rename for more details and examples.
Replacing Axes
Replace one axis:
[7]:
new_gender = Axis('sex=Men,Women')
population_new_axis = population.set_axes('gender', new_gender)
population_new_axis
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[7], line 2
1 new_gender = Axis('sex=Men,Women')
----> 2 population_new_axis = population.set_axes('gender', new_gender)
3 population_new_axis
NameError: name 'population' is not defined
Replace several axes at once:
[8]:
new_country = Axis('country_codes=BE,FR,DE')
population_new_axes = population.set_axes({'country': new_country, 'gender': new_gender})
population_new_axes
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[8], line 2
1 new_country = Axis('country_codes=BE,FR,DE')
----> 2 population_new_axes = population.set_axes({'country': new_country, 'gender': new_gender})
3 population_new_axes
NameError: name 'population' is not defined
Reordering axes
Axes can be reordered using the transpose method. By default, transpose reverses the order of the axes; otherwise, it permutes the axes according to the list given as argument. Axes not mentioned come after those which are mentioned (and keep their relative order). Finally, transpose returns a copy of the array.
[9]:
# starting order : country, gender, time
population
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[9], line 2
1 # starting order : country, gender, time
----> 2 population
NameError: name 'population' is not defined
[10]:
# no argument --> reverse all axes
population_transposed = population.transpose()
# .T is a shortcut for .transpose()
population_transposed = population.T
population_transposed
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[10], line 2
1 # no argument --> reverse all axes
----> 2 population_transposed = population.transpose()
4 # .T is a shortcut for .transpose()
5 population_transposed = population.T
NameError: name 'population' is not defined
[11]:
# reorder according to list
population_transposed = population.transpose('gender', 'country', 'time')
population_transposed
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[11], line 2
1 # reorder according to list
----> 2 population_transposed = population.transpose('gender', 'country', 'time')
3 population_transposed
NameError: name 'population' is not defined
[12]:
# move 'time' axis at first place
# not mentioned axes come after those which are mentioned (and keep their relative order)
population_transposed = population.transpose('time')
population_transposed
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[12], line 3
1 # move 'time' axis at first place
2 # not mentioned axes come after those which are mentioned (and keep their relative order)
----> 3 population_transposed = population.transpose('time')
4 population_transposed
NameError: name 'population' is not defined
[13]:
# move 'gender' axis at last place
# not mentioned axes come before those which are mentioned (and keep their relative order)
population_transposed = population.transpose(..., 'gender')
population_transposed
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[13], line 3
1 # move 'gender' axis at last place
2 # not mentioned axes come before those which are mentioned (and keep their relative order)
----> 3 population_transposed = population.transpose(..., 'gender')
4 population_transposed
NameError: name 'population' is not defined
See transpose for more details and examples.
Dropping Labels
[14]:
population_labels_dropped = population.drop([2014, 2016])
population_labels_dropped
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[14], line 1
----> 1 population_labels_dropped = population.drop([2014, 2016])
2 population_labels_dropped
NameError: name 'population' is not defined
See drop for more details and examples.
Combine And Split Axes
Combine two axes:
[15]:
population_combined_axes = population.combine_axes(('country', 'gender'))
population_combined_axes
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[15], line 1
----> 1 population_combined_axes = population.combine_axes(('country', 'gender'))
2 population_combined_axes
NameError: name 'population' is not defined
Split an axis:
[16]:
population_split_axes = population_combined_axes.split_axes('country_gender')
population_split_axes
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[16], line 1
----> 1 population_split_axes = population_combined_axes.split_axes('country_gender')
2 population_split_axes
NameError: name 'population_combined_axes' is not defined
See combine_axes and split_axes for more details and examples.
Reordering, adding and removing labels
The reindex method allows you to reorder, add and remove labels along one axis:
[17]:
# reverse years + remove 2013 + add 2018 + copy data for 2017 to 2018
population_new_time = population.reindex('time', '2018..2014', fill_value=population[2017])
population_new_time
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[17], line 2
1 # reverse years + remove 2013 + add 2018 + copy data for 2017 to 2018
----> 2 population_new_time = population.reindex('time', '2018..2014', fill_value=population[2017])
3 population_new_time
NameError: name 'population' is not defined
or several axes:
[18]:
population_new = population.reindex({'country': 'country=Luxembourg,Belgium,France,Germany',
'time': 'time=2018..2014'}, fill_value=0)
population_new
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[18], line 1
----> 1 population_new = population.reindex({'country': 'country=Luxembourg,Belgium,France,Germany',
2 'time': 'time=2018..2014'}, fill_value=0)
3 population_new
NameError: name 'population' is not defined
See reindex for more details and examples.
Another way to insert new labels is to use the insert method:
[19]:
# insert a new country before 'France' with all values set to 0
population_new_country = population.insert(0, before='France', label='Luxembourg')
# or equivalently
population_new_country = population.insert(0, after='Belgium', label='Luxembourg')
population_new_country
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[19], line 2
1 # insert a new country before 'France' with all values set to 0
----> 2 population_new_country = population.insert(0, before='France', label='Luxembourg')
3 # or equivalently
4 population_new_country = population.insert(0, after='Belgium', label='Luxembourg')
NameError: name 'population' is not defined
See insert for more details and examples.
Sorting
sort_labels: sort the labels of an axis.
labelsofsorted: give labels which would sort an axis.
sort_values: sort axes according to values.
[20]:
# get a copy of the 'population_benelux' array
population_benelux = demography_eurostat.population_benelux.copy()
population_benelux
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[20], line 2
1 # get a copy of the 'population_benelux' array
----> 2 population_benelux = demography_eurostat.population_benelux.copy()
3 population_benelux
NameError: name 'demography_eurostat' is not defined
Sort an axis (alphabetically if labels are strings)
[21]:
population_sorted = population_benelux.sort_labels('gender')
population_sorted
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[21], line 1
----> 1 population_sorted = population_benelux.sort_labels('gender')
2 population_sorted
NameError: name 'population_benelux' is not defined
Give labels which would sort the axis
[22]:
population_benelux.labelsofsorted('country')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[22], line 1
----> 1 population_benelux.labelsofsorted('country')
NameError: name 'population_benelux' is not defined
Sort according to values
[23]:
population_sorted = population_benelux.sort_values(('Male', 2017))
population_sorted
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[23], line 1
----> 1 population_sorted = population_benelux.sort_values(('Male', 2017))
2 population_sorted
NameError: name 'population_benelux' is not defined
Aligning Arrays
The align method aligns two arrays on their axes with a specified join method. In other words, it ensures all common axes are compatible.
[24]:
# get a copy of the 'births' array
births = demography_eurostat.births.copy()
# align the two arrays with the 'inner' join method
population_aligned, births_aligned = population_benelux.align(births, join='inner')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[24], line 2
1 # get a copy of the 'births' array
----> 2 births = demography_eurostat.births.copy()
4 # align the two arrays with the 'inner' join method
5 population_aligned, births_aligned = population_benelux.align(births, join='inner')
NameError: name 'demography_eurostat' is not defined
[25]:
print('population_benelux before align:')
print(population_benelux)
print()
print('population_benelux after align:')
print(population_aligned)
population_benelux before align:
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[25], line 2
1 print('population_benelux before align:')
----> 2 print(population_benelux)
3 print()
4 print('population_benelux after align:')
NameError: name 'population_benelux' is not defined
[26]:
print('births before align:')
print(births)
print()
print('births after align:')
print(births_aligned)
births before align:
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[26], line 2
1 print('births before align:')
----> 2 print(births)
3 print()
4 print('births after align:')
NameError: name 'births' is not defined
Aligned arrays can then be used in arithmetic operations:
[27]:
population_aligned - births_aligned
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[27], line 1
----> 1 population_aligned - births_aligned
NameError: name 'population_aligned' is not defined
See align for more details and examples.