Import the LArray library:
[1]:
from larray import *
[2]:
# load 'demography_eurostat' dataset
demography_eurostat = load_example_data('demography_eurostat')
# extract the 'population' array from the dataset
population = demography_eurostat.population
population
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[2], line 2
1 # load 'demography_eurostat' dataset
----> 2 demography_eurostat = load_example_data('demography_eurostat')
4 # extract the 'population' array from the dataset
5 population = demography_eurostat.population
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/example.py:97, in load_example_data(name)
95 available_datasets = list(AVAILABLE_EXAMPLE_DATA.keys())
96 raise ValueError(f"example_data must be chosen from list {available_datasets}")
---> 97 return la.Session(AVAILABLE_EXAMPLE_DATA[name])
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/core/session.py:98, in Session.__init__(self, meta, *args, **kwargs)
94 elements = {a.name: a for a in args}
96 if isinstance(elements, (str, Path)):
97 # assume elements is a filename
---> 98 self.load(elements)
99 self.update(**kwargs)
100 else:
101 # iterable of tuple or dict-like
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/core/session.py:438, in Session.load(self, fname, names, engine, display, **kwargs)
436 else:
437 handler = handler_cls(fname)
--> 438 metadata, objects = handler.read(names, display=display, **kwargs)
439 self._update_from_iterable(objects.items())
440 self.meta = metadata
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/inout/common.py:139, in FileHandler.read(self, keys, display, ignore_exceptions, *args, **kwargs)
114 def read(self, keys, *args, display=False, ignore_exceptions=False, **kwargs) -> Tuple[Metadata, dict]:
115 r"""
116 Read file content (HDF, Excel, CSV, ...) and returns a dictionary containing loaded objects.
117
(...)
137 Dictionary containing the loaded objects.
138 """
--> 139 self._open_for_read()
140 metadata = self._read_metadata()
141 item_types = self.item_types()
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/larray/inout/hdf.py:138, in PandasHDFHandler._open_for_read(self)
137 def _open_for_read(self):
--> 138 self.handle = HDFStore(self.fname, mode='r')
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/pandas/io/pytables.py:566, in HDFStore.__init__(self, path, mode, complevel, complib, fletcher32, **kwargs)
563 if "format" in kwargs:
564 raise ValueError("format is not a defined argument for HDFStore")
--> 566 tables = import_optional_dependency("tables")
568 if complib is not None and complib not in tables.filters.all_complibs:
569 raise ValueError(
570 f"complib only supports {tables.filters.all_complibs} compression."
571 )
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/pandas/compat/_optional.py:135, in import_optional_dependency(name, extra, errors, min_version)
130 msg = (
131 f"Missing optional dependency '{install_name}'. {extra} "
132 f"Use pip or conda to install {install_name}."
133 )
134 try:
--> 135 module = importlib.import_module(name)
136 except ImportError:
137 if errors == "raise":
File ~/.asdf/installs/python/3.11.9/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
124 break
125 level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)
File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)
File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)
File <frozen importlib._bootstrap>:1147, in _find_and_load_unlocked(name, import_)
File <frozen importlib._bootstrap>:690, in _load_unlocked(spec)
File <frozen importlib._bootstrap_external>:940, in exec_module(self, module)
File <frozen importlib._bootstrap>:241, in _call_with_frames_removed(f, *args, **kwds)
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/tables/__init__.py:44
40 raise RuntimeError("Blosc2 library not found. "
41 f"I looked for \"{', '.join(blosc2_search_paths)}\"")
43 # Necessary imports to get versions stored on the cython extension
---> 44 from .utilsextension import get_hdf5_version as _get_hdf5_version
46 from ._version import __version__
48 hdf5_version = _get_hdf5_version()
File ~/checkouts/readthedocs.org/user_builds/larray/envs/0.34.3/lib/python3.11/site-packages/tables/utilsextension.pyx:1, in init tables.utilsextension()
ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject
Inspecting Array objects
Get array summary: metadata + dimensions + description of axes + dtype + size in memory
[3]:
# Array summary: metadata + dimensions + description of axes
population.info
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[3], line 2
1 # Array summary: metadata + dimensions + description of axes
----> 2 population.info
NameError: name 'population' is not defined
Get axes
[4]:
population.axes
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[4], line 1
----> 1 population.axes
NameError: name 'population' is not defined
Get axis names
[5]:
population.axes.names
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[5], line 1
----> 1 population.axes.names
NameError: name 'population' is not defined
Get number of dimensions
[6]:
population.ndim
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[6], line 1
----> 1 population.ndim
NameError: name 'population' is not defined
Get length of each dimension
[7]:
population.shape
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[7], line 1
----> 1 population.shape
NameError: name 'population' is not defined
Get total number of elements of the array
[8]:
population.size
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[8], line 1
----> 1 population.size
NameError: name 'population' is not defined
Get type of internal data (int, float, …)
[9]:
population.dtype
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[9], line 1
----> 1 population.dtype
NameError: name 'population' is not defined
Get size in memory
[10]:
population.memory_used
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[10], line 1
----> 1 population.memory_used
NameError: name 'population' is not defined
Some Useful Functions
with_total
Add totals to one or several axes:
[11]:
population.with_total('gender', label='Total')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[11], line 1
----> 1 population.with_total('gender', label='Total')
NameError: name 'population' is not defined
See with_total for more details and examples.
where
The where function can be used to apply some computation depending on a condition:
[12]:
# where(condition, value if true, value if false)
where(population < population.mean('time'), -population, population)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[12], line 2
1 # where(condition, value if true, value if false)
----> 2 where(population < population.mean('time'), -population, population)
NameError: name 'population' is not defined
See where for more details and examples.
clip
Limit all values to a given range:
[13]:
# values below 10 millions are set to 10 millions
population.clip(minval=10**7)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[13], line 2
1 # values below 10 millions are set to 10 millions
----> 2 population.clip(minval=10**7)
NameError: name 'population' is not defined
[14]:
# values above 40 millions are set to 40 millions
population.clip(maxval=4*10**7)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[14], line 2
1 # values above 40 millions are set to 40 millions
----> 2 population.clip(maxval=4*10**7)
NameError: name 'population' is not defined
[15]:
# values below 10 millions are set to 10 millions and
# values above 40 millions are set to 40 millions
population.clip(10**7, 4*10**7)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[15], line 3
1 # values below 10 millions are set to 10 millions and
2 # values above 40 millions are set to 40 millions
----> 3 population.clip(10**7, 4*10**7)
NameError: name 'population' is not defined
[16]:
# Using vectors to define the lower and upper bounds
lower_bound = sequence(population.time, initial=5_500_000, inc=50_000)
upper_bound = sequence(population.time, 41_000_000, inc=100_000)
print(lower_bound, '\n')
print(upper_bound, '\n')
population.clip(lower_bound, upper_bound)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[16], line 2
1 # Using vectors to define the lower and upper bounds
----> 2 lower_bound = sequence(population.time, initial=5_500_000, inc=50_000)
3 upper_bound = sequence(population.time, 41_000_000, inc=100_000)
5 print(lower_bound, '\n')
NameError: name 'population' is not defined
See clip for more details and examples.
divnot0
Replace the result of a division by 0 with 0:
[17]:
divisor = ones(population.axes, dtype=int)
divisor['Male'] = 0
divisor
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[17], line 1
----> 1 divisor = ones(population.axes, dtype=int)
2 divisor['Male'] = 0
3 divisor
NameError: name 'population' is not defined
[18]:
population / divisor
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[18], line 1
----> 1 population / divisor
NameError: name 'population' is not defined
[19]:
# we use astype(int) since the divnot0 method
# returns a float array in this case while
# we want an integer array
population.divnot0(divisor).astype(int)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[19], line 4
1 # we use astype(int) since the divnot0 method
2 # returns a float array in this case while
3 # we want an integer array
----> 4 population.divnot0(divisor).astype(int)
NameError: name 'population' is not defined
See divnot0 for more details and examples.
ratio
The ratio (rationot0) method returns an array with all values divided by the sum of values along given axes:
[20]:
population.ratio('gender')
# which is equivalent to
population / population.sum('gender')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[20], line 1
----> 1 population.ratio('gender')
3 # which is equivalent to
4 population / population.sum('gender')
NameError: name 'population' is not defined
percent
[21]:
# or, if you want the previous ratios in percents
population.percent('gender')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[21], line 2
1 # or, if you want the previous ratios in percents
----> 2 population.percent('gender')
NameError: name 'population' is not defined
See percent for more details and examples.
diff
The diff method calculates the n-th order discrete difference along a given axis.
The first order difference is given by out[n+1] = in[n+1] - in[n] along the given axis.
[22]:
# calculates 'diff[year+1] = population[year+1] - population[year]'
population.diff('time')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[22], line 2
1 # calculates 'diff[year+1] = population[year+1] - population[year]'
----> 2 population.diff('time')
NameError: name 'population' is not defined
[23]:
# calculates 'diff[year+2] = population[year+2] - population[year]'
population.diff('time', d=2)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[23], line 2
1 # calculates 'diff[year+2] = population[year+2] - population[year]'
----> 2 population.diff('time', d=2)
NameError: name 'population' is not defined
[24]:
# calculates 'diff[year] = population[year+1] - population[year]'
population.diff('time', label='lower')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[24], line 2
1 # calculates 'diff[year] = population[year+1] - population[year]'
----> 2 population.diff('time', label='lower')
NameError: name 'population' is not defined
See diff for more details and examples.
growth_rate
The growth_rate method calculates the growth along a given axis.
It is roughly equivalent to a.diff(axis, d, label) / a[axis.i[:-d]]:
[25]:
population.growth_rate('time')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[25], line 1
----> 1 population.growth_rate('time')
NameError: name 'population' is not defined
See growth_rate for more details and examples.
shift
The shift method drops the first label of an axis and shifts all subsequent labels:
[26]:
population.shift('time')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[26], line 1
----> 1 population.shift('time')
NameError: name 'population' is not defined
[27]:
# when shift is applied on an (increasing) time axis,
# it effectively brings "past" data into the future
population_shifted = population.shift('time')
stack({'population_shifted_2014': population_shifted[2014], 'population_2013': population[2013]}, 'array')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[27], line 3
1 # when shift is applied on an (increasing) time axis,
2 # it effectively brings "past" data into the future
----> 3 population_shifted = population.shift('time')
4 stack({'population_shifted_2014': population_shifted[2014], 'population_2013': population[2013]}, 'array')
NameError: name 'population' is not defined
See shift for more details and examples.
Other interesting functions
Many more useful functions are described in the API reference, in the sections Aggregation Functions, Miscellaneous and Utility Functions.