from itertools import product
import numpy as np
import pandas as pd
from larray.core.array import Array
from larray.core.axis import Axis, AxisCollection
from larray.core.constants import nan
def decode(s, encoding='utf-8', errors='strict'):
if isinstance(s, bytes):
return s.decode(encoding, errors)
else:
assert s is None or isinstance(s, str), "unexpected " + str(type(s))
return s
def parse(s):
r"""
Used to parse the "folded" axis ticks (usually periods).
"""
# parameters can be strings or numbers
if isinstance(s, str):
s = s.strip()
low = s.lower()
if low == 'true':
return True
elif low == 'false':
return False
elif s.isdigit():
return int(s)
else:
try:
return float(s)
except ValueError:
return s
else:
return s
def simple_index_to_labels(idx: pd.Index, sort=True, keep_object=True) -> np.ndarray:
r"""
Return unique labels for a simple index as a numpy array
keep_object is an option which shouldn't exist (it should be always True)
It is a bug to use keep_object=False, but I introduced the option on purpose
to avoid breaking our users existing code in a bug-fix release (see #1193).
"""
if sort:
dtype = 'O' if keep_object and idx.dtype.kind == 'O' else None
# this will fail for mixed-type labels (as does np.sort(idx.to_numpy()))
labels = np.asarray(sorted(idx.to_list()), dtype=dtype)
else:
if keep_object:
# this is NOT the same as idx.to_numpy() (which we should always
# use) because it converts mixed str-numbers object indexes to a
# single str type.
labels = idx.to_numpy()
else:
labels = np.asarray(idx.to_list())
# this is a bug introduced on purpose to keep backward compatibility
# (see issue #1187)
if isinstance(idx, pd.DatetimeIndex):
labels = labels.astype(str)
return labels
def index_to_labels(idx, sort=True, keep_object=True): # -> list[np.ndarray]:
r"""
Return unique labels for each dimension as a list of numpy arrays
keep_object means that object dtype indexes will be returned as object
dtype arrays, even if they contain only strings or numbers (see #1193).
"""
if isinstance(idx, pd.MultiIndex):
if sort:
# idx.levels is a FrozenList of Index objects (which are already
# sorted)
return [simple_index_to_labels(idx_for_level, sort=False,
keep_object=keep_object)
for idx_for_level in idx.levels]
else:
# requires Pandas >= 0.23 (and it does NOT sort the values)
return [simple_index_to_labels(idx.unique(level_num), sort=False,
keep_object=keep_object)
for level_num in range(idx.nlevels)]
else:
assert isinstance(idx, pd.Index)
return [simple_index_to_labels(idx, sort=sort, keep_object=keep_object)]
def product_index(idx, sort=False):
"""
Converts a pandas (Multi)Index to an (Multi)Index with a cartesian
product of the labels present in each level
"""
labels = index_to_labels(idx, sort=sort)
if isinstance(idx, pd.MultiIndex):
return pd.MultiIndex.from_product(labels), labels
else:
assert isinstance(idx, pd.Index)
if sort:
return pd.Index(labels[0], name=idx.name), labels
else:
return idx, labels
def cartesian_product_df(df,
sort_rows=False,
sort_columns=False,
fill_value=nan,
**kwargs):
idx = df.index
columns = df.columns
prod_index, index_labels = product_index(idx, sort=sort_rows)
prod_columns, column_labels = product_index(columns, sort=sort_columns)
combined_labels = index_labels + column_labels
# the len() tests are meant to avoid the more expensive array_equal tests
if (len(prod_index) == len(idx) and
len(prod_columns) == len(columns) and
idx.equals(prod_index) and
columns.equals(prod_columns)):
return df, combined_labels
import numbers
if (isinstance(fill_value, numbers.Number) and not np.isnan(fill_value) and
any(dt.kind == 'O' and dt.type is str for dt in df.dtypes)):
df = df.copy()
for col in df.columns:
dt = df[col].dtype
if dt.kind == 'O' and dt.type is str:
# TODO: we should really output this warning, but the user
# needs a way to silence it, which requires we implement
# in all user-facing functions calling this function
# directly or indirectly (from_series, from_frame,
# read_excel, read_csv, ...) a way to specify
# both a usecols argument (when the data has mixed type
# but the user only needs an homogeneously typed subset)
# and a dtype=object argument when the user does need
# mixed types (and want to silence the warning).
# warnings.warn("fill_value is not valid for all "
# "columns because it is a (non-NaN) number but "
# f"the '{col}' column has string dtype. That "
# "column will converted to object dtype to avoid "
# "errors but this may cause performance issues.",
# FutureWarning, stacklevel=3)
df[col] = df[col].astype(object)
return df.reindex(index=prod_index, columns=prod_columns,
fill_value=fill_value, **kwargs), combined_labels
[docs]
def from_series(s,
sort_rows=False,
fill_value=nan,
meta=None,
copy=True,
**kwargs) -> Array:
r"""
Convert Pandas Series into Array.
Parameters
----------
s : Pandas Series
Input Pandas Series.
sort_rows : bool, optional
Whether to sort the rows alphabetically. Defaults to False.
fill_value : scalar, optional
Value used to fill cells corresponding to label combinations which are not present in the input Series.
Defaults to NaN.
meta : list of pairs or dict or Metadata, optional
Metadata (title, description, author, creation_date, ...) associated with the array.
Keys must be strings. Values must be of type string, int, float, date, time or datetime.
copy : bool, optional
Whether to copy the data from the Series. Defaults to True.
copy=False does not guarantee that no copy will be made, only that a
copy is only done when necessary. If the resulting array shares the same
data buffer than the original series, it will be read-only.
Returns
-------
Array
See Also
--------
Array.to_series
Examples
--------
>>> from larray import ndtest
>>> s = ndtest((2, 2, 2), dtype=float).to_series()
>>> s # doctest: +NORMALIZE_WHITESPACE
a b c
a0 b0 c0 0.0
c1 1.0
b1 c0 2.0
c1 3.0
a1 b0 c0 4.0
c1 5.0
b1 c0 6.0
c1 7.0
dtype: float64
>>> from_series(s)
a b\c c0 c1
a0 b0 0.0 1.0
a0 b1 2.0 3.0
a1 b0 4.0 5.0
a1 b1 6.0 7.0
"""
if isinstance(s.index, pd.MultiIndex):
# Using unstack sort argument (requires Pandas >= 2.1) would make this
# code simpler, but it makes it even slower than it already is.
# As of Pandas 2.3.3 on 12/2025, a series with a large MultiIndex is
# extremely slow to unstack, whether sort is used or not:
# >>> arr = ndtest((200, 200, 200))
# >>> s = arr.to_series() # 31.4 ms
# >>> s.unstack(level=-1, fill_value=np.nan) # 1.5s !!!
df = s.unstack(level=-1, fill_value=fill_value)
# pandas (un)stack and pivot(_table) methods return a Dataframe/Series with sorted index and columns
if not sort_rows:
labels = index_to_labels(s.index, sort=False)
if isinstance(df.index, pd.MultiIndex):
index = pd.MultiIndex.from_tuples(list(product(*labels[:-1])), names=s.index.names[:-1])
else:
index = labels[0]
columns = labels[-1]
# no need to use copy=False since Pandas implemented copy-on-write
# and will only copy if necessary
df = df.reindex(index=index, columns=columns, fill_value=fill_value)
# copy=False because unstack above already copied the data
res = from_frame(df,
sort_rows=sort_rows,
sort_columns=sort_rows,
fill_value=fill_value,
meta=meta,
copy=False,
**kwargs)
res.data.flags.writeable = True
return res
else:
name = decode(s.name, 'utf8') if s.name is not None else decode(s.index.name, 'utf8')
if sort_rows:
s = s.sort_index()
# sort_index copies the data, no need to copy it again
values = s.to_numpy(copy=False)
values.flags.writeable = True
else:
values = s.to_numpy(copy=copy)
labels = s.index.to_numpy(copy=copy)
return Array(values, Axis(labels, name), meta=meta)
[docs]
def from_frame(df,
sort_rows=False,
sort_columns=False,
parse_header=False,
unfold_last_axis_name=False,
fill_value=nan,
meta=None,
cartesian_prod=True,
copy=True,
**kwargs) -> Array:
r"""
Convert Pandas DataFrame into Array.
Parameters
----------
df : pandas.DataFrame
Input dataframe. By default, name and labels of the last axis are defined by the name and labels of the
columns Index of the dataframe unless argument unfold_last_axis_name is set to True.
sort_rows : bool, optional
Whether to sort the rows alphabetically (sorting is more efficient than not sorting).
Must be False if `cartesian_prod` is set to True.
Defaults to False.
sort_columns : bool, optional
Whether to sort the columns alphabetically (sorting is more efficient than not sorting).
Must be False if `cartesian_prod` is set to True.
Defaults to False.
parse_header : bool, optional
Whether to parse columns labels. Pandas treats column labels as strings.
If True, column labels are converted into int, float or boolean when possible. Defaults to False.
unfold_last_axis_name : bool, optional
Whether to extract the names of the last two axes by splitting the name of the last index column of the
dataframe using ``\``. Defaults to False.
fill_value : scalar, optional
Value used to fill cells corresponding to label combinations which are not present in the input DataFrame.
Defaults to NaN.
meta : list of pairs or dict or Metadata, optional
Metadata (title, description, author, creation_date, ...) associated with the array.
Keys must be strings. Values must be of type string, int, float, date, time or datetime.
cartesian_prod : bool, optional
Whether to expand the dataframe to a cartesian product dataframe as needed by Array.
This is an expensive operation but is absolutely required if you cannot guarantee your dataframe is already
well-formed. If False, arguments `sort_rows` and `sort_columns` must also be False.
Defaults to True.
copy : bool, optional
Whether to copy the data from the DataFrame. Defaults to True.
copy=False does not guarantee that no copy will be made, only that a
copy is only done when necessary. If the resulting array shares the same
data buffer than the original series, it will be read-only.
Returns
-------
Array
See Also
--------
Array.to_frame
Examples
--------
>>> from larray import ndtest
>>> df = ndtest((2, 2, 2)).to_frame()
>>> df # doctest: +NORMALIZE_WHITESPACE
c c0 c1
a b
a0 b0 0 1
b1 2 3
a1 b0 4 5
b1 6 7
>>> from_frame(df)
a b\c c0 c1
a0 b0 0 1
a0 b1 2 3
a1 b0 4 5
a1 b1 6 7
"""
axes_names = [decode(name, 'utf8') if isinstance(name, bytes) else name
for name in df.index.names]
# handle 2 or more dimensions with the last axis name given using \
if unfold_last_axis_name:
# Note that having several axes in columns (and using df.columns.names)
# in this case does not make sense
if isinstance(axes_names[-1], str) and '\\' in axes_names[-1]:
last_axes = [name.strip() for name in axes_names[-1].split('\\')]
axes_names = axes_names[:-1] + last_axes
else:
axes_names += [None]
else:
axes_names += df.columns.names
make_writable = False
if cartesian_prod:
orig_df = df
df, axes_labels = (
cartesian_product_df(df,
sort_rows=sort_rows,
sort_columns=sort_columns,
fill_value=fill_value,
**kwargs)
)
# we already copied the data so we can avoid copying it again when
# converting to numpy array
if copy and df is not orig_df:
copy = False
make_writable = True
else:
if sort_rows or sort_columns:
raise ValueError('sort_rows and sort_columns cannot not be used when cartesian_prod is set to False. '
'Please call the method sort_labels on the returned array to sort rows or columns')
# keep_object=False is an intentional bug to avoid breaking backwards
# compatibility (see issue #1193)
index_labels = index_to_labels(df.index, sort=False, keep_object=False)
column_labels = index_to_labels(df.columns, sort=False,
keep_object=False)
axes_labels = index_labels + column_labels
# Pandas treats column labels as column names (strings) so we need to convert them to values
if parse_header:
ncolaxes = df.columns.nlevels
for i in range(len(axes_labels) - ncolaxes, len(axes_labels)):
axes_labels[i] = [parse(cell) for cell in axes_labels[i]]
# TODO: use zip(..., strict=True) instead when we drop support for Python 3.9
assert len(axes_labels) == len(axes_names), \
(f"number of axes labels and axes names do not match: "
f"{len(axes_labels)} vs {len(axes_names)}:\n"
f"{axes_labels}\n\nvs\n\n{axes_names}")
axes = AxisCollection([Axis(labels, name) for labels, name in zip(axes_labels, axes_names)])
data = df.to_numpy(copy=copy).reshape(axes.shape)
if make_writable:
data.flags.writeable = True
return Array(data, axes, meta=meta)
def set_dataframe_index_by_position(df, index_col_indices):
"""
equivalent to Dataframe.set_index but with column indices, not column labels.
This is necessary to support creating an index from columns without a name or with duplicate names.
Return a new Dataframe (no shared data)
"""
if not isinstance(index_col_indices, list):
index_col_indices = [index_col_indices]
index_col_indices_set = set(index_col_indices)
index_col_values = [df.iloc[:, i] for i in index_col_indices]
non_index_col_indices = [i for i in range(len(df.columns)) if i not in index_col_indices_set]
# drop the index columns from the "normal" columns of the dataframe
df = df.iloc[:, non_index_col_indices]
# add them back as index columns
df.set_index(index_col_values, inplace=True)
return df
def df_asarray(df,
sort_rows=False,
sort_columns=False,
raw=False,
parse_header=True,
wide=True,
cartesian_prod=True,
**kwargs) -> Array:
r"""
Prepare Pandas DataFrame and then convert it into Array.
Warning
-------
This function actively tries to share data with the input dataframe
(equivalent to copy=False in from_frame) and will always return a writable
array (whether the backing array is shared with the input dataframe or not).
This is not a problem because we only use this function internally with
dataframes we created ourselves (by reading/converting files or other
data structures).
Parameters
----------
df : Pandas DataFrame
Input dataframe.
sort_rows : bool, optional
Whether to sort the rows alphabetically (sorting is more efficient than not sorting).
Must be False if `cartesian_prod` is set to True.
Defaults to False.
sort_columns : bool, optional
Whether to sort the columns alphabetically (sorting is more efficient than not sorting).
Must be False if `cartesian_prod` is set to True.
Defaults to False.
raw : bool, optional
Whether to consider the input dataframe as a raw dataframe, i.e. read without index at all.
If True, build the first N-1 axes of the output array from the first N-1 dataframe columns. Defaults to False.
parse_header : bool, optional
Whether to parse columns labels. Pandas treats column labels as strings.
If True, column labels are converted into int, float or boolean when possible. Defaults to True.
wide : bool, optional
Whether to assume the array is stored in "wide" format.
If False, the array is assumed to be stored in "narrow" format: one column per axis plus one value column.
Defaults to True.
cartesian_prod : bool, optional
Whether to expand the dataframe to a cartesian product dataframe as needed by Array.
This is an expensive operation but is absolutely required if you cannot guarantee your dataframe is already
well-formed. If True, arguments `sort_rows` and `sort_columns` must be set to False.
Defaults to True.
Returns
-------
Array
"""
# we could inline df_asarray into the functions that use it, so that the original (non-cartesian) df is freed from
# memory at this point, but it would be much uglier and would not lower the peak memory usage which happens during
# cartesian_product_df.reindex
# raw = True: the dataframe was read without index at all (ie 2D dataframe),
# irrespective of the actual data dimensionality
if raw:
columns = df.columns.to_list()
if wide:
try:
# take the first column which contains '\'
pos_last = next(i for i, v in enumerate(columns) if isinstance(v, str) and '\\' in v)
except StopIteration:
# we assume first column will not contain data
pos_last = 0
# This is required to handle int column names (otherwise we can simply use column positions in set_index).
# This is NOT the same as df.columns[list(range(...))] !
df = set_dataframe_index_by_position(df, list(range(pos_last + 1)))
else:
df = set_dataframe_index_by_position(df, list(range(len(df.columns) - 1)))
series = df.iloc[:, -1]
series.name = df.index.name
# copy=False because set_dataframe_index_by_position copies data
res = from_series(series, sort_rows=sort_columns, copy=False, **kwargs)
res.data.flags.writeable = True
return res
# handle 1D arrays
if len(df) == 1:
# .item() returns the first element and checks length is 1
idx_val = df.index.item()
is_1d_array = (pd.isnull(idx_val) or
(isinstance(idx_val, str) and idx_val.strip() == ''))
else:
is_1d_array = False
if is_1d_array:
if parse_header:
df.columns = pd.Index([parse(cell) for cell in df.columns.to_list()], name=df.columns.name)
series = df.iloc[0]
series.name = df.index.name
if sort_rows:
raise ValueError('sort_rows=True is not valid for 1D arrays. Please use sort_columns instead.')
res = from_series(series, sort_rows=sort_columns, copy=False)
else:
def parse_axis_name(name):
if isinstance(name, bytes):
name = decode(name, 'utf8')
if not name:
name = None
return name
axes_names = [parse_axis_name(name) for name in df.index.names]
unfold_last_axis_name = isinstance(axes_names[-1], str) and '\\' in axes_names[-1]
res = from_frame(df,
sort_rows=sort_rows,
sort_columns=sort_columns,
parse_header=parse_header,
unfold_last_axis_name=unfold_last_axis_name,
cartesian_prod=cartesian_prod,
copy=False,
**kwargs)
res.data.flags.writeable = True
# ugly hack to avoid anonymous axes converted as axes with name 'Unnamed: x' by pandas
# we also take the opportunity to change axes with empty name to real anonymous axes (name is None) to
# make them roundtrip correctly, based on the assumption that in an in-memory LArray an anonymous axis is more
# likely and useful than an Axis with an empty name.
# TODO : find a more robust and elegant solution
res = res.rename({axis: None for axis in res.axes if (isinstance(axis.name, str)
and (axis.name == '' or 'Unnamed:' in axis.name))})
return res