from itertools import product
import numpy as np
import pandas as pd
from larray.core.array import Array
from larray.core.axis import Axis, AxisCollection
from larray.core.constants import nan
from larray.util.misc import unique_list
def decode(s, encoding='utf-8', errors='strict'):
    """
    Return `s` as text, decoding it first when it is a bytes object.

    Parameters
    ----------
    s : bytes, str or None
        Value to convert. bytes are decoded, str and None are returned unchanged.
    encoding : str, optional
        Encoding passed to ``bytes.decode``. Defaults to 'utf-8'.
    errors : str, optional
        Error handling scheme passed to ``bytes.decode``. Defaults to 'strict'.

    Returns
    -------
    str or None
    """
    if not isinstance(s, bytes):
        # anything which is not bytes must already be text (or missing)
        assert s is None or isinstance(s, str), "unexpected " + str(type(s))
        return s
    return s.decode(encoding, errors)
def parse(s):
    r"""
    Used to parse the "folded" axis ticks (usually periods).

    Strings are stripped, then converted to the first matching type among: bool ('true'/'false', case-insensitive),
    int (strings made only of decimal digits) and float. Strings which match none of these are returned stripped.
    Non-string values are returned unchanged.

    Parameters
    ----------
    s : object
        Value to parse.

    Returns
    -------
    bool, int, float, str or the original object
    """
    # parameters can be strings or numbers
    if not isinstance(s, str):
        return s

    s = s.strip()
    low = s.lower()
    if low == 'true':
        return True
    if low == 'false':
        return False
    # isdecimal() and not isdigit(): isdigit() is also True for characters such as superscripts ('²') which int()
    # rejects with a ValueError, while every decimal character is accepted by int()
    if s.isdecimal():
        return int(s)
    try:
        return float(s)
    except ValueError:
        return s
def index_to_labels(idx, sort=True):
    r"""
    Return unique labels for each dimension.

    Parameters
    ----------
    idx : pd.Index or pd.MultiIndex
        Index to extract the labels from.
    sort : bool, optional
        If True, return the levels of a MultiIndex as-is or the sorted values of a plain Index.
        If False, labels are returned in their order of first appearance (MultiIndex) or in index order (plain Index).
        Defaults to True.

    Returns
    -------
    list
        One sequence of labels per level (a single one for a plain Index).
    """
    if not isinstance(idx, pd.MultiIndex):
        assert isinstance(idx, pd.Index)
        flat_labels = list(idx.values)
        if sort:
            flat_labels = sorted(flat_labels)
        return [flat_labels]

    if sort:
        return list(idx.levels)
    return [unique_list(idx.get_level_values(level_pos)) for level_pos in range(idx.nlevels)]
def cartesian_product_df(df, sort_rows=False, sort_columns=False, fill_value=nan, **kwargs):
    r"""
    Expand a DataFrame so that its index holds the cartesian product of its labels.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe.
    sort_rows : bool, optional
        Whether to sort the index labels. Defaults to False.
    sort_columns : bool, optional
        Whether to sort the columns. Defaults to False.
    fill_value : scalar, optional
        Value used for the cells created by the expansion. Defaults to NaN.
    **kwargs
        Extra keyword arguments passed to ``DataFrame.reindex``.

    Returns
    -------
    tuple
        ``(dataframe, labels)`` where `labels` is the result of ``index_to_labels`` on the index of `df`.
        `df` itself is returned when its index and columns already match the target.
    """
    idx = df.index
    labels = index_to_labels(idx, sort=sort_rows)
    if isinstance(idx, pd.MultiIndex):
        # names are passed explicitly so that the level names survive the expansion in both branches
        # (from_tuples does not infer them, and without names it cannot build an index from an empty product)
        if sort_rows:
            new_index = pd.MultiIndex.from_product(labels, names=idx.names)
        else:
            new_index = pd.MultiIndex.from_tuples(list(product(*labels)), names=idx.names)
    else:
        if sort_rows:
            new_index = pd.Index(labels[0], name=idx.name)
        else:
            new_index = idx
    columns = sorted(df.columns) if sort_columns else list(df.columns)
    # the prodlen test is meant to avoid the more expensive array_equal test
    prodlen = np.prod([len(axis_labels) for axis_labels in labels])
    if prodlen == len(df) and columns == list(df.columns) and np.array_equal(idx.values, new_index.values):
        return df, labels
    return df.reindex(index=new_index, columns=columns, fill_value=fill_value, **kwargs), labels
def from_series(s, sort_rows=False, fill_value=nan, meta=None, **kwargs) -> Array:
    r"""
    Convert Pandas Series into Array.

    Parameters
    ----------
    s : Pandas Series
        Input Pandas Series.
    sort_rows : bool, optional
        Whether to sort the rows alphabetically. Defaults to False.
    fill_value : scalar, optional
        Value used to fill cells corresponding to label combinations which are not present in the input Series.
        Defaults to NaN.
    meta : list of pairs or dict or Metadata, optional
        Metadata (title, description, author, creation_date, ...) associated with the array.
        Keys must be strings. Values must be of type string, int, float, date, time or datetime.
    **kwargs
        Extra keyword arguments passed to from_frame when the Series has a MultiIndex. Unused otherwise.

    Returns
    -------
    Array

    See Also
    --------
    Array.to_series

    Examples
    --------
    >>> from larray import ndtest
    >>> s = ndtest((2, 2, 2), dtype=float).to_series()
    >>> s                                                                             # doctest: +NORMALIZE_WHITESPACE
    a   b   c
    a0  b0  c0    0.0
            c1    1.0
        b1  c0    2.0
            c1    3.0
    a1  b0  c0    4.0
            c1    5.0
        b1  c0    6.0
            c1    7.0
    dtype: float64
    >>> from_series(s)
     a  b\c   c0   c1
    a0   b0  0.0  1.0
    a0   b1  2.0  3.0
    a1   b0  4.0  5.0
    a1   b1  6.0  7.0
    """
    if not isinstance(s.index, pd.MultiIndex):
        # 1D case: the axis is named after the series, or after its index when the series has no name
        raw_name = s.index.name if s.name is None else s.name
        axis_name = decode(raw_name, 'utf8')
        if sort_rows:
            s = s.sort_index()
        return Array(s.values, Axis(s.index.values, axis_name), meta=meta)

    # TODO: use argument sort=False when it will be available
    #       (see https://github.com/pandas-dev/pandas/issues/15105)
    frame = s.unstack(level=-1, fill_value=fill_value)
    # pandas (un)stack and pivot(_table) methods return a Dataframe/Series with sorted index and columns
    if not sort_rows:
        # restore the original (unsorted) order of the labels
        level_labels = index_to_labels(s.index, sort=False)
        if isinstance(frame.index, pd.MultiIndex):
            row_index = pd.MultiIndex.from_tuples(list(product(*level_labels[:-1])), names=s.index.names[:-1])
        else:
            row_index = level_labels[0]
        frame = frame.reindex(index=row_index, columns=level_labels[-1], fill_value=fill_value)
    return from_frame(frame, sort_rows=sort_rows, sort_columns=sort_rows, fill_value=fill_value, meta=meta,
                      **kwargs)
def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfold_last_axis_name=False,
               fill_value=nan, meta=None, cartesian_prod=True, **kwargs) -> Array:
    r"""
    Convert Pandas DataFrame into Array.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe. By default, name and labels of the last axis are defined by the name and labels of the
        columns Index of the dataframe unless argument unfold_last_axis_name is set to True.
    sort_rows : bool, optional
        Whether to sort the rows alphabetically (sorting is more efficient than not sorting).
        Must be False if `cartesian_prod` is set to False.
        Defaults to False.
    sort_columns : bool, optional
        Whether to sort the columns alphabetically (sorting is more efficient than not sorting).
        Must be False if `cartesian_prod` is set to False.
        Defaults to False.
    parse_header : bool, optional
        Whether to parse columns labels. Pandas treats column labels as strings.
        If True, column labels are converted into int, float or boolean when possible. Defaults to False.
    unfold_last_axis_name : bool, optional
        Whether to extract the names of the last two axes by splitting the name of the last index column of the
        dataframe using ``\``. Defaults to False.
    fill_value : scalar, optional
        Value used to fill cells corresponding to label combinations which are not present in the input DataFrame.
        Defaults to NaN.
    meta : list of pairs or dict or Metadata, optional
        Metadata (title, description, author, creation_date, ...) associated with the array.
        Keys must be strings. Values must be of type string, int, float, date, time or datetime.
    cartesian_prod : bool, optional
        Whether to expand the dataframe to a cartesian product dataframe as needed by Array.
        This is an expensive operation but is absolutely required if you cannot guarantee your dataframe is already
        well formed. If False, arguments `sort_rows` and `sort_columns` must be set to False.
        Defaults to True.
    **kwargs
        Extra keyword arguments passed to cartesian_product_df. Unused when `cartesian_prod` is False.

    Returns
    -------
    Array

    Raises
    ------
    ValueError
        If `sort_rows` or `sort_columns` is True while `cartesian_prod` is False.

    See Also
    --------
    Array.to_frame

    Examples
    --------
    >>> from larray import ndtest
    >>> df = ndtest((2, 2, 2)).to_frame()
    >>> df                                                                             # doctest: +NORMALIZE_WHITESPACE
    c      c0  c1
    a  b
    a0 b0   0   1
       b1   2   3
    a1 b0   4   5
       b1   6   7
    >>> from_frame(df)
     a  b\c  c0  c1
    a0   b0   0   1
    a0   b1   2   3
    a1   b0   4   5
    a1   b1   6   7

    Names of the last two axes written as ``before_last_axis_name\\last_axis_name``

    >>> df = ndtest((2, 2, 2)).to_frame(fold_last_axis_name=True)
    >>> df                                                                             # doctest: +NORMALIZE_WHITESPACE
            c0  c1
    a  b\c
    a0 b0    0   1
       b1    2   3
    a1 b0    4   5
       b1    6   7
    >>> from_frame(df, unfold_last_axis_name=True)
     a  b\c  c0  c1
    a0   b0   0   1
    a0   b1   2   3
    a1   b0   4   5
    a1   b1   6   7
    """
    axes_names = [decode(name, 'utf8') if isinstance(name, bytes) else name
                  for name in df.index.names]

    # handle 2 or more dimensions with the last axis name given using \
    if unfold_last_axis_name:
        if isinstance(axes_names[-1], str) and '\\' in axes_names[-1]:
            last_axes = [name.strip() for name in axes_names[-1].split('\\')]
            axes_names = axes_names[:-1] + last_axes
        else:
            # nothing to unfold: the last axis is anonymous
            axes_names += [None]
    else:
        axes_names += [df.columns.name]

    if cartesian_prod:
        df, axes_labels = cartesian_product_df(df, sort_rows=sort_rows, sort_columns=sort_columns,
                                               fill_value=fill_value, **kwargs)
    else:
        if sort_rows or sort_columns:
            raise ValueError('sort_rows and sort_columns cannot be used when cartesian_prod is set to False. '
                             'Please call the method sort_labels on the returned array to sort rows or columns')
        axes_labels = index_to_labels(df.index, sort=False)

    # Pandas treats column labels as column names (strings) so we need to convert them to values
    last_axis_labels = [parse(cell) for cell in df.columns.values] if parse_header else list(df.columns.values)
    axes_labels.append(last_axis_labels)

    axes = AxisCollection([Axis(labels, name) for labels, name in zip(axes_labels, axes_names)])
    data = df.values.reshape(axes.shape)
    return Array(data, axes, meta=meta)
def set_dataframe_index_by_position(df, index_col_indices):
    """
    Equivalent to Dataframe.set_index but with column indices, not column labels.

    This is necessary to support creating an index from columns without a name or with duplicate names.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe.
    index_col_indices : int or list of int
        Position(s) of the column(s) to use as index.

    Returns
    -------
    pd.DataFrame
        A new Dataframe
    """
    positions = index_col_indices if isinstance(index_col_indices, list) else [index_col_indices]
    index_positions = set(positions)
    index_columns = [df.iloc[:, pos] for pos in positions]
    data_positions = [pos for pos in range(len(df.columns)) if pos not in index_positions]
    # keep only the "normal" columns, then add the index columns back as index
    return df.iloc[:, data_positions].set_index(index_columns)
def df_asarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header=True, wide=True, cartesian_prod=True,
               **kwargs) -> Array:
    r"""
    Prepare Pandas DataFrame and then convert it into Array.

    The input dataframe is not modified.

    Parameters
    ----------
    df : Pandas DataFrame
        Input dataframe.
    sort_rows : bool, optional
        Whether to sort the rows alphabetically (sorting is more efficient than not sorting).
        Must be False if `cartesian_prod` is set to False.
        Defaults to False.
    sort_columns : bool, optional
        Whether to sort the columns alphabetically (sorting is more efficient than not sorting).
        Must be False if `cartesian_prod` is set to False.
        Defaults to False.
    raw : bool, optional
        Whether to consider the input dataframe as a raw dataframe, i.e. read without index at all.
        If True, build the first N-1 axes of the output array from the first N-1 dataframe columns. Defaults to False.
    parse_header : bool, optional
        Whether to parse columns labels. Pandas treats column labels as strings.
        If True, column labels are converted into int, float or boolean when possible. Defaults to True.
    wide : bool, optional
        Whether to assume the array is stored in "wide" format.
        If False, the array is assumed to be stored in "narrow" format: one column per axis plus one value column.
        Defaults to True.
    cartesian_prod : bool, optional
        Whether to expand the dataframe to a cartesian product dataframe as needed by Array.
        This is an expensive operation but is absolutely required if you cannot guarantee your dataframe is already
        well formed. If False, arguments `sort_rows` and `sort_columns` must be set to False.
        Defaults to True.
    **kwargs
        Extra keyword arguments passed to from_series (raw narrow format) or from_frame (N-dimensional wide format).

    Returns
    -------
    Array

    Raises
    ------
    ValueError
        If `sort_rows` is True for a 1D array.
    """
    # we could inline df_asarray into the functions that use it, so that the original (non-cartesian) df is freed from
    # memory at this point, but it would be much uglier and would not lower the peak memory usage which happens during
    # cartesian_product_df.reindex

    # raw = True: the dataframe was read without index at all (ie 2D dataframe),
    # irrespective of the actual data dimensionality
    if raw:
        if wide:
            columns = df.columns.values.tolist()
            try:
                # take the first column which contains '\'
                pos_last = next(i for i, v in enumerate(columns) if isinstance(v, str) and '\\' in v)
            except StopIteration:
                # we assume first column will not contain data
                pos_last = 0

            # This is required to handle int column names (otherwise we can simply use column positions in set_index).
            # This is NOT the same as df.columns[list(range(...))] !
            df = set_dataframe_index_by_position(df, list(range(pos_last + 1)))
        else:
            df = set_dataframe_index_by_position(df, list(range(len(df.columns) - 1)))
            series = df.iloc[:, -1]
            series.name = df.index.name
            return from_series(series, sort_rows=sort_columns, **kwargs)

    # handle 1D arrays
    if len(df) == 1 and (pd.isnull(df.index.values[0])
                         or (isinstance(df.index.values[0], str) and df.index.values[0].strip() == '')):
        # validate arguments before doing any work
        if sort_rows:
            raise ValueError('sort_rows=True is not valid for 1D arrays. Please use sort_columns instead.')
        series = df.iloc[0]
        if parse_header:
            # set the parsed labels on the extracted row rather than on df.columns, so that the dataframe passed
            # by the caller is left untouched
            series.index = pd.Index([parse(cell) for cell in df.columns.values], name=df.columns.name)
        series.name = df.index.name
        res = from_series(series, sort_rows=sort_columns)
    else:
        def parse_axis_name(name):
            if isinstance(name, bytes):
                name = decode(name, 'utf8')
            if not name:
                name = None
            return name
        axes_names = [parse_axis_name(name) for name in df.index.names]
        unfold_last_axis_name = isinstance(axes_names[-1], str) and '\\' in axes_names[-1]
        res = from_frame(df, sort_rows=sort_rows, sort_columns=sort_columns, parse_header=parse_header,
                         unfold_last_axis_name=unfold_last_axis_name, cartesian_prod=cartesian_prod, **kwargs)

    # ugly hack to avoid anonymous axes converted as axes with name 'Unnamed: x' by pandas
    # we also take the opportunity to change axes with empty name to real anonymous axes (name is None) to
    # make them roundtrip correctly, based on the assumption that in an in-memory LArray an anonymous axis is more
    # likely and useful than an Axis with an empty name.
    # TODO : find a more robust and elegant solution
    res = res.rename({axis: None for axis in res.axes if (isinstance(axis.name, str)
                                                          and (axis.name == '' or 'Unnamed:' in axis.name))})
    return res