from __future__ import absolute_import, print_function
import os
import warnings
from collections import OrderedDict
import numpy as np
import pandas as pd
try:
import xlwings as xw
except ImportError:
xw = None
from larray.core.array import Array, asarray
from larray.core.axis import Axis
from larray.core.constants import nan
from larray.core.group import Group, _translate_sheet_name
from larray.core.metadata import Metadata
from larray.util.misc import deprecate_kwarg
from larray.inout.session import register_file_handler
from larray.inout.common import _get_index_col, FileHandler
from larray.inout.pandas import df_asarray
from larray.inout.xw_excel import open_excel
from larray.example import get_example_filepath
__all__ = ['read_excel']
# TODO: remove '# doctest: +SKIP' next to arr.info when Python 2.7 will be dropped
[docs]@deprecate_kwarg('nb_index', 'nb_axes', arg_converter=lambda x: x + 1)
@deprecate_kwarg('sheetname', 'sheet')
def read_excel(filepath, sheet=0, nb_axes=None, index_col=None, fill_value=nan, na=nan,
sort_rows=False, sort_columns=False, wide=True, engine=None, range=slice(None), **kwargs):
r"""
Reads excel file from sheet name and returns an Array with the contents
Parameters
----------
filepath : str
Path where the Excel file has to be read or use -1 to refer to the currently active workbook.
sheet : str, Group or int, optional
Name or index of the Excel sheet containing the array to be read.
By default the array is read from the first sheet.
nb_axes : int, optional
Number of axes of output array. The first ``nb_axes`` - 1 columns and the header of the Excel sheet will be used
to set the axes of the output array. If not specified, the number of axes is given by the position of the
first column header including a ``\`` character plus one. If no column header includes a ``\`` character, the
array is assumed to have one axis. Defaults to None.
index_col : list, optional
Positions of columns for the n-1 first axes (ex. [0, 1, 2, 3]). Defaults to None (see nb_axes above).
fill_value : scalar or Array, optional
Value used to fill cells corresponding to label combinations which are not present in the input.
Defaults to NaN.
sort_rows : bool, optional
Whether or not to sort the rows alphabetically (sorting is more efficient than not sorting). Defaults to False.
sort_columns : bool, optional
Whether or not to sort the columns alphabetically (sorting is more efficient than not sorting).
Defaults to False.
wide : bool, optional
Whether or not to assume the array is stored in "wide" format.
If False, the array is assumed to be stored in "narrow" format: one column per axis plus one value column.
Defaults to True.
engine : {'xlrd', 'xlwings'}, optional
Engine to use to read the Excel file. If None (default), it will use 'xlwings' by default if the module is
installed and relies on Pandas default reader otherwise.
range : str, optional
Range to load the array from (only supported for the 'xlwings' engine). Defaults to slice(None) which loads
the whole sheet, ignoring blank cells in the bottom right corner.
**kwargs
Returns
-------
Array
Examples
--------
>>> fname = get_example_filepath('examples.xlsx')
Read array from first sheet
>>> # The data below is derived from a subset of the demo_pjan table from Eurostat
>>> read_excel(fname)
country gender\time 2013 2014 2015
Belgium Male 5472856 5493792 5524068
Belgium Female 5665118 5687048 5713206
France Male 31772665 32045129 32174258
France Female 33827685 34120851 34283895
Germany Male 39380976 39556923 39835457
Germany Female 41142770 41210540 41362080
Read array from a specific sheet
>>> # The data below is derived from a subset of the demo_fasec table from Eurostat
>>> read_excel(fname, 'births')
country gender\time 2013 2014 2015
Belgium Male 64371 64173 62561
Belgium Female 61235 60841 59713
France Male 415762 418721 409145
France Female 396581 400607 390526
Germany Male 349820 366835 378478
Germany Female 332249 348092 359097
Missing label combinations
Let us take a look inside the sheet 'pop_missing_values'. Note the missing label combinations:
(Paris, male) and (New York, female): ::
country gender\time 2013 2014 2015
Belgium Male 5472856 5493792 5524068
Belgium Female 5665118 5687048 5713206
France Female 33827685 34120851 34283895
Germany Male 39380976 39556923 39835457
By default, cells associated with missing label combinations are filled with NaN. In that case, an int array
is converted to a float array.
>>> read_excel(fname, sheet='pop_missing_values')
country gender\time 2013 2014 2015
Belgium Male 5472856.0 5493792.0 5524068.0
Belgium Female 5665118.0 5687048.0 5713206.0
France Male nan nan nan
France Female 33827685.0 34120851.0 34283895.0
Germany Male 39380976.0 39556923.0 39835457.0
Germany Female nan nan nan
Using the ``fill_value`` argument, you can choose another value to use to fill missing cells.
>>> read_excel(fname, sheet='pop_missing_values', fill_value=0)
country gender\time 2013 2014 2015
Belgium Male 5472856 5493792 5524068
Belgium Female 5665118 5687048 5713206
France Male 0 0 0
France Female 33827685 34120851 34283895
Germany Male 39380976 39556923 39835457
Germany Female 0 0 0
Specify the number of axes of the output array (useful when the name of the last axis is implicit)
The content of the sheet 'missing_axis_name' is: ::
country gender 2013 2014 2015
Belgium Male 5472856 5493792 5524068
Belgium Female 5665118 5687048 5713206
France Male 31772665 32045129 32174258
France Female 33827685 34120851 34283895
Germany Male 39380976 39556923 39835457
Germany Female 41142770 41210540 41362080
>>> # read the array stored in the sheet 'pop_missing_axis_name' as is
>>> arr = read_excel(fname, sheet='pop_missing_axis_name')
>>> # we expected a 3 x 2 x 3 array with data of type int
>>> # but we got a 6 x 4 array with data of type object
>>> arr.info # doctest: +SKIP
6 x 4
country [6]: 'Belgium' 'Belgium' 'France' 'France' 'Germany' 'Germany'
{1} [4]: 'gender' '2013' '2014' '2015'
dtype: object
memory used: 192 bytes
>>> # using argument 'nb_axes', you can force the number of axes of the output array
>>> arr = read_excel(fname, sheet='pop_missing_axis_name', nb_axes=3)
>>> # as expected, we have a 3 x 2 x 3 array with data of type int
>>> arr.info # doctest: +SKIP
3 x 2 x 3
country [3]: 'Belgium' 'France' 'Germany'
gender [2]: 'Male' 'Female'
{2} [3]: 2013 2014 2015
dtype: int64
memory used: 144 bytes
Read array saved in "narrow" format (wide=False)
Let us take a look inside the sheet 'pop_narrow' where the data is stored in a 'narrow' format: ::
country time value
Belgium 2013 11137974
Belgium 2014 11180840
Belgium 2015 11237274
France 2013 65600350
France 2014 66165980
France 2015 66458153
>>> # to read arrays stored in 'narrow' format, you must pass wide=False to read_excel
>>> read_excel(fname, 'pop_narrow_format', wide=False)
country\time 2013 2014 2015
Belgium 11137974 11180840 11237274
France 65600350 66165980 66458153
Extract array from a given range (xlwings only)
>>> read_excel(fname, 'pop_births_deaths', range='A9:E15') # doctest: +SKIP
country gender\time 2013 2014 2015
Belgium Male 64371 64173 62561
Belgium Female 61235 60841 59713
France Male 415762 418721 409145
France Female 396581 400607 390526
Germany Male 349820 366835 378478
Germany Female 332249 348092 359097
"""
if not np.isnan(na):
fill_value = na
warnings.warn("read_excel `na` argument has been renamed to `fill_value`. Please use that instead.",
FutureWarning, stacklevel=2)
sheet = _translate_sheet_name(sheet)
if engine is None:
engine = 'xlwings' if xw is not None else None
index_col = _get_index_col(nb_axes, index_col, wide)
if engine == 'xlwings':
if kwargs:
raise TypeError("'{}' is an invalid keyword argument for this function when using the xlwings backend"
.format(list(kwargs.keys())[0]))
from larray.inout.xw_excel import open_excel
with open_excel(filepath) as wb:
return wb[sheet][range].load(index_col=index_col, fill_value=fill_value, sort_rows=sort_rows,
sort_columns=sort_columns, wide=wide)
else:
# TODO: add support for range argument (using usecols, skiprows and nrows arguments of pandas.read_excel)
df = pd.read_excel(filepath, sheet, index_col=index_col, engine=engine, **kwargs)
return df_asarray(df, sort_rows=sort_rows, sort_columns=sort_columns, raw=index_col is None,
fill_value=fill_value, wide=wide)
@register_file_handler('pandas_excel', ['xls', 'xlsx'] if xw is None else None)
class PandasExcelHandler(FileHandler):
r"""
Handler for Excel files using Pandas.
"""
def __init__(self, fname, overwrite_file=False):
super(PandasExcelHandler, self).__init__(fname, overwrite_file)
def _open_for_read(self):
self.handle = pd.ExcelFile(self.fname)
def _open_for_write(self):
self.handle = pd.ExcelWriter(self.fname)
def list_items(self):
sheet_names = self.handle.sheet_names
items = []
try:
sheet_names.remove('__metadata__')
except:
pass
items += [(name, 'Array') for name in sheet_names]
return items
def _read_item(self, key, type, *args, **kwargs):
if type == 'Array':
df = self.handle.parse(key, *args, **kwargs)
return df_asarray(df, raw=True)
else:
raise TypeError()
def _dump_item(self, key, value, *args, **kwargs):
kwargs['engine'] = 'xlsxwriter'
if isinstance(value, Array):
value.to_excel(self.handle, key, *args, **kwargs)
else:
raise TypeError()
def _read_metadata(self):
sheet_meta = '__metadata__'
if sheet_meta in self.handle.sheet_names:
meta = read_excel(self.handle, sheet_meta, engine='xlrd', wide=False)
return Metadata.from_array(meta)
else:
return Metadata()
def _dump_metadata(self, metadata):
if len(metadata) > 0:
metadata = asarray(metadata)
metadata.to_excel(self.handle, '__metadata__', engine='xlsxwriter', wide=False, value_name='')
def save(self):
pass
def close(self):
self.handle.close()
@register_file_handler('xlwings_excel', ['xls', 'xlsx'] if xw is not None else None)
class XLWingsHandler(FileHandler):
r"""
Handler for Excel files using XLWings.
"""
def __init__(self, fname, overwrite_file=False):
super(XLWingsHandler, self).__init__(fname, overwrite_file)
def _get_original_file_name(self):
# for XLWingsHandler, no need to create a temporary file, the job is already done in the Workbook class
pass
def _open_for_read(self):
self.handle = open_excel(self.fname)
def _open_for_write(self):
self.handle = open_excel(self.fname, overwrite_file=self.overwrite_file)
def list_items(self):
sheet_names = self.handle.sheet_names()
items = []
try:
sheet_names.remove('__metadata__')
except:
pass
items += [(name, 'Array') for name in sheet_names]
return items
def _read_item(self, key, type, *args, **kwargs):
if type == 'Array':
return self.handle[key].load(*args, **kwargs)
else:
raise TypeError()
def _dump_item(self, key, value, *args, **kwargs):
if isinstance(value, Array):
self.handle[key] = value.dump(*args, **kwargs)
else:
raise TypeError()
def _read_metadata(self):
sheet_meta = '__metadata__'
if sheet_meta in self.handle:
meta = self.handle[sheet_meta].load(wide=False)
return Metadata.from_array(meta)
else:
return Metadata()
def _dump_metadata(self, metadata):
if len(metadata) > 0:
metadata = asarray(metadata)
self.handle['__metadata__'] = metadata.dump(wide=False, value_name='')
def save(self):
self.handle.save()
def close(self):
self.handle.close()