from textwrap import dedent
import numpy as np
import pandas as pd
import pytest
import dask.array as da
import dask.dataframe as dd
# Expected style text for the `_repr_html_` assertions in this module: the
# tests format it in directly ahead of the expected table text. The literal is
# compared verbatim, so its exact content (including the newline) matters.
style = """
"""
def test_repr():
    """The repr of a frame, its index and one of its columns names the type and partition count."""
    frame = pd.DataFrame({"x": list(range(100))})
    ddf = dd.from_pandas(frame, npartitions=3)

    collections = (ddf, ddf.index, ddf.x)
    for coll in collections:
        # The concrete class name must appear somewhere in the repr ...
        type_name = type(coll).__name__
        assert type_name in repr(coll)
        # ... and so must the number of partitions.
        partition_count = str(coll.npartitions)
        assert partition_count in repr(coll)
def test_repr_meta_mutation():
    """Replacing a column with a differently-typed one must be reflected in the repr."""
    source = pd.DataFrame({"a": range(5), "b": ["a", "b", "c", "d", "e"]})
    ddf = dd.from_pandas(source, npartitions=2)

    before = repr(ddf)
    # With no mutation in between, a second repr() gives the same text.
    assert repr(ddf) == before

    # Swap column "b" for its categorical version; the repr has to follow.
    as_category = ddf.b.astype("category")
    ddf.b = as_category
    assert repr(ddf) != before
def test_dataframe_format():
    """Check the text and HTML renderings of a three-partition frame with integer divisions.

    Skipped when jinja2 is not importable.
    """
    pytest.importorskip("jinja2")

    columns = {
        "A": [1, 2, 3, 4, 5, 6, 7, 8],
        "B": list("ABCDEFGH"),
        "C": pd.Categorical(list("AAABBBCC")),
    }
    ddf = dd.from_pandas(pd.DataFrame(columns), npartitions=3)

    # repr() and str() are expected to produce the same text.
    expected_repr = (
        "Dask DataFrame Structure:\n"
        " A B C\n"
        "npartitions=3 \n"
        "0 int64 object category[known]\n"
        "3 ... ... ...\n"
        "6 ... ... ...\n"
        "7 ... ... ...\n"
        "Dask Name: from_pandas, 3 tasks"
    )
    assert repr(ddf) == expected_repr
    assert str(ddf) == expected_repr

    # to_string() is expected to omit the title line and the "Dask Name" line.
    expected_string = (
        " A B C\n"
        "npartitions=3 \n"
        "0 int64 object category[known]\n"
        "3 ... ... ...\n"
        "6 ... ... ...\n"
        "7 ... ... ..."
    )
    assert ddf.to_string() == expected_string

    # Expected table text shared by both HTML assertions below.
    table_markup = """
|
A |
B |
C |
npartitions=3 |
|
|
|
0 |
int64 |
object |
category[known] |
3 |
... |
... |
... |
6 |
... |
... |
... |
7 |
... |
... |
... |
"""

    # to_html(): title, table, graph summary.
    html_template = """Dask DataFrame Structure:
{exp_table}
Dask Name: from_pandas, 3 tasks
"""
    expected_html = html_template.format(exp_table=table_markup)
    assert ddf.to_html() == expected_html

    # _repr_html_(): same layout, with the module-level style text placed
    # directly ahead of the table.
    styled_template = """Dask DataFrame Structure:
{style}{exp_table}
Dask Name: from_pandas, 3 tasks
"""
    expected_styled = styled_template.format(style=style, exp_table=table_markup)
    assert ddf._repr_html_() == expected_styled
def test_dataframe_format_with_index():
    """Check the text and HTML renderings of a frame indexed by the letters A-H.

    Skipped when jinja2 is not importable.
    """
    pytest.importorskip("jinja2")

    columns = {
        "A": [1, 2, 3, 4, 5, 6, 7, 8],
        "B": list("ABCDEFGH"),
        "C": pd.Categorical(list("AAABBBCC")),
    }
    labels = list("ABCDEFGH")
    ddf = dd.from_pandas(pd.DataFrame(columns, index=labels), npartitions=3)

    # repr() and str() are expected to produce the same text.
    expected_repr = (
        "Dask DataFrame Structure:\n"
        " A B C\n"
        "npartitions=3 \n"
        "A int64 object category[known]\n"
        "D ... ... ...\n"
        "G ... ... ...\n"
        "H ... ... ...\n"
        "Dask Name: from_pandas, 3 tasks"
    )
    assert repr(ddf) == expected_repr
    assert str(ddf) == expected_repr

    # Expected table text shared by both HTML assertions below.
    table_markup = """
|
A |
B |
C |
npartitions=3 |
|
|
|
A |
int64 |
object |
category[known] |
D |
... |
... |
... |
G |
... |
... |
... |
H |
... |
... |
... |
"""

    # to_html(): title, table, graph summary.
    html_template = """Dask DataFrame Structure:
{exp_table}
Dask Name: from_pandas, 3 tasks
"""
    expected_html = html_template.format(exp_table=table_markup)
    assert ddf.to_html() == expected_html

    # _repr_html_(): same layout, with the module-level style text placed
    # directly ahead of the table.
    styled_template = """Dask DataFrame Structure:
{style}{exp_table}
Dask Name: from_pandas, 3 tasks
"""
    expected_styled = styled_template.format(style=style, exp_table=table_markup)
    assert ddf._repr_html_() == expected_styled
def test_dataframe_format_unknown_divisions():
    """Check the text and HTML renderings once the frame's divisions have been cleared.

    Skipped when jinja2 is not importable.
    """
    pytest.importorskip("jinja2")

    columns = {
        "A": [1, 2, 3, 4, 5, 6, 7, 8],
        "B": list("ABCDEFGH"),
        "C": pd.Categorical(list("AAABBBCC")),
    }
    ddf = dd.from_pandas(pd.DataFrame(columns), npartitions=3).clear_divisions()
    # Precondition for everything below: divisions are no longer known.
    assert not ddf.known_divisions

    # repr() and str() are expected to produce the same text; the rows carry
    # no division labels.
    expected_repr = (
        "Dask DataFrame Structure:\n"
        " A B C\n"
        "npartitions=3 \n"
        " int64 object category[known]\n"
        " ... ... ...\n"
        " ... ... ...\n"
        " ... ... ...\n"
        "Dask Name: from_pandas, 3 tasks"
    )
    assert repr(ddf) == expected_repr
    assert str(ddf) == expected_repr

    # to_string() is expected to omit the title line and the "Dask Name" line.
    expected_string = (
        " A B C\n"
        "npartitions=3 \n"
        " int64 object category[known]\n"
        " ... ... ...\n"
        " ... ... ...\n"
        " ... ... ..."
    )
    assert ddf.to_string() == expected_string

    # Expected table text shared by both HTML assertions below.
    table_markup = """
|
A |
B |
C |
npartitions=3 |
|
|
|
|
int64 |
object |
category[known] |
|
... |
... |
... |
|
... |
... |
... |
|
... |
... |
... |
"""

    # to_html(): title, table, graph summary.
    html_template = """Dask DataFrame Structure:
{exp_table}
Dask Name: from_pandas, 3 tasks
"""
    expected_html = html_template.format(exp_table=table_markup)
    assert ddf.to_html() == expected_html

    # _repr_html_(): same layout, with the module-level style text placed
    # directly ahead of the table.
    styled_template = """Dask DataFrame Structure:
{style}{exp_table}
Dask Name: from_pandas, 3 tasks
"""
    expected_styled = styled_template.format(style=style, exp_table=table_markup)
    assert ddf._repr_html_() == expected_styled
def test_dataframe_format_long():
    """Check the text and HTML renderings of a ten-partition frame.

    The expected output shows only the first two and last two division rows,
    with an ellipsis row between them. Skipped when jinja2 is not importable.
    """
    pytest.importorskip("jinja2")

    columns = {
        "A": [1, 2, 3, 4, 5, 6, 7, 8] * 10,
        "B": list("ABCDEFGH") * 10,
        "C": pd.Categorical(list("AAABBBCC") * 10),
    }
    ddf = dd.from_pandas(pd.DataFrame(columns), npartitions=10)

    # repr() and str() are expected to produce the same text.
    expected_repr = (
        "Dask DataFrame Structure:\n"
        " A B C\n"
        "npartitions=10 \n"
        "0 int64 object category[known]\n"
        "8 ... ... ...\n"
        "... ... ... ...\n"
        "72 ... ... ...\n"
        "79 ... ... ...\n"
        "Dask Name: from_pandas, 10 tasks"
    )
    assert repr(ddf) == expected_repr
    assert str(ddf) == expected_repr

    # to_string() is expected to omit the title line and the "Dask Name" line.
    expected_string = (
        " A B C\n"
        "npartitions=10 \n"
        "0 int64 object category[known]\n"
        "8 ... ... ...\n"
        "... ... ... ...\n"
        "72 ... ... ...\n"
        "79 ... ... ..."
    )
    assert ddf.to_string() == expected_string

    # Expected table text shared by both HTML assertions below.
    table_markup = """
|
A |
B |
C |
npartitions=10 |
|
|
|
0 |
int64 |
object |
category[known] |
8 |
... |
... |
... |
... |
... |
... |
... |
72 |
... |
... |
... |
79 |
... |
... |
... |
"""

    # to_html(): title, table, graph summary.
    html_template = """Dask DataFrame Structure:
{exp_table}
Dask Name: from_pandas, 10 tasks
"""
    expected_html = html_template.format(exp_table=table_markup)
    assert ddf.to_html() == expected_html

    # _repr_html_(): same layout, with the module-level style text placed
    # directly ahead of the table.
    styled_template = """Dask DataFrame Structure:
{style}{exp_table}
Dask Name: from_pandas, 10 tasks
"""
    expected_styled = styled_template.format(style=style, exp_table=table_markup)
    assert ddf._repr_html_() == expected_styled
def test_series_format():
    """Check repr/str/to_string of a three-partition series, unnamed and then named."""
    values = [1, 2, 3, 4, 5, 6, 7, 8]
    labels = list("ABCDEFGH")

    # --- unnamed series -------------------------------------------------
    unnamed = dd.from_pandas(pd.Series(values, index=labels), npartitions=3)
    expected_repr = """Dask Series Structure:
npartitions=3
A int64
D ...
G ...
H ...
dtype: int64
Dask Name: from_pandas, 3 tasks"""
    assert repr(unnamed) == expected_repr
    assert str(unnamed) == expected_repr

    # to_string() is expected to omit the title, dtype and "Dask Name" lines.
    expected_string = """npartitions=3
A int64
D ...
G ...
H ..."""
    assert unnamed.to_string() == expected_string

    # --- named series: the footer is expected to carry the name ---------
    named = dd.from_pandas(pd.Series(values, index=labels, name="XXX"), npartitions=3)
    expected_named = """Dask Series Structure:
npartitions=3
A int64
D ...
G ...
H ...
Name: XXX, dtype: int64
Dask Name: from_pandas, 3 tasks"""
    assert repr(named) == expected_named
    assert str(named) == expected_named
def test_series_format_long():
    """Check repr/str/to_string of a ten-partition series, whose expected output elides middle rows."""
    values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] * 10
    labels = list("ABCDEFGHIJ") * 10
    ds = dd.from_pandas(pd.Series(values, index=labels), npartitions=10)

    # repr() and str() are expected to produce the same text.
    expected_repr = (
        "Dask Series Structure:\nnpartitions=10\nA int64\nB ...\n"
        " ... \nJ ...\nJ ...\ndtype: int64\n"
        "Dask Name: from_pandas, 10 tasks"
    )
    assert repr(ds) == expected_repr
    assert str(ds) == expected_repr

    # to_string() is expected to omit the title, dtype and "Dask Name" lines.
    expected_string = "npartitions=10\nA int64\nB ...\n ... \nJ ...\nJ ..."
    assert ds.to_string() == expected_string
def test_index_format():
    """Check repr/str of a dask Index built from a string index and from a named CategoricalIndex."""
    values = [1, 2, 3, 4, 5, 6, 7, 8]

    # --- index made of the letters A-H ------------------------------------
    lettered = pd.Series(values, index=list("ABCDEFGH"))
    ds = dd.from_pandas(lettered, npartitions=3)
    expected_object = """Dask Index Structure:
npartitions=3
A object
D ...
G ...
H ...
dtype: object
Dask Name: from_pandas, 6 tasks"""
    assert repr(ds.index) == expected_object
    assert str(ds.index) == expected_object

    # --- named categorical index -------------------------------------------
    categorical = pd.Series(
        values,
        index=pd.CategoricalIndex([1, 2, 3, 4, 5, 6, 7, 8], name="YYY"),
    )
    ds = dd.from_pandas(categorical, npartitions=3)
    expected_category = dedent(
        """\
Dask Index Structure:
npartitions=3
1 category[known]
4 ...
7 ...
8 ...
Name: YYY, dtype: category
Dask Name: from_pandas, 6 tasks"""
    )
    assert repr(ds.index) == expected_category
    assert str(ds.index) == expected_category
def test_categorical_format():
    """The repr of a categorical series is expected to say whether its categories are known."""
    categorical = pd.Series(["a", "b", "c"]).astype("category")
    known = dd.from_pandas(categorical, npartitions=1)
    unknown = known.cat.as_unknown()

    expected_known = (
        "Dask Series Structure:\n"
        "npartitions=1\n"
        "0 category[known]\n"
        "2 ...\n"
        "dtype: category\n"
        "Dask Name: from_pandas, 1 tasks"
    )
    assert repr(known) == expected_known

    # Identical apart from the [unknown] marker on the dtype row.
    expected_unknown = (
        "Dask Series Structure:\n"
        "npartitions=1\n"
        "0 category[unknown]\n"
        "2 ...\n"
        "dtype: category\n"
        "Dask Name: from_pandas, 1 tasks"
    )
    assert repr(unknown) == expected_unknown
def test_duplicate_columns_repr():
    """Smoke test: repr() of a frame with two columns both labelled "a" must not raise."""
    values = np.arange(10).reshape(5, 2)
    arr = da.from_array(values, chunks=(5, 2))
    frame = dd.from_dask_array(arr, columns=["a", "a"])
    # Only the absence of an exception is checked; the text itself is not.
    repr(frame)
def test_empty_repr():
    """Check the text and HTML renderings of a frame that has no columns and no rows.

    Skipped when jinja2 is not importable.
    """
    pytest.importorskip("jinja2")

    ddf = dd.from_pandas(pd.DataFrame(), npartitions=1)

    expected_repr = (
        "Empty Dask DataFrame Structure:\n"
        "Columns: []\n"
        "Divisions: [, ]\n"
        "Dask Name: from_pandas, 1 tasks"
    )
    assert repr(ddf) == expected_repr

    # _repr_html_(): title, module-level style text, table text, graph summary.
    table_markup = """"""
    styled_template = """Dask DataFrame Structure:
{style}{exp_table}
Dask Name: from_pandas, 1 tasks
"""
    expected_styled = styled_template.format(style=style, exp_table=table_markup)
    assert ddf._repr_html_() == expected_styled