"""Round-trip tests for categorical data stored through HDFStore / read_hdf."""
import numpy as np
import pytest

from pandas import (
    Categorical,
    DataFrame,
    Series,
    _testing as tm,
    concat,
    read_hdf,
)
from pandas.tests.io.pytables.common import (
    _maybe_remove,
    ensure_clean_path,
    ensure_clean_store,
)

pytestmark = [
    pytest.mark.single_cpu,
    # pytables https://github.com/PyTables/PyTables/issues/822
    pytest.mark.filterwarnings(
        "ignore:a closed node found in the registry:UserWarning"
    ),
]


def test_categorical(setup_path):
    """Append, select, query and remove categorical Series/DataFrames in a store.

    Covers ordered and unordered categoricals, numeric categories (with and
    without NaN), frames with several categorical columns, ``where`` queries
    on a categorical data column, appending with matching / mismatching
    categories, and removal of the stored category metadata.
    """
    with ensure_clean_store(setup_path) as store:
        # Basic
        _maybe_remove(store, "s")
        s = Series(
            Categorical(
                ["a", "b", "b", "a", "a", "c"],
                categories=["a", "b", "c", "d"],
                ordered=False,
            )
        )
        store.append("s", s, format="table")
        result = store.select("s")
        tm.assert_series_equal(s, result)

        _maybe_remove(store, "s_ordered")
        s = Series(
            Categorical(
                ["a", "b", "b", "a", "a", "c"],
                categories=["a", "b", "c", "d"],
                ordered=True,
            )
        )
        store.append("s_ordered", s, format="table")
        result = store.select("s_ordered")
        tm.assert_series_equal(s, result)

        _maybe_remove(store, "df")
        df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]})
        store.append("df", df, format="table")
        result = store.select("df")
        tm.assert_frame_equal(result, df)

        # Dtypes
        _maybe_remove(store, "si")
        s = Series([1, 1, 2, 2, 3, 4, 5]).astype("category")
        store.append("si", s)
        result = store.select("si")
        tm.assert_series_equal(result, s)

        _maybe_remove(store, "si2")
        s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category")
        store.append("si2", s)
        result = store.select("si2")
        tm.assert_series_equal(result, s)

        # Multiple
        _maybe_remove(store, "df2")
        df2 = df.copy()
        df2["s2"] = Series(list("abcdefg")).astype("category")
        store.append("df2", df2)
        result = store.select("df2")
        tm.assert_frame_equal(result, df2)

        # Make sure the metadata is OK
        info = store.info()
        assert "/df2   " in info
        # df2._mgr.blocks[0] and df2._mgr.blocks[2] are Categorical
        assert "/df2/meta/values_block_0/meta" in info
        assert "/df2/meta/values_block_2/meta" in info

        # unordered
        _maybe_remove(store, "s2")
        s = Series(
            Categorical(
                ["a", "b", "b", "a", "a", "c"],
                categories=["a", "b", "c", "d"],
                ordered=False,
            )
        )
        store.append("s2", s, format="table")
        result = store.select("s2")
        tm.assert_series_equal(result, s)

        # Query
        _maybe_remove(store, "df3")
        store.append("df3", df, data_columns=["s"])
        expected = df[df.s.isin(["b", "c"])]
        result = store.select("df3", where=['s in ["b","c"]'])
        tm.assert_frame_equal(result, expected)

        expected = df[df.s.isin(["b", "c"])]
        result = store.select("df3", where=['s = ["b","c"]'])
        tm.assert_frame_equal(result, expected)

        # "d" is a declared category that never occurs in the data
        expected = df[df.s.isin(["d"])]
        result = store.select("df3", where=['s in ["d"]'])
        tm.assert_frame_equal(result, expected)

        # "f" is not a category at all
        expected = df[df.s.isin(["f"])]
        result = store.select("df3", where=['s in ["f"]'])
        tm.assert_frame_equal(result, expected)

        # Appending with same categories is ok
        store.append("df3", df)

        df = concat([df, df])
        expected = df[df.s.isin(["b", "c"])]
        result = store.select("df3", where=['s in ["b","c"]'])
        tm.assert_frame_equal(result, expected)

        # Appending must have the same categories
        df3 = df.copy()
        df3["s"] = df3["s"].cat.remove_unused_categories()

        msg = "cannot append a categorical with different categories to the existing"
        with pytest.raises(ValueError, match=msg):
            store.append("df3", df3)

        # Remove, and make sure meta data is removed (it's a recursive
        # removal so should be).
        result = store.select("df3/meta/s/meta")
        assert result is not None
        store.remove("df3")

        with pytest.raises(
            KeyError, match="'No object named df3/meta/s/meta in the file'"
        ):
            store.select("df3/meta/s/meta")


def test_categorical_conversion(setup_path):
    """Check that a non-matching ``where`` yields an empty, correctly typed frame.

    The same query is run twice: first with plain string columns, then with
    the string columns converted to categoricals.
    """
    # GH13322
    # Check that read_hdf with categorical columns doesn't return rows if
    # where criteria isn't met.
    obsids = ["ESP_012345_6789", "ESP_987654_3210"]
    imgids = ["APF00006np", "APF0001imm"]
    data = [4.3, 9.8]

    # Test without categories
    df = DataFrame({"obsids": obsids, "imgids": imgids, "data": data})

    # We are expecting an empty DataFrame matching types of df
    expected = df.iloc[[], :]
    with ensure_clean_path(setup_path) as path:
        # ``key`` is passed by keyword: positional use is deprecated in
        # newer pandas releases.
        df.to_hdf(path, key="df", format="table", data_columns=True)
        result = read_hdf(path, "df", where="obsids=B")
        tm.assert_frame_equal(result, expected)

    # Test with categories
    df.obsids = df.obsids.astype("category")
    df.imgids = df.imgids.astype("category")

    # We are expecting an empty DataFrame matching types of df
    expected = df.iloc[[], :]
    with ensure_clean_path(setup_path) as path:
        df.to_hdf(path, key="df", format="table", data_columns=True)
        result = read_hdf(path, "df", where="obsids=B")
        tm.assert_frame_equal(result, expected)


def test_categorical_nan_only_columns(setup_path):
    """Check that categorical columns holding only NaN survive a round trip."""
    # GH18413
    # Check that read_hdf with categorical columns with NaN-only values can
    # be read back.
    df = DataFrame(
        {
            "a": ["a", "b", "c", np.nan],
            "b": [np.nan, np.nan, np.nan, np.nan],
            "c": [1, 2, 3, 4],
            "d": Series([None] * 4, dtype=object),
        }
    )
    df["a"] = df.a.astype("category")
    df["b"] = df.b.astype("category")
    # NOTE(review): this converts column "b" (not "d"), so the object-dtype
    # data built for "d" above is discarded -- confirm this is intentional
    # before changing it, since it affects the categories dtype being tested.
    df["d"] = df.b.astype("category")
    expected = df
    with ensure_clean_path(setup_path) as path:
        df.to_hdf(path, key="df", format="table", data_columns=True)
        result = read_hdf(path, "df")
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "where, df, expected",
    [
        ('col=="q"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": []})),
        ('col=="a"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": ["a"]})),
    ],
)
def test_convert_value(setup_path, where: str, df: DataFrame, expected: DataFrame):
    """Check filtering a categorical column with a ``where`` condition.

    Parameters
    ----------
    setup_path
        Fixture value forwarded to ``ensure_clean_path``.
    where : str
        Query expression handed to ``read_hdf``.
    df : DataFrame
        Frame written to the file; its ``col`` column is made categorical.
    expected : DataFrame
        Rows expected back; its ``col`` column is given the same categories
        as ``df`` before comparison.
    """
    # GH39420
    # Check that read_hdf with categorical columns can filter by where condition.
    df.col = df.col.astype("category")
    max_widths = {"col": 1}
    categorical_values = sorted(df.col.unique())
    expected.col = expected.col.astype("category")
    expected.col = expected.col.cat.set_categories(categorical_values)

    with ensure_clean_path(setup_path) as path:
        df.to_hdf(path, key="df", format="table", min_itemsize=max_widths)
        # No key is given on read: the file holds a single object.
        result = read_hdf(path, where=where)
        tm.assert_frame_equal(result, expected)