from __future__ import annotations from io import ( BytesIO, StringIO, ) import os import numpy as np import pytest import pandas.util._test_decorators as td from pandas import ( NA, DataFrame, Index, ) import pandas._testing as tm import pandas.io.common as icom from pandas.io.common import get_handle from pandas.io.xml import read_xml """ CHECKLIST [x] - ValueError: "Values for parser can only be lxml or etree." etree [x] - ImportError: "lxml not found, please install or use the etree parser." [X] - TypeError: "...is not a valid type for attr_cols" [X] - TypeError: "...is not a valid type for elem_cols" [X] - LookupError: "unknown encoding" [X] - KeyError: "...is not included in namespaces" [X] - KeyError: "no valid column" [X] - ValueError: "To use stylesheet, you need lxml installed..." [] - OSError: (NEED PERMISSION ISSUE, DISK FULL, ETC.) [X] - FileNotFoundError: "No such file or directory" [X] - PermissionError: "Forbidden" lxml [X] - TypeError: "...is not a valid type for attr_cols" [X] - TypeError: "...is not a valid type for elem_cols" [X] - LookupError: "unknown encoding" [] - OSError: (NEED PERMISSION ISSUE, DISK FULL, ETC.) [X] - FileNotFoundError: "No such file or directory" [X] - KeyError: "...is not included in namespaces" [X] - KeyError: "no valid column" [X] - ValueError: "stylesheet is not a url, file, or xml string." 
[] - LookupError: (NEED WRONG ENCODING FOR FILE OUTPUT) [] - URLError: (USUALLY DUE TO NETWORKING) [] - HTTPError: (NEED AN ONLINE STYLESHEET) [X] - OSError: "failed to load external entity" [X] - XMLSyntaxError: "Opening and ending tag mismatch" [X] - XSLTApplyError: "Cannot resolve URI" [X] - XSLTParseError: "failed to compile" [X] - PermissionError: "Forbidden" """ geom_df = DataFrame( { "shape": ["square", "circle", "triangle"], "degrees": [360, 360, 180], "sides": [4, np.nan, 3], } ) planet_df = DataFrame( { "planet": [ "Mercury", "Venus", "Earth", "Mars", "Jupiter", "Saturn", "Uranus", "Neptune", ], "type": [ "terrestrial", "terrestrial", "terrestrial", "terrestrial", "gas giant", "gas giant", "ice giant", "ice giant", ], "location": [ "inner", "inner", "inner", "inner", "outer", "outer", "outer", "outer", ], "mass": [ 0.330114, 4.86747, 5.97237, 0.641712, 1898.187, 568.3174, 86.8127, 102.4126, ], } ) from_file_expected = """\ 0 cooking Everyday Italian Giada De Laurentiis 2005 30.0 1 children Harry Potter J K. Rowling 2005 29.99 2 web Learning XML Erik T. Ray 2003 39.95 """ def equalize_decl(doc): # etree and lxml differ on quotes and case in xml declaration if doc is not None: doc = doc.replace( ' cooking Everyday Italian Giada De Laurentiis 2005 30.0 children Harry Potter J K. Rowling 2005 29.99 web Learning XML Erik T. Ray 2003 39.95 """ filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml(filename, parser=parser) with tm.ensure_clean("test.xml") as path: df_file.to_xml(path, index=False, parser=parser) with open(path, "rb") as f: output = f.read().decode("utf-8").strip() output = equalize_decl(output) assert output == expected def test_index_false_rename_row_root(datapath, parser): expected = """\ cooking Everyday Italian Giada De Laurentiis 2005 30.0 children Harry Potter J K. Rowling 2005 29.99 web Learning XML Erik T. 
Ray 2003 39.95 """ filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml(filename, parser=parser) with tm.ensure_clean("test.xml") as path: df_file.to_xml( path, index=False, root_name="books", row_name="book", parser=parser ) with open(path, "rb") as f: output = f.read().decode("utf-8").strip() output = equalize_decl(output) assert output == expected @pytest.mark.parametrize( "offset_index", [list(range(10, 13)), [str(i) for i in range(10, 13)]] ) def test_index_false_with_offset_input_index(parser, offset_index): """ Tests that the output does not contain the `index` field when the index of the input DataFrame has an offset. This is a regression test for issue #42458. """ expected = """\ square 360 4.0 circle 360 triangle 180 3.0 """ offset_geom_df = geom_df.copy() offset_geom_df.index = Index(offset_index) output = offset_geom_df.to_xml(index=False, parser=parser) output = equalize_decl(output) assert output == expected # NA_REP na_expected = """\ 0 square 360 4.0 1 circle 360 2 triangle 180 3.0 """ def test_na_elem_output(datapath, parser): output = geom_df.to_xml(parser=parser) output = equalize_decl(output) assert output == na_expected def test_na_empty_str_elem_option(datapath, parser): output = geom_df.to_xml(na_rep="", parser=parser) output = equalize_decl(output) assert output == na_expected def test_na_empty_elem_option(datapath, parser): expected = """\ 0 square 360 4.0 1 circle 360 0.0 2 triangle 180 3.0 """ output = geom_df.to_xml(na_rep="0.0", parser=parser) output = equalize_decl(output) assert output == expected # ATTR_COLS def test_attrs_cols_nan_output(datapath, parser): expected = """\ """ output = geom_df.to_xml(attr_cols=["shape", "degrees", "sides"], parser=parser) output = equalize_decl(output) assert output == expected def test_attrs_cols_prefix(datapath, parser): expected = """\ """ output = geom_df.to_xml( attr_cols=["index", "shape", "degrees", "sides"], namespaces={"doc": "http://example.xom"}, prefix="doc", parser=parser, 
) output = equalize_decl(output) assert output == expected def test_attrs_unknown_column(parser): with pytest.raises(KeyError, match=("no valid column")): geom_df.to_xml(attr_cols=["shape", "degree", "sides"], parser=parser) def test_attrs_wrong_type(parser): with pytest.raises(TypeError, match=("is not a valid type for attr_cols")): geom_df.to_xml(attr_cols='"shape", "degree", "sides"', parser=parser) # ELEM_COLS def test_elems_cols_nan_output(datapath, parser): elems_cols_expected = """\ 360 4.0 square 360 circle 180 3.0 triangle """ output = geom_df.to_xml( index=False, elem_cols=["degrees", "sides", "shape"], parser=parser ) output = equalize_decl(output) assert output == elems_cols_expected def test_elems_unknown_column(parser): with pytest.raises(KeyError, match=("no valid column")): geom_df.to_xml(elem_cols=["shape", "degree", "sides"], parser=parser) def test_elems_wrong_type(parser): with pytest.raises(TypeError, match=("is not a valid type for elem_cols")): geom_df.to_xml(elem_cols='"shape", "degree", "sides"', parser=parser) def test_elems_and_attrs_cols(datapath, parser): elems_cols_expected = """\ 360 4.0 360 180 3.0 """ output = geom_df.to_xml( index=False, elem_cols=["degrees", "sides"], attr_cols=["shape"], parser=parser, ) output = equalize_decl(output) assert output == elems_cols_expected # HIERARCHICAL COLUMNS def test_hierarchical_columns(datapath, parser): expected = """\ inner terrestrial 4 11.81 2.95 outer gas giant 2 2466.5 1233.25 outer ice giant 2 189.23 94.61 All 8 2667.54 333.44 """ pvt = planet_df.pivot_table( index=["location", "type"], values="mass", aggfunc=["count", "sum", "mean"], margins=True, ).round(2) output = pvt.to_xml(parser=parser) output = equalize_decl(output) assert output == expected def test_hierarchical_attrs_columns(datapath, parser): expected = """\ """ pvt = planet_df.pivot_table( index=["location", "type"], values="mass", aggfunc=["count", "sum", "mean"], margins=True, ).round(2) output = 
pvt.to_xml(attr_cols=list(pvt.reset_index().columns.values), parser=parser) output = equalize_decl(output) assert output == expected # MULTIINDEX def test_multi_index(datapath, parser): expected = """\ inner terrestrial 4 11.81 2.95 outer gas giant 2 2466.5 1233.25 outer ice giant 2 189.23 94.61 """ agg = ( planet_df.groupby(["location", "type"])["mass"] .agg(["count", "sum", "mean"]) .round(2) ) output = agg.to_xml(parser=parser) output = equalize_decl(output) assert output == expected def test_multi_index_attrs_cols(datapath, parser): expected = """\ """ agg = ( planet_df.groupby(["location", "type"])["mass"] .agg(["count", "sum", "mean"]) .round(2) ) output = agg.to_xml(attr_cols=list(agg.reset_index().columns.values), parser=parser) output = equalize_decl(output) assert output == expected # NAMESPACE def test_default_namespace(parser): expected = """\ 0 square 360 4.0 1 circle 360 2 triangle 180 3.0 """ output = geom_df.to_xml(namespaces={"": "http://example.com"}, parser=parser) output = equalize_decl(output) assert output == expected # PREFIX def test_namespace_prefix(parser): expected = """\ 0 square 360 4.0 1 circle 360 2 triangle 180 3.0 """ output = geom_df.to_xml( namespaces={"doc": "http://example.com"}, prefix="doc", parser=parser ) output = equalize_decl(output) assert output == expected def test_missing_prefix_in_nmsp(parser): with pytest.raises(KeyError, match=("doc is not included in namespaces")): geom_df.to_xml( namespaces={"": "http://example.com"}, prefix="doc", parser=parser ) def test_namespace_prefix_and_default(parser): expected = """\ 0 square 360 4.0 1 circle 360 2 triangle 180 3.0 """ output = geom_df.to_xml( namespaces={"": "http://example.com", "doc": "http://other.org"}, prefix="doc", parser=parser, ) output = equalize_decl(output) if output is not None: # etree and lxml differ on order of namespace prefixes output = output.replace( 'xmlns:doc="http://other.org" xmlns="http://example.com"', 'xmlns="http://example.com" 
xmlns:doc="http://other.org"', ) assert output == expected # ENCODING encoding_expected = """\ 0 1 José Sofía 1 2 Luis Valentina 2 3 Carlos Isabella 3 4 Juan Camila 4 5 Jorge Valeria """ def test_encoding_option_str(datapath, parser): filename = datapath("io", "data", "xml", "baby_names.xml") df_file = read_xml(filename, parser=parser, encoding="ISO-8859-1").head(5) output = df_file.to_xml(encoding="ISO-8859-1", parser=parser) if output is not None: # etree and lxml differ on quotes and case in xml declaration output = output.replace( ' 0 square 360 4.0 1 circle 360 2 triangle 180 3.0 """ output = geom_df.to_xml(xml_declaration=False) assert output == expected def test_no_pretty_print_with_decl(parser): expected = ( "\n" "0square" "3604.0" "1circle360" "2" "triangle1803.0" "" ) output = geom_df.to_xml(pretty_print=False, parser=parser) output = equalize_decl(output) # etree adds space for closed tags if output is not None: output = output.replace(" />", "/>") assert output == expected def test_no_pretty_print_no_decl(parser): expected = ( "0square" "3604.0" "1circle360" "2" "triangle1803.0" "" ) output = geom_df.to_xml(xml_declaration=False, pretty_print=False, parser=parser) # etree adds space for closed tags if output is not None: output = output.replace(" />", "/>") assert output == expected # PARSER @td.skip_if_installed("lxml") def test_default_parser_no_lxml(): with pytest.raises( ImportError, match=("lxml not found, please install or use the etree parser.") ): geom_df.to_xml() def test_unknown_parser(): with pytest.raises( ValueError, match=("Values for parser can only be lxml or etree.") ): geom_df.to_xml(parser="bs4") # STYLESHEET xsl_expected = """\ 0 square 360 4.0 1 circle 360 2 triangle 180 3.0 """ @td.skip_if_no("lxml") def test_stylesheet_file_like(datapath, mode): xsl = datapath("io", "data", "xml", "row_field_output.xsl") with open(xsl, mode) as f: assert geom_df.to_xml(stylesheet=f) == xsl_expected @td.skip_if_no("lxml") def 
test_stylesheet_io(datapath, mode): xsl_path = datapath("io", "data", "xml", "row_field_output.xsl") xsl_obj: BytesIO | StringIO with open(xsl_path, mode) as f: if mode == "rb": xsl_obj = BytesIO(f.read()) else: xsl_obj = StringIO(f.read()) output = geom_df.to_xml(stylesheet=xsl_obj) assert output == xsl_expected @td.skip_if_no("lxml") def test_stylesheet_buffered_reader(datapath, mode): xsl = datapath("io", "data", "xml", "row_field_output.xsl") with open(xsl, mode) as f: xsl_obj = f.read() output = geom_df.to_xml(stylesheet=xsl_obj) assert output == xsl_expected @td.skip_if_no("lxml") def test_stylesheet_wrong_path(datapath): from lxml.etree import XMLSyntaxError xsl = os.path.join("data", "xml", "row_field_output.xslt") with pytest.raises( XMLSyntaxError, match=("Start tag expected, '<' not found"), ): geom_df.to_xml(stylesheet=xsl) @td.skip_if_no("lxml") @pytest.mark.parametrize("val", ["", b""]) def test_empty_string_stylesheet(val): from lxml.etree import XMLSyntaxError with pytest.raises( XMLSyntaxError, match=("Document is empty|Start tag expected, '<' not found") ): geom_df.to_xml(stylesheet=val) @td.skip_if_no("lxml") def test_incorrect_xsl_syntax(): from lxml.etree import XMLSyntaxError xsl = """\ """ with pytest.raises(XMLSyntaxError, match=("Opening and ending tag mismatch")): geom_df.to_xml(stylesheet=xsl) @td.skip_if_no("lxml") def test_incorrect_xsl_eval(): from lxml.etree import XSLTParseError xsl = """\ """ with pytest.raises(XSLTParseError, match=("failed to compile")): geom_df.to_xml(stylesheet=xsl) @td.skip_if_no("lxml") def test_incorrect_xsl_apply(parser): from lxml.etree import XSLTApplyError xsl = """\ """ with pytest.raises(XSLTApplyError, match=("Cannot resolve URI")): with tm.ensure_clean("test.xml") as path: geom_df.to_xml(path, stylesheet=xsl) def test_stylesheet_with_etree(datapath): xsl = """\ """ with pytest.raises( ValueError, match=("To use stylesheet, you need lxml installed") ): geom_df.to_xml(parser="etree", stylesheet=xsl) 
@td.skip_if_no("lxml") def test_style_to_csv(): xsl = """\ , ,shape,degrees,sides """ out_csv = geom_df.to_csv(line_terminator="\n") if out_csv is not None: out_csv = out_csv.strip() out_xml = geom_df.to_xml(stylesheet=xsl) assert out_csv == out_xml @td.skip_if_no("lxml") def test_style_to_string(): xsl = """\ shape degrees sides """ out_str = geom_df.to_string() out_xml = geom_df.to_xml(na_rep="NaN", stylesheet=xsl) assert out_xml == out_str @td.skip_if_no("lxml") def test_style_to_json(): xsl = """\ " {"shape":{ },"degrees":{ },"sides":{ }} , """ out_json = geom_df.to_json() out_xml = geom_df.to_xml(stylesheet=xsl) assert out_json == out_xml # COMPRESSION geom_xml = """\ 0 square 360 4.0 1 circle 360 2 triangle 180 3.0 """ def test_compression_output(parser, compression_only): with tm.ensure_clean() as path: geom_df.to_xml(path, parser=parser, compression=compression_only) with get_handle( path, "r", compression=compression_only, ) as handle_obj: output = handle_obj.handle.read() output = equalize_decl(output) assert geom_xml == output.strip() def test_filename_and_suffix_comp(parser, compression_only): compfile = "xml." 
+ icom._compression_to_extension[compression_only] with tm.ensure_clean(filename=compfile) as path: geom_df.to_xml(path, parser=parser, compression=compression_only) with get_handle( path, "r", compression=compression_only, ) as handle_obj: output = handle_obj.handle.read() output = equalize_decl(output) assert geom_xml == output.strip() def test_ea_dtypes(any_numeric_ea_dtype, parser): # GH#43903 expected = """ 0 """ df = DataFrame({"a": [NA]}).astype(any_numeric_ea_dtype) result = df.to_xml(parser=parser) assert equalize_decl(result).strip() == expected def test_unsuported_compression(datapath, parser): with pytest.raises(ValueError, match="Unrecognized compression type"): with tm.ensure_clean() as path: geom_df.to_xml(path, parser=parser, compression="7z") # STORAGE OPTIONS @pytest.mark.single_cpu @td.skip_if_no("s3fs") @td.skip_if_no("lxml") def test_s3_permission_output(parser, s3_resource): # s3_resource hosts pandas-test import s3fs with pytest.raises(PermissionError, match="Access Denied"): fs = s3fs.S3FileSystem(anon=True) fs.ls("pandas-test") geom_df.to_xml("s3://pandas-test/geom.xml", compression="zip", parser=parser)