# -*- coding: utf-8 -*-
# Licensed under a 3-clause BSD style license - see LICENSE.rst
"""
This module tests some of the methods related to the ``HTML``
reader/writer and aims to document its functionality.
Requires `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/>`_
to be installed.
"""
from io import StringIO
from astropy.io.ascii import html
from astropy.io.ascii import core
from astropy.table import Table
import pytest
import numpy as np
from .common import setup_function, teardown_function # noqa
from astropy.io import ascii
from astropy.utils.compat.optional_deps import HAS_BLEACH, HAS_BS4 # noqa
if HAS_BS4:
from bs4 import BeautifulSoup, FeatureNotFound
@pytest.mark.skipif('not HAS_BS4')
def test_soupstring():
    """
    Test to make sure the class SoupString behaves properly.

    A ``SoupString`` is expected to be a genuine ``str`` that compares
    equal to the markup it was built from, and to keep the originating
    BeautifulSoup object available through its ``soup`` attribute.
    """
    # Single source of truth for the markup, so the parser input and the
    # expected string value cannot drift apart.
    markup = '<html><head></head><body><p>foo</p></body></html>'
    soup = BeautifulSoup(markup, 'html.parser')
    soup_str = html.SoupString(soup)

    assert isinstance(soup_str, str)
    assert isinstance(soup_str, html.SoupString)
    assert soup_str == markup
    # Identity, not equality: the very same soup object must be retained.
    assert soup_str.soup is soup
def test_listwriter():
    """
    Check that everything written through a ListWriter ends up, in
    order, in the list it was constructed with.
    """
    sink = []
    writer = html.ListWriter(sink)

    # Integers first, then single characters, written one item at a time.
    for item in [*range(5), *'abcde']:
        writer.write(item)

    assert sink == [0, 1, 2, 3, 4, 'a', 'b', 'c', 'd', 'e']
@pytest.mark.skipif('not HAS_BS4')
def test_identify_table():
    """
    Test to make sure that identify_table() returns whether the
    given BeautifulSoup tag is the correct table to process.

    Covers three selection modes: the implicit default (first table),
    an explicit integer ``table_id`` and a string ``table_id`` that is
    compared with the tag's ``id`` attribute.
    """
    # Should return False on non-<table> tags and None
    soup = BeautifulSoup('<html><body></body></html>', 'html.parser')
    assert html.identify_table(soup, {}, 0) is False
    assert html.identify_table(None, {}, 0) is False

    # A real <table> tag is required from here on: ``.table`` on a
    # document without one yields None, which could never be identified.
    # The id="foo" attribute is what the string-ID checks below rely on.
    soup = BeautifulSoup('<table id="foo"><tr><th>A</th></tr><tr>'
                         '<td>B</td></tr></table>', 'html.parser').table
    assert html.identify_table(soup, {}, 2) is False
    assert html.identify_table(soup, {}, 1) is True  # Default index of 1

    # Same tests, but with explicit parameter
    assert html.identify_table(soup, {'table_id': 2}, 1) is False
    assert html.identify_table(soup, {'table_id': 1}, 1) is True

    # Test identification by string ID
    assert html.identify_table(soup, {'table_id': 'bar'}, 1) is False
    assert html.identify_table(soup, {'table_id': 'foo'}, 1) is True
@pytest.mark.skipif('not HAS_BS4')
def test_missing_data():
    """
    Test reading a table with missing data.

    In both scenarios the first data cell is the "missing" one, so the
    column mask must be ``[True, False]``, the table itself must not be
    flagged as masked, and column ``A`` must still have an integer dtype.
    """
    # First with default where blank => '0'
    table_in = ['<table>',
                '<tr><th>A</th></tr>',
                '<tr><td></td></tr>',
                '<tr><td>1</td></tr>',
                '</table>']
    dat = Table.read(table_in, format='ascii.html')
    assert dat.masked is False
    assert np.all(dat['A'].mask == [True, False])
    assert dat['A'].dtype.kind == 'i'

    # Now with a specific value '...' => missing
    table_in = ['<table>',
                '<tr><th>A</th></tr>',
                '<tr><td>...</td></tr>',
                '<tr><td>1</td></tr>',
                '</table>']
    dat = Table.read(table_in, format='ascii.html', fill_values=[('...', '0')])
    assert dat.masked is False
    assert np.all(dat['A'].mask == [True, False])
    assert dat['A'].dtype.kind == 'i'
@pytest.mark.skipif('not HAS_BS4')
def test_rename_cols():
    """
    Test reading a table and renaming cols.

    The input has header cells ``A`` and ``B`` and a single data row
    ``1, 2``; ``names`` swaps the labels, so the column called ``A``
    after reading is the second one and holds the value 2.
    """
    table_in = ['<table>',
                '<tr><th>A</th> <th>B</th></tr>',
                '<tr><td>1</td><td>2</td></tr>',
                '</table>']

    # Swap column names
    dat = Table.read(table_in, format='ascii.html', names=['B', 'A'])
    assert dat.colnames == ['B', 'A']
    assert len(dat) == 1

    # Swap column names and only include A (the renamed version)
    dat = Table.read(table_in, format='ascii.html', names=['B', 'A'], include_names=['A'])
    assert dat.colnames == ['A']
    assert len(dat) == 1
    assert np.all(dat['A'] == 2)
@pytest.mark.skipif('not HAS_BS4')
def test_no_names():
    """
    Test reading a table with no column header.

    Without header cells the single column is expected to be auto-named
    ``col1``; an explicit ``names`` argument overrides that. Both data
    rows must be kept either way.
    """
    # Two data rows and deliberately no <th> row.
    table_in = ['<table>',
                '<tr><td>1</td></tr>',
                '<tr><td>2</td></tr>',
                '</table>']

    dat = Table.read(table_in, format='ascii.html')
    assert dat.colnames == ['col1']
    assert len(dat) == 2

    dat = Table.read(table_in, format='ascii.html', names=['a'])
    assert dat.colnames == ['a']
    assert len(dat) == 2
@pytest.mark.skipif('not HAS_BS4')
def test_identify_table_fail():
    """
    Raise an exception with an informative error message if table_id
    is not found.

    The document contains exactly one table (id ``foo``), so both a
    non-matching string id and an out-of-range table number must fail
    because of the mismatch, not because the input is empty.
    """
    table_in = ['<table id="foo"><tr><th>A</th></tr>',
                '<tr><td>B</td></tr></table>']

    # ``match`` performs the same regex search on the exception text as
    # ExceptionInfo.match(), but keeps the check next to the expectation.
    with pytest.raises(core.InconsistentTableError,
                       match="ERROR: HTML table id 'bad_id' not found$"):
        Table.read(table_in, format='ascii.html', htmldict={'table_id': 'bad_id'},
                   guess=False)

    with pytest.raises(core.InconsistentTableError,
                       match="ERROR: HTML table number 3 not found$"):
        Table.read(table_in, format='ascii.html', htmldict={'table_id': 3},
                   guess=False)
@pytest.mark.skipif('not HAS_BS4')
def test_backend_parsers():
    """
    Check that the back-end parser can be selected through ``htmldict``
    and that an unknown parser name raises ``FeatureNotFound``.
    """
    source = 'data/html2.html'
    builtin_parser = 'html.parser'

    for backend in ('lxml', 'xml', builtin_parser, 'html5lib'):
        try:
            Table.read(source, format='ascii.html',
                       htmldict={'parser': backend}, guess=False)
        except FeatureNotFound:
            # Tolerated for the other back-ends, whose dependency may not
            # be installed; for 'html.parser' the error is propagated.
            if backend != builtin_parser:
                continue
            raise

    # An unrecognised parser name must be rejected.
    with pytest.raises(FeatureNotFound):
        Table.read(source, format='ascii.html',
                   htmldict={'parser': 'foo'}, guess=False)
@pytest.mark.skipif('HAS_BS4')
def test_htmlinputter_no_bs4():
    """
    Without BeautifulSoup installed, processing lines with the HTML
    inputter must raise an OptionalTableImportError.
    """
    # Built outside the ``raises`` block so that only process_lines() is
    # allowed to produce the expected exception.
    html_inputter = html.HTMLInputter()
    no_lines = []

    with pytest.raises(core.OptionalTableImportError):
        html_inputter.process_lines(no_lines)
@pytest.mark.skipif('not HAS_BS4')
def test_htmlinputter():
"""
Test to ensure that HTMLInputter correctly converts input
into a list of SoupStrings representing table elements.

Reads ``data/html.html`` and compares the stringified result of
``get_lines`` with the expected rows for three selections: no
``table_id`` (first table), the string id ``'second'`` and the
integer index 3. An index of 4 must raise InconsistentTableError.
"""
# NOTE(review): the ``expected`` literals below are split across physical
# lines and use " | " separators; this looks like the original row markup
# was stripped from this copy of the file -- confirm against
# data/html.html before relying on these values.
f = 'data/html.html'
with open(f) as fd:
table = fd.read()
inputter = html.HTMLInputter()
# Empty option dict: no table_id is supplied.
inputter.html = {}
# In absence of table_id, defaults to the first table
expected = ['Column 1 | Column 2 | Column 3 |
',
'1 | a | 1.05 |
',
'2 | b | 2.75 |
',
'3 | c | -1.25 |
']
assert [str(x) for x in inputter.get_lines(table)] == expected
# Should raise an InconsistentTableError if the table is not found
inputter.html = {'table_id': 4}
with pytest.raises(core.InconsistentTableError):
inputter.get_lines(table)
# Identification by string ID
inputter.html['table_id'] = 'second'
expected = ['Column A | Column B | Column C |
',
'4 | d | 10.5 |
',
'5 | e | 27.5 |
',
'6 | f | -12.5 |
']
assert [str(x) for x in inputter.get_lines(table)] == expected
# Identification by integer index
inputter.html['table_id'] = 3
expected = ['C1 | C2 | C3 |
',
'7 | g | 105.0 |
',
'8 | h | 275.0 |
',
'9 | i | -125.0 |
']
assert [str(x) for x in inputter.get_lines(table)] == expected
@pytest.mark.skipif('not HAS_BS4')
def test_htmlsplitter():
    """
    Test to make sure that HTMLSplitter correctly inputs lines
    of type SoupString to return a generator that gives all
    header and data elements.

    Also checks the two failure modes: a plain ``str`` among the lines
    raises TypeError, and an empty list of lines raises
    InconsistentTableError.
    """
    splitter = html.HTMLSplitter()

    # Each line wraps one <tr> tag: a header row and a data row. The
    # enclosing <table> only exists so that ``.tr`` finds a real tag.
    lines = [html.SoupString(BeautifulSoup('<table><tr><th>Col 1</th><th>Col 2</th></tr></table>',
                                           'html.parser').tr),
             html.SoupString(BeautifulSoup('<table><tr><td>Data 1</td><td>Data 2</td></tr></table>',
                                           'html.parser').tr)]
    expected_data = [['Col 1', 'Col 2'], ['Data 1', 'Data 2']]
    assert list(splitter(lines)) == expected_data

    # Make sure the presence of a non-SoupString triggers a TypeError
    lines.append('<tr><td>Data 3</td><td>Data 4</td></tr>')
    with pytest.raises(TypeError):
        list(splitter(lines))

    # Make sure that passing an empty list triggers an error
    with pytest.raises(core.InconsistentTableError):
        list(splitter([]))
@pytest.mark.skipif('not HAS_BS4')
def test_htmlheader_start():
"""
Test to ensure that the start_line method of HTMLHeader
returns the first line of header data. Uses data/html.html
for sample input.

The header line is checked for the default table, for the table
with string id ``'second'`` and for the table at integer index 3.
``start_line`` must return None when no header is present and
raise TypeError when a plain string is among the lines.
"""
# NOTE(review): the string literals compared against below are split
# across physical lines and use " | " separators; the original markup
# appears to have been stripped from this copy -- confirm against
# data/html.html.
f = 'data/html.html'
with open(f) as fd:
table = fd.read()
inputter = html.HTMLInputter()
# Empty option dict: no table_id is supplied.
inputter.html = {}
header = html.HTMLHeader()
lines = inputter.get_lines(table)
assert str(lines[header.start_line(lines)]) == \
'Column 1 | Column 2 | Column 3 |
'
# Select the table by string id.
inputter.html['table_id'] = 'second'
lines = inputter.get_lines(table)
assert str(lines[header.start_line(lines)]) == \
'Column A | Column B | Column C |
'
# Select the table by integer index.
inputter.html['table_id'] = 3
lines = inputter.get_lines(table)
assert str(lines[header.start_line(lines)]) == \
'C1 | C2 | C3 |
'
# start_line should return None if no valid header is found
lines = [html.SoupString(BeautifulSoup('',
'html.parser').tr),
html.SoupString(BeautifulSoup('Text
', 'html.parser').p)]
assert header.start_line(lines) is None
# Should raise an error if a non-SoupString is present
lines.append('Header |
')
with pytest.raises(TypeError):
header.start_line(lines)
@pytest.mark.skipif('not HAS_BS4')
def test_htmldata():
"""
Test to ensure that the start_line and end_line methods
of HTMLData return the first line of table data and the
index just past the last one. Uses data/html.html for
sample input.

Both methods are checked for the default table, for the table with
string id ``'second'`` and for the table at integer index 3. Without
any table data ``start_line`` must raise InconsistentTableError and
``end_line`` must return None; a plain string among the lines must
make both raise TypeError.
"""
# NOTE(review): the string literals compared against below are split
# across physical lines and use " | " separators; the original markup
# appears to have been stripped from this copy -- confirm against
# data/html.html.
f = 'data/html.html'
with open(f) as fd:
table = fd.read()
inputter = html.HTMLInputter()
# Empty option dict: no table_id is supplied.
inputter.html = {}
data = html.HTMLData()
lines = inputter.get_lines(table)
assert str(lines[data.start_line(lines)]) == \
'1 | a | 1.05 |
'
# end_line returns the index of the last data element + 1
assert str(lines[data.end_line(lines) - 1]) == \
'3 | c | -1.25 |
'
# Select the table by string id.
inputter.html['table_id'] = 'second'
lines = inputter.get_lines(table)
assert str(lines[data.start_line(lines)]) == \
'4 | d | 10.5 |
'
assert str(lines[data.end_line(lines) - 1]) == \
'6 | f | -12.5 |
'
# Select the table by integer index.
inputter.html['table_id'] = 3
lines = inputter.get_lines(table)
assert str(lines[data.start_line(lines)]) == \
'7 | g | 105.0 |
'
assert str(lines[data.end_line(lines) - 1]) == \
'9 | i | -125.0 |
'
# start_line should raise an error if no table data exists
lines = [html.SoupString(BeautifulSoup('', 'html.parser').div),
html.SoupString(BeautifulSoup('Text
', 'html.parser').p)]
with pytest.raises(core.InconsistentTableError):
data.start_line(lines)
# end_line should return None if no table data exists
assert data.end_line(lines) is None
# Should raise an error if a non-SoupString is present
lines.append('Data |
')
with pytest.raises(TypeError):
data.start_line(lines)
with pytest.raises(TypeError):
data.end_line(lines)
def test_multicolumn_write():
    """
    Check the HTML writer's output for a table that mixes a scalar
    column with multidimensional columns (columns whose elements are
    themselves iterable), which are written using the colspan attribute.
    """
    column_names = ('C1', 'C2', 'C3')
    column_values = [
        [1, 2, 3],
        [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)],
        [('a', 'a', 'a'), ('b', 'b', 'b'), ('c', 'c', 'c')],
    ]
    table = Table(column_values, names=column_names)

    # NOTE(review): this literal holds only cell text and "|" separators;
    # the HTML tags appear to have been stripped from this copy of the
    # file -- confirm against the writer's real output.
    expected = """\
C1 |
C2 |
C3 |
1 |
1.0 |
1.0 |
a |
a |
a |
2 |
2.0 |
2.0 |
b |
b |
b |
3 |
3.0 |
3.0 |
c |
c |
c |
"""
    # write() returns a sequence; the complete document is its first item.
    written = html.HTML().write(table)
    assert written[0].strip() == expected.strip()
@pytest.mark.skipif('not HAS_BLEACH')
def test_multicolumn_write_escape():
    """
    Check the HTML writer's output for multidimensional columns when
    column ``C3`` is declared as a raw-HTML column through the
    ``raw_html_cols`` entry of ``htmldict``.
    """
    column_names = ('C1', 'C2', 'C3')
    # NOTE(review): the empty strings in the third column, like the
    # ``expected`` literal below, look like values whose markup was
    # stripped from this copy of the file -- confirm against upstream.
    column_values = [
        [1, 2, 3],
        [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)],
        [('', '', 'a'), ('', 'b', 'b'), ('c', 'c', 'c')],
    ]
    table = Table(column_values, names=column_names)

    expected = """\
C1 |
C2 |
C3 |
1 |
1.0 |
1.0 |
|
|
a |
2 |
2.0 |
2.0 |
|
b |
b |
3 |
3.0 |
3.0 |
c |
c |
c |
"""
    # write() returns a sequence; the complete document is its first item.
    writer = html.HTML(htmldict={'raw_html_cols': 'C3'})
    written = writer.write(table)
    assert written[0].strip() == expected.strip()
def test_write_no_multicols():
    """
    Check the HTML writer's output when the ``multicol`` option is
    False: multidimensional columns are then written as a single cell
    per row instead of being spread over several cells.
    """
    column_names = ('C1', 'C2', 'C3')
    column_values = [
        [1, 2, 3],
        [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)],
        [('a', 'a', 'a'), ('b', 'b', 'b'), ('c', 'c', 'c')],
    ]
    table = Table(column_values, names=column_names)

    # NOTE(review): this literal holds only cell text and "|" separators;
    # the HTML tags appear to have been stripped from this copy of the
    # file -- confirm against the writer's real output.
    expected = """\
C1 |
C2 |
C3 |
1 |
1.0 .. 1.0 |
a .. a |
2 |
2.0 .. 2.0 |
b .. b |
3 |
3.0 .. 3.0 |
c .. c |
"""
    # The option dict is passed positionally, exactly as callers may do.
    writer = html.HTML({'multicol': False})
    written = writer.write(table)
    assert written[0].strip() == expected.strip()
@pytest.mark.skipif('not HAS_BS4')
def test_multicolumn_read():
"""
Test to make sure that the HTML reader inputs multidimensional
columns (those with iterable elements) using the colspan
attribute of the table cells.
Ensure that any string element within a multidimensional column
casts all elements to string prior to type conversion operations.
"""
table = Table.read('data/html2.html', format='ascii.html')
# Fixed-width unicode dtype, 21 characters wide, used for column 'A'.
str_type = np.dtype((str, 21))
# NOTE(review): the statement below is not balanced -- the dtype list is
# cut off after ('B', and continues with what looks like the arguments of
# a different Table(...) call. Everything after it uses ``t``, which is
# never defined in this function. This looks like the tail of a separate
# raw-HTML write test fused onto this one by markup stripping -- confirm
# against the upstream file before editing.
expected = Table(np.array([(['1', '2.5000000000000000001'], 3),
(['1a', '1'], 3.5)],
dtype=[('A', str_type, (2,)), ('B', 'x'], ['y']], names=['a', 'b'])
# One column contains raw HTML (string input)
out = StringIO()
t.write(out, format='ascii.html', htmldict={'raw_html_cols': 'a'})
# NOTE(review): the ``expected`` literals below appear to have lost their
# tags as well (only cell text and "|" separators remain).
expected = """\
x |
<em>y</em> |
"""
assert expected in out.getvalue()
# One column contains raw HTML (list input)
out = StringIO()
t.write(out, format='ascii.html', htmldict={'raw_html_cols': ['a']})
assert expected in out.getvalue()
# Two columns contains raw HTML (list input)
out = StringIO()
t.write(out, format='ascii.html', htmldict={'raw_html_cols': ['a', 'b']})
expected = """\
x |
y |
"""
assert expected in out.getvalue()
@pytest.mark.skipif('not HAS_BLEACH')
def test_raw_html_write_clean():
"""
Test that columns can contain raw HTML which is not escaped.
"""
import bleach # noqa
t = Table([[''], ['y '], ['y']], names=['a', 'b', 'c'])
# Confirm that |