# Licensed under a 3-clause BSD style license - see LICENSE.rst
"""An extensible HTML table reader and writer.
Classes to read and write HTML tables.

`BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/>`_
must be installed to read HTML tables.
"""
import warnings
from . import core
from astropy.table import Column
from astropy.utils.xml import writer
from copy import deepcopy
class SoupString(str):
    """
    Allows for strings to hold BeautifulSoup data.

    The instance behaves as an ordinary string (its text is whatever ``str``
    builds from the constructor arguments), while the object it was created
    from stays reachable through the ``soup`` attribute.
    """

    def __new__(cls, *args, **kwargs):
        # ``str`` is immutable, so the text has to be fixed in ``__new__``.
        return str.__new__(cls, *args, **kwargs)

    def __init__(self, val):
        # Keep a reference to the source object next to its string form.
        self.soup = val
class ListWriter:
    """
    Allows for XMLWriter to write to a list instead of a file.

    Parameters
    ----------
    out : list
        List that collects every chunk passed to `write`.
    """

    def __init__(self, out):
        self.out = out

    def write(self, data):
        """Append ``data`` to the wrapped list."""
        self.out.append(data)
def identify_table(soup, htmldict, numtable):
    """
    Checks whether the given BeautifulSoup tag is the table
    the user intends to process.

    Parameters
    ----------
    soup : object or None
        Candidate tag; its ``name``, ``attrs`` and ``['id']`` are inspected.
    htmldict : dict
        Reader options. The optional ``'table_id'`` entry selects a table
        either by ``id`` attribute (str) or by position (int).
    numtable : int
        Position of ``soup`` among the candidate tables. The first table is
        number 1, since that is what an absent ``'table_id'`` selects.

    Returns
    -------
    bool
        True if ``soup`` is the requested table.
    """
    if soup is None or soup.name != 'table':
        return False  # Tag is not a <table>

    if 'table_id' not in htmldict:
        # No explicit choice: default to the first table.
        return numtable == 1

    table_id = htmldict['table_id']
    if isinstance(table_id, str):
        return 'id' in soup.attrs and soup['id'] == table_id
    if isinstance(table_id, int):
        return table_id == numtable

    # Return False if an invalid parameter is given
    return False
class HTMLInputter(core.BaseInputter):
    """
    Input lines of HTML in a valid form.

    This requires `BeautifulSoup
    <https://www.crummy.com/software/BeautifulSoup/>`_ to be installed.
    """

    def process_lines(self, lines):
        """
        Convert the given input into a list of SoupString rows
        for further processing.

        Parameters
        ----------
        lines : iterable of str
            Lines of the HTML document; they are joined with newlines
            before parsing.

        Returns
        -------
        list of SoupString
            One entry per ``tr`` element found in the selected table.

        Raises
        ------
        core.OptionalTableImportError
            If BeautifulSoup (``bs4``) cannot be imported.
        core.InconsistentTableError
            If no table in the document matches the requested one.

        Notes
        -----
        Reads ``self.html`` (a dict of options: optional ``'parser'`` and
        ``'table_id'`` keys), which is not set in this class — presumably
        provided by the owning reader; verify against the caller.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError as err:
            raise core.OptionalTableImportError('BeautifulSoup must be '
                                                'installed to read HTML tables') from err

        if 'parser' not in self.html:
            with warnings.catch_warnings():
                # Ignore bs4 parser warning #4550.
                warnings.filterwarnings('ignore', '.*no parser was explicitly specified.*')
                soup = BeautifulSoup('\n'.join(lines))
        else:  # use a custom backend parser
            soup = BeautifulSoup('\n'.join(lines), self.html['parser'])

        tables = soup.find_all('table')
        for numtable, possible_table in enumerate(tables, start=1):
            if identify_table(possible_table, self.html, numtable):
                table = possible_table  # Find the correct table
                break
        else:
            # 'table_id' is optional; when it is absent the first table is
            # the implicit target, so report that instead of failing with a
            # KeyError while building the error message.
            table_id = self.html.get('table_id', 1)
            if isinstance(table_id, int):
                err_descr = f"number {table_id}"
            else:
                err_descr = f"id '{table_id}'"
            raise core.InconsistentTableError(
                f'ERROR: HTML table {err_descr} not found')

        # Get all table rows
        return [SoupString(row) for row in table.find_all('tr')]
class HTMLSplitter(core.BaseSplitter):
    """
    Split HTML table data.
    """

    def __call__(self, lines):
        """
        Return HTML data from lines as a generator.

        Parameters
        ----------
        lines : list of SoupString
            Table rows; each one exposes its parsed tag as ``.soup``.

        Yields
        ------
        list
            For a row containing ``th`` cells: one entry per cell, either
            the stripped cell text or a ``(text, colspan)`` tuple when the
            cell carries a ``colspan`` attribute.
            For a row containing ``td`` cells: the stripped cell texts.

        Raises
        ------
        TypeError
            If an element of ``lines`` is not a SoupString.
        core.InconsistentTableError
            If ``lines`` is empty (checked once iteration has finished).
        """
        for line in lines:
            if not isinstance(line, SoupString):
                raise TypeError('HTML lines should be of type SoupString')
            soup = line.soup

            header_elements = soup.find_all('th')
            if header_elements:
                # Return multicolumns as tuples for HTMLHeader handling
                yield [(el.text.strip(), el['colspan']) if el.has_attr('colspan')
                       else el.text.strip() for el in header_elements]

            data_elements = soup.find_all('td')
            if data_elements:
                yield [el.text.strip() for el in data_elements]

        if len(lines) == 0:
            raise core.InconsistentTableError('HTML tables must contain data '
                                              'in a <table> tag')
class HTMLOutputter(core.TableOutputter):
    """
    Output the HTML data as an ``astropy.table.Table`` object.

    This subclass allows for the final table to contain
    multidimensional columns (defined using the colspan attribute
    of <th>).
    """

    # NOTE(review): only the first entry of this list survived in the text
    # under review; the float and str converters are a reconstruction
    # (numeric types first, str as the catch-all) -- confirm against history.
    default_converters = [core.convert_numpy(int),
                          core.convert_numpy(float),
                          core.convert_numpy(str)]

    def __call__(self, cols, meta):
        """
        Process the data in multidimensional columns.

        Every column carrying a ``colspan`` attribute is merged with the
        columns it spans into a single column whose ``str_vals`` are tuples
        (one tuple per row); all other columns pass through unchanged. The
        resulting list is handed to the parent outputter.

        Parameters
        ----------
        cols : list
            Column objects exposing ``name`` and ``str_vals``.
        meta : object
            Passed through untouched to the parent ``__call__``.
        """
        new_cols = []
        col_num = 0

        while col_num < len(cols):
            col = cols[col_num]
            if hasattr(col, 'colspan'):
                # Join elements of spanned columns together into list of tuples
                span_cols = cols[col_num:col_num + col.colspan]
                new_col = core.Column(col.name)
                new_col.str_vals = list(zip(*[x.str_vals for x in span_cols]))
                new_cols.append(new_col)
                # Skip the whole spanned group, placeholders included.
                col_num += col.colspan
            else:
                new_cols.append(col)
                col_num += 1

        return super().__call__(new_cols, meta)
class HTMLHeader(core.BaseHeader):
    """Header reader for HTML tables, including ``colspan`` headings."""

    splitter_class = HTMLSplitter

    def start_line(self, lines):
        """
        Return the line number at which header data begins.

        Returns the index of the first row containing a ``th`` element, or
        None when no row has one.

        Raises
        ------
        TypeError
            If an element of ``lines`` is not a SoupString.
        """
        for i, line in enumerate(lines):
            if not isinstance(line, SoupString):
                raise TypeError('HTML lines should be of type SoupString')
            soup = line.soup
            if soup.th is not None:
                return i

        return None

    def _set_cols_from_names(self):
        """
        Set columns from header names, handling multicolumns appropriately.

        A name given as a ``(text, colspan)`` tuple produces one column
        tagged with ``colspan`` followed by ``colspan - 1`` placeholder
        columns, so that the column list lines up with the data cells.
        ``self.names`` is rewritten as a flat list of strings.
        """
        self.cols = []
        new_names = []

        for name in self.names:
            if isinstance(name, tuple):
                text, colspan = name[0], int(name[1])
                col = core.Column(name=text)
                col.colspan = colspan
                self.cols.append(col)
                new_names.append(text)
                for _ in range(1, colspan):
                    # Add dummy columns
                    # NOTE(review): the empty placeholder name is a
                    # reconstruction; the loop body was missing -- confirm.
                    self.cols.append(core.Column(''))
                    new_names.append('')
            else:
                self.cols.append(core.Column(name=name))
                new_names.append(name)

        self.names = new_names
class HTMLData(core.BaseData):
    """Data reader for HTML tables: locates the rows holding ``td`` cells."""

    splitter_class = HTMLSplitter

    def start_line(self, lines):
        """
        Return the line number at which table data begins.

        Returns the index of the first row containing a ``td`` element.

        Raises
        ------
        TypeError
            If an element of ``lines`` is not a SoupString.
        core.InconsistentTableError
            If that first data row also contains a ``th`` element, or if
            no row contains a ``td`` element at all.
        """
        for i, line in enumerate(lines):
            if not isinstance(line, SoupString):
                raise TypeError('HTML lines should be of type SoupString')
            soup = line.soup

            if soup.td is not None:
                if soup.th is not None:
                    raise core.InconsistentTableError('HTML tables cannot '
                                                      'have headings and data in the same row')
                return i

        raise core.InconsistentTableError('No start line found for HTML data')

    def end_line(self, lines):
        """
        Return the line number at which table data ends.

        Returns one past the index of the last row containing a ``td``
        element, or None when no row has one.

        Raises
        ------
        TypeError
            If an element of ``lines`` is not a SoupString.
        """
        last_index = -1

        for i, line in enumerate(lines):
            if not isinstance(line, SoupString):
                raise TypeError('HTML lines should be of type SoupString')
            soup = line.soup
            if soup.td is not None:
                last_index = i

        if last_index == -1:
            return None
        return last_index + 1
class HTML(core.BaseReader):
"""HTML format table.
In order to customize input and output, a dict of parameters may
be passed to this class holding specific customizations.
**htmldict** : Dictionary of parameters for HTML input/output.
* css : Customized styling
If present, this parameter will be included in a |