# Licensed under a 3-clause BSD style license - see LICENSE.rst """An extensible HTML table reader and writer. html.py: Classes to read and write HTML tables `BeautifulSoup `_ must be installed to read HTML tables. """ import warnings from . import core from astropy.table import Column from astropy.utils.xml import writer from copy import deepcopy class SoupString(str): """ Allows for strings to hold BeautifulSoup data. """ def __new__(cls, *args, **kwargs): return str.__new__(cls, *args, **kwargs) def __init__(self, val): self.soup = val class ListWriter: """ Allows for XMLWriter to write to a list instead of a file. """ def __init__(self, out): self.out = out def write(self, data): self.out.append(data) def identify_table(soup, htmldict, numtable): """ Checks whether the given BeautifulSoup tag is the table the user intends to process. """ if soup is None or soup.name != 'table': return False # Tag is not a elif 'table_id' not in htmldict: return numtable == 1 table_id = htmldict['table_id'] if isinstance(table_id, str): return 'id' in soup.attrs and soup['id'] == table_id elif isinstance(table_id, int): return table_id == numtable # Return False if an invalid parameter is given return False class HTMLInputter(core.BaseInputter): """ Input lines of HTML in a valid form. This requires `BeautifulSoup `_ to be installed. """ def process_lines(self, lines): """ Convert the given input into a list of SoupString rows for further processing. """ try: from bs4 import BeautifulSoup except ImportError: raise core.OptionalTableImportError('BeautifulSoup must be ' 'installed to read HTML tables') if 'parser' not in self.html: with warnings.catch_warnings(): # Ignore bs4 parser warning #4550. warnings.filterwarnings('ignore', '.*no parser was explicitly specified.*') soup = BeautifulSoup('\n'.join(lines)) else: # use a custom backend parser soup = BeautifulSoup('\n'.join(lines), self.html['parser']) tables = soup.find_all('table') for i, possible_table in enumerate(tables): if identify_table(possible_table, self.html, i + 1): table = possible_table # Find the correct table break else: if isinstance(self.html['table_id'], int): err_descr = f"number {self.html['table_id']}" else: err_descr = f"id '{self.html['table_id']}'" raise core.InconsistentTableError( f'ERROR: HTML table {err_descr} not found') # Get all table rows soup_list = [SoupString(x) for x in table.find_all('tr')] return soup_list class HTMLSplitter(core.BaseSplitter): """ Split HTML table data. """ def __call__(self, lines): """ Return HTML data from lines as a generator. """ for line in lines: if not isinstance(line, SoupString): raise TypeError('HTML lines should be of type SoupString') soup = line.soup header_elements = soup.find_all('th') if header_elements: # Return multicolumns as tuples for HTMLHeader handling yield [(el.text.strip(), el['colspan']) if el.has_attr('colspan') else el.text.strip() for el in header_elements] data_elements = soup.find_all('td') if data_elements: yield [el.text.strip() for el in data_elements] if len(lines) == 0: raise core.InconsistentTableError('HTML tables must contain data ' 'in a
tag') class HTMLOutputter(core.TableOutputter): """ Output the HTML data as an ``astropy.table.Table`` object. This subclass allows for the final table to contain multidimensional columns (defined using the colspan attribute of
). """ default_converters = [core.convert_numpy(int), core.convert_numpy(float), core.convert_numpy(str)] def __call__(self, cols, meta): """ Process the data in multidimensional columns. """ new_cols = [] col_num = 0 while col_num < len(cols): col = cols[col_num] if hasattr(col, 'colspan'): # Join elements of spanned columns together into list of tuples span_cols = cols[col_num:col_num + col.colspan] new_col = core.Column(col.name) new_col.str_vals = list(zip(*[x.str_vals for x in span_cols])) new_cols.append(new_col) col_num += col.colspan else: new_cols.append(col) col_num += 1 return super().__call__(new_cols, meta) class HTMLHeader(core.BaseHeader): splitter_class = HTMLSplitter def start_line(self, lines): """ Return the line number at which header data begins. """ for i, line in enumerate(lines): if not isinstance(line, SoupString): raise TypeError('HTML lines should be of type SoupString') soup = line.soup if soup.th is not None: return i return None def _set_cols_from_names(self): """ Set columns from header names, handling multicolumns appropriately. """ self.cols = [] new_names = [] for name in self.names: if isinstance(name, tuple): col = core.Column(name=name[0]) col.colspan = int(name[1]) self.cols.append(col) new_names.append(name[0]) for i in range(1, int(name[1])): # Add dummy columns self.cols.append(core.Column('')) new_names.append('') else: self.cols.append(core.Column(name=name)) new_names.append(name) self.names = new_names class HTMLData(core.BaseData): splitter_class = HTMLSplitter def start_line(self, lines): """ Return the line number at which table data begins. """ for i, line in enumerate(lines): if not isinstance(line, SoupString): raise TypeError('HTML lines should be of type SoupString') soup = line.soup if soup.td is not None: if soup.th is not None: raise core.InconsistentTableError('HTML tables cannot ' 'have headings and data in the same row') return i raise core.InconsistentTableError('No start line found for HTML data') def end_line(self, lines): """ Return the line number at which table data ends. """ last_index = -1 for i, line in enumerate(lines): if not isinstance(line, SoupString): raise TypeError('HTML lines should be of type SoupString') soup = line.soup if soup.td is not None: last_index = i if last_index == -1: return None return last_index + 1 class HTML(core.BaseReader): """HTML format table. In order to customize input and output, a dict of parameters may be passed to this class holding specific customizations. **htmldict** : Dictionary of parameters for HTML input/output. * css : Customized styling If present, this parameter will be included in a