Source code for qiime2.metadata.io

# ----------------------------------------------------------------------------
# Copyright (c) 2016-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import csv
import itertools
import os.path
import re

import numpy as np
import pandas as pd

from qiime2.core.util import find_duplicates
import qiime2.core.missing as _missing
from .base import SUPPORTED_COLUMN_TYPES, FORMATTED_ID_HEADERS, is_id_header
from .metadata import Metadata, MetadataColumn


[docs] class MetadataFileError(Exception): _suffix = ( "There may be more errors present in the metadata file. To get a full " "report, sample/feature metadata files can be validated with Keemei: " "https://keemei.qiime2.org\n\nFind details on QIIME 2 metadata " "requirements here: https://docs.qiime2.org/%s/tutorials/metadata/") def __init__(self, message, include_suffix=True): # Lazy import because `qiime2.__release__` is available at runtime but # not at import time (otherwise the release value could be interpolated # into `_suffix` in the class definition above). import qiime2 if include_suffix: message = message + '\n\n' + self._suffix % qiime2.__release__ super().__init__(message)
class MetadataReader: def __init__(self, filepath): if not os.path.isfile(filepath): raise MetadataFileError( "Metadata file path doesn't exist, or the path points to " "something other than a file. Please check that the path " "exists, has read permissions, and points to a regular file " "(not a directory): %s" % filepath) self._filepath = filepath # Used by `read()` to store an iterator yielding rows with # leading/trailing whitespace stripped from their cells (this is a # preprocessing step that should happen with *every* row). The iterator # protocol is the only guaranteed API on this object. self._reader = None def read(self, into, column_types=None, column_missing_schemes=None, default_missing_scheme=_missing.DEFAULT_MISSING): if column_types is None: column_types = {} try: # Newline settings based on recommendation from csv docs: # https://docs.python.org/3/library/csv.html#id3 # Ignore BOM on read (but do not write BOM) with open(self._filepath, 'r', newline='', encoding='utf-8-sig') as fh: tsv_reader = csv.reader(fh, dialect='excel-tab', strict=True) self._reader = (self._strip_cell_whitespace(row) for row in tsv_reader) header = self._read_header() directives = self._read_directives(header) ids, data = self._read_data(header) except UnicodeDecodeError as e: if ('0xff in position 0' in str(e) or '0xfe in position 0' in str(e)): raise MetadataFileError( "Metadata file must be encoded as UTF-8 or ASCII, found " "UTF-16. If this file is from Microsoft Excel, save " "as a plain text file, not 'UTF-16 Unicode'") raise MetadataFileError( "Metadata file must be encoded as UTF-8 or ASCII. The " "following error occurred when decoding the file:\n\n%s" % e) finally: self._reader = None index = pd.Index(ids, name=header[0], dtype=object) df = pd.DataFrame(data, columns=header[1:], index=index, dtype=object) # TODO: move these checks over to Metadata.__init__() so that you can # pass column_types with an untyped dataframe. This would require a bit # of a refactor and doesn't buy a whole lot at the moment, hence the # TODO. for name, type in column_types.items(): if name not in df.columns: raise MetadataFileError( "Column name %r specified in `column_types` is not a " "column in the metadata file." % name) if type not in SUPPORTED_COLUMN_TYPES: fmt_column_types = ', '.join( repr(e) for e in sorted(SUPPORTED_COLUMN_TYPES)) raise MetadataFileError( "Column name %r specified in `column_types` has an " "unrecognized column type %r. Supported column types: %s" % (name, type, fmt_column_types)) resolved_column_types = directives.get('types', {}) resolved_column_types.update(column_types) if column_missing_schemes is None: column_missing_schemes = {} resolved_missing = {c: default_missing_scheme for c in df.columns} resolved_missing.update(directives.get('missing', {})) resolved_missing.update(column_missing_schemes) try: # Cast each column to the appropriate dtype based on column type. df = df.apply(self._cast_column, axis='index', column_types=resolved_column_types, missing_schemes=resolved_missing) except MetadataFileError as e: # HACK: If an exception is raised within `DataFrame.apply`, pandas # adds an extra tuple element to `e.args`, making the original # error message difficult to read because a tuple is repr'd instead # of a string. To work around this, we catch and reraise a # MetadataFileError with the original error message. We use # `include_suffix=False` to avoid adding another suffix to the # error message we're reraising. msg = e.args[0] raise MetadataFileError(msg, include_suffix=False) try: return into(df, column_missing_schemes=resolved_missing, default_missing_scheme=default_missing_scheme) except Exception as e: raise MetadataFileError( "There was an issue with loading the metadata file:\n\n%s" % e) def _read_header(self): header = None for row in self._reader: if self._is_header(row): header = row break elif self._is_comment(row): continue elif self._is_empty(row): continue elif self._is_directive(row): raise MetadataFileError( "Found directive %r while searching for header. " "Directives may only appear immediately after the header." % row[0]) else: raise MetadataFileError( "Found unrecognized ID column name %r while searching for " "header. The first column name in the header defines the " "ID column, and must be one of these values:\n\n%s\n\n" "NOTE: Metadata files must contain tab-separated values." % (row[0], FORMATTED_ID_HEADERS)) if header is None: raise MetadataFileError( "Failed to locate header. The metadata file may be empty, or " "consists only of comments or empty rows.") # Trim trailing empty cells from header. data_extent = None for idx, cell in enumerate(header): if cell != '': data_extent = idx header = header[:data_extent+1] # Basic validation to 1) fail early before processing entire file; and # 2) make some basic guarantees about the header for things in this # class that use the header as part of reading the file. column_names = set(header) if '' in column_names: raise MetadataFileError( "Found at least one column without a name in the header. Each " "column must be named.") elif len(header) != len(column_names): duplicates = find_duplicates(header) raise MetadataFileError( "Column names must be unique. The following column names are " "duplicated: %s" % (', '.join(repr(e) for e in sorted(duplicates)))) # Skip the first element of the header because we know it is a valid ID # header. The other column names are validated to ensure they *aren't* # valid ID headers. for column_name in header[1:]: if is_id_header(column_name): raise MetadataFileError( "Metadata column name %r conflicts with a name reserved " "for the ID column header. Reserved ID column headers:" "\n\n%s" % (column_name, FORMATTED_ID_HEADERS)) return header def _read_directives(self, header): directives = {} for row in self._reader: directive_kind = None if not self._is_directive(row): self._reader = itertools.chain([row], self._reader) break if self._is_column_types_directive(row): directive_kind = 'types' elif self._is_missing_directive(row): directive_kind = 'missing' else: raise MetadataFileError( "Unrecognized directive %r. Only the #q2:types" " and #q2:missing directives are supported at this" " time." % row[0]) if directive_kind in directives: raise MetadataFileError( "Found duplicate directive %r. Each directive may " "only be specified a single time." % row[0]) row = self._match_header_len(row, header) collected = {name: arg for name, arg in zip(header[1:], row[1:]) if arg} directives[directive_kind] = collected if 'types' in directives: column_types = directives['types'] for column_name, column_type in column_types.items(): type_nocase = column_type.lower() if type_nocase in SUPPORTED_COLUMN_TYPES: column_types[column_name] = type_nocase else: fmt_column_types = ', '.join( repr(e) for e in sorted(SUPPORTED_COLUMN_TYPES)) raise MetadataFileError( "Column %r has an unrecognized column type %r " "specified in its #q2:types directive. " "Supported column types (case-insensitive): %s" % (column_name, column_type, fmt_column_types)) if 'missing' in directives: for column_name, column_missing in directives['missing'].items(): if column_missing not in _missing.BUILTIN_MISSING: raise MetadataFileError( "Column %r has an unrecognized missing value scheme %r" " specified in its #q2:missing directive." " Supported missing value schemes (case-sensitive): %s" % (column_name, column_missing, list(_missing.BUILTIN_MISSING)) ) return directives def _read_data(self, header): ids = [] data = [] for row in self._reader: if self._is_comment(row): continue elif self._is_empty(row): continue elif self._is_directive(row): raise MetadataFileError( "Found directive %r outside of the directives section of " "the file. Directives may only appear immediately after " "the header." % row[0]) elif self._is_header(row): raise MetadataFileError( "Metadata ID %r conflicts with a name reserved for the ID " "column header. Reserved ID column headers:\n\n%s" % (row[0], FORMATTED_ID_HEADERS)) row = self._match_header_len(row, header) ids.append(row[0]) data.append(row[1:]) return ids, data def _strip_cell_whitespace(self, row): return [cell.strip() for cell in row] def _match_header_len(self, row, header): row_len = len(row) header_len = len(header) if row_len < header_len: # Pad row with empty cells to match header length. row = row + [''] * (header_len - row_len) elif row_len > header_len: trailing_row = row[header_len:] if not self._is_empty(trailing_row): raise MetadataFileError( "Metadata row contains more cells than are declared by " "the header. The row has %d cells, while the header " "declares %d cells." % (row_len, header_len)) row = row[:header_len] return row def _is_empty(self, row): # `all` returns True for an empty iterable, so this check works for a # row of zero elements (corresponds to a blank line in the file). return all((cell == '' for cell in row)) def _is_comment(self, row): return ( len(row) > 0 and row[0].startswith('#') and not self._is_directive(row) and not self._is_header(row) ) def _is_header(self, row): if len(row) == 0: return False return is_id_header(row[0]) def _is_directive(self, row): return len(row) > 0 and row[0].startswith('#q2:') def _is_column_types_directive(self, row): return len(row) > 0 and row[0].split(' ')[0] == '#q2:types' def _is_missing_directive(self, row): return len(row) > 0 and row[0].split(' ')[0] == '#q2:missing' def _cast_column(self, series, column_types, missing_schemes): if series.name in missing_schemes: scheme = missing_schemes[series.name] series = _missing.series_encode_missing(series, scheme) if series.name in column_types: if column_types[series.name] == 'numeric': return self._to_numeric(series) else: # 'categorical' return self._to_categorical(series) else: # Infer type try: return self._to_numeric(series) except MetadataFileError: return self._to_categorical(series) def _to_categorical(self, series): # Replace empty strings with `None` to force the series to remain # dtype=object (this only matters if the series consists solely of # missing data). Replacing with np.nan and casting to dtype=object # won't retain the correct dtype in the resulting dataframe # (`DataFrame.apply` seems to force series consisting solely of np.nan # to dtype=float64, even if dtype=object is specified. # # To replace a value with `None`, the following invocation of # `Series.replace` must be used because `None` is a sentinel: # https://stackoverflow.com/a/17097397/3776794 return series.replace([''], [None]) def _to_numeric(self, series): series = series.replace('', np.nan) is_numeric = series.apply(self._is_numeric) if is_numeric.all(): return pd.to_numeric(series, errors='raise') else: non_numerics = series[~is_numeric].unique() raise MetadataFileError( "Cannot convert metadata column %r to numeric. The following " "values could not be interpreted as numeric: %s" % (series.name, ', '.join(repr(e) for e in sorted(non_numerics)))) def _is_numeric(self, value): return (isinstance(value, float) or len(_numeric_regex.findall(value)) == 1) class MetadataWriter: def __init__(self, metadata): self._metadata = metadata def write(self, filepath): # Newline settings based on recommendation from csv docs: # https://docs.python.org/3/library/csv.html#id3 # Do NOT write a BOM, hence utf-8 not utf-8-sig with open(filepath, 'w', newline='', encoding='utf-8') as fh: tsv_writer = csv.writer(fh, dialect='excel-tab', strict=True) md = self._metadata header = [md.id_header] types_directive = ['#q2:types'] missing_directive = ['#q2:missing'] if isinstance(md, Metadata): for name, props in md.columns.items(): header.append(name) types_directive.append(props.type) missing_directive.append(props.missing_scheme) elif isinstance(md, MetadataColumn): header.append(md.name) types_directive.append(md.type) missing_directive.append(md.missing_scheme) else: raise NotImplementedError tsv_writer.writerow(header) tsv_writer.writerow(types_directive) if self._non_default_missing(missing_directive): tsv_writer.writerow(missing_directive) df = md.to_dataframe(encode_missing=True) df.fillna('', inplace=True) df = df.applymap(self._format) tsv_writer.writerows(df.itertuples(index=True)) def _non_default_missing(self, missing_directive): missing = missing_directive[1:] result = False for m in missing: if m != _missing.DEFAULT_MISSING: result = True break return result def _format(self, value): if isinstance(value, str): return value elif isinstance(value, float): # Use fixed precision or scientific notation as necessary (both are # roundtrippable in the metadata file format), with up to 15 digits # *total* precision (i.e. before and after the decimal point), # rounding if necessary. Trailing zeros or decimal points will not # be included in the formatted string (e.g. 42.0 will be formatted # as "42"). A precision of 15 digits is used because that is within # the 64-bit floating point spec (things get weird after that). # # Using repr() and str() each have their own predefined precision # which varies across Python versions. Using the string formatting # presentation types (e.g. %g, %f) without specifying a precision # will usually default to 6 digits past the decimal point, which # seems a little low. # # References: # # - https://stackoverflow.com/a/2440786/3776794 # - https://stackoverflow.com/a/2440708/3776794 # - https://docs.python.org/3/library/string.html# # format-specification-mini-language # - https://stackoverflow.com/a/20586479/3776794 # - https://drj11.wordpress.com/2007/07/03/python-poor-printing- # of-floating-point/ return '{0:.15g}'.format(value) else: raise NotImplementedError # Credit: https://stackoverflow.com/a/4703508/3776794 _numeric_pattern = r""" ^[-+]? # optional sign (?: (?: \d* \. \d+ ) # .1 .12 .123 etc 9.1 etc 98.1 etc | (?: \d+ \.? ) # 1. 12. 123. etc 1 12 123 etc ) # followed by optional exponent part if desired (?: [Ee] [+-]? \d+ ) ?$ """ _numeric_regex = re.compile(_numeric_pattern, re.VERBOSE)