Source code for memote.support.basic

# -*- coding: utf-8 -*-

# Copyright 2017 Novo Nordisk Foundation Center for Biosustainability,
# Technical University of Denmark.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Supporting functions for basic checks performed on the model object."""

from __future__ import absolute_import, division

import logging
from itertools import combinations

from cobra.medium import find_external_compartment
from pylru import lrudecorator

import memote.support.helpers as helpers
from memote.support.gpr_helpers import find_top_level_complex
from memote.utils import filter_none


__all__ = ("check_metabolites_formula_presence",)


LOGGER = logging.getLogger(__name__)


[docs]def check_metabolites_formula_presence(model): """Return the list of model metabolites that have no associated formula.""" return [met for met in model.metabolites if met.formula is None]
def check_metabolites_charge_presence(model): """Return the list of model metabolites that have no associated charge.""" return [met for met in model.metabolites if met.charge is None] def check_gene_protein_reaction_rule_presence(model): """Return the list of model reactions that have no associated gene rule.""" return [rxn for rxn in model.reactions if not rxn.gene_reaction_rule] def find_nonzero_constrained_reactions(model): """Return list of reactions with non-zero, non-maximal bounds.""" lower_bound, upper_bound = helpers.find_bounds(model) return [ rxn for rxn in model.reactions if 0 > rxn.lower_bound > lower_bound or 0 < rxn.upper_bound < upper_bound ] def find_zero_constrained_reactions(model): """Return list of reactions that are constrained to zero flux.""" return [ rxn for rxn in model.reactions if rxn.lower_bound == 0 and rxn.upper_bound == 0 ] def find_irreversible_reactions(model): """Return list of reactions that are irreversible.""" return [rxn for rxn in model.reactions if rxn.reversibility is False] def find_unconstrained_reactions(model): """Return list of reactions that are not constrained at all.""" lower_bound, upper_bound = helpers.find_bounds(model) return [ rxn for rxn in model.reactions if rxn.lower_bound <= lower_bound and rxn.upper_bound >= upper_bound ] def find_ngam(model): """ Return all potential non growth-associated maintenance reactions. From the list of all reactions that convert ATP to ADP select the reactions that match a defined reaction string and whose metabolites are situated within the main model compartment. The main model compartment is the cytosol, and if that cannot be identified, the compartment with the most metabolites. Parameters ---------- model : cobra.Model The metabolic model under investigation. Returns ------- list Reactions that qualify as non-growth associated maintenance reactions. Notes ----- [1]_ define the non-growth associated maintenance (NGAM) as the energy required to maintain all constant processes such as turgor pressure and other housekeeping activities. In metabolic models this is expressed by requiring a simple ATP hydrolysis reaction to always have a fixed minimal amount of flux. This value can be measured as described by [1]_ . References ---------- .. [1] Thiele, I., & Palsson, B. Ø. (2010, January). A protocol for generating a high-quality genome-scale metabolic reconstruction. Nature protocols. Nature Publishing Group. http://doi.org/10.1038/nprot.2009.203 """ atp_adp_conv_rxns = helpers.find_converting_reactions(model, ("MNXM3", "MNXM7")) id_of_main_compartment = helpers.find_compartment_id_in_model(model, "c") reactants = { helpers.find_met_in_model(model, "MNXM3", id_of_main_compartment)[0], helpers.find_met_in_model(model, "MNXM2", id_of_main_compartment)[0], } products = { helpers.find_met_in_model(model, "MNXM7", id_of_main_compartment)[0], helpers.find_met_in_model(model, "MNXM1", id_of_main_compartment)[0], helpers.find_met_in_model(model, "MNXM9", id_of_main_compartment)[0], } candidates = [ rxn for rxn in atp_adp_conv_rxns if rxn.reversibility is False and set(rxn.reactants) == reactants and set(rxn.products) == products ] buzzwords = [ "maintenance", "atpm", "requirement", "ngam", "non-growth", "associated", ] refined_candidates = [ rxn for rxn in candidates if any(string in filter_none(rxn.name, "").lower() for string in buzzwords) ] if refined_candidates: return refined_candidates else: return candidates def calculate_metabolic_coverage(model): """ Return the ratio of reactions and genes included in the model. Determine whether the amount of reactions and genes in model not equal to zero, then return the ratio. Parameters ---------- model : cobra.Model The metabolic model under investigation. Returns ------- float The ratio of reactions to genes also called metabolic coverage. Raises ------ ValueError If the model does not contain either reactions or genes. Notes ----- According to [1]_ this is a good quality indicator expressing the degree of metabolic coverage i.e. modeling detail of a given reconstruction. The authors explain that models with a 'high level of modeling detail have ratios >1, and [models] with low level of detail have ratios <1'. They explain that 'this difference arises because [models] with basic or intermediate levels of detail often include many reactions in which several gene products and their enzymatic transformations are ‘lumped’'. References ---------- .. [1] Monk, J., Nogales, J., & Palsson, B. O. (2014). Optimizing genome-scale network reconstructions. Nature Biotechnology, 32(5), 447–452. http://doi.org/10.1038/nbt.2870 """ if len(model.reactions) == 0 or len(model.genes) == 0: raise ValueError("The model contains no reactions or genes.") return float(len(model.reactions)) / float(len(model.genes)) def find_protein_complexes(model): """ Find reactions that are catalyzed by at least a heterodimer. Parameters ---------- model : cobra.Model The metabolic model under investigation. Returns ------- list Reactions whose gene-protein-reaction association contains at least one logical AND combining different gene products (heterodimer). """ complexes = [] for rxn in model.reactions: if not rxn.gene_reaction_rule: continue size = find_top_level_complex(rxn.gene_reaction_rule) if size >= 2: complexes.append(rxn) return complexes @lrudecorator(size=2) def find_pure_metabolic_reactions(model): """ Return reactions that are neither transporters, exchanges, nor pseudo. Parameters ---------- model : cobra.Model The metabolic model under investigation. """ return set(model.reactions) - helpers.find_interchange_biomass_reactions(model) def is_constrained_reaction(model, rxn): """Return whether a reaction has fixed constraints.""" lower_bound, upper_bound = helpers.find_bounds(model) if rxn.reversibility: return rxn.lower_bound > lower_bound or rxn.upper_bound < upper_bound else: return rxn.lower_bound > 0 or rxn.upper_bound < upper_bound def find_oxygen_reactions(model): """Return list of oxygen-producing/-consuming reactions.""" o2_in_model = helpers.find_met_in_model(model, "MNXM4") return set( [ rxn for met in model.metabolites for rxn in met.reactions if met.formula == "O2" or met in o2_in_model ] ) def find_unique_metabolites(model): """Return set of metabolite IDs without duplicates from compartments.""" unique = set() for met in model.metabolites: is_missing = True for comp in model.compartments: if met.id.endswith("_{}".format(comp)): unique.add(met.id[: -(len(comp) + 1)]) is_missing = False break if is_missing: unique.add(met.id) return unique @lrudecorator(size=2) def find_duplicate_metabolites_in_compartments(model): """ Return list of metabolites with duplicates in the same compartment. This function identifies duplicate metabolites in each compartment by determining if any two metabolites have identical InChI-key annotations. For instance, this function would find compounds with IDs ATP1 and ATP2 in the cytosolic compartment, with both having the same InChI annotations. Parameters ---------- model : cobra.Model The metabolic model under investigation. Returns ------- list A list of tuples of duplicate metabolites. """ unique_identifiers = ["inchikey", "inchi"] duplicates = [] for met_1, met_2 in combinations(model.metabolites, 2): if met_1.compartment == met_2.compartment: for key in unique_identifiers: if key in met_1.annotation and key in met_2.annotation: if met_1.annotation[key] == met_2.annotation[key]: duplicates.append((met_1.id, met_2.id)) break return duplicates def find_reactions_with_partially_identical_annotations(model): """ Return duplicate reactions based on identical annotation. Identify duplicate reactions globally by checking if any two metabolic reactions have the same entries in their annotation attributes. This can be useful to identify one 'type' of reactions that occurs in several compartments, to curate merged models or to clean-up bulk model modifications. The heuristic looks at annotations with the keys "metanetx.reaction", "kegg.reaction", "brenda", "rhea", "biocyc", "bigg.reaction" only. Parameters ---------- model : cobra.Model The metabolic model under investigation. Returns ------- dict A mapping from sets of annotations to groups of reactions with those annotations. int The total number of unique reactions that are duplicated. """ duplicates = {} rxn_db_identifiers = [ "metanetx.reaction", "kegg.reaction", "brenda", "rhea", "biocyc", "bigg.reaction", ] # Build a list that associates a reaction with a set of its annotations. ann_rxns = [] for rxn in model.reactions: ann = [] for key in rxn_db_identifiers: if key in rxn.annotation: if isinstance(rxn.annotation[key], list): ann.extend([(key, elem) for elem in rxn.annotation[key]]) else: ann.append((key, rxn.annotation[key])) ann_rxns.append((rxn, frozenset(ann))) # Compute the intersection between annotations and record the matching # reaction identifiers. for (rxn_a, ann_a), (rxn_b, ann_b) in combinations(ann_rxns, 2): mutual_pair = tuple(ann_a & ann_b) if len(mutual_pair) > 0: duplicates.setdefault(mutual_pair, set()).update([rxn_a.id, rxn_b.id]) # Transform the object for JSON compatibility num_duplicated = set() duplicated = {} for key in duplicates: # Object keys must be strings in JSON. new_key = ",".join(sorted("{}:{}".format(ns, term) for ns, term in key)) duplicated[new_key] = rxns = list(duplicates[key]) num_duplicated.update(rxns) return duplicated, len(num_duplicated) def map_metabolites_to_structures(metabolites, compartments): """ Map metabolites from the identifier namespace to structural space. Metabolites who lack structural annotation (InChI or InChIKey) are ignored. Parameters ---------- metabolites : iterable The cobra.Metabolites to map. compartments : iterable The different compartments to consider. Structures are treated separately for each compartment. Returns ------- dict A mapping from a cobra.Metabolite to its compartment specific structure index. """ # TODO (Moritz Beber): Consider SMILES? unique_identifiers = ["inchikey", "inchi"] met2mol = {} molecules = {c: [] for c in compartments} for met in metabolites: ann = [] for key in unique_identifiers: mol = met.annotation.get(key) if mol is not None: ann.append(mol) # Ignore metabolites without the required information. if len(ann) == 0: continue ann = set(ann) # Compare with other structures in the same compartment. mols = molecules[met.compartment] for i, mol_group in enumerate(mols): if len(ann & mol_group) > 0: mol_group.update(ann) # We map to the index of the group because it is hashable and # cheaper to compare later. met2mol[met] = "{}-{}".format(met.compartment, i) break if met not in met2mol: # The length of the list corresponds to the 0-index after appending. met2mol[met] = "{}-{}".format(met.compartment, len(mols)) mols.append(ann) return met2mol def find_duplicate_reactions(model): """ Return a list with pairs of reactions that are functionally identical. Identify duplicate reactions globally by checking if any two reactions have the same metabolites, same directionality and are in the same compartment. This can be useful to curate merged models or to clean-up bulk model modifications. The heuristic compares reactions in a pairwise manner. For each reaction, the metabolite annotations are checked for a description of the structure (via InChI and InChIKey).If they exist, substrates and products as well as the stoichiometries of any reaction pair are compared. Only reactions where the substrates, products, stoichiometry and reversibility are identical are considered to be duplicates. This test will not be able to identify duplicate reactions if there are no structure annotations. Further, it will report reactions with differing bounds as equal if they otherwise match the above conditions. Parameters ---------- model : cobra.Model The metabolic model under investigation. Returns ------- list A list of pairs of duplicate reactions based on metabolites. int The number of unique reactions that have a duplicates """ met2mol = map_metabolites_to_structures(model.metabolites, model.compartments) # Build a list associating reactions with their stoichiometry in molecular # structure space. structural = [] for rxn in model.reactions: # Ignore reactions that have metabolites without structures. if not all(met in met2mol for met in rxn.metabolites): continue # We consider substrates and products separately since, for example, # the InChI for H2O and OH is the same. substrates = {met2mol[met]: rxn.get_coefficient(met) for met in rxn.reactants} products = {met2mol[met]: rxn.get_coefficient(met) for met in rxn.products} structural.append((rxn, substrates, products)) # Compare reactions using their structure-based stoichiometries. num_duplicated = set() duplicates = [] for (rxn_a, sub_a, prod_a), (rxn_b, sub_b, prod_b) in combinations(structural, 2): # Compare the substrates. if sub_a != sub_b: continue # Compare the products. if prod_a != prod_b: continue # Compare whether they are both (ir-)reversible. if rxn_a.reversibility != rxn_b.reversibility: continue # TODO (Moritz Beber): We could compare bounds here but it might be # worth knowing about the reactions even if their bounds differ? duplicates.append((rxn_a.id, rxn_b.id)) num_duplicated.add(rxn_a.id) num_duplicated.add(rxn_b.id) return duplicates, len(num_duplicated) def find_reactions_with_identical_genes(model): """ Return reactions that have identical genes. Identify duplicate reactions globally by checking if any two reactions have the same genes. This can be useful to curate merged models or to clean-up bulk model modifications, but also to identify promiscuous enzymes. The heuristic compares reactions in a pairwise manner and reports on reaction pairs whose genes are identical. Reactions with missing genes are skipped. Parameters ---------- model : cobra.Model The metabolic model under investigation. Returns ------- dict A mapping from sets of genes to all the reactions containing those genes. int The total number of unique reactions that appear duplicates based on their gene-protein-reaction associations. """ duplicates = dict() for rxn_a, rxn_b in combinations(model.reactions, 2): if not (rxn_a.genes and rxn_b.genes): continue if rxn_a.genes == rxn_b.genes: # This works because the `genes` are frozen sets. identifiers = rxn_a.genes duplicates.setdefault(identifiers, set()).update([rxn_a.id, rxn_b.id]) # Transform the object for JSON compatibility num_duplicated = set() duplicated = {} for key in duplicates: # Object keys must be strings in JSON. new_key = ",".join(sorted(g.id for g in key)) duplicated[new_key] = rxns = list(duplicates[key]) num_duplicated.update(rxns) return duplicated, len(num_duplicated) def check_transport_reaction_gpr_presence(model): """Return the list of transport reactions that have no associated gpr.""" return [ rxn for rxn in helpers.find_transport_reactions(model) if not rxn.gene_reaction_rule ] def find_medium_metabolites(model): """Return the list of metabolites ingested/excreted by the model.""" return [ met.id for rxn in model.medium for met in model.reactions.get_by_id(rxn).metabolites ] def find_external_metabolites(model): """Return all metabolites in the external compartment.""" ex_comp = find_external_compartment(model) return [met for met in model.metabolites if met.compartment == ex_comp]