Source code for memote.support.annotation

# -*- coding: utf-8 -*-

# Copyright 2017 Novo Nordisk Foundation Center for Biosustainability,
# Technical University of Denmark.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Supporting functions for annotation checks performed on the model object."""

from __future__ import absolute_import

import logging
import re
from collections import OrderedDict

import pandas as pd
from future.utils import native_str


[docs]LOGGER = logging.getLogger(__name__)
# MIRIAM (http://www.ebi.ac.uk/miriam/) styled identifiers for # common databases that are currently included are: # DB gen,rxn,met url # # 'MetaNetX' ['rxn','met'] 'http://www.metanetx.org' # 'Kegg' ['gen','rxn','met'] 'http://www.kegg.jp/' # 'SEED' ['met'] 'http://modelseed.org/' # # 'InChI' ['met'] 'https://www.ebi.ac.uk/chebi/' # 'InChIKey' ['met'] 'http://cactus.nci.nih.gov/chemical/structure' # 'ChEBI' ['met'] 'http://bioportal.bioontology.org/ontologies/CHEBI' # 'BRENDA' ['rxn'] 'http://www.brenda-enzymes.org/' # 'RHEA' ['rxn'] 'http://www.rhea-db.org/' # 'HMDB' ['met'] 'http://www.hmdb.ca/' # # 'BioCyc' ['rxn','met'] 'http://biocyc.org' # 'Reactome' ['rxn','met'] 'http://www.reactome.org/' # 'BiGG' ['rxn','met'] 'http://bigg.ucsd.edu/universal/' # 'PubChem' ['met'] 'https://pubchem.ncbi.nlm.nih.gov/' # 'RefSeq' ['gen'] 'http://www.ncbi.nlm.nih.gov/projects/RefSeq/' # 'Uniprot' ['gen'] 'http://www.uniprot.org/' # 'EC-Code' ['rxn'] 'http://www.enzyme-database.org/' # 'EcoGene' ['gen'] 'http://ecogene.org/' # 'NCBI GI' ['gen'] 'http://www.ncbi.nlm.nih.gov/protein/' # 'NCBI Gene' ['gen'] 'http://ncbigene.bio2rdf.org/fct' # 'NCBI Protein'['gen'] 'http://www.ncbi.nlm.nih.gov/protein' # 'CCDS' ['gen'] 'http://www.ncbi.nlm.nih.gov/CCDS/' # 'HPRD' ['gen'] 'http://www.hprd.org/' # 'ASAP' ['gen'] 'http://asap.ahabs.wisc.edu/asap/home.php'
[docs]GENE_PRODUCT_ANNOTATIONS = OrderedDict( [ ( "refseq", re.compile( r"^((AC|AP|NC|NG|NM|NP|NR|NT|" r"NW|XM|XP|XR|YP|ZP)_\d+|" r"(NZ\_[A-Z]{4}\d+))(\.\d+)?$" ), ), ( "uniprot", re.compile( r"^([A-N,R-Z][0-9]([A-Z][A-Z, 0-9]" r"[A-Z, 0-9][0-9]){1,2})|([O,P,Q]" r"[0-9][A-Z, 0-9][A-Z, 0-9][A-Z, 0-9]" r"[0-9])(\.\d+)?$" ), ), ("ecogene", re.compile(r"^EG\d+$")), ("kegg.genes", re.compile(r"^\w+:[\w\d\.-]*$")), ("ncbigi", re.compile(r"^(GI|gi)\:\d+$")), ("ncbigene", re.compile(r"^\d+$")), ("ncbiprotein", re.compile(r"^(\w+\d+(\.\d+)?)|(NP_\d+)$")), ("ccds", re.compile(r"^CCDS\d+\.\d+$")), ("hprd", re.compile(r"^\d+$")), ("asap", re.compile(r"^[A-Za-z0-9-]+$")),
] )
[docs]REACTION_ANNOTATIONS = OrderedDict( [ ("rhea", re.compile(r"^\d{5}$")), ("kegg.reaction", re.compile(r"^R\d+$")), ("seed.reaction", re.compile(r"^rxn\d+$")), ("metanetx.reaction", re.compile(r"^MNXR\d+$")), ("bigg.reaction", re.compile(r"^[a-z_A-Z0-9]+$")), ( "reactome", re.compile(r"(^R-[A-Z]{3}-[0-9]+(-[0-9]+)?$)|(^REACT_\d+(\.\d+)?$)"), ), ( "ec-code", re.compile( r"^\d+\.-\.-\.-|\d+\.\d+\.-\.-|" r"\d+\.\d+\.\d+\.-|" r"\d+\.\d+\.\d+\.(n)?\d+$" ), ), ( "brenda", re.compile( r"^\d+\.-\.-\.-|\d+\.\d+\.-\.-|" r"\d+\.\d+\.\d+\.-|" r"\d+\.\d+\.\d+\.(n)?\d+$" ), ), ("biocyc", re.compile(r"^[A-Z-0-9]+(?<!CHEBI)" r"(\:)?[A-Za-z0-9+_.%-]+$")),
] )
[docs]METABOLITE_ANNOTATIONS = OrderedDict( [ ("pubchem.compound", re.compile(r"^\d+$")), ("kegg.compound", re.compile(r"^C\d+$")), ("seed.compound", re.compile(r"^cpd\d+$")), ("inchikey", re.compile(r"^[A-Z]{14}\-[A-Z]{10}(\-[A-Z])?")), ( "inchi", re.compile( r"^InChI\=1S?\/[A-Za-z0-9\.]+(\+[0-9]+)?" r"(\/[cnpqbtmsih][A-Za-z0-9\-\+\(\)\,\/\?\;\.]+)*$" ), ), ("chebi", re.compile(r"^CHEBI:\d+$")), ("hmdb", re.compile(r"^HMDB\d{5}$")), ( "reactome", re.compile(r"(^R-[A-Z]{3}-[0-9]+(-[0-9]+)?$)|(^REACT_\d+(\.\d+)?$)"), ), ("metanetx.chemical", re.compile(r"^MNXM\d+$")), ("bigg.metabolite", re.compile(r"^[a-z_A-Z0-9]+$")), ("biocyc", re.compile(r"^[A-Z-0-9]+(?<!CHEBI)(\:)?[A-Za-z0-9+_.%-]+$")),
] )
[docs]def find_components_without_annotation(model, components): """ Find model components with empty annotation attributes. Parameters ---------- model : cobra.Model A cobrapy metabolic model. components : {"metabolites", "reactions", "genes"} A string denoting `cobra.Model` components. Returns ------- list The components without any annotation. """ return [ elem for elem in getattr(model, components) if elem.annotation is None or len(elem.annotation) == 0
]
[docs]def generate_component_annotation_overview(elements, db): """ Tabulate which MIRIAM databases the component's annotation match. Parameters ---------- elements : list Elements of a model, either metabolites, reactions, or genes. db : str One of the MIRIAM database identifiers. Returns ------- list The components that are not annotated with the specified MIRIAM database. """ return [elem for elem in elements if db not in elem.annotation]
[docs]def generate_component_annotation_miriam_match(elements, component, db): """ Tabulate which MIRIAM databases the element's annotation match. If the relevant MIRIAM identifier is not in an element's annotation it is ignored. Parameters ---------- elements : list Elements of a model, either metabolites or reactions. component : {"metabolites", "reactions"} A string denoting a type of ``cobra.Model`` component. db : str One of the MIRIAM database identifiers. Returns ------- list The components whose annotation does not match the pattern for the MIRIAM database. """ def is_faulty(annotation, key, pattern): # Ignore missing annotation for this database. if key not in annotation: return False test = annotation[key] if isinstance(test, native_str): return pattern.match(test) is None else: return any(pattern.match(elem) is None for elem in test) pattern = { "metabolites": METABOLITE_ANNOTATIONS, "reactions": REACTION_ANNOTATIONS, "genes": GENE_PRODUCT_ANNOTATIONS, }[component][db] return [elem for elem in elements if is_faulty(elem.annotation, db, pattern)]
[docs]def generate_component_id_namespace_overview(model, components): """ Tabulate which MIRIAM databases the component's identifier matches. Parameters ---------- model : cobra.Model A cobrapy metabolic model. components : {"metabolites", "reactions", "genes"} A string denoting `cobra.Model` components. Returns ------- pandas.DataFrame The index of the table is given by the component identifiers. Each column corresponds to one MIRIAM database and a Boolean entry determines whether the annotation matches. """ patterns = { "metabolites": METABOLITE_ANNOTATIONS, "reactions": REACTION_ANNOTATIONS, "genes": GENE_PRODUCT_ANNOTATIONS, }[components] databases = list(patterns) data = list() index = list() for elem in getattr(model, components): index.append(elem.id) data.append(tuple(patterns[db].match(elem.id) is not None for db in databases)) df = pd.DataFrame(data, index=index, columns=databases) if components != "genes": # Clean up of the dataframe. Unfortunately the Biocyc patterns match # broadly. Hence, whenever a Metabolite or Reaction ID matches to any # DB pattern AND the Biocyc pattern we have to assume that this is a # false positive. # First determine all rows in which 'biocyc' and other entries are # True simultaneously and use this Boolean series to create another # column temporarily. df["duplicate"] = df[df["biocyc"]].sum(axis=1) >= 2 # Replace all nan values with False df["duplicate"].fillna(False, inplace=True) # Use the additional column to index the original dataframe to identify # false positive biocyc hits and set them to False. df.loc[df["duplicate"], "biocyc"] = False # Delete the additional column del df["duplicate"] return df