Source code for pepkit.chem.desc.descriptor

import pandas as pd
from peptides import Peptide
from rdkit import Chem
from rdkit.Chem import Descriptors
from joblib import Parallel, delayed
from typing import List, Dict, Union, Any


class Descriptor:
    """
    Compute molecular or peptide descriptors for a collection of records.

    This class provides descriptor calculation for peptides or small
    molecules, supporting two engines:

    - 'peptides': Uses the `peptides` Python package for peptide descriptors.
    - 'rdkit': Uses RDKit for general molecular descriptors from SMILES.

    :param engine: Descriptor engine ('peptides' for peptide descriptors,
        'rdkit' for molecular descriptors). Matched case-insensitively.
    :type engine: str
    :param fasta_key: Key for the peptide sequence in input records or
        DataFrame (used only by 'peptides').
    :type fasta_key: str
    :param id_key: Key for unique record identifiers in input.
    :type id_key: str
    :param smiles_key: Key for SMILES string in input records
        (used only by 'rdkit').
    :type smiles_key: str

    Example
    -------
    >>> descriptor = Descriptor(engine='peptides')
    >>> records = [{'id': 1, 'peptide_sequence': 'ACDE'}]
    >>> df_out = descriptor.calculate(records, n_jobs=2)

    >>> descriptor = Descriptor(engine='rdkit')
    >>> records = [{'id': 1, 'smiles': 'CC(=O)O'}]
    >>> df_out = descriptor.calculate(records, n_jobs=4)
    """

    SUPPORTED_ENGINES = {"peptides", "rdkit"}

    def __init__(
        self,
        engine: str = "peptides",
        fasta_key: str = "peptide_sequence",
        id_key: str = "id",
        smiles_key: str = "smiles",
    ) -> None:
        """
        Initialize the Descriptor calculator.

        :param engine: Descriptor engine to use ('peptides' or 'rdkit').
            The value is lower-cased before validation.
        :type engine: str
        :param fasta_key: Sequence key in each record
            (used only by 'peptides').
        :type fasta_key: str
        :param id_key: ID key in each record.
        :type id_key: str
        :param smiles_key: SMILES key in each record (used only by 'rdkit').
        :type smiles_key: str
        :raises ValueError: If unsupported engine is specified.
        """
        engine = engine.lower()
        if engine not in self.SUPPORTED_ENGINES:
            # Sort the choices so the message is deterministic; a raw set
            # prints in arbitrary order.
            raise ValueError(
                f"Unsupported engine '{engine}'. "
                f"Choose from {sorted(self.SUPPORTED_ENGINES)}."
            )
        self.engine = engine
        self.fasta_key = fasta_key
        self.id_key = id_key
        self.smiles_key = smiles_key

    @staticmethod
    def _require_field(record: Dict[str, Any], key: str, label: str) -> Any:
        """
        Return ``record[key]``, treating an absent or ``None`` value as missing.

        :param record: Input record.
        :type record: dict
        :param key: Key to look up in the record.
        :type key: str
        :param label: Human-readable field name used in the error message.
        :type label: str
        :raises KeyError: If the key is absent or its value is ``None``.
        :return: The value stored under ``key``.
        """
        value = record.get(key)
        if value is None:
            raise KeyError(
                f"Missing {label} under key '{key}' in record {record}"
            )
        return value

    @staticmethod
    def _calc_peptide_record(
        record: Dict[str, Any], fasta_key: str, id_key: str
    ) -> Dict[str, Any]:
        """
        Compute peptide-specific descriptors using the `peptides` library.

        :param record: Input record with sequence and ID.
        :type record: dict
        :param fasta_key: Key for sequence in the record.
        :type fasta_key: str
        :param id_key: Key for record ID.
        :type id_key: str
        :raises KeyError: If required fields are missing (or ``None``).
        :return: Descriptor dictionary with sequence and ID fields.
        :rtype: dict

        Example
        -------
        >>> Descriptor._calc_peptide_record(
        ...     {'id': 1, 'peptide_sequence': 'ACD'}, 'peptide_sequence', 'id'
        ... )
        {'id': 1, 'peptide_sequence': 'ACD', ...}
        """
        # Validate both fields (sequence first, then ID) before touching the
        # descriptor library.
        seq = Descriptor._require_field(record, fasta_key, "sequence")
        pid = Descriptor._require_field(record, id_key, "ID")

        desc = Peptide(seq).descriptors()
        desc[id_key] = pid
        desc[fasta_key] = seq
        return desc

    @staticmethod
    def _calc_rdkit_record(
        record: Dict[str, Any], smiles_key: str, id_key: str
    ) -> Dict[str, Any]:
        """
        Compute RDKit descriptors for a single record.

        Every entry of RDKit's descriptor list is evaluated; a descriptor
        whose function raises is stored as ``None`` instead of aborting the
        whole record.

        :param record: Input record with SMILES and ID.
        :type record: dict
        :param smiles_key: Key for SMILES string in the record.
        :type smiles_key: str
        :param id_key: Key for record ID.
        :type id_key: str
        :raises KeyError: If required fields are missing (or ``None``).
        :raises ValueError: If SMILES cannot be parsed.
        :return: Descriptor dictionary with SMILES and ID fields.
        :rtype: dict

        Example
        -------
        >>> Descriptor._calc_rdkit_record(
        ...     {'id': 1, 'smiles': 'CC(=O)O'}, 'smiles', 'id'
        ... )
        {'id': 1, 'smiles': 'CC(=O)O', ...}
        """
        # Validate both fields (SMILES first, then ID) before parsing.
        smi = Descriptor._require_field(record, smiles_key, "SMILES")
        pid = Descriptor._require_field(record, id_key, "ID")

        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            raise ValueError(
                f"Invalid SMILES string '{smi}' in record {record}"
            )

        desc: Dict[str, Any] = {}
        for name, func in Descriptors._descList:
            # Deliberately best-effort: one failing descriptor must not
            # discard the rest of the molecule's results.
            try:
                desc[name] = func(mol)
            except Exception:
                desc[name] = None
        desc[id_key] = pid
        desc[smiles_key] = smi
        return desc

    def calculate(
        self,
        data: Union[pd.DataFrame, List[Dict[str, Any]]],
        n_jobs: int = 1,
        verbose: int = 0,
    ) -> Union[pd.DataFrame, List[Dict[str, Any]]]:
        """
        Compute descriptors in parallel for all records in `data`.

        The output type matches the input type: if you provide a DataFrame,
        you get a DataFrame; if you provide a list of dicts, you get a list.
        Each output row holds the computed descriptors plus the ID and the
        sequence/SMILES field; a DataFrame result is built fresh from those
        rows.

        :param data: Input data (pandas DataFrame or list of dicts), with
            fields for sequence/SMILES and ID.
        :type data: Union[pd.DataFrame, List[Dict[str, Any]]]
        :param n_jobs: Number of parallel jobs (joblib, -1 uses all
            available cores).
        :type n_jobs: int
        :param verbose: Verbosity for joblib parallel execution.
        :type verbose: int
        :raises TypeError: If input is not a DataFrame or list of dicts.
        :raises KeyError: If required keys are missing in input records.
        :raises ValueError: If SMILES cannot be parsed by RDKit.
        :return: Descriptor results, in the same format as the input.
        :rtype: Union[pd.DataFrame, List[Dict[str, Any]]]

        Example
        -------
        >>> descriptor = Descriptor(engine='peptides')
        >>> df = pd.DataFrame([{'id': 1, 'peptide_sequence': 'ACDE'}])
        >>> result = descriptor.calculate(df, n_jobs=1)
        """
        if isinstance(data, pd.DataFrame):
            records = data.to_dict(orient="records")
            as_dataframe = True
        elif isinstance(data, list):
            records = data
            as_dataframe = False
        else:
            raise TypeError(
                "`data` must be a pandas.DataFrame or a list of dicts"
            )

        # Hand joblib the static methods themselves (with explicit key
        # arguments) rather than local closures over `self`: they can be
        # serialized by reference and the instance is not shipped to workers.
        if self.engine == "peptides":
            worker = Descriptor._calc_peptide_record
            value_key = self.fasta_key
        else:
            worker = Descriptor._calc_rdkit_record
            value_key = self.smiles_key
        id_key = self.id_key

        results = Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(worker)(rec, value_key, id_key) for rec in records
        )
        return pd.DataFrame(results) if as_dataframe else results