# Source code for pepkit.chem.desc.descriptor

import pandas as pd
from peptides import Peptide
from rdkit import Chem
from rdkit.Chem import Descriptors
from joblib import Parallel, delayed
from typing import List, Dict, Union, Any



# [docs]  (documentation-link marker left over from the rendered page; not code)
class Descriptor:
    """
    Compute molecular or peptide descriptors for a collection of records.

    This class provides descriptor calculation for peptides or small molecules,
    supporting two engines:
      - 'peptides': Uses the `peptides` Python package for peptide descriptors.
      - 'rdkit': Uses RDKit for general molecular descriptors from SMILES.

    :param engine: Descriptor engine ('peptides' for peptide descriptors,
        'rdkit' for molecular descriptors). Matched case-insensitively.
    :type engine: str
    :param fasta_key: Key for the peptide sequence in input records or
        DataFrame (used only by 'peptides').
    :type fasta_key: str
    :param id_key: Key for unique record identifiers in input.
    :type id_key: str
    :param smiles_key: Key for SMILES string in input records
        (used only by 'rdkit').
    :type smiles_key: str

    Example
    -------
    >>> descriptor = Descriptor(engine='peptides')
    >>> records = [{'id': 1, 'peptide_sequence': 'ACDE'}]
    >>> results = descriptor.calculate(records, n_jobs=2)
    >>> descriptor = Descriptor(engine='rdkit')
    >>> records = [{'id': 1, 'smiles': 'CC(=O)O'}]
    >>> results = descriptor.calculate(records, n_jobs=4)
    """

    SUPPORTED_ENGINES = {"peptides", "rdkit"}

    def __init__(
        self,
        engine: str = "peptides",
        fasta_key: str = "peptide_sequence",
        id_key: str = "id",
        smiles_key: str = "smiles",
    ) -> None:
        """
        Initialize the Descriptor calculator.

        :param engine: Descriptor engine to use ('peptides' or 'rdkit').
            The value is lower-cased before validation.
        :type engine: str
        :param fasta_key: Sequence key in each record (used only by
            'peptides').
        :type fasta_key: str
        :param id_key: ID key in each record.
        :type id_key: str
        :param smiles_key: SMILES key in each record (used only by 'rdkit').
        :type smiles_key: str
        :raises ValueError: If unsupported engine is specified.
        """
        engine = engine.lower()
        if engine not in self.SUPPORTED_ENGINES:
            # Sort the choices so the message is deterministic (set order is
            # not), and keep a space between the two sentences.
            raise ValueError(
                f"Unsupported engine '{engine}'. "
                f"Choose from {sorted(self.SUPPORTED_ENGINES)}."
            )
        self.engine = engine
        self.fasta_key = fasta_key
        self.id_key = id_key
        self.smiles_key = smiles_key

    @staticmethod
    def _calc_peptide_record(
        record: Dict[str, Any], fasta_key: str, id_key: str
    ) -> Dict[str, Any]:
        """
        Compute peptide-specific descriptors using the `peptides` library.

        :param record: Input record with sequence and ID.
        :type record: dict
        :param fasta_key: Key for sequence in the record.
        :type fasta_key: str
        :param id_key: Key for record ID.
        :type id_key: str
        :raises KeyError: If the sequence or the ID is absent (or ``None``).
        :return: Descriptor dictionary with sequence and ID fields.
        :rtype: dict

        Example
        -------
        >>> rec = {'id': 1, 'peptide_sequence': 'ACD'}
        >>> out = Descriptor._calc_peptide_record(
        ...     rec, 'peptide_sequence', 'id')
        >>> out['id'], out['peptide_sequence']
        (1, 'ACD')
        """
        seq = record.get(fasta_key)
        pid = record.get(id_key)
        # A key that is present but holds None is treated the same as a
        # missing key.
        if seq is None:
            raise KeyError(
                f"Missing sequence under key '{fasta_key}' in record {record}"
            )
        if pid is None:
            raise KeyError(
                f"Missing ID under key '{id_key}' in record {record}"
            )
        desc = Peptide(seq).descriptors()
        # Written after the descriptors, so the ID and sequence win over any
        # descriptor that happens to share the same name.
        desc[id_key] = pid
        desc[fasta_key] = seq
        return desc

    @staticmethod
    def _calc_rdkit_record(
        record: Dict[str, Any], smiles_key: str, id_key: str
    ) -> Dict[str, Any]:
        """
        Compute RDKit descriptors for a single record.

        :param record: Input record with SMILES and ID.
        :type record: dict
        :param smiles_key: Key for SMILES string in the record.
        :type smiles_key: str
        :param id_key: Key for record ID.
        :type id_key: str
        :raises KeyError: If the SMILES or the ID is absent (or ``None``).
        :raises ValueError: If SMILES cannot be parsed.
        :return: Descriptor dictionary with SMILES and ID fields.
        :rtype: dict

        Example
        -------
        >>> rec = {'id': 1, 'smiles': 'CC(=O)O'}
        >>> out = Descriptor._calc_rdkit_record(rec, 'smiles', 'id')
        >>> out['id'], out['smiles']
        (1, 'CC(=O)O')
        """
        smi = record.get(smiles_key)
        pid = record.get(id_key)
        # A key that is present but holds None is treated the same as a
        # missing key.
        if smi is None:
            raise KeyError(
                f"Missing SMILES under key '{smiles_key}' in record {record}"
            )
        if pid is None:
            raise KeyError(
                f"Missing ID under key '{id_key}' in record {record}"
            )
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            raise ValueError(
                f"Invalid SMILES string '{smi}' in record {record}"
            )
        desc: Dict[str, Any] = {}
        for name, func in Descriptors._descList:
            # Best effort: one failing descriptor is recorded as None instead
            # of aborting the whole record.
            try:
                desc[name] = func(mol)
            except Exception:
                desc[name] = None
        # Written after the descriptors, so the ID and SMILES win over any
        # descriptor that happens to share the same name.
        desc[id_key] = pid
        desc[smiles_key] = smi
        return desc

    def calculate(
        self,
        data: Union[pd.DataFrame, List[Dict[str, Any]]],
        n_jobs: int = 1,
        verbose: int = 0,
    ) -> Union[pd.DataFrame, List[Dict[str, Any]]]:
        """
        Compute descriptors in parallel for all records in `data`.

        The output type matches the input type: if you provide a DataFrame,
        you get a DataFrame; if you provide a list of dicts, you get a list.
        Each output row holds the computed descriptors plus the ID and the
        sequence/SMILES value; other input fields are not carried over.

        :param data: Input data (pandas DataFrame or list of dicts), with
            fields for sequence/SMILES and ID.
        :type data: Union[pd.DataFrame, List[Dict[str, Any]]]
        :param n_jobs: Number of parallel jobs (joblib, -1 uses all available
            cores).
        :type n_jobs: int
        :param verbose: Verbosity for joblib parallel execution.
        :type verbose: int
        :raises TypeError: If input is not a DataFrame or a list.
        :raises KeyError: If required keys are missing in input records.
        :raises ValueError: If SMILES cannot be parsed by RDKit.
        :return: Descriptor results, in the same format as the input.
        :rtype: Union[pd.DataFrame, List[Dict[str, Any]]]

        Example
        -------
        >>> descriptor = Descriptor(engine='peptides')
        >>> df = pd.DataFrame([{'id': 1, 'peptide_sequence': 'ACDE'}])
        >>> result = descriptor.calculate(df, n_jobs=1)
        """
        if isinstance(data, pd.DataFrame):
            records = data.to_dict(orient="records")
            as_dataframe = True
        elif isinstance(data, list):
            records = data
            as_dataframe = False
        else:
            raise TypeError(
                "`data` must be a pandas.DataFrame or a list of dicts"
            )

        # Hand the static workers to joblib directly (no closure over `self`),
        # so only the record and the two key names travel to each worker.
        if self.engine == "peptides":
            worker, value_key = self._calc_peptide_record, self.fasta_key
        else:
            worker, value_key = self._calc_rdkit_record, self.smiles_key

        if records:
            results = Parallel(n_jobs=n_jobs, verbose=verbose)(
                delayed(worker)(rec, value_key, self.id_key)
                for rec in records
            )
        else:
            # Nothing to compute: skip joblib entirely for empty input.
            results = []

        return pd.DataFrame(results) if as_dataframe else results