Source code for pepkit.chem.conversion.conversion

"""Convenience conversion functions exported by pepkit.chem."""

from __future__ import annotations
from typing import Optional
from rdkit import Chem
from .peptide_decoder import PeptideDecoder

# Restrict ``from <module> import *`` to the two converter functions
# defined in this module.
__all__ = ["smiles_to_fasta", "fasta_to_smiles"]



[docs]
def smiles_to_fasta(
    smiles: str, header: Optional[str] = None, split: bool = False
) -> str:
    """Decode a peptide SMILES string into a FASTA record or bare sequence.

    The SMILES is handed to :class:`PeptideDecoder` and the decoder's
    ``sequence`` attribute is used as the result. With the default
    ``split=False`` that sequence is wrapped as a FASTA record::

        >[header]
        SEQUENCE

    (with a trailing newline after the sequence). With ``split=True`` the
    sequence is returned as-is (e.g. "GPG"), with no header line and no
    trailing newline.

    :param smiles: Input SMILES representing a linear peptide.
    :param header: Optional header text (without the leading '>'). ``None``
        or an empty string yields a bare ``>`` header line. Ignored when
        ``split=True``.
    :param split: If True, return the raw sequence instead of FASTA.
    :returns: FASTA-formatted string (default) or raw sequence
        (if ``split=True``).
    :raises ValueError: On parse/decoding failure (as documented for the
        decoder; any exception it raises propagates unchanged).
    """
    sequence = PeptideDecoder().from_smiles(smiles).decode().sequence

    if not split:
        # A falsy header (None or "") collapses to an empty title, which
        # leaves just ">" on the header line.
        title = header or ""
        return f">{title}\n{sequence}\n"

    return sequence




[docs]
def fasta_to_smiles(fasta: str) -> str:
    """Build a canonical, isomeric SMILES from a one-letter sequence via RDKit.

    The input is expected to be a header-less one-letter amino-acid
    sequence. Any occurrence of the placeholder residue 'X' (checked
    case-insensitively) is rejected before RDKit is invoked.

    :param fasta: Amino-acid sequence in one-letter code.
    :returns: SMILES produced by ``Chem.MolToSmiles`` with
        ``canonical=True`` and ``isomericSmiles=True``.
    :raises ValueError: If the sequence contains 'X'/'x', or if
        ``Chem.MolFromFASTA`` returns ``None`` for the input.
    """
    normalized = fasta.upper()
    if "X" in normalized:
        raise ValueError("Non-canonical residue 'X' found in FASTA.")

    # RDKit receives the caller's original string, not the upper-cased copy.
    molecule = Chem.MolFromFASTA(fasta)
    if molecule is not None:
        return Chem.MolToSmiles(molecule, canonical=True, isomericSmiles=True)

    raise ValueError(f"Could not parse FASTA: {fasta!r}")