# Source code for pepkit.query.query

import pandas as pd
import argparse
from pathlib import Path
from .rcsbapi import peptide_query
from .filter import main_parallel
from ..prep.sequence import extract_sequences
from .mmseqs2 import (
    mmseq2_run,
    _get_representatives,
    _get_unique_representatives
)


def query(
    quality,
    exp_method,
    release_date,
    length_cutoff,
    canonical_check,
    hetatm_check,
    csv_path,
    fasta_path,
    receptor_only,
    n_jobs,
):
    """
    Query, validate, and extract peptide–protein complexes from RCSB.

    End-to-end workflow:

    1. Query RCSB for candidate peptide–protein complexes using metadata
       constraints (resolution, experimental method, release date).
    2. Validate each PDB entry using structural and sequence-based criteria
       (peptide detection, length cutoff, canonical residues, HETATM presence).
    3. Write a metadata table (CSV) describing valid complexes.
    4. Extract corresponding sequences into a FASTA file for downstream
       modeling (e.g., AF-Multimer, docking, ML pipelines).

    The function is **side-effect driven**: results are written to disk
    (CSV + FASTA) and not returned explicitly.

    :param quality: Maximum allowed experimental resolution (in Å) used to
        query RCSB. Lower values correspond to higher-quality structures.
        Example: ``3.0``.
    :type quality: float
    :param exp_method: Experimental method used to solve the structure.
        Must match RCSB metadata exactly. Example: ``"X-RAY DIFFRACTION"``.
    :type exp_method: str
    :param release_date: Release date constraint for RCSB query. Either a
        dict ``{"from": YYYY-MM-DD, "to": YYYY-MM-DD}`` or a single date
        string (interpreted as lower bound).
    :type release_date: dict or str
    :param length_cutoff: Maximum allowed sequence length used for
        peptide/protein filtering (peptides are typically ≤ 50 residues).
    :type length_cutoff: int
    :param canonical_check: If ``True``, discard complexes containing
        non-canonical amino acids (e.g., ``X``) in any retained chain.
    :type canonical_check: bool
    :param hetatm_check: If ``True``, discard PDB entries containing HETATM
        records (e.g., ligands, cofactors, modified residues).
    :type hetatm_check: bool
    :param csv_path: Output path for the CSV metadata table describing valid
        peptide–protein complexes.
    :type csv_path: str or pathlib.Path
    :param fasta_path: Output path for the FASTA file containing extracted
        sequences; exact content depends on ``receptor_only``.
    :type fasta_path: str or pathlib.Path
    :param receptor_only: If ``True``, only receptor (protein) chains are
        written to FASTA; otherwise both peptide and protein chains.
    :type receptor_only: bool
    :param n_jobs: Number of parallel workers used for PDB validation.
        Passed to ``joblib.Parallel``.
    :type n_jobs: int
    :side effects:
        - Writes ``csv_path`` (CSV metadata)
        - Writes ``fasta_path`` (FASTA sequences)

    :example:
        >>> query(
        ...     quality=3.0,
        ...     exp_method="X-RAY DIFFRACTION",
        ...     release_date={"from": "2018-01-01", "to": "2018-01-08"},
        ...     length_cutoff=50,
        ...     canonical_check=True,
        ...     hetatm_check=True,
        ...     csv_path="demo.csv",
        ...     fasta_path="demo.fasta",
        ...     receptor_only=True,
        ...     n_jobs=4,
        ... )
    """
    peptide_complexes = peptide_query(
        quality=quality,
        exp_method=exp_method,
        release_date=release_date,
    )
    # Each result tuple is (valid, valid_chains, result, valid_sequences);
    # only the validity flag and the metadata dict are needed here.
    results = main_parallel(
        pdb_ids=peptide_complexes,
        length_cutoff=length_cutoff,
        canonical_check=canonical_check,
        hetatm_check=hetatm_check,
        n_jobs=n_jobs,
    )
    df = pd.DataFrame(
        [result for valid, _chains, result, _seqs in results if valid]
    )
    df.to_csv(csv_path, index=False)
    # NOTE(review): the full (unfiltered) results list is forwarded;
    # extract_sequences presumably re-checks validity — confirm upstream.
    extract_sequences(
        results,
        fasta_path=fasta_path,
        receptor_only=receptor_only,
    )
def main_process(
    quality,
    exp_method,
    core_release_date,
    train_release_date,
    length_cutoff,
    canonical_check,
    hetatm_check,
    core_csv_path,
    core_fasta_path,
    train_csv_path,
    train_fasta_path,
    final_csv_path,
    receptor_only,
    n_jobs,
):
    """
    Build a non-redundant "core" benchmark set of peptide–protein complexes.

    Steps:

    1. Query RCSB twice: a strict "core" set (with canonical/HETATM checks)
       and a permissive "train" set (checks disabled).
    2. Run MMseqs2 on the core FASTA to remove internal redundancy, then on
       core-representatives + train sequences combined to remove core
       entries that overlap the train set.
    3. Keep only core PDB IDs that are representatives in both clusterings
       and write the filtered metadata to ``final_csv_path``.

    :returns: Filtered core metadata table.
    :rtype: pandas.DataFrame
    :side effects: Writes core/train CSV+FASTA files, MMseqs2 output files,
        ``combined.fasta`` and ``final_csv_path``.
    """
    # 1. Query peptide-protein complexes from RCSB.
    # Core set: strict validation -> metadata + FASTA.
    query(
        quality=quality,
        exp_method=exp_method,
        release_date=core_release_date,
        length_cutoff=length_cutoff,
        canonical_check=canonical_check,
        hetatm_check=hetatm_check,
        csv_path=core_csv_path,
        fasta_path=core_fasta_path,
        receptor_only=receptor_only,
        n_jobs=n_jobs,
    )
    # Train set: checks deliberately disabled (used only for overlap removal).
    query(
        quality=quality,
        exp_method=exp_method,
        release_date=train_release_date,
        length_cutoff=length_cutoff,
        canonical_check=False,
        hetatm_check=False,
        csv_path=train_csv_path,
        fasta_path=train_fasta_path,
        receptor_only=receptor_only,
        n_jobs=n_jobs,
    )

    # 2. Run MMseqs2 to find representative sequences.
    # Remove redundancy within the core set.
    mmseq2_run(query_fasta=core_fasta_path)

    # Remove overlap of core vs. train: combine core representatives with
    # all train sequences, then re-cluster.
    core_dir = Path(core_fasta_path).parent
    core_stem = Path(core_fasta_path).stem
    combined_fasta_path = core_dir / "combined.fasta"
    # MMseqs2 naming convention: <stem>_out_rep_seq.fasta / <stem>_out_cluster.tsv
    core_cluster_rep_path = core_dir / f"{core_stem}_out_rep_seq.fasta"
    with (
        open(core_cluster_rep_path, "r") as core_file,
        open(train_fasta_path, "r") as train_file,
        open(combined_fasta_path, "w") as combined_file,
    ):
        combined_file.writelines(core_file.readlines())
        combined_file.writelines(train_file.readlines())
    mmseq2_run(query_fasta=combined_fasta_path)

    # Representatives from the core clustering and, from the combined
    # clustering, representatives of singleton clusters (no train overlap).
    core_representatives = _get_representatives(
        str(core_dir / f"{core_stem}_out_cluster.tsv")
    )
    combined_unique_representatives = _get_unique_representatives(
        str(combined_fasta_path.parent / "combined_out_cluster.tsv")
    )
    final_core_lists = list(
        set(core_representatives) & set(combined_unique_representatives)
    )
    # Strip chain/extension suffix ("1abc.A" -> "1abc") and deduplicate.
    final_core_lists = list({rep.split(".")[0] for rep in final_core_lists})

    # Filter core metadata down to the surviving PDB IDs.
    df = pd.read_csv(core_csv_path)
    final_df = df[df["pdb_id"].isin(final_core_lists)]
    final_df.to_csv(final_csv_path, index=False)
    return final_df


def parser():
    """Parse command-line arguments for the query pipeline."""
    parser = argparse.ArgumentParser(description="Process PDB files")
    parser.add_argument(
        "--quality", type=float, default=3.0, help="Quality of the PDB files"
    )
    parser.add_argument(
        "--exp_method",
        type=str,
        default="X-RAY DIFFRACTION",
        help="Experimental method used",
    )
    parser.add_argument(
        "--core_release_date",
        type=str,
        nargs="+",
        help="Release date range (YYYY-MM-DD YYYY-MM-DD)",
    )
    parser.add_argument(
        "--length_cutoff",
        type=int,
        default=50,
        help="Length cutoff for sequences",
    )
    # BooleanOptionalAction (unlike store_true with default=True, which made
    # these flags no-ops) also provides --no-<flag> to disable the check.
    parser.add_argument(
        "--canonical_check",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Check for canonical residues",
    )
    parser.add_argument(
        "--hetatm_check",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Check for HETATM records",
    )
    parser.add_argument(
        "--core_csv_path",
        type=str,
        default="core.csv",
        help="Path to core CSV file",
    )
    parser.add_argument(
        "--core_fasta_path",
        type=str,
        default="core.fasta",
        help="Path to core FASTA file",
    )
    parser.add_argument(
        "--train_csv_path",
        type=str,
        default="train.csv",
        help="Path to train CSV file",
    )
    parser.add_argument(
        "--train_fasta_path",
        type=str,
        default="train.fasta",
        help="Path to train FASTA file",
    )
    parser.add_argument(
        "--final_csv_path",
        type=str,
        default="final.csv",
        help="Path to final CSV file",
    )
    parser.add_argument(
        "--receptor_only",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Only include receptor chains",
    )
    parser.add_argument(
        "--n_jobs",
        type=int,
        default=8,
        help="Number of jobs for parallel processing",
    )
    return parser.parse_args()


def main():
    """CLI entry point: parse args, run the pipeline, preview the result."""
    args = parser()
    # BUG FIX: the original silently skipped these assignments when
    # --core_release_date was absent, then crashed with NameError below.
    # Fail fast with an actionable message instead.
    if not args.core_release_date or len(args.core_release_date) != 2:
        raise SystemExit(
            "--core_release_date requires exactly two dates: FROM TO "
            "(YYYY-MM-DD YYYY-MM-DD)"
        )
    # Train set: everything released from the lower bound onwards;
    # core set: the bounded date window.
    train_release_date = args.core_release_date[0]
    core_release_date = {
        "from": args.core_release_date[0],
        "to": args.core_release_date[1],
    }
    final_df = main_process(
        quality=args.quality,
        exp_method=args.exp_method,
        core_release_date=core_release_date,
        train_release_date=train_release_date,
        length_cutoff=args.length_cutoff,
        canonical_check=args.canonical_check,
        hetatm_check=args.hetatm_check,
        core_csv_path=args.core_csv_path,
        core_fasta_path=args.core_fasta_path,
        train_csv_path=args.train_csv_path,
        train_fasta_path=args.train_fasta_path,
        final_csv_path=args.final_csv_path,
        receptor_only=args.receptor_only,
        n_jobs=args.n_jobs,
    )
    print(final_df.head(5))


if __name__ == "__main__":
    main()