conda install -c conda-forge -c bioconda foldseek
folsseek createdb myDB_dir/ myDB
# myDB_dir has lots of pdb files
usage: foldseek easy-search <i:PDB|mmCIF[.gz]> ... <i:PDB|mmCIF[.gz]>|<i:stdin> <i:targetFastaFile[.gz]>|<i:targetDB> <o:alignmentFile> <tmpDir> [options]
By Martin Steinegger <martin.steinegger@snu.ac.kr>
options: prefilter:
--comp-bias-corr INT Correct for locally biased amino acid composition (range 0-1) [1]
--comp-bias-corr-scale FLOAT Correct for locally biased amino acid composition (range 0-1) [1.000]
--seed-sub-mat TWIN Substitution matrix file for k-mer generation [aa:3di.out,nucl:3di.out]
-s FLOAT Sensitivity: 1.0 faster; 4.0 fast; 7.5 sensitive [9.500]
-k INT k-mer length (0: automatically set to optimum) [6]
--target-search-mode INT target search mode (0: regular k-mer, 1: similar k-mer) [0]
--k-score TWIN k-mer threshold for generating similar k-mer lists [seq:2147483647,prof:2147483647]
--max-seqs INT Maximum results per query sequence allowed to pass the prefilter (affects sensitivity) [1000]
--split INT Split input into N equally distributed chunks. 0: set the best split automatically [0]
--split-mode INT 0: split target db; 1: split query db; 2: auto, depending on main memory [2]
--split-memory-limit BYTE Set max memory per split. E.g. 800B, 5K, 10M, 1G. Default (0) to all available system memory [0]
--diag-score BOOL Use ungapped diagonal scoring during prefilter [1]
--exact-kmer-matching INT Extract only exact k-mers for matching (range 0-1) [0]
--mask INT Mask sequences in k-mer stage: 0: w/o low complexity masking, 1: with low complexity masking [0]
--mask-prob FLOAT Mask sequences is probablity is above threshold [1.000]
--mask-lower-case INT Lowercase letters will be excluded from k-mer search 0: include region, 1: exclude region [1]
--min-ungapped-score INT Accept only matches with ungapped alignment score above threshold [30]
--spaced-kmer-mode INT 0: use consecutive positions in k-mers; 1: use spaced k-mers [1]
--spaced-kmer-pattern STR User-specified spaced k-mer pattern []
--local-tmp STR Path where some of the temporary files will be created []
--exhaustive-search BOOL Turns on an exhaustive all vs all search by by passing the prefilter step [0]
align:
--min-seq-id FLOAT List matches above this sequence identity (for clustering) (range 0.0-1.0) [0.000]
-c FLOAT List matches above this fraction of aligned (covered) residues (see --cov-mode) [0.000]
--cov-mode INT 0: coverage of query and target
1: coverage of target
2: coverage of query
3: target seq. length has to be at least x% of query length
4: query seq. length has to be at least x% of target length
5: short seq. needs to be at least x% of the other seq. length [0]
--max-rejected INT Maximum rejected alignments before alignment calculation for a query is stopped [2147483647]
--max-accept INT Maximum accepted alignments before alignment calculation for a query is stopped [2147483647]
-a BOOL Add backtrace string (convert to alignments with mmseqs convertalis module) [0]
--sort-by-structure-bits INT sort by bits*sqrt(alnlddt*alntmscore) [1]
--alignment-mode INT How to compute the alignment:
0: automatic
1: only score and end_pos
2: also start_pos and cov
3: also seq.id [3]
--alignment-output-mode INT How to compute the alignment:
0: automatic
1: only score and end_pos
2: also start_pos and cov
3: also seq.id
4: only ungapped alignment
5: score only (output) cluster format [0]
-e DOUBLE List matches below this E-value (range 0.0-inf) [1.000E+01]
--min-aln-len INT Minimum alignment length (range 0-INT_MAX) [0]
--seq-id-mode INT 0: alignment length 1: shorter, 2: longer sequence [0]
--alt-ali INT Show up to this many alternative alignments [0]
--gap-open TWIN Gap open cost [aa:10,nucl:10]
--gap-extend TWIN Gap extension cost [aa:1,nucl:1]
profile:
--num-iterations INT Number of iterative profile search iterations [1]
misc:
--tmscore-threshold FLOAT accept alignments with a tmsore > thr [0.0,1.0] [0.000]
--tmalign-hit-order INT order hits by 0: (qTM+tTM)/2, 1: qTM, 2: tTM, 3: min(qTM,tTM) 4: max(qTM,tTM) [0]
--tmalign-fast INT turn on fast search in TM-align [1]
--lddt-threshold FLOAT accept alignments with a lddt > thr [0.0,1.0] [0.000]
--alignment-type INT How to compute the alignment:
0: 3di alignment
1: TM alignment
2: 3Di+AA [2]
--exact-tmscore INT turn on fast exact TMscore (slow), default is approximate [0]
--prefilter-mode INT prefilter mode: 0: kmer/ungapped 1: ungapped, 2: nofilter [0]
--cluster-search INT first find representative then align all cluster members [0]
--mask-bfactor-threshold FLOAT mask residues for seeding if b-factor < thr [0,100] [0.000]
--input-format INT Format of input structures:
0: Auto-detect by extension
1: PDB
2: mmCIF
3: mmJSON
4: ChemComp
5: Foldcomp [0]
--file-include STR Include file names based on this regex [.*]
--file-exclude STR Exclude file names based on this regex [^$]
--format-mode INT Output format:
0: BLAST-TAB
1: SAM
2: BLAST-TAB + query/db length
3: Pretty HTML
4: BLAST-TAB + column headers
5: Calpha only PDB super-posed to query
BLAST-TAB (0) and BLAST-TAB + column headers (4)support custom output formats (--format-output)
(5) Superposed PDB files (Calpha only) [0]
--format-output STR Choose comma separated list of output columns from: query,target,evalue,gapopen,pident,fident,nident,qstart,qend,qlen
tstart,tend,tlen,alnlen,raw,bits,cigar,qseq,tseq,qheader,theader,qaln,taln,mismatch,qcov,tcov
qset,qsetid,tset,tsetid,taxid,taxname,taxlineage,
lddt,lddtfull,qca,tca,t,u,qtmscore,ttmscore,alntmscore,rmsd,prob
complexqtmscore,complexttmscore,complexu,complext,complexassignid
[query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits]
--greedy-best-hits BOOL Choose the best hits greedily to cover the query [0]
common:
--db-load-mode INT Database preload mode 0: auto, 1: fread, 2: mmap, 3: mmap+touch [0]
--threads INT Number of CPU-cores used (all by default) [40]
-v INT Verbosity level: 0: quiet, 1: +errors, 2: +warnings, 3: +info [3]
--sub-mat TWIN Substitution matrix file [aa:3di.out,nucl:3di.out]
--max-seq-len INT Maximum sequence length [65535]
--compressed INT Write compressed output [0]
--remove-tmp-files BOOL Delete temporary files [1]
--mpi-runner STR Use MPI on compute cluster with this MPI command (e.g. "mpirun -np 42") []
--force-reuse BOOL Reuse tmp filse in tmp/latest folder ignoring parameters and version changes [0]
--prostt5-model STR Path to ProstT5 model []
expert:
--zdrop INT Maximal allowed difference between score values before alignment is truncated (nucleotide alignment only) [40]
--taxon-list STR Taxonomy ID, possibly multiple values separated by ',' []
--chain-name-mode INT Add chain to name:
0: auto
1: always add
[0]
--write-mapping INT write _mapping file containing mapping from internal id to taxonomic identifier [0]
--coord-store-mode INT Coordinate storage mode:
1: C-alpha as float
2: C-alpha as difference (uint16_t) [2]
--write-lookup INT write .lookup file containing mapping from internal id, fasta id and file number [1]
--db-output BOOL Return a result DB instead of a text file [0]
examples:
# Search a single/multiple PDB file against a set of PDB files
foldseek easy-search examples/d1asha_ examples/ result.m8 tmp
# Format output differently
foldseek easy-search examples/d1asha_ examples/ result.m8 tmp --format-output query,target,qstart,tstart,cigar
# Align with TMalign (global)
foldseek easy-search examples/d1asha_ examples/ result.m8 tmp --alignment-type 1
# Skip prefilter and perform an exhaustive alignment (slower but more sensitive)
foldseek easy-search examples/d1asha_ examples/ result.m8 tmp --exhaustive-search 1
#!/bin/bash
source ~/miniconda3/etc/profile.d/conda.sh
conda activate foldseek
pdb_DB="/(...)/lihuilin/myfoldseek/PDB_db_dir/myPDBdb"
afdb_DB="/(...)/lihuilin/myfoldseek/AFDB_db_dir/myAFDBdb"
query_pdb="/(...)/lihuilin/query/xxx.pdb"
cd /(...)/lihuilin/myfoldseek
mkdir -p xxx_all
cd xxx_all
# foldseek easy-search $query_pdb $afdb_DB xxx_mut_afdb_tms_thr_abo_0_4_result.m8 tmp --tmscore-threshold 0.4 --alignment-type 1 --format-output query,target,evalue,pident,fident,bits,tseq,qtmscore,ttmscore,alntmscore,rmsd,prob
# foldseek easy-search $query_pdb $pdb_DB xxx_mut_pdbdb_tms_thr_abo_0_4_result.m8 tmp --tmscore-threshold 0.4 --alignment-type 1 --format-output query,target,evalue,pident,fident,bits,tseq,qtmscore,ttmscore,alntmscore,rmsd,prob
afdb_res_m8="/(...)/lihuilin/myfoldseek/xxx_mut_afdb_tms_thr_abo_0_4_result.m8"
pdb_res_m8="/(...)/lihuilin/myfoldseek/xxx_mut_pdbdb_tms_thr_abo_0_4_result.m8"
PDBdb_dir="/(...)/lihuilin/DB/pure_PDB_DB_by_Chain/"
AFDBdb_dir="/(...)/lihuilin/DB/AFDB_pLDDT_70"
afdb_res_dir="/(...)/lihuilin/myfoldseek/xxx_mut_afdb_tms_thr_abo_0_4_result_dir"
pdb_res_dir="/(...)/lihuilin/myfoldseek/xxx_mut_pdbdb_tms_thr_abo_0_4_result_dir"
mkdir -p $afdb_res_dir
mkdir -p $pdb_res_dir
# for afdb
while read -r line; do
ID=$(echo "$line" | awk '{print $2}')
inp_fp="$AFDBdb_dir/$ID.pdb"
oup_fp="$afdb_res_dir/$ID.pdb"
cp "$inp_fp" "$oup_fp"
done < $afdb_res_m8
# for pdb
while read -r line; do
ID=$(echo "$line" | awk '{print $2}')
inp_fp="$PDBdb_dir$ID.pdb"
oup_fp="$pdb_res_dir/$ID.pdb"
cp "$inp_fp" "$oup_fp"
done < $pdb_res_m8