Dataset

ftp.ensembl.org

https://ftp.ensembl.org/pub/current_fasta/

Size (no duplicates): 11,551,135 protein sequences (~11.5 million)

# download every *.pep.all.fa.gz (peptide FASTA) archive from the Ensembl FTP mirror
wget -r -np -e robots=off -A 'pep.all.fa.gz' https://ftp.ensembl.org/pub/current_fasta/
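# flags: -r recurse into subdirectories, -np never ascend above current_fasta/,
# -e robots=off ignore robots.txt, -A accept only files matching the given suffix
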
import os, gzip, shutil
import pandas as pd
from Bio import SeqIO

# unzip every peptide archive (files under a pep/ subdirectory) into one flat folder
start_dir = "./ftp.ensembl.org/pub/current_fasta/"
destination_dir = "./fa_save/"
os.makedirs(destination_dir, exist_ok=True)  # make sure the output folder exists
pep_archives = []
for root, dirs, files in os.walk(start_dir):
    for gz_name in files:
        gz_file = os.path.join(root, gz_name)
        if "/pep/" in gz_file:
            pep_archives.append(gz_file)  # keep a record of processed archives
            out_file = os.path.join(destination_dir, gz_name[:-len(".gz")])
            with gzip.open(gz_file, "rb") as f_in, open(out_file, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
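
The read step below expects a single file, ./fa_save/all_data.fa, but the loop above writes one .fa per species. The original notes skip the merge, so here is a minimal sketch of that assumed step (the output filename is taken from the read step):

# assumed step: concatenate every extracted FASTA into all_data.fa
import glob, shutil

with open("./fa_save/all_data.fa", "wb") as merged:
    for fa_path in sorted(glob.glob("./fa_save/*.fa")):
        if fa_path.endswith("all_data.fa"):
            continue  # skip the output file itself once it exists
        with open(fa_path, "rb") as part:
            shutil.copyfileobj(part, merged)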
# read the merged FASTA into parallel lists of record IDs and sequences
identifiers = []
seqs = []
with open("./fa_save/all_data.fa") as fasta_file:
    for seq_record in SeqIO.parse(fasta_file, "fasta"):
        identifiers.append(seq_record.id)
        seqs.append(str(seq_record.seq))
# build a DataFrame and drop duplicate sequences (the same amino-acid
# string can appear under multiple protein IDs)
df_pro = pd.DataFrame({"aa_seq": seqs, "p_id": identifiers})
df_pro_drop_dup = df_pro.drop_duplicates(subset="aa_seq")
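
As a sanity check against the size quoted at the top, print the counts before and after deduplication; the exact numbers depend on the Ensembl release that was downloaded:

# row counts before/after dropping duplicate sequences; 11,551,135 unique
# sequences is the figure quoted above, so treat it as a reference point
print(f"total records:  {len(df_pro):,}")
print(f"unique aa_seq:  {len(df_pro_drop_dup):,}")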