Use unipressed.UniprotkbClient to analyze the UniProt (Swiss-Prot) database.

For each entry we can retrieve the following information (example record):

{'AFDB': 'hasAFDB',
 'PDB': 'no',
 'annotationScore': 1.0,
 'comments': nan,
 'entryAudit': "{'firstPublicDate': '1988-08-01', 'lastAnnotationUpdateDate': "
               "'2022-05-25', 'lastSequenceUpdateDate': '1988-08-01', "
               "'entryVersion': 36, 'sequenceVersion': 1}",
 'entryType': 'UniProtKB reviewed (Swiss-Prot)',
 'extraAttributes': "{'countByFeatureType': {'Chain': 1}, 'uniParcId': "
                    "'UPI000013C2E9'}",
 'features': "[{'type': 'Chain', 'location': {'start': {'value': 1, "
             "'modifier': 'EXACT'}, 'end': {'value': 79, 'modifier': "
             "'EXACT'}}, 'description': 'Putative uncharacterized protein Z', "
             "'featureId': 'PRO_0000066556'}]",
 'geneLocations': nan,
 'genes': nan,
 'index': 860,
 'keywords': "[{'id': 'KW-1185', 'category': 'Technical term', 'name': "
             "'Reference proteome'}]",
 'organism': "{'scientificName': 'Ovis aries', 'commonName': 'Sheep', "
             "'taxonId': 9940, 'lineage': ['Eukaryota', 'Metazoa', 'Chordata', "
             "'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', "
             "'Eutheria', 'Laurasiatheria', 'Artiodactyla', 'Ruminantia', "
             "'Pecora', 'Bovidae', 'Caprinae', 'Ovis']}",
 'organismHosts': nan,
 'primaryAccession': 'P08105',
 'proteinDescription': "{'recommendedName': {'fullName': {'value': 'Putative "
                       "uncharacterized protein Z'}}}",
 'proteinExistence': '4: Predicted',
 'references': "[{'referenceNumber': 1, 'citation': {'id': '6193483', "
               "'citationType': 'journal article', 'authors': ['Powell B.C.', "
               "'Sleigh M.J.', 'Ward K.A.', 'Rogers G.E.'], "
               "'citationCrossReferences': [{'database': 'PubMed', 'id': "
               "'6193483'}, {'database': 'DOI', 'id': "
               "'10.1093/nar/11.16.5327'}], 'title': 'Mammalian keratin gene "
               'families: organisation of genes coding for the B2 high-sulphur '
               "proteins of sheep wool.', 'publicationDate': '1983', "
               "'journal': 'Nucleic Acids Res.', 'firstPage': '5327', "
               "'lastPage': '5346', 'volume': '11'}, 'referencePositions': "
               "['NUCLEOTIDE SEQUENCE [GENOMIC DNA]']}]",
 'secondaryAccessions': nan,
 'sequence': "{'value': "
             "'MSSSLEITSFYSFIWTPHIGPLLFGIGLWFSMFKEPSHFCPCQHPHFVEVVIPCDSLSRSLRLRVIVLFLAIFFPLLNI', "
             "'length': 79, 'molWeight': 9128, 'crc64': 'A663EB489F6290C3', "
             "'md5': '80A5E20AFD024495655E6A9FA7B6166B'}",
 'uniProtKBCrossReferences': "[{'database': 'EMBL', 'id': 'X01610', "
                             "'properties': [{'key': 'ProteinId', 'value': "
                             "'CAA25758.1'}, {'key': 'Status', 'value': '-'}, "
                             "{'key': 'MoleculeType', 'value': "
                             "'Genomic_DNA'}]}, {'database': 'PIR', 'id': "
                             "'S07912', 'properties': [{'key': 'EntryName', "
                             "'value': 'S07912'}]}, {'database': "
                             "'AlphaFoldDB', 'id': 'P08105', 'properties': "
                             "[{'key': 'Description', 'value': '-'}]}, "
                             "{'database': 'Proteomes', 'id': 'UP000002356', "
                             "'properties': [{'key': 'Component', 'value': "
                             "'Unplaced'}]}]",
 'uniProtkbId': 'Z_SHEEP'}
from unipressed import UniprotkbClient
import pandas as pd

# Entries in UniProt (Swiss-Prot): one accession per line.
# The context manager closes the file even if reading fails. Blank lines
# (typically the trailing newline at the end of the file) are dropped so they
# are neither counted as entries nor sent to the UniProt API as empty IDs.
with open("/path/uniprot_sprot.fasta/SwissProt_entries.txt", "r") as uniprot_sp_entries_txt:
    uniprot_sp_entries = [
        line.strip() for line in uniprot_sp_entries_txt if line.strip()
    ]
# Sanity check: total number of entries and the last three accessions.
print(len(uniprot_sp_entries), uniprot_sp_entries[-3:])

# Split all Swiss-Prot entries into consecutive chunks of at most CHUNK_SIZE
# accessions, so that each chunk can be fetched with a single batched request.
CHUNK_SIZE = 1000
chunks = [
    uniprot_sp_entries[start:start + CHUNK_SIZE]
    for start in range(0, len(uniprot_sp_entries), CHUNK_SIZE)
]
# An empty entry list yields no chunks at all; report a last-chunk size of 0
# instead of failing on chunks[-1].
last_chunk_size = len(chunks[-1]) if chunks else 0
print("We have ", len(chunks), " chunks, and the last chunk has ", last_chunk_size," entries.")


# Analyze the entries chunk by chunk using unipressed.
# Accessions of entries that carry no "uniProtKBCrossReferences" value, and
# the running total of such entries across all chunks processed so far.
no_uniProtKBCrossReferences_list = []
no_uniProtKBCrossReferences_count = 0


def _database_flag(cross_refs, database, label):
    """Return `label` if any cross-reference names `database`, else "no".

    `cross_refs` is one cell of the "uniProtKBCrossReferences" column: an
    iterable of dicts, each expected to carry a "database" key. An empty
    iterable simply yields "no".
    """
    databases = {ref.get("database") for ref in cross_refs}
    return label if database in databases else "no"


for i, chunk in enumerate(chunks):
    print(i, end=": ")
    # Fetch every entry of this chunk in one batched request.
    db_dic = UniprotkbClient.fetch_many(chunk)
    df = pd.DataFrame(db_dic)
    print("chunk size is ", len(df), end="; ")

    # Rows that do / do not have any cross-references.
    has_cross_refs = df["uniProtKBCrossReferences"].notna()

    # no_uniProtKBCrossReferences list and count
    no_uniProtKBCrossReferences_primaryAccession_list = df.loc[~has_cross_refs, "primaryAccession"].tolist()
    no_uniProtKBCrossReferences_list.extend(no_uniProtKBCrossReferences_primaryAccession_list)
    no_uniProtKBCrossReferences_count += len(no_uniProtKBCrossReferences_primaryAccession_list)
    print("no_uniProtKBCrossReferences_count is ", no_uniProtKBCrossReferences_count)

    # Entries that do have cross-references. Take an explicit copy so the new
    # columns are added to an independent frame rather than to a slice of `df`
    # (which triggers pandas' SettingWithCopyWarning).
    db_df = df[has_cross_refs].copy()

    # Flag whether each entry links to a PDB structure and/or an AlphaFold DB
    # model. The database names are checked directly on the list of dicts;
    # building a temporary DataFrame per entry is slow and fails on an empty
    # cross-reference list.
    cross_ref_column = db_df["uniProtKBCrossReferences"]
    db_df["PDB"] = [_database_flag(refs, "PDB", "hasPDB") for refs in cross_ref_column]
    db_df["AFDB"] = [_database_flag(refs, "AlphaFoldDB", "hasAFDB") for refs in cross_ref_column]

    # One CSV per chunk, named after the chunk index.
    db_df.to_csv("/path/UniprotkbClient_results/db_df_"+str(i)+".csv")




# Write the accessions that have no cross-references to a text file, one per
# line. The context manager guarantees the file is flushed and closed even if
# a write fails part-way through.
with open("/path/UniprotkbClient_results/no_uniProtKBCrossReferences_entries.txt", "w") as no_uniProtKBCrossReferences_file:
    no_uniProtKBCrossReferences_file.writelines(
        line + "\n" for line in no_uniProtKBCrossReferences_list
    )