-
Notifications
You must be signed in to change notification settings - Fork 0
/
assignment.py
144 lines (120 loc) · 6.62 KB
/
assignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from itertools import product, combinations
from collections import Counter
translation_table = {'TTT': 'F', 'TCT': 'S', 'TAT': 'Y', 'TGT': 'C',
'TTC': 'F', 'TCC': 'S', 'TAC': 'Y', 'TGC': 'C',
'TTA': 'L', 'TCA': 'S', 'TAA': '*', 'TGA': '*',
'TTG': 'L', 'TCG': 'S', 'TAG': '*', 'TGG': 'W',
'CTT': 'L', 'CCT': 'P', 'CAT': 'H', 'CGT': 'R',
'CTC': 'L', 'CCC': 'P', 'CAC': 'H', 'CGC': 'R',
'CTA': 'L', 'CCA': 'P', 'CAA': 'Q', 'CGA': 'R',
'CTG': 'L', 'CCG': 'P', 'CAG': 'Q', 'CGG': 'R',
'ATT': 'I', 'ACT': 'T', 'AAT': 'N', 'AGT': 'S',
'ATC': 'I', 'ACC': 'T', 'AAC': 'N', 'AGC': 'S',
'ATA': 'I', 'ACA': 'T', 'AAA': 'K', 'AGA': 'R',
'ATG': 'M', 'ACG': 'T', 'AAG': 'K', 'AGG': 'R',
'GTT': 'V', 'GCT': 'A', 'GAT': 'D', 'GGT': 'G',
'GTC': 'V', 'GCC': 'A', 'GAC': 'D', 'GGC': 'G',
'GTA': 'V', 'GCA': 'A', 'GAA': 'E', 'GGA': 'G',
'GTG': 'V', 'GCG': 'A', 'GAG': 'E', 'GGG': 'G'
}
# nomenclature for degenerate codons
expanded_code = {'A': ['A'], 'C': ['C'], 'G': ['G'], 'T': ['T'],
'W': ['A', 'T'], 'S': ['C', 'G'], 'M': ['A', 'C'], 'K': ['G', 'T'], 'R': ['A', 'G'], 'Y': ['C', 'T'],
'B': ['C', 'G', 'T'], 'D': ['A', 'G', 'T'], 'H': ['A', 'C', 'T'], 'V': ['A', 'C', 'G'],
'N': ['A', 'C', 'G', 'T']
}
# helpful for validating input
valid_nucleotides = 'ACGTWSMKRYBDHVN'
valid_aa = 'GAVLIMFWPSTCYNQDEKRH*'
# calculate translation table including codons with nontraditional bases
# value of each codon will be a counter where the keys are the amino acids it can
# generate and the value will be the number of ways it can generate that AA
expanded_tt = {}
# examine each possible codon using expanded code
for codon in product(expanded_code.keys(), repeat = 3):
base1 = expanded_code[codon[0]]
base2 = expanded_code[codon[1]]
base3 = expanded_code[codon[2]]
# if current codon is degenerate, find all traditional codons it can encode for
trad_codons = []
for base1_opt in base1:
for base2_opt in base2:
for base3_opt in base3:
trad_codons.append(base1_opt + base2_opt + base3_opt)
# use possible traditional codons to determine what amino acids current codon can code for
poss_aas = [translation_table[codon] for codon in trad_codons]
expanded_tt[''.join(codon)] = Counter(poss_aas)
def get_codon_for_amino_acids(amino_acids):
"""
:param amino_acids: set
the amino acids we want to code for, i.e. {'A','I','V'}
:rtype: set, float
returns two values the set of most efficient codons for the input set list, e.g. {'RYA', 'RYH', 'RYC', 'RYW', 'RYM', 'RYY', 'RYT'} and the achieved efficiency e.g. 0.75
"""
# validate input
for aa in amino_acids:
if aa not in valid_aa:
raise('Input contains an invalid amino acid')
# initalize tracker for maximum efficiency and codons that achieve it
max_eff = 0
max_eff_codons = set()
# check each possible degenerate codon
for codon, aa in expanded_tt.items():
# if current degenerate codon can generate all input AAs and cannot generate a stop
if all(aa in expanded_tt[codon] for aa in amino_acids) and '*' not in expanded_tt[codon]:
# determine efficiency of using current codon to generate input AAs
counter = expanded_tt[codon]
occurrences = sum(counter[aa] for aa in set(amino_acids))
eff = occurrences/sum(counter[aa] for aa in counter.keys())
# if efficiency of current codon is greater than previously observed max efficiency,
# reassign max efficiency, clear list of max efficient codons, and add current codon to list
if eff > max_eff:
max_eff_codons = set([codon])
max_eff = eff
# if efficiency of current codon matches previously observed max efficiency, add it to list
elif eff == max_eff:
max_eff_codons.add(codon)
return(max_eff_codons, max_eff)
def truncate_list_of_amino_acids(amino_acids):
"""
:param amino_acids: set
the amino acids we want to code for, i.e. {'A','I','V'}
:rtype: set
the set of sets of amino acids that can be coded with 100% efficiency, i.e. {frozenset({'V', 'A'}), frozenset({'V', 'I'})}
"""
# validate input
for aa in amino_acids:
if aa not in valid_aa:
raise('Input contains an invalid amino acid')
# determine whether complete input set can be encoded by codons with 100% efficiency
eff = get_codon_for_amino_acids(amino_acids)[1]
eff_aas = set()
if eff == 1:
eff_aas = frozenset(amino_acids)
# if not, use helper function to examine subsets of input set of AAs that exclude 1 AA
else:
eff_aas = truncate_helper(amino_acids, 1)
return(eff_aas)
def truncate_helper(amino_acids, num_to_exclude):
# if all possible combinations of AAs have been tested, return empty set
eff_aas = set()
if (num_to_exclude == len(amino_acids)):
return eff_aas
# find subsets of input AAs where given number of AAs are excluded
subsets = combinations(amino_acids, len(amino_acids) - num_to_exclude)
# determine whether any of these subsets can be encoded with 100% efficiency
for subset in subsets:
eff = get_codon_for_amino_acids(subset)[1]
# if so, return the subsets that can
if eff == 1:
eff_aas.add(frozenset(subset))
# if not, examine combinations that exclude an additional amino acid
if len(eff_aas) == 0:
eff_aas = truncate_helper(amino_acids, num_to_exclude + 1)
return eff_aas
if __name__ == "__main__":
# # using sets instead of lists throughout the code since the order doesn't matter and all items should be unique
assert get_codon_for_amino_acids({'A', 'I', 'V'}) == ({'RYA', 'RYH', 'RYC', 'RYW', 'RYM', 'RYY', 'RYT'}, 0.75)
assert get_codon_for_amino_acids({'M', 'F'}) == ({'WTS', 'WTK', "WTB"}, 0.5)
# "frozenset" here since this seems to be the only way to get a set of sets - see https://stackoverflow.com/questions/5931291/how-can-i-create-a-set-of-sets-in-python
assert truncate_list_of_amino_acids({'A', 'V', 'I'}) == {frozenset({'V', 'A'}), frozenset({'V', 'I'})}