我正在用 Python 编写一个生物信息学应用程序的脚本,该脚本迭代多个序列,寻找特定位置的氨基酸,以计算它们相对于基因组数据库的频率。然而,我是这个领域的新手,而且我一直在开发项目的同时学习,所以我所做的很多事情都没有得到很好的优化。对于我认为相对简单的任务,脚本对于文件需要半分钟,有时对于较大的文件需要更长的时间
任何人都可以建议我如何尝试解决这个问题吗?如果这很微不足道,请道歉,我是一名生物学家,刚刚开始掌握编程
def main(self):
# Process input data and extract necessary information
mutation_count, mutation_subjects, mutation_positions, alignment_file, seq_ID = self.process_input()
# Initialize a dictionary to count amino acids at each mutation position
aa_counts = {pos: {} for pos in mutation_positions}
# Read the alignment file using Bio.AlignIO
alignment = AlignIO.read(alignment_file, 'clustal')
# Find the reference sequence in the alignment
ref_seq = None
for record in alignment:
if record.id == seq_ID:
ref_seq = record
break
# Convert sequences in the alignment to strings for easier processing
ref_seq_str = str(ref_seq.seq)
alignment_str = {record.id: str(record.seq) for record in alignment}
# Count amino acids at each position in the alignment
for seq in alignment_str.values():
pos_in_ref_seq = 0
for pos, aa in enumerate(seq):
if ref_seq_str[pos] != '-':
pos_in_ref_seq += 1
if pos_in_ref_seq in aa_counts:
aa_counts[pos_in_ref_seq][aa] = aa_counts[pos_in_ref_seq].get(aa, 0) + 1
# If a specific position is provided, calculate and print the amino acid frequencies at that position
if self.args.position:
position = self.args.position
total_count = sum(aa_counts[position].values())
for aa, count in aa_counts[position].items():
freq = (count / total_count * 100) if total_count > 0 else 0
print(f"Amino Acid: {aa} | Frequency: {freq:.2f}% | Count: {count}")
return
# Analyze mutations and calculate frequencies
mutation_info = {}
for mutation, count in sorted(mutation_count.items(), key=lambda x: int(x[0][1:-1])):
seq_pos = int(mutation[1:-1])
query_aa = mutation[0]
subject_aa = mutation[-1]
if query_aa == '-' or subject_aa == '-':
continue
real_count = aa_counts[seq_pos].get(subject_aa, 0)
total_count = sum(aa_counts[seq_pos].values())
query_aa_frequency = (aa_counts[seq_pos].get(query_aa, 0) / total_count * 100) if total_count > 0 else 0
subject_aa_frequency = (aa_counts[seq_pos].get(subject_aa, 0) / total_count * 100) if total_count > 0 else 0
if subject_aa_frequency <= 10 and real_count > 2:
mutation_info[mutation] = {
'query_aa_frequency': query_aa_frequency,
'subject_aa_frequency': subject_aa_frequency,
'real_count': real_count,
}
# Identify strains with specific mutations
strains_with_mutations = {}
for mutation in mutation_count:
query_aa, pos, subject_aa = mutation[0], int(mutation[1:-1]), mutation[-1]
strains_with_this_mutation = []
for record in alignment:
strain_name = record.id[:3]
sequence = str(record.seq)
pos_in_ref_seq = 0
for i, aa in enumerate(sequence):
if ref_seq.seq[i] != '-':
pos_in_ref_seq += 1
if pos_in_ref_seq == pos:
if aa == subject_aa:
strains_with_this_mutation.append(strain_name[:3])
break
strains_with_mutations[mutation] = strains_with_this_mutation
# Write mutation information to a text file
with open('gapslist.txt', 'w') as f:
for mutation, info in mutation_info.items():
f.write(f"Mutation: {mutation} | {mutation[0]} Frequency: {info['query_aa_frequency']:.2f}% | {mutation[-1]} Frequency: {info['subject_aa_frequency']:.2f}% | Count: {info['real_count']}\n\n")
# Write mutation and strain information to a CSV file
with open('rare_mutations.csv', 'w', newline='') as csvfile:
fieldnames = ['Mutation', 'Strains']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for mutation, info in mutation_info.items():
writer.writerow({
'Mutation': mutation,
'Strains': ';'.join(strains_with_mutations.get(mutation, []))
})
分析代码,我意识到最大的瓶颈是 Bio.Seq 中 Seq 类的 _getitem_ 函数被调用的次数:
ncalls tottime percall cumtime percall filename:lineno(function)
69006091 25.002 0.000 99.135 0.000 Seq.py:470(__getitem__)
69098925 6.311 0.000 6.311 0.000 SeqRecord.py:334(<lambda>).
69008225 10.165 0.000 49.965 0.000 <frozen abc>:117(__instancecheck__)
69006091 10.330 0.000 22.310 0.000 <frozen abc>:121(__subclasscheck__)
我一直在研究处理这个问题的方法,但似乎没有任何效果,或者只是稍微减少了执行时间。
不知道我是否做对了,我也在努力学习。
我尝试按照标题建议修改
__getitem__
方法;方法在这里:
github.com/biopython/Bio/Seq.py:'类_SeqAbstractBaseClass(ABC)getitem'
测试代码
test-001.py
:
from Bio.Seq import Seq
a = 'MAGLVWT'
seq_a = Seq(a*1000000)
# print(seq_a , type(seq_a))
empty = { }
for i in a :
empty[i] = 0
print(empty)
for i in range(len(seq_a)) :
x = seq_a[i]
empty[x] +=1
print(empty)
使用cProfile:
> python -m cProfile -s cumulative test-001.py
{'M': 0, 'A': 0, 'G': 0, 'L': 0, 'V': 0, 'W': 0, 'T': 0}
<class 'Bio.Seq.Seq'>
{'M': 10000000, 'A': 10000000, 'G': 10000000, 'L': 10000000, 'V': 10000000, 'W': 10000000, 'T': 10000000}
210037787 function calls (210037746 primitive calls) in 107.698 seconds
Ordered by: cumulative time
ncalls tottime percall cumtime percall filename:lineno(function)
6/1 0.000 0.000 107.698 107.698 {built-in method builtins.exec}
1 48.584 48.584 107.698 107.698 test-001.py:3(<module>)
70000000 40.908 0.000 59.003 0.000 Seq.py:410(__getitem__)
70000033 9.140 0.000 9.140 0.000 {built-in method builtins.isinstance}
70000000 8.955 0.000 8.955 0.000 {built-in method builtins.chr}
修补代码
test-002-patched-002.py
:
from Bio.Seq import Seq
def modded(self, index) :
return chr(self._data[index])
Seq.__getitem__ = modded
a = 'MAGLVWT'
seq_a = Seq(a*10000000)
# print(seq_a , type(seq_a))
empty = { }
for i in a :
empty[i] = 0
print(empty)
for i in range(len(seq_a)) :
x = seq_a[i]
empty[x] +=1
print(empty)
c配置文件结果:
> python -m cProfile -s cumulative test-002-patched-002.py
{'M': 0, 'A': 0, 'G': 0, 'L': 0, 'V': 0, 'W': 0, 'T': 0}
{'M': 10000000, 'A': 10000000, 'G': 10000000, 'L': 10000000, 'V': 10000000, 'W': 10000000, 'T': 10000000}
140037786 function calls (140037745 primitive calls) in 84.476 seconds
Ordered by: cumulative time
ncalls tottime percall cumtime percall filename:lineno(function)
6/1 0.000 0.000 84.476 84.476 {built-in method builtins.exec}
1 46.689 46.689 84.476 84.476 test-002-patched-002.py:3(<module>)
不确定继续操作的正确方法,以及是否确实能加快 20%(没有尝试很多次)。请告诉我
不能再切片
Seq
对象,除非你修补方法回来