所以我给我的班级布置了这个作业,它是这样的:
my_test.fastq
.FilteringReport:
Filename: my_test.fastq
TotalReads: 4000
DiscardLowQual: 89
DiscardLen: 11
ReadsAfterFilter: 3900
BioPython
)导入模块。Description
Usage
Hints
argparse
这样的参数解析器。你也可以使用简单的sys.argv
,但是它会让你的程序更难编写和使用。下面是按顺序列出 ASCII 质量值的表格(第一个字符的质量值是 0,最后一个字符是 50)”
我的代码的最终版本是这样的:
import argparse
import os
class Entry:
def __init__(self, header, content, quality):
self.header = header
self.content = content
self.quality = quality
self.average_quality = 0
self.content_length = 0
class DataProcessor:
def __init__(self, data_file, output_file, quality_threshold, length_threshold):
self.data_file = data_file
self.output_file = output_file
self.quality_threshold = quality_threshold
self.length_threshold = length_threshold
self.entries = []
self.discard_low_qual = 0
self.discard_len = 0
def validate_input(self):
if not os.path.exists(self.data_file):
raise FileNotFoundError(f"Input file '{self.data_file}' not found.")
if self.quality_threshold < 0 or self.quality_threshold > 50:
raise ValueError("Quality threshold must be between 0 and 50.")
if self.length_threshold < 0:
raise ValueError("Length threshold must be a non-negative integer.")
def read_entries(self):
if not os.path.exists(self.data_file):
raise FileNotFoundError(f"Input file '{self.data_file}' not found.")
with open(self.data_file, 'r') as file:
lines = file.readlines()
for line_index in range(0, len(lines), 4):
header = lines[line_index].strip()
content = lines[line_index + 1].strip()
quality = lines[line_index + 3].strip()
entry = Entry(header, content, quality)
self.entries.append(entry)
def validate_thresholds(self):
# Check if quality_threshold and length_threshold are positive integers
if not isinstance(self.quality_threshold, int) or not isinstance(self.length_threshold, int):
raise ValueError("Thresholds must be positive integers.")
if self.quality_threshold < 0 or self.length_threshold < 0:
raise ValueError("Thresholds must be non-negative integers.")
def calculate_and_store_averages(self):
for entry in self.entries:
total_quality = 0
for q in entry.quality:
total_quality += ord(q) - 33
avg_quality = total_quality / len(entry.quality) if entry.quality else 0
entry.average_quality = avg_quality
def calculate_and_store_lengths(self):
for entry in self.entries:
entry.content_length = len(entry.content)
def filter_entries(self):
initial_entry_count = len(self.entries)
filtered_entries_quality = [entry for entry in self.entries if entry.average_quality >= self.quality_threshold]
self.discard_low_qual = initial_entry_count - len(filtered_entries_quality)
filtered_entries_length = [entry for entry in filtered_entries_quality if entry.content_length >= self.length_threshold]
self.discard_len = len(filtered_entries_quality) - len(filtered_entries_length)
self.entries = filtered_entries_length
def write_filtered_entries(self):
with open(self.output_file, 'w') as file:
for entry in self.entries:
file.write(f"{entry.header}\n{entry.content}\n+\n{entry.quality}\n")
def generate_filtering_report(self):
total_entries = len(self.entries) + self.discard_low_qual + self.discard_len
print(f"FilteringReport:")
print(f" TotalEntries: {total_entries}")
print(f" DiscardLowQual: {self.discard_low_qual}")
print(f" DiscardLen: {self.discard_len}")
print(f" EntriesAfterFilter: {len(self.entries)}")
def process_data_entries():
parser = argparse.ArgumentParser(description="Process a data file, filter entries, and generate a filtering report.")
parser.add_argument('data_file', help="Path to the data file")
parser.add_argument('--output_file', help="Path to the output data file", default='filtered_data.txt')
parser.add_argument('--quality_threshold', type=int, help="Quality threshold for filtering entries")
parser.add_argument('--length_threshold', type=int, help="Length threshold for filtering entries")
args = parser.parse_args()
data_processor = DataProcessor(args.data_file, args.output_file, args.quality_threshold, args.length_threshold)
data_processor.validate_input()
data_processor.validate_thresholds()
data_processor.read_entries()
data_processor.calculate_and_store_averages()
data_processor.calculate_and_store_lengths()
data_processor.filter_entries()
data_processor.write_filtered_entries()
data_processor.generate_filtering_report()
if __name__ == "__main__":
process_data_entries()
真正的问题是结果 DiscardLen,它始终为 0,而且不应该,有时它等于我的另一个值 DiscardLowQual。
我现在真的迷失了,因为尽管我试图理解错误在哪里,但我就是找不到它,并且我使用这个 stackoverflow 作为尝试找到错误的最后希望,我不'不想要正确的答案,我只想了解错误以及为什么它不起作用。 还是谢谢啦
真正的问题是结果 DiscardLen,它始终为 0,而且不应该,有时它等于我的另一个值 DiscardLowQual。
ok尝试重现你的错误,我的输入
test_2.fastq
:
@2fa9ee19-5c51-4281-abdd-eac8663f9b49 runid=f53ee40429765e7817081d4bcdee6c1199c2f91d sampleid=18S_amplicon read=109831 ch=33 start_time=2019-09-12T12:05:03Z
CGGTAGCCAGCTGCGTTCAGTATGGAAGATTTGATTTGTTTAGCGATCGCCATACTACCGTGACAAGAAAGTTGTCAGTCTTTGTGACTTGCCTGTCGCTCTATCTTCCAGACTCCTTGGTCCGTGTTCAATCCCGGTAGTAGCGACGGGCGGTGTATGTATTATCAGCGCAACAGAAACAAAGACACC
+
+&&-&%$%%$$$#)33&0$&%$''*''%$#%$%#+-5/---*&&%$%&())(&$#&,'))5769*+..*&(+28./#&1228956:7674';:;80.8>;91;>?B=%.**==?(/'($$$$*'&'**%&/));807;3A=;88>=?9498++0%"%%%%'#&5/($0.$2%&0'))*'%**&)(.%&&
@1f9ca490-2f25-484a-8972-d60922b41f6f runid=f53ee40429765e7817081d4bcdee6c1199c2f91d sampleid=18S_amplicon read=106343 ch=28 start_time=2019-09-12T12:05:07Z
GATGCATACTTCGTTCGATTTCGTTTCAACTGGACAACCTACCGTGACAAAGAAAGTTGTCGATGCTTTGTGACTTGCTGTCCTCTATCTTCAGACTCCTTGGTCCATTTCAAGACCAAACAATCAGTAGTAGCGACGGGCGGTGTGGCAATATCGCTTTCAACGAAACACAAAGAAT
+
&%&%''&'+,005<./'%*-)%(#$'$#$%&&'('$$..74483=.0412$*/,)9/194/+('%%(+1+'')+,-&,>;%%.*@@D>>?)3%%296070717%%16;<'<236800%(,734(0$7769879@;?8)09:+/4'1+**7<<4.4,%%(.)##%&'(&&%*++'&#%$
@06936a64-6c08-40e9-8a10-0fbc74812c89 runid=f53ee40429765e7817081d4bcdee6c1199c2f91d sampleid=18S_amplicon read=83531 ch=23 start_time=2019-09-12T12:03:50Z
GTTTTGTCGCTGCGTTCAGTTTATGGGTGCGGGTGTTATGATGCTTCGCTTTACGTGACAAGAAAGTTAGTAGATTGTCTTTATGTTTCTGTGGTGCTGATATTGCCACACCGCCCGATAGCTCTACCGATTGAAACACGGACCAAGGAATCGGAAATGTAGGCGAGCAGGCCGTCCTGAACACCCATTAACTTTCTTGTC
+
$&'((&%$$$.$2/=-*#'.2'&&##$$#$#&&(&+-%'(%&#"###""$$%#)%,+)+&'(,&%*((%%&%$%'+),,+,,&%$')1+*$.&+6*+(*%(&'*(''&%*+,*)('%#$$$%,$&&'&)))12)*&/*,364$%$%))$'')#%%&%$#$%$$#('$(%$%$%%$$*$&$%)''%%$$&'&$)+2++,)&%
@d6a555a1-d8dd-4e55-936f-ade7c78d9d38 runid=f53ee40429765e7817081d4bcdee6c1199c2f91d sampleid=18S_amplicon read=112978 ch=97 start_time=2019-09-12T12:03:49Z
CGTATGCTTTGAGATTCATTCAGGAGGCGGGTATTTGCTCGATCATACCATACGTGGCAAGAAAGTTGTCAGTGTCTTTGTGTTTCTCTGTGGTGCGCGATATTGCCACGCCCGTCGCTACACCGATTGAAACACGGACCGAAGTCTGAAGATAGAGCGACGAGCGAAGTCACAAAGGAACTAGAGCAACTTTTTATC
+
#$%%%%''(($$%$*-&%$%)%*'%(+($)(%$,.)##$&$$#$$&('(%&%%%%#$$%(&*('('+18/(6?65510+))'--*&&$$$,*+;/+%%&&''13&%&%(133<;9=/.2*$*657,0*&(237'85;A1/$$%'7:;;:<2:..%$)0,*.)(1)1&&1+-$$,-&(-&&####%%98:AHFEB4(%,
@91ca9c6c-12fe-4255-83cc-96ba4d39ac4b runid=f53ee40429765e7817081d4bcdee6c1199c2f91d sampleid=18S_amplicon read=110811 ch=113 start_time=2019-09-12T12:04:28Z
CGGTGTACTT
+
%$&'$&'&&'0,42%*$&&%$%$#$)$*+,'($&))(*$%$%'-8644(()-&'%&*'')%*('579:?.*,9:+)1-9.'(7491:7,(52.11'7;:<===E@;>448,,(%*.''*,%&$-.;<:;66138/**,2?8<:**'%&)%&#$&&,,'&
@4dbe5037-abe2-4176-88db-32cc336a67fb runid=f53ee40429765e7817081d4bcdee6c1199c2f91d sampleid=18S_amplicon read=113290 ch=97 start_time=2019-09-12T12:05:31Z
GCAGGTGATGCTTTGGTTCAGTTTCAGATTTGGGTGTTTTATGATTTATACTATACGTGACAAGAAAGTTGTCATTGTGTATCTTTGTACTTTTGCCTGTGTCGCTCTATCTTCCAGACTCCCGGTCCGTGTTTCAATCGGTAGCTAGCGACGGGCGGTGTGGCAATATCCAGCACCAACAAACGCAAGACACCGACAACAT
+
#$%'(('#%'###$#&&(%$'.+%+&'##%%%'%4.4570'%%##%$('%##%&$',:8;7--0/**.6:6,,%#$%$##%''))$#$$#%&&&#))+&()/.(,0+-4;;FE(-/38@B/7+$,01=8?>:;/@54.(-/(%$#%+.+849.5;=35;6>62277;;%%>734298&'5$'%(%'()*,1/..-<3'''#$
@df3de4e9-48ca-45fc-83f4-02d7f066a41a runid=f53ee40429765e7817081d4bcdee6c1199c2f91d sampleid=18S_amplicon read=109639 ch=69 start_time=2019-09-12T12:05:16Z
CATGCTTCGT
+
&&&%(((*./)%(&,%$%'+#%(,+*-1+,/,%$$&%&)-.1422/29+,)&,1-,5'2=;<==<9).'7:>B?;91%+A988(*,,,,05356455395='&)###"(219;9262%+%,18+,#$''%$499999111,-/6:987-099-.048'%-.0)%-0*)*)),+121.5;<;;7.&'
你的代码
question-003.py
:
import argparse
import os
class Entry:
def __init__(self, header, content, quality):
self.header = header
self.content = content
self.quality = quality
self.average_quality = 0
self.content_length = 0
print(self.header[:20] , len(self.content))
class DataProcessor:
def __init__(self, data_file, output_file, quality_threshold, length_threshold):
self.data_file = data_file
self.output_file = output_file
self.quality_threshold = quality_threshold
self.length_threshold = length_threshold
self.entries = []
self.discard_low_qual = 0
self.discard_len = 0
def validate_input(self):
if not os.path.exists(self.data_file):
raise FileNotFoundError(f"Input file '{self.data_file}' not found.")
if self.quality_threshold < 0 or self.quality_threshold > 50:
raise ValueError("Quality threshold must be between 0 and 50.")
if self.length_threshold < 0:
raise ValueError("Length threshold must be a non-negative integer.")
def read_entries(self):
if not os.path.exists(self.data_file):
raise FileNotFoundError(f"Input file '{self.data_file}' not found.")
with open(self.data_file, 'r') as file:
lines = file.readlines()
for line_index in range(0, len(lines), 4):
header = lines[line_index].strip()
content = lines[line_index + 1].strip()
quality = lines[line_index + 3].strip()
entry = Entry(header, content, quality)
self.entries.append(entry)
def validate_thresholds(self):
# Check if quality_threshold and length_threshold are positive integers
if not isinstance(self.quality_threshold, int) or not isinstance(self.length_threshold, int):
raise ValueError("Thresholds must be positive integers.")
if self.quality_threshold < 0 or self.length_threshold < 0:
raise ValueError("Thresholds must be non-negative integers.")
def calculate_and_store_averages(self):
for entry in self.entries:
total_quality = 0
for q in entry.quality:
total_quality += ord(q) - 33
avg_quality = total_quality / len(entry.quality) if entry.quality else 0
entry.average_quality = avg_quality
def calculate_and_store_lengths(self):
for entry in self.entries:
entry.content_length = len(entry.content)
for i in self.entries :
print(i ,i.content_length)
def filter_entries(self):
initial_entry_count = len(self.entries)
filtered_entries_quality = [entry for entry in self.entries if entry.average_quality >= self.quality_threshold]
self.discard_low_qual = initial_entry_count - len(filtered_entries_quality)
filtered_entries_length = [entry for entry in filtered_entries_quality if entry.content_length >= self.length_threshold]
self.discard_len = len(filtered_entries_quality) - len(filtered_entries_length)
self.entries = filtered_entries_length
def write_filtered_entries(self):
with open(self.output_file, 'w') as file:
for entry in self.entries:
file.write(f"{entry.header}\n{entry.content}\n+\n{entry.quality}\n")
def generate_filtering_report(self):
total_entries = len(self.entries) + self.discard_low_qual + self.discard_len
print(f"FilteringReport:")
print(f" TotalEntries: {total_entries}")
print(f" DiscardLowQual: {self.discard_low_qual}")
print(f" DiscardLen: {self.discard_len}")
print(f" EntriesAfterFilter: {len(self.entries)}")
def process_data_entries():
parser = argparse.ArgumentParser(description="Process a data file, filter entries, and generate a filtering report.")
parser.add_argument('data_file', help="Path to the data file")
parser.add_argument('-o' , '--output_file', help="Path to the output data file", default='filtered_data.txt')
parser.add_argument('-q' , '--quality_threshold', type=int, help="Quality threshold for filtering entries")
parser.add_argument('-l' , '--length_threshold', type=int, help="Length threshold for filtering entries")
args = parser.parse_args()
data_processor = DataProcessor(args.data_file, args.output_file, args.quality_threshold, args.length_threshold)
data_processor.validate_input()
data_processor.validate_thresholds()
data_processor.read_entries()
data_processor.calculate_and_store_averages()
data_processor.calculate_and_store_lengths()
data_processor.filter_entries()
data_processor.write_filtered_entries()
data_processor.generate_filtering_report()
if __name__ == "__main__":
process_data_entries()
使用
> python3 ./question-003.py test_2.fastaq -q 1 -l 50
输出:
@2fa9ee19-5c51-4281- 189
@1f9ca490-2f25-484a- 178
@06936a64-6c08-40e9- 201
@d6a555a1-d8dd-4e55- 198
@91ca9c6c-12fe-4255- 10
@4dbe5037-abe2-4176- 202
@df3de4e9-48ca-45fc- 10
<__main__.Entry object at 0x7f282a744f70> 189
<__main__.Entry object at 0x7f282a692040> 178
<__main__.Entry object at 0x7f282a6920a0> 201
<__main__.Entry object at 0x7f282a692100> 198
<__main__.Entry object at 0x7f282a692160> 10
<__main__.Entry object at 0x7f282a6921c0> 202
<__main__.Entry object at 0x7f282a692220> 10
FilteringReport:
TotalEntries: 7
DiscardLowQual: 0
DiscardLen: 2
EntriesAfterFilter: 5
所以我没有收到你的错误