我正在尝试使用核苷酸序列进行BLAST搜索并打印最佳匹配命中,但不确定应该使用哪个选项/命令。有
max_hpsp
和 best_hit_overhang
等选项。我不知道它们的差异,我只想打印 1 个命中。 (最匹配的一个)我应该使用 max_hpsp 1
吗?
我写了这段代码,但它仍然没有用。如果您能告诉我,我错在哪里以及应该做什么,我将非常感激:) 谢谢!
from Bio.Blast import NCBIWWW
seq = Seq("GTTGA......CT")
def best_matching_hit(seq):
try:
result_handle = NCBIWWW.qblast("blastn", "nt", seq)
except:
print('BLAST run failed!')
return None
blast_record = NCBIXML.read(result_handle)
for hit in blast_record.alignments:
for hsp in hit.hsps:
if hsp.expect == max_hsps 1:
print(hit.title)
print(hsp.sbjct)
best_matching_hit(seq)
这仅返回一次点击,我想是第一个点击,按照
限制 Biostars 上 Biopython NCBIWWW 搜索中的点击次数:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 7 15:28:11 2021
@author: Pietro
https://stackoverflow.com/questions/67872118/how-to-print-the-best-matching-hit-in-the-blast-search-biopython
"""
from Bio.Blast import NCBIWWW
from Bio.Seq import Seq
seq = Seq("ATGGCGTGGAATGAGCCTGGAAATAACAACGGCAACAATGGCCGCGATAATGACCCTTGGGGTAATAA\
TAATCGTGGTGGCCAGCGTCCTGGTGGCCGAGATCAAGGTCCGCCAGATTTAGATGAAGTGTTCAACAA\
ACTGAGTCAAAAGCTGGGTGGCAAGTTTGGTAAAAAAGGCGGCGGTGGTTCCTCTATCGGCGGTGGCGG\
TGGTGCAATTGGCTTTGGTGTCATTGCGATCATTGCAATTGCGGTGTGGATTTTCGCTGGTTTTTACAC\
CATCGGTGAAGCAGAGCGTGGTGTTGTACTGCGTTTAGGTAAATACGATCGTATCGTAGACCCAGGCCT\
TAACTGGCGTCCTCGTTTTATTGATGAATACGAAGCGGTTAACGTACAAGCGATTCGCTCACTACGTGC\
ATCTGGTCTAATGCTGACGAAAGATGAAAACGTAGTAACGGTTGCAATGGACGTTCAATACCGAGTTGC\
TGACCCATACAAATACCTATACCGCGTGACCAATGCAGATGATAGCTTGCGTCAAGCAACAGACTCTGC\
GCTACGTGCGGTAATTGGTGATTCACTAATGGATAGCATTCTAACCAGTGGTCGTCAGCAAATTCGTCA\
AAGCACTCAAGAAACACTAAACCAAATCATCGATAGCTATGATATGGGTCTGGTGATTGTTGACGTGAA\
CTTCCAGTCTGCACGTCCGCCAGAGCAAGTAAAAGATGCGTTTGATGACGCGATTGCTGCGCGTGAGGA\
TGAAGAGCGTTTCATCCGTGAAGCAGAAGCTTACAAGAACGAAATCTTGCCGAAGGCAACGGGTCGTGC\
TGAACGTTTGAAGAAGGAAGCTCAAGGTTACAACGAGCGTGTAACTAACGAAGCATTAGGTCAAGTAGC\
ACAGTTTGAAAAACTACTACCTGAATACCAAGCGGCTCCTGGCGTAACACGTGACCGTCTGTACATTGA\
CGCGATGGAAGAGGTTTACACCAACACATCTAAAGTGTTGATTGACTCTGAATCAAGCGGCAACCTTTT\
GTACCTACCAATCGATAAATTGGCAGGTCAAGAAGGCCAAACAGACACTAAACGTAAATCGAAATCTTC\
TTCAACCTACGATCACATTCAACTAGAGTCTGAGCGTACACAAGAAGAAACATCGAACACGCAGTCTCG\
TTCAACAGGTACACGTCAAGGGAGATACTAA")
def best_matching_hit(seq):
try:
result_handle = NCBIWWW.qblast("blastn", "nt", seq, hitlist_size=1)
except:
print('BLAST run failed!')
return None
blast_record = result_handle.read()
print(blast_record)
best_matching_hit(seq)
您正在尝试将 E 值 (hsp.expect) 与未定义的变量 max_hsps 进行比较,这对于检索最佳命中不是必需的。另外,要限制命中数,您应该使用 NCBIWWW.qblast 中的 hitlist_size 参数而不是 max_hsps。我提供了您的函数的更正版本,该版本可以正确检索并打印最佳命中的信息,或指示是否未找到命中。
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
def best_matching_hit(seq):
try:
result_handle = NCBIWWW.qblast("blastn", "nt", seq, hitlist_size=1)
except Exception as e:
print('BLAST run failed:', e)
return None
blast_record = NCBIXML.read(result_handle)
if blast_record.alignments:
best_hit = blast_record.alignments[0] # Access the first (best) alignment
best_hsp = best_hit.hsps[0] ## Access the first high-scoring segment pair
print("Best Hit Title:", best_hit.title)
print("Best Hit Subject Sequence:", best_hsp.sbjct)
#if u want alignment aswell u can do below one otherwise skip these four steps and continue with else
print("Alignment:")
print(best_hsp.query)
print(best_hsp.match)
print(best_hsp.sbjct)
else:
print("No hits found.")
seq = Seq("ATGGCGTGGAATGAGCCTGGAAATAACAACGGCAACAATGGCCGCGATAATGACCCTTGGGGTAATAA\
TAATCGTGGTGGCCAGCGTCCTGGTGGCCGAGATCAAGGTCCGCCAGATTTAGATGAAGTGTTCAACAA\
ACTGAGTCAAAAGCTGGGTGGCAAGTTTGGTAAAAAAGGCGGCGGTGGTTCCTCTATCGGCGGTGGCGG\
TGGTGCAATTGGCTTTGGTGTCATTGCGATCATTGCAATTGCGGTGTGGATTTTCGCTGGTTTTTACAC\
CATCGGTGAAGCAGAGCGTGGTGTTGTACTGCGTTTAGGTAAATACGATCGTATCGTAGACCCAGGCCT\
TAACTGGCGTCCTCGTTTTATTGATGAATACGAAGCGGTTAACGTACAAGCGATTCGCTCACTACGTGC\
ATCTGGTCTAATGCTGACGAAAGATGAAAACGTAGTAACGGTTGCAATGGACGTTCAATACCGAGTTGC\
TGACCCATACAAATACCTATACCGCGTGACCAATGCAGATGATAGCTTGCGTCAAGCAACAGACTCTGC\
GCTACGTGCGGTAATTGGTGATTCACTAATGGATAGCATTCTAACCAGTGGTCGTCAGCAAATTCGTCA\
AAGCACTCAAGAAACACTAAACCAAATCATCGATAGCTATGATATGGGTCTGGTGATTGTTGACGTGAA\
CTTCCAGTCTGCACGTCCGCCAGAGCAAGTAAAAGATGCGTTTGATGACGCGATTGCTGCGCGTGAGGA\
TGAAGAGCGTTTCATCCGTGAAGCAGAAGCTTACAAGAACGAAATCTTGCCGAAGGCAACGGGTCGTGC\
TGAACGTTTGAAGAAGGAAGCTCAAGGTTACAACGAGCGTGTAACTAACGAAGCATTAGGTCAAGTAGC\
ACAGTTTGAAAAACTACTACCTGAATACCAAGCGGCTCCTGGCGTAACACGTGACCGTCTGTACATTGA\
CGCGATGGAAGAGGTTTACACCAACACATCTAAAGTGTTGATTGACTCTGAATCAAGCGGCAACCTTTT\
GTACCTACCAATCGATAAATTGGCAGGTCAAGAAGGCCAAACAGACACTAAACGTAAATCGAAATCTTC\
TTCAACCTACGATCACATTCAACTAGAGTCTGAGCGTACACAAGAAGAAACATCGAACACGCAGTCTCG\
TTCAACAGGTACACGTCAAGGGAGATACTAA")
best_matching_hit(seq)