我已经对齐了 2 个 DNAStringSet,并使用
Biostring
包。我想提取序列中完美匹配且长度至少为 10 个核苷酸 (nt) 的所有部分,以及模式和查询序列中匹配的位置。
library('Biostrings')
set.seed(123)
dna_alphabet <- DNA_ALPHABET[1:4]
seq1_random <- sample(dna_alphabet, 100, replace = TRUE)
seq2_random <- sample(dna_alphabet, 100, replace = TRUE)
seq1_random[15:(15+11)] <- match_12
seq2_random[17:(17+11)] <- match_12
seq1_random[41:(41+14)] <- match_15
seq2_random[40:(40+14)] <- match_15
seq1_random[70:(70+22)] <- match_23
seq2_random[73:(73+22)] <- match_23
seq1 <- DNAString(paste(seq1_random, collapse = ""))
seq2 <- DNAString(paste(seq2_random, collapse = ""))
alignment <- pairwiseAlignment(pattern = seq1, subject = seq2, type = "global")
alignment
Global PairwiseAlignmentsSingleSubject (1 of 1)
pattern: GGGCGCCCGATCCA--CCTATTCGTTTGTCGCACGTCAGGATATTTCACGTGAGTCATCACAAT----TGACAAGACGACCATTTCTAACATGAATTCACTAACGG
subject: ATCATCAGGTTCTGACCCTATTCGTTTGA---ATGCCCTCCCATTTCACGTGAGTCAAGGGAGGCCCGTGGCACTACGACCATTTCTAACATGAATTCGGTAT---
score: -138.1462
> df_results
perfect_match start_pos_pattern end_pos_pattern start_pos_subject end_pos_subject
1 CCTATTCGTTTG 15 26 17 28
2 ATTTCACGTGAGTCA 41 55 40 54
3 ACGACCATTTCTAACATGAATTC 70 92 73 95
我的实际对齐长度 >15,000 nt 长,所以我需要找到一种有效的方法来继续。如果可能的话,我对
{tidyverse}
比 {data.table}
选项更熟悉。
到目前为止我发现的是: 提取比对序列:
aligned_pattern_str <- as.character(alignment@pattern)
aligned_pattern_str
[1] "GGGCGCCCGATCCA--CCTATTCGTTTGTCGCACGTCAGGATATTTCACGTGAGTCATCACAAT----TGACAAGACGACCATTTCTAACATGAATTCACTAA"
aligned_subject_str <- as.character(alignment@subject)
aligned_subject_str
[1] "ATCATCAGGTTCTGACCCTATTCGTTTGA---ATGCCCTCCCATTTCACGTGAGTCAAGGGAGGCCCGTGGCACTACGACCATTTCTAACATGAATTCGGTAT"
然后:我使用滚动窗口来比较aligned_pattern_str中的第一个字符和aligned_subject_str中的第一个字符。如果它们不匹配,我会比较每个的第二个等,直到它们匹配。一旦它们匹配:我会继续比较下一个字符,直到出现错误匹配为止。然后,如果匹配>10个核苷酸,我会报告它,否则我不会报告。我继续进行,直到覆盖整个序列。
这不是最终答案:此时,我只有对齐模式序列中的位置。这意味着不匹配 - - 被计入位置。而且我在主题序列中没有匹配的部分。此外,可能还有更有效的方法。 所以我的代码到那一步:
match_start <- NULL
match_length <- 0
matches <- list()
for (i in 1:nchar(aligned_pattern_str)) {
if (substr(aligned_pattern_str, i, i) == substr(aligned_subject_str, i, i) && substr(aligned_pattern_str, i, i) != "-") && {
if (is.null(match_start)) {
match_start <- i
}
match_length <- match_length + 1
} else {
if (!is.null(match_start) && match_length > 10) {
matches[[length(matches) + 1]] <- list(start = match_start, end = i - 1, length = match_length)
}
match_start <- NULL
match_length <- 0
}
}
if (!is.null(match_start) && match_length > 10) {
matches[[length(matches) + 1]] <- list(start = match_start, end = nchar(aligned_pattern_str), length = match_length)
}
for (match in matches) {
cat(sprintf("Match from %d to %d, Length: %d\n", match$start, match$end, match$length))
}
>
Match from 17 to 28, Length: 12
Match from 43 to 57, Length: 15
Match from 76 to 98, Length: 23
这是一种将每 10 个 nt 转换为单个值并使用
data.table
连接查找匹配项的方法。应该是相当快的。请注意,如果最小匹配长度太长,它可能会崩溃(10 完全在它可以处理的范围内)。
library(data.table)
n <- 10L
n1 <- n - 1L
fn <- function(x, n) {
# function to convert rolling sequences to integers
n1 <- n - 1
y <- c(0, x*4^n1)
xn <- numeric(length(x) - n1)
xn[1] <- sum(x[1:n]*4^(n1:0))
x <- x[-1:-n1]
if (length(xn) > 1L) {
for (i in 2:length(x)) xn[i] <- 4*(xn[i - 1] - y[i]) + x[i]
}
xn
}
setorder(
data.table(
end = n:length(seq1),
seq10 = fn(log2(as.numeric(seq1)), n)
)[ # join to find the matches
data.table(
end = n:length(seq2),
seq10 = fn(log2(as.numeric(seq2)), n)
), on = "seq10", allow.cartesian = TRUE, nomatch = FALSE
][,d := i.end - end], end, d
)[ # roll up sequential matches
,.(
seq_match = toString(subseq(seq1, end[1] - n1, end[.N])),
start1 = end[1] - n1,
end1 = end[.N],
start2 = i.end[1] - n1,
end2 = i.end[.N]
), cumsum(c(0L, diff(end) != 1L | diff(i.end) != 1L))
][,cumsum := NULL][]
#> seq_match start1 end1 start2 end2
#> <char> <int> <int> <int> <int>
#> 1: CCTATTCGTTTG 15 26 17 28
#> 2: ATTTCACGTGAGTCA 41 55 40 54
#> 3: ACGACCATTTCTAACATGAATTC 70 92 73 95