如果有人能帮我把多个文件(最多 8 个)按两个公共列($1 和 $2)合并,我将非常感激。我想取出每个文件中 $3 的值,某个文件中缺失的值用 0 填充。下面是 4 个文件的示例:
文件1:
chr1 111001 234
chr2 22099 108
文件2:
chr1 111001 42
chr1 430229 267
文件3:
chr1 111001 92
chr5 663800 311
文件4:
chr1 111001 129
chr2 22099 442
所需输出
chr1 111001 234 42 92 129
chr1 430229 0 267 0 0
chr2 22099 108 0 0 442
chr5 663800 0 0 311 0
我试过了
awk -v OFS='\t' '
# Gather every third-column value seen for each ($1, $2) pair, in input order.
{
    # The key is field 1, OFS, field 2, followed by a trailing FS.
    pair = $1 OFS $2 FS
    # No separator before the first value stored for a pair.
    glue = (seen[pair] == "") ? "" : OFS
    seen[pair] = seen[pair] glue $3
}
# Emit each pair, a literal "0" column, then the values collected for it.
END {
    for (pair in seen)
        print pair, "0", seen[pair]
}' file1.txt file2.txt file3.txt file4.txt | sort -k1
输出
chr1 111001 0 234 42 92 129
chr1 430229 0 267
chr2 22099 0 108 442
chr5 663800 0 311
提前非常感谢
您可以使用下面这个 gnu-awk 方案:
awk 'BEGIN {
  # One " 0" placeholder per command-line argument, so every key starts
  # out with a zero in each file column.
  for (k = 1; k < ARGC; ++k)
    zeros = zeros " 0"
}
{
  key = $1 OFS $2
  if (!(key in map))
    map[key] = zeros
  # Skip the first ARGIND-1 columns, then overwrite the next one with $3.
  # Columns are matched as runs of non-blanks rather than digits only, so a
  # decimal or negative value stored earlier cannot stop later files from
  # matching. NOTE: an & or a backslash in $3 is still special to gensub.
  map[key] = gensub("^(( [^ ]+){" (ARGIND - 1) "}) [^ ]+", "\\1 " $3, 1, map[key])
}
END {
  # gawk-only: visit the keys in ascending string order.
  PROCINFO["sorted_in"] = "@ind_str_asc"
  for (key in map)
    print key map[key]
}' file{1..4} | column -t
chr1 111001 234 42 92 129
chr1 430229 0 267 0 0
chr2 22099 108 0 0 442
chr5 663800 0 0 311 0
说明:
BEGIN 块为每个命令行参数(输入文件)各拼接一个 0,作为每个键的初始值。
我们使用 gensub 并借助 ARGIND(当前参数索引)构建正则表达式,把第 ARGIND 个位置上的值替换为 $3。
END 块只是打印出存储在 map 中的关联数组内容。
column -t 用于以表格形式显示数据。
下面是一个可在 POSIX awk(非 GNU)中工作的等效命令:
awk 'BEGIN {
  # One " 0" placeholder per command-line argument.
  for (k = 1; k < ARGC; ++k)
    s = s " " 0
}
FNR == 1 {
  # Find the ARGV slot of the file that just started. Counting FNR == 1
  # events alone would mis-number the columns after an empty file, because
  # a file with no records never triggers this rule.
  slot = argind + 1
  for (k = slot; k < ARGC; ++k)
    if (ARGV[k] == FILENAME) {
      slot = k
      break
    }
  # When no ARGV entry matches FILENAME, fall back to the next slot.
  argind = slot
}
{
  key = $1 OFS $2
  if (!(key in map))
    map[key] = s
  # Unpack the row, overwrite the column of the current file, pack it again.
  split(map[key], a)
  a[argind] = $3
  v = ""
  for (k = 1; k < ARGC; ++k)
    v = v " " a[k]
  map[key] = v
}
END {
  # The for-in visiting order is unspecified; the trailing sort fixes the
  # row order (field 1 as text, then field 2 numerically).
  for (k in map)
    print k map[k]
}' file{1..4} | sort -k1,1 -k2,2n
再提供一个变体,请尝试以下方案,它是根据所示示例编写并测试的。
awk '
{
  # Record each input file once, in the order its first record is read.
  # A file that yields no records never gets a column here.
  if(!a[FILENAME]++){
    file[++count]=FILENAME
  }
  # Last field of the record, stored per (field 1, field 2, file name).
  b[$1 OFS $2 OFS FILENAME]=$NF
  # Remember every distinct (field 1, field 2) pair in first-seen order.
  if(!d[$1 OFS $2]++){
    e[++count1]=$1 OFS $2
  }
}
END{
  # count1 is the number of distinct pairs, so no length(array) call
  # (which some older awk implementations reject) is needed.
  for(i=1;i<=count1;i++){
    printf("%s",e[i])
    for(j=1;j<=count;j++){
      # Exactly one OFS before each value and ORS after the last one, so the
      # row has no doubled separators and no trailing blank.
      printf("%s%s%s",OFS,(b[e[i] OFS file[j]]!=""?b[e[i] OFS file[j]]:0),j==count?ORS:"")
    }
  }
}
' file{1..4} | sort -k1
输出如下。
chr1 111001 234 42 92 129
chr1 430229 0 267 0 0
chr2 22099 108 0 0 442
chr5 663800 0 0 311 0
说明:下面为上述代码逐行添加了详细注释。
awk '                                  ##Start of the awk program.
{
  if(!a[FILENAME]++){                  ##True only the first time a record of the current file is read.
    file[++count]=FILENAME             ##Store the file name in array file under the next index count.
  }
  b[$1 OFS $2 OFS FILENAME]=$NF        ##Array b: index is field 1, field 2 and the file name; value is the last field.
  if(!d[$1 OFS $2]++){                 ##True only the first time this pair of 1st and 2nd fields is seen.
    e[++count1]=$1 OFS $2              ##Store the pair in array e under the next index count1 (first-seen order).
  }
}
END{                                   ##Start of the END block.
  for(i=1;i<=count1;i++){              ##Loop over the count1 distinct pairs; no length(array) call is needed.
    printf("%s",e[i])                  ##Print the pair without a trailing separator.
    for(j=1;j<=count;j++){             ##Loop over the recorded files.
      printf("%s%s%s",OFS,(b[e[i] OFS file[j]]!=""?b[e[i] OFS file[j]]:0),j==count?ORS:"")  ##Print OFS, then the stored value (or 0 when there is none), then ORS after the last file.
    }
  }
}
' file{1..4} | sort -k1                ##Read file1 to file4 and sort the output starting at the 1st field.
编辑:根据伟大的正则表达式大师 @anubhava 先生的评论,添加一个在 GNU awk 中使用 ARGC 和 ARGV 的解决方案。
awk '
{
  # Last field of the record, stored per (field 1, field 2, file name).
  b[$1 OFS $2 OFS FILENAME]=$NF
  # Remember every distinct (field 1, field 2) pair in first-seen order.
  if(!d[$1 OFS $2]++){
    e[++count1]=$1 OFS $2
  }
}
END{
  # Each ARGV[1..ARGC-1] entry gets a column, including files that
  # contributed no records.
  count=(ARGC-1)
  # count1 is the number of distinct pairs, so no length(array) call is needed.
  for(i=1;i<=count1;i++){
    printf("%s",e[i])
    for(j=1;j<=count;j++){
      # Exactly one OFS before each value and ORS after the last one.
      printf("%s%s%s",OFS,(b[e[i] OFS ARGV[j]]!=""?b[e[i] OFS ARGV[j]]:0),j==count?ORS:"")
    }
  }
}
' file{1..4} | sort -k1
这些文件看起来像是源自 vcf 或 bed 文件。如果是这样,请不要重新发明轮子,而应使用专门的生物信息学工具来操作这些文件。例如:bedtools、bcftools、Picard MergeVcfs 等。
可以通过搜索 merge bed files 或 merge vcf files 查找更多信息。大多数生物信息学工具/软件包都可以使用 conda 从 bioconda 频道安装。
在完成 bed/vcf 文件的合并/连接/求交集等操作之后,如果文件已不再是任何常见的生物信息学格式,再使用常见的 *NIX 实用程序和脚本语言来提取和处理数据。