我有几个文件。
# Sample input 1: header "val 1, val 2, val 5, val 6" followed by five data rows.
cat << EOF > file1.csv
val 1, val 2, val 5, val 6
34,12,23,54
22,32,22,43
12,32,11,31
32,32,44,22
32,34,23,45
EOF
# Sample input 2: header "val 5, val 2, val 3, val 4, val 7" followed by seven
# data rows. Note that "val 5" and "val 2" sit in different positions than in
# file1.csv.
cat << EOF > file2.csv
val 5, val 2, val 3, val 4, val 7
4,82,13,4,33
2,22,42,3,23
1,42,51,1,67
33,42,84,2,43
42,44,93,5,56
23,42,32,32,34
32,43,22,11,2
EOF
我想打印两个文件中“val 2”和“val 5”下的列,并将它们写入单独的文件中。另外,file1.csv 有 5 行,而 file2.csv 有 7 行,因此缺失值将打印为“?”。
我的脚本:
# Attempt to pair the "val 2" and "val 5" columns of file1.csv and file2.csv
# row by row. NOTE(review): this is the non-working attempt under discussion;
# the comments inside point out where it departs from the stated goal.
awk 'BEGIN { FS=", "; OFS=" " }
# NOTE(review): FS is a comma followed by one space. That matches the header
# lines, but data rows written like 34,12,23,54 contain no ", " and are not
# split, so on such rows the whole line is $1 and $2, $3, $5 are empty.
#
# First file (NR==FNR): remember fields 2 and 3 of every non-header line,
# keyed by data-row number (FNR-1).
NR==FNR {
if (FNR > 1) {
val2[FNR-1] = $2
val5[FNR-1] = $3
}
next
}
# Any later file: append fields 2 and 5 to the entry stored for the same row.
# NOTE(review): the field positions are hard-coded instead of being looked up
# in the header; in the sample file2.csv header "val 5" is the first column.
# NOTE(review): rows past the end of the first file get no "?" placeholder
# here, only a leading space before the appended value.
{
if (FNR > 1) {
val2[FNR-1] = val2[FNR-1] " " $2
val5[FNR-1] = val5[FNR-1] " " $5
}
}
# Print both collections to standard output, each under a title line.
# NOTE(review): nothing is written to separate output files.
# NOTE(review): the loops run up to NR, the total number of records read from
# all inputs (header lines included), which is more than the number of stored
# rows, so the else branch prints extra "?" lines after the data.
# NOTE(review): length() applied to an array may not be available in every
# awk -- confirm for the awk in use.
END {
print "val 2"
for (i=1; i<=NR; i++) {
if (i <= length(val2)) {
print val2[i]
} else {
print "?"
}
}
print ""
print "val 5"
for (i=1; i<=NR; i++) {
if (i <= length(val5)) {
print val5[i]
} else {
print "?"
}
}
}' file1.csv file2.csv
这个脚本能打印出一些内容,但结果不正确。
期望的输出如下:
cat << EOF > output_val_2.txt
12 82
32 22
32 42
32 42
34 44
? 42
? 43
EOF
cat << EOF > output_val_5.txt
23 4
22 2
11 1
44 33
23 42
? 23
? 32
EOF
这看起来正是适合使用 join 命令的场景。
#!/bin/bash
# Usage: output.sh HEADER FILE1 FILE2
#
# Pairs, line by line, the column named HEADER from two CSV files and prints
# the two values side by side; "?" stands in where one file has no line.
#
#   $1 - header text of the column to extract (e.g. 'val 2')
#   $2 - first CSV file
#   $3 - second CSV file
header="$1"
file1="$2"
file2="$3"
# Print the join(1) field number of the column whose header equals $1 in the
# CSV file $2. (Despite the name, the result is a column index, not a row.)
#   $1 - header text to look for, compared as an exact string
#   $2 - path to a CSV file whose first line holds the column headers
# Output:  the 1-based column position plus one, on stdout. The extra one
#          accounts for the line-number field that nl puts in front of every
#          line before the files are joined.
# Returns: 0 when the header is found; 1, with a message on stderr and nothing
#          on stdout, when it is missing or the file cannot be read.
rownum () {
  awk -v wanted="$1" '
    NR == 1 {
      # Header names are separated by a comma plus optional whitespace.
      count = split($0, names, /,[ \t\r\f\v]*/)
      for (i = 1; i <= count; i++) {
        # Appending "" forces a string comparison, so a header such as
        # "01" is never treated as equal to "1".
        if ((names[i] "") == (wanted "")) {
          print i + 1
          found = 1
          break
        }
      }
    }
    { exit }                # only the header line is ever needed
    END { exit !found }
  ' "$2" || {
    printf 'rownum: no column named "%s" in %s\n' "$1" "$2" >&2
    return 1
  }
}
# Pair the requested column of both files line by line.
#   * nl prefixes every line with its line number, which serves as the join
#     key. -ba numbers blank lines as well, and -nrz zero-pads the number so
#     the keys stay in the lexical order join expects once a file reaches ten
#     lines (unpadded, "10" would sort before "9").
#   * column -ts, turns the comma-separated fields into whitespace-separated
#     ones, which is what join splits on by default.
#   * -a1 -a2 keep lines present in only one file, and -e'?' fills the field
#     that is missing from the other side.
#   * tail -n +2 drops the header line; the final column -t aligns the output.
join -e'?' -a1 -a2 \
  -o "1.$(rownum "$header" "$file1"),2.$(rownum "$header" "$file2")" \
  <(nl -ba -nrz "$file1" | column -ts,) \
  <(nl -ba -nrz "$file2" | column -ts,) \
  | tail -n +2 \
  | column -t
% ./output.sh 'val 2' file1.csv file2.csv
12 82
32 22
32 42
32 42
34 44
? 42
? 43
% ./output.sh 'val 5' file1.csv file2.csv
23 4
22 2
11 1
44 33
23 42
? 23
? 32
下面的脚本可在任何 awk 中运行;为了让代码尽可能易于理解,特意使用了比实际需要更多的变量,以及冗长但含义明确的变量名:
$ cat tst.awk
# Set the input field separator and index the requested column names.
# "tags" is not assigned anywhere in this script; it is expected to be
# supplied on the command line (e.g. -v tags='val 2,val 5').
BEGIN {
# Fields are separated by a comma followed by any number of spaces.
FS = ", *"
# split() without a separator argument uses FS, so the tag list is cut at the
# same comma-plus-spaces boundaries. tagNr2tag maps 1..numTags to tag names.
numTags = split(tags,tagNr2tag)
# Build the reverse lookup: tag name -> its position in the tag list.
for ( tagNr in tagNr2tag ) {
tag = tagNr2tag[tagNr]
tag2tagNr[tag] = tagNr
}
}
# Header line of each input file: start a new output column for the file and
# record which field holds each requested tag.
FNR==1 {
# Row numbering restarts for every file; row 1 is the title row.
rowNr = 1
# Each input file becomes the next output column.
colNr = ++numCols
# Store the file name as the title cell (row 1) of this column for every tag.
# Cells are keyed by tag number, row number and column number joined with RS.
for ( tagNr in tagNr2tag ) {
idx = tagNr RS rowNr RS colNr
vals[idx] = FILENAME
}
# Discard the field positions recorded for the previous file.
delete tagNr2fldNr
# Scan this header; for every field that names a requested tag, remember
# the field number under that tag number.
for ( fldNr=1; fldNr<=NF; fldNr++ ) {
tag = $fldNr
if ( tag in tag2tagNr ) {
tagNr = tag2tagNr[tag]
tagNr2fldNr[tagNr] = fldNr
}
}
# The header line holds no data, so skip the remaining rules for it.
next
}
# Every line that reaches this rule: store the value of each requested tag
# that has a known field position (tagNr2fldNr, filled in by the header
# handling) under the current row and column.
{
++rowNr
# Keep the largest row number seen in any file.
numRows = ( rowNr > numRows ? rowNr : numRows )
for ( tagNr=1; tagNr<=numTags; tagNr++ ) {
# Tags with no recorded field position store nothing for this row.
if ( tagNr in tagNr2fldNr ) {
fldNr = tagNr2fldNr[tagNr]
val = $fldNr
idx = tagNr RS rowNr RS colNr
vals[idx] = val
}
}
}
# Write one output file per tag, holding one row per stored row number and
# one column per input file.
END {
for ( tagNr=1; tagNr<=numTags; tagNr++ ) {
# Close the file written in the previous iteration before opening the next
# one (on the first pass "out" is still unset).
close(out)
# File name is "output_<tag>.txt" with each run of spaces replaced by "_".
out = "output_" tagNr2tag[tagNr] ".txt"
gsub(/ +/,"_",out)
for ( rowNr=1; rowNr<=numRows; rowNr++ ) {
for ( colNr=1; colNr<=numCols; colNr++ ) {
idx = tagNr RS rowNr RS colNr
# A cell that was never stored is printed as "?".
val = ( idx in vals ? vals[idx] : "?" )
# OFS goes between columns, ORS ends the row after the last column.
printf "%s%s", val, ( colNr<numCols ? OFS : ORS ) > out
}
}
}
}
$ awk -v tags='val 2,val 5' -f tst.awk file1.csv file2.csv
$ head output_*.txt
==> output_val_2.txt <==
file1.csv file2.csv
12 82
32 22
32 42
32 42
34 44
? 42
? 43
==> output_val_5.txt <==
file1.csv file2.csv
23 4
22 2
11 1
44 33
23 42
? 23
? 32
如果您不希望输出中包含由输入文件名组成的列标题行,则只需让
for ( rowNr=1; rowNr<=numRows; rowNr++ )
这个输出循环从 rowNr=2 而不是 rowNr=1 开始。
上述脚本适用于任意数量的输入文件、任意数量的待打印列,也适用于缺少某些目标列的文件,并且会按照您在命令行参数中给出文件名的顺序输出各列,例如:
$ cp file1.csv file3.csv
$ awk -v tags='val 5,val 4,val 1' -f tst.awk file2.csv file3.csv file1.csv
$ head output_*
==> output_val_1.txt <==
file2.csv file3.csv file1.csv
? 34 34
? 22 22
? 12 12
? 32 32
? 32 32
? ? ?
? ? ?
==> output_val_4.txt <==
file2.csv file3.csv file1.csv
4 ? ?
3 ? ?
1 ? ?
2 ? ?
5 ? ?
32 ? ?
11 ? ?
==> output_val_5.txt <==
file2.csv file3.csv file1.csv
4 23 23
2 22 22
1 11 11
33 44 44
42 23 23
23 ? ?
32 ? ?