Ytdyklly

Question

Is it possible to pipe a list of numbers into sed?

The pipeline I currently have looks something like:

grep -nP 'foo' file_full.txt | sort | awk -F'[:;t]' '{print $1,$3,$9,$13}'

The output of this:

2374 213 MID=212 GO=1

2462 213 MID=477 GO=137

2394 233 MID=232 GO=1

2464 233 MID=668 GO=1070

2185 24 MID=23 GO=1

2465 24 MID=752 GO=1083

2146 48 MID=354 GO=1010

1893 48 MID=47 GO=1

2219 58 MID=57 GO=1

2463 58 MID=595 GO=1057

My main goal here is to look at $2 for identical values then compare their $4 or GO values. The line with the larger GO value needs to be deleted.

When I add: sed 's/GO=/& /' | sort -k2,2 -k5n | awk 'a[$2]++ {sub(/GO= /,"GO="); print $1}' to the previous pipeline I get:

Which is a list of line numbers I wish to remove from file_full.txt

I know sed -i '2462d;2464d;2465d;2146d;2463d' file_full.txt would work but I am unsure how to pipe each of those numbers from above into one sed command.

Am I missing something?

Raw data:

#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  i0      i1      i2      i3      i4      i5      i6      i7      i8      i9      i10     i11     i12     i13     i14

1       1       .       A       T       1000    PASS    MID=0;S=0.0324764;DOM=0.5;PO=1;GO=1;MT=0;AC=200;DP=1000 GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       2       .       A       T       1000    PASS    MID=1;S=0.0125739;DOM=0.5;PO=1;GO=1;MT=5;AC=200;DP=1000 GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       3       .       A       T       1000    PASS    MID=2;S=-0.0693919;DOM=0.5;PO=1;GO=1;MT=9;AC=200;DP=1000        GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       4       .       A       T       1000    PASS    MID=3;S=0.0611535;DOM=0.5;PO=1;GO=1;MT=12;AC=200;DP=1000        GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       5       .       A       T       1000    PASS    MID=4;S=-0.0791182;DOM=0.5;PO=1;GO=1;MT=16;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       6       .       A       T       1000    PASS    MID=5;S=0.0463103;DOM=0.5;PO=1;GO=1;MT=21;AC=200;DP=1000        GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       7       .       A       T       1000    PASS    MID=6;S=0.0509527;DOM=0.5;PO=1;GO=1;MT=25;AC=200;DP=1000        GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       8       .       A       T       1000    PASS    MID=7;S=-0.0134404;DOM=0.5;PO=1;GO=1;MT=28;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       9       .       A       T       1000    PASS    MID=8;S=-0.00478324;DOM=0.5;PO=1;GO=1;MT=32;AC=200;DP=1000      GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       10      .       A       T       1000    PASS    MID=9;S=0.03588;DOM=0.5;PO=1;GO=1;MT=36;AC=200;DP=1000  GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       11      .       A       T       1000    PASS    MID=10;S=-0.028843;DOM=0.5;PO=1;GO=1;MT=41;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       12      .       A       T       1000    PASS    MID=11;S=-0.0832497;DOM=0.5;PO=1;GO=1;MT=45;AC=200;DP=1000      GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       13      .       A       T       1000    PASS    MID=12;S=0.0389281;DOM=0.5;PO=1;GO=1;MT=48;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       14      .       A       T       1000    PASS    MID=13;S=0.0362106;DOM=0.5;PO=1;GO=1;MT=53;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       15      .       A       T       1000    PASS    MID=14;S=0.0375309;DOM=0.5;PO=1;GO=1;MT=57;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       16      .       A       T       1000    PASS    MID=15;S=0.0112808;DOM=0.5;PO=1;GO=1;MT=60;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       17      .       A       T       1000    PASS    MID=16;S=0.0243286;DOM=0.5;PO=1;GO=1;MT=65;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       18      .       A       T       1000    PASS    MID=17;S=0.0596463;DOM=0.5;PO=1;GO=1;MT=69;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       24      .       A       T       1000    PASS    MID=23;S=-0.0086571;DOM=0.5;PO=1;GO=1;MT=92;AC=199;DP=1000;MULTIALLELIC GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       58      .       A       T       1000    PASS    MID=57;S=-0.0926969;DOM=0.5;PO=1;GO=1;MT=229;AC=198;DP=1000;MULTIALLELIC        GT      1|1     0|1     1|1     1|1     0|1     1|1

1       213     .       A       T       1000    PASS    MID=212;S=-0.0925562;DOM=0.5;PO=1;GO=1;MT=848;AC=196;DP=1000;MULTIALLELIC       GT      1|1     1|1     1|1     1|1     1|1     1|1

1       233     .       A       T       1000    PASS    MID=232;S=-0.0868037;DOM=0.5;PO=1;GO=1;MT=929;AC=199;DP=1000;MULTIALLELIC       GT      1|1     1|1     1|1     1|1     1|1     1|1

1       213     .       A       T       1000    PASS    MID=477;S=0.0600971;DOM=0.5;PO=1;GO=1037;MT=849;AC=4;DP=1000;MULTIALLELIC       GT      0|0     0|0     0|0     0|0     0|0     0|0

1       58      .       A       T       1000    PASS    MID=595;S=0.0450203;DOM=0.5;PO=1;GO=1057;MT=228;AC=2;DP=1000;MULTIALLELIC       GT      0|0     1|0     0|0     0|0     1|0     0|0

1       233     .       A       T       1000    PASS    MID=668;S=-0.0447337;DOM=0.5;PO=1;GO=1070;MT=928;AC=1;DP=1000;MULTIALLELIC      GT      0|0     0|0     0|0     0|0     0|0     0|0

1       24      .       A       T       1000    PASS    MID=752;S=-0.104791;DOM=0.5;PO=1;GO=1083;MT=93;AC=1;DP=1000;MULTIALLELIC        GT      0|0     0|0     0|0     0|0     0|0     0|0

You may not want to read from the file while sed is trying to edit the file... — 7 hours ago
Added actual awk and sed commands to my question; hopefully it makes more sense. — 7 hours ago
Please edit your question and show us the original data. There may well be easier ways to get to this point or even to do the whole thing in one go. — 7 hours ago
Ah! A vcf! I thought these looked familiar! While this is completely on topic here, you might want to consider asking on Bioinformatics in future since there are probably nifty ways of doing this with domain-specific tools like bcftools etc. Also, do you want the entire line of the vcf file? What output are you expecting? — 6 hours ago

score 4 · Accepted Answer · 2019-02-23 00:07:07Z

It's a pity we can't see your original data. Given the result of your pipeline in a file called file, you would get the right result through

$ sort -t ' ' -k2,2 -k4.4n file | sort -u -k2,2

2374 213 MID=212 GO=1

2394 233 MID=232 GO=1

2185 24 MID=23 GO=1

1893 48 MID=47 GO=1

2219 58 MID=57 GO=1

These are the lines in the data with the smallest GO value for each group (where the second field defines the group).

The first sort sorts the data in such a way that each group's GO values are ordered smallest to largest. That -k4.4n should have a dot in it, not a comma. It specifies the actual value after the = in the fourth field as the secondary sort key.

The second sort uses only the group number in the second field and does a unique sort. This has the effect of only leaving the first of each unique group in the output.

Using the raw data in the question:

sed 's/./;./' file |

sort -t ';' -k1,1 -k6.4,6n |

sort -u -t ';' -k1,1 |

sed 's/;././' |

sort -k1,1n -k2,2n

The first two sort calls do the same sort of operation as in the earlier part of this answer.

The first sed replaces the dot in column 3 with ;.. This is done to let us use ; as the field delimiter properly in the two sort calls.
The second sed call restores the original dots.

The final sort gets the data sorted on chromosomes and position, as from the start.

This results in

#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  i0      i1      i2      i3      i4      i5      i6      i7      i8      i9      i10     i11     i12     i13     i14

1       1       .       A       T       1000    PASS    MID=0;S=0.0324764;DOM=0.5;PO=1;GO=1;MT=0;AC=200;DP=1000 GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       2       .       A       T       1000    PASS    MID=1;S=0.0125739;DOM=0.5;PO=1;GO=1;MT=5;AC=200;DP=1000 GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       3       .       A       T       1000    PASS    MID=2;S=-0.0693919;DOM=0.5;PO=1;GO=1;MT=9;AC=200;DP=1000        GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       4       .       A       T       1000    PASS    MID=3;S=0.0611535;DOM=0.5;PO=1;GO=1;MT=12;AC=200;DP=1000        GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       5       .       A       T       1000    PASS    MID=4;S=-0.0791182;DOM=0.5;PO=1;GO=1;MT=16;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       6       .       A       T       1000    PASS    MID=5;S=0.0463103;DOM=0.5;PO=1;GO=1;MT=21;AC=200;DP=1000        GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       7       .       A       T       1000    PASS    MID=6;S=0.0509527;DOM=0.5;PO=1;GO=1;MT=25;AC=200;DP=1000        GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       8       .       A       T       1000    PASS    MID=7;S=-0.0134404;DOM=0.5;PO=1;GO=1;MT=28;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       9       .       A       T       1000    PASS    MID=8;S=-0.00478324;DOM=0.5;PO=1;GO=1;MT=32;AC=200;DP=1000      GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       10      .       A       T       1000    PASS    MID=9;S=0.03588;DOM=0.5;PO=1;GO=1;MT=36;AC=200;DP=1000  GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       11      .       A       T       1000    PASS    MID=10;S=-0.028843;DOM=0.5;PO=1;GO=1;MT=41;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       12      .       A       T       1000    PASS    MID=11;S=-0.0832497;DOM=0.5;PO=1;GO=1;MT=45;AC=200;DP=1000      GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       13      .       A       T       1000    PASS    MID=12;S=0.0389281;DOM=0.5;PO=1;GO=1;MT=48;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       14      .       A       T       1000    PASS    MID=13;S=0.0362106;DOM=0.5;PO=1;GO=1;MT=53;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       15      .       A       T       1000    PASS    MID=14;S=0.0375309;DOM=0.5;PO=1;GO=1;MT=57;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       16      .       A       T       1000    PASS    MID=15;S=0.0112808;DOM=0.5;PO=1;GO=1;MT=60;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       17      .       A       T       1000    PASS    MID=16;S=0.0243286;DOM=0.5;PO=1;GO=1;MT=65;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       18      .       A       T       1000    PASS    MID=17;S=0.0596463;DOM=0.5;PO=1;GO=1;MT=69;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       24      .       A       T       1000    PASS    MID=23;S=-0.0086571;DOM=0.5;PO=1;GO=1;MT=92;AC=199;DP=1000;MULTIALLELIC GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       58      .       A       T       1000    PASS    MID=57;S=-0.0926969;DOM=0.5;PO=1;GO=1;MT=229;AC=198;DP=1000;MULTIALLELIC        GT      1|1     0|1     1|1     1|1     0|1     1|1

1       213     .       A       T       1000    PASS    MID=212;S=-0.0925562;DOM=0.5;PO=1;GO=1;MT=848;AC=196;DP=1000;MULTIALLELIC       GT      1|1     1|1     1|1     1|1     1|1     1|1

1       233     .       A       T       1000    PASS    MID=232;S=-0.0868037;DOM=0.5;PO=1;GO=1;MT=929;AC=199;DP=1000;MULTIALLELIC       GT      1|1     1|1     1|1     1|1     1|1     1|1

Extracting line numbers that you'd like to delete in a pipeline is often the wrong way to go about a task like this. Note that each part of a pipeline runs concurrently with every other part of the same pipeline. This means that you can't start overwriting or modifying a file in one part that you are, at the same time, reading from in another part.

Also note that when extracting data through the stages of a pipeline, you lose the data that you're not passing on. This makes it even harder to allow the one pipeline to modify the original data (because it's lost along the way through the pipeline).

score 4 · Accepted Answer · 2019-02-23 00:29:34Z

You haven't shown the output you expect, but if I understood you correctly, you're looking for something like this (file has the data from your question):

$ sort -t= -k3 -rn file | awk '{a[$2]=$0}END{for(i in a){print a[i]}}'

2185 24 MID=23 GO=1

1893 48 MID=47 GO=1

2219 58 MID=57 GO=1

2374 213 MID=212 GO=1

2394 233 MID=232 GO=1

The idea is to first sort the input on the value of the GO. The -t= sets sort's field separator to =, which makes the number after GO the 3rd field. We sort on that, in reverse numerical order so that the larger numbers come first. Then, the awk will save each line as a value in the array a whose keys are the second fields. Since the file is sorted by the GO value in descending order and each later line overwrites the earlier one stored for the same key, this means we'll always end up keeping the smallest value for each $2. Then, at the end of the file we print the array.

Alternatively, you can do the whole thing directly from the original file:

$ awk -F'[t=;]' '/^[^#]/{

                    if(!a[$1$2] || a[$1$2]>$17){

                        line[$1$2]=$0; 

                        a[$1$2]=$17

                    }

                   } 

                   END{

                    for(i in a){

                        print line[i]

                    }

                   }' file.vcf 

1   1   .   A   T   1000    PASS    MID=0;S=0.0324764;DOM=0.5;PO=1;GO=1;MT=0;AC=200;DP=1000 GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   2   .   A   T   1000    PASS    MID=1;S=0.0125739;DOM=0.5;PO=1;GO=1;MT=5;AC=200;DP=1000 GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   3   .   A   T   1000    PASS    MID=2;S=-0.0693919;DOM=0.5;PO=1;GO=1;MT=9;AC=200;DP=1000    GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   4   .   A   T   1000    PASS    MID=3;S=0.0611535;DOM=0.5;PO=1;GO=1;MT=12;AC=200;DP=1000    GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   5   .   A   T   1000    PASS    MID=4;S=-0.0791182;DOM=0.5;PO=1;GO=1;MT=16;AC=200;DP=1000   GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   6   .   A   T   1000    PASS    MID=5;S=0.0463103;DOM=0.5;PO=1;GO=1;MT=21;AC=200;DP=1000    GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   7   .   A   T   1000    PASS    MID=6;S=0.0509527;DOM=0.5;PO=1;GO=1;MT=25;AC=200;DP=1000    GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   8   .   A   T   1000    PASS    MID=7;S=-0.0134404;DOM=0.5;PO=1;GO=1;MT=28;AC=200;DP=1000   GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   9   .   A   T   1000    PASS    MID=8;S=-0.00478324;DOM=0.5;PO=1;GO=1;MT=32;AC=200;DP=1000  GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   10  .   A   T   1000    PASS    MID=9;S=0.03588;DOM=0.5;PO=1;GO=1;MT=36;AC=200;DP=1000  GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   11  .   A   T   1000    PASS    MID=10;S=-0.028843;DOM=0.5;PO=1;GO=1;MT=41;AC=200;DP=1000   GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   12  .   A   T   1000    PASS    MID=11;S=-0.0832497;DOM=0.5;PO=1;GO=1;MT=45;AC=200;DP=1000  GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   13  .   A   T   1000    PASS    MID=12;S=0.0389281;DOM=0.5;PO=1;GO=1;MT=48;AC=200;DP=1000   GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   14  .   A   T   1000    PASS    MID=13;S=0.0362106;DOM=0.5;PO=1;GO=1;MT=53;AC=200;DP=1000   GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   15  .   A   T   1000    PASS    MID=14;S=0.0375309;DOM=0.5;PO=1;GO=1;MT=57;AC=200;DP=1000   GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   16  .   A   T   1000    PASS    MID=15;S=0.0112808;DOM=0.5;PO=1;GO=1;MT=60;AC=200;DP=1000   GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   17  .   A   T   1000    PASS    MID=16;S=0.0243286;DOM=0.5;PO=1;GO=1;MT=65;AC=200;DP=1000   GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   18  .   A   T   1000    PASS    MID=17;S=0.0596463;DOM=0.5;PO=1;GO=1;MT=69;AC=200;DP=1000   GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   24  .   A   T   1000    PASS    MID=23;S=-0.0086571;DOM=0.5;PO=1;GO=1;MT=92;AC=199;DP=1000;MULTIALLELIC GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   58  .   A   T   1000    PASS    MID=57;S=-0.0926969;DOM=0.5;PO=1;GO=1;MT=229;AC=198;DP=1000;MULTIALLELIC    GT  1|1 0|1 1|1 1|1 0|1 1|1

1   213 .   A   T   1000    PASS    MID=212;S=-0.0925562;DOM=0.5;PO=1;GO=1;MT=848;AC=196;DP=1000;MULTIALLELIC   GT  1|1 1|1 1|1 1|1 1|1 1|1

1   233 .   A   T   1000    PASS    MID=232;S=-0.0868037;DOM=0.5;PO=1;GO=1;MT=929;AC=199;DP=1000;MULTIALLELIC   GT  1|1 1|1 1|1 1|1 1|1 1|1

Here the trick is the -F'[\t=;]' which sets awk's input field separator to any of \t (a tab), =, or ;. That makes the GO value field 17 (assuming all of your lines have the same structure; if they don't, please ask on Bioinformatics since that would be better handled with other tools). The rest means:

/^[^#]/{ } : do this on lines that don't start with a #.

if(!a[$1$2] || a[$1$2]>$17){ : if nothing has been stored yet in the array a for the first and second fields (chromosome and position), or the value stored there is larger than the 17th field of this line

line[$1$2]=$0; : save this line as the value for the combination of fields 1 and 2 in the array line.

a[$1$2]=$17 : save the 17th field as the value for the combination of first and second field in the array a.

END{for(i in a){print line[i]}} : just as above, print the saved lines.

Now, this will require you to store one entire line for each unique value of the second field. That can be a problem if your file is very large.

One, admittedly inelegant, way of avoiding this issue is to basically do what you originally asked for and use the line numbers. Something like:

awk -F'[t=;]' 'NR==FNR && /^[^#]/{

                    if(!a[$1$2] || a[$1$2]>$17){

                        want[$1$2]=NR; 

                        a[$1$2]=$17

                    }

                } 

                NR!=FNR && want[$1$2]==FNR' file.vcf file.vcf

jthilljthill 2,333715 · Accepted Answer · 2019-02-23 05:40:06Z

You can pipe the sed script in through its stdin with the conventional - name. Working off your supplied sample and dropping the unused $9,

grep -n = full.txt 

| awk -F'[:;t]' '{sub(/.*=/,"",$13); print $1,$3,$13 }' 

| sort -nk2,3 

| awk 'last==$2{print $1"d"}last=$2' 

# | sed -i -f- full.txt

If you're on a Mac, they've made a backup extension mandatory, you'll have to say sed -i '' etc to shut it off.

I figure it's better with a file that promises to be large to do as much data reduction as possible before sorting.

When I sub in $1"p" for $1"d" and run it through |sed -nf- full.txt to print the lines it'd delete I get

$ grep -n = full.txt     | awk -F'[:;t]' '{sub(/.*=/,"",$13); print $1,$3,$13 }'     | sort -nk2,3     | awk 'last==$2{print $1"p"}{last=$2}' | sed -nf- full.txt

1       213     .       A       T       1000    PASS    MID=477;S=0.0600971;DOM=0.5;PO=1;GO=1037;MT=849;AC=4;DP=1000;MULTIALLELIC       GT      0|0     0|0     0|0     0|0     0|0     0|0

1       58      .       A       T       1000    PASS    MID=595;S=0.0450203;DOM=0.5;PO=1;GO=1057;MT=228;AC=2;DP=1000;MULTIALLELIC       GT      0|0     1|0     0|0     0|0     1|0     0|0

1       233     .       A       T       1000    PASS    MID=668;S=-0.0447337;DOM=0.5;PO=1;GO=1070;MT=928;AC=1;DP=1000;MULTIALLELIC      GT      0|0     0|0     0|0     0|0     0|0     0|0

1       24      .       A       T       1000    PASS    MID=752;S=-0.104791;DOM=0.5;PO=1;GO=1083;MT=93;AC=1;DP=1000;MULTIALLELIC        GT      0|0     0|0     0|0     0|0     0|0     0|0

$

score 4 · Accepted Answer · 2019-02-23 00:07:07Z

It's a pity we can't see your original data. Given the result of your pipeline in a file called file, you would get the right result through

$ sort -t ' ' -k2,2 -k4.4n file | sort -u -k2,2

2374 213 MID=212 GO=1

2394 233 MID=232 GO=1

2185 24 MID=23 GO=1

1893 48 MID=47 GO=1

2219 58 MID=57 GO=1

These are the lines in the data with the smallest GO value for each group (where the second field defines the group).

The first sort sorts the data in such a way that each group's GO values are ordered smallest to largest. That -k4.4n should have a dot in it, not a comma. It specifies the actual value after the = in the fourth field as the secondary sort key.

The second sort uses only the group number in the second field and does a unique sort. This has the effect of only leaving the first of each unique group in the output.

Using the raw data in the question:

sed 's/./;./' file |

sort -t ';' -k1,1 -k6.4,6n |

sort -u -t ';' -k1,1 |

sed 's/;././' |

sort -k1,1n -k2,2n

The first two sort calls do the same sort of operation as in the earlier part of this answer.

The first sed replaces the dot in column 3 with ;.. This is done to let us use ; as the field delimiter properly in the two sort calls.
The second sed call restores the original dots.

The final sort gets the data sorted on chromosomes and position, as from the start.

This results in

#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  i0      i1      i2      i3      i4      i5      i6      i7      i8      i9      i10     i11     i12     i13     i14

1       1       .       A       T       1000    PASS    MID=0;S=0.0324764;DOM=0.5;PO=1;GO=1;MT=0;AC=200;DP=1000 GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       2       .       A       T       1000    PASS    MID=1;S=0.0125739;DOM=0.5;PO=1;GO=1;MT=5;AC=200;DP=1000 GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       3       .       A       T       1000    PASS    MID=2;S=-0.0693919;DOM=0.5;PO=1;GO=1;MT=9;AC=200;DP=1000        GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       4       .       A       T       1000    PASS    MID=3;S=0.0611535;DOM=0.5;PO=1;GO=1;MT=12;AC=200;DP=1000        GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       5       .       A       T       1000    PASS    MID=4;S=-0.0791182;DOM=0.5;PO=1;GO=1;MT=16;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       6       .       A       T       1000    PASS    MID=5;S=0.0463103;DOM=0.5;PO=1;GO=1;MT=21;AC=200;DP=1000        GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       7       .       A       T       1000    PASS    MID=6;S=0.0509527;DOM=0.5;PO=1;GO=1;MT=25;AC=200;DP=1000        GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       8       .       A       T       1000    PASS    MID=7;S=-0.0134404;DOM=0.5;PO=1;GO=1;MT=28;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       9       .       A       T       1000    PASS    MID=8;S=-0.00478324;DOM=0.5;PO=1;GO=1;MT=32;AC=200;DP=1000      GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       10      .       A       T       1000    PASS    MID=9;S=0.03588;DOM=0.5;PO=1;GO=1;MT=36;AC=200;DP=1000  GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       11      .       A       T       1000    PASS    MID=10;S=-0.028843;DOM=0.5;PO=1;GO=1;MT=41;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       12      .       A       T       1000    PASS    MID=11;S=-0.0832497;DOM=0.5;PO=1;GO=1;MT=45;AC=200;DP=1000      GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       13      .       A       T       1000    PASS    MID=12;S=0.0389281;DOM=0.5;PO=1;GO=1;MT=48;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       14      .       A       T       1000    PASS    MID=13;S=0.0362106;DOM=0.5;PO=1;GO=1;MT=53;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       15      .       A       T       1000    PASS    MID=14;S=0.0375309;DOM=0.5;PO=1;GO=1;MT=57;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       16      .       A       T       1000    PASS    MID=15;S=0.0112808;DOM=0.5;PO=1;GO=1;MT=60;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       17      .       A       T       1000    PASS    MID=16;S=0.0243286;DOM=0.5;PO=1;GO=1;MT=65;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       18      .       A       T       1000    PASS    MID=17;S=0.0596463;DOM=0.5;PO=1;GO=1;MT=69;AC=200;DP=1000       GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       24      .       A       T       1000    PASS    MID=23;S=-0.0086571;DOM=0.5;PO=1;GO=1;MT=92;AC=199;DP=1000;MULTIALLELIC GT      1|1     1|1     1|1     1|1     1|1     1|1     1|1

1       58      .       A       T       1000    PASS    MID=57;S=-0.0926969;DOM=0.5;PO=1;GO=1;MT=229;AC=198;DP=1000;MULTIALLELIC        GT      1|1     0|1     1|1     1|1     0|1     1|1

1       213     .       A       T       1000    PASS    MID=212;S=-0.0925562;DOM=0.5;PO=1;GO=1;MT=848;AC=196;DP=1000;MULTIALLELIC       GT      1|1     1|1     1|1     1|1     1|1     1|1

1       233     .       A       T       1000    PASS    MID=232;S=-0.0868037;DOM=0.5;PO=1;GO=1;MT=929;AC=199;DP=1000;MULTIALLELIC       GT      1|1     1|1     1|1     1|1     1|1     1|1

Extracting line numbers that you'd like to delete in a pipeline is often the wrong way to go about a task like this. Note that each part of a pipeline runs concurrently with every other part of the same pipeline. This means that you can't start overwriting or modifying a file in one part that you are, at the same time, reading from in another part.

Also note that when extracting data through the stages of a pipeline, you lose the data that you're not passing on. This makes it even harder to allow the one pipeline to modify the original data (because it's lost along the way through the pipeline).

score 4 · Accepted Answer · 2019-02-23 00:29:34Z

You haven't shown the output you expect, but if I understood you correctly, you're looking for something like this (file has the data from your question):

$ sort -t= -k3 -rn file | awk '{a[$2]=$0}END{for(i in a){print a[i]}}'

2185 24 MID=23 GO=1

1893 48 MID=47 GO=1

2219 58 MID=57 GO=1

2374 213 MID=212 GO=1

2394 233 MID=232 GO=1

The idea is to first sort the input on the value of the GO. The -t= sets sort's field separator to =, which makes the number after GO the 3rd field. We sort on that, in reverse numerical order so that the larger numbers come first. Then, the awk will save each line as a value in the array a whose keys are the second fields. Since the file is sorted by the GO value in descending order and each later line overwrites the earlier one stored for the same key, this means we'll always end up keeping the smallest value for each $2. Then, at the end of the file we print the array.

Alternatively, you can do the whole thing directly from the original file:

$ awk -F'[t=;]' '/^[^#]/{

                    if(!a[$1$2] || a[$1$2]>$17){

                        line[$1$2]=$0; 

                        a[$1$2]=$17

                    }

                   } 

                   END{

                    for(i in a){

                        print line[i]

                    }

                   }' file.vcf 

1   1   .   A   T   1000    PASS    MID=0;S=0.0324764;DOM=0.5;PO=1;GO=1;MT=0;AC=200;DP=1000 GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   2   .   A   T   1000    PASS    MID=1;S=0.0125739;DOM=0.5;PO=1;GO=1;MT=5;AC=200;DP=1000 GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   3   .   A   T   1000    PASS    MID=2;S=-0.0693919;DOM=0.5;PO=1;GO=1;MT=9;AC=200;DP=1000    GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   4   .   A   T   1000    PASS    MID=3;S=0.0611535;DOM=0.5;PO=1;GO=1;MT=12;AC=200;DP=1000    GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   5   .   A   T   1000    PASS    MID=4;S=-0.0791182;DOM=0.5;PO=1;GO=1;MT=16;AC=200;DP=1000   GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   6   .   A   T   1000    PASS    MID=5;S=0.0463103;DOM=0.5;PO=1;GO=1;MT=21;AC=200;DP=1000    GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   7   .   A   T   1000    PASS    MID=6;S=0.0509527;DOM=0.5;PO=1;GO=1;MT=25;AC=200;DP=1000    GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   8   .   A   T   1000    PASS    MID=7;S=-0.0134404;DOM=0.5;PO=1;GO=1;MT=28;AC=200;DP=1000   GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   9   .   A   T   1000    PASS    MID=8;S=-0.00478324;DOM=0.5;PO=1;GO=1;MT=32;AC=200;DP=1000  GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   10  .   A   T   1000    PASS    MID=9;S=0.03588;DOM=0.5;PO=1;GO=1;MT=36;AC=200;DP=1000  GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   11  .   A   T   1000    PASS    MID=10;S=-0.028843;DOM=0.5;PO=1;GO=1;MT=41;AC=200;DP=1000   GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   12  .   A   T   1000    PASS    MID=11;S=-0.0832497;DOM=0.5;PO=1;GO=1;MT=45;AC=200;DP=1000  GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   13  .   A   T   1000    PASS    MID=12;S=0.0389281;DOM=0.5;PO=1;GO=1;MT=48;AC=200;DP=1000   GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   14  .   A   T   1000    PASS    MID=13;S=0.0362106;DOM=0.5;PO=1;GO=1;MT=53;AC=200;DP=1000   GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   15  .   A   T   1000    PASS    MID=14;S=0.0375309;DOM=0.5;PO=1;GO=1;MT=57;AC=200;DP=1000   GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   16  .   A   T   1000    PASS    MID=15;S=0.0112808;DOM=0.5;PO=1;GO=1;MT=60;AC=200;DP=1000   GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   17  .   A   T   1000    PASS    MID=16;S=0.0243286;DOM=0.5;PO=1;GO=1;MT=65;AC=200;DP=1000   GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   18  .   A   T   1000    PASS    MID=17;S=0.0596463;DOM=0.5;PO=1;GO=1;MT=69;AC=200;DP=1000   GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   24  .   A   T   1000    PASS    MID=23;S=-0.0086571;DOM=0.5;PO=1;GO=1;MT=92;AC=199;DP=1000;MULTIALLELIC GT  1|1 1|1 1|1 1|1 1|1 1|1 1|1

1   58  .   A   T   1000    PASS    MID=57;S=-0.0926969;DOM=0.5;PO=1;GO=1;MT=229;AC=198;DP=1000;MULTIALLELIC    GT  1|1 0|1 1|1 1|1 0|1 1|1

1   213 .   A   T   1000    PASS    MID=212;S=-0.0925562;DOM=0.5;PO=1;GO=1;MT=848;AC=196;DP=1000;MULTIALLELIC   GT  1|1 1|1 1|1 1|1 1|1 1|1

1   233 .   A   T   1000    PASS    MID=232;S=-0.0868037;DOM=0.5;PO=1;GO=1;MT=929;AC=199;DP=1000;MULTIALLELIC   GT  1|1 1|1 1|1 1|1 1|1 1|1

Here the trick is the -F'[\t=;]' which sets awk's input field separator to any of \t (a tab), =, or ;. That makes the GO value field 17 (assuming all of your lines have the same structure; if they don't, please ask on Bioinformatics since that would be better handled with other tools). The rest means:

/^[^#]/{ } : do this on lines that don't start with a #.

if(!a[$1$2] || a[$1$2]>$17){ : if nothing has been stored yet in the array a for the first and second fields (chromosome and position), or the value stored there is larger than the 17th field of this line

line[$1$2]=$0; : save this line as the value for the combination of fields 1 and 2 in the array line.

a[$1$2]=$17 : save the 17th field as the value for the combination of first and second field in the array a.

END{for(i in a){print line[i]}} : just as above, print the saved lines.

Now, this will require you to store one entire line for each unique value of the second field. That can be a problem if your file is very large.

One, admittedly inelegant, way of avoiding this issue is to basically do what you originally asked for and use the line numbers. Something like:

awk -F'[t=;]' 'NR==FNR && /^[^#]/{

                    if(!a[$1$2] || a[$1$2]>$17){

                        want[$1$2]=NR; 

                        a[$1$2]=$17

                    }

                } 

                NR!=FNR && want[$1$2]==FNR' file.vcf file.vcf

jthilljthill 2,333715 · Accepted Answer · 2019-02-23 05:40:06Z

You can pipe the sed script in through its stdin with the conventional - name. Working off your supplied sample and dropping the unused $9,

grep -n = full.txt 

| awk -F'[:;t]' '{sub(/.*=/,"",$13); print $1,$3,$13 }' 

| sort -nk2,3 

| awk 'last==$2{print $1"d"}last=$2' 

# | sed -i -f- full.txt

If you're on a Mac, they've made a backup extension mandatory, you'll have to say sed -i '' etc to shut it off.

I figure it's better with a file that promises to be large to do as much data reduction as possible before sorting.

When I sub in $1"p" for $1"d" and run it through |sed -nf- full.txt to print the lines it'd delete I get

$ grep -n = full.txt     | awk -F'[:;t]' '{sub(/.*=/,"",$13); print $1,$3,$13 }'     | sort -nk2,3     | awk 'last==$2{print $1"p"}{last=$2}' | sed -nf- full.txt

1       213     .       A       T       1000    PASS    MID=477;S=0.0600971;DOM=0.5;PO=1;GO=1037;MT=849;AC=4;DP=1000;MULTIALLELIC       GT      0|0     0|0     0|0     0|0     0|0     0|0

1       58      .       A       T       1000    PASS    MID=595;S=0.0450203;DOM=0.5;PO=1;GO=1057;MT=228;AC=2;DP=1000;MULTIALLELIC       GT      0|0     1|0     0|0     0|0     1|0     0|0

1       233     .       A       T       1000    PASS    MID=668;S=-0.0447337;DOM=0.5;PO=1;GO=1070;MT=928;AC=1;DP=1000;MULTIALLELIC      GT      0|0     0|0     0|0     0|0     0|0     0|0

1       24      .       A       T       1000    PASS    MID=752;S=-0.104791;DOM=0.5;PO=1;GO=1083;MT=93;AC=1;DP=1000;MULTIALLELIC        GT      0|0     0|0     0|0     0|0     0|0     0|0

$

搜尋此網誌

Ytdyklly

Piping Multiple Numbers into Sed

3 Answers
3

Your Answer

Post as a guest

3 Answers
3

3 Answers
3

Post as a guest

Popular posts from this blog

How to make a Squid Proxy server?

第一次世界大戦

Touch on Surface Book

Piping Multiple Numbers into Sed

3 Answers 3

Your Answer

Sign up or log in

Post as a guest

Post as a guest

3 Answers 3

3 Answers 3

Sign up or log in

Post as a guest

Post as a guest

Sign up or log in

Post as a guest

Sign up or log in

Post as a guest

Sign up or log in

Post as a guest

Popular posts from this blog

How to make a Squid Proxy server?

第一次世界大戦

Touch on Surface Book

3 Answers
3

3 Answers
3

3 Answers
3