|
53 | 53 |
|
54 | 54 | # Before anything, let's parse the headers of this supposed "MAF-like" file and do some checks
|
55 | 55 | my $maf_fh = IO::File->new( $input_maf ) or die "ERROR: Couldn't open input MAF: $input_maf!\n";
|
56 |
| -my ( %uniq_regions, %flanking_bps, @tn_pair, %col_idx, $header_line ); |
| 56 | +my ( %uniq_regions, %filter_tags, %flanking_bps, @tn_pair, %col_idx, $header_line ); |
57 | 57 | while( my $line = $maf_fh->getline ) {
|
58 | 58 |
|
59 | 59 | # If the file uses Mac OS 9 newlines, quit with an error
|
|
89 | 89 | ( %col_idx ) or die "ERROR: Couldn't find a header line (must start with Hugo_Symbol, Chromosome, or Tumor_Sample_Barcode): $input_maf\n";
|
90 | 90 |
|
91 | 91 | # For each variant in the MAF, parse out the locus for running samtools faidx later
|
92 |
| - my ( $chr, $pos, $ref ) = map{ my $c = lc; ( defined $col_idx{$c} ? $cols[$col_idx{$c}] : "" )} qw( Chromosome Start_Position Reference_Allele ); |
| 92 | + my ( $chr, $pos, $ref, $filter ) = map{ my $c = lc; ( defined $col_idx{$c} ? $cols[$col_idx{$c}] : "" )} qw( Chromosome Start_Position Reference_Allele FILTER ); |
93 | 93 | $ref =~ s/^(\?|-|0)+$//; # Blank out the dashes (or other weird chars) used with indels
|
94 | 94 | my $region = "$chr:" . ( $pos - 1 ) . "-" . ( $pos + length( $ref ));
|
95 | 95 | $uniq_regions{$region} = 1;
|
| 96 | + # Also track the unique FILTER tags seen, so we can construct VCF header lines for each |
| 97 | + map{ $filter_tags{$_} = 1 unless( $_ eq "PASS" or $_ eq "." )} split( /,|;/, $filter ); |
96 | 98 | }
|
97 | 99 | $maf_fh->close;
|
98 | 100 |
|
|
152 | 154 | $tn_vcf{$vcf_file} .= "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n";
|
153 | 155 | $tn_vcf{$vcf_file} .= "##FORMAT=<ID=AD,Number=G,Type=Integer,Description=\"Allelic depths of REF and ALT(s) in the order listed\">\n";
|
154 | 156 | $tn_vcf{$vcf_file} .= "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Total read depth across this site\">\n";
|
| 157 | + $tn_vcf{$vcf_file} .= "##FILTER=<ID=$_,Description=\"\">\n" foreach ( sort keys %filter_tags ); |
155 | 158 | $tn_vcf{$vcf_file} .= "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$t_id\t$n_id\n";
|
156 | 159 | }
|
157 | 160 |
|
|
304 | 307 | $vcf_fh->print( "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n" );
|
305 | 308 | $vcf_fh->print( "##FORMAT=<ID=AD,Number=G,Type=Integer,Description=\"Allelic Depths of REF and ALT(s) in the order listed\">\n" );
|
306 | 309 | $vcf_fh->print( "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n" );
|
| 310 | +$vcf_fh->print( "##FILTER=<ID=$_,Description=\"\">\n" ) foreach ( sort keys %filter_tags ); |
307 | 311 | $vcf_fh->print( "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" . join("\t", @vcf_cols) . "\n" );
|
308 | 312 |
|
309 | 313 | # Write each variant into the multi-sample VCF
|
|
0 commit comments