06845edb9038bb632170891cebcdc8f477f2ea8d angie Fri Dec 4 08:59:38 2015 -0800 Add dbNSFP v3.1a (including VEST scores) to hg38 for hgVai. Add hgVai options for adding in transcript status info: GENCODE tags when applicable, knownCanonical for knownGene, refSeqStatus for refGene. refs #16502, #16503 diff --git src/hg/utils/dbNsfpToBed.pl src/hg/utils/dbNsfpToBed.pl index 9eeaa2c..0361f26 100755 --- src/hg/utils/dbNsfpToBed.pl +++ src/hg/utils/dbNsfpToBed.pl @@ -1,379 +1,477 @@ #!/usr/bin/env perl # This is under source control -- make sure you are editing # kent/src/hg/utils/dbNsfpToBed.pl # The database of non-synonymous functional predictions (dbNSFP) # contains precomputed scores from a wide variety of tools on all # non-synon variants of all genomic positions in the CDS of Gencode -# transcripts. Pick out some interesting subsets of its 52 columns +# transcripts. Pick out some interesting subsets of its many columns # and translate into bigBed and bigWig files that can be joined with # users' variants by the Variant Annotation Integrator (#6152). use warnings; use strict; -# As of dbNSFP 2.0 (Feb. 2013), the files have 52 tab-separated columns +# As of dbNSFP 3.1a (Nov. 2015), the files had 143 tab-separated columns # described by a #header line with a label for each column. +# (Back in dbNSFP 2.0 (Feb. 2013), the files had a mere 52 columns.) # Expect to find these input header column labels, in this order; # also define corresponding symbolic names for internal use: my @columnNames = ( [ "chr", "chrom" ], - [ "pos(1-coor)", "hg19Start" ], + [ "pos(1-based)", "hg38Pos" ], [ "ref", "refAl" ], [ "alt", "altAl" ], [ "aaref", "refAa" ], [ "aaalt", "altAa" ], - [ "hg18_pos(1-coor)", "hg18Start" ], + [ "rs_dbSNP144", "" ], + [ "hg19_chr", "hg19Chrom" ], + [ "hg19_pos(1-based)", "hg19Pos" ], + [ "hg18_chr", "hg18Chrom" ], + [ "hg18_pos(1-based)", "hg18Pos" ], [ "genename", "geneName" ], - [ "Uniprot_acc", "uniProtAcc" ], - [ "Uniprot_id", "uniProtId" ], - [ "Uniprot_aapos", "uniProtAaPos" ], - [ "Interpro_domain", "interProDomain" ], [ "cds_strand", "cdsStrand" ], [ "refcodon", "refCodon" ], - [ "SLR_test_statistic ", "slrTestStat" ], [ "codonpos", "codonPos" ], - [ "fold-degenerate", "foldDegen" ], - [ "Ancestral_allele", "ancestralAl" ], + [ "codon_degeneracy", "" ], + [ "Ancestral_allele", "" ], + [ "AltaiNeandertal", "" ], + [ "Denisova", "" ], [ "Ensembl_geneid", "ensGeneId" ], [ "Ensembl_transcriptid", "ensTxId" ], + [ "Ensembl_proteinid", "" ], [ "aapos", "aaPos" ], [ "SIFT_score", "siftScore" ], + [ "SIFT_converted_rankscore", "" ], + [ "SIFT_pred", "" ], + [ "Uniprot_acc_Polyphen2", "uniProtAcc" ], + [ "Uniprot_id_Polyphen2", "uniProtId" ], + [ "Uniprot_aapos_Polyphen2", "uniProtAaPos" ], [ "Polyphen2_HDIV_score", "pp2HDivScore" ], + [ "Polyphen2_HDIV_rankscore", "" ], [ "Polyphen2_HDIV_pred", "pp2HDivPred" ], [ "Polyphen2_HVAR_score", "pp2HVarScore" ], + [ "Polyphen2_HVAR_rankscore", "" ], [ "Polyphen2_HVAR_pred", "pp2HVarPred" ], [ "LRT_score", "lrtScore" ], + [ "LRT_converted_rankscore", "" ], [ "LRT_pred", "lrtPred" ], + [ "LRT_Omega", "lrtOmega" ], [ "MutationTaster_score", "mtScore" ], + [ "MutationTaster_converted_rankscore", "" ], [ "MutationTaster_pred", "mtPred" ], + [ "MutationTaster_model", "" ], + [ "MutationTaster_AAE", "" ], + [ "Uniprot_id_MutationAssessor", "" ], + [ "Uniprot_variant_MutationAssessor", "" ], [ "MutationAssessor_score", "maScore" ], + [ "MutationAssessor_rankscore", "" ], [ "MutationAssessor_pred", "maPred" ], - [ "FATHMM_score", "fathmmScore" ], + [ "FATHMM_score", "" ], + [ "FATHMM_converted_rankscore", "" ], + [ "FATHMM_pred", "" ], + [ "PROVEAN_score", "" ], + [ "PROVEAN_converted_rankscore", "" ], + [ "PROVEAN_pred", "" ], + [ "Transcript_id_VEST3", "vestNm" ], + [ "Transcript_var_VEST3", "vestVar" ], + [ "VEST3_score", "vestScore" ], + [ "VEST3_rankscore", "" ], + [ "CADD_raw", "" ], + [ "CADD_raw_rankscore", "" ], + [ "CADD_phred", "" ], + [ "DANN_score", "" ], + [ "DANN_rankscore", "" ], + [ "fathmm-MKL_coding_score", "" ], + [ "fathmm-MKL_coding_rankscore", "" ], + [ "fathmm-MKL_coding_pred", "" ], + [ "fathmm-MKL_coding_group", "" ], + [ "MetaSVM_score", "" ], + [ "MetaSVM_rankscore", "" ], + [ "MetaSVM_pred", "" ], + [ "MetaLR_score", "" ], + [ "MetaLR_rankscore", "" ], + [ "MetaLR_pred", "" ], + [ "Reliability_index", "" ], + [ "integrated_fitCons_score", "" ], + [ "integrated_fitCons_rankscore", "" ], + [ "integrated_confidence_value", "" ], + [ "GM12878_fitCons_score", "" ], + [ "GM12878_fitCons_rankscore", "" ], + [ "GM12878_confidence_value", "" ], + [ "H1-hESC_fitCons_score", "" ], + [ "H1-hESC_fitCons_rankscore", "" ], + [ "H1-hESC_confidence_value", "" ], + [ "HUVEC_fitCons_score", "" ], + [ "HUVEC_fitCons_rankscore", "" ], + [ "HUVEC_confidence_value", "" ], [ "GERP++_NR", "gerpNr" ], [ "GERP++_RS", "gerpRs" ], - [ "phyloP", "phyloP" ], - [ "29way_pi", "siPhy29wayPi" ], - [ "29way_logOdds", "siPhy29wayLogOdds" ], - [ "LRT_Omega", "lrtOmega" ], - [ "UniSNP_ids", "uniSnpIds" ], - [ "1000Gp1_AC", "tgpAc" ], - [ "1000Gp1_AF", "tgpAf" ], - [ "1000Gp1_AFR_AC", "tgpAfrAc" ], - [ "1000Gp1_AFR_AF", "tgpAfrAf" ], - [ "1000Gp1_EUR_AC", "tgpEurAc" ], - [ "1000Gp1_EUR_AF", "tgpEurAf" ], - [ "1000Gp1_AMR_AC", "tgpAmrAc" ], - [ "1000Gp1_AMR_AF", "tgpAmrAf" ], - [ "1000Gp1_ASN_AC", "tgpAsnAc" ], - [ "1000Gp1_ASN_AF", "tgpAsnAf" ], - [ "ESP6500_AA_AF", "esp6500AaAf" ], - [ "ESP6500_EA_AF", "esp6500EaAf" ], + [ "GERP++_RS_rankscore", "" ], + [ "phyloP7way_vertebrate", "" ], + [ "phyloP7way_vertebrate_rankscore", "" ], + [ "phyloP20way_mammalian", "" ], + [ "phyloP20way_mammalian_rankscore", "" ], + [ "phastCons7way_vertebrate", "" ], + [ "phastCons7way_vertebrate_rankscore", "" ], + [ "phastCons20way_mammalian", "" ], + [ "phastCons20way_mammalian_rankscore", "" ], + [ "SiPhy_29way_pi", "" ], + [ "SiPhy_29way_logOdds", "" ], + [ "SiPhy_29way_logOdds_rankscore", "" ], + [ "1000Gp3_AC", "" ], + [ "1000Gp3_AF", "" ], + [ "1000Gp3_AFR_AC", "" ], + [ "1000Gp3_AFR_AF", "" ], + [ "1000Gp3_EUR_AC", "" ], + [ "1000Gp3_EUR_AF", "" ], + [ "1000Gp3_AMR_AC", "" ], + [ "1000Gp3_AMR_AF", "" ], + [ "1000Gp3_EAS_AC", "" ], + [ "1000Gp3_EAS_AF", "" ], + [ "1000Gp3_SAS_AC", "" ], + [ "1000Gp3_SAS_AF", "" ], + [ "TWINSUK_AC", "" ], + [ "TWINSUK_AF", "" ], + [ "ALSPAC_AC", "" ], + [ "ALSPAC_AF", "" ], + [ "ESP6500_AA_AC", "" ], + [ "ESP6500_AA_AF", "" ], + [ "ESP6500_EA_AC", "" ], + [ "ESP6500_EA_AF", "" ], + [ "ExAC_AC", "" ], + [ "ExAC_AF", "" ], + [ "ExAC_Adj_AC", "" ], + [ "ExAC_Adj_AF", "" ], + [ "ExAC_AFR_AC", "" ], + [ "ExAC_AFR_AF", "" ], + [ "ExAC_AMR_AC", "" ], + [ "ExAC_AMR_AF", "" ], + [ "ExAC_EAS_AC", "" ], + [ "ExAC_EAS_AF", "" ], + [ "ExAC_FIN_AC", "" ], + [ "ExAC_FIN_AF", "" ], + [ "ExAC_NFE_AC", "" ], + [ "ExAC_NFE_AF", "" ], + [ "ExAC_SAS_AC", "" ], + [ "ExAC_SAS_AF", "" ], + [ "clinvar_rs", "" ], + [ "clinvar_clnsig", "" ], + [ "clinvar_trait", "" ], + [ "Interpro_domain", "interProDomain" ], + [ "GTEx_V6_gene", "" ], + [ "GTEx_V6_tissue", "" ], ); # Define subs to use in lieu of string constants for symbolic names: # ('use constant' doesn't work with 'use strict'; # see http://stackoverflow.com/questions/3057027/using-constants-in-perl) -map { my $symbol = $_->[1]; eval "sub $symbol () { $symbol; }" } @columnNames; +eval join(" ", + map { my $symbol = $_->[1]; $symbol ? "sub $symbol () { $symbol; } " : "" } + @columnNames); # Finally, the interesting part! Which columns will we tease out into independent BED files? my %subsetColumns = ( Sift => [ refAl(), ensTxId(), altAl(), siftScore() ], # PolyPhen2 scores are not directly related to ensTxIds, # but rather to UniProt seqs. # We can use uniProt db to recover ensTxIds via extDb and extDbRef: # select extAcc1 from extDb,extDbRef # where val = "Ensembl" and id = extDb and acc = "P51172"; PolyPhen2 => [ refAl(), uniProtAaPos(), altAl(), pp2HDivScore(), pp2HDivPred(), pp2HVarScore(), pp2HVarPred() ], SeqChange => [ refAl(), ensTxId(), ensGeneId(), cdsStrand(), refCodon(), codonPos(), refAa(), aaPos(), altAl(), altAa() ], Lrt => [ refAl(), ensTxId(), altAl(), lrtScore(), lrtPred(), lrtOmega() ], MutationTaster => [ refAl(), ensTxId(), altAl(), mtScore(), mtPred() ], MutationAssessor => [ refAl(), ensTxId(), altAl(), maScore(), maPred() ], + # New in v3: VEST, from Rachel Karchin's lab at JHU + Vest => [ refAl(), ensTxId(), vestNm(), altAl(), + vestVar(), vestScore() ], + # These subsets are related to position or transcript, not alleles: GerpNr => [ gerpNr() ], GerpRs => [ gerpRs() ], InterPro => [ interProDomain() ], # Note: these were taken from PolyPhen2! Not directly related to ensTxIds. UniProt => [ uniProtAcc(), uniProtId() ], - # These two sound great but are extremely sparse as of v2.0... - # UniSnp => [ uniSnpIds() ], - # - # Tgp => [ tgpAf(), tgpAfrAf(), tgpEurAf(), tgpAmrAf(), tgpAsnAf() ], ); sub checkHeader($) { # In order to catch any future column swizzles, require that the input header # columns exactly match our expectation. my ($headerLine) = @_; my @expectedHeaderColumns = map { my $label = $_->[0]; } @columnNames; $headerLine =~ s/^#// || die "Expected header line to begin with #"; chomp $headerLine; $headerLine =~ s/\r$//; # why doesn't chomp do this? my @headerColumns = split("\t", $headerLine); my $columnCount = @headerColumns; my $expectedCount = @expectedHeaderColumns; if ($columnCount != $expectedCount) { die "Different number of columns! Expected $expectedCount columns, " . "got $columnCount"; }; for (my $i = 0; $i < $columnCount; $i++) { if ($headerColumns[$i] ne $expectedHeaderColumns[$i]) { die "Header column labels don't match expectations -- you must edit this script " . "to accomodate input format changes, starting with column[$i] = " . "'$headerColumns[$i]' (expected '$expectedHeaderColumns[$i]')"; } } } sub createOutputFile($) { # Return a file handle to a dbNsfp{Subset}.bed file opened for writing my ($base) = (@_); my $file = "dbNsfp" . $base . ".bed"; my $pipe = "| uniq >$file"; if ($subsetColumns{$base}->[0] ne refAl()) { # Subsets that are not related to specific {ref, alt} often span many bases, # and dbNSFP skips 4-fold degenerate bases (wobble bases where any nucleotide # codes for the same amino acid), so merge ranges within 1bp of each other # if all columns after bed3 are the same for each range: $pipe = "| mergeSortedBed3Plus.pl -maxGap=1 - >$file"; } open(my $handle, $pipe) || die "Can't open pipe '$pipe' for writing: $!\n"; $handle; } # Make a hash mapping symbolic names to column indices: my $n = 0; my %symbolIx = map { my $symbol = $_->[1]; $symbol => $n++; } @columnNames; sub getIxList($) { my ($subset) = @_; my @cols = @{$subsetColumns{$subset}}; map { if (!exists $symbolIx{$_}) { die "Unrecognized column name '$_'"; } } @cols; my @ixList = map { $symbolIx{$_} } @cols; return \@ixList; } my ($refAlIx, $altAlIx) = ($symbolIx{refAl()}, $symbolIx{altAl()}); sub skipToAlt($) { # When a subset list starts with refAl and altAl, which are never placeholders, # we want to look for non-placeholder values *after* those. my ($ixList) = @_; if ($ixList->[0] != $refAlIx) { return $ixList; } else { my $i = 1; for (; $i < @$ixList; $i++) { last if ($ixList->[$i] == $altAlIx); } my @ixListFromAlt = (); for (; $i < @$ixList; $i++) { push @ixListFromAlt, $ixList->[$i]; } return \@ixListFromAlt; } } sub getVals($$) { # Given a reference to array of words, and ref to list of indices, # return a list of corresponding elements of $words. my ($words, $ixList) = @_; my @vals = map { $words->[$_] } @$ixList; return @vals; } sub haveNonPlaceholder($$) { # Look for anything starting at $vals->[$iStart] that is not the placeholder ".". my ($iStart, $vals) = @_; my $i = 0; return grep { $i++ >= $iStart && $_ ne "." } @$vals; } sub useCommasForLists($) { # dbNsfp includes some text values that contain commas, so it uses ';' as its # list separator. Html-encode commas, then replace ; with , to separate lists. my ($listRef) = @_; for (my $i = 0; $i < @$listRef; $i++) { $listRef->[$i] =~ s/,/%2C/g; $listRef->[$i] =~ s/;/,/g; } } my ($ensTxIdIx, $aaPosIx) = ($symbolIx{ensTxId()}, $symbolIx{aaPos()}); sub collateEnsTx($$$) { # dbNSFP rows are *almost* unique by {chr, start, altAl} -- but actually # it's by {chr, start, altAl, ensTxId}, and the ensTxIds appear on # alternating lines. So, collate rows by ensTxId before collapsing them. my ($chr, $start, $rows) = @_; my %ensTxRows = (); foreach my $r (@$rows) { my $ensTxId = $r->[$ensTxIdIx]; push @{$ensTxRows{$ensTxId}}, $r; } foreach my $ensTxId (sort keys %ensTxRows) { my @rowsForTx = @{$ensTxRows{$ensTxId}}; my $rowCount = @rowsForTx; if ($rowCount > 3) { # There are some duplicate rows which have identical content from # 'cut -f 1-4,7-13,15,18-20,23-', so differ in 'cut -f 5,6,14,16,17,21,22' # (aaRef, aaAlt, refCodon, codonPos, foldDegen, aaPos, siftScore). # These seem to be explained by less informative rows with "-1" for aaPos # that precede rows that have nonnegative aaPos and other coding info. print "FYI: >3 rows ($rowCount) for {$chr, $start, $ensTxId}; " . "removing less-informative duplicates.\n"; my @newRows = (); for (my $i = 0; $i < $rowCount; $i++) { if ($rowsForTx[$i]->[$aaPosIx] =~ /^-1(;-1)*$/ && $i < $rowCount-1 && $rowsForTx[$i+1]->[$aaPosIx] !~ /^-1(;-1)*$/) { next; } + if ($rowsForTx[$i]->[$altAlIx] eq $rowsForTx[$i]->[$refAlIx]) { + # Sometimes there's a row with altAl the same as refAl and mostly empty columns; + # drop those rows. + print STDERR "refAl == altAl:\n" . join("\t", @{$rowsForTx[$i]}) . "\n"; + } else { push @newRows, $rowsForTx[$i]; } + } $rowCount = @newRows; if ($rowCount > 3) { - die "Failed to resolve rows to 3 {$chr, $start, altAl, $ensTxId}."; + print STDERR "Failed to resolve rows to 3 {$chr, $start, altAl, $ensTxId}.\n"; + foreach my $r (@newRows) { + print STDERR join("\t", @{$r}) . "\n"; + } + die; } $ensTxRows{$ensTxId} = \@newRows; } } return %ensTxRows; } # Parallel arrays of our subset names, associated file handles and column indices my @subsets = (keys %subsetColumns); my $subsetCount = @subsets; my @handles = map { createOutputFile($_) } @subsets; my @ixLists = map { getIxList($_) } @subsets; my @ixListsFromAlt = map { skipToAlt($_) } @ixLists; my @hasAaPos = map { 0 + grep { $_ == $aaPosIx } @{$_} } @ixLists; my @iAfterAlt = (); for (my $i = 0; $i < $subsetCount; $i++) { $iAfterAlt[$i] = 1 + @{$ixLists[$i]} - @{$ixListsFromAlt[$i]}; } sub printIfNonEmpty($$$) { # If any of the special columns (after refAl and altAl) is not placeholder ".", print one line. my ($chr, $start, $rows) = @_; my @bed3Vals = ($chr, $start, $start+1); my %ensTxRows = collateEnsTx($chr, $start, $rows); - my $ensTxIdCount = scalar(keys %ensTxRows); foreach my $ensTxId (sort keys %ensTxRows) { my @rowsForTx = @{$ensTxRows{$ensTxId}}; - # Most locations have only one Ensembl transcript set; to cut file size, - # abbreviate ensTxId to '.' when there's only one. - if ($ensTxIdCount == 1) { - $rowsForTx[0]->[$ensTxIdIx] = '.'; - } # Select columns and print output for each subset: for (my $subset = 0; $subset < $subsetCount; $subset++) { my $fOut = $handles[$subset]; my $ixList = $ixLists[$subset]; if ($ixList->[0] ne $refAlIx) { # If this subset is unrelated to refAl and altAl, just print out subset columns # (if nonempty) as independent lines and let mergeSortedBed3Plus.pl compress them: foreach my $r (@rowsForTx) { my @specialVals = getVals($r, $ixList); if (haveNonPlaceholder(0, \@specialVals)) { useCommasForLists(\@specialVals); print $fOut join("\t", @bed3Vals, @specialVals) . "\n"; } } } else { # Since we generally have different content per alt allele, print one line with # combined content from all rows' alt alleles and their associated values. # Print all subset columns starting with refAl (ixList) for first row only. my @specialVals = getVals($rowsForTx[0], $ixList); # refAl is *almost* always uppercase: force the few stragglers to uppercase: $specialVals[0] = uc $specialVals[0]; # Watch out for missing ref-associated vals in the first row that are defined in # subsequent rows -- find and replace: if (defined $rowsForTx[1] && $hasAaPos[$subset]) { my $nextRow = $rowsForTx[1]; my @extraSpecialVals = getVals($nextRow, $ixList); for (my $i = 0; $i < @$nextRow; $i++) { last if ($ixList->[$i] == $altAlIx); if (($specialVals[$i] eq '.' || $specialVals[$i] eq '-1') && $extraSpecialVals[$i] ne '.' && $extraSpecialVals[$i] ne '-1') { $specialVals[$i] = $extraSpecialVals[$i]; } } } my $nonEmpty = 0; my $ixListFromAlt = $ixListsFromAlt[$subset]; # Skip everything up through altAl when looking for non-placeholders: $nonEmpty = 1 if (haveNonPlaceholder($iAfterAlt[$subset], \@specialVals)); # Print only altAl and beyond for subsequent rows. If there are fewer than 3 # altAl's / rows, fill in with placeholders to get correct column count. for (my $i = 1; $i < 3; $i++) { if (defined $rowsForTx[$i]) { my @subsetVals = getVals($rowsForTx[$i], $ixListFromAlt); push @specialVals, @subsetVals; $nonEmpty = 1 if (haveNonPlaceholder(1, \@subsetVals)); } else { push @specialVals, map { "." } @$ixListFromAlt; } } if ($nonEmpty){ useCommasForLists(\@specialVals); print $fOut join("\t", @bed3Vals, @specialVals) . "\n"; } } } } } ################################## MAIN ################################# my $expectedCols = @columnNames; -my ($chromIx, $hg19StartIx) = ($symbolIx{chrom()}, $symbolIx{hg19Start()}); +my ($chromIx, $hg38PosIx) = ($symbolIx{chrom()}, $symbolIx{hg38Pos()}); my ($prevChr, $prevStart); my @prevRows = (); while (<>) { if (/^#/) { checkHeader($_); next; } chomp; s/\r$//; my @w = split("\t"); if (@w != $expectedCols) { die "Wrong number of columns (expected $expectedCols, got " . scalar(@w); } - # Positions are all given as {chr, start}; end is always start+1. - my ($chr, $start) = getVals(\@w, [ $chromIx, $hg19StartIx ]); + # Positions are all given as {chr, start}; size is always 1 base. + my ($chr, $start) = getVals(\@w, [ $chromIx, $hg38PosIx ]); $chr = "chr" . $chr unless ($chr =~ /^chr/); $start--; if (defined $prevChr && ($chr ne $prevChr || $start != $prevStart)) { if ($chr eq $prevChr && $start < $prevStart) { die "Input is not sorted: $chr, $start < $prevStart"; } printIfNonEmpty($prevChr, $prevStart, \@prevRows); @prevRows = (); } ($prevChr, $prevStart) = ($chr, $start); push @prevRows, \@w; } if (defined $prevChr) { printIfNonEmpty($prevChr, $prevStart, \@prevRows); }