be4311c07e14feb728abc6425ee606ffaa611a58 markd Fri Jan 22 06:46:58 2021 -0800 merge with master diff --git src/hg/makeDb/outside/covid/makeCovidHgiGwas.pl src/hg/makeDb/outside/covid/makeCovidHgiGwas.pl index 6cad4b3..9569f0c 100644 --- src/hg/makeDb/outside/covid/makeCovidHgiGwas.pl +++ src/hg/makeDb/outside/covid/makeCovidHgiGwas.pl @@ -1,71 +1,96 @@ # Format files from COVID Host Genetics Initiative as BED 9+11 (covidHgiGwas.as) for lollipop display +# Usage: makeCovidHgiGwas.pl <#studies> [R4] [neg-threshold pos-threshold] + +# R4 indicates newer format of input files. effect | pvalue are alternative views +# thresholds are for effect size coloring when using pvalue view +# effect size view is hard-coded to pvalue 5 threshold (conventional threshold for GWAS) + use strict; use English; my $db = $ARGV[0]; my $allStudies = $ARGV[1]; my $file = $ARGV[2]; # support slightly changed format in newer releases my $version = "R3"; # initial track if ($#ARGV >= 3) { $version = $ARGV[3]; } + +# color light/bright threshold +my $colorBy = "pvalue"; +my $negThreshold = "5"; # if coloring by -log10 pvalue +my $posThreshold = "5"; # if coloring by -log10 pvalue +if ($#ARGV >=4) { + $colorBy = "effect"; + $negThreshold = $ARGV[4]; + $posThreshold = $ARGV[5]; +} + open(my $fh, $file) or die ("can't open file $file\n"); my $hdr = <$fh>; chomp($hdr); my @hdr = split('\t', $hdr); my $fields = $#hdr + 1; # skip over middle fields in R3 format (per-study values) # extended format in R3 hg19 which was lifted from hg38 analysis (4 fields w/ hg38) if ($db eq "hg38" or $version ne "R3") { $fields += 4; } # skip field 5 (long name) my $first = $fields-12; my $last = $fields-5; #die 'fields: ' . $fields . ' $first: ' . $first . ' $last: ' . $last . "\n"; my $scale = 3; my $sizeBins = 5; while (<$fh>) { + # parse and generate field values chomp; my ($chromNum, $pos, $ref, $alt) = split; - # drop 'chr23' entries (tills we identify) + # drop 'chr23' entries (till we identify) next if $chromNum eq "23"; my @data = split; my ($studies, $effectSize, $effectSE, $pval, $pvalHet, $samples, $alleleFreq, $snp) = @data[$first..$last]; my $chr = "chr" . $chromNum; my $start = $pos - 1; my $end = $pos; my $score = 0; + my $name = ($snp eq "NA") ? $chromNum . ":" . $pos : $snp; + my $studyWeight = int(($studies * $sizeBins) / $allStudies) + $scale; + + # assign color + my $color; my $blue = "0,0,255"; my $red = "255,0,0"; my $lightBlue = "160,160,255"; my $lightRed = "255,160,160"; my $logPval = -(log($pval)/log(10)); - my $intPval = int($logPval); - my $color; + my $absEffectSize = abs($effectSize); + my $colorThresholdVal = int($logPval); + if ($colorBy eq "effect") { + $colorThresholdVal = $absEffectSize; + } if ($effectSize > 0) { # positive (red) - $color = ($intPval >= 5) ? $red : $lightRed; + $color = ($colorThresholdVal >= $posThreshold) ? $red : $lightRed; } else { # negative (blue) - $color = ($intPval >= 5) ? $blue : $lightBlue; + $color = ($colorThresholdVal >= $negThreshold) ? $blue : $lightBlue; } - my $name = ($snp eq "NA") ? $chromNum . ":" . $pos : $snp; - my $studyWeight = int(($studies * $sizeBins) / $allStudies) + $scale; + # print $OFS = "\t"; print $chr, $start, $end, $name, $score, '.', $start, $end, $color; $OFS = ""; printf("\t%.3f\t%.3f", $effectSize, $effectSE); printf("\t%.2e\t%.3f\t%.2e", $pval, $logPval, $pvalHet); printf("\t%s\t%s\t%.3f\t%s\t%s\t%d\t%.3f", $ref, $alt, $alleleFreq, $samples, $studies, - $studyWeight, abs($effectSize)); + $studyWeight, $absEffectSize); print "\n"; } close ($fh);