src/hg/makeDb/doc/hg18.txt 1.384
1.384 2009/09/28 22:07:49 angie
Genome Variants: added Korean (SJK). Added base counts to 1000 Genomes SNPs.
Index: src/hg/makeDb/doc/hg18.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg18.txt,v
retrieving revision 1.383
retrieving revision 1.384
diff -b -B -U 4 -r1.383 -r1.384
--- src/hg/makeDb/doc/hg18.txt 20 Sep 2009 17:16:44 -0000 1.383
+++ src/hg/makeDb/doc/hg18.txt 28 Sep 2009 22:07:49 -0000 1.384
@@ -27472,16 +27472,105 @@
zcat $relDir/CEU.trio.dec.with.x.with.rs.calls.gz | trio2pg.pl 11 > NA12878.pgSnp
zcat $relDir/YRI.child.dec.intersect.calls.gz | trio2pg.pl 7 > NA19240.pgSnp
#gff for indels does not give nts, can't put in pgSnp format
- hgLoadBed hg18 pgNA12878 NA12878.pgSnp \
+ # 9/25/09: use samtools pileup to add base counts back in to those files.
+ cat > addCounts.pl <<'_EOF_'
+#!/usr/bin/env perl
+use warnings;
+use strict;
+
+my $sample = shift @ARGV;
+my $bamTemplate = shift @ARGV;
+if (! (defined $sample && defined $bamTemplate)) {
+ die "Usage: $0 sampleId bamTemplate [pgSnpFile]\n";
+}
+
+my $prevChr;
+my ($bamFile, $PLUP);
+while (<>) {
+ my ($chr, $s, $e, $alleles, $aCount, $baseCounts, $quals) = split("\t");
+ # New chrom? open pipe from samtools pileup:
+ if (!defined $prevChr || $prevChr ne $chr) {
+ close ($PLUP) if (defined $PLUP);
+ (my $c = $chr) =~ s/^chr//;
+ ($bamFile = $bamTemplate) =~ s/__S__/$sample/g; $bamFile =~ s/__C__/$c/;
+ if (-e $bamFile) {
+ my $pileupPipe = "samtools pileup $bamFile |";
+ warn "Opening '$pileupPipe'\n";
+ open($PLUP, $pileupPipe) || die "Can't open pipe '$pileupPipe': $!\n";
+ }
+ else {
+ warn "bamFile '$bamFile' does not exist";
+ $PLUP = undef;
+ }
+ }
+ # Fast-forward to pileup line corresponding to this pgSnp line:
+ if (defined $PLUP) {
+ my ($pc, $ps, undef, $depth, $bases);
+ do {
+ ($pc, $ps, undef, $depth, $bases) = split("\t", <$PLUP>);
+ if (defined $pc) {
+ die "Unexpected chrom '$pc' (!~ '$chr') in $bamFile" if ("chr$pc" ne $chr);
+ $ps--;
+ } else {
+ $ps = $s+1;
+ close($PLUP);
+ $PLUP = undef;
+ }
+ } while ($ps < $s);
+ if (defined $pc && $ps == $s) {
+ $bases =~ s/\^.//g; $bases =~ s/\$//g; # ignore begin/end-of-read markers
+ while ($bases =~ /[-+](\d+)\w+/) { # ignore indels
+ my $count = $1;
+ $bases =~ s/[-+]$count\w{$count}//;
+ }
+ die "length of $bases (" . length($bases) . ") != $depth" if (length($bases) != $depth);
+ $bases =~ tr/acgtn/ACGTN/;
+ my @origBaseCounts = split(',', $baseCounts);
+ $baseCounts = "";
+ foreach my $al (split("/", $alleles)) {
+ my $alCt = ($bases =~ s/$al//g) + shift @origBaseCounts;
+ $baseCounts .= ',' if ($baseCounts ne "");
+ $baseCounts .= $alCt;
+ }
+ #warn "Leftover bases: $bases ($alleles)" if (length($bases) > 10);
+ # Sometimes the allele is given as homozygous but there are many other
+ # copies of some other base detected...? And sometimes lots of "*"
+ # characters, not described on http://samtools.sourceforge.net/pileup.shtml
+ } # end if we found the pileup line for this pgSnp line
+ } # end if there is a $bamFile for this template and chrom.
+ print join("\t", $chr, $s, $e, $alleles, $aCount, $baseCounts, $quals);
+ $prevChr = $chr;
+}
+'_EOF_'
+ # << emacs
+ chmod a+x addCounts.pl
+ foreach f (NA*.pgSNP)
+ set s = $f:r
+ cat $f \
+ | ./addCounts.pl $s \
+ /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/__S__/alignment/__S__.chrom__C__.SLX.maq.SRP000032.2009_07.bam \
+ | ./addCounts.pl $s \
+ /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/__S__/alignment/__S__.chrom__C__.SOLID.corona.SRP000032.2009_08.bam \
+ | ./addCounts.pl $s \
+ /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/__S__/alignment/__S__.chrom__C__.454.ssaha.SRP000032.2009_07.bam \
+ > $f.counts
+ end
+ # NA12878 and NA19240 have all 3 platforms; just SLX.maq for NA12891, NA12892
+
+ hgLoadBed hg18 pgNA12878 NA12878.pgSnp.counts \
-sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
- hgLoadBed hg18 pgNA12891 NA12891.pgSnp \
+#Loaded 3049749 elements of size 7
+ hgLoadBed hg18 pgNA12891 NA12891.pgSnp.counts \
-sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
- hgLoadBed hg18 pgNA12892 NA12892.pgSnp \
+#Loaded 2968312 elements of size 7
+ hgLoadBed hg18 pgNA12892 NA12892.pgSnp.counts \
-sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
- hgLoadBed hg18 pgNA19240 NA19240.pgSnp \
+#Loaded 2972120 elements of size 7
+ hgLoadBed hg18 pgNA19240 NA19240.pgSnp.counts \
-sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
+#Loaded 3586490 elements of size 7
#############################################################################
# GENOME VARIANTS - (DONE 1/7/09 giardine, adapted by angie from pgSnp/README)
@@ -27530,8 +27619,22 @@
'http://yh.genomics.org.cn/do.downServlet?file=data/snps/yhsnp_add.gff'
#############################################################################
+# GENOME VARIANTS - KOREF (DONE 9/17/09 angie)
+ # Korean individual (Seong-Jin Kim) from Genome Research paper
+ cd /hive/data/genomes/hg18/bed/pgSnp/
+ # Download Belinda's file from PSU, use same table name (pgSjk) as on
+ # http://main.genome-browser.bx.psu.edu/ :
+ wget http://www.bx.psu.edu/~giardine/tests/tmp/koref.sub.pgSnp
+ hgLoadBed hg18 pgSjk koref.sub.pgSnp \
+ -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
+#Loaded 3439107 elements of size 7
+ # Downloading because I think it's the original data:
+ wget ftp://ftp.kobic.kr/pub/KOBIC-KoreanGenome/genetic_variations/KOREF-solexa-snp-X30_Q40d4D100.gff
+
+
+#############################################################################
# Initial import of LSSNP data for SNP and hgGene linking (2009-02-02 markd)
#############################################################################
# dump and load LSSNP databases from Johns Hopkins. This will be automated
# soon.