src/hg/makeDb/doc/hg18.txt 1.406
1.406 2010/03/25 21:35:39 angie
Some really old changes: snp130 func annotation fixes, hapmap and 1000 genomes experimental work (some interrupted).
Index: src/hg/makeDb/doc/hg18.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg18.txt,v
retrieving revision 1.405
retrieving revision 1.406
diff -b -B -U 4 -r1.405 -r1.406
--- src/hg/makeDb/doc/hg18.txt 19 Mar 2010 00:23:03 -0000 1.405
+++ src/hg/makeDb/doc/hg18.txt 25 Mar 2010 21:35:39 -0000 1.406
@@ -3323,8 +3323,9 @@
#LOOP
/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/hg18/11.ooc {check out line+ out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
+ # << emacs
gensub2 contigs.lst bacends.lst template jobList
foreach f (`cat bacends.lst`)
set d = $f:r:t
echo $d
@@ -21630,9 +21631,8 @@
# snp129Seq) to .2008-08-08 00:00:00 to avoid unwarranted joinerCheck
# time discrepancy errors. (8/8/08, brooke)
# Set up build directory
- ssh kkstore06
mkdir -p /cluster/store3/dbSNP129/{human,shared}
ln -s /cluster/store3/dbSNP129 /cluster/data/dbSNP/129
# Get field encodings -- if there are changes or additions to the
@@ -22367,8 +22367,10 @@
############################################################################
# dbSNP BUILD 130 (UPDATED 8/18/09 angie)
# Originally done 5/22/09.
+# Functional annotations restricted by mapping position 7/7.
+# dbSNP corrections applied to func field 8/18.
# Set up build directory
mkdir -p /hive/data/outside/dbSNP/130/{human,shared}
# Get field encodings -- if there are changes or additions to the
@@ -22513,33 +22515,46 @@
#################### EXTRACT INFO FROM NCBI TABLES ####################
# Glom each SNP's function codes together and load up a new hg18Snp130 table.
# Also extract NCBI's annotations of coding SNPs' effects on translation.
- # ContigLocusId includes contig_acc and asn_{from,to} but those are not comprehensive!
- # If a SNP has been mapped to multiple contigs, one is randomly selected, and if
- # it is not a reference contig, we miss out if we restrict by contig. We may end
- # up getting a few spurious functional annotations from mappings to other assemblies
- # but them's the breaks.
+ # We extract ContigLocusId info only for reference assembly mapping.
+ # Some SNP's functional annotations are for an alternate assembly, so we will
+ # have no NCBI functional annotations to display for those (but our own are
+ # available).
cd /hive/data/outside/dbSNP/130/human
- hgsql hg18snp130 -NBe 'select snp_id, mrna_acc, fxn_class, \
- reading_frame, allele, residue, codon from ContigLocusId' \
+ hgsql hg18snp130 -NBe 'select snp_id, ci.contig_acc, asn_from, asn_to, mrna_acc, \
+ fxn_class, reading_frame, allele, residue, codon, cli.ctg_id \
+ from ContigLocusId as cli, ContigInfo as ci \
+ where cli.ctg_id = ci.ctg_id and \
+ group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2")' \
> ncbiFuncAnnotations.txt
- cut -f 1,3 ncbiFuncAnnotations.txt \
- | sort -u -k1,1 -k2,2n \
- | awk '{if (prevId == $1) { prevFunc = prevFunc $2 ","; } \
- else { if (prevId) {print prevId "\t" prevFunc;} \
- prevFunc = $2 ","; }} \
- {prevId = $1;} \
- END {print prevId "\t" prevFunc;}' \
+ # Ignore function code 8 (cds-reference, just means that some allele matches reference)
+ # and glom functions for each SNP id:
+ cut -f 1-4,6,11 ncbiFuncAnnotations.txt \
+ | sort -u -k1n,1n -k6n,6n -k3n,3n -k5n,5n \
+ | perl -we 'while (<>) { chomp; \
+ ($id, undef, $s, $e, $f, $c) = split; \
+ if (defined $prevId && $id == $prevId && $c == $prevC && $s == $prevS) { \
+ $prevFunc .= "$f," unless ($f == 8); \
+ } else { \
+ print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n" if (defined $prevId); \
+ $prevFunc = ($f == 8) ? "" : "$f,"; \
+ } \
+ ($prevId, $prevC, $prevS, $prevE) = ($id, $c, $s, $e); \
+ } \
+ print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n"' \
> ucscFunc.txt
-
wc -l ucscFunc.txt
-#7344853 ucscFunc.txt
+#7035685 ucscFunc.txt
cat > ucscFunc.sql <<EOF
CREATE TABLE ucscFunc (
snp_id int NOT NULL ,
+ ctg_id int(10) NOT NULL ,
+ asn_from int(10) NOT NULL ,
+ asn_to int(10) NOT NULL ,
fxn_class varchar(255) NOT NULL ,
- INDEX snp_id (snp_id)
+ INDEX snp_id (snp_id),
+ INDEX ctg_id (ctg_id)
);
EOF
hgLoadSqlTab hg18snp130 ucscFunc{,.sql,.txt}
@@ -22568,9 +22583,9 @@
# Add indices to tables for a big join (5 or 6 minutes):
hgsql hg18snp130 -e \
'alter table ContigLoc add index (ctg_id); \
alter table ContigInfo add index (ctg_id); \
- alter table ContigLoc add index (snp_id); \
+ alter table ContigLocusId add index (snp_id); \
alter table SNP add index (snp_id); \
alter table MapInfo add index (snp_id);'
# Big leftie join to bring together all of the columns that we want in snp130,
@@ -22586,18 +22601,22 @@
ci.group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2")) \
LEFT JOIN MapInfo as mi ON mi.snp_id = cl.snp_id and mi.assembly = ci.group_label) \
LEFT JOIN SNP as s ON s.snp_id = cl.snp_id) \
LEFT JOIN ucscGnl as ug ON ug.snp_id = cl.snp_id) \
- LEFT JOIN ucscFunc as uf ON uf.snp_id = cl.snp_id;' \
+ LEFT JOIN ucscFunc as uf ON uf.snp_id = cl.snp_id and uf.ctg_id = cl.ctg_id \
+ and uf.asn_from = cl.asn_from;' \
> ucscNcbiSnp.ctg.bed
#on a not-so busy hgwdev: 80.735u 36.958s 8:54.76 22.0% 0+0k 0+0io 0pf+0w
-#on a very busy hgwdev: 78.753u 41.304s 30:19.77 6.5% 0+0k 0+0io 0pf+0w
+#on a busy hgwdev: 78.753u 41.304s 30:19.77 6.5% 0+0k 0+0io 0pf+0w
+#on hgwdev with giant chains loading in parallel:
+# 78.213u 33.826s 58:16.41 3.2% 0+0k 0+0io 0pf+0w
wc -l ucscNcbiSnp.ctg.bed
#19189750 ucscNcbiSnp.ctg.bed
+
liftUp ucscNcbiSnp.bed \
- /cluster/data/hg18/jkStuff/liftContigs.lft warn \
+ /hive/data/genomes/hg18/jkStuff/liftContigs.lft warn \
ucscNcbiSnp.ctg.bed
-#116.027u 7.078s 2:27.93 83.2% 0+0k 0+0io 0pf+0w
+#119.644u 8.992s 2:36.67 82.1% 0+0k 0+0io 3pf+0w
# Drum roll please... translate NCBI's encoding into UCSC's, and
# perform a bunch of checks. This is where developer involvement
# is most likely as NCBI extends the encodings used in dbSNP.
@@ -22612,9 +22631,9 @@
#count of snps with weight 2 = 389501
#count of snps with weight 3 = 1189989
#count of snps with weight 10 = 281391
#Found no errors.
-#143.111u 14.313s 3:15.18 80.6% 0+0k 0+0io 0pf+0w
+#163.878u 10.302s 3:33.84 81.4% 0+0k 0+0io 0pf+0w
wc -l snp*
# 18833531 snp130.bed
# 22 snp130.sql
# 0 snp130Errors.bed
@@ -22645,24 +22664,20 @@
cd /hive/data/outside/dbSNP/130/human
time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
hg18 snp130 -sqlTable=snp130.sql snp130.bed
#Loaded 18833531 elements of size 17
-#107.246u 11.546s 10:49.23 18.2% 0+0k 0+0io 0pf+0w
+#114.088u 12.924s 12:54.18 16.4% 0+0k 0+0io 0pf+0w
time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
hg18 snp130Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \
snp130Exceptions.bed
#15.255u 1.257s 1:11.11 23.2% 0+0k 0+0io 0pf+0w
- sed -e 's/snp125/snp130/' ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
- > snp130ExceptionDesc.sql
- hgLoadSqlTab hg18 snp130ExceptionDesc snp130ExceptionDesc.sql \
+ hgLoadSqlTab hg18 snp130ExceptionDesc ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
snp130ExceptionDesc.tab
# Load up sequences.
- sed -e 's/snpSeq/snp130Seq/' ~/kent/src/hg/lib/snpSeq.sql \
- > snp130Seq.sql
mkdir -p /gbdb/hg18/snp
ln -s /hive/data/outside/dbSNP/130/human/snp130.fa /gbdb/hg18/snp/snp130.fa
- time nice hgLoadSqlTab hg18 snp130Seq snp130Seq.sql snp130Seq.tab
-#0.001u 0.004s 3:41.13 0.0% 0+0k 0+0io 0pf+0w
+ time nice hgLoadSqlTab hg18 snp130Seq ~/kent/src/hg/lib/snpSeq.sql snp130Seq.tab
+#0.005u 0.002s 6:02.78 0.0% 0+0k 0+0io 0pf+0w
# Put in a link where one would expect to find the track build dir...
ln -s /hive/data/outside/dbSNP/130/human /cluster/data/hg18/bed/snp130
@@ -22717,9 +22732,11 @@
# those. E.g. rs437678.
#######################################################################
-# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP130 (DONE 5/15/09 angie)
+# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP130 (DONE 5/26/09 angie)
+# Originally done 5/15; reloaded 5/26 after making sure no coords had changed,
+# reloaded 7/7/09 to bump timestamp
mkdir /hive/data/genomes/hg18/bed/snp130Ortho
cd /hive/data/genomes/hg18/bed/snp130Ortho
   # Following Heather's lead in snp126orthos, filter SNPs to keep
@@ -22876,8 +22893,198 @@
nice gzip snp130Simple.bed snp130ExcludeIds.txt snp130ForLiftOver.bed
rm -r run*/split tmp.txt *.orthoGlom.txt
+#######################################################################
+# DBSNP CODING ANNOTATIONS (DONE 7/7/09 angie)
+# originally done 6/2/09 - redone w/snp130, using mapping locations of dbSNP's func. annos
+ cd /hive/data/outside/dbSNP/130/human
+ awk '$9 != NULL {print;}' ncbiFuncAnnotations.txt \
+ | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n | uniq > ncbiCodingAnnotations.txt
+ wc -l ncbiCodingAnnotations.txt
+#576726 ncbiCodingAnnotations.txt
+ # How many & what kinds of function types?
+ cut -f 6 ncbiCodingAnnotations.txt \
+ | sort -n | uniq -c
+# 107963 3 (coding-synon)
+# 276197 8 (cds-reference)
+# 4664 41 (nonsense)
+# 146908 42 (missense)
+# 40994 44 (frameshift)
+ # Does everybody have a reference annotation?
+ awk '$6 == 8 {print $1 "\t" $5;}' ncbiCodingAnnotations.txt | uniq > tmp1
+ awk '$6 != 8 {print $1 "\t" $5;}' ncbiCodingAnnotations.txt | uniq > tmp2
+ wc -l tmp1 tmp2
+# 276113 tmp1
+# 279647 tmp2
+ # Doh! not everybody. So hgTracks will sometimes have to process ref itself...
+ # Gather up multiple annotation lines into one line per {snp, gene, frame}:
+ perl -e 'while (<>) { chomp; \
+ my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \
+ if (defined $lastRs && \
+ ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \
+ $lastTx ne $txId || $lastFrm ne $frm)) { \
+ if (defined $refRow) { \
+ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \
+ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \
+ } \
+ print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
+ "$count\t$fxns\t$nts\t$codons\t$aas\n"; \
+ $refRow = undef; @rows = (); ($count, $fxns, $nts, $codons, $aas) = (); \
+ } \
+ ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \
+ ($rsId, $ctg, $s, $e, $txId, $frm); \
+ $count++; \
+ if ($fxn == 8) { \
+ $refRow = [$fxn, $nt, $aa, $codon]; \
+ } else { \
+ $fxns .= "$fxn,"; $nts .= "$nt,"; $aas .= "$aa,"; $codons .= "$codon,"; \
+ } \
+ } \
+ if (defined $refRow) { \
+ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \
+ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \
+ } \
+ print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
+ "$count\t$fxns\t$nts\t$codons\t$aas\n";' \
+ ncbiCodingAnnotations.txt \
+ > snp130CodingDbSnp.ctg.txt
+ liftUp snp130CodingDbSnp.bed \
+ /hive/data/genomes/hg18/jkStuff/liftContigs.lft warn snp130CodingDbSnp.ctg.txt
+ hgLoadBed hg18 snp130CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
+ -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
+ snp130CodingDbSnp.bed
+#Loaded 279815 elements of size 11
+
+
+#######################################################################
+# SNPMASKED SEQUENCE FOR SNP130 (DONE 7/10/09 angie)
+ mkdir /hive/data/genomes/hg18/snp130Mask
+ cd /hive/data/genomes/hg18/snp130Mask
+
+ # Identify rsIds with various problems -- we will exclude those.
+ # MultipleAlignments is kinda broad because anything that maps on
+ # both chrN and chrN_foo_hap1 will be excluded... similarly, extra
+ # matches on chrN_random might disqualify good matches on chrN.
+ # Well, erring on the side of caution is good.
+ awk '$5 ~ /^MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved$/ {print $4;}' \
+ /hive/data/outside/dbSNP/130/human/snp130Exceptions.bed \
+ | sort -u \
+ > snp130ExcludeRsIds.txt
+ time grep -vFwf snp130ExcludeRsIds.txt \
+ /hive/data/outside/dbSNP/130/human/snp130.bed \
+ > snp130Cleaned.bed
+#185.202u 4.847s 3:22.55 93.8% 0+0k 0+0io 0pf+0w
+
+ # Substitutions:
+ mkdir substitutions
+ snpMaskSingle snp130Cleaned.bed /hive/data/genomes/hg18/hg18.2bit stdout \
+ | faSplit byname stdin substitutions/
+#Masked 12142171 snps in 12141860 out of 3091592211 genomic bases
+#/hive/data/genomes/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3091592211 (difference is 16085062)
+#94.376u 16.038s 3:10.37 57.9% 0+0k 0+0io 0pf+0w
+ # Check that 16085062 is the total #bases in sequences with nothing in snp130Cleaned:
+ cut -f 1 snp130Cleaned.bed | uniq > /tmp/1
+ grep -vwf /tmp/1 ../chrom.sizes
+ grep -vwf /tmp/1 ../chrom.sizes \
+ | awk 'BEGIN {TOTAL = 0 ; } {TOTAL += $2 ; } END {printf "%d\n", TOTAL ; }'
+#16085062
+ # 338 warnings about differing observed strings at same base position --
+ # saved as diffObserved.txt.
+#TODO: send list to dbSNP.
+ # Make sure that sizes are identical, first diffs are normal -> IUPAC,
+ # and first diffs' case is preserved:
+ foreach f (substitutions/chr*.fa)
+ faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ"
+ end
+#chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 491 (y != c)
+#chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 55877 (s != c)
+#...
+#(output OK -- ambiguous bases replacing [agct] at SNP positions)
+ foreach f (substitutions/chr*.fa)
+ echo $f:t:r
+ mv $f $f:r.subst.fa
+ gzip $f:r.subst.fa
+ end
+
+ # Insertions:
+ mkdir insertions
+ snpMaskAddInsertions snp130Cleaned.bed /hive/data/genomes/hg18/hg18.2bit stdout \
+ | faSplit byname stdin insertions/
+#Added 2464798 snps totaling 5891837 bases to 3085167749 genomic bases
+#/hive/data/genomes/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085167749 (difference is 22509524)
+#99.269u 17.928s 3:31.80 55.3% 0+0k 0+0io 1pf+0w
+ # Again, that just means that some chroms didn't have filtered SNPs.
+ # Make sure that all sizes have increased relative to original:
+ foreach f (insertions/chr*.fa)
+ faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \
+ |& perl -we '$_=<>; \
+ if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \
+ if ($1 > $2) {print "OK: ins size $1 > $2\n";} \
+ else {die "ERROR: ins size $1 <= $2\n";} \
+ } else {die $_;}'
+ end
+#OK: ins size 247711739 > 247249719
+#OK: ins size 135642480 > 135374737
+#...
+#(output OK -- new sizes > old)
+ foreach f (insertions/chr*.fa)
+ mv $f $f:r.ins.fa
+ gzip $f:r.ins.fa
+ end
+
+ # Deletions:
+ mkdir deletions
+ snpMaskCutDeletions snp130Cleaned.bed /hive/data/genomes/hg18/hg18.2bit stdout \
+ | faSplit byname stdin deletions/
+#Cut 1514798 snps totaling 3554896 bases from 3086962619 genomic bases
+#/hive/data/genomes/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3086962619 (difference is 20714654)
+#103.312u 31.094s 3:56.12 56.9% 0+0k 0+0io 1pf+0w
+ # Again, that just means that some chroms didn't have filtered SNPs.
+ # Make sure that all sizes have decreased relative to original:
+ foreach f (deletions/chr*.fa)
+ faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \
+ |& perl -we '$_=<>; \
+ if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \
+ if ($1 < $2) {print "OK: del size $1 < $2\n";} \
+ else {die "ERROR: del size $1 >= $2\n";} \
+ } else {die $_;}'
+ end
+#OK: del size 246960459 < 247249719
+#OK: del size 135214654 < 135374737
+#...
+#(output OK -- del sizes < old)
+ foreach f (deletions/chr*.fa)
+ mv $f $f:r.del.fa
+ gzip $f:r.del.fa
+ end
+
+ # Clean up and prepare for download:
+ gzip snp130Cleaned.bed
+ foreach d (substitutions insertions deletions)
+ pushd $d
+ md5sum *.gz > md5sum.txt
+ cp ../../snp129Mask/$d/README.txt .
+ popd
+ end
+ # Edit the README.txt in each subdir.
+
+ # Create download links on hgwdev.
+ # NOTE: Currently we offer only the substitutions.
+ # If we get any user requests, then maybe we can put the insertions
+ # and deletions out there.
+ mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp130Mask
+ ln -s /hive/data/genomes/hg18/snp130Mask/substitutions/* \
+ /usr/local/apache/htdocs/goldenPath/hg18/snp130Mask/
+## If there is user demand for ins & del, then start over with an empty
+## goldenPath/snp130Mask and do this:
+## foreach type (substitutions insertions deletions)
+## mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp130Mask/$type
+## ln -s /hive/data/genomes/hg18/snp130Mask/$type/* \
+## /usr/local/apache/htdocs/goldenPath/hg18/snp130Mask/$type/
+## end
+
+
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
@@ -28104,8 +28311,32 @@
-tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/hapmapPhaseIIISummary.sql
#Loaded 4166007 elements of size 18
#33.401u 3.275s 1:46.95 34.2% 0+0k 0+0io 0pf+0w
+#############################################################################
+# DOWNLOAD HAPMAP PHASED GENOTYPES (PHASE III) (DONE 2/23/09 angie)
+ mkdir -p /hive/data/outside/hapmap/phasing/2009-02_phaseIII/HapMap3_r2
+ cd /hive/data/outside/hapmap/phasing/2009-02_phaseIII/HapMap3_r2
+ wget --timestamping \
+ ftp://ftp.hapmap.org/pub/hapmap/public/phasing/2009-02_phaseIII/HapMap3_r2/\*
+ foreach pop (ASW CEU CHD GIH JPT+CHB LWK MEX MKK TSI YRI)
+ foreach type (DUOS TRIOS UNRELATED)
+ mkdir -p $pop/$type
+ pushd $pop/$type
+ wget --timestamping \
+ ftp://ftp.hapmap.org/pub/hapmap/public/phasing/2009-02_phaseIII/HapMap3_r2/$pop/$type/\*
+ popd
+ end
+ end
+ # Looks like phased genotypes are given only for the populations with
+ # family structure: ASW, CEU, MEX, MKK, and YRI.
+ # Next: use these data to make LD tracks.
+
+
+#############################################################################
+# HAPMAP LD COMPUTED ON PHASED & UNPHASED GENOTYPES (TODO angie)
+
+
#############################################################################
# GERP Conservation scoring and elements for Ensembl 31-way alignments
# From Javier Guerroro
@@ -28130,8 +28361,9 @@
echo $f
bzcat $f | tail -n +2 | wigEncode stdin temp.wig temp.wib
end
'EOF'
+ # << emacs
bzcat lab/*.wig.bz2 | tail -n +2 | \
wigEncode stdin ensembl31wayGerpScores.wig ensembl31wayGerpScores.wib
@@ -28170,8 +28403,82 @@
#checked: 62958 failed: 0
hgLoadGenePred -genePredExt hg18 vegaGene not.pseudo.gp
hgLoadGenePred -genePredExt hg18 vegaPseudoGene pseudo.gp
+
+#############################################################################
+# COVERAGE FOR 1000 GENOMES HIGH-COV INDIVIDS (IN PROGRESS 6/10/09 angie)
+#TODO: try again now that wigToBigWig is more mem-efficient
+# also, new alignments have probably become available since then.
+ # wigBedToStep ran out of memory on hgwdev (w/limit of 32G)... roll own:
+ cd /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes
+ foreach s (NA12878 NA12891 NA12892 NA19238 NA19239 NA19240)
+ pushd data/$s/alignment
+ foreach p (454 SLX)
+ echo "==== $s $p ===="
+ ls -1 $s.chrom*.$p.SRP000032.2009_04.bam \
+ | grep -v chromMT \
+ | xargs -L 1 samtools pileup \
+ | perl -pe '($c, $start, undef, $depth) = split; \
+ if ($c ne $lastC || $start != $lastStart+1) { \
+ print "fixedStep chrom=chr$c start=$start step=1 span=1\n"; \
+ } \
+ $_ = "$depth\n"; \
+ ($lastC, $lastStart) = ($c, $start);' \
+ | gzip -c > cov${s}By{$p}.fixedStep.gz
+ echo ""
+ end
+ popd
+ end
+
+#TODO
+ # Killing memory -- run separately:
+ | wigToBigWig -clip stdin /hive/data/genomes/hg18/chrom.sizes cov${s}By$p.bw
+
+#[bam_pileup] fail to read the header of NA12878.chromY.454.SRP000032.2009_04.bam: non-exisiting file or wrong format.
+ # NA12878.chromY.454.SRP000032.2009_04.bam is an empty file.
+
+ # Load tables
+ foreach bw (`find /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes \
+ -name cov\*.bw`)
+ ln -s $bw /gbdb/hg18/bbi/
+ hgsql hg18 -e "drop table if exists $bw:t:r; \
+ create table $bw:t:r (fileName varchar(255) not null); \
+ insert into $bw:t:r values ('/gbdb/hg18/bbi/$bw:t');"
+ end
+
+
+#############################################################################
+# 1000 GENOMES HIGH-COV INDIVIDS READ ALIGNMENTS (DONE 11/30/09 angie)
+ # one-off to test BAM as track type:
+ cd /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes
+ set testBam = NA12878/alignment/NA12878.chrom22.SRP000032.2009_02.bam
+ ln -s `pwd`/$testBam{,.bai} \
+ /gbdb/hg18/bbi/
+ hgsql hg18 -e "drop table if exists bamNA12878; \
+ create table bamNA12878 (fileName varchar(255) not null); \
+ insert into bamNA12878 values ('/gbdb/hg18/bbi/$testBam:t');"
+
+ # 9/14/09: update bamNA12878 to use new seqName column and try samtools'
+ # capability to fetch ftp sparsely:
+ hgsql hg18 -e "drop table if exists bamNA12878; \
+ create table bamNA12878 (fileName varchar(255) not null, \
+ seqName varchar(255) not null); \
+ insert into bamNA12878 values ('ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/data/NA12878/alignment/NA12878.chrom21.SLX.maq.SRP000032.2009_07.bam', '21'); \
+ insert into bamNA12878 values ('/gbdb/hg18/bbi/NA12878.chrom22.SLX.SRP000032.2009_04.bam', '22');"
+ # 11/30/09: Add more remote files:
+ foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 X Y)
+ hgsql hg18 -e "insert into bamNA12878 values ('ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/data/NA12878/alignment/NA12878.chrom$c.SLX.maq.SRP000032.2009_07.bam', '$c');"
+ end
+ # Add an all-remote NA12891 for testing composite track:
+ hgsql hg18 -e "create table bamNA12891 (fileName varchar(255) not null, \
+ seqName varchar(255) not null);"
+ foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
+ hgsql hg18 -e "insert into bamNA12891 values ('ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/data/NA12891/alignment/NA12891.chrom$c.SLX.maq.SRP000032.2009_07.bam', '$c');"
+ end
+
+
+
##############################################################################
# UCSC to Ensembl chr name mapping (DONE - 2009-05-08 - Hiram)
mkdir /hive/data/genomes/hg18/bed/ucscToEnsembl
cd /hive/data/genomes/hg18/bed/ucscToEnsembl
@@ -28813,8 +29120,52 @@
# Added code to src/hg/hgTracks/simpleTracks.c to register a track handler
# for vegaGeneComposite that is now used for this data. This used
# vegaGeneMethods to display the name2 field (gene) as the item label in
# the track.
+
+
+############################################################################
+# EPO ANCESTRAL REGIONS (DONE 8/5/09 angie)
+ # Use Aspera client to download 1000Genomes' Enredo-Pecan-Ortheus
+ # four-catarrhini ancestral-tree calls for genome regions, as well as
+ # their annotated fasta (requested by Sol Katzman):
+ cd /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/
+ set asperaInst = /opt/aspera/connect
+ set ascp = $asperaInst/bin/ascp
+ set aKey = $asperaInst/etc/asperaweb_id_dsa.putty
+ set aOpts = "-i $aKey -QTr -l300M -q"
+ set server = anonftp@ftp-private.ncbi.nlm.nih.gov
+ set aliDir = technical/reference/ancestral_alignments
+ mkdir -p $aliDir
+ cd $aliDir
+ foreach f (MD5SUM README README.ancestral_alignments summary.txt)
+ $ascp $aOpts $server\:/1000genomes/ftp/$aliDir/$f .
+ end
+ foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
+ $ascp $aOpts $server\:/1000genomes/ftp/$aliDir/human_ancestor$c.bed
+ $ascp $aOpts $server\:/1000genomes/ftp/$aliDir/human_ancestor$c.fa.bz2
+ end
+ chmod 444 *
+ # Check md5sums:
+ perl -wpe 'chomp; ($expSum, $f) = split; $actSum = `md5sum $f`; $actSum =~ s/ .*\n//; \
+ $_ = ""; \
+ if ($expSum ne $actSum) { warn "MISMATCH: $f exp=$expSum, actual=$actSum\n"; } \
+ else {print "$f OK\n";}' MD5SUM
+ # Shortcut requested by Sol:
+ ln -s `pwd` /hive/data/outside/ancestral.epo.hg18
+ # Load up the regions:
+ mkdir /hive/data/genomes/hg18/bed/epoAncestralRegions
+ cd /hive/data/genomes/hg18/bed/epoAncestralRegions
+ set aliPath = /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/$aliDir
+ sed -e 's/^/chr/' $aliPath/human_ancestor_*.bed > epoAncestralRegions.bed
+ hgLoadBed hg18 epoAncestralRegions epoAncestralRegions.bed -tab -allowStartEqualEnd
+#Loaded 10195 elements of size 4
+ featureBits hg18 epoAncestralRegions
+#2778857014 bases of 2881515245 (96.437%) in intersection
+ featureBits hg18 -countGaps gap epoAncestralRegions
+#6232933 bases of 3107677273 (0.201%) in intersection
+
+
# 2009-08-16 (hartera)
# ensGtp table definition is in ~/kent/src/hg/lib/ensGtp.sql
# There is an index on the protein field so it can not be NULL.
# If there is no protein, the gene name is given.
@@ -29292,8 +29643,18 @@
hgLoadBed hg18 gwasCatalog gwasCatalog.bed \
-tab -sqlTable=$HOME/kent/src/hg/lib/gwasCatalog.sql -notItemRgb -allowStartEqualEnd
#Loaded 2930 elements of size 22
+ # For David: find examples of risk alleles for which dbSNP observed
+ # alleles are complementary (A/T or C/G) -- how do we know what strand the
+ # risk allele is on?? -- asking corresp. author Teri Manolio.
+ hgsql hg18 -e 'select snp.name,gc.riskAllele,snp.strand,snp.refNcbi,snp.observed \
+ from gwasCatalog as gc, snp130 as snp \
+ where gc.riskAllele rlike "^rs[0-9]+-[ACGT]" and \
+ gc.name = snp.name and snp.observed in ("C/G", "A/T") \
+ order by gc.name limit 20;'
+
+
#############################################################################
# CRG MAPABILITY (2010-01-19 - 2010-01-28, hartera, DONE)
# Data was provided by Thomas Derrien (thomas.derrien.crg.es) and Paolo Ribeca
# from the Guigo lab at the Center for Genomic Regulation (CRG) in Barcelona.