src/hg/makeDb/doc/hg18.txt 1.350
1.350 2009/02/23 23:43:20 angie
Remove thin tails from DGV (SAB feedback). Added 5 tracks from Human Genome Diversity Project.
Index: src/hg/makeDb/doc/hg18.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg18.txt,v
retrieving revision 1.349
retrieving revision 1.350
diff -b -B -U 4 -r1.349 -r1.350
--- src/hg/makeDb/doc/hg18.txt 23 Feb 2009 22:50:04 -0000 1.349
+++ src/hg/makeDb/doc/hg18.txt 23 Feb 2009 23:43:20 -0000 1.350
@@ -21250,9 +21250,10 @@
hgLoadGenePred -genePredExt hg18 vegaPseudoGene pseudo.gp
############################################################################
-# DGV V6 (DATABASE OF GENOMIC VARIANTS) (DONE 11/12/08 angie)
+# DGV V6 (DATABASE OF GENOMIC VARIANTS) (DONE 2/23/09 angie)
+# DGV V6 with useless thin regions done 11/12/08
# DGV V5 done 7/16/08
# DGV V4 done 5/9/08
ssh hgwdev
mkdir /cluster/data/hg18/bed/dgv.v6
@@ -21262,33 +21263,28 @@
wget --timestamping \
http://projects.tcag.ca/variation/downloads/indel.hg18.v6.nov.2008.txt
# shuffle fields into bed8+
foreach f (*.v6.*.txt)
- tail +2 $f \
+ tail -n +2 $f \
| perl -wpe 'chomp; \
($id, $landmark, $chr, $start, $end, $varType, \
- $locChr, $locStart, $locEnd, $ref, $pmid, $method, \
+ undef, undef, undef, $ref, $pmid, $method, \
undef, undef, undef, undef, $sample) = split("\t"); \
- die "chr $chr != loc $locChr" if ($chr ne $locChr); \
$id =~ s/^Variation_//; \
- $chromStart = $start < $locStart ? $start : $locStart; \
- $chromEnd = $end > $locEnd ? $end : $locEnd; \
- $thickStart = $locStart > $start ? $locStart : $start; \
- $thickEnd = $locEnd < $end ? $locEnd : $end; \
- $chromStart--; $thickStart--; \
+ $start--; \
$landmark = "" if ($landmark =~ /^chr.*\d\.\.\d/); \
$rgb = "255,128,0"; \
$rgb = "200,0,0" if ($varType =~ /^Inv/); \
$rgb = "0,100,0" if ($varType eq "InDel"); \
- $_ = join("\t", $chr, $chromStart, $chromEnd, $id, 0, "+", \
- $thickStart, $thickEnd, $rgb, $landmark, $varType, \
+ $_ = join("\t", $chr, $start, $end, $id, 0, "+", \
+ $start, $end, $rgb, $landmark, $varType, \
$ref, $pmid, $method, $sample) . "\n";' \
> $f:r.bed
end
hgsql hg18 -e 'rename table dgv to dgvV5'
hgLoadBed hg18 dgv *.bed \
-onServer -sqlTable=$HOME/kent/src/hg/lib/dgv.sql -tab
-#Loaded 27799 elements of size 15
+#Loaded 31615 elements of size 15
############################################################################
# AGILENT CGH PROBES (AND MM8, RN4) (Done 2008-05-13, Andy)
@@ -26758,8 +26754,15 @@
featureBits hg18 rmskRM327 \!rmsk
#63060562 bases of 2881515245 (2.188%) in intersection
# hgTables: 49,804 rmskRM327 items (4,805,535 bases) have no overlap with rmsk
+ # Added download file 2/5/09:
+ cd /hive/data/genomes/hg18
+ zip -j bigZips/chromOut.RM3.2.7.zip */chr*.RM327.fa.out
+ ln -s /hive/data/genomes/hg18/bigZips/chromOut.RM3.2.7.zip \
+ /usr/local/apache/htdocs/goldenPath/hg18/bigZips/
+
+
#############################################################################
# Initial import of LSSNP data for SNP and hgGene linking (2009-02-02 markd)
#############################################################################
# dump and load LSSNP databases from Johns Hopkins. This will be automated
@@ -26775,5 +26778,145 @@
hgLsSnpPdbLoad load hg18 lsSnpPdb lsSnpPdb.tab
#############################################################################
+#############################################################################
+# HGDP GEOGRAPHIC SNP MAPS (DONE 2/5/09 angie)
+ # Project data downloaded and parsed in /hive/data/outside/hgdpGeo,
+ # see makeDb/doc/hgdpGeo.txt.
+ mkdir /hive/data/genomes/hg18/bed/hgdpGeo
+ cd /hive/data/genomes/hg18/bed/hgdpGeo
+ # Make an rsId-sorted snp coords file for joining with the hgdpGeo data.
+ grep -Fwf /hive/data/outside/hgdpGeo/rsIDs.lst \
+ ../snp129/snp129.bed \
+ | awk 'BEGIN{OFS="\t";} {print $4, $1, $2, $3;}' \
+ | sort > snp129Coords.txt
+ wc -l snp129Coords.txt
+#660280 snp129Coords.txt
+ # How many distinct SNPs are in there? (Compare to 657000 from HGDP):
+ cut -f 1 snp129Coords.txt |uniq | wc -l
+#656496
+
+ # Join files to make a track table:
+ join -e ERROR -t' ' -o 1.2,1.3,1.4,1.1,2.2,2.3,2.4 \
+ snp129Coords.txt /hive/data/outside/hgdpGeo/hgdpGeoCoordless.txt \
+ | sed -re 's/([AGTC])\*/\1/' \
+ | sort -k1,1 -k2n,2n \
+ > hgdpGeo.tab
+ wc -l hgdpGeo.tab
+#660280 hgdpGeo.tab
+ grep ERROR hgdpGeo.tab | wc -l
+#0
+
+ hgLoadBed hg18 hgdpGeo hgdpGeo.tab \
+ -sqlTable=$HOME/kent/src/hg/lib/hgdpGeo.sql
+#Loaded 660280 elements of size 7
+
+
+#############################################################################
+# HGDP HETEROZYGOSITY (DONE 2/12/09 angie)
+ mkdir /hive/data/genomes/hg18/bed/hgdpHzy
+ cd /hive/data/genomes/hg18/bed/hgdpHzy
+ foreach continent (african americas easia european mideast oceania sasia)
+ wget --timestamping http://hgdp.uchicago.edu/data/hzy/$continent.gff.gz
+ end
+ foreach continent (african americas easia european mideast oceania sasia)
+ set bedGraph = `echo $continent \
+ | sed -re 's/can$/ca/; s/pean$/pe/; s/asia/Asia/; \
+ s/(.*)/hgdpHzy\u\1.bedGraph/'`
+ echo $bedGraph
+ zcat $continent.gff.gz \
+ | awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \
+ > $bedGraph
+ end
+ # Using bedGraph, not wig, because there are only 640k datapoints and
+ # some are over the 10Mbase wiggle item size limit.
+ foreach f (*.bedGraph)
+ hgLoadBed hg18 $f:r $f -bedGraph=4
+ end
+ # All 7 have the same size:
+#Loaded 640676 elements of size 4
+
+
+#############################################################################
+# HGDP FST (DONE 2/12/09 angie)
+ mkdir /hive/data/genomes/hg18/bed/hgdpFst
+ cd /hive/data/genomes/hg18/bed/hgdpFst
+ wget --timestamping \
+ http://hgdp.uchicago.edu/data/FST/autosomal_illuminasnps7_pval.gff.gz
+ zcat autosomal_illuminasnps7_pval.gff.gz \
+ | awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \
+ > hgdpFst.bedGraph
+ hgLoadBed hg18 hgdpFst hgdpFst.bedGraph -bedGraph=4
+#Loaded 640676 elements of size 4
+
+#############################################################################
+# HGDP IHS (DONE 2/13/09 angie)
+ mkdir /hive/data/genomes/hg18/bed/hgdpIhs
+ cd /hive/data/genomes/hg18/bed/hgdpIhs
+ foreach continent (Bantu Americas E.Asia European MiddleEast Oceania S.Asian)
+ wget --timestamping \
+ http://hgdp.uchicago.edu/data/iHS/smoothed$continent.iHS.gff.gz
+ set bedGraph = `echo $continent \
+ | sed -re 's/Bantu/Africa/; s/pean$/pe/; s/\.Asian?/Asia/; \
+ s/MiddleEast/Mideast/; s/(.*)/hgdpIhs\1.bedGraph/'`
+ echo $bedGraph
+ zcat smoothed$continent.iHS.gff.gz \
+ | sed -e 's/^chr23/chrX/' \
+ | awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \
+ > $bedGraph
+ end
+ foreach f (*.bedGraph)
+ hgLoadBed hg18 $f:r $f -bedGraph=4
+ end
+#Reading hgdpIhsAfrica.bedGraph
+#Loaded 540438 elements of size 4
+#Reading hgdpIhsAmericas.bedGraph
+#Loaded 422167 elements of size 4
+#Reading hgdpIhsEAsia.bedGraph
+#Loaded 487801 elements of size 4
+#Reading hgdpIhsEurope.bedGraph
+#Loaded 543875 elements of size 4
+#Reading hgdpIhsMideast.bedGraph
+#Loaded 552277 elements of size 4
+#Reading hgdpIhsOceania.bedGraph
+#Loaded 425340 elements of size 4
+#Reading hgdpIhsSAsia.bedGraph
+#Loaded 550231 elements of size 4
+
+
+#############################################################################
+# HGDP XP-EHH (DONE 2/12/09 angie)
+ mkdir /hive/data/genomes/hg18/bed/hgdpXpehh
+ cd /hive/data/genomes/hg18/bed/hgdpXpehh
+ foreach continent (Bantu Americas E.Asia Europe Mideast Oceania S.Asia)
+ wget --timestamping \
+ http://hgdp.uchicago.edu/data/XPEHH/$continent.xpehh.forbrowser.gff.gz
+ set bedGraph = `echo $continent \
+ | sed -re 's/Bantu/Africa/; s/\.Asia?/Asia/; \
+ s/(.*)/hgdpXpehh\1.bedGraph/'`
+ echo $bedGraph
+ zcat $continent.xpehh.forbrowser.gff.gz \
+ | awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \
+ > $bedGraph
+ end
+ foreach f (*.bedGraph)
+ hgLoadBed hg18 $f:r $f -bedGraph=4
+ end
+#Reading hgdpXpehhAfrica.bedGraph
+#Loaded 636680 elements of size 4
+#Reading hgdpXpehhAmericas.bedGraph
+#Loaded 636143 elements of size 4
+#Reading hgdpXpehhEAsia.bedGraph
+#Loaded 635799 elements of size 4
+#Reading hgdpXpehhEurope.bedGraph
+#Loaded 636680 elements of size 4
+#Reading hgdpXpehhMideast.bedGraph
+#Loaded 636849 elements of size 4
+#Reading hgdpXpehhOceania.bedGraph
+#Loaded 637418 elements of size 4
+#Reading hgdpXpehhSAsia.bedGraph
+#Loaded 636773 elements of size 4
+
+
+#############################################################################