418b11748d3ae9c1217714a92d4a7fe46b6819b9
chmalee
  Thu May 28 17:14:54 2020 -0700
Added a few more trios to the 1000 Genomes trio track QA ready, refs #25582

diff --git src/hg/makeDb/doc/hg38/variation.txt src/hg/makeDb/doc/hg38/variation.txt
index 1e06434..a61f1c8 100644
--- src/hg/makeDb/doc/hg38/variation.txt
+++ src/hg/makeDb/doc/hg38/variation.txt
@@ -1,1876 +1,1971 @@
 # for emacs: -*- mode: sh; -*-
 
 # Variation tracks for hg38 / GRCh38
 
 ##############################################################################
 # DBSNP B141 / SNP141 (DONE 10/14/14)
 # originally done 9/10/14.
 # updated 10/9/14 - 10/14/14 after Matt found that MultipleAlignments was erroneously triggered
 # by PAR1 X alt mappings.
 # Redmine #13309
     mkdir -p /hive/data/outside/dbSNP/141/human_hg38
     cd /hive/data/outside/dbSNP/141/human_hg38
     # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/
     # to find the subdir name to use as orgDir below (human_9606 in this case).
     # Then click into that directory and look for file names like
     #    b(1[0-9][0-9])_*_([0-9]+_[0-9])
     # -- use the first num for build and the second num_num for buildAssembly.
     # jkStuff/liftContigs.lft maps NCBI contig names to chroms; use that for liftUp.
     #
     # Some trial and error was required to get the config.ra just right
     cat > config.ra <<EOF
 db hg38
 orgDir human_9606
 build 141
 buildAssembly 
 refAssemblyLabel GRCh38
 EOF
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log
     # hg38 doesn't have a jkStuff/ liftUp file like hg19 & earlier; use the file
     # created by the script from the ContigInfo table dump:
     cp suggested.lft usingSuggested.lft
 cat >> config.ra <<EOF
 liftUp usingSuggested.lft
 EOF
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp \
       >>& do.log & tail -f do.log
 
     # After first run, find SNPs that were dropped due to not having rs_fasta records:
     zcat snp141Errors.bed.gz \
     | grep 'Missing observed' \
     | cut -f 4 \
     | sort -u \
       > idsNoFasta.txt
     # Upload that file here and select FASTA as output format:
     #   http://www.ncbi.nlm.nih.gov/projects/SNP/dbSNP.cgi?list=rsfile
     # Wait for email with link to results.
     # 41 IDs are rejected as not in dbSNP, OK.
     # Download the batch query results:
     wget ftp://ftp.ncbi.nlm.nih.gov/snp/batch/140822173438.gz
     # Move that file into the rs_fasta directory with a name that will be picked up
     # by "zcat rs_fasta/rs_ch*.fas.gz":
     mv 140822173438.gz rs_fasta/rs_chBatchQuery.fas.gz
     # Now continue from the addToDbSnp step.
     # NOTE: I actually combined this addToDbSnp run with the dbSNP data update run below.
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue addToDbSnp \
       >>& do.log & tail -f do.log
 
     # dbSNP re-released a subset of the FTP files... patch those in and rebuild.
     # I made a new subdir firstBuild and moved a bunch of files there so I can compare.
     cd /hive/data/outside/dbSNP/141/human_hg38
     mv data data.orig
     mkdir data
     cd data
     ln -s ../data.orig/* .
     rm b141_SNPContigLoc.bcp.gz b141_SNPContigLocusId.bcp.gz b141_SNPMapInfo.bcp.gz
     foreach f (b141_SNPContigLoc.bcp.gz b141_SNPContigLocusId.bcp.gz b141_SNPMapInfo.bcp.gz)
       wget ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b141_GRCh38/database/organism_data/update_2014_Jul25/$f
     end
     hgsql hg38snp141 -e 'drop table b141_SNPContigLoc; \
                          drop table b141_SNPContigLocusId; \
                          drop table b141_SNPMapInfo;'
     # Recreate those tables using schema/table.sql.
     # Run the parts of loadDbSnp.csh for those specific tables:
     # ---------- begin loadDbSnp.csh excerpt ----------
     foreach t (b141_SNPContigLocusId b141_SNPMapInfo)
       zcat /hive/data/outside/dbSNP/141/human_hg38/data/$t.bcp.gz  \
       | perl -wpe 's/(\d\d:\d\d:\d\d)\.\d+/$1/g; s/\t(\t|\n)/\t\\N$1/g; s/\t(\t|\n)/\t\\N$1/g;' \
         > tmp.tab
       hgLoadSqlTab -oldTable hg38snp141 $t placeholder tmp.tab
       rm tmp.tab
     end
     hgsql hg38snp141 -e \
       'alter table b141_SNPContigLocusId add index (ctg_id); \
        alter table b141_SNPMapInfo add index (snp_id);'
 
     # b141_SNPContigLoc is huge, and we want only the reference contig mappings.
     # Keep lines only if they have a word match to some reference contig ID.
     # That allows some false positives from coord matches; clean those up afterward.
     zcat /hive/data/outside/dbSNP/141/human_hg38/data/b141_ContigInfo.bcp.gz \
     | cut -f 1 | sort -n > b141_ContigInfo.ctg_id.txt
     zcat /hive/data/outside/dbSNP/141/human_hg38/data/b141_SNPContigLoc.bcp.gz \
     | grep -Fwf b141_ContigInfo.ctg_id.txt \
     | perl -wpe 's/(\d\d:\d\d:\d\d)\.\d+/$1/g; s/\t(\t|\n)/\t\\N$1/g; s/\t(\t|\n)/\t\\N$1/g;' \
     | hgLoadSqlTab -oldTable hg38snp141 b141_SNPContigLoc placeholder stdin
     # Get rid of those false positives:
     hgsql hg38snp141 -e 'alter table b141_SNPContigLoc add index (ctg_id);'
     hgsql hg38snp141 -e 'create table ContigLocFix select cl.* from b141_SNPContigLoc as cl, b141_ContigInfo as ci where cl.ctg_id = ci.ctg_id;'
     hgsql hg38snp141 -e 'alter table ContigLocFix add index (ctg_id);'
     hgsql hg38snp141 -e 'drop table b141_SNPContigLoc; \
                          rename table ContigLocFix to b141_SNPContigLoc;'
     hgsql hg38snp141 -e 'alter table b141_SNPContigLoc add index (snp_id);'
     # ---------- end loadDbSnp.csh excerpt ----------
     # Now continue from addToDbSnp to pick up the changes -- but stop after translate
     # so we can look for strand bugs using hg19 b138 vs. b141 info.
     cd /hive/data/outside/dbSNP/141/human_hg38
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue addToDbSnp -stop translate \
       >>& do.log & tail -f do.log
 
     # Try to identify wrongful strand-flips from b138 (see hg19.txt SNP141):
     # Map rsIDs (uniquely mapped this time around) to strand, ref allele,
     # observed alleles, and ObservedMismatch status:
     zcat snp141.bed.gz \
     | awk -F"\t" '{if ($18 ~ /ObservedMismatch/) { gotObs = "y"; } else { gotObs = "n"; } \
                    print $4 "\t" $6 "\t" $7 "\t" $9 "\t" gotObs;}' \
     | sort -u \
       > hg38_snp141_oriEtc.txt
     # Get the same rsID-only format from the hg19 snp141 directory:
     sed -e 's/^.*\.rs/rs/' ../human_hg19/hg19_snp138_snp141_oriEtc.txt \
     | sort -u \
       > hg19_snp138_snp141_oriEtc.txt
     join hg19_snp138_snp141_oriEtc.txt hg38_snp141_oriEtc.txt \
       > hg19_snp138_snp141_hg38_snp141_oriEtc.txt
     # Now look for cases where there's no ObservedMismatch in b138 but strand changed
     # from hg19/b138 to hg19/b141:
     awk '$5 == "n" && $2 != $6 {print;}' hg19_snp138_snp141_hg38_snp141_oriEtc.txt \
     | uniq \
       > hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt
     wc -l hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt
 #335406 hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt
     # What are the combinations of ObservedMismatch status in hg19/b141 and hg38/b141?
     awk '{print $5, $9, $13;}' hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt \
     | sort \
     | uniq -c
 #  65455 n n n
 #     17 n n y
 #    148 n y n
 # 269786 n y y
     # Take a look at some of each
     # No ObservedMismatch:
     awk '$9 == "n" && $13 == "n"' hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt \
     | head
 #rs1000005 - G C/G n + G C/G n + G C/G n  # b141 bad alignment
 #rs1000021 - G C/G n + G C/G n + G C/G n  # b141 bad alignment
 #rs1000023 - G C/G n + G C/G n + G C/G n  # b141 bad alignment
 #rs1000024 - C C/G n + C C/G n + C C/G n
 #rs1000190 - G C/G n + G C/G n + G C/G n
 #rs1000195 - A A/T n + A A/T n + A A/T n
 #rs1000205 - A A/T n + A A/T n + A A/T n  # b141 bad alignment
     # -- as in hg19, complementary alleles or insertions can't trigger ObsMismatch anyway.
     # Any allele changes in there?
     awk '$9 == "n" && $13 == "n" && $3 != $11' \
       hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt \
     | head
 #rs1005350 - G C/G n + G C/G n + C C/G n  # flank matches - strand (b138 correct), new contig
 #rs1008420 - A A/T n + A A/T n + T A/T n  # flank matches - strand (b138 correct) on every alt
                                           # (19, 19_GL*, 19_KI*), some new contigs, some not
 #rs1008475 - G C/G n + G C/G n + C C/G n  # flank matches - strand (b138 correct), new contig
 #rs1008525 - G C/G n + G C/G n + C C/G n  # flank matches - strand (b138 correct) on every alt
 #rs1008598 - C C/G n + C C/G n + G C/G n  # flank matches - strand (b138 correct) on every alt
 #rs1012024 - A A/T n + A A/T n + T A/T n  # flank matches - strand (b138 correct), new contig
 #rs1016720 - T A/T n + T A/T n + A A/T n  # flank matches - strand (b138 correct), new contig
 #rs1018837 - A A/T n + A A/T n + T A/T n  # flank matches - strand (b138 correct), new contig
 #rs1018839 - A A/T n + A A/T n + T A/T n  # flank matches - strand (b138 correct) on every alt
 #rs1024351 - C C/G n + C C/G n + G C/G n  # flank matches - strand (b138 correct) on every alt
     # -- in all of those cases, both strand and allele changed... any exceptions?
     awk '$9 == "n" && $13 == "n" && $3 != $11 && $2 == 10' \
       hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt \
     | head
     # -- no output.
     # Any non-C/G|A/T obs?  yep, tri- and quad-allelics:
     awk '$9 == "n" && $13 == "n" && $12 != "C/G" && $12 != "A/G"' \
       hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt \
     | head
 #rs1001331 - G C/G/T n + G C/G/T n + G C/G/T n        # b141 bad alignment
 #rs1001741 - G A/C/G n + G A/C/G n + G A/C/G n        # b141 bad alignment
 #rs10017873 + G A/C/G/T n - G A/C/G/T n - G A/C/G/T n # !! b141 correct !!  uh-oh.
 #rs1002888 - G A/C/G n + G A/C/G n + G A/C/G n        # b141 bad alignment
 #rs1003420 - C C/G/T n + C C/G/T n + C C/G/T n        # b141 bad alignment
 #rs1003510 - A A/C/T n + A A/C/T n + A A/C/T n        # b141 bad alignment
 #rs1003524 - C C/G/T n + C C/G/T n + C C/G/T n        # b141 bad alignment
 #rs1003689 - C A/C/G n + C A/C/G n + C A/C/G n        # b141 bad alignment
 #rs1004401 - G C/G/T n + G C/G/T n + G C/G/T n        # b141 bad alignment
 #rs10059147 + T A/C/G/T n - T A/C/G/T n - T A/C/G/T n # !! b141 correct !!  uh-oh.
     # -- I think we should swap the (n n n)'s strands except for quad-allelics.
     # ObservedMismatch new in hg38/b141 (only 17 of these):
     awk '$9 == "n" && $13 == "y"' hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt
 #rs238806 - G C/G/T n + G C/G/T n - T C/G/T y    # flank matches - strand on 17, + on alt,
                                                  # b141 has it backwards
 #rs2520920 - A A/C/T n + A A/C/T n - C A/C/T y   # flank matches - strand on 16, + on alt,
                                                  # b141 has it backwards
 #rs290792 - T A/G/T n + T A/G/T n + C A/G/T y    # flank matches -, 141 has +, new contig
 #rs4114722 - C C/G/T n + C C/G/T n + A C/G/T y   # b141 bad alignment
 #rs41544215 - C A/C/G n + C A/C/G n + T A/C/G y  # b141 bad alignment on all regardless of ObsMis
 #rs41562819 - A A/C/T n + A A/C/T n + G A/C/T y  # b141 bad alignment
 #rs499521 - C A/C/G n + C A/C/G n + T A/C/G y    # b141 bad alignment on all regardless of ObsMis
 #rs550657 - G C/G/T n + G C/G/T n - T C/G/T y    # b141 bad alignment on all regardless of ObsMis
 #rs9629160 - T A/C/T n + T A/C/T n + G A/C/T y   # b141 bad alignment
     # -- yep, looks like this category (n n y) should be strand-swapped.
     # ObservedMismatch in hg19/b141 but not hg38/b141:
     awk '$9 == "y" && $13 == "n"' hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt \
     | head
 #rs1038172 - G A/C/T n + G A/C/T y - T A/C/T n   # b141 bad alignment, lucky w/ObsMis
 #rs1052993 - A C/T n + A C/T y - A C/T n         # !! b141 correct on all incl 9 alts !!
 #rs1052993 - A C/T n + A C/T y - G C/T n         # -- same variant, different allele
 #rs1052995 - T A/G n + T A/G y - C A/G n         # !! b141 correct on all incl 9 alts !!
 #rs1052995 - T A/G n + T A/G y - T A/G n         # -- same variant, different allele
 #rs1070637 - G A/C/T n + G A/C/T y - A A/C/T n   # b141 bad alignment, lucky w/ObsMis
 #rs1081481 - C A/G n + C A/G y + G A/G n         # !! b141 correct !!
 #rs1096815 - C G/T n + C G/T y + T G/T n         # !! b141 correct !!
 #rs11266881 - A C/G/T n + A C/G/T y - C C/G/T n  # b141 bad alignment, lucky w/ObsMis
 #rs1132605 - G C/T n + G C/T y - A C/T n         # !! b141 correct !!
     # -- in those cases strand in hg38/b141 matches strand in b138.  Exceptions?
     awk '$9 == "y" && $13 == "n" && $2 != $10' \
       hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt \
     | head
 #rs1081481 - C A/G n + C A/G y + G A/G n         # !! b141 correct !!
 #rs1096815 - C G/T n + C G/T y + T G/T n         # !! b141 correct !!
 #rs1539656 - C A/G n + C A/G y + G A/G n
 #rs1663847 - C A/G n + C A/G y + G A/G n
 #rs1663850 - C A/G n + C A/G y + G A/G n
 #rs1663851 - C A/G n + C A/G y + G A/G n
 #rs1663852 - C G/T n + C G/T y + G G/T n
 #rs1663901 - T A/G n + T A/G y + A A/G n
 #rs1663903 - C A/G n + C A/G y + G A/G n
 #rs1778671 - G C/T n + G C/T y + C C/T n
     # -- allele changes accompany strand change in hg38.  Leave this category (n y n) alone.
     # ObservedMismatch in both hg19/b141 and hg38/b141
     awk '$9 == "y" && $13 == "y"' hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt \
     | head
 #rs1000002 - C A/G n + C A/G y + C A/G y         # b141 bad alignment
 #rs1000008 - G C/T n + G C/T y + G C/T y         # b141 bad alignment
 #rs1000015 - T A/G n + T A/G y + T A/G y         # b141 bad alignment
 #rs1000020 - C A/G n + C A/G y + C A/G y
 #rs1000027 - G A/C n + G A/C y + G A/C y
 #rs1000032 - G A/C n + G A/C y + G A/C y
 #rs1000041 - C A/G n + C A/G y + C A/G y
 #rs1000042 - C A/G n + C A/G y + C A/G y
 #rs1000043 - C A/G n + C A/G y + C A/G y
 #rs1000044 - C A/G n + C A/G y + C A/G y
     # -- in those, hg38/snp141 exactly matches hg19/snp141... any allele changes?
     awk '$9 == "y" && $13 == "y" && $7 != $11' \
       hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt \
     | head
 #rs1001572 - A C/T n + A C/T y + G C/T y         # b141 bad alignment
 #rs1001686 - G C/T n + G C/T y - C C/T y         # b141 bad alignment
 #rs1001814 - G C/T n + G C/T y + A C/T y         # b141 bad alignment, both mappings
 #rs1003590 - T A/G n + T A/G y + C A/G y
 #rs1003645 - C A/G n + C A/G y + T A/G y
 #rs1003847 - A C/T n + A C/T y + G C/T y
 #rs1005196 - G C/T n + G C/T y - C C/T y         # b141 bad alignment, all 3 mappings
 #rs1005197 - A C/T n + A C/T y - T C/T y
 #rs1005295 - T A/G n + T A/G y - A A/G y
 #rs1005581 - A C/T n + A C/T y + G C/T y
     # Yep, looking consistently bad.  Swap the (n y y)'s.
     # Are there any IDs that appear more than once? (i.e. in multiple categories or allele change?)
     # Yep, quite a few:
     awk '{print $1;}' hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt \
       > ids_strandChange_all.txt
     uniq ids_strandChange_all.txt \
       > ids_strandChange_uniq.txt
     comm -3 ids_strandChange_all.txt ids_strandChange_uniq.txt | uniq | wc -l
 #2379
     # Some examples:
     grep rs1008310 hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt
 #rs1008310 - C A/G n + C A/G y + C A/G y
 #rs1008310 - C A/G n + C A/G y - A A/G y
 #rs1008310 - C A/G n + C A/G y - G A/G y
     # Let's look specifically for multiple ObsMis categories:
     awk '{print $1, $5, $9, $13;}' hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt \
     | uniq \
       > rsObsMis.txt
     wc -l rsObsMis.txt
 #332961 rsObsMis.txt
     awk '{print $1;}' rsObsMis.txt | uniq | wc -l
 #332942
     # Yep.  OK, get a list:
     awk '{print $1;}' rsObsMis.txt > ids_rsObsMis_all.txt
     uniq ids_rsObsMis_all.txt > ids_rsObsMis_uniq.txt
     comm -3 ids_rsObsMis_all.txt ids_rsObsMis_uniq.txt > ids_rsObsMis_mult.txt
     wc -l ids_rsObsMis_mult.txt
 #19 ids_rsObsMis_mult.txt
     # Example:
     grep rs11266881 hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt
 #rs11266881 - A C/G/T n + A C/G/T y + A C/G/T y  # b141 bad ali to chr17
 #rs11266881 - A C/G/T n + A C/G/T y - C C/G/T n  # b141 bad ali to alt but lucky w/ObsMis
     grep rs238806 hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt
 #rs238806 - G C/G/T n + G C/G/T n + G C/G/T n    # b141 bad ali to chr17 but lucky w/ObsMis
 #rs238806 - G C/G/T n + G C/G/T n - T C/G/T y    # b141 bad ali to alt
     grep rs2432527 hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt
 #rs2432527 - G A/C n + G A/C y + A A/C n         # b141 bad ali to alt 935 but lucky w/ObsMis
 #rs2432527 - G A/C n + G A/C y + G A/C y         # b141 bad ali to chr3, alts 779
 #rs2432527 - G A/C n + G A/C y - C A/C y         # b141 bad ali to alts 895, 924, 934, 936, 937
     grep rs2520920 hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt
 #rs2520920 - A A/C/T n + A A/C/T n + A A/C/T n   # b141 bad ali but lucky w/ObsMis
 #rs2520920 - A A/C/T n + A A/C/T n - C A/C/T y
     # -- spot-checked a few more and they're all bad but have at least one lucky w/ObsMis.
     # Therefore I think it is safe to swap strand by rsID -- I haven't seen any cases where
     # some mappings are correct but not other mappings for the same rsID.
     # Make a list of rsID's that need strand-swapping, i.e. anything with ObsMis in hg38 b141
     # and the (n n n)'s except quad-allelic.
     awk '$13 == "y" || ($9 == "n" && $4 != "A/C/G/T" && $12 != "A/C/G/T") {print $1;}' \
       hg19_snp138_snp141_hg38_snp141_oriEtc_strandChange.txt \
     | uniq \
       > idsToSwapStrand.txt
     wc -l idsToSwapStrand.txt 
 #331493 idsToSwapStrand.txt
     sed -e 's/^rs//;' idsToSwapStrand.txt \
     | awk \
     '{print "update b141_SNPContigLoc set orientation = 1 - orientation where snp_id = "$1";";}' \
       > fixOrientation.sql
     hgsql hg38snp141 < fixOrientation.sql
     # Now continue from addToDbSnp to pick up the changes
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue addToDbSnp \
       >>& do.log & tail -f do.log
 # *** All done!
 
     # As in hg19.snp141, there were SNPs with weight > 1 but not MultipleAlignments,
     # and after realigning some flanking sequences, I believe the weights are erroneous.
     # Unfortunately hg38 also has some true cases of MultipleAlignments caused by
     # weird almost-duplicate mappings... so pay attention to MultipleAlignments, but if
     # we didn't find MultipleAlignments, then tweak the weight to 1 as we did for hg19:
     mv snp141.bed.gz snp141BadWeights.bed.gz
     zcat snp141BadWeights.bed.gz \
     | awk -F"\t" 'BEGIN{OFS="\t";} {if ($18 !~ /MultipleAlignments/) { $17 = 1; }  print;}' \
     | gzip -c \
       > snp141.bed.gz
     hgLoadBed -tab -onServer -tmpDir=$TMPDIR -allowStartEqualEnd -type=bed6+ \
       hg38 snp141 -sqlTable=snp141.sql snp141.bed.gz
     # Now rebuild snp141{Common,Flagged} which were missing some SNPs due to incorrect
     # weights.  Fortunately, I don't need to redo ortho alleles or masked sequences
     # because they filter out MultipleAlignments ignoring weight.
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=filter -stop=filter \
       >>& do.log & tail -f do.log
 # (before: snp141Common had 14546553 rows, snp141Flagged had 124075)
 # (after:  snp141Common has 14547298 rows, snp141Flagged has 124776)
 
     # 10/9/14: Matt found that MultipleAlignments was erroneously triggered
     # by PAR1 X alt mappings.  I added chrX_*_alt entries to PAR table
     # (see "Add chrX alts to par" section below); now re-run from snpNcbiToUcsc.
     # 10/14/14: turned out I messed up the X_*_alt par entries, doh! Fixed par; run again.
     cd /hive/data/outside/dbSNP/141/human_hg38
     set tmpDir = `cat workingDir`
     mkdir $tmpDir
     pushd $tmpDir
     hgsql hg38 -NBe 'select chrom,chromStart,chromEnd,name from par' > par.bed
     ln -s /hive/data/outside/dbSNP/141/human_hg38/ucscNcbiSnp.bed.gz .
     echo "\n10/14/14: Redo from snpNcbiToUcsc\n" >> /hive/data/outside/dbSNP/141/human_hg38/do.log
     echo "snpNcbiToUcsc -snp132Ext -par=par.bed ucscNcbiSnp.bed.gz /hive/data/genomes/hg38/hg38.2bit snp141" >> /hive/data/outside/dbSNP/141/human_hg38/do.log
     snpNcbiToUcsc -snp132Ext -par=par.bed ucscNcbiSnp.bed.gz /hive/data/genomes/hg38/hg38.2bit \
       snp141 \
       >>& /hive/data/outside/dbSNP/141/human_hg38/do.log &
     tail -f /hive/data/outside/dbSNP/141/human_hg38/do.log
     # Now repeat the fix from above for incorrect weights:
     mv snp141.bed snp141BadWeights.bed
     awk -F"\t" 'BEGIN{OFS="\t";} {if ($18 !~ /MultipleAlignments/) { $17 = 1; }  print;}' \
       snp141BadWeights.bed \
       > snp141.bed
     # Compare results -- the only changes should be the disappearance of MultipleAlignments
     # from the part of PAR1 that the alts overlap: chrX:319338-601516, chrY:319338-601516,
     # chrX_KI270880v1_alt and chrX_KI270913v1_alt).
     zcat /hive/data/outside/dbSNP/141/human_hg38/snp141.bed.gz > snp141.prev.bed
     diff snp141.prev.bed snp141.bed | less
     # Yep, differences are limited to those ranges and are disappearance of MultipleAlignments.
     rm snp141.prev.bed
     gzip *.txt *.bed *.tab
     cp -p * /hive/data/outside/dbSNP/141/human_hg38/
     popd
 
     # Reload snp141 and snp141ExceptionDesc
     hgLoadBed -tab -onServer -tmpDir=$TMPDIR -allowStartEqualEnd -type=bed6+ \
       hg38 snp141 -sqlTable=snp141.sql snp141.bed.gz
     zcat snp141ExceptionDesc.tab.gz \
     | hgLoadSqlTab hg38 snp141ExceptionDesc $HOME/kent/src/hg/lib/snp125ExceptionDesc.sql stdin
 
     # Regenerate snp141{Mult,Common,Flagged}:
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=filter -stop=filter \
       >>& do.log & tail -f do.log
 
     # Clean up by pasting the output of this if it looks right:
     echo rm `cat workingDir`/snp*
 
     # Now unfortunately I do need to redo pretty much everything from this point on
     # because everything uses MultipleAlignments.
 
 ##############################################################################
 # SNP141 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 10/15/14 angie)
 # originally done 9/10/14.
 # updated 10/15/14 after Matt found that MultipleAlignments was erroneously triggered
     # Redmine #13309
     mkdir /hive/data/genomes/hg38/bed/snp141Ortho
     cd /hive/data/genomes/hg38/bed/snp141Ortho
     # Filter snp141 to keep only uniquely mapped biallelic SNVs (class=single, length=1);
     zcat /hive/data/outside/dbSNP/141/human_hg38/snp141.bed.gz \
     | awk '$18 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
     | sort -u \
       > snp141ExcludeIds.txt
     wc -l snp141ExcludeIds.txt
 #304622 snp141ExcludeIds.txt
     # Glom all human info that we need for the final table onto the
     # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
     zcat /hive/data/outside/dbSNP/141/human_hg38/snp141.bed.gz \
     | awk '$3-$2 == 1 && $11 == "single" {print;}' \
     | grep -vFwf snp141ExcludeIds.txt \
     | awk 'BEGIN{OFS="\t";} \
         {print $1, $2, $3, \
                $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
                0, $6;}' \
       > snp141ForLiftOver.bed
       wc -l snp141ForLiftOver.bed
 #56466708 snp141ForLiftOver.bed
 
     # For each other species, do a cluster run to liftOver to that species' coords
     # and get the species' "allele" (reference assembly base) at that location.
     # End with a lexical sort because we're going to join these files later.
     cat > liftOne.csh<<'EOF'
 #!/bin/csh -ef
 set chunkFile = $1
 set db = $2
 set outFile = $3
 set Db = `echo $db | perl -wpe 's/(\S+)/\u$1/'`
 set liftOverFile = /hive/data/genomes/hg38/bed/liftOver/hg38To$Db.over.chain.gz
 set other2bit = /hive/data/genomes/$db/$db.2bit
 liftOver $chunkFile $liftOverFile stdout /dev/null \
 | $HOME/kent/src/hg/snp/snpLoad/getOrthoSeq.pl $other2bit \
 | sort > $outFile
 'EOF'
 EOF
     chmod a+x liftOne.csh
 
     # Map coords to chimp using liftOver.
     mkdir run.liftOver
     cd run.liftOver
     mkdir split out
     splitFile ../snp141ForLiftOver.bed 10000 split/chunk
     cp /dev/null jobList
     foreach chunkFile (split/chunk*)
       set chunk = $chunkFile:t:r
       foreach db (panTro4 ponAbe2 rheMac3)
         echo ../liftOne.csh $chunkFile $db \{check out exists out/$db.$chunk.bed\} \
           >> jobList
       end
     end
     ssh ku
     screen -S ortho -t ortho
     cd /hive/data/genomes/hg38/bed/snp141Ortho/run.liftOver
     para make jobList
 #Completed: 16941 of 16941 jobs
 #CPU time in finished jobs:    1562387s   26039.78m   434.00h   18.08d  0.050 y
 #IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
 #Average job time:                  82s       1.37m     0.02h    0.00d
 #Longest finished job:             588s       9.80m     0.16h    0.01d
 #Submission to last job:          2881s      48.02m     0.80h    0.03d
 
     cd /hive/data/genomes/hg38/bed/snp141Ortho
     # Join the results for each chunk:
     cat > joinOne.csh <<'EOF'
 #!/bin/csh -ef
 set chimpFile = $1
 set orangFile = $2
 set macFile = $3
 set outFile = $4
 set tmpFile = `mktemp`
     # Use the glommed name field as a key to join up chimp, orang and macaque
     # allele data.  Include glommed name from both files because if only
     # file 2 has a line for the key in 2.1, then 1.1 is empty.  Then plop
     # in the orthoGlom fields from each file, which are in the same order
     # as the chimp and macaque columns of hg18.snp128OrthoPanTro2RheMac2.
     join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
       -a 1 -a 2 -e '?' \
       $chimpFile $orangFile \
     | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
             else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
       > $tmpFile
     join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
       -a 1 -a 2 -e '?' \
       $tmpFile $macFile \
     | perl -wpe 'chomp; \
         ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
          $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
          $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
         $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
         ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
           split(/\|/, $glomKey); \
         $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
         $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
         print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
                          $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
                          $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
                          $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
         s/^.*$//;' \
         > $outFile
      rm $tmpFile
 'EOF'
 EOF
     chmod a+x joinOne.csh
     mkdir /hive/data/genomes/hg38/bed/snp141Ortho/run.join
     cd /hive/data/genomes/hg38/bed/snp141Ortho/run.join
     mkdir out
     ln -s ../run.liftOver/split .
     cp /dev/null jobList
     foreach f (split/chunk*)
       set chunk = $f:t
       echo ../joinOne.csh ../run.liftOver/out/{panTro4,ponAbe2,rheMac3}.$chunk.bed \
         \{check out exists out/$chunk.bed\} \
         >> jobList
     end
     para make jobList
 #Completed: 5647 of 5647 jobs
 #CPU time in finished jobs:       1626s      27.10m     0.45h    0.02d  0.000 y
 #IO & Wait Time:                 44978s     749.63m    12.49h    0.52d  0.001 y
 #Average job time:                   8s       0.14m     0.00h    0.00d
 #Longest finished job:              35s       0.58m     0.01h    0.00d
 #Submission to last job:           642s      10.70m     0.18h    0.01d
 
     # Back on hgwdev, cat all of the joined results together (~45mins) & load table.
     cd /hive/data/genomes/hg38/bed/snp141Ortho
     sort -k1,1 -k2n,2n run.join/out/chunk*.bed > snp141OrthoPt4Pa2Rm3.bed
     hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
       -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
       hg38 snp141OrthoPt4Pa2Rm3 snp141OrthoPt4Pa2Rm3.bed
 #Read 54877953 elements of size 22 from snp141OrthoPt4Pa2Rm3.bed
 
     # Cleanup:
     rm -r run*/out run*/split
     gzip snp141ExcludeIds.txt snp141ForLiftOver.bed snp141OrthoPt4Pa2Rm3.bed &
 
 
 ############################################################################
 # SNPMASKED SEQUENCE FOR SNP141 (DONE 10/14/14 angie)
 # originally done 9/3/14.
 # updated 10/14/14 after Matt found that MultipleAlignments was erroneously triggered
     # Redmine #13309
     mkdir /hive/data/genomes/hg38/snp141Mask
     cd /hive/data/genomes/hg38/snp141Mask
     # Identify rsIds with various problems -- we will exclude those.
     zcat /hive/data/outside/dbSNP/141/human_hg38/snp141.bed.gz \
     | awk '$18 ~ /MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved/ {print $4;}' \
       | sort -u \
       > snp141ExcludeRsIds.txt
     zcat /hive/data/outside/dbSNP/141/human_hg38/snp141.bed.gz \
     | grep -vFwf snp141ExcludeRsIds.txt \
       > snp141Cleaned.bed
     wc -l snp141Cleaned.bed
 #61856902 snp141Cleaned.bed
 
     # Substitutions:
     mkdir substitutions
     snpMaskSingle snp141Cleaned.bed /hive/data/genomes/hg38/hg38.2bit stdout diffObserved.txt \
     | faSplit byname stdin substitutions/
 #Masked 55246923 snps in 55246914 out of 3204659592 genomic bases
 #/hive/data/genomes/hg38/hg38.2bit has 3209286105 total bases, but the total number of bases in sequences for which we masked snps is 3204659592 (difference is 4626513)
     # Check that 4626513 is the total #bases in sequences with nothing in snp141Cleaned:
     grep -Fw single snp141Cleaned.bed | cut -f 1 | uniq > /data/tmp/1
     grep -vwf /data/tmp/1 ../chrom.sizes \
     | awk 'BEGIN {TOTAL = 0;}  {TOTAL += $2;}  END {printf "%d\n", TOTAL;}'
 #4626513
     # warnings about differing observed strings at same base position:
     wc -l diffObserved.txt
 #20 diffObserved.txt
     # -- small beans, and dbSNP is aware of thousands of SNPs that have clustering issues.
     # Make sure that sizes are identical, first diffs are normal -> IUPAC,
     # and first diffs' case is preserved:
     foreach f (substitutions/chr*.fa)
       twoBitToFa -seq=$f:t:r /hive/data/genomes/hg38/hg38.2bit stdout \
       | faCmp -softMask $f stdin |& grep -v "that differ"
     end
 #chr1 in substitutions/chr1.fa differs from chr1 at stdin at base 10107 (y != c)
 #chr10 in substitutions/chr10.fa differs from chr10 at stdin at base 14582 (K != T)
 #chr10_GL383545v1_alt in substitutions/chr10_GL383545v1_alt.fa differs from chr10_GL383545v1_alt at stdin at base 58 (R != G)
 #chr10_GL383546v1_alt in substitutions/chr10_GL383546v1_alt.fa differs from chr10_GL383546v1_alt at stdin at base 35 (R != A)
 #...
 #(output OK -- ambiguous bases replacing [agct] at SNP positions)
     foreach f (substitutions/chr*.fa)
       mv $f $f:r.subst.fa
     end
     # Fire off a bunch of gzip jobs in parallel:
     ls -1 substitutions/*.fa | split -l 10
     foreach f (x??)
       gzip `cat $f` &
     end
     # Wait for backgrounded gzip jobs to complete
     rm x??
 
     # Insertions & deletions not done.  To date we have only offered substs for download.
     # If there is user demand, use template from snp131 above.
 
     # Clean up and prepare for download:
     gzip snp141Cleaned.bed &
     foreach d (substitutions)
       pushd $d
         md5sum *.gz > md5sum.txt
         # NOTE FOR NEXT TIME: copy from previous hg38 version.
         cp /hive/data/genomes/hg19/snp141Mask/$d/README.txt .
       popd
     end
     # Edit the README.txt.
 
     # Create download links on hgwdev.
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/snp141Mask
     ln -s /hive/data/genomes/hg38/snp141Mask/substitutions/* \
       /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/snp141Mask/
 
 
 ##############################################################################
 # DBSNP B142 / SNP142 (DONE 10/14/15)
 # originally done 10/24/14
 # 11/17/14: Allele dump file was outdated at time of download -- it was re-dumped a couple weeks
 #           after release.  snp142 was missing some frequencies due to incomplete join with Allele.
 #           Rebuilding all tables that have allele frequencies (just the track tables & exceptions).
 # 2/17/15: User reported missing validation info (#14836); Jonathan found that SNP.bcp.gz
 #          had been updated 1/30/15, argh.  Rebuilding parts that depend on the SNP table.
 # 10/14/15: Zero out the allele freq columns for snp142 rows on the '-' strand that include
 #           1000GENOMES in submitters, because they are incorrect (Bug #16204).
 #           Regenerate snp142{Common,Flagged,Mult}.
     # Redmine #14189
     mkdir -p /hive/data/outside/dbSNP/142/human_hg38
     cd /hive/data/outside/dbSNP/142/human_hg38
     # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/organisms/
     # to find the subdir name to use as orgDir below (human_9606_b142_GRCh38 in this case).
     # Go to that subdirectory, then to database/organism_data/ and look for files
     # whose names start with b142_* and may or may not end with a suffix that identifies
     # the build assembly version or some annotation version.  If there is a suffix shared
     # by all b142_* files, add that to config.ra as the "buildAssembly".
     cat > config.ra <<EOF
 db hg38
 orgDir human_9606_b142_GRCh38
 build 142
 buildAssembly 106
 refAssemblyLabel GRCh38
 EOF
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log
 
     # It fails saying there are sequences it won't be able to lift (because we haven't
     # specified a liftUp file in config.ra).  Use the auto-generated suggested.lft file:
     cp suggested.lft usingSuggested.lft
     cat >> config.ra <<EOF
 liftUp usingSuggested.lft
 EOF
     # Continue at loadDbSnp.  Stop after loadDbSnp to identify SNPs whose sequences didn't
     # make it into the rs_fasta dump files; we can fetch those separately from dbSNP's
     # batch query.
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp -stop=loadDbSnp \
       >>& do.log & tail -f do.log
 
     # While that's running, compare rs IDs in rs_fasta with b142_SNPContigLoc_106 to see
     # which IDs (if any) were omitted from the rs_fasta dump.
     zcat rs_fasta/rs*.fas.gz \
     | perl -wne 'if (/^>/) { s/^>gnl\|dbSNP\|(rs\d+) .*/$1/ || die; print; }' \
     | sort -u > rsFastaIds.txt
     zcat data/b142_SNPContigLoc_106.bcp.gz \
     | awk '{print "rs" $2;}' \
     | sort -u > contigLocIds.txt
     comm -13 rsFastaIds.txt contigLocIds.txt > missingFromRsFasta.txt
     wc -l missingFromRsFasta.txt
 #113 missingFromRsFasta.txt
     # Fewer than in snp141 but more than hg19.snp142.
     # Fetch fasta for those ids from dbSNP's batch_query.
     # Go to this URL:
     #   http://www.ncbi.nlm.nih.gov/projects/SNP/dbSNP.cgi?list=rsfile
     # Enter your email address, select FASTA as output format, and upload missingFromRsFasta.txt
     # Wait for email with link to results -- don't close that tab! -- then download the results:
     # NOTE: after 1 day no email has arrived from dbSNP, so I suspect that as in hg19 snp142,
     # all of those rs's have been deleted.  I hand-checked a half dozen from different parts
     # of the file and all were deleted -- looks like a big range of rs's were deleted due to
     # retracted single submissions.
     # So rs_fasta is actually complete this time and those extras in SNPContigLoc can be dropped!
     # Here's what I would have done if actual fasta sequences were returned:
     wget -O rs_fasta/rs_chBatchQuery.fas.gz $fileFromDbSnpEmail
     # Now continue from the addToDbSnp step.
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue addToDbSnp \
       >>& do.log & tail -f do.log
 # *** All done!
     # Wow, it almost never runs so smoothly; there are usually some formatting weirdnesses
     # that gum up snpNcbiToUcsc in the translate step.
 
     # Ah, they have made lists of strand flippers again... should reality-check them:
     wget ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/misc/known_issues/b142/rs_with_changed_orientation.bcp
     # First two lines:
 #GRCh38_chr|GRCh38_chr_pos_start_0based|GRCh38_chr_reference_allele|snp_id|b142_GRCh38_rs2chr_orien|b141_GRCh38_rs2chr_orien|orienChange_reason|
 #1|119072199|G|138203431|0|1|dataflow_issue|
     # Blatting rs138203431's flanking seqs to hg19, looks like + (0) is the correct orientation
     # so does this mean they goofed the strand in 142?
     # Nope, looks like they goofed it in 141 and 142 is OK, good.
     # Checked more examples, all look like improvements.  Yay!
 
     #########################################################################
     # 11/17/14: Allele dump file was outdated at time of download -- it was re-dumped a couple weeks
     # after release.  snp142 was missing some frequencies due to incomplete join with Allele.
     # Rebuilding all tables that have allele frequencies (just the track tables & exceptions).
     # First download the updated Allele dump file (actually did this only for hg19, same file):
     cd /hive/data/outside/dbSNP/142/shared
     rm Allele.bcp.gz
     wget --timestamping --no-verbose ftp://ftp.ncbi.nih.gov/snp/database/shared_data/Allele.bcp.gz
     cd /hive/data/outside/dbSNP/142/human_hg38
     # Clear out old data from Allele table, then reload with new dump file as in loadDbSnp.csh:
     hgsql hg38snp142 -e 'delete from Allele'
     zcat /hive/data/outside/dbSNP/142/shared/Allele.bcp.gz \
     | perl -wpe 's/(\d\d:\d\d:\d\d)\.\d+/$1/g; s/\t(\t|\n)/\t\\N$1/g; s/\t(\t|\n)/\t\\N$1/g;' \
     | hgLoadSqlTab -oldTable hg38snp142 Allele placeholder stdin
     hgsql hg38snp142 -e 'alter table Allele add index (allele_id);'
     # Now update ucscAlleleFreq as in addToDbSnp.csh:
     /cluster/home/angie/kent/src/hg/utils/automation/snpAddTGPAlleleFreq.pl hg38snp142 \
       -contigLoc=b142_SNPContigLoc_106 -deDupTGP \
       > ucscAlleleFreq.txt
     hgLoadSqlTab hg38snp142 ucscAlleleFreq{,.sql,.txt}
     # Now re-run from bigJoin onward.
     mkdir `cat workingDir`
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=bigJoin \
       >>& do.log & tail -f do.log
 
     #########################################################################
     # 1/23/15: Jonathan noticed that NonIntegerChromCount's count in snp142ExceptionDesc
     # was much higher than the number of snp142 rows with NonIntegerChromCount; turns out
    # it was tallied per allele not per variant.  Fixed; regenerate:
     cd /hive/data/outside/dbSNP/142/human_hg38
     snpNcbiToUcsc -snp132Ext -par=par.bed.gz ucscNcbiSnp.bed.gz /hive/data/genomes/hg38/hg38.2bit \
         snp142RedoExceptionDesc
     # Takes ~45min.  Make sure there are no changes to snp142.bed:
     zcat snp142.bed.gz > /data/tmp/snp142.bed
     cmp /data/tmp/snp142.bed snp142RedoExceptionDesc.bed
     # No output -- good.
     rm /data/tmp/snp142.bed
     mv snp142ExceptionDesc.tab.gz snp142ExceptionDesc.tab.bak.gz
     mv snp142RedoExceptionDescExceptionDesc.tab snp142ExceptionDesc.tab
     hgLoadSqlTab hg38 snp142ExceptionDesc $HOME/kent/src/hg/lib/snp125ExceptionDesc.sql \
       snp142ExceptionDesc.tab
     rm snp142RedoExceptionDesc*
 
     #########################################################################
     # 2/17/15: User reported missing validation info (#14836); Jonathan found that SNP.bcp.gz
     #          had been updated 1/30/15, argh.  Rebuilding parts that depend on the SNP table.
     # First get a baseline, so we know whether the updated file has fixed the problem:
     hgsql hg38 -e 'select count(*) from snp142 where valid = "unknown"'
 #| 109942158 |
     cd /hive/data/outside/dbSNP/142/human_hg38/data
     mv SNP.bcp.gz SNP.orig.bcp.gz
     wget --timestamping \
       ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b142_GRCh38/database/organism_data/SNP.bcp.gz
     # Empty out hg38snp142.SNP because our loading command uses -oldTable because of table.sql...:
     hgsql hg38snp142 -e 'delete from SNP'
     # from loadDbSnp.csh:
     zcat /hive/data/outside/dbSNP/142/human_hg38/data/SNP.bcp.gz \
     | perl -wpe 's/(\d\d:\d\d:\d\d)\.\d+/$1/g; s/\t(\t|\n)/\t\\N$1/g; s/\t(\t|\n)/\t\\N$1/g;' \
     | hgLoadSqlTab -oldTable hg38snp142 SNP placeholder stdin
     # Make sure there's no need to (re)create the index on snp_id as in loadDbSnp.csh.
     hgsql hg38snp142 -e 'show index from SNP'
 #| SNP   |          1 | snp_id   |            1 | snp_id      | A         |   112745696 |     NULL | NULL   | YES  | BTREE      |         |               |
     # yep, the index is still there.
     # Now re-run from bigJoin onward.
     mkdir -p `cat workingDir`
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=bigJoin -stop=filter \
       >>& do.log & tail -f do.log
     # Make sure the updated file has fixed the problem (i.e. has << 109942158 missing 'valid' vals):
     hgsql hg38 -e 'select count(*) from snp142 where valid = "unknown"'
 #| 24844969 |
     # 25M out of 113M is significantly less than 110M/113M, but still -- that's a lot!!
     # By way of comparison, hg19.snp138 has -- wow, 20M out of 65M!  Never realized it was that bad.
 
     #########################################################################
     # 10/14/15: Zero out the allele freq columns for snp142 rows on the '-' strand that include
     #           1000GENOMES in submitters, because they are incorrect (Bug #16204).
     #           Regenerate snp142{Common,Flagged,Mult}.
     cd /hive/data/outside/dbSNP/142/human_hg38
     hgsql hg38 -e 'update snp142 \
                      set alleleFreqCount=0, alleles="", alleleNs="", alleleFreqs="" \
                      where strand = "-" and find_in_set("1000GENOMES", submitters);'
    # Run doDbSnp.pl's filter step on a *modified* snp142.bed.gz to regenerate Common etc.
     mv snp142.bed.gz snp142.150218.bed.gz
     zcat snp142.150218.bed.gz \
     | awk -F"\t" 'BEGIN{OFS="\t";} \
         ($6 == "-" && $20 ~ /1000GENOMES/) { $21=0; $22 = ""; $23 = ""; $24 = ""; } {print;}' \
     > snp142.bed
     # Make sure the changes are as intended:
     zcat snp142.150218.bed.gz | diff - snp142.bed | less
     gzip snp142.bed
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=filter -stop=filter \
       >>& do.log & tail -f do.log
 # *** All done!  (through the 'filter' step)
 
 
 ##############################################################################
 # SNP142 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 11/05/14 angie)
     # Redmine #13309
     mkdir /hive/data/genomes/hg38/bed/snp142Ortho
     cd /hive/data/genomes/hg38/bed/snp142Ortho
    # Filter snp142 to keep only uniquely mapped biallelic SNVs (class=single, length=1):
     zcat /hive/data/outside/dbSNP/142/human_hg38/snp142.bed.gz \
     | awk '$18 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
     | sort -u \
       > snp142ExcludeIds.txt
     wc -l snp142ExcludeIds.txt
 #901804 snp142ExcludeIds.txt
     # Glom all human info that we need for the final table onto the
     # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
     zcat /hive/data/outside/dbSNP/142/human_hg38/snp142.bed.gz \
     | awk '$3-$2 == 1 && $11 == "single" {print;}' \
     | grep -vFwf snp142ExcludeIds.txt \
     | awk 'BEGIN{OFS="\t";} \
         {print $1, $2, $3, \
                $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
                0, $6;}' \
       > snp142ForLiftOver.bed
       wc -l snp142ForLiftOver.bed
 #105591074 snp142ForLiftOver.bed
 
     # Do a cluster run to use liftOver to get the other species' coords
     # and get the species' "allele" (reference assembly base) at that location.
     # End with a lexical sort because we're going to join these files later.
     cat > liftOne.csh<<'EOF'
 #!/bin/csh -ef
 set chunkFile = $1
 set db = $2
 set outFile = $3
 set Db = `echo $db | perl -wpe 's/(\S+)/\u$1/'`
 set liftOverFile = /hive/data/genomes/hg38/bed/liftOver/hg38To$Db.over.chain.gz
 set other2bit = /hive/data/genomes/$db/$db.2bit
 liftOver $chunkFile $liftOverFile stdout /dev/null \
 | $HOME/kent/src/hg/snp/snpLoad/getOrthoSeq.pl $other2bit \
 | sort > $outFile
 'EOF'
 EOF
     chmod a+x liftOne.csh
 
     # Map coords to chimp using liftOver.
     mkdir run.liftOver
     cd run.liftOver
     mkdir split out
     splitFile ../snp142ForLiftOver.bed 10000 split/chunk
     cp /dev/null jobList
     foreach chunkFile (split/chunk*)
       set chunk = $chunkFile:t:r
       foreach db (panTro4 ponAbe2 rheMac3)
         echo ../liftOne.csh $chunkFile $db \{check out exists out/$db.$chunk.bed\} \
           >> jobList
       end
     end
     ssh ku
     screen -S ortho -t ortho
     cd /hive/data/genomes/hg38/bed/snp142Ortho/run.liftOver
     para make jobList
     # busy day on the cluster -- 4.4 hours instead of <1 hr
 #Completed: 31680 of 31680 jobs
 #CPU time in finished jobs:    2877974s   47966.23m   799.44h   33.31d  0.091 y
 #IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
 #Average job time:                  89s       1.48m     0.02h    0.00d
 #Longest finished job:             532s       8.87m     0.15h    0.01d
 #Submission to last job:         15927s     265.45m     4.42h    0.18d
 
     cd /hive/data/genomes/hg38/bed/snp142Ortho
     # Join the results for each chunk:
     cat > joinOne.csh <<'EOF'
 #!/bin/csh -ef
 set chimpFile = $1
 set orangFile = $2
 set macFile = $3
 set outFile = $4
 set tmpFile = `mktemp`
     # Use the glommed name field as a key to join up chimp, orang and macaque
     # allele data.  Include glommed name from both files because if only
     # file 2 has a line for the key in 2.1, then 1.1 is empty.  Then plop
     # in the orthoGlom fields from each file, which are in the same order
     # as the chimp and macaque columns of hg18.snp128OrthoPanTro2RheMac2.
     join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
       -a 1 -a 2 -e '?' \
       $chimpFile $orangFile \
     | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
             else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
       > $tmpFile
     join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
       -a 1 -a 2 -e '?' \
       $tmpFile $macFile \
     | perl -wpe 'chomp; \
         ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
          $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
          $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
         $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
         ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
           split(/\|/, $glomKey); \
         $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
         $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
         print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
                          $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
                          $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
                          $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
         s/^.*$//;' \
         > $outFile
      rm $tmpFile
 'EOF'
 EOF
     chmod a+x joinOne.csh
     mkdir /hive/data/genomes/hg38/bed/snp142Ortho/run.join
     cd /hive/data/genomes/hg38/bed/snp142Ortho/run.join
     mkdir out
     ln -s ../run.liftOver/split .
     cp /dev/null jobList
     foreach f (split/chunk*)
       set chunk = $f:t
       echo ../joinOne.csh ../run.liftOver/out/{panTro4,ponAbe2,rheMac3}.$chunk.bed \
         \{check out exists out/$chunk.bed\} \
         >> jobList
     end
     para make jobList
 #Completed: 10560 of 10560 jobs
 #CPU time in finished jobs:       2815s      46.92m     0.78h    0.03d  0.000 y
 #IO & Wait Time:                 35161s     586.02m     9.77h    0.41d  0.001 y
 #Average job time:                   4s       0.06m     0.00h    0.00d
 #Longest finished job:              21s       0.35m     0.01h    0.00d
 #Submission to last job:          2149s      35.82m     0.60h    0.02d
 
    # Back on hgwdev, sort all of the joined results together (~45mins) & load table.
     cd /hive/data/genomes/hg38/bed/snp142Ortho
     sort -k1,1 -k2n,2n run.join/out/chunk*.bed > snp142OrthoPt4Pa2Rm3.bed
     hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
       -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
       hg38 snp142OrthoPt4Pa2Rm3 snp142OrthoPt4Pa2Rm3.bed
 #Read 103101754 elements of size 22 from snp142OrthoPt4Pa2Rm3.bed
 
     # Cleanup:
     rm -r run*/out run*/split
     gzip snp142ExcludeIds.txt snp142ForLiftOver.bed snp142OrthoPt4Pa2Rm3.bed &
 
 
 ##############################################################################
 # SNPMASKED SEQUENCE FOR SNP142 (DONE 11/06/14 angie)
     # Redmine #13309
     mkdir /hive/data/genomes/hg38/snp142Mask
     cd /hive/data/genomes/hg38/snp142Mask
     # Identify rsIds with various problems -- we will exclude those.
     zcat /hive/data/outside/dbSNP/142/human_hg38/snp142.bed.gz \
     | awk '$18 ~ /MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved/ {print $4;}' \
       | sort -u \
       > snp142ExcludeRsIds.txt
     zcat /hive/data/outside/dbSNP/142/human_hg38/snp142.bed.gz \
     | grep -vFwf snp142ExcludeRsIds.txt \
       > snp142Cleaned.bed
     wc -l snp142Cleaned.bed
 #113091839 snp142Cleaned.bed
 
     # Substitutions:
     mkdir substitutions
     snpMaskSingle snp142Cleaned.bed /hive/data/genomes/hg38/hg38.2bit stdout diffObserved.txt \
     | faSplit byname stdin substitutions/
 #Masked 105456777 snps in 105456777 out of 3204743519 genomic bases
 #/hive/data/genomes/hg38/hg38.2bit has 3209286105 total bases, but the total number of bases in sequences for which we masked snps is 3204743519 (difference is 4542586)
     # Check that 4542586 is the total #bases in sequences with nothing in snp142Cleaned:
     grep -Fw single snp142Cleaned.bed | cut -f 1 | uniq > /data/tmp/1
     grep -vwf /data/tmp/1 ../chrom.sizes \
     | awk 'BEGIN {TOTAL = 0;}  {TOTAL += $2;}  END {printf "%d\n", TOTAL;}'
 #4542586
     # warnings about differing observed strings at same base position:
     wc -l diffObserved.txt
 #0 diffObserved.txt
     # yay!
     # Make sure that sizes are identical, first diffs are normal -> IUPAC,
     # and first diffs' case is preserved:
     foreach f (substitutions/chr*.fa)
       twoBitToFa -seq=$f:t:r /hive/data/genomes/hg38/hg38.2bit stdout \
       | faCmp -softMask $f stdin |& grep -v "that differ"
     end
 #chr1 in substitutions/chr1.fa differs from chr1 at stdin at base 10107 (y != c)
 #chr10 in substitutions/chr10.fa differs from chr10 at stdin at base 14553 (R != A)
 #chr10_GL383545v1_alt in substitutions/chr10_GL383545v1_alt.fa differs from chr10_GL383545v1_alt at stdin at base 5 (M != C)
 #chr10_GL383546v1_alt in substitutions/chr10_GL383546v1_alt.fa differs from chr10_GL383546v1_alt at stdin at base 13 (K != G)
 #...
 #(output OK -- ambiguous bases replacing [agct] at SNP positions)
     foreach f (substitutions/chr*.fa)
       mv $f $f:r.subst.fa
     end
     # Fire off a bunch of gzip jobs in parallel:
     ls -1 substitutions/*.fa | split -l 10
     foreach f (x??)
       gzip `cat $f` &
     end
     # Wait for backgrounded gzip jobs to complete
     rm x??
 
     # Insertions & deletions not done.  To date we have only offered substs for download.
     # If there is user demand, use template from snp131 in hg19.txt.
 
     # Clean up and prepare for download:
     gzip snp142Cleaned.bed &
     foreach d (substitutions)
       pushd $d
         md5sum *.gz > md5sum.txt
         cp /hive/data/genomes/hg38/snp141Mask/$d/README.txt .
       popd
     end
     # Edit the README.txt.
 
     # Create download links on hgwdev.
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/snp142Mask
     ln -s /hive/data/genomes/hg38/snp142Mask/substitutions/* \
       /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/snp142Mask/
 
 
 ##############################################################################
 # DGV (DATABASE OF GENOMIC VARIANTS) (DONE 9/6/16 angie)
 # Redmine #17351
 # previously done 11/07/14 #14188, 8/18/15 #15767, 8/10/16 #17351 (bug)
     set today = `date +%y%m%d`
     mkdir -p /hive/data/genomes/hg38/bed/dgv/$today
     cd /hive/data/genomes/hg38/bed/dgv/$today
     set release = 2016-08-31
     wget http://dgv.tcag.ca/dgv/docs/GRCh38_hg38_variants_$release.txt
     wget http://dgv.tcag.ca/dgv/docs/GRCh38_hg38_supportingvariants_$release.txt
     # These are the latest columns; if any changes, update translateDgvPlus.pl:
     head -1 GRCh38_hg38*.txt
 #variantaccession        chr     start   end     varianttype     variantsubtype  reference       pubmedid        method  platform        mergedvariants  supportingvariants      mergedorsample  frequency       samplesize      observedgains   observedlosses  cohortdescription genes   samples
     # Eyeball the categories of variants:
     cut -f 5,6 GRCh38_hg38_supportingvariants_$release.txt  | sort | uniq -c | head -100
 #1921790 CNV     deletion
 # 163085 CNV     duplication
 # 859255 CNV     gain
 #   8274 CNV     gain+loss
 #  75467 CNV     insertion
 #3452323 CNV     loss
 #  48428 CNV     mobile element insertion
 #   9063 CNV     novel sequence insertion
 #  12235 CNV     tandem duplication
 #    493 OTHER   complex
 #  30556 OTHER   inversion
 #   3696 OTHER   sequence alteration
 #      1 varianttype     variantsubtype
 
     cut -f 5,6 GRCh38_hg38_variants_$release.txt | sort | uniq -c | head -100
 # 132721 CNV     deletion
 #  29519 CNV     duplication
 #  47073 CNV     gain
 #   6867 CNV     gain+loss
 #  27553 CNV     insertion
 # 123401 CNV     loss
 #   4156 CNV     mobile element insertion
 #   8973 CNV     novel sequence insertion
 #   3693 CNV     tandem duplication
 #    568 OTHER   complex
 #   2590 OTHER   inversion
 #   1992 OTHER   sequence alteration
 #      1 varianttype     variantsubtype
 
     ~/kent/src/hg/utils/automation/translateDgvPlus.pl \
       GRCh38_hg38_supportingvariants_$release.txt > dgvSupporting.bed
     ~/kent/src/hg/utils/automation/translateDgvPlus.pl \
       GRCh38_hg38_variants_$release.txt > dgvMerged.bed
     hgLoadBed hg38 dgvSupporting dgvSupporting.bed \
       -sqlTable=$HOME/kent/src/hg/lib/dgvPlus.sql -renameSqlTable -tab
 #Read 6584665 elements of size 22 from dgvSupporting.bed
     hgLoadBed hg38 dgvMerged dgvMerged.bed \
       -sqlTable=$HOME/kent/src/hg/lib/dgvPlus.sql -renameSqlTable -tab
 #Read 389106 elements of size 22 from dgvMerged.bed
     rm bed.tab && gzip *.{txt,bed} &
 
 
 ##############################################################################
 # DBSNP B144 / SNP144 (IN PROGRESS 9/3/15)
     # Originally done 7/15/15
     # Updated 9/3/15 with additional alt mappings from dbSNP, see below
     # Redmine #15493
     mkdir -p /hive/data/outside/dbSNP/144/human_hg38
     cd /hive/data/outside/dbSNP/144/human_hg38
     # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/organisms/
     # to find the subdir name to use as orgDir below (human_9606_b144_GRCh38p2 in this case).
     # Go to that subdirectory, then to database/organism_data/ and look for files
     # whose names start with b144_* and may or may not end with a suffix that identifies
     # the build assembly version or some annotation version.  If there is a suffix shared
     # by all b144_* files, add that to config.ra as the "buildAssembly".
     cat > config.ra <<EOF
 db hg38
 orgDir human_9606_b144_GRCh38p2
 build 144
 buildAssembly 107
 refAssemblyLabel GRCh38.p2
 EOF
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >>& do.log & tail -f do.log
 
     # It fails saying there are sequences it won't be able to lift (because we haven't
     # specified a liftUp file in config.ra).  Use the auto-generated suggested.lft file:
     cp suggested.lft usingSuggested.lft
     cat >> config.ra <<EOF
 liftUp usingSuggested.lft
 EOF
     # GRCh38 has patch contigs that we'll need to ignore (next time around these should
     # be omitted from suggested.lft):
     zcat data/b144_ContigInfo_107.bcp.gz | grep PATCHES | cut -f 3 \
       > ignoreContig
     cat >> config.ra <<EOF
 ignoreDbSnpContigsFile ignoreContig
 EOF
 
     # Continue at loadDbSnp.
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp \
       >>& do.log & tail -f do.log
 
     # While that's running, compare rs IDs in rs_fasta with b144_SNPContigLoc_107 to see
     # which IDs (if any) were omitted from the rs_fasta dump.
     zcat rs_fasta/rs*.fas.gz \
     | perl -wne 'if (/^>/) { s/^>gnl\|dbSNP\|(rs\d+) .*/$1/ || die; print; }' \
     | sort -u > rsFastaIds.txt
     zcat data/b144_SNPContigLoc_107.bcp.gz \
     | awk '{print "rs" $2;}' \
     | sort -u > contigLocIds.txt
     comm -13 rsFastaIds.txt contigLocIds.txt > missingFromRsFasta.txt
     wc -l missingFromRsFasta.txt
 #227 missingFromRsFasta.txt
     # Fetch fasta for those ids from dbSNP's batch_query.
     # Go to this URL:
     #   http://www.ncbi.nlm.nih.gov/projects/SNP/dbSNP.cgi?list=rsfile
     # Enter your email address, select FASTA as output format, and upload missingFromRsFasta.txt
     # Wait for email with link to results -- don't close that tab!
     # Excerpt from email:
 #Total number of Submitted ID:           227
 #============================================================
 #Total number of ID not in dbSNP:        7
 #RS number(s) not found in dbSNP
 #________________________________________________
 #730882053
 #730882172
 #730882173
 #730882174
 #730882175
 #730882176
 #730882177
     # -- Those are the same 7 that were missing from hg19.
     # Click on the email's download link to go to the download page, then right-click on
     # the "Click to download button" to copy the URL of the actual sequence file.
     # Download the results:
     wget -O rs_fasta/rs_chBatchQuery.fas.gz ftp://ftp.ncbi.nlm.nih.gov/snp/batch/150702154538.gz
 
     # Check on the progress of the script.  If we did get sequence from dbSNP, and if the
     # script has progressed past the loadDbSnp step and into the addToDbSnp step, then kill
     # it and continue from addToDbSnp to pick up the added sequence.
     # This time it was still in the loadDbSnp step, so no need to kill it.
 # *** All done!
 
     # 9/3/2015: Sarah Hunt at EBI alerted dbSNP that there seemed to be a lot
     # fewer mappings to alt sequences than in 142; dbSNP made new download files
     # that seem to be an updated b144_SNPContigLoc trimmed to contain only the
     # rows on alt sequences.
     cd /hive/data/outside/dbSNP/144/human_hg38/data
     wget ftp://ftp.ncbi.nih.gov/snp/temp/EBI/b144_alt/b144_SNPContigLoc_alt_only_107.bcp.gz
     gunzip b144_SNPContigLoc_alt_only_107.bcp.gz
     gunzip b144_SNPContigLoc_107.bcp.gz
     # Check sorting so we can merge-sort (quicker):
     sort -c -k 2n,2n -k3n,3n -k4n,4n -k5n,5n b144_SNPContigLoc_107.bcp
     sort -c -k 2n,2n -k3n,3n -k4n,4n -k5n,5n b144_SNPContigLoc_alt_only_107.bcp
     # No output, good.
     # Merge-sort the files, discarding lines with the same values in columns 2-5:
     sort -m -u -k 2n,2n -k3n,3n -k4n,4n -k5n,5n b144_SNPContigLoc_107.bcp \
       b144_SNPContigLoc_alt_only_107.bcp \
       > b144_SNPContigLoc_107_merged.bcp
     wc -l b144_SNPContigLoc_107.bcp b144_SNPContigLoc_alt_only_107.bcp \
       b144_SNPContigLoc_107_merged.bcp
 #  151235228 b144_SNPContigLoc_107.bcp
 #    5735312 b144_SNPContigLoc_alt_only_107.bcp
 #  152546474 b144_SNPContigLoc_107_merged.bcp
    # How many of the 5735312 alt mappings were already in b144_SNPContigLoc_107.bcp?
     expr 151235228 + 5735312 - 152546474
 #4424066
     # How many new mappings were added?
     expr 152546474 - 151235228
 #1311246
     # Install the merged file in place of the original b144_SNPContigLoc_107.bcp.gz
     gzip b144_SNPContigLoc_107_merged.bcp &
     mv b144_SNPContigLoc_107{,_orig}.bcp
     gzip b144_SNPContigLoc_107_orig.bcp &
     ln -s b144_SNPContigLoc_107_merged.bcp.gz b144_SNPContigLoc_107.bcp.gz
     # Run the pipeline again when the gzip of b144_SNPContigLoc_107_merged.bcp is complete.
     cd /hive/data/outside/dbSNP/144/human_hg38
     hgsql hg38 -NBe 'select chrom, count(*) from snp144 group by chrom order by chrom' \
       > chromCounts.orig
     mkdir origFiles
     mv snp144*.{bed,tab}*.gz do.log origFiles/
     hgsql '' -e 'drop database hg38snp144'
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log &
     tail -f do.log
     # While that's reloading the database, see if we need to fetch more fasta:
     zcat data/b144_SNPContigLoc_107.bcp.gz \
     | awk '{print "rs" $2;}' \
     | sort -u > contigLocIds.txt
     comm -13 rsFastaIds.txt contigLocIds.txt > missingFromRsFasta.txt
     wc -l missingFromRsFasta.txt
 #432 missingFromRsFasta.txt
     # It was 227 before, so let's get sequence from dbSNP again.
     #   http://www.ncbi.nlm.nih.gov/projects/SNP/dbSNP.cgi?list=rsfile
     # Enter your email address, select FASTA as output format, and upload missingFromRsFasta.txt
     # Wait for email with link to results -- don't close that tab!
     # Excerpt from email:
 #dbSNP BATCH QUERY REPORT: FASTA
 #============================================================
 #Total number of Submitted ID:           432
 #============================================================
 #Total number of ID processed:           432
 #============================================================
 #These rs have been merged (submitted_rs -> new_rs)
 #________________________________________________
 #============================================================
     # Download the gzipped fasta file and move/rename it to rs_chBatchQuery.fas.gz.
     # Strangely, there are only 86 sequences...
     faSize rs_chBatchQuery.fas.gz
  #9386 bases (86 N's 9300 real 9300 upper 0 lower) in 86 sequences in 1 files
     # Compare that to the rs_fasta/rs_chBatchQuery.fas.gz that we got the first time around...
     faSize rs_fasta/rs_chBatchQuery.fas.gz
 #22220 bases (220 N's 22000 real 22000 upper 0 lower) in 220 sequences in 1 files
     # Better not overwrite that!  Turns out they have 60 sequences in common, so it's
     # not as simple as concatenating them either.
     # I'm going to resubmit in case of glitch... same results.  And one of the rsIDs
     # in the fasta file isn't even in missingFromRsFasta.txt... sheesh.
     # Well, just concatenate, 60 sequences will be duplicated, big deal.
     gunzip rs_chBatchQuery.fas.gz rs_fasta/rs_chBatchQuery.fas.gz
     cat rs_chBatchQuery.fas >> rs_fasta/rs_chBatchQuery.fas
     gzip rs_fasta/rs_chBatchQuery.fas
     # Actually, doing another query for only the IDs that weren't returned by the first
     # query does give us more sequences:
     faSize -detailed rs_chBatchQuery.fas | perl -wpe 's/.*(rs\d+).*/$1/' > idsFromBatch
     grep -Fvwf idsFromBatch missingFromRsFasta.txt > stillMissingFromRsFasta.txt
     # Repeat the fasta-fetching... now we have a new file with 163 sequences!
     # Save as rs_chBatchQuery_2.fas and repeat...
     faSize -detailed rs_chBatchQuery_2.fas | perl -wpe 's/.*(rs\d+).*/$1/' >> idsFromBatch
     grep -Fvwf idsFromBatch missingFromRsFasta.txt > stillMissingFromRsFasta.txt
     # Got 185 more in rs_chBatchQuery_3.fas... that might do it...
     faSize -detailed rs_chBatchQuery_3.fas | perl -wpe 's/.*(rs\d+).*/$1/' >> idsFromBatch
     grep -Fvwf idsFromBatch missingFromRsFasta.txt > stillMissingFromRsFasta.txt
     wc -l stillMissingFromRsFasta.txt
 #1 stillMissingFromRsFasta.txt
     # I bet that one kept causing batches to crash.
     cat stillMissingFromRsFasta.txt
 #rs281874781
     # http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?type=rs&rs=rs281874781 says merged into rs9274426
     gunzip rs_fasta/rs_chBatchQuery.fas.gz
     cat rs_chBatchQuery_2.fas rs_chBatchQuery_3.fas >> rs_fasta/rs_chBatchQuery.fas
     faSize -detailed rs_fasta/rs_chBatchQuery.fas \
     | perl -wpe 's/.*(rs\d+).*/$1/' \
     | sort > allIdsFromBatches
     uniq allIdsFromBatches > allIdsFromBatchesU
     wc -l allIdsFromBatches*
 #  654 allIdsFromBatches
 #  432 allIdsFromBatchesU
     # Well, 222 dupes... not too horrible on the scale of dbSNP size.
     # I briefly experimented with faSplit byName for uniquifying, but that lost the
     # extra header stuff from which we scrape observed alleles... so never mind.
     gzip rs_fasta/rs_chBatchQuery.fas
     tail -f do.log
 # *** All done!
     hgsql hg38 -NBe 'select chrom, count(*) from snp144 group by chrom order by chrom' \
       > chromCounts.merged
     sdiff chromCounts.{orig,merged}
     # Most _alt sequences have more mappings now, good.
 
 
 ##############################################################################
 # SNP144 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 9/9/15 angie)
     # Originally done 7/15/15
     # Redmine #15493
     screen -S ortho -t ortho
     ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg38 144 -debug
 # *** Steps were performed in /hive/data/genomes/hg38/bed/snp144Ortho.2015-09-09
     cd /hive/data/genomes/hg38/bed/snp144Ortho.2015-09-09
     ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg38 144 \
       >>& do.log & tail -f do.log
 # *** All done!
 
 
 ##############################################################################
 # SNPMASKED SEQUENCE FOR SNP144 (DONE 9/9/15  angie)
     # Originally done 7/16/15
     # Redmine #15493
     screen -S mask -t mask
     ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg38 144 -debug
 # *** Steps were performed in /hive/data/genomes/hg38/snp144Mask.2015-09-09
     cd /hive/data/genomes/hg38/snp144Mask.2015-09-09
     ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg38 144 \
       >>& do.log & tail -f do.log
 # *** All done!
 
 
 ##############################################################################
 # DBNSFP (DONE 12/4/15 angie)
     # The database of non-synonymous functional predictions (dbNSFP) contains
     # precomputed scores from a wide variety of tools on all non-synon variants
     # of all genomic positions in the CDS of Gencode transcripts.  Pick out
     # some interesting subsets of v3.1a's 143 columns and translate into bigBed and
     # bigWig files that can be joined with users' variants by the Variant
     # Annotation Integrator (#6152).
     screen -S dbNSFP -t dbNSFP
     mkdir -p /hive/data/genomes/hg38/bed/dbNSFP/dbNSFPv3.1a
     cd /hive/data/genomes/hg38/bed/dbNSFP
     wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv3.1a.zip
     cd dbNSFPv3.1a
     unzip ../dbNSFPv3.1a.zip
 
     # Run a perl script that digests the 52-column input files into several independent
     # bed3+ files:
     ~/kent/src/hg/utils/dbNsfpToBed.pl dbNSFP*_variant.chr*
 
     # There are many mild warnings like these two:
 #FYI: >3 rows (6) for {chr1, 1579933, ENST00000624594}; removing less-informative duplicates.
 #refAl == altAl:
 #1       25802093        T       T       U       X       .       1       26128584        1       26001171        SEPN1   +       TGA     1       0       T       T/T     T/T     ENSG00000162430 ENST00000361547 ENSP00000355141 ....
     # The script has a workaround and follow-up error check, but it would be
     # good to report the cases to dbNSFP (see script for details).
 
     wc -l *.bed
 #    2341018 dbNsfpGerpNr.bed
 #   21514727 dbNsfpGerpRs.bed
 #     234192 dbNsfpInterPro.bed
 #   24486742 dbNsfpLrt.bed
 #   26832024 dbNsfpMutationAssessor.bed
 #   29467212 dbNsfpMutationTaster.bed
 #   27524420 dbNsfpPolyPhen2.bed
 #   29376024 dbNsfpSeqChange.bed
 #   27368993 dbNsfpSift.bed
 #     414922 dbNsfpUniProt.bed
 #   28459886 dbNsfpVest.bed
 
     # Convert Gerp scores to bigWig
     bedGraphToBigWig dbNsfpGerpNr.bed /hive/data/genomes/hg38/chrom.sizes dbNsfpGerpNr.bw
     bedGraphToBigWig dbNsfpGerpRs.bed /hive/data/genomes/hg38/chrom.sizes dbNsfpGerpRs.bw
 
     # Convert remaining files to bigBed -- all should have 24 or 25 chroms
     # (some include chrM, some don't)
     foreach f (dbNsfp[^G]*.bed)
       set track = $f:r
       echo $track
       set autoSql = ~/kent/src/hg/lib/$track.as
       bedToBigBed -type=bed3+ -as=$autoSql -tab \
         $f /hive/data/genomes/hg38/chrom.sizes $track.bb
     end
 
     # Make /gbdb/ links and load database tables
     mkdir /gbdb/hg38/dbNsfp
     foreach f (dbNsfp*.{bb,bw})
       ln -s `pwd`/$f /gbdb/hg38/dbNsfp/
       hgBbiDbLink hg38 $f:r /gbdb/hg38/dbNsfp/$f
     end
 
     # Clean up: remove large files that can be re-unzipped or regenerated:
     rm -f search_db* dbNS*_variant.chr* dbNs*.bed
 
 
 ##############################################################################
 # DBSNP B146 / SNP146 (DONE 3/11/16 angie)
     # Redmine #16777
     mkdir -p /hive/data/outside/dbSNP/146/human_hg38
     cd /hive/data/outside/dbSNP/146/human_hg38
     # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/organisms/
     # to find the subdir name to use as orgDir below (human_9606_b146_GRCh38p2 in this case).
     # Go to that subdirectory, then to database/organism_data/ and look for files
     # whose names start with b146_* and may or may not end with a suffix that identifies
     # the build assembly version or some annotation version.  If there is a suffix shared
     # by all b146_* files, add that to config.ra as the "buildAssembly".
     # Since this build is on GRCh38.p2 like b144 above, use the files usingSuggested.lft
     # and ignoreContig constructed for b144.
     cat > config.ra <<EOF
 db hg38
 orgDir human_9606_b146_GRCh38p2
 build 146
 buildAssembly 107
 liftUp ../../144/human_hg38/usingSuggested.lft
 refAssemblyLabel GRCh38.p2
 ignoreDbSnpContigsFile ../../144/human_hg38/ignoreContig
 EOF
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >>& do.log & tail -f do.log
 
     # The script hit an error:
 #HgStepManager: executing step 'loadDbSnp' Tue Feb  9 12:56:42 2016.
 #Expected to process 12 CREATE statements but got 8
 #         at /cluster/home/angie/kent/src/hg/utils/automation/doDbSnp.pl line 452.
     # Turns out that schema/human_9606_table.sql.gz had tables named *_105 instead
     # of *_107.  Fix that and continue with loadDbSnp.
     mv human_9606_table.sql.{,orig.}gz
     zcat human_9606_table.sql.orig.gz | sed -e 's/_105/_107/' > human_9606_table.sql
     gzip human_9606_table.sql
     hgsql '' -e 'drop database hg38snp146'
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp \
       >>& do.log & tail -f do.log
 
     # While that's running, compare rs IDs in rs_fasta with b146_SNPContigLoc_107 to see
     # which IDs (if any) were omitted from the rs_fasta dump.
     zcat rs_fasta/rs*.fas.gz \
     | perl -wne 'if (/^>/) { s/^>gnl\|dbSNP\|(rs\d+) .*/$1/ || die; print; }' \
     | sort -u > rsFastaIds.txt
     zcat data/b146_SNPContigLoc_107.bcp.gz \
     | awk '{print "rs" $2;}' \
     | sort -u > contigLocIds.txt
     comm -13 rsFastaIds.txt contigLocIds.txt > missingFromRsFasta.txt
     wc -l missingFromRsFasta.txt
     # There were > 14,000 missing -- because dbSNP dumped fasta for GRCh38 a few days early,
     # but dumped complete fasta for GRCh37.  Instead of doing a large batch query to duplicate
     # what we already had in ../human_hg19/rs_fasta/, I decided to use ../human_hg19/rs_fasta.
     mv rs_fasta rs_fasta.origIncomplete
     ln -s ../human_hg19/rs_fasta .
     # I edited addToDbSnp.csh to wrap an "if (0) then ... endif" around all steps except the
     # last step that creates the ucscGnl table from rs_fasta headers.
     mkdir -p `cat workingDir`
     csh -efx addToDbSnp.csh >>& do.log & tail -f do.log
     # When that completed, I continued running from the bigJoin step.
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue bigJoin >>& do.log &
     tail -f do.log
 # *** All done!
 
 
 ##############################################################################
 # SNP146 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 3/11/16 angie)
     # Redmine #16777
     screen -S ortho -t ortho
     ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg38 146 -debug
 # *** Steps were performed in /hive/data/genomes/hg38/bed/snp146Ortho.2016-03-11
     cd /hive/data/genomes/hg38/bed/snp146Ortho.2016-03-11
     ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg38 146 \
       >>& do.log & tail -f do.log
 # *** All done!
 
 
 ##############################################################################
 # SNPMASKED SEQUENCE FOR SNP146 (DONE 3/11/16 angie)
     # Redmine #16777
     screen -S mask -t mask
     ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg38 146 -debug
 # *** Steps were performed in /hive/data/genomes/hg38/snp146Mask.2016-03-11
     cd /hive/data/genomes/hg38/snp146Mask.2016-03-11
     ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg38 146 \
       >>& do.log & tail -f do.log
 # *** All done!
 
 
 ##############################################################################
 # DBSNP B147 / SNP147 (DONE 7/30/16 angie)
 # Initially done 7/1/16.  Updated 7/28/16 to fix func code bug (see below).
     # Redmine #17209
     mkdir -p /hive/data/outside/dbSNP/147/human_hg38
     cd /hive/data/outside/dbSNP/147/human_hg38
     # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/organisms/
     # to find the subdir name to use as orgDir below (human_9606_b147_GRCh38p2 in this case).
     # Go to that subdirectory, then to database/organism_data/ and look for files
     # whose names start with b147_* and may or may not end with a suffix that identifies
     # the build assembly version or some annotation version.  If there is a suffix shared
     # by all b147_* files, add that to config.ra as the "buildAssembly".
     # Since this build is on GRCh38.p2 like b144 above, use the files usingSuggested.lft
     # and ignoreContig constructed for b144.
     cat > config.ra <<EOF
 db hg38
 orgDir human_9606_b147_GRCh38p2
 build 147
 buildAssembly 107
 liftUp ../../144/human_hg38/usingSuggested.lft
 refAssemblyLabel GRCh38.p2
 ignoreDbSnpContigsFile ../../144/human_hg38/ignoreContig
 EOF
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >>& do.log & tail -f do.log
 
     # While that's running, compare rs IDs in rs_fasta with b147_SNPContigLoc_107 to see
     # which IDs (if any) were omitted from the rs_fasta dump.
     zcat rs_fasta/rs*.fas.gz \
     | perl -wne 'if (/^>/) { s/^>gnl\|dbSNP\|(rs\d+) .*/$1/ || die; print; }' \
     | sort -u > rsFastaIds.txt
     zcat data/b147_SNPContigLoc_107.bcp.gz \
     | awk '{print "rs" $2;}' \
     | sort -u > contigLocIds.txt
     wc -l rsFastaIds.txt contigLocIds.txt
 # 153950629 rsFastaIds.txt
 # 152450925 contigLocIds.txt
     comm -13 rsFastaIds.txt contigLocIds.txt > missingFromRsFasta.txt
     wc -l missingFromRsFasta.txt
 #1 missingFromRsFasta.txt
     cat missingFromRsFasta.txt
 #rs869025304
     # Sequence for rs869025304 was added to the shared file rs_fasta/rs_chBatchQuery.fas.gz --
     # see ../hg19.txt.
     # Now start over from addToDbSnp...
     mkdir -p `cat workingDir`
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra --continue addToDbSnp >>& do.log &
     tail -f do.log
 # *** All done!
 
 
     # 7/28/16 -- Chris Lee found some NR_'s and XR_'s in snp147CodingDbSnp where they don't
     # belong.  Turns out that dbSnp was assigning cds-indel to any deletion with length
     # multiple of 3 that intersected an exon, whether the transcript was coding or not.
     # Some affected variants will end up with both cds-indel and ncRNA because they hit
     # both coding and noncoding transcripts.
     # This does not affect ortho alleles or SNP-masked sequences.
     # Fix the values in hg38snp147.b147_SNPContigLocusId_107 and rebuild from ucscFunc onwards.
     # For rows in which dbSNP has assigned cds-indel (45) to a non-coding transcript (NR_ or
     # XR_), assign ncRNA (30) instead.
     cd /hive/data/outside/dbSNP/147/human_hg38
     hgsql hg38snp147 -e 'update b147_SNPContigLocusId_107 set fxn_class = 30 \
                            where fxn_class = 45 and mrna_acc like "_R\_%";'
     # Comment out all of the parts of addToDbSnp.csh except for the part that creates ucscFunc
     # and run the modified script.
     mkdir -p `cat workingDir`
     csh -efx ./addToDbSnp.csh >>& do.log & tail -f do.log
     # Re-run the pipeline from bigJoin onward.
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra --continue bigJoin >>& do.log &
     tail -f do.log
 # *** All done!
 
 
 ##############################################################################
 # SNP147 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 7/2/16 angie)
     # Redmine #17209
     screen -S ortho -t ortho
     ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg38 147 -debug
 # *** Steps were performed in /hive/data/genomes/hg38/bed/snp147Ortho.2016-07-01
     cd /hive/data/genomes/hg38/bed/snp147Ortho.2016-07-01
     ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg38 147 \
       >>& do.log & tail -f do.log
 # *** All done!
 
 
 ##############################################################################
 # SNPMASKED SEQUENCE FOR SNP147 (DONE 7/1/16 angie)
     # Redmine #17209
     screen -S mask -t mask
     ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg38 147 -debug
 # *** Steps were performed in /hive/data/genomes/hg38/snp147Mask.2016-07-01
     cd /hive/data/genomes/hg38/snp147Mask.2016-07-01
     ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg38 147 \
       >>& do.log & tail -f do.log
 # *** All done!
 
 
 ##############################################################################
 # SNP147 BED4 BIGBED FOR VAI (DONE 10/13/16 angie)
     # Redmine #18222 -- probably should fold this into dbSNP pipeline...
     screen -S snp -t snp
     cd /hive/data/outside/dbSNP/147/human_hg38
     # Make bigBed with just the first 4 columns of snp147
     zcat snp147.bed.gz \
     | cut -f 1-4 \
     | sort -k1,1 -k2n,2n \
       > $TMPDIR/hg38.snp147.bed4
    bedToBigBed -type=bed4 $TMPDIR/hg38.snp147.bed4 /hive/data/genomes/hg38/chrom.sizes snp147.bed4.bb
     # Install in location that hgVai will check:
     mkdir -p /gbdb/hg38/vai
     ln -s `pwd`/snp147.bed4.bb /gbdb/hg38/vai/
     # Clean up
     rm $TMPDIR/hg38.snp147.bed4
 
 
 ##############################################################################
 # DBSNP B149 / SNP149 (DONE 3/21/17 angie)
     # Redmine #18330
     mkdir -p /hive/data/outside/dbSNP/149/human_hg38
     cd /hive/data/outside/dbSNP/149/human_hg38
     # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/organisms/
     # to find the subdir name to use as orgDir below (human_9606_b149_GRCh38p7 in this case).
     # Go to that subdirectory, then to database/organism_data/ and look for files
     # whose names start with b149_* and may or may not end with a suffix that identifies
     # the build assembly version or some annotation version.  If there is a suffix shared
     # by all b149_* files, add that to config.ra as the "buildAssembly".
     # Since this is the first time we're building on p7, let the script construct a liftUp
     # file and file of dbSNP contigs (GRCh38.p7) not included in hg38.
     cat > config.ra <<EOF
 db hg38
 orgDir human_9606_b149_GRCh38p7
 build 149
 buildAssembly 108
 refAssemblyLabel GRCh38.p7
 EOF
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >>& do.log & tail -f do.log
     # Take a look at suggested.lft -- looks reasonable, add to config.ra and try again.
     cat >> config.ra <<EOF
 liftUp suggested.lft
 EOF
     ~/kent/src/hg/utils/automation/doDbSnp.pl --continue loadDbSnp config.ra >>& do.log &
     tail -f do.log
     # Take a look at dbSnpContigsNotInUcsc.txt -- if those all look like patch contigs
     # (check some on https://www.ncbi.nlm.nih.gov/nucleotide/), add to config.ra and try again.
     cat >> config.ra <<EOF
 ignoreDbSnpContigsFile dbSnpContigsNotInUcsc.txt
 EOF
     ~/kent/src/hg/utils/automation/doDbSnp.pl --continue loadDbSnp config.ra >>& do.log &
     tail -f do.log
 
     # While that's running, compare rs IDs in rs_fasta with b149_SNPContigLoc_108 to see
     # which IDs (if any) were omitted from the rs_fasta dump.
     zcat rs_fasta/rs*.fas.gz \
     | perl -wne 'if (/^>/) { s/^>gnl\|dbSNP\|(rs\d+) .*/$1/ || die; print; }' \
     | sort -u > rsFastaIds.txt
     zcat data/b149_SNPContigLoc_108.bcp.gz \
     | awk '{print "rs" $2;}' \
     | sort -u > contigLocIds.txt
     wc -l rsFastaIds.txt contigLocIds.txt
 # 154206854 rsFastaIds.txt
 # 153684362 contigLocIds.txt
     comm -13 rsFastaIds.txt contigLocIds.txt > missingFromRsFasta.txt
     wc -l missingFromRsFasta.txt
 #0 missingFromRsFasta.txt
     # Good!  If there are some missing in the future see ../hg19.txt for instructions.
 
     # Yay, script completed:
 # *** All done!  (through the 'bigBed' step)
 
 
 ##############################################################################
 # SNP149 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 3/21/17 angie)
     # Redmine #18330
     screen -S ortho -t ortho
     ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg38 149 -debug
 # *** Steps were performed in /hive/data/genomes/hg38/bed/snp149Ortho.2017-03-21
     cd /hive/data/genomes/hg38/bed/snp149Ortho.2017-03-21
     ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg38 149 \
       >>& do.log & tail -f do.log
 # *** All done!
 
 
 ##############################################################################
 # SNPMASKED SEQUENCE FOR SNP149 (DONE 3/21/17 angie)
     # Redmine #18330
     screen -S mask -t mask
     ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg38 149 -debug
 # *** Steps were performed in /hive/data/genomes/hg38/snp149Mask.2017-03-21
     cd /hive/data/genomes/hg38/snp149Mask.2017-03-21
     ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg38 149 \
       >>& do.log & tail -f do.log
 # *** All done!
 
 
 ##############################################################################
 # DBSNP B150 / SNP150 (DONE 4/18/17 angie)
     # Redmine #19202
     mkdir -p /hive/data/outside/dbSNP/150/human_hg38
     cd /hive/data/outside/dbSNP/150/human_hg38
     # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/organisms/
     # to find the subdir name to use as orgDir below (human_9606_b150_GRCh38p7 in this case).
     # Go to that subdirectory, then to database/organism_data/ and look for files
     # whose names start with b150_* and may or may not end with a suffix that identifies
     # the build assembly version or some annotation version.  If there is a suffix shared
     # by all b150_* files, add that to config.ra as the "buildAssembly".
     # NOTE: in b150 it's database/data/organism_data/ instead of database/organism_data/ !
     # Since this build is on GRCh38.p7 like b149 above, use the files usingSuggested.lft
     # and ignoreContig constructed for b149.
     cat > config.ra <<EOF
 db hg38
 orgDir human_9606_b150_GRCh38p7
 build 150
 buildAssembly 108
 refAssemblyLabel GRCh38.p7
 liftUp ../../149/human_hg38/suggested.lft
 ignoreDbSnpContigsFile ../../149/human_hg38/dbSnpContigsNotInUcsc.txt
 EOF
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >>& do.log & tail -f do.log
     # Download failed because they changed database/organism_data to database/data/organism_data .
     # For now, just fix up and re-run ../download_human_hg38_150.csh.
     # If they keep the tweaked path before we switch over to JSON files, update doDbSnp.pl.
     ../download_human_hg38_150.csh >>& do.log & tail -f do.log
     # Now continue after download:
     ~/kent/src/hg/utils/automation/doDbSnp.pl --continue loadDbSnp config.ra >>& do.log &
     tail -f do.log
 #*** 70 lines of b150_ContigInfo_108.bcp.gz contained contig names that
 #*** could not be mapped to chrom.size via their GenBank contig mappings; see
 #*** /hive/data/outside/dbSNP/150/human_hg38/cantLiftUpSeqNames.txt .
 #
 #*** You must account for all 70 contig_acc values in config.ra,
 #*** using the liftUp and/or ignoreDbSnpContigsFile settings (see -help output).
 #*** Then run again with -continue=loadDbSnp .
 
     # Hmmm, need to use local dbSnpContigsNotInUcsc.txt file after all.
     # Edit config.ra to have this line for ignoreDbSnpContigsFile:
 #ignoreDbSnpContigsFile dbSnpContigsNotInUcsc.txt
 
     # NOTE FOR NEXT TIME: the script overwrites dbSnpContigsNotInUcsc.txt so after a
     # successful run it is an empty file!  Next time rename it before using.
 
     ~/kent/src/hg/utils/automation/doDbSnp.pl --continue loadDbSnp config.ra >>& do.log &
     tail -f do.log
 
     # While that's running, compare rs IDs in rs_fasta with b150_SNPContigLoc_108 to see
     # which IDs (if any) were omitted from the rs_fasta dump.
     zcat rs_fasta/rs*.fas.gz \
     | perl -wne 'if (/^>/) { s/^>gnl\|dbSNP\|(rs\d+) .*/$1/ || die; print; }' \
     | sort -u > rsFastaIds.txt
     zcat data/b150_SNPContigLoc_108.bcp.gz \
     | awk '{print "rs" $2;}' \
     | sort -u > contigLocIds.txt
     wc -l rsFastaIds.txt contigLocIds.txt
 # 325658303 rsFastaIds.txt
 # 325001232 contigLocIds.txt
     comm -13 rsFastaIds.txt contigLocIds.txt > missingFromRsFasta.txt
     wc -l missingFromRsFasta.txt
 #0 missingFromRsFasta.txt
     # Good!  If there are some missing in the future see ../hg19.txt for instructions.
 
     # Yay, script completed:
 # *** All done!  (through the 'bigBed' step)
 
 
 ##############################################################################
 # SNP150 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 4/26/17 angie)
     # Redmine #19202
     screen -S ortho -t ortho
     ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg38 150 -debug
 # *** Steps were performed in /hive/data/genomes/hg38/bed/snp150Ortho.2017-04-26
     cd /hive/data/genomes/hg38/bed/snp150Ortho.2017-04-26
     ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg38 150 \
       >>& do.log & tail -f do.log
 # *** All done!
 
 
 ##############################################################################
 # SNPMASKED SEQUENCE FOR SNP150 (DONE 4/19/17 angie)
     # Redmine #19202
     screen -S mask -t mask
     ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg38 150 -debug
 # *** Steps were performed in /hive/data/genomes/hg38/snp150Mask.2017-04-19
     cd /hive/data/genomes/hg38/snp150Mask.2017-04-19
     ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg38 150 \
       >>& do.log & tail -f do.log
 # *** All done!
 
 
 ##############################################################################
 # DBSNP B151 / SNP151 (DONE 4/21/18 angie)
     # Redmine #21010
     mkdir -p /hive/data/outside/dbSNP/151/human_hg38
     cd /hive/data/outside/dbSNP/151/human_hg38
     # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/organisms/
     # to find the subdir name to use as orgDir below (human_9606_b151_GRCh38p7 in this case).
     # Go to that subdirectory, then to database/organism_data/ and look for files
     # whose names start with b151_* and may or may not end with a suffix that identifies
     # the build assembly version or some annotation version.  If there is a suffix shared
     # by all b151_* files, add that to config.ra as the "buildAssembly".
     # NOTE: in b151 it's database/data/organism_data/ instead of database/organism_data/ !
     # Since this build is on GRCh38.p7 like b149 above, use the files usingSuggested.lft
     # and ignoreContig constructed for b149.
     cat > config.ra <<EOF
 db hg38
 orgDir human_9606_b151_GRCh38p7
 build 151
 buildAssembly 108
 refAssemblyLabel GRCh38.p7
 liftUp ../../149/human_hg38/suggested.lft
 ignoreDbSnpContigsFile ../../149/human_hg38/dbSnpContigsNotInUcsc.txt
 EOF
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >>& do.log & tail -f do.log
 
 #*** 70 lines of b151_ContigInfo_108.bcp.gz contained contig names that
 #*** could not be mapped to chrom.size via their GenBank contig mappings; see
 #*** /hive/data/outside/dbSNP/151/human_hg38/cantLiftUpSeqNames.txt .
 #
 #*** You must account for all 70 contig_acc values in config.ra,
 #*** using the liftUp and/or ignoreDbSnpContigsFile settings (see -help output).
 #*** Then run again with -continue=loadDbSnp .
 
     cp ../../149/human_hg38/dbSnpContigsNotInUcsc.txt patchContigs.txt
     grep ^'patch contig' cantLiftUpSeqNames.txt | awk '{print $3;}' >> patchContigs.txt
 
     # Edit config.ra to have this line for ignoreDbSnpContigsFile:
 #ignoreDbSnpContigsFile patchContigs.txt
 
     ~/kent/src/hg/utils/automation/doDbSnp.pl --continue loadDbSnp config.ra >>& do.log &
     tail -f do.log
 
     # While that's running, compare rs IDs in rs_fasta with b151_SNPContigLoc_108 to see
     # which IDs (if any) were omitted from the rs_fasta dump.
     zcat rs_fasta/rs*.fas.gz \
     | perl -wne 'if (/^>/) { s/^>gnl\|dbSNP\|(rs\d+) .*/$1/ || die; print; }' \
     | sort -u > rsFastaIds.txt
     zcat data/b151_SNPContigLoc_108.bcp.gz \
     | awk '{print "rs" $2;}' \
     | sort -u > contigLocIds.txt
     wc -l rsFastaIds.txt contigLocIds.txt
 #  660773082 rsFastaIds.txt
 #  660440048 contigLocIds.txt
     comm -13 rsFastaIds.txt contigLocIds.txt > missingFromRsFasta.txt
     wc -l missingFromRsFasta.txt
 #45 missingFromRsFasta.txt
     # A small enough number that we can get sequence from dbSNP via Batch Query.
     #   http://www.ncbi.nlm.nih.gov/projects/SNP/dbSNP.cgi?list=rsfile
     # Enter your email address, select FASTA as output format, and upload missingFromRsFasta.txt
     # Wait for email with link to results -- don't close that tab!
     # Aw dang, even the Batch Query found only two of them (excerpt from email):
 #dbSNP BATCH QUERY REPORT: FASTA
 #============================================================
 #Total number of Submitted ID:           45
 #============================================================
 #Total number of ID not in dbSNP:        43
 #RS number(s) not found in dbSNP
 #________________________________________________
 #1162539939
 #1165260363
 #.....
     # Hmmm... the first ID has flanking seqs on their details page:
     # https://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?type=rs&rs=1162539939
     # Letting it go -- that's a tiny number out of 683M+ variants.
 
     # A mysql command in the loadDbSnp step failed, so I edited loadDbSnp.csh to continue
     # from the failed command.
    csh -efx loadDbSnp.csh >>& do.log & tail -f do.log
     # Continue from next step:
     ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue addToDbSnp >>& do.log &
     tail -f do.log
 
     # Yay, script completed:
 # *** All done!  (through the 'bigBed' step)
 
 
 ##############################################################################
 # SNP151 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 4/24/18 angie)
     # Redmine #21010
     screen -S ortho -t ortho
     ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg38 151 -debug
 # *** Steps were performed in /hive/data/genomes/hg38/bed/snp151Ortho.2018-04-23
     cd /hive/data/genomes/hg38/bed/snp151Ortho.2018-04-23
     ~/kent/src/hg/utils/automation/doDbSnpOrthoAlleles.pl hg38 151 \
       >>& do.log & tail -f do.log
 # *** All done!
 
 
 ##############################################################################
 # SNPMASKED SEQUENCE FOR SNP151 (DONE 4/23/18 angie)
     # Redmine #21010
     screen -S mask -t mask
     ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg38 151 -debug
 # *** Steps were performed in /hive/data/genomes/hg38/snp151Mask.2018-04-23
     cd /hive/data/genomes/hg38/snp151Mask.2018-04-23
     ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg38 151 \
       >>& do.log & tail -f do.log
 # *** All done!
 
 
 ##############################################################################
 # DBSNP152 / DBSNP153 / bigDbSnp: see ../bigDbSnp.txt
 
 
 ##############################################################################
 # 1000 GENOMES PHASE 3 VARIANT CALLS (DONE 11/13/19 angie)
     screen -S tgp -t tgp
     mkdir /hive/data/genomes/hg38/bed/1000GenomesPhase3
     cd /hive/data/genomes/hg38/bed/1000GenomesPhase3
     wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/20190312_biallelic_SNV_and_INDEL_README.txt
     wget --timestamping \
       ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr\*.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
     wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/20190312_biallelic_SNV_and_INDEL_MANIFEST.txt
     # Check md5sums
     tawk '{print $3, $1;}' 20190312_biallelic_SNV_and_INDEL_MANIFEST.txt | g -v tbi > md5sums.check
     md5sum -c md5sums.check
 #./ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz: OK
 #...
 #./ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz: OK
     # Index them locally instead of downloading, probably quicker.
     for f in *.vcf.gz; do
       echo indexing $f
       tabix -p vcf $f
     done
     # Install the files
     mkdir /gbdb/hg38/1000Genomes
     ln -s `pwd`/*.vcf.gz* /gbdb/hg38/1000Genomes/
     cp /dev/null tgpPhase3.txt
     for c in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X; do
       file=ALL.chr$c.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
       echo -e "/gbdb/hg38/1000Genomes/$file\tchr$c" >> tgpPhase3.txt
     done
     # hgBbiDbLink doesn't support the seq column so use hgLoadSqlTab:
     hgLoadSqlTab hg38 tgpPhase3 ~/kent/src/hg/lib/bbiChroms.sql tgpPhase3.txt
     # Make a chromosomes line for trackDb (no alts, no Y!):
     hgsql hg38 -NBe 'select seqName from tgpPhase3' | xargs echo | sed -e 's/ /,/g'
 #chr1,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr2,chr20,chr21,chr22,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chrX
     # I don't see counts of SNPs / indels documented anywhere, so extract:
     time (zcat ALL.chr*.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | g -v ^# | cut -f 8 | sed -re 's/.*VT=//; s/;.*//' | sort | uniq -c | head -100)
 
 
 ##############################################################################
 # DGV with filters - DONE Jan 01 2020
     zcat dgvSupporting.bed.gz | tawk '{print $0, $3-$2}' | sort -k1,1 -k2,2n > dgvSupportingWithSize.bed
     zcat dgvMerged.bed.gz | tawk '{print $0, $3-$2}' | sort -k1,1 -k2,2n > dgvMergedWithSize.bed
     bedToBigBed -tab -as=/hive/data/genomes/hg19/bed/dgv/160810/dgvPlusSize.as -type=bed9+14 dgvMergedWithSize.bed  /hive/data/genomes/hg38/chrom.sizes dgvMerged.bb
     # pass1 - making usageList (41 chroms): 333 millis
     # pass2 - checking and writing primary data (389106 records, 23 fields): 9803 millis
     bedToBigBed -tab -as=/hive/data/genomes/hg19/bed/dgv/160810/dgvPlusSize.as -type=bed9+14 dgvSupportingWithSize.bed  /hive/data/genomes/hg38/chrom.sizes dgvSupporting.bb
     # pass1 - making usageList (41 chroms): 2695 millis
     # pass2 - checking and writing primary data (6584665 records, 23 fields): 27859 millis
     mkdir -p /gbdb/hg38/dgv/
     cd /gbdb/hg38/dgv
     ln -s /hive/data/genomes/hg38/bed/dgv/160906/dgvMerged.bb
     ln -s /hive/data/genomes/hg38/bed/dgv/160906/dgvSupporting.bb
+
+##############################################################################
+# Trios for 1000 Genomes Phase 3 Data - DONE 05/28/20 - ChrisL
+
+    # download the related samples
+    cd /hive/data/genomes/hg38/bed/1000GenomesPhase3
+    mkdir related_samples
+    cd related_samples/
+
+    wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/20190312_biallelic_SNV_and_INDEL_README.txt
+    wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/20190312_biallelic_SNV_and_INDEL_MANIFEST.txt
+    time for i in {1..22}; do wget --timestamping http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/supporting/related_samples/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_related_samples_27022019.GRCh38.phased.vcf.gz; done &> wget.log
+    # real    43m6.593s
+    # user    0m2.132s
+    # sys 0m16.292s
+    wget --timestamping http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/supporting/related_samples/ALL.chrX.shapeit2_integrated_snvindels_v2a_related_samples_27022019.GRCh38.phased.vcf.gz                  
+
+    # Check md5sums
+    tawk '{print $3, $1;}' 20190312_related_biallelic_SNV_and_INDEL_MANIFEST.txt | grep -v tbi > md5sums.check
+    # NOTE(review): only 20190312_biallelic_SNV_and_INDEL_MANIFEST.txt was downloaded
+    # above, and the OK lines below list the non-related-samples filenames -- confirm
+    # which manifest was actually used for this check.
+    md5sum -c md5sums.check
+    #./ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz: OK
+    #...
+    #./ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz: OK
+    # Index them locally instead of downloading, probably quicker.
+
+    for f in *.vcf.gz; do
+      echo indexing $f
+      tabix -p vcf $f
+    done
+
+    # Download 1000 Genomes Family tree info and find existing trios:
+    wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v2.20130502.ALL.ped
+    tawk '{if ($2 != 0 && $3 != 0 && $4 != 0) printf "%s\t%s,%s\t%s_%s_%s\n", $2,$4,$3,$2,$1,$7 }' integrated_call_samples_v2.20130502.ALL.ped > allTrios.txt
+
+    # these are not strictly "child" IDs if there are grandparents or something, but close enough
+    cut -f1 allTrios.txt | tail -n +2 > childIds.txt
+    bcftools query -l ../related_samples/ALL.chr22.shapeit2_integrated_snvindels_v2a_related_samples_27022019.GRCh38.phased.vcf.gz | grep -Fwf childIds.txt  - | grep -wf - allTrios.txt > trackTrios.txt
+    bcftools query -l ../ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | grep -Fwf childIds.txt  - | grep -wf - allTrios.txt >> trackTrios.txt
+
+    # the script below runs the various bcftools commands to merge VCFs together:
+    # subset sample: bcftools view --no-version -s 'sample1,sample2,...'
+    # merge two VCFs: bcftools merge --no-version file1 file2
+    # remove homozygous reference variants (artifacts of subsetting the whole pop dataset):
+    #   bcftools view --no-version -i \"GT[*]!='RR'\" "
+    # this is a parasol run as it can take ~20 minutes to run on one chromosome
+
+    # first make a configuration file so we know where the VCFs are and stuff:
+    cat << _EOF > conf.txt
+    relationshipFile=trackTrios.txt
+    primaryVCF=/hive/data/genomes/hg38/bed/1000GenomesPhase3/ALL.*.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
+    suppVCF=/hive/data/genomes/hg38/bed/1000GenomesPhase3/related_samples/ALL.*.shapeit2_integrated_snvindels_v2a_related_samples_27022019.GRCh38.phased.vcf.gz
+    _EOF
+    # NOTE(review): the heredoc lines above are shown indented for readability; run
+    # verbatim, the indented _EOF would not terminate the heredoc and conf.txt would
+    # contain leading whitespace -- presumably the actual command was un-indented.
+
+    ./makeTrio.py -v conf.txt
+
+    cd work
+    para time
+    # 161 jobs in batch
+    # 0 jobs (including everybody's) in Parasol queue or running.
+    # Checking finished jobs
+    # Completed: 161 of 161 jobs
+    # CPU time in finished jobs:     277100s    4618.34m    76.97h    3.21d  0.009 y
+    # IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
+    # Average job time:                1634s      27.23m     0.45h    0.02d
+    # Longest finished job:            3753s      62.55m     1.04h    0.04d
+    # Submission to last job:          3766s      62.77m     1.05h    0.04d
+    # Estimated complete:                 0s       0.00m     0.00h    0.00d
+
+    # now load the per chrom VCF tables:
+    ./loadTables.sh
+
+    # Get some stats:
+    for d in output/*; do echo $d; for f in $d/*.vcf.gz; do zgrep -v "^#"  $f | cut -f8 | sed -re 's/.*VT=//; s/;.*//' | sort | uniq -c; done | awk '{arr[$2] += $1}END{for (i in arr){printf "%s: %d\n", i, arr[i]}}'; done
+    output/HG00702_SH089_CHS
+    SNP: 4522294
+    INDEL: 589214
+    output/HG00733_PR05_PUR
+    SNP: 4783649
+    INDEL: 621948
+    output/HG02024_VN049_KHV
+    SNP: 4554542
+    INDEL: 597609
+    output/NA12878_1463_CEU
+    SNP: 4565394
+    INDEL: 593052
+    output/NA19240_Y117_YRI
+    SNP: 5954728
+    INDEL: 728480
+    output/NA19675_m004_MXL
+    SNP: 4763107
+    INDEL: 619293
+    output/NA19685_m011_MXL
+    SNP: 4758396
+    INDEL: 619294
+
 ##############################################################################