# genarkOrg table build recipe (src/hg/makeDb/doc/genarkOrg.txt)
#
# Provenance: commit 9e79e16956fd1400cda121b2ac69aaa27a88ab8a, braney,
#   Mon Aug 11 14:39:59 2025 -0700 -- "add new method of calculating genarkOrg table"
#
# Purpose: give every GenArk accession (genark.gcAccession) exactly one organism
# label, taking the first one available in this priority order:
#   1. organism name of an active dbDb entry with the same taxId
#   2. NCBI Taxonomy "genbank common name" for the taxId
#   3. NCBI Taxonomy "scientific name" for the taxId
#   4. the literal "Other"
#
# This replaces the earlier recipe, which joined genark only against
# "select taxId, genome from dbDb group by taxId" (via /tmp/1, /tmp/2, /tmp/3).
#
# Notes:
#   * Every sort and join runs under LC_ALL=C.  join(1) requires both inputs to
#     be ordered on the join field using the same collation join itself uses;
#     a locale-aware whole-line sort can order rows differently and make join
#     silently drop pairings.
#   * tawk is used as in the original recipe; it is assumed to be awk with TAB
#     as both input and output field separator -- confirm in your environment.
#   * names.dmp separates fields with "\t|\t", so after splitting on TAB:
#     $1 = tax_id, $3 = name_txt, $7 = name class.

# get UCSC org names from dbDb
# (whole-line -u keeps distinct organism names that share a taxId; in the C
#  locale the byte-wise whole-line order still agrees with the join key order
#  because TAB sorts before every digit)
hgsql -hgenome-centdb.soe.ucsc.edu hgcentral -Ne "select taxId, organism from dbDb where active=1" \
    | LC_ALL=C sort -u > taxIdOrg.txt

# get all the genark accessions with their taxId
hgsql hgcentraltest -Ne "select taxId, gcAccession from genark" \
    | LC_ALL=C sort -t $'\t' -k1,1 > taxIdGC.txt

# mapping of genark accessions to UCSC Org
# (join emits taxId, organism, gcAccession; keep gcAccession, organism)
LC_ALL=C join -t $'\t' taxIdOrg.txt taxIdGC.txt \
    | tawk '{print $3, $2}' | LC_ALL=C sort > gcToUCSCOrg.txt

# grab NCBI Taxonomy database name table
wget "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz"
tar xf taxdump.tar.gz names.dmp

# get list of genbank common names from NCBI taxonomy database
# (match the name-class column exactly rather than grepping the whole row, so a
#  name that merely contains the phrase is not picked up)
tawk '$7 == "genbank common name" {print $1, $3}' names.dmp \
    | LC_ALL=C sort -t $'\t' -k1,1 > taxIdCommon.txt

# mapping of genark accessions to NCBI Common names
LC_ALL=C join -t $'\t' taxIdCommon.txt taxIdGC.txt \
    | tawk '{print $3, $2}' | LC_ALL=C sort > gcToCommon.txt

# get list of scientific names from NCBI taxonomy database
tawk '$7 == "scientific name" {print $1, $3}' names.dmp \
    | LC_ALL=C sort -t $'\t' -k1,1 > taxIdScientific.txt

# mapping of genark accessions to NCBI Scientific names
LC_ALL=C join -t $'\t' taxIdScientific.txt taxIdGC.txt \
    | tawk '{print $3, $2}' | LC_ALL=C sort > gcToSci.txt

# get list of all GCs to make "other" file
cut -f 2 taxIdGC.txt | LC_ALL=C sort > gc.txt
tawk '{print $1, "Other"}' gc.txt > gcToOther.txt

# Concatenate in priority order and keep only the first label seen for each
# accession, so UCSC org beats common name beats scientific name beats "Other".
cat gcToUCSCOrg.txt gcToCommon.txt gcToSci.txt gcToOther.txt \
    | tawk '!seen[$1]++' | LC_ALL=C sort > genarkOrg.txt

# load as genarkOrg table..
# For reference only (not run here): the earlier recipe created and loaded the
# table with the commands below; the infile path would need to point at
# genarkOrg.txt instead of /tmp/3.
#   hgsql hgcentraltest -Ne "create table genarkOrg (gcAccession varchar(255), genome varchar(255))"
#   hgsql hgcentraltest -Ne "load data local infile '/tmp/3' into table genarkOrg"