c1c72d06c1897453aa237aa30124d66a14464f59 hiram Wed Oct 13 10:12:40 2021 -0700 fixup the primate to primate DEF file and begin to add the auto detection of which should be target vs query refs #28318 diff --git src/hg/utils/automation/pairLastz.sh src/hg/utils/automation/pairLastz.sh index 8539b77..028e6ca 100755 --- src/hg/utils/automation/pairLastz.sh +++ src/hg/utils/automation/pairLastz.sh @@ -48,42 +48,59 @@ export GCxPath="" case $asmName in GC[AF]_*) gcX=`echo $asmName | cut -c1-3` d0=`echo $asmName | cut -c5-7` d1=`echo $asmName | cut -c8-10` d2=`echo $asmName | cut -c11-13` GCxPath="${gcX}/${d0}/${d1}/${d2}" ;; *) ;; esac printf "%s" "${GCxPath}" } +# asmSize - determine size of genome +function asmSize() { + export asmName=$1 + export sizes="/hive/data/genomes/${asmName}/chrom.sizes" + case $asmName in + GC[AF]_*) + gcxPath=$(gcPath $asmName) + id=$(asmId $asmName) + size=`awk '{sum+=$2}END{print sum}' /hive/data/genomes/asmHubs/${gcxPath}/${id}/${id}.chrom.sizes.txt` + ;; + *) + size=`awk '{sum+=$2}END{print sum}' ${sizes}` + ;; + esac + printf "%s" "${size}" +} + # seqCount - determine the sequence count in given genome target function seqCount() { export asmName=$1 export sizes="/hive/data/genomes/${asmName}/chrom.sizes" case $asmName in GC[AF]_*) gcxPath=$(gcPath $asmName) id=$(asmId $asmName) - count=`cat /hive/data/genomes/asmHubs/${gcxPath}/${id}/${id}.chrom.sizes.txt | wc -l` + count=`wc -l /hive/data/genomes/asmHubs/${gcxPath}/${id}/${id}.chrom.sizes.txt | cut -d' ' -f1` ;; *) - count=`cat ${sizes} | wc -l` + count=`wc -l ${sizes} | cut -d' ' -f1` ;; esac printf "%s" "${count}" } function orgName() { export asmName=$1 case $asmName in GC[AF]_*) gcxPath=$(gcPath $asmName) asmDir="/hive/data/outside/ncbi/genomes/${gcxPath}/${asmName}" asmRpt="${asmDir}/${asmName}_assembly_report.txt" oName=`egrep -i "^# organism name:" ${asmRpt} | tr -d '\r' | sed -e 's/.*(//; s/).*//'` ;; *) @@ -234,35 +251,38 @@ printf "# swap in progress ${query} -> ${target}\n" 1>&2 printf "# " 1>&2 ls -ogd "${swapDir}" 1>&2 if [ "${doneCount}" -ne 2 ]; then exit 0 fi fi if [ "${doneCount}" -eq 2 ]; then printf "# all done\n" 1>&2 fi export tOrgName="$(orgName $target)" export qOrgName="$(orgName $query)" +export tAsmSize="$(asmSize $target)" +export qAsmSize="$(asmSize $query)" export tSequenceCount="$(seqCount $target)" export qSequenceCount="$(seqCount $query)" printf "# working: %s\n" "${buildDir}" 1>&2 printf "# target: $target - $tOrgName - $tClade - $tSequenceCount sequences\n" 1>&2 printf "# query: $query - $qOrgName - $qClade - $qSequenceCount sequences\n" 1>&2 +LC_NUMERIC=en_US printf "# sizes: target: %'d - query: %'d\n" "${tAsmSize}" "${qAsmSize}" 1>&2 export seq1Limit="40" if [ "${tSequenceCount}" -gt 50000 ]; then seq1Limit="100" fi export seq2Limit="100" if [ "${qSequenceCount}" -gt 50000 ]; then seq2Limit="500" fi export minScore="3000" export linearGap="medium" case $tClade in @@ -284,50 +304,88 @@ "primate") ;; "mammal") ;; "other") minScore="5000" linearGap="loose" ;; esac ;; "other") minScore="5000" linearGap="loose" esac +if [ "$tClade" == "primate" -a "$qClade" == "primate" ]; then + export defString="# ${qOrgName} ${Query} vs. ${tOrgName} ${Target} BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz +BLASTZ_T=2 +BLASTZ_O=600 +BLASTZ_E=150 +BLASTZ_M=254 +BLASTZ_K=4500 +BLASTZ_Y=15000 +BLASTZ_Q=/hive/data/staging/data/blastz/human_chimp.v2.q +# A C G T +# A 90 -330 -236 -356 +# C -330 100 -318 -236 +# G -236 -318 100 -330 +# T -356 -236 -330 90 # TARGET: ${tOrgName} ${Target} SEQ1_DIR=${target2bit} SEQ1_LEN=${targetSizes} SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=${seq1Limit} # QUERY: ${qOrgName} ${Query} SEQ2_DIR=${query2bit} SEQ2_LEN=${querySizes} SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=${seq2Limit} BASE=${buildDir} TMPDIR=/dev/shm " +else + +export defString="# ${qOrgName} ${Query} vs. ${tOrgName} ${Target} +BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz + +# TARGET: ${tOrgName} ${Target} +SEQ1_DIR=${target2bit} +SEQ1_LEN=${targetSizes} +SEQ1_CHUNK=20000000 +SEQ1_LAP=10000 +SEQ1_LIMIT=${seq1Limit} + +# QUERY: ${qOrgName} ${Query} +SEQ2_DIR=${query2bit} +SEQ2_LEN=${querySizes} +SEQ2_CHUNK=20000000 +SEQ2_LAP=0 +SEQ2_LIMIT=${seq2Limit} + +BASE=${buildDir} +TMPDIR=/dev/shm +" + +fi ### skip primary alignment if it is already done ### primaryDone == 0 means NOT done yet if [ $primaryDone -eq 0 ]; then mkdir "${buildDir}" ### setup the DEF file printf "%s" "${defString}" > ${buildDir}/DEF ### setup the buildDir/run.sh script printf "#!/bin/bash set -beEu -o pipefail export targetDb=\"${tAsmId}\"