4513440c373f2ba57b4d28910e832b4dc85108f5 hiram Thu Feb 6 16:53:25 2020 -0800 warn only for liftUp not error out duplicates cause missing refs #23891 diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl index 5303e5a..6e244a5 100755 --- src/hg/utils/automation/doNcbiRefSeq.pl +++ src/hg/utils/automation/doNcbiRefSeq.pl @@ -356,31 +356,31 @@ fi rm -f before.cut9.txt # extract labels from semi-structured text in gbff COMMENT/description sections: zcat \$downloadDir/\${asmId}_rna.gbff.gz \\ | (grep ' :: ' || true) \\ | perl -wpe 's/\\s+::.*//; s/^\\s+//;' \\ | sort -u \\ > pragmaLabels.txt # extract cross reference text for refLink \$gff3ToRefLink \$downloadDir/\$asmId.raFile.txt \$ncbiGffGz pragmaLabels.txt 2> \$db.refLink.stderr.txt \\ | sort > \$asmId.refLink.tab # converting the NCBI coordinates to UCSC coordinates -liftUp -extGenePred -type=.gp stdout $localLiftFile drop \$asmId.gp \\ +liftUp -extGenePred -type=.gp stdout $localLiftFile warn \$asmId.gp \\ | gzip -c > \$asmId.\$db.gp.gz $genePredCheckDb \$asmId.\$db.gp.gz # curated subset of all genes (zegrep "^[NY][MRP]_" \$asmId.\$db.gp.gz || true) > \$db.curated.gp # may not be any curated genes if [ ! -s \$db.curated.gp ]; then rm -f \$db.curated.gp elif [ -s \$asmId.refseqSelectTranscripts.txt ]; then cat \$db.curated.gp | fgrep -f \$asmId.refseqSelectTranscripts.txt - \\ > \$db.refseqSelect.curated.gp # may not be any refseqSelect.curated genes if [ ! -s \$db.refseqSelect.curated.gp ]; then rm -f \$db.refseqSelect.curated.gp fi @@ -421,31 +421,31 @@ \$db.other.extras.bed \$db.chrom.sizes \$db.other.bb # Make trix index for ncbiRefSeqOther $ncbiRefSeqOtherIxIxx \\ ncbiRefSeqOther.as \$db.other.extras.bed > ncbiRefSeqOther.ix.tab ixIxx ncbiRefSeqOther.ix.tab ncbiRefSeqOther.ix{,x} # PSL data will be loaded into a psl type track to show the alignments (zgrep "^#" \$ncbiGffGz | head || true) > gffForPsl.gff (zegrep -v "NG_" \$ncbiGffGz || true) \\ | awk -F\$'\\t' '\$3 == "cDNA_match" || \$3 == "match"' >> gffForPsl.gff gff3ToPsl -dropT \$downloadDir/\$asmId.ncbi.chrom.sizes \$downloadDir/rna.sizes \\ gffForPsl.gff stdout | pslPosTarget stdin \$asmId.psl simpleChain -outPsl -maxGap=300000 \$asmId.psl stdout | pslSwap stdin stdout \\ - | liftUp -type=.psl stdout $localLiftFile drop stdin \\ + | liftUp -type=.psl stdout $localLiftFile warn stdin \\ | gzip -c > \$db.psl.gz pslCheck $pslTargetSizes \\ -querySizes=\$downloadDir/rna.sizes \$db.psl.gz # extract RNA CDS information from genbank record # Note: $asmId.raFile.txt could be used instead of _rna.gbff.gz \$gbffToCds \$downloadDir/\${asmId}_rna.gbff.gz | sort > \$asmId.rna.cds # the NCBI _genomic.gff.gz file only contains cDNA_match records for transcripts # that do not *exactly* match the reference genome. For all other transcripts # construct 'fake' PSL records representing the alignments of all cDNAs # that would be perfect matches to the reference genome. The pslFixCdsJoinGap # will fixup those records with unusual alignments due to frameshifts of # various sorts as found in the rna.cds file: genePredToFakePsl -qSizes=\$downloadDir/rna.sizes $fakePslSizes \$db \$db.ncbiRefSeq.gp \\