d78f2f29016298f39d54b536b0b51a9ff8ffbff9 hiram Mon Feb 12 15:07:39 2024 -0800
procedure to add selective annotations to ncbiRefSeq tables refs #32902

diff --git src/hg/makeDb/doc/mm39/chrM.ncbiRefSeq.txt src/hg/makeDb/doc/mm39/chrM.ncbiRefSeq.txt
new file mode 100644
index 0000000..e022ea6
--- /dev/null
+++ src/hg/makeDb/doc/mm39/chrM.ncbiRefSeq.txt
@@ -0,0 +1,108 @@
+# add the NCBI RefSeq genes for chrM to the NCBI RefSeq track
+# DONE - Hiram - 2024-02-12
+
+mkdir /hive/data/genomes/mm39/bed/ncbiRefSeq.2023-04-19/chrMcatchUp
+cd /hive/data/genomes/mm39/bed/ncbiRefSeq.2023-04-19/chrMcatchUp
+
+# what is the NCBI name for chrM:
+
+grep chrM ../../chromAlias/mm39.chromAlias.txt
+
+# chrM AY172335.1 MT NC_005089.1
+
+# extract the gff descriptions from the primary gff file:
+
+zgrep NC_005089.1 ../download/GCF_000001635.27_GRCm39_genomic.gff.gz \
+    | grep -v "^#" > genome.gff.NC_005089.1.tsv
+
+# extract the RefSeqLink data from that gff file:
+
+~/kent/src/hg/makeDb/doc/mm39/gffToLink.pl > to.add.ncbiRefSeqLink.tsv
+
+# obtain the genePred file from the genomic.gff.gz file:
+
+export asmId=GCF_000001635.27_GRCm39
+export downloadDir=/hive/data/genomes/mm39/bed/ncbiRefSeq.2023-04-19/download
+export ncbiGffGz=$downloadDir/${asmId}_genomic.gff.gz
+
+zcat $ncbiGffGz \
+    | sed -re 's/([;\t])SO_type=/\1so_type=/;' \
+    | gff3ToGenePred -refseqHacks -attrsOut=$asmId.attrs.txt \
+        -unprocessedRootsOut=$asmId.unprocessedRoots.txt stdin raw.$asmId.gp
+
+grep NC_005089.1 raw.$asmId.gp > NC_005089.1.gp
+sed -e 's/NC_005089.1/chrM/g;' NC_005089.1.gp > chrM.gp
+
+# save the existing files in case they are broken during this process:
+hgsql -N -e 'select * from ncbiRefSeq;' mm39 > before.mm39.ncbiRefSeq.tsv
+hgsql -N -e 'select * from ncbiRefSeqCurated;' mm39 > before.mm39.ncbiRefSeqCurated.tsv
+hgsql -N -e 'select * from ncbiRefSeqLink;' mm39 > before.mm39.ncbiRefSeqLink.tsv
+
+# loading the genePred data into two tables:
+
+hgsql -N -e 'select count(*) from ncbiRefSeq;' mm39
+# 134700
+hgsql -e 'LOAD DATA LOCAL INFILE "to.add.chrM.sql.tsv" INTO TABLE ncbiRefSeq;' mm39
+hgsql -N -e 'select count(*) from ncbiRefSeq;' mm39
+# 134737
+
+hgsql -N -e 'select count(*) from ncbiRefSeqCurated;' mm39
+# 55621
+hgsql -e 'LOAD DATA LOCAL INFILE "to.add.chrM.sql.tsv" INTO TABLE ncbiRefSeqCurated;' mm39
+hgsql -N -e 'select count(*) from ncbiRefSeqCurated;' mm39
+# 55658
+
+## and the RefSeqLink data:
+
+hgsql -N -e 'select count(*) from ncbiRefSeqLink;' mm39
+# 134699
+hgsql -e 'LOAD DATA LOCAL INFILE "to.add.ncbiRefSeqLink.tsv" INTO TABLE ncbiRefSeqLink;' mm39
+hgsql -N -e 'select count(*) from ncbiRefSeqLink;' mm39
+# 134736
+
+# check the relationship between the tables:
+
+hgsql -e 'SELECT e.name,e.chrom,j.id,j.name FROM
+    ncbiRefSeq e,
+    ncbiRefSeqLink j
+WHERE e.name = j.id AND e.chrom = "chrM";' mm39
+
++---------+-------+---------+---------+
+| TrnF    | chrM  | TrnF    | TrnF    |
+| mt-Rnr1 | chrM  | mt-Rnr1 | mt-Rnr1 |
+| TrnV    | chrM  | TrnV    | TrnV    |
+| mt-Rnr2 | chrM  | mt-Rnr2 | mt-Rnr2 |
+| TrnL1   | chrM  | TrnL1   | TrnL1   |
+| ND1     | chrM  | ND1     | ND1     |
+| TrnI    | chrM  | TrnI    | TrnI    |
+| TrnQ    | chrM  | TrnQ    | TrnQ    |
+| TrnM    | chrM  | TrnM    | TrnM    |
+| ND2     | chrM  | ND2     | ND2     |
+| TrnW    | chrM  | TrnW    | TrnW    |
+| TrnA    | chrM  | TrnA    | TrnA    |
+| TrnN    | chrM  | TrnN    | TrnN    |
+| TrnC    | chrM  | TrnC    | TrnC    |
+| TrnY    | chrM  | TrnY    | TrnY    |
+| COX1    | chrM  | COX1    | COX1    |
+| TrnS1   | chrM  | TrnS1   | TrnS1   |
+| TrnD    | chrM  | TrnD    | TrnD    |
+| COX2    | chrM  | COX2    | COX2    |
+| TrnK    | chrM  | TrnK    | TrnK    |
+| ATP8    | chrM  | ATP8    | ATP8    |
+| ATP6    | chrM  | ATP6    | ATP6    |
+| COX3    | chrM  | COX3    | COX3    |
+| TrnG    | chrM  | TrnG    | TrnG    |
+| ND3     | chrM  | ND3     | ND3     |
+| TrnR    | chrM  | TrnR    | TrnR    |
+| ND4L    | chrM  | ND4L    | ND4L    |
+| ND4     | chrM  | ND4     | ND4     |
+| TrnH    | chrM  | TrnH    | TrnH    |
+| TrnS2   | chrM  | TrnS2   | TrnS2   |
+| TrnL2   | chrM  | TrnL2   | TrnL2   |
+| ND5     | chrM  | ND5     | ND5     |
+| ND6     | chrM  | ND6     | ND6     |
+| TrnE    | chrM  | TrnE    | TrnE    |
+| CYTB    | chrM  | CYTB    | CYTB    |
+| TrnT    | chrM  | TrnT    | TrnT    |
+| TrnP    | chrM  | TrnP    | TrnP    |
++---------+-------+---------+---------+