cb2db10ee029b7dfed6b003d2a4bdf29ec0e99a7 markd Wed Nov 15 21:17:21 2023 -0800 work on tabula-sapiens, but it is not done diff --git src/hg/makeDb/doc/hg38/tabulaSapiens.txt src/hg/makeDb/doc/hg38/tabulaSapiens.txt index 2094a2b..34d4558 100644 --- src/hg/makeDb/doc/hg38/tabulaSapiens.txt +++ src/hg/makeDb/doc/hg38/tabulaSapiens.txt @@ -1,66 +1,89 @@ This describes building tabula-sapiens related tracks ############################################################################# # bam-dec2022 download (Max) ############################################################################# Download to: /hive/data/inside/cells/datasets/tabula-sapiens/bam-dec2022/ some doc in https://github.com/czbiohub/tabula-sapiens metadata: - /hive/data/inside/cells/datasets/tabula-sapiens/all/meta.fixed.tsv - /hive/data/inside/cells/datasets/tabula-sapiens/orig/metaMerged/cellxgene.tsv + /hive/data/inside/cells/datasets/tabula-sapiens/orig/from-cellxgene-all.meta.tsv # note that smartseq2 BAM can't be erad by samtools or picard due to duplicate # chromosomes: cd /hive/data/inside/cells/datasets/tabula-sapiens/bam-dec2022/Pilot1/alignment-gencode/ % samtools view -H ./smartseq2/B107813_G5_S31.homo.covid19.Aligned.out.sorted.bam| head [E::sam_hrecs_refs_from_targets_array] Duplicate entry "NC_040671" in target list samtools view: failed to add PG line to the header % picard ValidateSamFile I=./smartseq2/B107813_G5_S31.homo.covid19.Aligned.out.sorted.bam ERROR 2023-02-08 21:44:44 ValidateSamFile Cannot add sequence that already exists in SAMSequenceDictionary: NC_040671 however, intronProspector can read them +Only BAM files in ceelxgene.tsv should be used. There are bam files for wells +where no cells were sorted and poor quality cells are not annotated. + + ############################################################################# # setup for intronProspector runs (markd) ############################################################################# Install htslib-1.18 and intronProspector-1.0.3 in /cluster/software/. Obtain from: https://github.com/samtools/htslib/releases/download/1.18/htslib-1.18.tar.bz2 https://github.com/diekhans/intronProspector/archive/refs/tags/v1.0.3.tar.gz +## +# get list of BAMs and associate metadata +# this is tricky because: +# - We have BAMs that were rejected and not part of the metadata +# - BAM names contain the cell id, plus other information, but metadata doesn't +# directly reference the BAM +# B107821_A1_S297.homo.gencode.v30.ERCC.chrM -> +# Pilot1/alignment-gencode/SS2/B107821_A1_S297.homo.gencode.v30.ERCC.chrM.Aligned.out.sorted.bam +# TSP2_Vasculature_aorta_SS2_B114577_B133059_Endothelial_P5_S365 -> +# Pilot2/alignment-gencode/smartseq2/batch3/TSP2_Vasculature_aorta_SS2_B114577_B133059_Endothelial_P5_S365.homo.gencode.v30.ERCC.chrM.Aligned.out.sorted.bam +## ## # run intronProspector ## cd /hive/data/genomes/hg38/bed/tabula-sapiens-dec2022 # script runIntronProspector is run on each BAM like intronProspector --genome-fasta=${ref} --skip-missing-targets --min-confidence-score=1.0 \p --intron-calls=${outtsvtmp} ${inbam} +# get list of SS2 cell ids in metadata +tmlr filter '$method == "smartseq2"' /hive/data/inside/cells/datasets/tabula-sapiens/all/meta.fixed.tsv > smartseq2.metadata.tsv + +# get list of all BAMs we have along with id + +##### tawk '$3 == "smartseq2"{print $1}' /hive/data/inside/cells/datasets/tabula-sapiens/all/meta.fixed.tsv |head + + # build jobs + (cd /hive/data/inside/cells/datasets/tabula-sapiens/bam-dec2022 && find Pilot*/alignment-gencode/{smartseq2,SS2} -name '*.bam') | awk '{print "./runIntronProspector", $0}' >smartseq2.jobs para create smartseq2.jobs -batch=b1 para push -maxPush=1000000 -batch=b1 ## # metadata check ## find tmp -name "*.tsv" | sed -E -e 's#^.+/##' -e 's#.homo.gencode.+$##' > calls-cells.tab tmlr filter '$method == "smartseq2"' /hive/data/inside/cells/datasets/tabula-sapiens/all/meta.fixed.tsv > smartseq2.metadata.tsv have: selectById 1 smartseq2.metadata.tsv 1 calls-cells.tab |twc missing: selectById -not 1 smartseq2.metadata.tsv 1 calls-cells.tab |twc