cb2db10ee029b7dfed6b003d2a4bdf29ec0e99a7
markd
  Wed Nov 15 21:17:21 2023 -0800
work on tabula-sapiens, but it is not done

diff --git src/hg/makeDb/doc/hg38/tabulaSapiens.txt src/hg/makeDb/doc/hg38/tabulaSapiens.txt
index 2094a2b..34d4558 100644
--- src/hg/makeDb/doc/hg38/tabulaSapiens.txt
+++ src/hg/makeDb/doc/hg38/tabulaSapiens.txt
@@ -1,66 +1,89 @@
 This describes building tabula-sapiens related tracks
 
 
 #############################################################################
 # bam-dec2022 download (Max)
 #############################################################################
 
 
 Download to:
   /hive/data/inside/cells/datasets/tabula-sapiens/bam-dec2022/
 
 some doc in https://github.com/czbiohub/tabula-sapiens
 
 metadata:
-  /hive/data/inside/cells/datasets/tabula-sapiens/all/meta.fixed.tsv
-  /hive/data/inside/cells/datasets/tabula-sapiens/orig/metaMerged/cellxgene.tsv
+  /hive/data/inside/cells/datasets/tabula-sapiens/orig/from-cellxgene-all.meta.tsv
 
 # note that smartseq2 BAM can't be erad by samtools or picard due to duplicate
 # chromosomes:
     cd /hive/data/inside/cells/datasets/tabula-sapiens/bam-dec2022/Pilot1/alignment-gencode/
 
     % samtools view -H  ./smartseq2/B107813_G5_S31.homo.covid19.Aligned.out.sorted.bam| head
     [E::sam_hrecs_refs_from_targets_array] Duplicate entry "NC_040671" in target list
     samtools view: failed to add PG line to the header
 
     % picard ValidateSamFile I=./smartseq2/B107813_G5_S31.homo.covid19.Aligned.out.sorted.bam
     ERROR	2023-02-08 21:44:44	ValidateSamFile	Cannot add sequence that already exists in SAMSequenceDictionary: NC_040671
 
 however, intronProspector can read them
 
+Only BAM files in ceelxgene.tsv should be used. There are bam files for wells
+where no cells were sorted and poor quality cells are not annotated.
+
+
 #############################################################################
 # setup for intronProspector runs (markd)
 #############################################################################
 
 Install htslib-1.18 and intronProspector-1.0.3 in /cluster/software/. Obtain
 from:
 
    https://github.com/samtools/htslib/releases/download/1.18/htslib-1.18.tar.bz2
    https://github.com/diekhans/intronProspector/archive/refs/tags/v1.0.3.tar.gz
 
+##
+# get list of BAMs and associate metadata
+# this is tricky because:
+#   - We have BAMs that were rejected and not part of the metadata
+#   - BAM names contain the cell id, plus other information, but metadata doesn't
+#     directly reference the BAM
+#       B107821_A1_S297.homo.gencode.v30.ERCC.chrM ->
+#          Pilot1/alignment-gencode/SS2/B107821_A1_S297.homo.gencode.v30.ERCC.chrM.Aligned.out.sorted.bam
+#       TSP2_Vasculature_aorta_SS2_B114577_B133059_Endothelial_P5_S365 ->
+#          Pilot2/alignment-gencode/smartseq2/batch3/TSP2_Vasculature_aorta_SS2_B114577_B133059_Endothelial_P5_S365.homo.gencode.v30.ERCC.chrM.Aligned.out.sorted.bam
+##
 
 ##
 # run intronProspector
 ## 
 cd /hive/data/genomes/hg38/bed/tabula-sapiens-dec2022
 
 # script runIntronProspector is run on each BAM like
    intronProspector --genome-fasta=${ref} --skip-missing-targets --min-confidence-score=1.0 \p
                    --intron-calls=${outtsvtmp} ${inbam}
 
+# get list of SS2 cell ids in metadata
+tmlr filter '$method == "smartseq2"' /hive/data/inside/cells/datasets/tabula-sapiens/all/meta.fixed.tsv > smartseq2.metadata.tsv
+
+# get list of all BAMs we have along with id
+
+##### tawk '$3 == "smartseq2"{print $1}' /hive/data/inside/cells/datasets/tabula-sapiens/all/meta.fixed.tsv |head
+
+
 # build jobs
+
 (cd /hive/data/inside/cells/datasets/tabula-sapiens/bam-dec2022 && find Pilot*/alignment-gencode/{smartseq2,SS2} -name '*.bam') | awk '{print "./runIntronProspector", $0}' >smartseq2.jobs
 
 para create smartseq2.jobs -batch=b1
 para push -maxPush=1000000 -batch=b1
 
 ##
 # metadata check
 ##
 find tmp -name "*.tsv" | sed -E -e 's#^.+/##' -e 's#.homo.gencode.+$##' > calls-cells.tab
 
 tmlr filter '$method == "smartseq2"' /hive/data/inside/cells/datasets/tabula-sapiens/all/meta.fixed.tsv > smartseq2.metadata.tsv
 
 have: selectById 1 smartseq2.metadata.tsv 1 calls-cells.tab |twc
 missing: selectById -not 1 smartseq2.metadata.tsv 1 calls-cells.tab |twc