f48cccce86f5cc987b4532e7267f036a1ee4f3c5
markd
  Tue Oct 10 13:01:18 2023 -0700
work on tabula sapiens but stuck on missing metadata

diff --git src/hg/makeDb/doc/hg38/tabulaSapiens.txt src/hg/makeDb/doc/hg38/tabulaSapiens.txt
index 8ee393b..2094a2b 100644
--- src/hg/makeDb/doc/hg38/tabulaSapiens.txt
+++ src/hg/makeDb/doc/hg38/tabulaSapiens.txt
@@ -1,42 +1,94 @@
 This describes building tabula-sapiens related tracks
 
 
 #############################################################################
 # bam-dec2022 download (Max)
 #############################################################################
 
 
 Download to:
   /hive/data/inside/cells/datasets/tabula-sapiens/bam-dec2022/
 
 some doc in https://github.com/czbiohub/tabula-sapiens
 
+metadata:
+  /hive/data/inside/cells/datasets/tabula-sapiens/all/meta.fixed.tsv
+  /hive/data/inside/cells/datasets/tabula-sapiens/orig/metaMerged/cellxgene.tsv
 
-#############################################################################
-# setup for intronProspector runs (markd)
-#############################################################################
-
-Install htslib-1.16 and intronProspector-1.0.3 in /cluster/software/. Obtain
-from:
-
-   https://github.com/samtools/htslib/releases/download/1.16/htslib-1.16.tar.bz2
-   https://github.com/diekhans/intronProspector/archive/refs/tags/v1.0.3.tar.gz
-
-Need to get genome sequence matching the BAMs.
-
-STAR/homo.gencode.v30.annotation.ERCC92 which is not in bucket
-
-
-something is weird with samrtseq2 directory BAMs
+# note that smartseq2 BAM can't be erad by samtools or picard due to duplicate
+# chromosomes:
     cd /hive/data/inside/cells/datasets/tabula-sapiens/bam-dec2022/Pilot1/alignment-gencode/
 
     % samtools view -H  ./smartseq2/B107813_G5_S31.homo.covid19.Aligned.out.sorted.bam| head
     [E::sam_hrecs_refs_from_targets_array] Duplicate entry "NC_040671" in target list
     samtools view: failed to add PG line to the header
 
     % picard ValidateSamFile I=./smartseq2/B107813_G5_S31.homo.covid19.Aligned.out.sorted.bam
     ERROR	2023-02-08 21:44:44	ValidateSamFile	Cannot add sequence that already exists in SAMSequenceDictionary: NC_040671
 
+however, intronProspector can read them
+
+#############################################################################
+# setup for intronProspector runs (markd)
+#############################################################################
+
+Install htslib-1.18 and intronProspector-1.0.3 in /cluster/software/. Obtain
+from:
+
+   https://github.com/samtools/htslib/releases/download/1.18/htslib-1.18.tar.bz2
+   https://github.com/diekhans/intronProspector/archive/refs/tags/v1.0.3.tar.gz
+
 
-CZI contacted about problems
+##
+# run intronProspector
+## 
+cd /hive/data/genomes/hg38/bed/tabula-sapiens-dec2022
+
+# script runIntronProspector is run on each BAM like
+   intronProspector --genome-fasta=${ref} --skip-missing-targets --min-confidence-score=1.0 \p
+                   --intron-calls=${outtsvtmp} ${inbam}
+
+# build jobs
+(cd /hive/data/inside/cells/datasets/tabula-sapiens/bam-dec2022 && find Pilot*/alignment-gencode/{smartseq2,SS2} -name '*.bam') | awk '{print "./runIntronProspector", $0}' >smartseq2.jobs
+
+para create smartseq2.jobs -batch=b1
+para push -maxPush=1000000 -batch=b1
+
+##
+# metadata check
+##
+find tmp -name "*.tsv" | sed -E -e 's#^.+/##' -e 's#.homo.gencode.+$##' > calls-cells.tab
+
+tmlr filter '$method == "smartseq2"' /hive/data/inside/cells/datasets/tabula-sapiens/all/meta.fixed.tsv > smartseq2.metadata.tsv
+
+have: selectById 1 smartseq2.metadata.tsv 1 calls-cells.tab |twc
+missing: selectById -not 1 smartseq2.metadata.tsv 1 calls-cells.tab |twc
+
+    calls-cells.tab:         84877
+    smartseq2.metadata.tsv:  27052
+    diff                     57825
+
+    have:                    23213
+    missing:                 61664
+
+# list of missing
+selectById -not 1 smartseq2.metadata.tsv  1 calls-cells.tab >missing.tab
+
+# count per Pilot
+for p in $(cd tmp && echo *) ; do find tmp/$p -name "*.tsv" | sed -E -e 's#^.+/##' -e 's#.homo.gencode.+$##' > calls-cells.$p.tab ; done&
+for f in calls-cells.*.tab; do  selectById -not 1 smartseq2.metadata.tsv  1 $f | (echo -n $f ' ' ; twc -l) ; done >&pilot.metadata.cnts
+    calls-cells.Pilot1.tab          15338
+    calls-cells.Pilot2.tab          12302
+    calls-cells.Pilot3.tab          2276
+    calls-cells.Pilot4.tab          4096
+    calls-cells.Pilot5.tab          629
+    calls-cells.Pilot6.tab          1881
+    calls-cells.Pilot7.tab          1436
+    calls-cells.Pilot8.tab          518
+    calls-cells.Pilot9.tab          1058
+    calls-cells.Pilot10.tab         1556
+    calls-cells.Pilot11.tab         428
+    calls-cells.Pilot12.tab         490
+    calls-cells.Pilot13.tab         978
+    calls-cells.Pilot14.tab         18678