aa6447741ae61479b43cbd2b02182aa405487dab
hiram
  Fri Sep 30 15:19:00 2022 -0700
can now read qSizes from file or URL, refs #29819

diff --git src/hg/mouseStuff/netClass/netClass.c src/hg/mouseStuff/netClass/netClass.c
index 07d216b..9d53d85 100644
--- src/hg/mouseStuff/netClass/netClass.c
+++ src/hg/mouseStuff/netClass/netClass.c
@@ -1,78 +1,83 @@
 /* netClass - Add classification info to net. */
 
 /* Copyright (C) 2013 The Regents of the University of California 
  * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 #include "common.h"
 #include "linefile.h"
+#include "chromInfo.h"
 #include "hash.h"
 #include "options.h"
 #include "rbTree.h"
 #include "jksql.h"
 #include "hdb.h"
 #include "localmem.h"
 #include "agpGap.h"
 #include "simpleRepeat.h"
 #include "liftUp.h"
 #include "chainNet.h"
 
 
 /* Command line switches. */
 char *tNewR = NULL;
 char *qNewR = NULL;
 boolean noAr = FALSE;
 char *qRepeatTable = NULL;
 char *tRepeatTable = NULL;
+char *qSizes = NULL;	/* -qSizes: chrom.sizes file used instead of query chromInfo table */
 struct hash *liftHashT = NULL;
 struct hash *liftHashQ = NULL;
 
 /* Localmem obj shared by cached query rbTrees. */
 struct lm *qLm = NULL;
 
 /* command line option specifications */
 static struct optionSpec optionSpecs[] = {
     {"tNewR", OPTION_STRING},
     {"qNewR", OPTION_STRING},
     {"noAr", OPTION_BOOLEAN},
     {"qRepeats", OPTION_STRING},
     {"tRepeats", OPTION_STRING},
     {"liftT", OPTION_STRING},
     {"liftQ", OPTION_STRING},
+    {"qSizes", OPTION_STRING},	/* query chrom.sizes file, stored in global qSizes by main() */
     {NULL, 0}
 };
 
 static void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "netClass - Add classification info to net\n"
   "usage:\n"
   "   netClass [options] in.net tDb qDb out.net\n"
   "       tDb - database to fetch target repeat masker table information\n"
   "       qDb - database to fetch query repeat masker table information\n"
   "options:\n"
   "   -tNewR=dir - Dir of chrN.out.spec files, with RepeatMasker .out format\n"
   "                lines describing lineage specific repeats in target\n"
   "   -qNewR=dir - Dir of chrN.out.spec files for query\n"
   "   -noAr - Don't look for ancient repeats\n"
   "   -qRepeats=table - table name for query repeats in place of rmsk\n"
   "   -tRepeats=table - table name for target repeats in place of rmsk\n"
   "                   - for example: -tRepeats=windowmaskerSdust\n"
   "   -liftQ=file.lft - Lift in.net's query coords to chrom-level using\n"
   "                     file.lft (for accessing chrom-level coords in qDb)\n"
   "   -liftT=file.lft - Lift in.net's target coords to chrom-level using\n"
   "                     file.lft (for accessing chrom-level coords in tDb)\n"
+  "   -qSizes=chrom.sizes - file with query chrom.sizes instead of reading\n"
+  "                   - the chromInfo table from the database\n"
   );
 }
 
 struct chrom
 /* Basic information on a chromosome. */
     {
     struct chrom *next;	  /* Next in list */
     char *name;		  /* Chromosome name, allocated in hash. */
     int size;		  /* Chromosome size. */
     struct rbTree *nGaps; /* Gaps in sequence (Ns) */
     struct rbTree *repeats; /* Repeats in sequence */
     struct rbTree *newRepeats; /* New (lineage specific) repeats. */
     struct rbTree *oldRepeats; /* Old (pre-split) repeats. */
     struct rbTree *trf;	       /* Simple repeats. */
     };
@@ -665,50 +670,66 @@
 else
     errAbort("Can't find ancientRepeat table in %s or %s",
 	     sqlGetDatabase(tConn), sqlGetDatabase(qConn));
 char query[1024];
 sqlSafef(query, sizeof query, "select name,family,class from ancientRepeat");
 sr = sqlGetResult(conn, query);
 while ((row = sqlNextRow(sr)) != NULL)
     {
     sprintf(key, "%s.%s.%s", row[0], row[1], row[2]);
     hashAdd(hash, key, NULL);
     }
 sqlFreeResult(&sr);
 return hash;
 }
 
-void getChroms(struct sqlConnection *conn, struct hash **retHash,
-	       struct chrom **retList)
-/* Get hash of chromosomes from database. */
+static void getChroms(struct sqlConnection *conn, struct hash **retHash,
+	       struct chrom **retList, char *sizes)
+/* Get hash and list of chromosomes from sizes file if given, else from chromInfo table via conn. */
 {
-struct sqlResult *sr;
-char **row;
 struct chrom *chromList = NULL, *chrom;
 struct hash *hash = hashNew(8);
 
+if (sizes)
+    {
+    struct chromInfo *list = chromInfoListFromFile(sizes), *ci = NULL;
+    for (ci = list; ci; ci=ci->next)
+	{
+	AllocVar(chrom);
+	hashAddSaveName(hash, ci->chrom, chrom, &chrom->name);	/* name is kept by the hash, not borrowed from ci */
+	chrom->size = ci->size;
+	slAddHead(&chromList, chrom);
+	}
+    chromInfoFreeList(&list);	/* name and size were copied out above, so the temporary list can go */
+    }
+else
+    {
+    struct sqlResult *sr;
+    char **row;
     char query[1024];
     sqlSafef(query, sizeof query, "select chrom,size from chromInfo");
     sr = sqlGetResult(conn, query);
     while ((row = sqlNextRow(sr)) != NULL)
 	{
 	AllocVar(chrom);
 	hashAddSaveName(hash, row[0], chrom, &chrom->name);
 	chrom->size = atoi(row[1]);
 	slAddHead(&chromList, chrom);
 	}
     sqlFreeResult(&sr);
+    }
+
 slReverse(&chromList);
 *retHash = hash;
 *retList = chromList;
 }
 
 int liftStart(char *name, int start, struct hash *liftHash)
 /* Lift start if necessary. */
 {
 int s = start;
 if (liftHash != NULL)
     {
     struct liftSpec *lft = hashMustFindVal(liftHash, name);
     s += lft->offset;
     }
 return s;
@@ -871,31 +892,31 @@
 {
 struct chainNet *net;
 struct lineFile *lf = lineFileOpen(inName, TRUE);
 FILE *f = mustOpen(outName, "w");
 struct chrom *qChromList, *chrom;
 struct hash *qChromHash;
 struct hash *arHash = NULL;
 struct sqlConnection *tConn = sqlConnect(tDb);
 struct sqlConnection *qConn = sqlConnect(qDb);
 
 qLm = lmInit(0);
 
 if (!noAr)
     arHash = getAncientRepeats(tConn, qConn);
 
-getChroms(qConn, &qChromHash, &qChromList);
+getChroms(qConn, &qChromHash, &qChromList, qSizes);
 
 verbose(1, "Reading gaps in %s\n", qDb);
 if (sqlTableExists(qConn, "gap"))
     {
     getSeqGapsUnsplit(qConn, qChromHash);
     }
 else
     {
     for (chrom = qChromList; chrom != NULL; chrom = chrom->next)
 	chrom->nGaps = getSeqGaps(qConn, chrom->name);
     }
 
 if (qNewR)
     {
     verbose(1, "Reading new repeats from %s\n", qNewR);
@@ -976,28 +997,29 @@
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 char *liftFileQ = NULL, *liftFileT = NULL;
 optionInit(&argc, argv, optionSpecs);
 if (argc != 5)
     usage();
 tNewR = optionVal("tNewR", tNewR);
 qNewR = optionVal("qNewR", qNewR);
 noAr = optionExists("noAr");
 qRepeatTable = optionVal("qRepeats", qRepeatTable);
 tRepeatTable = optionVal("tRepeats", tRepeatTable);
 liftFileQ = optionVal("liftQ", liftFileQ);
 liftFileT = optionVal("liftT", liftFileT);
+qSizes = optionVal("qSizes", qSizes);	/* handed to getChroms() from netClass() */
 if (liftFileQ != NULL)
     {
     struct liftSpec *lifts = readLifts(liftFileQ);
     liftHashQ = hashLift(lifts, TRUE);
     }
 if (liftFileT != NULL)
     {
     struct liftSpec *lifts = readLifts(liftFileT);
     liftHashT = hashLift(lifts, TRUE);
     }
 netClass(argv[1], argv[2], argv[3], argv[4]);
 return 0;
 }