4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/txPaper/txPaperPfam/txPaperPfam.c src/hg/txPaper/txPaperPfam/txPaperPfam.c
index a7151eb..30091c7 100644
--- src/hg/txPaper/txPaperPfam/txPaperPfam.c
+++ src/hg/txPaper/txPaperPfam/txPaperPfam.c
@@ -1,199 +1,199 @@
 /* txPaperPfam - Summarize pfam feature coverage of coding transcripts and genes, RefSeq vs. non-RefSeq. */
 
 /* Copyright (C) 2011 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "obscure.h"
 #include "portable.h"
 #include "txInfo.h"
 #include "protFeat.h"
 
 
 void usage()
 /* Report the command line synopsis through errAbort. */
 {
 static char *usageText =
   "txPaperPfam - Do stuff for pfam calcs.\n"
   "usage:\n"
   "   txPaperPfam genes.info pfam.tab pfamDesc.tab canonical.lst outDir\n"
   "options:\n"
   "   -xxx=XXX\n";
 /* The text contains no conversion specifiers, so routing it through %s
  * produces the same message as using it directly as the format. */
 errAbort("%s", usageText);
 }
 
 /* Command line option table handed to optionInit() in main().  It holds only
  * the {NULL, 0} entry (presumably the end-of-table marker), so no named
  * options are declared for this program. */
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
 
 FILE *mustCreateInDir(char *dir, char *file)
 /* Build the path dir/file, open it for writing with mustOpen, and return
  * the resulting stream. */
 {
 char fullPath[PATH_LEN];
 FILE *out;
 safef(fullPath, sizeof(fullPath), "%s/%s", dir, file);
 out = mustOpen(fullPath, "w");
 return out;
 }
 
 struct featCount
 /* Count up number of features. */
     {
     struct featCount *next;	/* Next element in singly linked list. */
     char *feature;	/* Name of feature. */
     int count;		/* Number of instances. */
     };
 
 int featCountCmp(const void *va, const void *vb)
 /* Compare to sort based on count, most numerous first.
  * va and vb each point at a (struct featCount *), which is how this file
  * passes the comparator to slSort.  Returns a positive value when b has
  * the larger count, a negative value when a has the larger count, and
  * zero when the counts are equal. */
 {
 const struct featCount *a = *((const struct featCount *const *)va);
 const struct featCount *b = *((const struct featCount *const *)vb);
 /* Compare instead of subtracting: b->count - a->count overflows (undefined
  * behavior) when the two counts are far apart, e.g. INT_MAX vs. a negative. */
 return (b->count > a->count) - (b->count < a->count);
 }
 
 void countFeature(char *protName, struct hash *featHash, struct hash *countHash, struct featCount **pCountList)
 /* Look up protName in featHash, and update countHash with features seen for this
  * protein.  Do not count a feature more than once per protein.
  *   protName   - key to look up in featHash
  *   featHash   - values are struct protFeat; one key may have several entries
  *   countHash  - feature name -> struct featCount, updated in place
  *   pCountList - newly created featCount records are pushed onto the head of this list */
 {
 /* Per-call scratch set of feature names already counted for this protein. */
 struct hash *uniqHash = hashNew(8);
 struct hashEl *el;
 for (el = hashLookup(featHash, protName); el != NULL; el = hashLookupNext(el))
     {
     struct protFeat *feat = el->val;
     char *featName = feat->feature;
     if (hashLookup(uniqHash, featName))
         continue;	/* Already counted this feature for this protein. */
     hashAdd(uniqHash, featName, NULL);
     struct featCount *count = hashFindVal(countHash, featName);
     if (count == NULL)
         {
         /* First sighting of this feature: make a record for it.  The increment
          * below relies on AllocVar handing back zeroed memory (presumably so -
          * confirm against common.h). */
         AllocVar(count);
         hashAddSaveName(countHash, featName, count, &count->feature);
         slAddHead(pCountList, count);
         }
     count->count += 1;
     }
 /* Release the scratch set; without this one hash leaks on every call. */
 hashFree(&uniqHash);
 }
 
 void featCountStats(struct featCount **pList, char *type, struct hash *descHash, FILE *f)
 /* Sort *pList with featCountCmp, write the sum of all counts labeled with type,
  * then write up to 25 entries from the front of the sorted list, each with its
  * percentage of the sum, its count, its name, and its description looked up
  * in descHash. */
 {
 struct featCount *fc;
 int sum = 0;
 slSort(pList, featCountCmp);
 for (fc = *pList; fc != NULL; fc = fc->next)
     sum += fc->count;
 fprintf(f, "%s total unique domains in proteins: %d\n", type, sum);
 int rank = 1;
 fc = *pList;
 while (rank <= 25 && fc != NULL)
     {
     char *desc = naForNull(hashFindVal(descHash, fc->feature));
     double percent = 100.0 * fc->count / sum;
     fprintf(f, "   %4.2f%% (%d) %s \"%s\"\n", percent, fc->count, fc->feature, desc);
     ++rank;
     fc = fc->next;
     }
 }
 
 void txPaperPfam(char *infoFile, char *pfamFile, char *pfamDescFile, char *canonicalFile, char *outDir)
 /* txPaperPfam - Write outDir/summary.  The summary reports, for transcripts whose
  * category is "coding", how many are RefSeq vs. non-RefSeq at the transcript level
  * and at the gene level (transcripts named in canonicalFile), how many of each have
  * at least one pfam feature, and the most common pfam features among the RefSeq and
  * non-RefSeq genes.
  *   infoFile      - txInfo records, read with txInfoLoadAll
  *   pfamFile      - protFeat records, read with protFeatLoadAll
  *   pfamDescFile  - tab-separated rows; the second column is used as the key and
  *                   the third column as the description
  *   canonicalFile - words naming the canonical transcripts
  *   outDir        - directory passed to makeDir; "summary" is written inside it
  * Percentages use floating point division, so a zero denominator shows up in the
  * output as nan/inf (on IEEE platforms) rather than stopping the program. */
 {
 /* Load in txInfo. */
 struct txInfo *info, *infoList = txInfoLoadAll(infoFile);
 
 /* Load in pfam features, keyed by protein name.  A protein with several
  * features gets several entries under the same key. */
 struct protFeat *pfam, *pfamList = protFeatLoadAll(pfamFile);
 struct hash *pfamHash = hashNew(18);
 for (pfam = pfamList; pfam != NULL; pfam = pfam->next)
     hashAdd(pfamHash, pfam->protein, pfam);
 
 /* Load in pfam descriptions into hash */
 struct hash *pfamDescHash = hashNew(0);
 struct lineFile *lf = lineFileOpen(pfamDescFile, TRUE);
 char *row[3];
 while (lineFileRowTab(lf, row))
     hashAdd(pfamDescHash, row[1], cloneString(row[2]));
 lineFileClose(&lf);
 
 struct hash *canonicalHash = hashWordsInFile(canonicalFile, 16);
 
 /* Per-feature tallies, kept separately for RefSeq and non-RefSeq genes. */
 struct hash *refCountHash = hashNew(16);
 struct hash *nonrefCountHash = hashNew(16);
 struct featCount *refCountList = NULL, *nonrefCountList = NULL;
 
 /* Figure out what percentage of genes of various types are
  * covered by pfam feature. */
 makeDir(outDir);
 FILE *f = mustCreateInDir(outDir, "summary");
 int codingTxCount = 0, codingGeneCount = 0;
 int refTxCount = 0, nonrefTxCount = 0, refGeneCount = 0, nonrefGeneCount = 0;
 int refTxPfam = 0, nonrefTxPfam = 0, refGenePfam = 0, nonrefGenePfam = 0;
 for (info = infoList; info != NULL; info = info->next)
     {
     if (sameString(info->category, "coding"))
         {
         boolean gotPfam = (hashLookup(pfamHash, info->name) != NULL);
         boolean isRef = info->isRefSeq;
         /* Canonical transcripts stand in for genes in the gene-level counts. */
         boolean isCanonical = (hashLookup(canonicalHash, info->name) != NULL);
         ++codingTxCount;
         if (isCanonical)
             ++codingGeneCount;
         if (isRef)
             {
             ++refTxCount;
             if (isCanonical)
                 {
                 ++refGeneCount;
                 if (gotPfam)
                     {
                     ++refGenePfam;
                     countFeature(info->name, pfamHash, refCountHash, &refCountList);
                     }
                 }
             if (gotPfam)
                 ++refTxPfam;
             }
         else
             {
             ++nonrefTxCount;
             if (isCanonical)
                 {
                 ++nonrefGeneCount;
                 if (gotPfam)
                     {
                     ++nonrefGenePfam;
                     countFeature(info->name, pfamHash, nonrefCountHash, &nonrefCountList);
                     }
                 }
             if (gotPfam)
                 ++nonrefTxPfam;
             }
         }
     }
 fprintf(f, "Coding transcripts: %d\n", codingTxCount);
 fprintf(f, "Coding genes: %d\n", codingGeneCount);
 fprintf(f, "RefSeq transcripts: %d (%4.2f%%)\n", refTxCount, 100.0 * refTxCount / codingTxCount);
 fprintf(f, "RefSeq genes: %d (%4.2f%%)\n", refGeneCount, 100.0 * refGeneCount / codingGeneCount);
 fprintf(f, "non-RefSeq transcripts: %d (%4.2f%%)\n", nonrefTxCount, 100.0 * nonrefTxCount / codingTxCount);
 fprintf(f, "non-RefSeq genes: %d (%4.2f%%)\n", nonrefGeneCount, 100.0 * nonrefGeneCount / codingGeneCount);
 fprintf(f, "RefSeq transcripts with pfam: %d (%4.2f%%)\n", refTxPfam, 100.0 * refTxPfam / refTxCount);
 fprintf(f, "RefSeq genes with pfam: %d (%4.2f%%)\n", refGenePfam, 100.0 * refGenePfam / refGeneCount);
 fprintf(f, "non-RefSeq transcripts with pfam: %d (%4.2f%%)\n", nonrefTxPfam, 100.0 * nonrefTxPfam / nonrefTxCount);
 fprintf(f, "non-RefSeq genes with pfam: %d (%4.2f%%)\n", nonrefGenePfam, 100.0 * nonrefGenePfam / nonrefGeneCount);
 
 featCountStats(&refCountList, "RefSeq", pfamDescHash, f);
 featCountStats(&nonrefCountList, "non-RefSeq", pfamDescHash, f);
 
 /* Close the summary explicitly so the stream is flushed and released here
  * instead of being left open until process exit. */
 carefulClose(&f);
 }
 
 int main(int argc, char *argv[])
 /* Entry point: run option parsing, insist on exactly five positional
  * arguments, and pass them to txPaperPfam. */
 {
 optionInit(&argc, argv, options);
 if (argc != 6)
     {
     usage();
     }
 char *infoFile = argv[1];
 char *pfamFile = argv[2];
 char *pfamDescFile = argv[3];
 char *canonicalFile = argv[4];
 char *outDir = argv[5];
 txPaperPfam(infoFile, pfamFile, pfamDescFile, canonicalFile, outDir);
 return 0;
 }