4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/protein/dv2/dv2.c src/hg/protein/dv2/dv2.c
index 69106f6..23e7eb0 100644
--- src/hg/protein/dv2/dv2.c
+++ src/hg/protein/dv2/dv2.c
@@ -1,317 +1,317 @@
 /* dv2 - build dvBed table */
 
 /* Copyright (C) 2013 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 #include "common.h"
 #include "hCommon.h"
 #include "hdb.h"
 
 /* global counter used to collect stats */
 int  np0, np1, np2, nn0, nn1, nn2, nn, np = 0;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "dv - build dvBed table\n"
   "usage:\n"
   "   dv protDb hgDb outTabfile\n"
   "      protDb is the UniProt DB, e.g. unitProt\n"
   "      hgDb is the genome DB, e.g. hg18\n"
   "      outTabfile is the output, e.g. dvBed.tab\n"
   "example: dv uniProt hg18 dvBed.tab\n");
 }
 
 void processVariant(char *database, char *proteinID, int aaSeqLen, int varStart, int varLen, 
 		    char *origSeq,  char *varSeq, char *varId, FILE *outf)
 {
 char query[256];
 struct sqlResult *sr;
 char **row;
 struct sqlConnection  *conn;
 
 char *qNameStr;
 char *qSizeStr;
 char *qStartStr;
 char *qEndStr;
 char *tNameStr=NULL;
 char *tSizeStr;
 char *tStartStr;
 char *tEndStr;
 char *blockCountStr;
 char *blockSizesStr;
 char *qStartsStr;
 char *tStartsStr;
 
 char *strand       = NULL;
 int blockCount=0;
 char *exonStartStr = NULL;
 int exonGenomeStartPos, exonGenomeEndPos;
 char *exonGenomeStartStr = NULL;
 char *chp, *chp2, *chp3;
 char *exonSizeStr  = NULL;
 int j;
 
 int exonStart, exonEnd;
 int lastStart;
 int lastLen;
 int accumLen;
 int exonLen;
 int dvStart;
 int varEnd;
 struct dnaSeq *dnaseq;
 
 conn= hAllocConn();
 /* NOTE: the query below may not always return single answer, */
 /* and kgProtMap and knownGene alignments may not be identical, so pick the closest one. */
 
 sqlSafef(query,sizeof(query), "select qName, qSize, qStart, qEnd, tName, tSize, tStart, tEnd, blockCount, blockSizes, qStarts, tStarts, strand from %s.%s where qName='%s';",
         database, "kgProtMap", proteinID);
 sr  = sqlMustGetResult(conn, query);
 row = sqlNextRow(sr);
 
 if (row == NULL)
     {
     sqlFreeResult(&sr);
     hFreeConn(&conn);
     return;
     }
 
 while (row != NULL)
     {
     qNameStr        = cloneString(row[0]);
     qSizeStr        = cloneString(row[1]);
     qStartStr       = cloneString(row[2]);
     qEndStr         = cloneString(row[3]);
     tNameStr        = cloneString(row[4]);
     tSizeStr        = cloneString(row[5]);
     tStartStr       = cloneString(row[6]);
     tEndStr         = cloneString(row[7]);
     blockCountStr   = cloneString(row[8]);
     blockSizesStr   = cloneString(row[9]);
     qStartsStr      = cloneString(row[10]);
     tStartsStr      = cloneString(row[11]);
     strand          = cloneString(row[12]);
 
     blockCount 	   = atoi(blockCountStr);
 
 if (sameWord(strand, "+"))
     {
     exonStartStr   = qStartsStr;
     exonGenomeStartStr = tStartsStr;
     exonSizeStr    = blockSizesStr;
     
     accumLen = 0;
     lastStart = 0;
     chp2 = exonGenomeStartStr;
     chp3 = blockSizesStr;
     for (j=0; j< blockCount; j++)
     	{
 	chp = strstr(chp2, ",");
 	*chp = '\0';
 	exonStart = atoi(chp2);
 	chp2 = chp+1;
 	
 	chp = strstr(chp3, ",");
 	*chp = '\0';
 	exonLen = atoi(chp3);
 	chp3 = chp + 1;
 	
 	if (((accumLen + exonLen)/3) >= varStart)
 	    {
 	    dvStart = exonStart + (varStart - accumLen/3)*3;
 	    np++;
 	    dnaseq = hDnaFromSeq(tNameStr, dvStart, dvStart+varLen*3, dnaUpper); 
 	    
 	    /* check if first AA of variant agrees with genomic codon */
 	    if (lookupCodon(dnaseq->dna) == origSeq[0]) 
 		{
 		np0++;
 	        fprintf(outf, "%s\t%d\t%d\t%s\n", tNameStr, dvStart, dvStart+varLen*3, varId);
 		}
 	    else
 		{
 	        /* try next position */
 		dvStart = exonStart + (varStart - accumLen/3)*3 - 1;
 		
 		dnaseq = hDnaFromSeq(tNameStr, dvStart, dvStart+varLen*3, dnaUpper); 
 		if (lookupCodon(dnaseq->dna) == origSeq[0]) 
 			{
 			np1++;
 	        	fprintf(outf, "%s\t%d\t%d\t%s\n", 
 				tNameStr, dvStart, dvStart+varLen*3, varId);
 			}
 		else
 			{
 			/* could further scan wider range for valid AA mapping */
 			}
      		}
 	    break;
 	    }
 	    
 	lastStart = exonStart;
 	accumLen = accumLen + exonLen;
 	}
     }
 else
     {
     exonStartStr   = qStartsStr;
     exonGenomeStartStr = tStartsStr;
     exonSizeStr    = blockSizesStr;
 
     varEnd = aaSeqLen - varStart - varLen;
     
     accumLen = 0;
     lastStart = 0;
     chp2 = exonGenomeStartStr;
     chp3 = blockSizesStr;
     for (j=0; j< blockCount; j++)
     	{
 	chp = strstr(chp2, ",");
 	*chp = '\0';
 	exonStart = atoi(chp2);
 	chp2 = chp+1;
 	
 	chp = strstr(chp3, ",");
 	*chp = '\0';
 	exonLen = atoi(chp3);
 	chp3 = chp + 1;
 
 	if (((accumLen + exonLen)/3) >= varEnd)
 	    {
 	    nn++;
 	    dvStart =  exonStart + (varEnd - accumLen/3)*3;
 	
 	    dnaseq = hDnaFromSeq(tNameStr, dvStart, dvStart+varLen*3, dnaUpper); 
 	    reverseComplement(dnaseq->dna, (long)3);
   	    if (lookupCodon(dnaseq->dna) == origSeq[0])
             	{
 		nn0++;
 	    	fprintf(outf, "%s\t%d\t%d\t%s\n", tNameStr, dvStart, dvStart+varLen*3, varId);
 	    	}
 	    else
 	    	{
 	    	dvStart =  exonStart + (varEnd - accumLen/3)*3 - 1;
 	
 	    	dnaseq = hDnaFromSeq(tNameStr, dvStart, dvStart+varLen*3, dnaUpper); 
 	    	reverseComplement(dnaseq->dna, (long)3);
   	    	if (lookupCodon(dnaseq->dna) == origSeq[0])
             	    {
 		    nn1++;
 
 	    	    fprintf(outf, "%s\t%d\t%d\t%s\n", tNameStr, dvStart, dvStart+varLen*3, varId);
 		    }
 		else
 		    {
 		    /* deal with this later */
 		    }
 		}
 	    
 	    fflush(outf);
 	    break;
 	    }
 	    
 	lastStart = exonStart;
 	accumLen = accumLen + exonLen;
 	}
     }
 
     row = sqlNextRow(sr);
     }
 
 sqlFreeResult(&sr);
 hFreeConn(&conn);
 }
 
 int main(int argc, char *argv[])
 {
 struct sqlConnection *conn2, *conn3;
   
 char query2[256], query3[256];
 struct sqlResult *sr2, *sr3;
 char **row2, **row3;
 char *proteinDb;
 char *database;
 char *acc, *start, *end;
 
 char *accession;
 char *displayID;
 char *division;
 char *extDB;
 char *extAC;
 
 char *proteinDataDate;
 char *genomeRelease;
 int  i;
 
 char *swissID, *pdb;
 char *name, *chrom, *strand, *txStart, *txEnd, *cdsStart, *cdsEnd,
      *exonCount, *exonStarts, *exonEnds;
 int iStart;
 char *varId;
 
 char *aaSeq;
 char *lenStr;
 int  varLen;
 char *orig;
 int aaSeqLen;
 char *outFileName;
 char baseAa, variantAa;
 char *origSeq;
 char *varSeq;
 FILE   *outf;
 
 if (argc != 4) usage();
 proteinDb = argv[1];
 database  = argv[2];
 outFileName   = argv[3];
 
 hSetDb(database);
 outf = mustOpen(outFileName, "w");
 conn2 = sqlConnect(proteinDb);
 conn3 = sqlConnect(proteinDb);
 	
 sqlSafef(query2, sizeof query2, "select * from %s.dv", database);
 
 sr2 = sqlMustGetResult(conn2, query2);
 row2 = sqlNextRow(sr2);
 while (row2 != NULL)
     {
     acc 	= row2[0];
     start 	= row2[1];
     lenStr 	= row2[2];
     origSeq 	= row2[3];
     varSeq      = row2[4];
     varId       = row2[5];
     
     iStart = atoi(start);
     varLen = atoi(lenStr);
    
     /* get protein sequence */
     sqlSafef(query3, sizeof query3, "select val from %s.protein where acc='%s'", proteinDb, acc);
     sr3      = sqlMustGetResult(conn3, query3);
     row3     = sqlNextRow(sr3);
     aaSeq    = row3[0];
     aaSeqLen = strlen(aaSeq);
     
     if (strlen(varSeq) >= 255) 
 	{
         fprintf(stderr,"varSeq longer than 255 AA: \t%s\t%s\n", varSeq, varId);
 	*(varSeq+255) = '\0';
 	}
 
     processVariant(database, acc, aaSeqLen, iStart, varLen, origSeq, varSeq, varId, outf); 
     sqlFreeResult(&sr3);
     
     row2 = sqlNextRow(sr2);
     }
     
 sqlFreeResult(&sr2);
 fclose(outf);
 fflush(stdout);
 
 printf("np=%d np0=%d np1=%d not covered = %d\n", np, np0, np1, np - np0 - np1);fflush(stdout);
 printf("nn=%d nn0=%d nn1=%d not covered = %d\n", nn, nn0, nn1, nn - nn0 - nn1);fflush(stdout);
 return(0);
 }