src/lib/wormdna.c 1.11
1.11 2009/09/23 18:42:29 angie
Fixed compiler warnings from gcc 4.3.3, mostly about system calls whose return values weren't checked and non-literal format strings with no args.
Index: src/lib/wormdna.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/wormdna.c,v
retrieving revision 1.10
retrieving revision 1.11
diff -b -B -U 1000000 -r1.10 -r1.11
--- src/lib/wormdna.c 10 Apr 2005 14:41:26 -0000 1.10
+++ src/lib/wormdna.c 23 Sep 2009 18:42:29 -0000 1.11
@@ -1,1213 +1,1213 @@
/* wormDna - Stuff for finding worm DNA and annotation features.
* This is pretty much the heart of the cobbled-together 'database'
* behind the intronerator.
*
* This file is copyright 2002 Jim Kent, but license is hereby
* granted for all use - public, private or commercial. */
#include "common.h"
#include "dnautil.h"
#include "dnaseq.h"
#include "fa.h"
#include "gdf.h"
#include "nt4.h"
#include "snof.h"
#include "wormdna.h"
#include "cda.h"
#include "sig.h"
#include "dystring.h"
/* Revision-control id string; presumably expanded by the version control system. */
static char const rcsid[] = "$Id$";
/* Data directory paths.  All start out NULL and are filled in together by
 * getDirs() the first time any of them is needed. */
static char *jkwebDir = NULL; /* Value of the JKWEB environment variable, or "" if it is unset. */
static char *cdnaDir = NULL; /* First word of the file <jkwebDir>cdna.dir. */
static char *featDir = NULL; /* First word of the file <jkwebDir>feat.dir. */
static char *nt4Dir = NULL; /* First word of the file <jkwebDir>nt4.dir. */
static char *sangerDir = NULL; /* featDir with "sanger/" appended. */
static char *genieDir = NULL; /* featDir with "genie/" appended. */
static char *xenoDir = NULL; /* First word of the file <jkwebDir>xeno.dir. */
static void getDirs()
/* Look up the directories where our data is stored.  This only does work
 * once: the first call fills in all of the static *Dir variables, and later
 * calls return immediately because jkwebDir is non-NULL by then.
 *
 * Every path is formatted with snprintf() so that an over-long JKWEB value
 * is cut to fit the buffer instead of being written past the end of it. */
{
if (jkwebDir == NULL)
    {
    char buf[512];
    /* featDir is a copy of buf, so this always has room for featDir plus
     * the "sanger/" or "genie/" suffix. */
    char subDir[sizeof(buf) + 16];

    /* Look up directory where directory pointer files are stored
     * in environment string if it's there. */
    if ((jkwebDir = getenv("JKWEB")) == NULL)
        jkwebDir = "";

    /* For each of the .dir files, buf is the file name going in and
     * receives the first word of that file coming out. */
    snprintf(buf, sizeof(buf), "%scdna.dir", jkwebDir);
    firstWordInFile(buf, buf, sizeof(buf));
    cdnaDir = cloneString(buf);

    snprintf(buf, sizeof(buf), "%sfeat.dir", jkwebDir);
    firstWordInFile(buf, buf, sizeof(buf));
    featDir = cloneString(buf);

    snprintf(buf, sizeof(buf), "%snt4.dir", jkwebDir);
    firstWordInFile(buf, buf, sizeof(buf));
    nt4Dir = cloneString(buf);

    /* The gene prediction sets live in fixed subdirectories of featDir. */
    snprintf(subDir, sizeof(subDir), "%ssanger/", featDir);
    sangerDir = cloneString(subDir);
    snprintf(subDir, sizeof(subDir), "%sgenie/", featDir);
    genieDir = cloneString(subDir);

    snprintf(buf, sizeof(buf), "%sxeno.dir", jkwebDir);
    firstWordInFile(buf, buf, sizeof(buf));
    xenoDir = cloneString(buf);
    }
}
char *wormFeaturesDir()
/* Hand back the features directory, loading the directory table first if
 * that has not happened yet.  (Includes trailing slash.) */
{
char *dir;
getDirs();
dir = featDir;
return dir;
}
char *wormChromDir()
/* Hand back the directory that holds the packed (.nt4) chromosome files,
 * loading the directory table first if needed. */
{
char *dir;
getDirs();
dir = nt4Dir;
return dir;
}
char *wormCdnaDir()
/* Hand back the directory with the cDNA data, loading the directory table
 * first if needed. */
{
char *dir;
getDirs();
dir = cdnaDir;
return dir;
}
char *wormSangerDir()
/* Hand back the directory with the Sanger gene predictions, loading the
 * directory table first if needed. */
{
char *dir;
getDirs();
dir = sangerDir;
return dir;
}
char *wormGenieDir()
/* Hand back the directory with the Genie gene predictions, loading the
 * directory table first if needed. */
{
char *dir;
getDirs();
dir = genieDir;
return dir;
}
char *wormXenoDir()
/* Hand back the directory with the cross-species alignments, loading the
 * directory table first if needed. */
{
char *dir;
getDirs();
dir = xenoDir;
return dir;
}
/* Official chromosome names.  A chromosome's index in this table is its
 * "chromIx"; wormOfficialChromName() returns pointers into this table so
 * that callers can compare names by pointer. */
static char *chromIds[] = {"i", "ii", "iii", "iv", "v", "x", "m", };
void wormChromNames(char ***retNames, int *retNameCount)
/* Report the table of worm chromosome names and how many entries it has.
 * *retNames points at the shared static table, not at a copy. */
{
int count = ArraySize(chromIds);
*retNameCount = count;
*retNames = chromIds;
}
int wormChromIx(char *name)
/* Look name up in the chromosome table and return whatever index
 * stringIx() reports for it. */
{
int ix = stringIx(name, chromIds);
return ix;
}
char *wormChromForIx(int ix)
/* Given ix, return worm chromosome official name.  ix must be a valid index
 * into the chromosome table; this is checked with an assert. */
{
/* Strictly less than the table size: the old '<=' test let an index one
 * past the end of the array through. */
assert(ix >= 0 && ix < (int)ArraySize(chromIds));
return chromIds[ix];
}
char *wormOfficialChromName(char *name)
/* Map a chromosome name onto the matching entry of our own name table, so
 * that callers can compare chromosome names by pointer instead of by
 * string.  Returns NULL when wormChromIx() reports a negative index. */
{
int ix = wormChromIx(name);
return (ix >= 0) ? chromIds[ix] : NULL;
}
/* Cached handles for cDNA lookup; opened by wormCdnaCache() and closed by
 * wormCdnaUncache(). */
static struct snof *cdnaSnof = NULL; /* Index opened on <cdnaDir>allcdna. */
static FILE *cdnaFa = NULL; /* <cdnaDir>allcdna.fa, opened "rb". */
static void wormCdnaCache()
/* Make sure the cDNA index and the cDNA fa file are open.  Does nothing
 * beyond getDirs() when the index is already open. */
{
char path[512];

getDirs();
if (cdnaSnof != NULL)
    return;

sprintf(path, "%s%s", cdnaDir, "allcdna");
cdnaSnof = snofMustOpen(path);

sprintf(path, "%s%s", cdnaDir, "allcdna.fa");
cdnaFa = mustOpen(path, "rb");
}
void wormCdnaUncache()
/* Tear down structure for reading cDNAs: closes the cDNA index and the
 * cDNA fa file.  They are reopened on demand by the next cDNA lookup.
 *
 * cdnaDir is deliberately left alone.  It is set exactly once by getDirs(),
 * which never runs its setup again, so freeing it here used to leave it
 * NULL for the rest of the process and break every later use of the cDNA
 * directory (including reopening this cache). */
{
snofClose(&cdnaSnof);
carefulClose(&cdnaFa);
}
void wormFreeCdnaInfo(struct wormCdnaInfo *ci)
/* Release the mother string that the info's other fields point into, then
 * zero the whole structure.  The structure itself is not freed (it normally
 * isn't dynamically allocated). */
{
char *mother = ci->motherString;
freeMem(mother);
zeroBytes(ci, sizeof(*ci));
}
static char *realInfoString(char *s)
/* A field that is exactly "?" is the placeholder for "no value": return
 * NULL for it, and return s itself for anything else. */
{
if (s[0] != '?')
    return s;
if (s[1] != 0)
    return s;
return NULL;
}
static void parseRestOfCdnaInfo(char *textInfo, struct wormCdnaInfo *retInfo)
/* Break the '|' separated textInfo into fields and store them in retInfo.
 * textInfo is chopped in place, and the string fields of retInfo point into
 * it.  A field that is just "?" counts as missing.  Aborts if there are
 * fewer than 8 fields. */
{
char *fields[32];
char *val;
int fieldCount = chopString(textInfo, "|", fields, ArraySize(fields));

if (fieldCount < 8)
    errAbort("Expecting at least 8 fields in cDNA database, got %d", fieldCount);

/* Field 0: orientation, only the first character is kept. */
val = realInfoString(fields[0]);
if (val != NULL)
    retInfo->orientation = val[0];

/* Fields 1 and 2: gene and product. */
retInfo->gene = realInfoString(fields[1]);
retInfo->product = realInfoString(fields[2]);

/* Field 3: CDS start and end separated by dots.  A leading '<' on the
 * start or '>' on the end marks that side as not known. */
val = realInfoString(fields[3]);
if (val != NULL)
    {
    char *ends[2];
    if (chopString(val, ".", ends, ArraySize(ends)) == 2)
        {
        retInfo->knowStart = TRUE;
        retInfo->knowEnd = TRUE;
        if (*ends[0] == '<')
            {
            retInfo->knowStart = FALSE;
            ends[0] += 1;
            }
        if (*ends[1] == '>')
            {
            retInfo->knowEnd = FALSE;
            ends[1] += 1;
            }
        retInfo->cdsStart = atoi(ends[0]);
        retInfo->cdsEnd = atoi(ends[1]);
        }
    }

/* Field 4: developmental stage. */
val = realInfoString(fields[4]);
if (val != NULL)
    {
    if (sameString("embryo", val))
        retInfo->isEmbryonic = TRUE;
    else if (sameString("adult", val))
        retInfo->isAdult = TRUE;
    }

/* Field 5: sex. */
val = realInfoString(fields[5]);
if (val != NULL)
    {
    if (sameString("herm", val))
        retInfo->isHermaphrodite = TRUE;
    else if (sameString("male", val))
        retInfo->isMale = TRUE;
    }

/* Field 6 is reserved and currently unused. */

/* Field 7: free text description. */
retInfo->description = realInfoString(fields[7]);
}
void wormFaCommentIntoInfo(char *faComment, struct wormCdnaInfo *retInfo)
/* Turn the comment line of a cDNA .fa record into a filled in retInfo.
 * Does nothing when retInfo is NULL.  faComment is chopped up in place and
 * becomes retInfo->motherString, which the other string fields point into.
 * Aborts if the line has no space in it. */
{
char *rest;

if (retInfo == NULL)
    return;
zeroBytes(retInfo, sizeof(*retInfo));

/* The first word is the name; everything after the first space is the
 * '|' separated info. */
rest = strchr(faComment, ' ');
if (rest == NULL)
    errAbort("Expecting lots of info, just got %s", faComment);
*rest = 0;
rest += 1;

retInfo->motherString = faComment;
retInfo->name = faComment + 1;   /* Skips the first character of the line. */
parseRestOfCdnaInfo(rest, retInfo);
}
boolean wormCdnaInfo(char *name, struct wormCdnaInfo *retInfo)
/* Get info about cDNA sequence.  Looks name up in the cDNA index, reads the
 * comment line of its record from the cDNA fa file and parses that into
 * retInfo.  Returns FALSE if name is not in the index.  On success
 * retInfo->motherString is a clone of the comment line; wormFreeCdnaInfo()
 * releases it. */
{
char commentBuf[512];
char *comment;
long offset;
/* Make sure cdnaSnof and cdnaFa are open. */
wormCdnaCache();
if (!snofFindOffset(cdnaSnof, name, &offset))
return FALSE;
fseek(cdnaFa, offset, SEEK_SET);
/* Revision 1.11 swaps the fgets() call, whose return value was not
 * checked, for mustGetLine(). */
-fgets(commentBuf, sizeof(commentBuf), cdnaFa);
+mustGetLine(cdnaFa, commentBuf, sizeof(commentBuf));
if (commentBuf[0] != '>')
errAbort("Expecting line starting with > in cDNA fa file.\nGot %s", commentBuf);
/* The parser chops its input in place and keeps pointers into it, so it
 * gets a heap copy rather than the stack buffer. */
comment = cloneString(commentBuf);
/* NOTE(review): when retInfo is NULL the call below does nothing and the
 * clone is never freed. */
wormFaCommentIntoInfo(comment, retInfo);
return TRUE;
}
boolean wormCdnaSeq(char *name, struct dnaSeq **retDna, struct wormCdnaInfo *retInfo)
/* Get a single worm cDNA sequence. Optionally (if retInfo is non-null) get additional
 * info about the sequence.  Returns FALSE if name is not in the cDNA index
 * or the record can't be read. */
{
long offset;
/* Starts out NULL: when retInfo is NULL nobody fills this in, and it is
 * still handed to wormFaCommentIntoInfo() below. */
char *faComment = NULL;
char **pFaComment = (retInfo == NULL ? NULL : &faComment);

wormCdnaCache();
if (!snofFindOffset(cdnaSnof, name, &offset))
    return FALSE;
fseek(cdnaFa, offset, SEEK_SET);
if (!faReadNext(cdnaFa, name, TRUE, pFaComment, retDna))
    return FALSE;
/* Does nothing when retInfo is NULL. */
wormFaCommentIntoInfo(faComment, retInfo);
return TRUE;
}
struct wormFeature *newWormFeature(char *name, char *chrom, int start, int end, char typeByte)
/* Build a feature on the heap.  The name is copied into the tail of the
 * allocation, which is sized as the structure plus strlen(name); chrom is
 * stored as a pointer, not copied. */
{
struct wormFeature *feat;
int bytes = sizeof(*feat) + strlen(name);

feat = needMem(bytes);
strcpy(feat->name, name);
feat->typeByte = typeByte;
feat->chrom = chrom;
feat->start = start;
feat->end = end;
return feat;
}
static struct wormFeature *scanChromOffsetFile(char *dir, char *suffix,
    bits32 signature, int nameOffset, char *chromId, int start, int end,
    int addEnd)
/* Read the file <dir><chromId><suffix> and return a list of the features in
 * it that fall within start-end, in file order.  The file is a signature, an
 * entry count and a name size, followed by fixed size entries.  Each entry
 * begins with two ints (start and end) and has its name nameOffset bytes in;
 * the byte just before the name is the feature's type byte.  addEnd is
 * added to each stored end.  Scanning stops at the first entry that starts
 * past end. */
{
char path[512];
FILE *f;
bits32 sig, entryCount, nameSize;
bits32 entryIx;
int recSize;
int *rec;
char *recName;
struct wormFeature *list = NULL;

sprintf(path, "%s%s%s", dir, chromId, suffix);
f = mustOpen(path, "rb");
mustReadOne(f, sig);
if (sig != signature)
    errAbort("Bad signature on %s", path);
mustReadOne(f, entryCount);
mustReadOne(f, nameSize);

/* One buffer is reused for every entry.  It is a byte longer than an entry
 * and that last byte is never read into. */
recSize = nameSize + nameOffset;
rec = needMem(recSize + 1);
recName = (char *)rec + nameOffset;

entryIx = 0;
while (entryIx < entryCount)
    {
    mustRead(f, rec, recSize);
    if (rec[0] > end)
        break;
    if (rec[1] >= start)
        {
        struct wormFeature *feat = newWormFeature(recName, chromId,
            rec[0], rec[1]+addEnd, recName[-1]);
        slAddHead(&list, feat);
        }
    ++entryIx;
    }

slReverse(&list);
fclose(f);
freeMem(rec);
return list;
}
struct wormFeature *wormCdnasInRange(char *chromId, int start, int end)
/* Return the cDNAs that overlap the range, as a list of wormFeature.  Other
 * code in this file releases such lists with slFreeList(). */
{
/* The data comes from the chromosome's .cdo file in the cDNA directory.
 * Names sit after the two ints and the type byte of each entry. */
int nameOffset = 2*sizeof(int)+1;

getDirs();
return scanChromOffsetFile(cdnaDir, ".cdo", cdoSig, nameOffset,
    chromId, start, end, 0);
}
struct wormFeature *wormSomeGenesInRange(char *chromId, int start, int end, char *gdfDir)
/* Return the genes that overlap the range according to one particular set
 * of gene predictions.  gdfDir is the directory prefix of that set; the
 * chromosome's .pgo file in it is what gets scanned. */
{
int nameOffset = 2*sizeof(int)+1;

return scanChromOffsetFile(gdfDir, ".pgo", pgoSig, nameOffset,
    chromId, start, end, 0);
}
struct wormFeature *wormGenesInRange(char *chromId, int start, int end)
/* Return the genes that overlap the range, using the Sanger set of gene
 * predictions (the .pgo files in the Sanger directory). */
{
struct wormFeature *genes;

getDirs();
genes = wormSomeGenesInRange(chromId, start, end, sangerDir);
return genes;
}
struct wormFeature *wormCosmidsInRange(char *chromId, int start, int end)
/* Return the cosmids that overlap the range.  The data comes from the
 * chromosome's .coo file in the features directory, which carries the same
 * signature as a .pgo file.  Each returned end is the stored end plus one. */
{
int nameOffset = 2*sizeof(int)+1;

getDirs();
return scanChromOffsetFile(featDir, ".coo", pgoSig, nameOffset,
    chromId, start, end, 1);
}
FILE *wormOpenGoodAli()
/* Open good.ali in the cDNA directory through cdaOpenVerify() and return
 * the file handle.  (You can then cdaLoadOne() it.) */
{
char path[512];
FILE *f;

getDirs();
sprintf(path, "%sgood.ali", cdnaDir);
f = cdaOpenVerify(path);
return f;
}
struct cdaAli *wormCdaAlisInRange(char *chromId, int start, int end)
/* Return list of cdna alignments that overlap range.  The chromosome's .alx
 * index in the cDNA directory is walked entry by entry; each entry is a
 * start, an end and a position in the good.ali file.  Entries ending at or
 * before start are skipped, and the walk stops at the first entry starting
 * at or after end.  The list is returned in index order. */
{
struct cdaAli *list = NULL, *el;
char fileName[512];
FILE *ixFile, *aliFile;
bits32 sig;
int s, e;
long fpos;

/* Opening the alignment file also runs getDirs(), so cdnaDir is set by
 * the time it is used below. */
aliFile = wormOpenGoodAli();
sprintf(fileName, "%s%s.alx", cdnaDir, chromId);
ixFile = mustOpen(fileName, "rb");
mustReadOne(ixFile, sig);
if (sig != alxSig)
    errAbort("Bad signature on %s", fileName);
for (;;)
    {
    /* Running out of index entries is the normal way to finish. */
    if (!readOne(ixFile, s))
        break;
    mustReadOne(ixFile, e);
    mustReadOne(ixFile, fpos);
    if (e <= start)
        continue;
    if (s >= end)
        break;
    /* cdaLoadOne() supplies the element, so nothing is allocated here.
     * (An AllocVar() used to precede it and was leaked every time.) */
    fseek(aliFile, fpos, SEEK_SET);
    el = cdaLoadOne(aliFile);
    if (el == NULL)
        errAbort("Truncated cdnaAli file");
    slAddHead(&list, el);
    }
slReverse(&list);
fclose(aliFile);
fclose(ixFile);
return list;
}
boolean nextWormCdnaAndInfo(struct wormCdnaIterator *it, struct dnaSeq **retSeq,
    struct wormCdnaInfo *retInfo)
/* Read the next record from the iterator's fa file, returning the sequence
 * in *retSeq and the parsed comment line in retInfo.  Returns FALSE when
 * faReadNext() has nothing more to give. */
{
char *comment;
boolean gotOne = faReadNext(it->faFile, "unknown", TRUE, &comment, retSeq);

if (!gotOne)
    return FALSE;
wormFaCommentIntoInfo(comment, retInfo);
return TRUE;
}
struct dnaSeq *nextWormCdna(struct wormCdnaIterator *it)
/* Read the next sequence from the iterator's fa file and return whatever
 * faReadOneDnaSeq() gives back for it. */
{
struct dnaSeq *seq = faReadOneDnaSeq(it->faFile, "unknown", TRUE);
return seq;
}
boolean wormSearchAllCdna(struct wormCdnaIterator **retSi)
/* Create an iterator over every worm cDNA: allocates it, opens allcdna.fa in
 * the cDNA directory for it and stores it in *retSi.  Always returns TRUE.
 * Release the iterator with freeWormCdnaIterator(). */
{
char path[512];
struct wormCdnaIterator *iter;

getDirs();
sprintf(path, "%s%s", cdnaDir, "allcdna.fa");

iter = needMem(sizeof(*iter));
iter->faFile = mustOpen(path, "rb");
*retSi = iter;
return TRUE;
}
void freeWormCdnaIterator(struct wormCdnaIterator **pIt)
/* Close the fa file of an iterator made by wormSearchAllCdna() and free the
 * iterator through freez().  Does nothing when *pIt is NULL. */
{
struct wormCdnaIterator *iter = *pIt;

if (iter == NULL)
    return;
carefulClose(&iter->faFile);
freez(pIt);
}
static boolean isAllAlpha(char *s)
/* Returns TRUE if every character in string is a letter.  An empty string
 * counts as all letters. */
{
for ( ; *s != 0; ++s)
    {
    /* isalpha() is only defined for unsigned char values and EOF, so go
     * through unsigned char in case plain char is signed. */
    if (!isalpha((unsigned char)*s))
        return FALSE;
    }
return TRUE;
}
static boolean isAllDigit(char *s)
/* Returns TRUE if every character in string is a digit.  An empty string
 * counts as all digits. */
{
for ( ; *s != 0; ++s)
    {
    /* isdigit() is only defined for unsigned char values and EOF, so go
     * through unsigned char in case plain char is signed. */
    if (!isdigit((unsigned char)*s))
        return FALSE;
    }
return TRUE;
}
boolean wormIsOrfName(char *in)
/* Decide whether the input could be an ORF name.  The only thing looked at
 * is whether it contains a '.' anywhere. */
{
char *dot = strchr(in, '.');
return (dot != NULL);
}
boolean wormIsGeneName(char *name)
/* See if it looks like a worm gene name - that is
 * abc-12
 * letters followed by a dash followed by a number.
 * name itself is not modified; a private copy is chopped up.  A name too
 * long for that copy is reported as not a gene name. */
{
char buf[128];
char *parts[2];

/* strncpy() would leave buf without a terminating zero for a name this
 * long, and the chopping below would then run off the end of it. */
if (strlen(name) >= sizeof(buf))
    return FALSE;
strcpy(buf, name);

/* First just count the dash separated parts. */
if (chopString(buf, "-", NULL, 0) != 2)
    return FALSE;
chopString(buf, "-", parts, 2);
return isAllAlpha(parts[0]) && isAllDigit(parts[1]);
}
struct slName *wormGeneToOrfNames(char *name)
/* Returns list of cosmid.N type ORF names that are known by abc-12 type name.
 * The synonyms come from the file "syn" in the features directory: the
 * first line whose first word is exactly name supplies the list, which is
 * the remaining words of that line in order.  Returns NULL if there is no
 * such line.  Note that name is converted to lower case in place. */
{
struct slName *synList = NULL;
char synFileName[512];
FILE *synFile;
char lineBuf[128];
int nameLen = strlen(name);

/* genes are supposed to be lower case. */
tolowers(name);

/* Open synonym file and loop through each line of it */
sprintf(synFileName, "%ssyn", wormFeaturesDir());
if ((synFile = fopen(synFileName, "r")) == NULL)
    errAbort("Can't find synonym file '%s'. (errno: %d)\n", synFileName, errno);
while (fgets(lineBuf, ArraySize(lineBuf), synFile))
    {
    /* If first part of line matches chop up line. */
    if (strncmp(name, lineBuf, nameLen) == 0)
        {
        char *syns[32];
        int count;
        count = chopString(lineBuf, whiteSpaceChopper, syns, ArraySize(syns));
        /* Looks like we got a synonym. Add all the aliases.  The count is
         * checked first because a line can chop into no words at all (an
         * empty name matches a blank line), and syns[0] is not set then. */
        if (count > 0 && strcmp(name, syns[0]) == 0)
            {
            int i;
            for (i=1; i<count; ++i)
                slAddTail(&synList, newSlName(syns[i]));
            break;
            }
        }
    }
fclose(synFile);
return synList;
}
char *wormGeneFirstOrfName(char *geneName)
/* Return a freshly cloned copy of the first ORF synonym of a gene, or NULL
 * if the gene has no synonyms. */
{
struct slName *orfs = wormGeneToOrfNames(geneName);
char *first = NULL;

if (orfs != NULL)
    {
    first = cloneString(orfs->name);
    slFreeList(&orfs);
    }
return first;
}
boolean wormGeneForOrf(char *orfName, char *geneNameBuf, int bufSize)
/* Look for gene type (unc-12 or something) synonym for cosmid.N name.
 * Scans the file "orf2gene" in the features directory for the first line
 * that starts with orfName followed by a space and has a second word, and
 * copies that second word into geneNameBuf (which holds bufSize bytes).
 * Returns TRUE if such a line was found. */
{
FILE *f;
char fileName[512];
char lineBuf[512];
int nameLen = strlen(orfName);
boolean ok = FALSE;

sprintf(fileName, "%sorf2gene", wormFeaturesDir());
f = mustOpen(fileName, "r");
while (fgets(lineBuf, sizeof(lineBuf), f))
    {
    if (strncmp(lineBuf, orfName, nameLen) == 0 && lineBuf[nameLen] == ' ')
        {
        char *words[2];
        int wordCount;
        wordCount = chopLine(lineBuf, words);
        /* A line with the ORF but no gene after it has no words[1]. */
        if (wordCount < 2)
            continue;
        assert((int)strlen(words[1]) < bufSize);
        if (bufSize > 0)
            {
            strncpy(geneNameBuf, words[1], bufSize);
            /* strncpy() does not terminate the copy when it fills the
             * buffer, which can happen once asserts are compiled out. */
            geneNameBuf[bufSize-1] = 0;
            }
        ok = TRUE;
        break;
        }
    }
fclose(f);
return ok;
}
boolean wormInfoForGene(char *orfName, struct wormCdnaInfo *retInfo)
/* Look up the info line for an ORF in the file "orfInfo" in the features
 * directory and parse it into retInfo.  A gene name (abc-12 style) is also
 * accepted and is translated to its first ORF synonym first.  Returns TRUE
 * if a line for the ORF was found, FALSE if not.  On success
 * retInfo->motherString is a clone of the line; wormFreeCdnaInfo() releases
 * it. */
{
FILE *f;
char fileName[512];
char lineBuf[512];
int nameLen;
char *info = NULL;
char *synName = NULL;

/* Make this one work for orfs as well as gene names */
if (wormIsGeneName(orfName))
    {
    synName = wormGeneFirstOrfName(orfName);
    if (synName != NULL)
        orfName = synName;
    }
sprintf(fileName, "%sorfInfo", wormFeaturesDir());
nameLen = strlen(orfName);
f = mustOpen(fileName, "r");
while (fgets(lineBuf, sizeof(lineBuf), f))
    {
    if (strncmp(lineBuf, orfName, nameLen) == 0 && lineBuf[nameLen] == ' ')
        {
        info = cloneString(lineBuf);
        break;
        }
    }
freeMem(synName);
fclose(f);
if (info == NULL)
    return FALSE;
/* The parser takes ownership of the clone only when there is a retInfo to
 * hang it on; with a NULL retInfo it does nothing, so free it here. */
if (retInfo == NULL)
    freeMem(info);
else
    wormFaCommentIntoInfo(info, retInfo);
return TRUE;
}
boolean getWormGeneDna(char *name, DNA **retDna, boolean upcExons)
/* Get the DNA associated with a gene. Optionally upper case exons.
 * name may be a gene name (abc-12 style), which is translated to its first
 * ORF synonym, or an ORF name.  Names starting with "g-" are looked up in
 * the Genie predictions, everything else in the Sanger ones.  Returns FALSE
 * if the gene is not found.  On success *retDna is the DNA spanning the
 * gene's extents, reverse complemented for a '-' strand gene. */
{
struct gdfGene *g;
struct slName *syn = NULL;
long lstart, lend;
int start, end;
int dnaSize;
DNA *dna;
struct wormGdfCache *gdfCache;

/* Translate biologist type name to cosmid.N name */
if (wormIsGeneName(name))
    {
    syn = wormGeneToOrfNames(name);
    if (syn != NULL)
        name = syn->name;
    }
if (strncmp(name, "g-", 2) == 0)
    gdfCache = &wormGenieGdfCache;
else
    gdfCache = &wormSangerGdfCache;
g = wormGetSomeGdfGene(name, gdfCache);
/* name may point into the synonym list and is not needed past the lookup,
 * so the list can go now.  (It used to be leaked.) */
slFreeList(&syn);
if (g == NULL)
    return FALSE;

gdfGeneExtents(g, &lstart, &lend);
start = lstart;
end = lend;
dnaSize = end-start;
*retDna = dna = wormChromPart(chromIds[g->chromIx], start, dnaSize);

/* Make the gene's coordinates relative to the start of dna. */
gdfOffsetGene(g, -start);
if (g->strand == '-')
    {
    reverseComplement(dna, dnaSize);
    gdfRcGene(g, dnaSize);
    }
if (upcExons)
    {
    int i;
    struct gdfDataPoint *pt = g->dataPoints;
    /* Data points are taken in pairs that bracket an exon; a trailing
     * unpaired point is ignored rather than read past. */
    for (i=0; i+1 < g->dataCount; i += 2)
        {
        toUpperN(dna + pt[i].start, pt[i+1].start - pt[i].start);
        }
    }
gdfFreeGene(g);
return TRUE;
}
boolean getWormGeneExonDna(char *name, DNA **retDna)
/* Get the DNA associated with a gene, without introns.
 * name is resolved the same way as in getWormGeneDna().  Returns FALSE if
 * the gene is not found.  On success *retDna is a freshly cloned string made
 * of the gene's exons joined together, taken from the reverse complement
 * for a '-' strand gene. */
{
struct gdfGene *g;
struct slName *syn = NULL;
long lstart, lend;
int start, end;
int dnaSize;
DNA *dna;
int i;
struct gdfDataPoint *pt = NULL;
struct wormGdfCache *gdfCache;
struct dyString *dy;

/* Translate biologist type name to cosmid.N name */
if (wormIsGeneName(name))
    {
    syn = wormGeneToOrfNames(name);
    if (syn != NULL)
        name = syn->name;
    }
if (strncmp(name, "g-", 2) == 0)
    gdfCache = &wormGenieGdfCache;
else
    gdfCache = &wormSangerGdfCache;
g = wormGetSomeGdfGene(name, gdfCache);
/* name may point into the synonym list and is not needed past the lookup,
 * so the list can go now.  (It used to be leaked.) */
slFreeList(&syn);
if (g == NULL)
    return FALSE;

gdfGeneExtents(g, &lstart, &lend);
start = lstart;
end = lend;
dnaSize = end-start;
dna = wormChromPart(chromIds[g->chromIx], start, dnaSize);

/* Make the gene's coordinates relative to the start of dna. */
gdfOffsetGene(g, -start);
if (g->strand == '-')
    {
    reverseComplement(dna, dnaSize);
    gdfRcGene(g, dnaSize);
    }

/* The string buffer is only made once the gene is known to exist, so the
 * failure return above has nothing to clean up. */
dy = newDyString(1000);
pt = g->dataPoints;
/* Data points are taken in pairs that bracket an exon; a trailing unpaired
 * point is ignored rather than read past. */
for (i=0; i+1 < g->dataCount; i += 2)
    {
    dyStringAppendN(dy, (dna+pt[i].start), (pt[i+1].start - pt[i].start));
    }
*retDna = cloneString(dy->string);
dyStringFree(&dy);
/* The caller only gets the exon copy; the genomic stretch is ours to free. */
freeMem(dna);
gdfFreeGene(g);
return TRUE;
}
static void makeChromFileName(char *chromId, char *buf)
/* Write the name of the .nt4 file for chromId into buf.  No size is passed,
 * so buf has to be big enough; callers in this file use 512 bytes. */
{
char *dir;

getDirs();
dir = nt4Dir;
sprintf(buf, "%s%s.nt4", dir, chromId);
}
void wormLoadNt4Genome(struct nt4Seq ***retNt4Seq, int *retNt4Count)
/* Load the .nt4 file of every chromosome in the chromosome table.  Returns
 * an allocated array with one loaded sequence per chromosome, in table
 * order, and the number of entries.  Free with wormFreeNt4Genome(). */
{
char path[512];
int chromCount = ArraySize(chromIds);
struct nt4Seq **seqs = needMem(chromCount*sizeof(*seqs));
int ix = 0;

while (ix < chromCount)
    {
    char *chrom = chromIds[ix];
    makeChromFileName(chrom, path);
    seqs[ix] = loadNt4(path, chrom);
    ++ix;
    }
*retNt4Count = chromCount;
*retNt4Seq = seqs;
}
void wormFreeNt4Genome(struct nt4Seq ***pNt4Seq)
/* Free the per-chromosome sequences made by wormLoadNt4Genome() and then
 * the array holding them, through freez().  Does nothing when *pNt4Seq is
 * NULL. */
{
struct nt4Seq **seqs = *pNt4Seq;
int ix;

if (seqs != NULL)
    {
    for (ix = 0; ix < ArraySize(chromIds); ++ix)
        freeNt4(&seqs[ix]);
    freez(pNt4Seq);
    }
}
int wormChromSize(char *chrom)
/* Return the number of bases in a worm chromosome.  Aborts if chrom is not
 * in the chromosome table.  Sizes are remembered after the first lookup; a
 * remembered size of zero means "not looked up yet". */
{
static int sizes[ArraySize(chromIds)];
int ix = wormChromIx(chrom);

if (ix < 0)
    errAbort("%s isn't a chromosome", chrom);
if (sizes[ix] == 0)
    {
    /* Not cached yet, so ask the chromosome's .nt4 file. */
    char fileName[512];
    makeChromFileName(chromIds[ix], fileName);
    sizes[ix] = nt4BaseCount(fileName);
    }
return sizes[ix];
}
DNA *wormChromPart(char *chromId, int start, int size)
/* Load size bases beginning at start from the .nt4 file of a chromosome and
 * return them. */
{
char path[512];
DNA *dna;

makeChromFileName(chromId, path);
dna = nt4LoadPart(path, start, size);
return dna;
}
DNA *wormChromPartExonsUpper(char *chromId, int start, int size)
/* Return part of a worm chromosome with exons in upper case.  The exons
 * are those of the Sanger genes that overlap the part; nameless clusters
 * are skipped, and so is any feature whose gene can't be looked up. */
{
DNA *dna = wormChromPart(chromId, start, size);
struct wormFeature *geneFeat = wormGenesInRange(chromId, start, start+size);
struct wormFeature *feat;

for (feat = geneFeat; feat != NULL; feat = feat->next)
    {
    char *name = feat->name;
    if (!wormIsNamelessCluster(name))
        {
        struct gdfGene *gene = wormGetGdfGene(name);
        /* The lookup returns NULL for a name that is not in the gene
         * index; there is nothing to upper case in that case. */
        if (gene != NULL)
            {
            gdfUpcExons(gene, feat->start, dna, size, start);
            gdfFreeGene(gene);
            }
        }
    }
slFreeList(&geneFeat);
return dna;
}
void wormClipRangeToChrom(char *chrom, int *pStart, int *pEnd)
/* Adjust a range so that it lies inside the chromosome.  A reversed range
 * is put the right way round first.  A range that sticks out at either end
 * is slid back inside with its size kept, and only a range bigger than the
 * whole chromosome ends up truncated. */
{
int chromEnd = wormChromSize(chrom);
int s = *pStart;
int e = *pEnd;

/* Put the ends in order. */
if (s > e)
    {
    int swap = e;
    e = s;
    s = swap;
    }

/* Slide right if we start before the chromosome does. */
if (s < 0)
    {
    e -= s;
    s = 0;
    }

/* Slide left if we run off the end. */
if (e > chromEnd)
    {
    s -= e - chromEnd;
    e = chromEnd;
    }

/* Sliding left can push the start negative when the range is bigger than
 * the chromosome. */
if (s < 0)
    s = 0;

*pStart = s;
*pEnd = e;
}
boolean wormParseChromRange(char *in, char **retChromId, int *retStart, int *retEnd)
/* Chop up a string representation of a range within a chromosome and put the
 * pieces into the return variables. Return FALSE if it isn't formatted right.
 * The input is taken to be three words separated by dashes, colons or white
 * space: a chromosome name and two numbers.  On success *retChromId is the
 * official name of the chromosome, and the range has been put in order and
 * clipped to the chromosome.  in itself is not modified; input too long for
 * the private copy is reported as badly formatted. */
{
char *words[5];
int wordCount;
char *chromId;
char buf[128];

/* strncpy() would leave buf without a terminating zero for input this
 * long, and the chopping below would then run off the end of it. */
if (strlen(in) >= sizeof(buf))
    return FALSE;
strcpy(buf, in);

wordCount = chopString(buf, "- \t\r\n:", words, ArraySize(words));
if (wordCount != 3)
    return FALSE;
chromId = wormOfficialChromName(words[0]);
if (chromId == NULL)
    return FALSE;
/* isdigit() is only defined for unsigned char values and EOF. */
if (!isdigit((unsigned char)words[1][0]) || !isdigit((unsigned char)words[2][0]))
    return FALSE;
*retChromId = chromId;
*retStart = atoi(words[1]);
*retEnd = atoi(words[2]);
wormClipRangeToChrom(chromId, retStart, retEnd);
return TRUE;
}
boolean wormIsChromRange(char *in)
/* Decide whether the input is a properly formatted chromosome range by
 * trying to parse it as one and throwing the parsed pieces away. */
{
char *ignoredChrom;
int ignoredStart, ignoredEnd;

return wormParseChromRange(in, &ignoredChrom, &ignoredStart, &ignoredEnd);
}
boolean wormFixupOrfName(char *name)
/* Turn something into a proper cosmid.# style name. Return FALSE if it can't be done.
 * That is the case only when name has no '.' in it.  Otherwise name is
 * changed in place: everything before the last '.' becomes upper case, and
 * when a digit follows that '.' everything after it becomes lower case. */
{
char *dot = strrchr(name, '.');

if (dot == NULL)
    return FALSE;
toUpperN(name, dot-name); /* First part always upper case. */
/* Without a digit after the dot it is a nameless cluster and the suffix is
 * left as it is.  isdigit() is only defined for unsigned char values and
 * EOF, hence the cast. */
if (isdigit((unsigned char)dot[1]))
    tolowers(dot+1); /* Suffix is lower case. */
return TRUE;
}
boolean wormIsAltSplicedName(char *name)
/* Is name in right form to be an isoform?  That means the last '.' in it is
 * followed by a digit and the very last character of the name is a letter. */
{
char *dot = strrchr(name, '.');

if (dot == NULL)
    return FALSE;
/* The ctype functions are only defined for unsigned char values and EOF,
 * hence the casts. */
if (!isdigit((unsigned char)dot[1]))
    return FALSE;
/* isalpha() may return any nonzero value for a letter; compare against zero
 * so the caller gets a plain truth value. */
return isalpha((unsigned char)dot[strlen(dot)-1]) != 0;
}
static void makeIsoformBaseName(char *name)
/* If name has the form of an isoform name, cut off its last character in
 * place.  Any other name is left alone. */
{
if (!wormIsAltSplicedName(name))
    return;
name[strlen(name)-1] = 0;
}
static boolean findAltSpliceRange(char *name, struct snof *snof, FILE *f,
char **retChrom, int *retStart, int *retEnd, char *retStrand)
/* Return range of chromosome covered by a gene and all of its isoforms.
 * snof is the index and f the file it indexes.  Every index entry whose
 * isoform base name equals that of name contributes its range; the result
 * is the smallest start and largest end seen.  Returns FALSE if no index
 * entry starts with the base name. */
{
/* NOTE(review): name and the index names are copied into these 64 byte
 * buffers with strcpy() and no length check. */
char baseName[64];
char bName[64];
int snIx, maxIx;
/* Start with an inside-out range so that the first real range replaces it. */
int start = 0x7fffffff;
int end = -start;
char lineBuf[128];
char *words[3];
int wordCount;
int baseNameSize;
strcpy(baseName, name);
makeIsoformBaseName(baseName);
baseNameSize = strlen(baseName);
if (!snofFindFirstStartingWith(snof, baseName, baseNameSize, &snIx))
return FALSE;
maxIx = snofElementCount(snof);
/* Walk forward from the first entry with the right prefix until the prefix
 * stops matching. */
for (;snIx < maxIx; ++snIx)
{
long offset;
char *geneName;
snofNameOffsetAtIx(snof, snIx, &geneName, &offset);
if (strncmp(geneName, baseName, baseNameSize) != 0)
break;
/* Sharing a prefix is not enough; the base names have to be identical. */
strcpy(bName, geneName);
makeIsoformBaseName(bName);
if (sameString(baseName, bName))
{
int s, e;
fseek(f, offset, SEEK_SET);
/* Revision 1.11 swaps the fgets() call, whose return value was not
 * checked, for mustGetLine(). */
- fgets(lineBuf, sizeof(lineBuf), f);
+ mustGetLine(f, lineBuf, sizeof(lineBuf));
/* The line is used as three words: the range, the strand, and a third
 * word that is not looked at here. */
wordCount = chopLine(lineBuf, words);
assert(wordCount == 3);
/* NOTE(review): the result of the parse is not checked; s and e stay
 * unset if it fails. */
wormParseChromRange(words[0], retChrom, &s, &e);
*retStrand = words[1][0];
if (start > s)
start = s;
if (end < e)
end = e;
}
}
/* NOTE(review): if no entry had an identical base name, the inside-out
 * starting values are returned along with TRUE. */
*retStart = start;
*retEnd = end;
return TRUE;
}
boolean wormGeneRange(char *name, char **retChrom, char *retStrand, int *retStart, int *retEnd)
/* Return chromosome position of a chrom range, gene, ORF, cosmid, or nameless cluster.
 * The forms are tried in this order: a chromosome range (strand is then
 * '.'), a gene name (translated to its first ORF synonym), anything with a
 * '.' in it (looked up in c2g, covering all isoforms), and finally a cosmid
 * (looked up in c2c).  Note that name may be case converted in place. */
{
/* The c2g and c2c indexes and files are opened on first use and kept open. */
static struct snof *c2gSnof = NULL, *c2cSnof = NULL;
static FILE *c2gFile = NULL, *c2cFile = NULL;
long offset;
char fileName[512];
struct slName *syn = NULL;
boolean ok;
if (wormIsChromRange(name))
{
*retStrand = '.';
return wormParseChromRange(name, retChrom, retStart, retEnd);
}
getDirs();
/* Translate biologist type name to cosmid.N name */
if (wormIsGeneName(name))
{
syn = wormGeneToOrfNames(name);
if (syn != NULL)
{
/* name points into the synonym list until the list is freed at the end. */
name = syn->name;
}
}
if (wormFixupOrfName(name)) /* See if ORF, and if so make nice. */
{
if (c2gSnof == NULL)
{
/* NOTE(review): the index and the data file are opened with the same
 * name; presumably snofMustOpen() adds its own extension - confirm. */
sprintf(fileName, "%sc2g", featDir);
c2gSnof = snofMustOpen(fileName);
sprintf(fileName, "%sc2g", featDir);
c2gFile = mustOpen(fileName, "rb");
}
ok = findAltSpliceRange(name, c2gSnof, c2gFile, retChrom, retStart, retEnd, retStrand);
}
else /* Lets say it's a cosmid. */
{
char lineBuf[128];
char *words[3];
int wordCount;
touppers(name);
if (c2cSnof == NULL)
{
sprintf(fileName, "%sc2c", featDir);
c2cSnof = snofMustOpen(fileName);
sprintf(fileName, "%sc2c", featDir);
c2cFile = mustOpen(fileName, "rb");
}
/* NOTE(review): this return skips the slFreeList(&syn) at the end. */
if (!snofFindOffset(c2cSnof, name, &offset) )
return FALSE;
fseek(c2cFile, offset, SEEK_SET);
/* Revision 1.11 swaps the fgets() call, whose return value was not
 * checked, for mustGetLine(). */
- fgets(lineBuf, sizeof(lineBuf), c2cFile);
+ mustGetLine(c2cFile, lineBuf, sizeof(lineBuf));
/* The line is three words: the range, the strand and the cosmid name. */
wordCount = chopLine(lineBuf, words);
assert(wordCount == 3);
assert(strcmp(words[2], name) == 0);
assert(wormIsChromRange(words[0]));
*retStrand = words[1][0];
ok = wormParseChromRange(words[0], retChrom, retStart, retEnd);
}
slFreeList(&syn);
return ok;
}
boolean wormIsNamelessCluster(char *name)
/* Returns true if name is of correct format to be a nameless cluster: the
 * last '.' in it has to be followed by an 'N' and then a digit. */
{
char *e = strrchr(name, '.');

if (e == NULL)
    return FALSE;
if (e[1] != 'N')
    return FALSE;
/* isdigit() is only defined for unsigned char values and EOF, hence the
 * cast. */
if (!isdigit((unsigned char)e[2]))
    return FALSE;
return TRUE;
}
DNA *wormGetNamelessClusterDna(char *name)
/* Get DNA associated with nameless cluster */
{
char *chrom;
int start, end;
char strand;
if (!wormGeneRange(name, &chrom, &strand, &start, &end))
errAbort("Can't find %s in database", name);
return wormChromPart(chrom, start, end-start);
}
/* One cache per set of gene predictions.  Each starts out with the address
 * of the directory variable for its set and two NULLs; the code below uses
 * the members pDir, snof and file, with the latter two filled in by
 * wormCacheSomeGdf() on first use. */
struct wormGdfCache wormSangerGdfCache = {&sangerDir,NULL,NULL};
struct wormGdfCache wormGenieGdfCache = {&genieDir,NULL,NULL};
/* The cache used by the routines that don't take one as a parameter. */
struct wormGdfCache *defaultGdfCache = &wormSangerGdfCache;
static void wormCacheSomeGdf(struct wormGdfCache *cache)
/* Open the gene index and the genes.gdf file of one gene prediction set and
 * keep them in cache.  Does nothing if the index is already open.  Aborts if
 * genes.gdf doesn't start with the expected signature. */
{
char path[512];
char *setDir;
bits32 sig;

if (cache->snof != NULL)
    return;

getDirs();
setDir = *(cache->pDir);

sprintf(path, "%sgenes", setDir);
cache->snof = snofMustOpen(path);

sprintf(path, "%sgenes.gdf", setDir);
cache->file = mustOpen(path, "rb");
mustReadOne(cache->file, sig);
if (sig != glSig)
    errAbort("%s is not a good file", path);
}
#if 0 /* unused */
static void wormCacheGdf()
/* Set up for fast access to GDF file entries.  This is the default cache
 * counterpart of wormCacheSomeGdf(); it is compiled out. */
{
wormCacheSomeGdf(defaultGdfCache);
}
#endif
void wormUncacheSomeGdf(struct wormGdfCache *cache)
/* Close the gene index and the gdf file held by one gene prediction cache. */
{
carefulClose(&cache->file);
snofClose(&cache->snof);
}
void wormUncacheGdf()
/* Close the index and file held by the default gene prediction cache. */
{
struct wormGdfCache *cache = defaultGdfCache;
wormUncacheSomeGdf(cache);
}
struct gdfGene *wormGetSomeGdfGene(char *name, struct wormGdfCache *cache)
/* Read the gene called name from the gene prediction set behind cache,
 * opening the set first if needed.  Returns NULL if name is not in the
 * set's index. */
{
long offset;
struct gdfGene *gene = NULL;

wormCacheSomeGdf(cache);
if (snofFindOffset(cache->snof, name, &offset) )
    {
    fseek(cache->file, offset, SEEK_SET);
    gene = gdfReadOneGene(cache->file);
    }
return gene;
}
struct gdfGene *wormGetGdfGene(char *name)
/* Read the gene called name from the default gene prediction set. */
{
struct gdfGene *gene = wormGetSomeGdfGene(name, defaultGdfCache);
return gene;
}
struct gdfGene *wormGetSomeGdfGeneList(char *baseName, int baseNameSize, struct wormGdfCache *cache)
/* Get all gdfGenes that start with a given name.  Only the first
 * baseNameSize characters of baseName are compared.  Returns NULL if the
 * index has no such entry.
 * NOTE(review): the list comes back in the reverse of index order, exactly
 * as before this was changed to build it in one pass - confirm that callers
 * do not expect index order. */
{
int snIx;
int maxIx;
struct snof *snof;
FILE *f;
struct gdfGene *list = NULL, *el;

wormCacheSomeGdf(cache);
snof = cache->snof;
f = cache->file;
if (!snofFindFirstStartingWith(snof, baseName, baseNameSize, &snIx))
    return NULL;
maxIx = snofElementCount(snof);
/* Walk forward from the first entry with the right prefix until the prefix
 * stops matching. */
for (;snIx < maxIx; ++snIx)
    {
    long offset;
    char *geneName;
    snofNameOffsetAtIx(snof, snIx, &geneName, &offset);
    if (strncmp(geneName, baseName, baseNameSize) != 0)
        break;
    fseek(f, offset, SEEK_SET);
    el = gdfReadOneGene(f);
    /* Adding at the head gives the same order that adding at the tail and
     * reversing at the end used to, without rescanning the list for every
     * gene. */
    slAddHead(&list, el);
    }
return list;
}
struct gdfGene *wormGetGdfGeneList(char *baseName, int baseNameSize)
/* Get all genes of the default gene prediction set whose names start with
 * the first baseNameSize characters of baseName. */
{
struct gdfGene *genes = wormGetSomeGdfGeneList(baseName, baseNameSize, defaultGdfCache);
return genes;
}
struct gdfGene *wormGdfGenesInRange(char *chrom, int start, int end,
    struct wormGdfCache *geneFinder)
/* Get list of genes in range according to given gene finder, which has to
 * be either the Sanger or the Genie cache (anything else aborts).  Nameless
 * clusters are left out, and so is any name that can't be found in the gene
 * finder's index.  The genes are in the same order as the names reported
 * for the range. */
{
char *dir = NULL;
struct gdfGene *gdfList = NULL, *gdf;
struct wormFeature *nameList, *name;

if (geneFinder == &wormSangerGdfCache)
    dir = wormSangerDir();
else if (geneFinder == &wormGenieGdfCache)
    dir = wormGenieDir();
else
    errAbort("Unknown geneFinder line %d of %s", __LINE__, __FILE__);
nameList = wormSomeGenesInRange(chrom, start, end, dir);
for (name = nameList; name != NULL; name = name->next)
    {
    char *n = name->name;
    if (!wormIsNamelessCluster(n))
        {
        gdf = wormGetSomeGdfGene(n, geneFinder);
        /* The lookup returns NULL for a name that is not in the index;
         * a NULL must not be put on the list. */
        if (gdf != NULL)
            slAddHead(&gdfList, gdf);
        }
    }
slFreeList(&nameList);
slReverse(&gdfList);
return gdfList;
}