src/lib/fa.c 1.38
1.38 2009/09/23 18:42:28 angie
Fixed compiler warnings from gcc 4.3.3, mostly about system calls whose return values weren't checked and non-literal format strings with no args.
Index: src/lib/fa.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/fa.c,v
retrieving revision 1.37
retrieving revision 1.38
diff -b -B -U 1000000 -r1.37 -r1.38
--- src/lib/fa.c 20 Feb 2007 18:50:21 -0000 1.37
+++ src/lib/fa.c 23 Sep 2009 18:42:28 -0000 1.38
@@ -1,648 +1,649 @@
/* Routines for reading and writing fasta format sequence files.
*
* This file is copyright 2002 Jim Kent, but license is hereby
* granted for all use - public, private or commercial. */
#include "common.h"
#include "errabort.h"
#include "hash.h"
#include "portable.h"
#include "dnautil.h"
#include "dnaseq.h"
#include "fa.h"
#include "linefile.h"
static char const rcsid[] = "$Id$";
boolean faReadNext(FILE *f, char *defaultName, boolean mustStartWithComment,
char **retCommentLine, struct dnaSeq **retSeq)
/* Read next sequence from .fa file. Return sequence in retSeq.
* If retCommentLine is non-null
* return the '>' line in retCommentLine.
* The whole thing returns FALSE at end of file.
* DNA chars are mapped to lower case.*/
{
return faReadMixedNext(f, 0, defaultName, mustStartWithComment,
retCommentLine, retSeq);
}
boolean faReadMixedNext(FILE *f, boolean preserveCase, char *defaultName,
boolean mustStartWithComment, char **retCommentLine, struct dnaSeq **retSeq)
/* Read next sequence from .fa file. Return sequence in retSeq.
* If retCommentLine is non-null return the '>' line in retCommentLine.
* The whole thing returns FALSE at end of file.
* Contains parameter to preserve mixed case. */
{
char lineBuf[1024];
int lineSize;
char *words[1];
int c;
off_t offset = ftello(f);
size_t dnaSize = 0;
DNA *dna, *sequence;
char *name = defaultName;
if (name == NULL)
name = "";
dnaUtilOpen();
if (retCommentLine != NULL)
*retCommentLine = NULL;
*retSeq = NULL;
/* Skip first lines until it starts with '>' */
for (;;)
{
if(fgets(lineBuf, sizeof(lineBuf), f) == NULL)
{
if (ferror(f))
errnoAbort("read of fasta file failed");
return FALSE;
}
lineSize = strlen(lineBuf);
if (lineBuf[0] == '>')
{
if (retCommentLine != NULL)
*retCommentLine = cloneString(lineBuf);
offset = ftello(f);
chopByWhite(lineBuf, words, ArraySize(words));
name = words[0]+1;
break;
}
else if (!mustStartWithComment)
{
if (fseeko(f, offset, SEEK_SET) < 0)
errnoAbort("fseek on fasta file failed");
break;
}
else
offset += lineSize;
}
/* Count up DNA. */
for (;;)
{
c = fgetc(f);
if (c == EOF || c == '>')
break;
if (isalpha(c))
{
++dnaSize;
}
}
if (dnaSize == 0)
{
warn("Invalid fasta format: sequence size == 0 for element %s",name);
}
/* Allocate DNA and fill it up from file. */
dna = sequence = needHugeMem(dnaSize+1);
if (fseeko(f, offset, SEEK_SET) < 0)
errnoAbort("fseek on fasta file failed");
for (;;)
{
c = fgetc(f);
if (c == EOF || c == '>')
break;
if (isalpha(c))
{
/* check for non-DNA char */
if (ntChars[c] == 0)
{
*dna++ = preserveCase ? 'N' : 'n';
}
else
{
*dna++ = preserveCase ? c : ntChars[c];
}
}
}
if (c == '>')
ungetc(c, f);
*dna = 0;
*retSeq = newDnaSeq(sequence, dnaSize, name);
if (ferror(f))
errnoAbort("read of fasta file failed");
return TRUE;
}
struct dnaSeq *faReadOneDnaSeq(FILE *f, char *defaultName, boolean mustStartWithComment)
/* Read sequence from FA file. Assumes positioned at or before
* the '>' at start of sequence. */
{
struct dnaSeq *seq;
if (!faReadNext(f, defaultName, mustStartWithComment, NULL, &seq))
return NULL;
else
return seq;
}
static bioSeq *nextSeqFromMem(char **pText, boolean isDna, boolean doFilter)
/* Convert fa in memory to bioSeq. Update *pText to point to next
* record. Returns NULL when no more sequences left. */
{
char *name = "";
char *s, *d;
struct dnaSeq *seq;
int size = 0;
char c;
char *filter = (isDna ? ntChars : aaChars);
char *text = *pText;
char *p = skipLeadingSpaces(text);
if (p == NULL)
return NULL;
dnaUtilOpen();
if (*p == '>')
{
char *end;
s = strchr(p, '\n');
if (s != NULL) ++s;
name = skipLeadingSpaces(p+1);
end = skipToSpaces(name);
if (end >= s || name >= s)
errAbort("No name in line starting with '>'");
if (end != NULL)
*end = 0;
}
else
{
s = p;
if (s == NULL || s[0] == 0)
return NULL;
}
name = cloneString(name);
d = text;
if (s != NULL)
{
for (;;)
{
c = *s;
if (c == 0 || c == '>')
break;
++s;
if (!isalpha(c))
continue;
if (doFilter)
{
if ((c = filter[(int)c]) == 0)
{
if (isDna)
c = 'n';
else
c = 'X';
}
}
d[size++] = c;
}
}
d[size] = 0;
/* Put sequence into our little sequence structure. */
AllocVar(seq);
seq->name = name;
seq->dna = text;
seq->size = size;
*pText = s;
return seq;
}
bioSeq *faNextSeqFromMemText(char **pText, boolean isDna)
/* Convert fa in memory to bioSeq. Update *pText to point to next
* record. Returns NULL when no more sequences left. */
{
return nextSeqFromMem(pText, isDna, TRUE);
}
bioSeq *faNextSeqFromMemTextRaw(char **pText)
/* Same as faNextSeqFromMemText, but will leave in
* letters even if they aren't in DNA or protein alphabed. */
{
return nextSeqFromMem(pText, TRUE, FALSE);
}
bioSeq *faSeqListFromMemText(char *text, boolean isDna)
/* Convert fa's in memory into list of dnaSeqs. */
{
bioSeq *seqList = NULL, *seq;
while ((seq = faNextSeqFromMemText(&text, isDna)) != NULL)
{
slAddHead(&seqList, seq);
}
slReverse(&seqList);
return seqList;
}
bioSeq *faSeqListFromMemTextRaw(char *text)
/* Convert fa's in memory into list of dnaSeqs without
* converting chars to N's. */
{
bioSeq *seqList = NULL, *seq;
while ((seq = faNextSeqFromMemTextRaw(&text)) != NULL)
{
slAddHead(&seqList, seq);
}
slReverse(&seqList);
return seqList;
}
bioSeq *faSeqFromMemText(char *text, boolean isDna)
/* Convert fa in memory to bioSeq. */
{
return faNextSeqFromMemText(&text, isDna);
}
struct dnaSeq *faFromMemText(char *text)
/* Return a sequence from a .fa file that's been read into
* a string in memory. This cannabalizes text, which should
* be allocated with needMem. This buffer becomes part of
* the returned dnaSeq, which may be freed normally with
* freeDnaSeq. */
{
return faNextSeqFromMemText(&text, TRUE);
}
struct dnaSeq *faReadSeq(char *fileName, boolean isDna)
/* Open fa file and read a single sequence from it. */
{
int maxSize = fileSize(fileName);
int fd;
DNA *s;
if (maxSize < 0)
errAbort("can't open %s", fileName);
s = needHugeMem(maxSize+1);
fd = open(fileName, O_RDONLY);
-read(fd, s, maxSize);
+if (read(fd, s, maxSize) < 0)
+ errAbort("faReadSeq: read failed: %s", strerror(errno));
close(fd);
s[maxSize] = 0;
return faSeqFromMemText(s, isDna);
}
struct dnaSeq *faReadDna(char *fileName)
/* Open fa file and read a single nucleotide sequence from it. */
{
return faReadSeq(fileName, TRUE);
}
struct dnaSeq *faReadAa(char *fileName)
/* Open fa file and read a peptide single sequence from it. */
{
return faReadSeq(fileName, FALSE);
}
static unsigned faFastBufSize = 0;
static DNA *faFastBuf;
static void expandFaFastBuf(int bufPos, int minExp)
/* Make faFastBuf bigger. */
{
if (faFastBufSize == 0)
{
faFastBufSize = 64 * 1024;
while (minExp > faFastBufSize)
faFastBufSize <<= 1;
faFastBuf = needHugeMem(faFastBufSize);
}
else
{
DNA *newBuf;
unsigned newBufSize = faFastBufSize + faFastBufSize;
while (newBufSize < minExp)
{
newBufSize <<= 1;
if (newBufSize <= 0)
errAbort("expandFaFastBuf: integer overflow when trying to "
"increase buffer size from %u to a min of %u.",
faFastBufSize, minExp);
}
newBuf = needHugeMem(newBufSize);
memcpy(newBuf, faFastBuf, bufPos);
freeMem(faFastBuf);
faFastBuf = newBuf;
faFastBufSize = newBufSize;
}
}
void faFreeFastBuf()
/* Free up buffers used in fa fast and speedreading. */
{
freez(&faFastBuf);
faFastBufSize = 0;
}
boolean faFastReadNext(FILE *f, DNA **retDna, int *retSize, char **retName)
/* Read in next FA entry as fast as we can. Return FALSE at EOF.
* The returned DNA and name will be overwritten by the next call
* to this function. */
{
int c;
int bufIx = 0;
static char name[256];
int nameIx = 0;
boolean gotSpace = FALSE;
/* Seek to next '\n' and save first word as name. */
dnaUtilOpen();
name[0] = 0;
for (;;)
{
if ((c = fgetc(f)) == EOF)
{
*retDna = NULL;
*retSize = 0;
*retName = NULL;
return FALSE;
}
if (!gotSpace && nameIx < ArraySize(name)-1)
{
if (isspace(c))
gotSpace = TRUE;
else if (c != '>')
{
name[nameIx++] = c;
}
}
if (c == '\n')
break;
}
name[nameIx] = 0;
/* Read until next '>' */
for (;;)
{
c = fgetc(f);
if (c == EOF || c == '>')
c = 0;
else if (!isalpha(c))
continue;
else
{
c = ntChars[c];
if (c == 0) c = 'n';
}
if (bufIx >= faFastBufSize)
expandFaFastBuf(bufIx, 0);
faFastBuf[bufIx++] = c;
if (c == 0)
{
*retDna = faFastBuf;
*retSize = bufIx-1;
*retName = name;
return TRUE;
}
}
}
void faWriteNext(FILE *f, char *startLine, DNA *dna, int dnaSize)
/* Write next sequence to fa file. */
{
if (dnaSize == 0)
return;
if (startLine != NULL)
fprintf(f, ">%s\n", startLine);
writeSeqWithBreaks(f, dna, dnaSize, 50);
}
void faWrite(char *fileName, char *startLine, DNA *dna, int dnaSize)
/* Write out FA file or die trying. */
{
FILE *f = mustOpen(fileName, "w");
faWriteNext(f, startLine, dna, dnaSize);
if (fclose(f) != 0)
errnoAbort("fclose failed");
}
void faWriteAll(char *fileName, bioSeq *seqList)
/* Write out all sequences in list to file. */
{
FILE *f = mustOpen(fileName, "w");
bioSeq *seq;
for (seq=seqList; seq != NULL; seq = seq->next)
faWriteNext(f, seq->name, seq->dna, seq->size);
if (fclose(f) != 0)
errnoAbort("fclose failed");
}
boolean faMixedSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName)
/* Read in DNA or Peptide FA record in mixed case. Allow any upper or lower case
* letter, or the dash character in. */
{
char c;
int bufIx = 0;
static char name[512];
int lineSize, i;
char *line;
dnaUtilOpen();
/* Read first line, make sure it starts with '>', and read first word
* as name of sequence. */
name[0] = 0;
if (!lineFileNext(lf, &line, &lineSize))
{
*retDna = NULL;
*retSize = 0;
return FALSE;
}
if (line[0] == '>')
{
line = firstWordInLine(skipLeadingSpaces(line+1));
if (line == NULL)
errAbort("Expecting sequence name after '>' line %d of %s", lf->lineIx, lf->fileName);
strncpy(name, line, sizeof(name));
name[sizeof(name)-1] = '\0'; /* Just to make sure name is NULL terminated. */
}
else
{
errAbort("Expecting '>' line %d of %s", lf->lineIx, lf->fileName);
}
/* Read until next '>' */
for (;;)
{
if (!lineFileNext(lf, &line, &lineSize))
break;
if (line[0] == '>')
{
lineFileReuse(lf);
break;
}
if (bufIx + lineSize >= faFastBufSize)
expandFaFastBuf(bufIx, lineSize);
for (i=0; i<lineSize; ++i)
{
c = line[i];
if (isalpha(c) || c == '-')
faFastBuf[bufIx++] = c;
}
}
if (bufIx >= faFastBufSize)
expandFaFastBuf(bufIx, 0);
faFastBuf[bufIx] = 0;
*retDna = faFastBuf;
*retSize = bufIx;
*retName = name;
if (bufIx == 0)
{
warn("Invalid fasta format: sequence size == 0 for element %s",name);
}
return TRUE;
}
void faToProtein(char *poly, int size)
/* Convert possibly mixed-case protein to upper case. Also
* convert any strange characters to 'X'. Does not change size.
* of sequence. */
{
int i;
char c;
dnaUtilOpen();
for (i=0; i<size; ++i)
{
if ((c = aaChars[(int)poly[i]]) == 0)
c = 'X';
poly[i] = c;
}
}
void faToDna(char *poly, int size)
/* Convert possibly mixed-case DNA to lower case. Also turn
* any strange characters to 'n'. Does not change size.
* of sequence. */
{
int i;
char c;
dnaUtilOpen();
for (i=0; i<size; ++i)
{
if ((c = ntChars[(int)poly[i]]) == 0)
c = 'n';
poly[i] = c;
}
}
boolean faSomeSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName, boolean isDna)
/* Read in DNA or Peptide FA record. */
{
char *poly;
int size;
if (!faMixedSpeedReadNext(lf, retDna, retSize, retName))
return FALSE;
size = *retSize;
poly = *retDna;
if (isDna)
faToDna(poly, size);
else
faToProtein(poly, size);
return TRUE;
}
boolean faPepSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName)
/* Read in next peptide FA entry as fast as we can. */
{
return faSomeSpeedReadNext(lf, retDna, retSize, retName, FALSE);
}
boolean faSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName)
/* Read in next FA entry as fast as we can. Faster than that old,
* pokey faFastReadNext. Return FALSE at EOF.
* The returned DNA and name will be overwritten by the next call
* to this function. */
{
return faSomeSpeedReadNext(lf, retDna, retSize, retName, TRUE);
}
static struct dnaSeq *faReadAllMixableInLf(struct lineFile *lf,
boolean isDna, boolean mixed)
/* Return list of all sequences from open fa file.
* Mixed case parameter overrides isDna. If mixed is false then
* will return DNA in lower case and non-DNA in upper case. */
{
struct dnaSeq *seqList = NULL, *seq;
DNA *dna;
char *name;
int size;
boolean ok;
for (;;)
{
if (mixed)
ok = faMixedSpeedReadNext(lf, &dna, &size, &name);
else
ok = faSomeSpeedReadNext(lf, &dna, &size, &name, isDna);
if (!ok)
break;
AllocVar(seq);
seq->name = cloneString(name);
seq->size = size;
seq->dna = cloneMem(dna, size+1);
slAddHead(&seqList, seq);
}
slReverse(&seqList);
faFreeFastBuf();
return seqList;
}
static struct dnaSeq *faReadAllSeqMixable(char *fileName, boolean isDna, boolean mixed)
/* Return list of all sequences in FA file.
* Mixed case parameter overrides isDna. If mixed is false then
* will return DNA in lower case and non-DNA in upper case. */
{
struct lineFile *lf = lineFileOpen(fileName, FALSE);
struct dnaSeq *seqList = faReadAllMixableInLf(lf, isDna, mixed);
lineFileClose(&lf);
return seqList;
}
struct hash *faReadAllIntoHash(char *fileName, enum dnaCase dnaCase)
/* Return hash of all sequences in FA file. */
{
boolean isDna = (dnaCase == dnaLower);
boolean isMixed = (dnaCase == dnaMixed);
struct dnaSeq *seqList = faReadAllSeqMixable(fileName, isDna, isMixed);
struct hash *hash = hashNew(18);
struct dnaSeq *seq;
for (seq = seqList; seq != NULL; seq = seq->next)
{
if (hashLookup(hash, seq->name))
errAbort("%s duplicated in %s", seq->name, fileName);
hashAdd(hash, seq->name, seq);
}
return hash;
}
struct dnaSeq *faReadAllSeq(char *fileName, boolean isDna)
/* Return list of all sequences in FA file. */
{
return faReadAllSeqMixable(fileName, isDna, FALSE);
}
struct dnaSeq *faReadAllDna(char *fileName)
/* Return list of all DNA sequences in FA file. */
{
return faReadAllSeq(fileName, TRUE);
}
struct dnaSeq *faReadAllPep(char *fileName)
/* Return list of all peptide sequences in FA file. */
{
return faReadAllSeq(fileName, FALSE);
}
struct dnaSeq *faReadAllMixed(char *fileName)
/* Read in mixed case fasta file, preserving case. */
{
return faReadAllSeqMixable(fileName, FALSE, TRUE);
}
struct dnaSeq *faReadAllMixedInLf(struct lineFile *lf)
/* Read in mixed case sequence from open fasta file. */
{
return faReadAllMixableInLf(lf, FALSE, TRUE);
}