8d44c7dbaf65ec740081b21da51a0d7739c35855 braney Wed Nov 4 14:07:09 2015 -0800 use ';' to separate MAF lines in bigMaf instead of '\001' #15935 diff --git src/hg/lib/hgMaf.c src/hg/lib/hgMaf.c index 69ce12b..87ddce7 100644 --- src/hg/lib/hgMaf.c +++ src/hg/lib/hgMaf.c @@ -1,466 +1,466 @@ /* hgMaf.c - Stuff to load up mafs from the browser database. */ /* Copyright (C) 2014 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "dystring.h" #include "options.h" #include "jksql.h" #include "dnautil.h" #include "dnaseq.h" #include "maf.h" #include "hdb.h" #include "scoredRef.h" #include "hgMaf.h" int mafCmp(const void *va, const void *vb) /* Compare to sort based on start of first component. */ { const struct mafAli *a = *((struct mafAli **)va); const struct mafAli *b = *((struct mafAli **)vb); return a->components->start - b->components->start; } struct mafAli *mafLoadInRegion2(struct sqlConnection *conn, struct sqlConnection *conn2, char *table, char *chrom, int start, int end, char *file) /* Return list of alignments in region. */ { char **row; unsigned int extFileId = 0; struct mafAli *maf, *mafList = NULL; struct mafFile *mf = NULL; int rowOffset; if (file != NULL) mf = mafOpen(file); struct sqlResult *sr = hRangeQuery(conn, table, chrom, start, end, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { struct scoredRef ref; scoredRefStaticLoad(row + rowOffset, &ref); if ((file != NULL) && (ref.extFile != 0)) errAbort("expect extFile to be zero if file specified\n"); if ((file == NULL) && (ref.extFile == 0)) errAbort("expect extFile to be not zero or file specified\n"); if (ref.extFile != extFileId) { char *path = hExtFileNameC(conn2, "extFile", ref.extFile); mafFileFree(&mf); mf = mafOpen(path); extFileId = ref.extFile; } lineFileSeek(mf->lf, ref.offset, SEEK_SET); maf = mafNext(mf); if (maf == NULL) internalErr(); slAddHead(&mafList, maf); } sqlFreeResult(&sr); mafFileFree(&mf); slReverse(&mafList); /* hRangeQuery may return items out-of-order when bin is used in the query, * so sort here in order to avoid trouble at base-level view: */ slSort(&mafList, mafCmp); return mafList; } struct mafAli *mafLoadInRegion(struct sqlConnection *conn, char *table, char *chrom, int start, int end) { struct sqlConnection *conn2 = hAllocConn(sqlGetDatabase(conn)); struct mafAli *ret = mafLoadInRegion2(conn, conn2, table, chrom, start, end,NULL); hFreeConn(&conn2); return ret; } struct mafAli *axtLoadAsMafInRegion(struct sqlConnection *conn, char *table, char *chrom, int start, int end, char *tPrefix, char *qPrefix, int tSize, struct hash *qSizeHash) /* Return list of alignments in region from axt external file as a maf. */ { char **row; unsigned int extFileId = 0; struct lineFile *lf = NULL; struct mafAli *maf, *mafList = NULL; struct axt *axt; int rowOffset; struct sqlResult *sr = hRangeQuery(conn, table, chrom, start, end, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { struct scoredRef ref; scoredRefStaticLoad(row + rowOffset, &ref); if (ref.extFile != extFileId) { char *path = hExtFileName(sqlGetDatabase(conn),"extFile", ref.extFile); lf = lineFileOpen(path, TRUE); extFileId = ref.extFile; } lineFileSeek(lf, ref.offset, SEEK_SET); axt = axtRead(lf); if (axt == NULL) internalErr(); maf = mafFromAxt(axt, tSize, tPrefix, hashIntVal(qSizeHash, axt->qName), qPrefix); axtFree(&axt); slAddHead(&mafList, maf); } sqlFreeResult(&sr); lineFileClose(&lf); slReverse(&mafList); return mafList; } static void mafCheckFirstComponentSrc(struct mafAli *mafList, char *src) /* Check that the first component of each maf has given src. */ { struct mafAli *maf; for (maf = mafList; maf != NULL; maf = maf->next) { if (!sameString(maf->components->src, src)) errAbort("maf first component isn't %s", src); } } static void mafCheckFirstComponentStrand(struct mafAli *mafList, char strand) /* Check that the first component of each maf has given strand. */ { struct mafAli *maf; for (maf = mafList; maf != NULL; maf = maf->next) { if (maf->components->strand != strand) errAbort("maf first component isn't %c", strand); } } struct oneOrg /* Info on one organism. */ { struct oneOrg *next; char *name; /* Name - allocated in hash */ int order; /* Help sort organisms. */ struct dyString *dy; /* Associated alignment for this organism. */ boolean hit; /* Flag to see if hit this time around. */ }; static int oneOrgCmp(const void *va, const void *vb) /* Compare to sort based on order. */ { const struct oneOrg *a = *((struct oneOrg **)va); const struct oneOrg *b = *((struct oneOrg **)vb); return a->order - b->order; } static void fillInMissing(struct oneOrg *nativeOrg, struct oneOrg *orgList, struct dnaSeq *native, int seqStart, int curPos, int aliStart) /* Fill in alignment strings in orgList with native sequence * for first organism, and dots for rest. */ { int fillSize = aliStart - curPos; int offset = curPos - seqStart; struct oneOrg *org; if (nativeOrg == NULL) return; dyStringAppendN(nativeOrg->dy, native->dna + offset, fillSize); for (org = orgList; org != NULL; org = org->next) { if (org != nativeOrg) dyStringAppendMultiC(org->dy, '.', fillSize); } } static int countAlpha(char *s) /* Count up number of alphabetic characters in string */ { int count = 0; char c; while ((c = *s++) != 0) if (isalpha(c)) ++count; return count; } struct mafAli *hgMafFrag( char *database, /* Database, must already have hSetDb to this */ char *track, /* Name of MAF track */ char *chrom, /* Chromosome (in database genome) */ int start, int end, /* start/end in chromosome */ char strand, /* Chromosome strand. */ char *outName, /* Optional name to use in first component */ struct slName *orderList /* Optional order of organisms. */ ) /* mafFrag- Extract maf sequences for a region from database. * This creates a somewhat unusual MAF that extends from start * to end whether or not there are actually alignments. Where * there are no alignments (or alignments missing a species) * a . character fills in. The score is always zero, and * the sources just indicate the species. You can mafFree this * as normal. */ { int chromSize = hChromSize(database, chrom); struct sqlConnection *conn = hAllocConn(database); struct dnaSeq *native = hChromSeq(database, chrom, start, end); struct mafAli *maf, *mafList = mafLoadInRegion(conn, track, chrom, start, end); char masterSrc[128]; struct hash *orgHash = newHash(10); struct oneOrg *orgList = NULL, *org, *nativeOrg = NULL; int curPos = start, symCount = 0; struct slName *name; int order = 0; /* Check that the mafs are really copacetic, the particular * subtype we think is in the database that this (relatively) * simple code can handle. */ safef(masterSrc, sizeof(masterSrc), "%s.%s", database, chrom); mafCheckFirstComponentSrc(mafList, masterSrc); mafCheckFirstComponentStrand(mafList, '+'); slSort(&mafList, mafCmp); /* Prebuild organisms if possible from input orderList. */ for (name = orderList; name != NULL; name = name->next) { AllocVar(org); slAddHead(&orgList, org); hashAddSaveName(orgHash, name->name, org, &org->name); org->dy = dyStringNew(native->size*1.5); org->order = order++; if (nativeOrg == NULL) nativeOrg = org; } if (orderList == NULL) { AllocVar(org); slAddHead(&orgList, org); hashAddSaveName(orgHash, database, org, &org->name); org->dy = dyStringNew(native->size*1.5); if (nativeOrg == NULL) nativeOrg = org; } /* Go through all mafs in window, mostly building up * org->dy strings. */ for (maf = mafList; maf != NULL; maf = maf->next) { struct mafComp *mc, *mcMaster = maf->components; struct mafAli *subMaf = NULL; order = 0; if (curPos < mcMaster->start) { fillInMissing(nativeOrg, orgList, native, start, curPos, mcMaster->start); symCount += mcMaster->start - curPos; } if (curPos < mcMaster->start + mcMaster->size) /* Prevent worst * backtracking */ { if (mafNeedSubset(maf, masterSrc, curPos, end)) { subMaf = mafSubset(maf, masterSrc, curPos, end); if (subMaf == NULL) continue; } else subMaf = maf; for (mc = subMaf->components; mc != NULL; mc = mc->next, ++order) { /* Extract name up to dot into 'orgName' */ char buf[128], *e, *orgName; if ((mc->size == 0) || (mc->srcSize == 0)) /* skip over components without sequence */ continue; mc->leftStatus = mc->rightStatus = 0; /* squash annotation */ e = strchr(mc->src, '.'); if (e == NULL) orgName = mc->src; else { int len = e - mc->src; if (len >= sizeof(buf)) errAbort("organism/database name %s too long", mc->src); memcpy(buf, mc->src, len); buf[len] = 0; orgName = buf; } /* Look up dyString corresponding to org, and create a * new one if necessary. */ org = hashFindVal(orgHash, orgName); if (org == NULL) { if (orderList != NULL) errAbort("%s is not in orderList", orgName); AllocVar(org); slAddHead(&orgList, org); hashAddSaveName(orgHash, orgName, org, &org->name); org->dy = dyStringNew(native->size*1.5); dyStringAppendMultiC(org->dy, '.', symCount); if (nativeOrg == NULL) nativeOrg = org; } if (orderList == NULL && order > org->order) org->order = order; org->hit = TRUE; /* Fill it up with alignment. */ dyStringAppendN(org->dy, mc->text, subMaf->textSize); } for (org = orgList; org != NULL; org = org->next) { if (!org->hit) dyStringAppendMultiC(org->dy, '.', subMaf->textSize); org->hit = FALSE; } symCount += subMaf->textSize; curPos = mcMaster->start + mcMaster->size; if (subMaf != maf) mafAliFree(&subMaf); } } if (curPos < end) { fillInMissing(nativeOrg, orgList, native, start, curPos, end); symCount += end - curPos; } mafAliFreeList(&mafList); slSort(&orgList, oneOrgCmp); if (strand == '-') { for (org = orgList; org != NULL; org = org->next) reverseComplement(org->dy->string, org->dy->stringSize); } /* Construct our maf */ AllocVar(maf); maf->textSize = symCount; for (org = orgList; org != NULL; org = org->next) { struct mafComp *mc; AllocVar(mc); if (org == orgList) { if (outName != NULL) { mc->src = cloneString(outName); mc->srcSize = native->size; mc->strand = '+'; mc->start = 0; mc->size = native->size; } else { mc->src = cloneString(masterSrc); mc->srcSize = chromSize; mc->strand = strand; if (strand == '-') reverseIntRange(&start, &end, chromSize); mc->start = start; mc->size = end-start; } } else { int size = countAlpha(org->dy->string); mc->src = cloneString(org->name); mc->srcSize = size; mc->strand = '+'; mc->start = 0; mc->size = size; } mc->text = cloneString(org->dy->string); dyStringFree(&org->dy); slAddHead(&maf->components, mc); } slReverse(&maf->components); slFreeList(&orgList); freeHash(&orgHash); hFreeConn(&conn); return maf; } struct consWiggle *wigMafWiggles(char *db, struct trackDb *tdb) /* get conservation wiggle table names and labels from trackDb setting, ignoring those where table doesn't exist */ { char *fields[20]; int fieldCt; int i; char *wigTable, *wigLabel; struct consWiggle *wig, *wigList = NULL; char *setting = trackDbSetting(tdb, CONS_WIGGLE); if (!setting) return NULL; fieldCt = chopLine(cloneString(setting), fields); for (i = 0; i < fieldCt; i += 3) { wigTable = fields[i]; if (hTableExists(db, wigTable)) { AllocVar(wig); wig->table = cloneString(wigTable); wigLabel = (i+1 == fieldCt ? DEFAULT_CONS_LABEL : fields[i+1]); wig->leftLabel = cloneString(wigLabel); wigLabel = (i+2 >= fieldCt ? wig->leftLabel : fields[i+2]); wig->uiLabel = cloneString(wigLabel); slAddTail(&wigList, wig); } } return wigList; } char *wigMafWiggleVar(char *prefix, struct consWiggle *wig,char **suffix) // Return name of cart variable (and optionally the suffix) for this cons wiggle { char option[128]; safef(option, sizeof option, "%s.cons.%s", prefix, wig->leftLabel); if (suffix != NULL) *suffix = option + strlen(prefix) + 1; return (cloneString(option)); } struct consWiggle *consWiggleFind(char *db,struct trackDb *parent,char *table) /* Return conservation wig if it is found in the parent. */ { if (parent == NULL || !startsWith("wigMaf", parent->type)) return NULL; struct consWiggle *wig, *wiggles = wigMafWiggles(db, parent); for (wig = wiggles; wig != NULL && differentString(wig->table,table); wig = wig->next) {} return wig; } struct mafAli *bigMafLoadInRegion( struct bbiFile *bbi, char *chrom, int start, int end) /* Read in MAF blocks from bigBed. */ { struct lm *lm = lmInit(0); struct bigBedInterval *bb, *bbList = bigBedIntervalQuery(bbi, chrom, start, end, 0, lm); struct mafAli *mafList = NULL; for (bb = bbList; bb != NULL; bb = bb->next) { // the MAF block in the record as \001 instead of newlines - char *mafText = replaceChars(bb->rest, "\001","\n"); + char *mafText = replaceChars(bb->rest, ";","\n"); struct mafFile mf; mf.lf = lineFileOnString(NULL, TRUE, mafText); struct mafAli *maf = mafNext(&mf); slAddHead(&mafList, maf); } return mafList; }