51c07793ec99fbec04ed2c16a5bd464704b9ad50 hiram Fri Nov 16 11:54:52 2012 -0800 adding the lift of .align files from Robert Hubley diff --git src/hg/liftUp/liftUp.c src/hg/liftUp/liftUp.c index 35d8cd2..2583b40 100644 --- src/hg/liftUp/liftUp.c +++ src/hg/liftUp/liftUp.c @@ -20,33 +20,33 @@ #include "sqlNum.h" boolean isPtoG = TRUE; /* is protein to genome lift */ boolean nohead = FALSE; /* No header for psl files? */ boolean nosort = FALSE; /* Don't sort files */ boolean ignoreVersions = FALSE; /* drop NCBI versions */ boolean extGenePred = FALSE; /* load extended genePred */ int dots=0; /* Put out I'm alive dot now and then? */ int gapsize = 0; /* optional non-default gap size */ void usage() /* Explain usage and exit. */ { errAbort( - "liftUp - change coordinates of .psl, .agp, .gap, .gl, .out, .gff, .gtf .bscore \n" - ".tab .gdup .axt .chain .net, genePred, .wab, .bed, or .bed8 files to parent\n" - "coordinate system.\n" + "liftUp - change coordinates of .psl, .agp, .gap, .gl, .out, .align, .gff, .gtf\n" + ".bscore .tab .gdup .axt .chain .net, genePred, .wab, .bed, or .bed8 files to\n" + "parent coordinate system.\n" "\n" "usage:\n" " liftUp [-type=.xxx] destFile liftSpec how sourceFile(s)\n" "The optional -type parameter tells what type of files to lift\n" "If omitted the type is inferred from the suffix of destFile\n" "Type is one of the suffixes described above.\n" "DestFile will contain the merged and lifted source files,\n" "with the coordinates translated as per liftSpec. LiftSpec\n" "is tab-delimited with each line of the form:\n" " offset oldName oldSize newName newSize\n" "LiftSpec may optionally have a sixth column specifying + or - strand,\n" "but strand is not supported for all input types.\n" "The 'how' parameter controls what the program will do with\n" "items which are not in the liftSpec. It must be one of:\n" " carry - Items not in liftSpec are carried to dest without translation\n" @@ -96,30 +96,37 @@ {"ignoreVersions", OPTION_BOOLEAN}, {"extGenePred", OPTION_BOOLEAN}, {NULL, 0} }; enum how /* how to handle items missing from liftSpec */ { warnMissing, /* warn if items are missing */ silentDrop, /* silently drop items. */ carryMissing, /* carry missing items untranslated. */ errorMissing /* error if items are missing */ }; enum how how = warnMissing; +enum RMAlignFormats +{ + preRM330, /* outline format, no id */ + RM330, /* outline format, with id */ + RM400, /* catline format, with id */ + unknown +}; char *rmChromPrefix(char *s) /* Remove chromosome prefix if any. */ { char *e = strchr(s, '/'); if (e != NULL) return e+1; else return s; } void rmChromPart(struct liftSpec *list) /* Turn chrom/ctg to just ctg . */ { struct liftSpec *el; @@ -163,30 +170,42 @@ } return hel->val; } void skipLines(struct lineFile *lf, int count) /* Skip a couple of lines. */ { int i, lineSize; char *line; for (i=0; ilineIx, lf->fileName); val = atoi(s); @@ -196,31 +215,218 @@ } char flipStrand(char strand) /* Turn + to - and vice versa. */ { return (strand == '-' ? '+' : '-'); } void cantHandleSpecRevStrand(struct liftSpec *spec) /* abort if spec has a minus strand and we don't (yet) support that. */ { if (spec && spec->strand == '-') errAbort("Can't handle lifts with - strands for this input type"); } -void liftOut(char *destFile, struct hash *liftHash, int sourceCount, char *sources[]) +void liftAlign(char *destFile, struct hash *liftHash, int sourceCount, char *sources[]) +/* Lift up coordinates in .align file. Add offset to id column to + * maintain non-overlapping id ranges for different input files. + * -Robert Hubley Oct-2012 */ +{ +FILE *dest = mustOpen(destFile, "w"); +char *source = NULL; +int i = 0; +struct lineFile *lf; +int lineSize, wordCount; +char *line, *words[32]; +char origLine[256]; +char seqId[32]; +char *s; +int begin, end, left; +char leftString[18]; +int highestIdSoFar = 0, idOffset = 0; +char idStr[32]; +struct liftSpec *spec = NULL; +char *newName = NULL; +enum RMAlignFormats alignFmt = unknown; +int inAlignHdr = 0; + +seqId[0] = 0; +for (i=0; ilineIx, lf->fileName); + left = numField(words, 7, 1, lf); + int numId = 0; + + if ( alignFmt == RM400 ) + { + if ( wordCount == 14 ) + numId = sqlUnsigned(words[13]) + idOffset; + else + numId = sqlUnsigned(words[14]) + idOffset; + } + else + { + numId = sqlUnsigned(words[14]) + idOffset; + } + + if (numId > highestIdSoFar) + highestIdSoFar = numId; + + safef(idStr, sizeof(idStr), "%d", numId); + spec = findLift(liftHash, words[4], lf); + if (spec == NULL) + { + if (how == carryMissing) + { + newName = words[4]; + } + else + continue; + } + else + { + cantHandleSpecRevStrand(spec); + begin += spec->offset; + end += spec->offset; + left = spec->newSize - end; + newName = spec->newName; + } + sprintf(leftString, "(%d)", left); + + if ( alignFmt == RM400 ) + { + if ( wordCount == 14 ) + fprintf(dest, + "%s %s %s %s %s %d %d %s %s %s %s %s %s %s\n", + words[0], words[1], words[2], words[3], newName, + begin, end, leftString, + words[8], words[9], words[10], words[11], words[12], idStr); + else + fprintf(dest, + "%s %s %s %s %s %d %d %s %s %s %s %s %s %s %s\n", + words[0], words[1], words[2], words[3], newName, + begin, end, leftString, + words[8], words[9], words[10], words[11], words[12], words[13], idStr); + } + else + { + fprintf(dest, + "%s %s %s %s %s %d %d %s %s %s %s %s %s %s %s\n", + words[0], words[1], words[2], words[3], newName, + begin, end, leftString, + words[8], words[9], words[10], words[11], words[12], words[13], idStr); + } + + } + else if ( spec && wordCount == 4 && isNum( words[1] ) + && isNum( words[3] ) && seqId[0] != '\0' + && strstr( seqId, words[0] ) == seqId ) + { + // Alignment data -- specifically a line with query coords + begin = numField(words, 1, 0, lf); + end = numField(words, 3, 0, lf); + begin += spec->offset; + end += spec->offset; + if ( alignFmt == RM400 ) + { + fprintf(dest, " %-13.13s %10d %s %d\n", + newName, begin, words[2], end ); + } + else + { + fprintf(dest, " %-12.12s %9d %s %d\n", + newName, begin, words[2], end ); + } + } + else // All remaining lines just get printed. + fprintf(dest, "%s\n", origLine ); + } + lineFileClose(&lf); + idOffset = highestIdSoFar; + } +if (ferror(dest)) + errAbort("error writing %s", destFile); +fclose(dest); +} + + +void liftOut(char *destFile, struct hash *liftHash, int sourceCount, + char *sources[]) /* Lift up coordinates in .out file. Add offset to id (15th) column to * maintain non-overlapping id ranges for different input files. */ { FILE *dest = mustOpen(destFile, "w"); char *source; int i; struct lineFile *lf; int lineSize, wordCount; char *line, *words[32]; char *s; int begin, end, left; char leftString[18]; int highestIdSoFar = 0, idOffset = 0; char idStr[32]; struct liftSpec *spec; @@ -1523,30 +1729,36 @@ usage(); if (how == carryMissing && sameString("/dev/null", liftFile)) verbose(1, "Carrying input -- ignoring /dev/null liftFile\n"); else { lifts = readLifts(liftFile); verbose(1, "Got %d lifts in %s\n", slCount(lifts), liftFile); } if (endsWith(destType, ".out")) { rmChromPart(lifts); liftHash = hashLift(lifts, FALSE); liftOut(destFile, liftHash, sourceCount, sources); } +else if (endsWith(destType, ".align")) + { + rmChromPart(lifts); + liftHash = hashLift(lifts, FALSE); + liftAlign(destFile, liftHash, sourceCount, sources); + } else if (endsWith(destType, ".pslx") || endsWith(destType, ".xa") || endsWith(destType, ".psl")) { rmChromPart(lifts); liftHash = hashLift(lifts, TRUE); liftPsl(destFile, liftHash, sourceCount, sources, optionExists("pslQ") || optionExists("pslq"), !endsWith(destType, ".psl") ); } else if (endsWith(destType, ".agp")) { liftHash = hashLift(lifts, FALSE); liftAgp(destFile, liftHash, sourceCount, sources); } else if (endsWith(destType, ".gap")) { liftHash = hashLift(lifts, TRUE);