533112afe2a2005e80cdb1f82904ea65032d4302 braney Sat Oct 2 11:37:34 2021 -0700 split hg/lib into two separate libaries, one only used by the cgis diff --git src/hg/lib/annoStreamDbPslPlus.c src/hg/lib/annoStreamDbPslPlus.c deleted file mode 100644 index fb9e6ac..0000000 --- src/hg/lib/annoStreamDbPslPlus.c +++ /dev/null @@ -1,256 +0,0 @@ -/* annoStreamDbPslPlus -- subclass of annoStreamer for joining PSL+CDS+seq database tables */ - -/* Copyright (C) 2017 The Regents of the University of California - * See README in this or parent directory for licensing information. */ - -#include "annoStreamDbPslPlus.h" -#include "annoStreamDb.h" -#include "hdb.h" - -static char *pslPlusAutoSqlString = -"table pslPlus" -"\"transcript PSL, CDS and seq info\"" -" (" -" uint match; \"Number of bases that match that aren't repeats\"" -" uint misMatch; \"Number of bases that don't match\"" -" uint repMatch; \"Number of bases that match but are part of repeats\"" -" uint nCount; \"Number of 'N' bases\"" -" uint qNumInsert; \"Number of inserts in query (transcript)\"" -" int qBaseInsert; \"Number of bases inserted in query (transcript)\"" -" uint tNumInsert; \"Number of inserts in target (chromosome/scaffold)\"" -" int tBaseInsert; \"Number of bases inserted in target (chromosome/scaffold)\"" -" char[2] strand; \"+ or - for query strand (transcript to genome orientation)\"" -" string qName; \"Transcript accession\"" -" uint qSize; \"Transcript sequence size\"" -" uint qStart; \"Alignment start position in query (transcript)\"" -" uint qEnd; \"Alignment end position in query (transcript)\"" -" string tName; \"Target (chromosome/scaffold) name\"" -" uint tSize; \"Target (chromosome/scaffold) size\"" -" uint tStart; \"Alignment start position in target\"" -" uint tEnd; \"Alignment end position in target\"" -" uint blockCount; \"Number of blocks in alignment\"" -" uint[blockCount] blockSizes; \"Size of each block\"" -" uint[blockCount] qStarts; \"Start of each block in query.\"" -" uint[blockCount] tStarts; \"Start of each block in target.\"" -" string cds; \"CDS start and end in transcript (if applicable)\"" -" string protAcc; \"Protein accession (if applicable)\"" -" string name2; \"Gene symbolic name\"" -" string path; \"Path to FASTA file containing transcript sequence\"" -" uint fileOffset; \"Offset of transcript record in FASTA file\"" -" uint fileSize; \"Number of bytes of transcript record in FASTA file\"" -" )"; - -struct annoStreamDbPslPlus - { - struct annoStreamer streamer; // Parent class members & methods (external interface) - // Private members - char *gpTable; // Associated genePred (refGene, ncbiRefSeqCurated etc) - struct annoStreamer *mySource; // Internal source of PSL+CDS+seq info - struct hash *idHash; // Used to restrict PSL query result to curated/predicted - }; - -// select p.*, c.cds, l.protAcc, l.name, e.path, s.file_offset, s.file_size -// from (((ncbiRefSeqPsl p // NOT ANYMORE (#21770): join ncbiRefSeqCurated n on p.qName = n.name) -// left join ncbiRefSeqCds c on p.qName = c.id) -// join ncbiRefSeqLink l on p.qName = l.mrnaAcc) -// left join (seqNcbiRefSeq s left join extNcbiRefSeq e on s.extFile = e.id) on p.qName = s.acc -// where p.tName = "chr1" -// order by p.tName, p.tStart -// limit 5; - -static char *ncbiRefSeqConfigJsonFormat = - "{ \"naForMissing\": false," - " \"relatedTables\": [ { \"table\": \"ncbiRefSeqCds\"," - " \"fields\": [\"cds\"] }," - " { \"table\": \"ncbiRefSeqLink\"," - " \"fields\": [\"protAcc\", \"name\"] }," - " { \"table\": \"extNcbiRefSeq\"," - " \"fields\": [\"path\"] }," - " { \"table\": \"seqNcbiRefSeq\"," - " \"fields\": [\"file_offset\", \"file_size\"] } ] }"; - -//select p.*,c.name,l.protAcc,l.name,e.path,s.file_offset,s.file_size, i.version -//from refSeqAli p -// join (hgFixed.gbCdnaInfo i -// left join hgFixed.cds c on i.cds = c.id) on i.acc = p.qName -// left join (hgFixed.gbSeq s -// join hgFixed.gbExtFile e on e.id = s.gbExtFile) on s.acc = p.qName -// join hgFixed.refLink l on p.qName = l.mrnaAcc -// where p.tName = "chr1" -// order by p.tName, p.tStart -// limit 5; - -static char *refSeqAliConfigJson = - "{ \"naForMissing\": false," - " \"relatedTables\": [ { \"table\": \"hgFixed.cds\"," - " \"fields\": [\"name\"] }," - " { \"table\": \"hgFixed.refLink\"," - " \"fields\": [\"protAcc\", \"name\"] }," - " { \"table\": \"hgFixed.gbExtFile\"," - " \"fields\": [\"path\"] }," - " { \"table\": \"hgFixed.gbSeq\"," - " \"fields\": [\"file_offset\", \"file_size\"] } ] }"; - -struct asObject *annoStreamDbPslPlusAsObj() -/* Return an autoSql object with PSL, gene name, protein acc, CDS and sequence file info fields. - * An annoStreamDbPslPlus instance may return additional additional columns if configured, but - * these columns will always be present. */ -{ -return asParseText(pslPlusAutoSqlString); -} - -static void asdppSetRegion(struct annoStreamer *sSelf, char *chrom, uint rStart, uint rEnd) -/* Pass setRegion down to internal source. */ -{ -annoStreamerSetRegion(sSelf, chrom, rStart, rEnd); -struct annoStreamDbPslPlus *self = (struct annoStreamDbPslPlus *)sSelf; -self->mySource->setRegion(self->mySource, chrom, rStart, rEnd); -} - -static struct annoRow *asdppNextRow(struct annoStreamer *sSelf, char *minChrom, uint minEnd, - struct lm *lm) -/* Return next psl+ row. */ -{ -struct annoStreamDbPslPlus *self = (struct annoStreamDbPslPlus *)sSelf; -char **ppWords; -lmAllocArray(lm, ppWords, sSelf->numCols); -struct annoRow *ppRow; -boolean rightJoinFail = FALSE; -while ((ppRow = self->mySource->nextRow(self->mySource, minChrom, minEnd, lm)) != NULL) - { - ppWords = ppRow->data; - // If self->idHash is non-NULL, check PSL qName; skip this row if qName not found. - char *qName = ppWords[9]; - if (self->idHash && ! hashLookup(self->idHash, qName)) - continue; - // If there are filters, apply them, otherwise just return aRow. - if (sSelf->filters) - { - boolean fails = annoFilterRowFails(sSelf->filters, ppWords, sSelf->numCols, - &rightJoinFail); - // If this row passes the filter, or fails but is rightJoin, then we're done looking. - if (!fails || rightJoinFail) - break; - } - else - // no filtering to do, just use this row - break; - } -if (ppRow != NULL) - return annoRowFromStringArray(ppRow->chrom, ppRow->start, ppRow->end, rightJoinFail, - ppWords, sSelf->numCols, lm); -else - return NULL; -} - -static void asdppClose(struct annoStreamer **pSSelf) -/* Free up state. */ -{ -if (pSSelf == NULL) - return; -struct annoStreamDbPslPlus *self = *(struct annoStreamDbPslPlus **)pSSelf; -freez(&self->gpTable); -self->mySource->close(&(self->mySource)); -annoStreamerFree(pSSelf); -} - -static struct asColumn *asColumnClone(struct asColumn *colIn) -/* Return a full clone of colIn, or NULL if colIn is NULL. */ -{ -if (colIn == NULL) - return NULL; -if (colIn->obType != NULL || colIn->index != NULL) - errAbort("asColumnClone: support for obType and index not implemented"); -struct asColumn *colOut; -AllocVar(colOut); -colOut->name = cloneString(colIn->name); -colOut->comment = cloneString(colIn->comment); -colOut->lowType = colIn->lowType; // static struct in asParse.c -colOut->obName = cloneString(colIn->obName); -colOut->fixedSize = colIn->fixedSize; -colOut->linkedSizeName = cloneString(colIn->linkedSizeName); -colOut->linkedSize = asColumnClone(colIn->linkedSize); -colOut->isSizeLink = colIn->isSizeLink; -colOut->isList = colIn->isList; -colOut->isArray = colIn->isArray; -colOut->autoIncrement = colIn->autoIncrement; -colOut->values = slNameCloneList(colIn->values); -return colOut; -} - -static void asObjAppendExtraColumns(struct asObject *asObjTarget, struct asObject *asObjSource) -/* If asObjSource has more columns than asObjTarget then clone and append those additional columns - * to asObjTarget. */ -{ -int tColCount = slCount(asObjTarget->columnList); -int sColCount = slCount(asObjSource->columnList); -if (tColCount < 1) - errAbort("asObjAppendExtraColumns: support for empty target columnList not implemented"); -if (sColCount > tColCount) - { - struct asColumn *tCol = asObjTarget->columnList, *sCol = asObjSource->columnList; - int i; - for (i = 0; i < tColCount-1; i++) - { - tCol = tCol->next; - sCol = sCol->next; - } - while (sCol->next != NULL) - { - tCol->next = asColumnClone(sCol->next); - tCol = tCol->next; - sCol = sCol->next; - } - } -} - -struct annoStreamer *annoStreamDbPslPlusNew(struct annoAssembly *aa, char *gpTable, int maxOutRows, - struct jsonElement *extraConfig) -/* Create an annoStreamer (subclass) object that streams PSL, CDS and seqFile info. - * gpTable is a genePred table that has associated PSL, CDS and sequence info - * (i.e. refGene, ncbiRefSeq, ncbiRefSeqCurated or ncbiRefSeqPredicted). */ -{ -char *pslTable = NULL, *configJson = NULL; -if (sameString("refGene", gpTable)) - { - pslTable = "refSeqAli"; - configJson = refSeqAliConfigJson; - } -else if (startsWith("ncbiRefSeq", gpTable)) - { - pslTable = "ncbiRefSeqPsl"; - struct dyString *dy = dyStringCreate(ncbiRefSeqConfigJsonFormat, gpTable); - configJson = dyStringCannibalize(&dy); - } -else - errAbort("annoStreamDbPslPlusNew: unrecognized table \"%s\"", gpTable); -struct annoStreamDbPslPlus *self; -AllocVar(self); -// Get internal streamer for joining PSL with other tables. -struct jsonElement *config = jsonParse(configJson); -if (extraConfig) - jsonObjectMerge(config, extraConfig); -self->mySource = annoStreamDbNew(aa->name, pslTable, aa, maxOutRows, config); -struct asObject *asObj = annoStreamDbPslPlusAsObj(); -if (extraConfig) - asObjAppendExtraColumns(asObj, self->mySource->asObj); -if (startsWith("ncbiRefSeq", gpTable) && differentString("ncbiRefSeq", gpTable)) - { - // Load up an ID hash to restrict PSL query results to the subset in gpTable: - struct sqlConnection *conn = hAllocConn(aa->name); - char query[1024]; - sqlSafef(query, sizeof(query), "select name, 1 from %s", gpTable); - self->idHash = sqlQuickHash(conn, query); - hFreeConn(&conn); - } -// Set up external streamer interface -struct annoStreamer *streamer = &(self->streamer); -annoStreamerInit(streamer, aa, asObj, pslTable); -streamer->rowType = arWords; -self->gpTable = cloneString(gpTable); -// Override methods that need to pass through to internal source: -streamer->setRegion = asdppSetRegion; -streamer->nextRow = asdppNextRow; -streamer->close = asdppClose; -return (struct annoStreamer *)self; -}