4898794edd81be5285ea6e544acbedeaeb31bf78 max Tue Nov 23 08:10:57 2021 -0800 Fixing pointers to README file for license in all source code files. refs #27614 diff --git src/hg/makeDb/hgLoadSeq/hgLoadSeq.c src/hg/makeDb/hgLoadSeq/hgLoadSeq.c index 059125b..a1820bb 100644 --- src/hg/makeDb/hgLoadSeq/hgLoadSeq.c +++ src/hg/makeDb/hgLoadSeq/hgLoadSeq.c @@ -1,251 +1,251 @@ /* hgLoadSeq - load sequences into the seq/extFile tables. */ /* Copyright (C) 2013 The Regents of the University of California - * See README in this or parent directory for licensing information. */ + * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "options.h" #include "portable.h" #include "linefile.h" #include "hash.h" #include "fa.h" #include "hgRelate.h" /* command line option specifications */ static struct optionSpec optionSpecs[] = { {"abbr", OPTION_STRING}, {"prefix", OPTION_STRING}, {"replace", OPTION_BOOLEAN}, {"drop", OPTION_BOOLEAN}, {"test", OPTION_BOOLEAN}, {"seqTbl", OPTION_STRING}, {"extFileTbl", OPTION_STRING}, {NULL, 0} }; /* Command line options and defaults. */ char *seqTbl = "seq"; char *extFileTbl = "extFile"; char *abbr = NULL; char *prefix = NULL; boolean test = FALSE; boolean replace = FALSE; boolean drop = FALSE; char seqTableCreate[] = /* This keeps track of a sequence. */ "create table %s (" "id int unsigned not null primary key," /* Unique ID across all tables. */ "acc varchar(128) not null ," /* seq ID. */ "size int unsigned not null," /* Size of sequence in bases. */ "gb_date date not null," /* GenBank last modified date, * not used, for compatbility with older databases. */ "extFile int unsigned not null," /* File it is in. */ "file_offset bigint not null," /* Offset in file. */ "file_size int unsigned not null," /* Size in file. */ /* Extra indices. */ "unique (acc))"; boolean faSeekNextRecord(struct lineFile *faLf) /* Seeks to the next FA record. Returns FALSE if seeks to EOF. */ { char *faLine; int faLineSize; while (lineFileNext(faLf, &faLine, &faLineSize)) { if (faLine[0] == '>') return TRUE; } return FALSE; } void abbreviate(char *s, char *fluff) /* Cut out fluff from s. */ { int len; if (s != NULL && fluff != NULL) { s = strstr(s, fluff); if (s != NULL) { len = strlen(fluff); strcpy(s, s+len); } } } boolean readFaSeq(struct lineFile *faLf, char **retFaName, int *retDnaSize, off_t *retFaOffset) /* Read the next record, returning it's start location in the file */ { // to get offset, must read first line, save offset, then read the record to // get the size char *faLine; if (!lineFileNext(faLf, &faLine, NULL)) return FALSE; if (faLine[0] != '>') errAbort("fasta record doesn't start with '>' line %d of %s", faLf->lineIx, faLf->fileName); *retFaOffset = faLf->bufOffsetInFile + faLf->lineStart; lineFileReuse(faLf); DNA *dna; boolean gotIt = faMixedSpeedReadNext(faLf, &dna, retDnaSize, retFaName); if (!gotIt) internalErr(); return TRUE; } boolean loadFaSeq(struct lineFile *faLf, HGID extFileId, HGID seqId, FILE *seqTab) /* Add next sequence in fasta file to tab file */ { off_t faOffset, faEndOffset; int faSize, dnaSize; char *faName, faAcc[256], faAccBuf[513]; int prefixLen = 0; /* Get next FA record. */ if (!readFaSeq(faLf, &faName, &dnaSize, &faOffset)) return FALSE; char *s = firstWordInLine(faName); abbreviate(s, abbr); if (strlen(s) == 0) errAbort("Missing accession line %d of %s", faLf->lineIx, faLf->fileName); if (prefix != NULL) prefixLen = strlen(prefix) + 1; if (strlen(faName+1) + prefixLen >= sizeof(faAcc)) errAbort("Fasta name too long line %d of %s", faLf->lineIx, faLf->fileName); faAcc[0] = 0; if (prefix != NULL) { safecat(faAcc, sizeof(faAcc), prefix); safecat(faAcc, sizeof(faAcc), "-"); } strcat(faAcc, s); faEndOffset = faLf->bufOffsetInFile + faLf->lineStart; faSize = (int)(faEndOffset - faOffset); /* note: sqlDate column is empty */ fprintf(seqTab, "%u\t%s\t%d\t0000-00-00\t%u\t%lld\t%d\n", seqId, sqlEscapeTabFileString2(faAccBuf, faAcc), dnaSize, extFileId, (unsigned long long)faOffset, faSize); return TRUE; } void loadFa(char *faFile, struct sqlConnection *conn, FILE *seqTab, HGID *nextSeqId) /* Add sequences in a fasta file to a seq table tab file */ { /* Check if the faFile is already in the extFileTbl and inform the user.*/ char query[1024]; sqlSafef(query, sizeof(query), "(select * from %s where name='%s'",extFileTbl,faFile); if ((!test) && (!replace) && (sqlGetResultExt(conn, query, NULL, NULL) != NULL)) errAbort("The file %s already has an entry in %s. To replace the existing entry rerun with the " "-replace option.", faFile, extFileTbl); HGID extFileId = test ? 0 : hgAddToExtFileTbl(faFile, conn, extFileTbl); struct lineFile *faLf = lineFileOpen(faFile, TRUE); unsigned count = 0; verbose(1, "Adding %s\n", faFile); /* Seek to first line starting with '>' in line file. */ if (!faSeekNextRecord(faLf)) errAbort("%s doesn't appear to be an .fa file\n", faLf->fileName); lineFileReuse(faLf); /* Loop around for each record of FA */ while (loadFaSeq(faLf, extFileId, *nextSeqId, seqTab)) { (*nextSeqId)++; count++; } verbose(1, "%u sequences\n", count); lineFileClose(&faLf); } void hgLoadSeq(char *database, int fileCount, char *fileNames[]) /* Add a bunch of FA files to sequence and extFile tables of * database. */ { struct sqlConnection *conn; int i; FILE *seqTab; HGID firstSeqId = 0, nextSeqId = 0; if (!test) { conn = hgStartUpdate(database); char query[1024]; if (drop) { sqlSafef(query, sizeof(query), "drop table if exists %s", seqTbl); sqlUpdate(conn, query); sqlSafef(query, sizeof(query), "drop table if exists %s", extFileTbl); sqlUpdate(conn, query); } sqlSafef(query, sizeof(query), seqTableCreate, seqTbl); // This aught to catch the duplicate table entry sqlMaybeMakeTable(conn, seqTbl, query); firstSeqId = nextSeqId = hgGetMaxId(conn, seqTbl) + 1; } verbose(1, "Creating %s.tab file\n", seqTbl); seqTab = hgCreateTabFile(".", seqTbl); for (i=0; i<fileCount; ++i) { loadFa(fileNames[i], conn, seqTab, &nextSeqId); } if (!test) { unsigned opts = 0; if (replace) opts |= SQL_TAB_REPLACE; verbose(1, "Updating %s table\n", seqTbl); hgLoadTabFileOpts(conn, ".", seqTbl, opts, &seqTab); hgEndUpdate(&conn, firstSeqId, nextSeqId-1, "Add sequences to %s from %d files starting with %s", seqTbl, fileCount, fileNames[0]); verbose(1, "All done\n"); } } void usage() /* Explain usage and exit. */ { errAbort( "hgLoadSeq - load browser database with sequence file info.\n" "usage:\n" " hgLoadSeq [-abbr=junk] database file(s).fa\n" "This loads sequence file info only, it is not used for genbank data.\n" "\n" "Options:\n" " -abbr=junk - remove junk from the start of each seq accession\n" " -prefix=xxx - prepend \"xxx-\" to each seq accession\n" " -replace - replace existing sequences with the same id\n" " -seqTbl=tbl - use this table instead of seq\n" " -extFileTbl=tbl - use this table instead of extFile\n" " -test - do not load database table\n" " -drop - drop tables before loading, can only use if -seqTbl and -extFileTbl\n" " are specified. \n" ); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, optionSpecs); if (argc < 2) usage(); if ((optionExists("seqTbl") && !optionExists("extFileTbl")) || (!optionExists("seqTbl") && optionExists("extFileTbl"))) errAbort("must specified both or neither of -seqTbl and -extFileTbl"); seqTbl = optionVal("seqTbl", seqTbl); extFileTbl = optionVal("extFileTbl", extFileTbl); abbr = optionVal("abbr", NULL); prefix = optionVal("prefix", NULL); replace = optionExists("replace"); test = optionExists("test"); drop = optionExists("drop"); if (drop && !optionExists("seqTbl")) errAbort("can only specify -drop with -seqTbl and -extFileTbl"); hgLoadSeq(argv[1], argc-2, argv+2); return 0; }