fcfb73cd2dc944a0a10ef34699dcb0c185ea8e6a
max
  Wed May 26 05:12:50 2021 -0700
Trying to give users a clearer idea how many licenses this repo
contains, what is under which license and who to contact to get the
license. refs #27614

diff --git src/hg/utils/scaffoldFaToAgp/scaffoldFaToAgp.c src/hg/utils/scaffoldFaToAgp/scaffoldFaToAgp.c
new file mode 100644
index 0000000..e2ca8e4
--- /dev/null
+++ src/hg/utils/scaffoldFaToAgp/scaffoldFaToAgp.c
@@ -0,0 +1,213 @@
+/* Copyright (C) 2011 The Regents of the University of California 
+ * See README in this or parent directory for licensing information. */
+
+/* scaffoldFaToAgp - read a file containing FA scaffold records and generate */
+/*                   an AGP file, gap file and a lift file */
+/* This utility is used when working with genomes that consist
+ * of scaffolds with no chromosome mapping (e.g. Fugu).
+ * The AGP file contains scaffolds and inter-scaffold gaps.
+ * This file is used to generate the assembly ("gold") track.
+ * The "gap" file contains gaps within scaffolds as well
+ * as inter-scaffold gaps.  This file is used to generate the "gap" track.
+ * It is formatted similarly to an AGP, but only contains gap entries.
+ * The lift file is associated with the AGP file.  It is used to
+ * lift scaffold coordinates to chromosome coordinates.
+ */
+
+#include "common.h"
+#include "fa.h"
+#include "options.h"
+#include "../../hg/inc/agpFrag.h"
+#include "../../hg/inc/agpGap.h"
+
+
+#define SCAFFOLD_GAP_TYPE "contig"   
+#define FRAGMENT_GAP_TYPE "frag"        /* within scaffolds (bridged) */
+#define CHROM_NAME "chrUn"
+
+int minGapSize = 5;
+int scaffoldGapSize = 1000;
+
+void usage()
+/* Print usage instructions and exit. */
+{
+errAbort("scaffoldFaToAgp - generate an AGP file, gap file, and lift file from a scaffold FA file.\n"
+     "usage:\n"
+     "    scaffoldFaToAgp source.fa\n"
+     "options:\n"
+     "      -minGapSize   Minimum threshold for calling a block of Ns a gap.\n"
+     "      -scaffoldGapSize Size of gaps of Ns to be inserted between scaffolds.\n"
+     "The resulting files will be source.{agp,gap,lft}\n"
+     "Note: unless otherwise specified, gaps of 1000 bases are inserted \n"
+     "between scaffold records by default as contig gaps in the .agp file.\n"
+     "  N's within scaffolds are represented as\n"
+     "   frag gaps in the .gap file only\n");
+}
+
+boolean seqGetGap(DNA *seq, int *retSize, int *retGapSize)
+/* determine size of ungapped sequence, and size of gap following it */
+{
+int seqSize = 0, gapSize = 0;
+
+for (; *seq && (*seq != 'n' && *seq != 'N'); seq++, seqSize++);
+if (retSize)
+    *retSize = seqSize;
+for (; *seq && (*seq == 'n' || *seq == 'N'); seq++, gapSize++);
+if (retGapSize)
+    *retGapSize = gapSize;
+return (seqSize ? TRUE : FALSE);
+}
+
+void scaffoldFaToAgp(char *scaffoldFile)
+/* scaffoldFaToAgp - create AGP file, gap file and lift file 
+* from scaffold FA file */
+{
+DNA *scaffoldSeq;
+char *name;
+int size;
+struct agpFrag frag;
+struct agpGap scaffoldGap, fragGap;
+
+struct lineFile *lf = lineFileOpen(scaffoldFile, TRUE);
+char outDir[256], outFile[128], ext[64], outPath[512];
+FILE *agpFile = NULL, *gapFile = NULL, *liftFile = NULL;
+
+int fileNumber = 1;      /* sequence number in AGP file */
+int start = 0, end = 0;
+int chromSize = 0;
+int scaffoldCount = 0;
+
+int fragSize = 0, gapSize = 0;
+char *seq;
+int seqStart = 0;
+
+/* determine size of "unordered chromosome" that will be constructed.
+ * This is needed for the lift file. */
+while (faMixedSpeedReadNext(lf, &scaffoldSeq, &size, &name))
+    {
+    chromSize += size;
+    chromSize += scaffoldGapSize;
+    scaffoldCount++;
+    }
+/* do not need the final useless gap */
+chromSize -= scaffoldGapSize;
+printf("scaffold gap size is %d, total scaffolds: %d\n",
+         scaffoldGapSize, scaffoldCount);
+printf("chrom size is %d\n", chromSize);
+
+/* initialize fixed fields in AGP frag */
+ZeroVar(&frag);
+frag.chrom = CHROM_NAME;
+frag.type[0] = 'D';   /* draft */
+frag.fragStart = 0;   /* always start at beginning of scaffold */
+frag.strand[0] = '+';
+
+/* initialize fixed fields in scaffold gap */
+ZeroVar(&scaffoldGap);
+scaffoldGap.chrom = CHROM_NAME;
+scaffoldGap.n[0] = 'N';
+scaffoldGap.size = scaffoldGapSize;
+scaffoldGap.type = SCAFFOLD_GAP_TYPE;
+scaffoldGap.bridge = "no";
+
+/* initialize fixed fields in frag gap */
+ZeroVar(&fragGap);
+fragGap.chrom = CHROM_NAME;
+fragGap.n[0] = 'N';
+fragGap.type = FRAGMENT_GAP_TYPE;
+fragGap.bridge = "yes";
+
+/* munge file paths */
+splitPath(scaffoldFile, outDir, outFile, ext);
+
+sprintf(outPath, "%s%s.agp", outDir, outFile);
+agpFile = mustOpen(outPath, "w");
+printf("writing %s\n", outPath);
+
+sprintf(outPath, "%s%s.gap", outDir, outFile);
+gapFile = mustOpen(outPath, "w");
+printf("writing %s\n", outPath);
+
+sprintf(outPath, "%s%s.lft", outDir, outFile);
+liftFile = mustOpen(outPath, "w");
+printf("writing %s\n", outPath);
+
+/* read in scaffolds from fasta file, and generate
+ * the three files */
+lineFileSeek(lf, 0, SEEK_SET);
+boolean allDone = FALSE;
+allDone = ! faMixedSpeedReadNext(lf, &scaffoldSeq, &size, &name);
+while (! allDone)
+    {
+    end = start + size;
+
+    /* setup AGP frag for the scaffold and write to AGP file */
+    frag.frag = name;
+    frag.ix = fileNumber++;
+    frag.chromStart = start;
+    frag.chromEnd = end;
+    frag.fragEnd = size;
+    agpFragOutput(&frag, agpFile, '\t', '\n');
+
+    /* write lift file entry for this scaffold */
+    fprintf(liftFile, "%d\t%s\t%d\t%s\t%d\n",
+            start, name, size, CHROM_NAME, chromSize);
+
+    /* write gap file entries for this scaffold */
+    seq = scaffoldSeq;
+    seqStart = start;
+    while (seqGetGap(seq, &fragSize, &gapSize))
+        {
+        if (gapSize > minGapSize)
+            {
+            fragGap.size = gapSize;
+            fragGap.chromStart = seqStart + fragSize + 1;
+            fragGap.chromEnd = fragGap.chromStart + gapSize - 1;
+            agpGapOutput(&fragGap, gapFile, '\t', '\n');
+            }
+        seqStart = seqStart + fragSize + gapSize;
+        seq = seq + fragSize + gapSize;
+        }
+
+    /* setup AGP gap to separate scaffolds and write to AGP and gap files */
+    /* Note: may want to suppress final gap -- not needed as separator */
+    start = end + 1;
+    end = start + scaffoldGapSize - 1;
+
+    /*	Avoid an extra gap on the end - not needed */
+    allDone = ! faMixedSpeedReadNext(lf, &scaffoldSeq, &size, &name);
+    if (allDone)
+	break;
+
+    scaffoldGap.ix = fileNumber++;
+    scaffoldGap.chromStart = start;
+    scaffoldGap.chromEnd = end;
+    agpGapOutput(&scaffoldGap, agpFile, '\t', '\n');
+    agpGapOutput(&scaffoldGap, gapFile, '\t', '\n');
+
+    /* write lift file entry for this gap */
+    fprintf(liftFile, "%d\t%s\t%d\t%s\t%d\n",
+            start-1, "gap", scaffoldGapSize, CHROM_NAME, chromSize);
+
+    start = end;
+
+    //freeMem(seq);
+    }
+carefulClose(&agpFile);
+carefulClose(&liftFile);
+carefulClose(&gapFile);
+lineFileClose(&lf);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionHash(&argc, argv);
+minGapSize = optionInt("minGapSize", minGapSize);
+scaffoldGapSize = optionInt("scaffoldGapSize", scaffoldGapSize);
+
+if (argc != 2)
+    usage();
+scaffoldFaToAgp(argv[1]);
+return 0;
+}