Example #1
0
void writeAgpFile(char *chromName, struct agpData *startAgpData, char *filename)
/*
Writes the agp file out for a single chromosome.

param chromName - The name of the chromosome.  Not referenced by this
 function; kept in the signature so existing callers keep working.
param startAgpData - Pointer to the dna gap or fragment at which we are
 starting to write data. The data will include the contents of this gap/frag
 and of every entry reachable from it through ->next.
param filename - The file to write.

If the file cannot be opened the failure is reported on stderr with perror()
and nothing is written.
 */
{
struct agpData *curData = NULL;
FILE *fp = fopen(filename, "w");

if (fp == NULL)
    {
    /* Without this check the output calls below (and fclose) would be
     * handed a NULL stream. */
    perror(filename);
    return;
    }
/*printf("Writing agp file %s for chromo %s\n", filename, startAgpData->data.pGap->chrom);*/

for (curData = startAgpData; curData != NULL; curData = curData->next)
    {
    if (curData->isGap)
        {
        /* Undo the decrement we did earlier.
           This was done in nextAgpEntryToSplitOn() in order
           to be compatible with the 0-based frag addressing scheme.
         */
        curData->data.pGap->chromStart++;
        agpGapOutput(curData->data.pGap, fp, '\t', '\n');
        /* Redo the above decrement - don't want to create side effects */
        curData->data.pGap->chromStart--;
        }
    else
        {
        agpFragOutput(curData->data.pFrag, fp, '\t', '\n');
        }
    }

/* fclose flushes buffered output, so a failure here means lost data. */
if (fclose(fp) != 0)
    perror(filename);
}
void scaffoldFaToAgp(char *scaffoldFile)
/* scaffoldFaToAgp - create AGP file, gap file and lift file
 * from scaffold FA file.
 *
 * The scaffolds are laid end to end on a single pseudo-chromosome named
 * CHROM_NAME, separated by gaps of scaffoldGapSize bases.  Three files are
 * written, named from the directory and root name of scaffoldFile:
 *   <root>.agp - one fragment line per scaffold plus the separating gaps
 *   <root>.gap - gaps found inside scaffolds (those larger than minGapSize)
 *                plus the separating gaps
 *   <root>.lft - one lift line per scaffold and per separating gap
 * The input is read twice: once to total the pseudo-chromosome size (every
 * lift line carries it) and once to generate the output.
 *
 * param scaffoldFile - path of the scaffold fasta file to read. */
{
DNA *scaffoldSeq;
char *name;
int size;
struct agpFrag frag;
struct agpGap scaffoldGap, fragGap;

struct lineFile *lf = lineFileOpen(scaffoldFile, TRUE);
char outDir[256], outFile[128], ext[64], outPath[512];
FILE *agpFile = NULL, *gapFile = NULL, *liftFile = NULL;

int fileNumber = 1;      /* sequence number in AGP file */
int start = 0, end = 0;
int chromSize = 0;
int scaffoldCount = 0;

int fragSize = 0, gapSize = 0;
char *seq;
int seqStart = 0;
boolean allDone = FALSE;

/* determine size of "unordered chromosome" that will be constructed.
 * This is needed for the lift file. */
while (faMixedSpeedReadNext(lf, &scaffoldSeq, &size, &name))
    {
    chromSize += size;
    chromSize += scaffoldGapSize;
    scaffoldCount++;
    }
/* do not need the final useless gap.  With no scaffolds at all there is no
 * gap to take back, and subtracting would report a negative size. */
if (scaffoldCount > 0)
    chromSize -= scaffoldGapSize;
printf("scaffold gap size is %d, total scaffolds: %d\n",
         scaffoldGapSize, scaffoldCount);
printf("chrom size is %d\n", chromSize);

/* initialize fixed fields in AGP frag */
ZeroVar(&frag);
frag.chrom = CHROM_NAME;
frag.type[0] = 'D';   /* draft */
frag.fragStart = 0;   /* always start at beginning of scaffold */
frag.strand[0] = '+';

/* initialize fixed fields in scaffold gap */
ZeroVar(&scaffoldGap);
scaffoldGap.chrom = CHROM_NAME;
scaffoldGap.n[0] = 'N';
scaffoldGap.size = scaffoldGapSize;
scaffoldGap.type = SCAFFOLD_GAP_TYPE;
scaffoldGap.bridge = "no";

/* initialize fixed fields in frag gap */
ZeroVar(&fragGap);
fragGap.chrom = CHROM_NAME;
fragGap.n[0] = 'N';
fragGap.type = FRAGMENT_GAP_TYPE;
fragGap.bridge = "yes";

/* munge file paths; snprintf keeps every write inside outPath */
splitPath(scaffoldFile, outDir, outFile, ext);

snprintf(outPath, sizeof(outPath), "%s%s.agp", outDir, outFile);
agpFile = mustOpen(outPath, "w");
printf("writing %s\n", outPath);

snprintf(outPath, sizeof(outPath), "%s%s.gap", outDir, outFile);
gapFile = mustOpen(outPath, "w");
printf("writing %s\n", outPath);

snprintf(outPath, sizeof(outPath), "%s%s.lft", outDir, outFile);
liftFile = mustOpen(outPath, "w");
printf("writing %s\n", outPath);

/* read in scaffolds from fasta file, and generate
 * the three files */
lineFileSeek(lf, 0, SEEK_SET);
allDone = ! faMixedSpeedReadNext(lf, &scaffoldSeq, &size, &name);
while (! allDone)
    {
    /* start is the 0-based offset of this scaffold on the chromosome */
    end = start + size;

    /* setup AGP frag for the scaffold and write to AGP file */
    frag.frag = name;
    frag.ix = fileNumber++;
    frag.chromStart = start;
    frag.chromEnd = end;
    frag.fragEnd = size;
    agpFragOutput(&frag, agpFile, '\t', '\n');

    /* write lift file entry for this scaffold */
    fprintf(liftFile, "%d\t%s\t%d\t%s\t%d\n",
            start, name, size, CHROM_NAME, chromSize);

    /* write gap file entries for this scaffold.
     * NOTE(review): seqGetGap presumably reports the length of the next
     * non-gap run and of the gap run that follows it - confirm. */
    seq = scaffoldSeq;
    seqStart = start;
    while (seqGetGap(seq, &fragSize, &gapSize))
        {
        if (gapSize > minGapSize)
            {
            fragGap.size = gapSize;
            /* gap records carry a start one past the 0-based offset */
            fragGap.chromStart = seqStart + fragSize + 1;
            fragGap.chromEnd = fragGap.chromStart + gapSize - 1;
            agpGapOutput(&fragGap, gapFile, '\t', '\n');
            }
        seqStart = seqStart + fragSize + gapSize;
        seq = seq + fragSize + gapSize;
        }

    /* setup AGP gap to separate scaffolds and write to AGP and gap files */
    start = end + 1;
    end = start + scaffoldGapSize - 1;

    /* Avoid an extra gap on the end - not needed.  Reading the next
     * scaffold here tells us whether a separator is required at all. */
    allDone = ! faMixedSpeedReadNext(lf, &scaffoldSeq, &size, &name);
    if (allDone)
        break;

    scaffoldGap.ix = fileNumber++;
    scaffoldGap.chromStart = start;
    scaffoldGap.chromEnd = end;
    agpGapOutput(&scaffoldGap, agpFile, '\t', '\n');
    agpGapOutput(&scaffoldGap, gapFile, '\t', '\n');

    /* write lift file entry for this gap (lift offsets are 0-based) */
    fprintf(liftFile, "%d\t%s\t%d\t%s\t%d\n",
            start-1, "gap", scaffoldGapSize, CHROM_NAME, chromSize);

    /* the next scaffold begins where the gap ends */
    start = end;
    }
carefulClose(&agpFile);
carefulClose(&liftFile);
carefulClose(&gapFile);
lineFileClose(&lf);
}
Example #3
0
static void agpSangerUnfinished(char *agpFile, char *contigFasta, char *agpOut)
/* Fix agp to match unfinished contigs in fasta.
 *
 * Reads agpFile line by line, checks that each line starts where the previous
 * line of the same object ended, and writes the result to agpOut.  For a
 * fragment line, the fragment name with everything from the first '.'
 * chopped off is looked up in the hash built by hashFasta(contigFasta); when
 * a contig there covers the fragment's range, the fragment is renamed to that
 * contig and its fragStart/fragEnd are made relative to the contig start.
 * Gap lines are copied through.  Any malformed line aborts via errAbort.
 *
 * param agpFile - input AGP file.
 * param contigFasta - fasta file handed to hashFasta().
 * param agpOut - output AGP file, overwritten. */
{
struct lineFile *lf = lineFileOpen(agpFile, TRUE);
char *line, *words[16];
int lineSize, wordCount;
unsigned lastPos = 0;
struct agpFrag *agp = NULL;
struct agpGap *gap = NULL;
FILE *f;
char *lastObj = NULL;
f = mustOpen(agpOut, "w");
struct hash *hash = hashFasta(contigFasta);

verbose(2,"#\tprocessing AGP file: %s\n", agpFile);
while (lineFileNext(lf, &line, &lineSize))
    {
    if (line[0] == 0 || line[0] == '#' || line[0] == '\n')
        continue;
    wordCount = chopLine(line, words);
    if (wordCount < 5)
        errAbort("Bad line %d of %s: need at least 5 words, got %d\n",
                 lf->lineIx, lf->fileName, wordCount);

    /* a new object restarts the coordinate chain at 0 */
    if (!lastObj || !sameString(words[0],lastObj))
        lastPos = 0;

    if (words[4][0] != 'N')
        {
        lineFileExpectAtLeast(lf, 9, wordCount);
        agp = agpFragLoad(words);
        /* agp is 1-based but agp loaders do not adjust for 0-based: */
        agp->chromStart -= 1;
        agp->fragStart  -= 1;
        if (agp->chromEnd - agp->chromStart != agp->fragEnd - agp->fragStart)
            errAbort("Sizes don't match in %s and %s line %d of %s\n",
                agp->chrom, agp->frag, lf->lineIx, lf->fileName);

        char *root = cloneString(agp->frag);
        chopSuffixAt(root, '.');

        struct hashEl *e, *elist = hashLookup(hash, root);
        for (e = elist; e; e = hashLookupNext(e))
            {
            struct unfinishedContig *u = e->val;
            if ((u->fragStart <= agp->fragStart) && (u->fragEnd >= agp->fragEnd))
                {
                /* release the loaded name before replacing it; agpFragFree
                 * below only sees the replacement */
                freeMem(agp->frag);
                agp->frag = cloneString(u->frag);
                agp->fragEnd -= u->fragStart;
                agp->fragStart -= u->fragStart;
                /* Stop at the first covering contig: the range is now
                 * contig-relative, and testing it against the remaining
                 * contigs could rename or shift it a second time. */
                break;
                }
            }
        freeMem(root);
        }
    else
        {
        lineFileExpectAtLeast(lf, 8, wordCount);
        gap = agpGapLoad(words);
        /* to be consistent with agpFrag */
        gap->chromStart -= 1;
        /* NOTE(review): the check below reads chromStart/chromEnd through an
         * agpFrag pointer; this assumes both structs begin with the same
         * fields - confirm against their declarations. */
        agp = (struct agpFrag*)gap;
        }

    if (agp->chromStart != lastPos)
        errAbort("Start doesn't match previous end line %d of %s\n"
            "agp->chromStart: %u\n" 
            "agp->chromEnd: %u\n" 
            "lastPos: %u\n" 
            ,lf->lineIx, lf->fileName
            ,agp->chromStart
            ,agp->chromEnd
            ,lastPos
            );

    lastPos = agp->chromEnd;
    freez(&lastObj);
    lastObj = cloneString(words[0]); /* not agp->chrom which may be modified already */

    if (words[4][0] != 'N')
        {
        /* agpFragOutput assumes 0-based-half-open, but writes 1-based for agp */
        agpFragOutput(agp, f, '\t', '\n');
        agpFragFree(&agp);
        }
    else
        {
        /* restore back to 1-based for agp 
         * because agpGapOutput doesn't compensate */
        gap->chromStart += 1;
        agpGapOutput(gap, f, '\t', '\n');
        agpGapFree(&gap);
        }
    }

carefulClose(&f);
freez(&lastObj);
/* NOTE(review): lf and hash are not released here - confirm whether the
 * caller relies on process exit for that. */
}
static void agpMergeChromScaf(char *agpFile, char *agpOut, boolean filtering)
/* Create a combined agp file from the chrom.agp and scaffold.agp, 
 *  merging in only scaffolds from scaffold.agp
 *  that are not already in chroms.
 *
 * The function keeps a hash in a static variable so that it survives between
 * calls:
 *  - filtering FALSE: agpOut is overwritten, every line is copied, and the
 *    name of each fragment (with everything from the first '.' chopped off)
 *    is remembered in the hash.
 *  - filtering TRUE: agpOut is appended to, and lines whose object name
 *    (first column) is already in the hash are validated but not written.
 * Coordinate continuity is checked on every line, skipped or not; a
 * malformed line aborts via errAbort.
 *
 * param agpFile - input AGP file.
 * param agpOut - output AGP file.
 * param filtering - selects between the two modes described above. */
{
struct lineFile *lf = lineFileOpen(agpFile, TRUE);
char *line, *words[16];
int lineSize, wordCount;
unsigned lastPos = 0;
struct agpFrag *agp = NULL;
struct agpGap *gap = NULL;
FILE *f;
char *lastObj = NULL;
f = mustOpen(agpOut, filtering ? "a" : "w");
static struct hash *hash = NULL;   /* fragment names seen while not filtering */
boolean skipping = FALSE;

if (!hash)
    hash = hashNew(0);

verbose(2,"#\tprocessing AGP file: %s\n", agpFile);
while (lineFileNext(lf, &line, &lineSize))
    {
    if (line[0] == 0 || line[0] == '#' || line[0] == '\n')
        continue;
    wordCount = chopLine(line, words);
    if (wordCount < 5)
        errAbort("Bad line %d of %s: need at least 5 words, got %d\n",
                 lf->lineIx, lf->fileName, wordCount);

    /* a new object restarts the coordinate chain at 0 */
    if (!lastObj || !sameString(words[0],lastObj))
        lastPos = 0;

    /* when filtering, drop objects already recorded in the hash */
    skipping = FALSE;
    if (filtering)
        {
        if (hashLookup(hash, words[0]))
            skipping = TRUE;
        }

    if (words[4][0] != 'N')
        {
        lineFileExpectAtLeast(lf, 9, wordCount);
        agp = agpFragLoad(words);
        /* agp is 1-based but agp loaders do not adjust for 0-based: */
        agp->chromStart -= 1;
        agp->fragStart  -= 1;
        if (agp->chromEnd - agp->chromStart != agp->fragEnd - agp->fragStart)
            errAbort("Sizes don't match in %s and %s line %d of %s\n",
                agp->chrom, agp->frag, lf->lineIx, lf->fileName);
        if (!filtering)
            {
            /* remember this fragment for a later filtering call */
            char *root = cloneString(agp->frag);
            chopSuffixAt(root, '.');
            hashStore(hash, root);
            freeMem(root);
            }
        }
    else
        {
        lineFileExpectAtLeast(lf, 8, wordCount);
        gap = agpGapLoad(words);
        /* to be consistent with agpFrag */
        gap->chromStart -= 1;
        /* NOTE(review): the check below reads chromStart/chromEnd through an
         * agpFrag pointer; this assumes both structs begin with the same
         * fields - confirm against their declarations. */
        agp = (struct agpFrag*)gap;
        }

    if (agp->chromStart != lastPos)
        errAbort("Start doesn't match previous end line %d of %s\n"
            "agp->chromStart: %u\n" 
            "agp->chromEnd: %u\n" 
            "lastPos: %u\n" 
            ,lf->lineIx, lf->fileName
            ,agp->chromStart
            ,agp->chromEnd
            ,lastPos
            );

    lastPos = agp->chromEnd;
    freez(&lastObj);
    lastObj = cloneString(words[0]); /* not agp->chrom which may be modified already */

    if (words[4][0] != 'N')
        {
        /* agpFragOutput assumes 0-based-half-open, but writes 1-based for agp */
        if (!skipping)
            agpFragOutput(agp, f, '\t', '\n');
        agpFragFree(&agp);
        }
    else
        {
        /* restore back to 1-based for agp 
         * because agpGapOutput doesn't compensate */
        gap->chromStart += 1;
        if (!skipping)
            agpGapOutput(gap, f, '\t', '\n');
        agpGapFree(&gap);
        }
    }

carefulClose(&f);
freez(&lastObj);
/* NOTE(review): lf is not closed here - confirm whether the caller relies
 * on process exit for that. */
}