void splitNcbiFa(char *ncbiIn, char *outDir)
/* splitNcbiFa - Split up NCBI format fa file into UCSC formatted ones.
 *
 * ncbiIn - fa file whose record names have exactly five '|' separated fields.
 * outDir - directory passed to makeDir() and filled with one .fa per record.
 *
 * For each record the fourth '|' field is taken as the sequence name.  The
 * record is written to outDir/<fourth field after chopSuffix()>.fa under the
 * sequence name <fourth field>_1.  Aborts if a record name does not have
 * five fields or is too long for the local name/path buffers. */
{
struct lineFile *lf = lineFileOpen(ncbiIn, TRUE);
static struct dnaSeq seq;
ZeroVar(&seq);

makeDir(outDir);
while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
    {
    char fileName[512];
    char *row[5];
    int wordCount;
    char ourName[129];
    char cloneName[128];
    int len;

    wordCount = chopByChar(seq.name, '|', row, ArraySize(row));
    if (wordCount != 5)
        errAbort("Expecting 5 | separated fields line %d of %s", lf->lineIx, lf->fileName);

    /* The name comes straight from the input file, so every copy below is
     * bounded and checked rather than trusted to fit. */
    len = snprintf(cloneName, sizeof(cloneName), "%s", row[3]);
    if (len < 0 || len >= (int)sizeof(cloneName))
        errAbort("Sequence name too long line %d of %s", lf->lineIx, lf->fileName);
    chopSuffix(cloneName);

    len = snprintf(fileName, sizeof(fileName), "%s/%s.fa", outDir, cloneName);
    if (len < 0 || len >= (int)sizeof(fileName))
        errAbort("Output path too long line %d of %s", lf->lineIx, lf->fileName);

    len = snprintf(ourName, sizeof(ourName), "%s_1", row[3]);
    if (len < 0 || len >= (int)sizeof(ourName))
        errAbort("Sequence name too long line %d of %s", lf->lineIx, lf->fileName);

    faWrite(fileName, ourName, seq.dna, seq.size);
    }
/* NOTE(review): lf is not closed before returning; confirm whether callers
 * rely on process exit to release it. */
}
/* Example #2 */
int main(int argc, char *argv[])
/* Split the sequence returned by faReadDna() into pieces of at most maxSize
 * bases.
 *
 * argv[1] - source fa file.
 * argv[2] - maximum piece size; must parse to a value of at least 1.
 * argv[3] - root of the output file names.
 *
 * Piece N (counting from 1) is written to <root>NN.fa under the sequence
 * name <original name>.N.  Returns 0 on success, 1 if an output file name or
 * sequence name does not fit its buffer.  Wrong argument count or a bad
 * maxSize goes to usage(). */
{
char *sourceName, *destRootName;
int maxSize;
char destName[512];
char faName[512];
int destIx;
int size, start;
int len;
struct dnaSeq *seq;


if (argc != 4)
    usage();
sourceName = argv[1];
maxSize = atoi(argv[2]);
if (maxSize < 1)
    usage();
destRootName = argv[3];
printf("reading %s\n", sourceName);
seq = faReadDna(sourceName);
for (start = 0, destIx = 1; start < seq->size; start += size, ++destIx)
    {
    size = seq->size - start;
    if (size > maxSize)
	size = maxSize;

    /* Both names are built from caller/file supplied text, so format them
     * with a bound and stop instead of overrunning the buffers. */
    len = snprintf(destName, sizeof(destName), "%s%02d.fa", destRootName, destIx);
    if (len < 0 || len >= (int)sizeof(destName))
	{
	fprintf(stderr, "output file name too long: %s%02d.fa\n", destRootName, destIx);
	return 1;
	}
    len = snprintf(faName, sizeof(faName), "%s.%d", seq->name, destIx);
    if (len < 0 || len >= (int)sizeof(faName))
	{
	fprintf(stderr, "sequence name too long: %s.%d\n", seq->name, destIx);
	return 1;
	}
    printf("writing %s\n", destName);
    faWrite(destName, faName, seq->dna+start, size);
    }
return 0;
}
/* Example #3 */
void snpMaskChrom(char *tableName, char *nibFile, char *outFile)
/* snpMaskChrom - Load the sequence in nibFile, replace the base at every SNP
 * position read from tableName (for the global chromName) with the code
 * returned by iupac(), and write the result to outFile as fasta.  A base that
 * was lower case before substitution is written back in lower case. */
{
struct dnaSeq *seq = nibLoadAllMasked(NIB_MASK_MIXED, nibFile);
char *dna = seq->dna;
struct snpSimple *snpList = readSnpsFromChrom(tableName, chromName);
struct snpSimple *cur = snpList;

while (cur != NULL)
    {
    char *base = &dna[cur->chromStart];
    /* Record the original case before the base is overwritten. */
    boolean wasLower = (islower(*base) != 0);

    *base = iupac(cur->name, cur->observed, *base);
    if (wasLower)
        *base = tolower(*base);
    cur = cur->next;
    }

faWrite(outFile, chromName, seq->dna, seq->size);
snpSimpleFreeList(&snpList);
dnaSeqFree(&seq);
}
void writeChromFaFile(char *chromName, char *dna, int dnaSize, char *destDir)
/*
Writes the contents of a single chromosome out to a file in FASTA format.
The file is named destDir/chromName.fa.  If that path does not fit in
DEFAULT_PATH_SIZE bytes, an error is reported on stderr and nothing is written.

param chromName - The name of the chromosome for which we are writing
 the fa file.
param dna - Pointer to the dna array.
param dnaSize - The size of the dna array.
param destDir - The directory in which the fa file is created.
 */
{
char filename [DEFAULT_PATH_SIZE];
int len = snprintf(filename, sizeof(filename), "%s/%s.fa", destDir, chromName);

/* A truncated path would name the wrong file, so refuse to write at all. */
if (len < 0 || len >= (int)sizeof(filename))
    {
    fprintf(stderr, "writeChromFaFile: path too long: %s/%s.fa\n", destDir, chromName);
    return;
    }
printf("Writing fa file %s for chromosome %s\n", filename, chromName);
faWrite(filename, chromName, dna, dnaSize);
}
/* Example #5 */
void snpMask(char *nibFile, char *outFile)
/* snpMask - Load the sequence in nibFile and replace the base at each SNP
 * position for the global chromName with the code returned by iupac().
 * Depending on the global flags it then prints each SNP (printSnps), writes
 * the substituted sequence to outFile (printChrom) and prints genes
 * (printGenes). */
{
struct dnaSeq *seq = nibLoadAllMasked(NIB_MASK_MIXED, nibFile);
char *dna = seq->dna;
struct snp *snpList = readSnpsFromChrom(chromName);
struct snp *cur;

printf("got all snps in %s\n", chromName);

/* First pass: apply every substitution. */
cur = snpList;
while (cur != NULL)
    {
    dna[cur->chromStart] = iupac(cur->name, cur->observed, dna[cur->chromStart]);
    cur = cur->next;
    }

/* Second pass, kept separate so printing sees the fully substituted sequence. */
if (printSnps)
    {
    cur = snpList;
    while (cur != NULL)
        {
        printSnpSeq(cur, seq);
        cur = cur->next;
        }
    }

if (printChrom)
    {
    faWrite(outFile, chromName, seq->dna, seq->size);
    }

snpFreeList(&snpList);

if (printGenes)
    {
    doPrintGenes(chromName, seq);
    }

dnaSeqFree(&seq);
}
void writeSuperContigFaFile(DNA *dna, struct agpData *startData, struct agpData *endData, char *filename, int sequenceNum)
/*
Creates a fasta file containing the contents of a supercontig in FASTA format.
The bases written start at dna[chromStart of startData] and run up to but not
including dna[chromEnd of endData].  The sequence is named
"<chrom>_<sequenceNum> <start>-<end>".  If that name does not fit in BUF_SIZE
bytes, an error is reported on stderr and nothing is written.

param dna - Pointer to the dna array.
param startData - Pointer to the dna gap or fragment at which we are starting to
 write data. The data will include the contents of this gap/frag.
param endData - Pointer to the dna gap or fragment at which we are stopping to
 write data. The data will include the contents of this gap/frag.
param filename - The file name to which to write.
param sequenceNum - The 1-based number of this clone supercontig in the chromosome.
 */
{
int startOffset = startData->data.pGap->chromStart;
int endOffset = endData->data.pGap->chromEnd;
int dnaSize = endOffset - startOffset;
char sequenceName[BUF_SIZE];
int len;

printf("Writing supercontig fa file %s\n", filename);
len = snprintf(sequenceName, sizeof(sequenceName), "%s_%d %d-%d",
	startData->data.pGap->chrom, sequenceNum, startOffset, endOffset);
/* Bounded formatting: the chrom name comes from input data. */
if (len < 0 || len >= (int)sizeof(sequenceName))
    {
    fprintf(stderr, "writeSuperContigFaFile: sequence name too long for %s\n", filename);
    return;
    }
faWrite(filename, sequenceName, &dna[startOffset], dnaSize);
}
void gsBig(char *faName, char *gtfName, 
	   char *suboptName, 
	   char *transName,
	   char *exeName, 
	   char *parName,
	   char *tmpDirName)
/* gsBig - Run Genscan on big input and produce GTF files.
 *
 * faName     - input fa file; every record in it is processed.
 * gtfName    - output file opened for writing and passed to writeSeg().
 * suboptName - if not NULL, opened for writing and passed to writeSeg(); also
 *              adds " -subopt" to the command line.
 * transName  - if not NULL, opened for writing and passed to writeSeg().
 * exeName, parName, tmpDirName - if not NULL, replace the globals exePath,
 *              parPath and tmpDir.
 *
 * With the "prerun" option the named file is parsed directly instead of
 * running the program.  Otherwise each sequence is cut into chunks of at most
 * winSize bases starting every stepSize bases; each chunk that is not all N
 * is written to a temp fa file, the command "exePath parPath tempFa" is run
 * with its output redirected to a temp file, and the parsed segments are
 * merged and written per sequence.  Each chunk's temp files are removed as
 * soon as they have been parsed unless the "noRemove" option is given. */
{
struct dnaSeq seq;
struct lineFile *lf = lineFileOpen(faName, TRUE);
FILE *gtfFile = mustOpen(gtfName, "w");
FILE *subFile = NULL;
FILE *transFile = NULL;
ZeroVar(&seq);

if (suboptName != NULL)
    subFile = mustOpen(suboptName, "w");
if (transName != NULL)
    transFile = mustOpen(transName, "w");
if (exeName != NULL)
    exePath = cloneString(exeName);
if (parName != NULL)
    parPath = cloneString(parName);
if (tmpDirName != NULL)
    tmpDir = cloneString(tmpDirName);

if (optionExists("prerun"))
    {
    char *preFileName = optionVal("prerun", NULL);
    char seqName[128];
    struct segment *seg = parseSegment(preFileName, 0, 100000000, seqName);
    writeSeg(seqName, seg, gtfFile, subFile, transFile);
    }
else
    {
    struct dyString *dy = newDyString(1024);
    char tempFa[512], tempGs[512];
    char dir1[256], root1[128], ext1[64];
    int myPid = (int)getpid();
    boolean removeTemp = !optionExists("noRemove");

    /* NOTE(review): dir1/root1/ext1 are not used below; the call is kept so
     * any checking splitPath() does on faName is unchanged. */
    splitPath(faName, dir1, root1, ext1);
    while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
	{
	int offset, sizeOne;
	struct segment *segList = NULL, *seg;
	char *seqName = cloneString(seq.name);
	int chunkNum = 0;

	for (offset = 0; offset < seq.size; offset += stepSize)
	    {
	    boolean allN = TRUE;
	    int i;
	    /* Temp names include pid, sequence name and chunk number, so
	     * every chunk gets its own pair of files. */
	    safef(tempFa, sizeof(tempFa), "%s/temp_gsBig_%d_%s_%d.fa",
		  tmpDir, myPid, seqName, chunkNum);
	    safef(tempGs, sizeof(tempGs), "%s/temp_gsBig_%d_%s_%d.genscan",
		  tmpDir, myPid, seqName, chunkNum);
	    sizeOne = seq.size - offset;
	    if (sizeOne > winSize) sizeOne = winSize;
	    /* Genscan hangs forever if a chunk is all-N's... if so, 
	     * then skip this chunk. */
	    for (i=offset;  i < (offset+sizeOne);  i++)
		{
		if (seq.dna[i] != 'N' && seq.dna[i] != 'n')
		    {
		    allN = FALSE;
		    break;
		    }
		}
	    if (allN)
		{
		printf("\ngsBig: skipping %s[%d:%d] -- it's all N's.\n\n",
		       seqName, offset, (offset+sizeOne-1));
		}
	    else
		{
		faWrite(tempFa, "split", seq.dna + offset, sizeOne); 
		dyStringClear(dy);
		dyStringPrintf(dy, "%s %s %s", exePath, parPath, tempFa);
		if (suboptName != NULL)
		    dyStringPrintf(dy, " -subopt");
		dyStringPrintf(dy, " > %s", tempGs);
		verbose(3, "%s\n", dy->string);
		mustSystem(dy->string);
		seg = parseSegment(tempGs, offset, offset+sizeOne, NULL);
		slAddHead(&segList, seg);
		/* Remove this chunk's files now.  Doing it once after all
		 * loops only removed the last chunk's pair and left every
		 * earlier pair behind in tmpDir. */
		if (removeTemp)
		    {
		    remove(tempFa);
		    remove(tempGs);
		    }
		}
	    chunkNum++;
	    }
	slReverse(&segList);
	seg = mergeSegs(segList);
	writeSeg(seqName, seg, gtfFile, subFile, transFile);
	freez(&seqName);
	}
    }
}
/* Example #8 */
void fakeFinContigs(char *agpName, char *faName, char *finDir, char *rootName, char *finFaDir, char *ooVer)
/* fakeFinContigs - Fake up contigs for a finished chromosome.
 *
 * agpName  - AGP file describing the chromosome.
 * faName   - fa file that must contain exactly one sequence.
 * finDir   - directory under which contig directories and lift/ are made.
 * rootName - prefix for contig names; contigs are named rootName1, rootName2...
 * finFaDir - not used by this function.
 * ooVer    - version string used in the .gl, barge and gold file names.
 *
 * Consecutive AGP lines whose fifth column does not start with 'N' or 'U'
 * are grouped into one contig; an 'N'/'U' line ends the current contig.  For
 * each contig the directory finDir/<contig> is created and filled with
 * <contig>.agp, ooGreedy.<ooVer>.gl, <contig>.fa, barge.<ooVer> and
 * gold.<ooVer>.  Finally finDir/lift/oOut.lst, ordered.lst and ordered.lft
 * are written.  Aborts via errAbort() on malformed input.
 *
 * NOTE(review): paths and names are built with unchecked sprintf()/strcpy()
 * into fixed-size buffers; confirm inputs are bounded or switch to a checked
 * formatter. */
{
struct contig *contigList = NULL, *contig = NULL;
struct agpFrag *agp;
struct lineFile *lf = lineFileOpen(agpName, TRUE);
char *line, *words[16];
int lineSize, wordCount;
int contigIx = 0;
char liftDir[512], contigDir[512], path[512];
char chrom[128];
FILE *f;
struct dnaSeq *seq;
int fragIx;

/* Build up contig list by scanning agp file. */
printf("Reading %s\n", lf->fileName);
while (lineFileNext(lf, &line, &lineSize))
    {
    /* Skip comment lines and empty lines. */
    if (line[0] == '#' || line[0] == 0)
        continue;
    wordCount = chopLine(line, words);
    if (wordCount < 5)
        errAbort("Expecting at least 5 words line %d of %s", lf->lineIx, lf->fileName);
    /* An 'N' or 'U' line ends the current contig; the next other line
     * starts a new one. */
    if (words[4][0] == 'N' || words[4][0] == 'U')
	{
        contig = NULL;
        continue;
	}
    lineFileExpectWords(lf, 9, wordCount);
    agp = agpFragLoad(words);
    // file is 1-based but agpFragLoad() now assumes 0-based:
    agp->chromStart -= 1;
    agp->fragStart  -= 1;
    if (contig == NULL)
	{
	/* First fragment of a new contig: allocate it, name it and record
	 * where it starts on the chromosome. */
        AllocVar(contig);
	sprintf(contig->name, "%s%d", rootName, ++contigIx);
	contig->startOffset = agp->chromStart;
	slAddHead(&contigList, contig);
	}
    else 
        {
	/* agpList is built head-first, so its head is the previous fragment;
	 * within a contig each fragment must start where that one ended. */
	if (contig->agpList != NULL && contig->agpList->chromEnd != agp->chromStart)
	    errAbort("Start doesn't match previous end line %d of %s", 
	    	lf->lineIx, lf->fileName);
	}
    if (agp->chromEnd - agp->chromStart != agp->fragEnd - agp->fragStart)
        errAbort("Chrom and frag size mismatch line %d of %s", lf->lineIx, lf->fileName);
    slAddHead(&contig->agpList, agp);
    contig->endOffset = agp->chromEnd;
    }
/* Lists were built by adding at the head; put them back in file order. */
slReverse(&contigList);
for (contig = contigList; contig != NULL; contig = contig->next)
    slReverse(&contig->agpList);
lineFileClose(&lf);

/* Load up chromosome sequence and make sure it is in one piece. */
printf("Reading %s\n", faName);
seq = faReadAllDna(faName);
if (slCount(seq) != 1)
    errAbort("Got %d sequences in %s, can only handle one.", slCount(seq), faName);

/* Fix up agp coordinates. Make a directory for each contig.  Fill it with 
 * .fa .agp barge.NN files for that contig. */
printf("Writing contig dirs\n");
for (contig = contigList; contig != NULL; contig = contig->next)
    {
    /* Make Contig dir. */
    sprintf(contigDir, "%s/%s", finDir, contig->name);
    makeDir(contigDir);

    /* Make contig.agp file.  This loop rewrites each fragment in place:
     * chrom becomes "<skipChr(chrom)>/<contig>", chromStart/chromEnd become
     * relative to the contig start, and ix is renumbered from 1.  All later
     * loops over agpList see these modified values. */
    sprintf(path, "%s/%s.agp", contigDir, contig->name);
    f = mustOpen(path, "w");
    fragIx = 0;
    for (agp = contig->agpList; agp != NULL; agp = agp->next)
	{
	char buf[128];
	/* buf is filled from the old chrom before that string is freed. */
	sprintf(buf, "%s/%s", skipChr(agp->chrom), contig->name);
	freez(&agp->chrom);
	agp->chrom = cloneString(buf);
	agp->chromStart -= contig->startOffset;
	agp->chromEnd -= contig->startOffset;
	agp->ix = ++fragIx;
	agpFragTabOut(agp, f);
	}
    carefulClose(&f);

    /* Make ooGreedy.NN.gl file: one line per fragment with name "<frag>_1",
     * contig-relative start and end, and strand. */
    sprintf(path, "%s/%s.%s.gl", contigDir, "ooGreedy", ooVer);
    f = mustOpen(path, "w");
    for (agp = contig->agpList; agp != NULL; agp = agp->next)
        {
	if (agp->type[0] != 'N' && agp->type[0] != 'U')
	    {
	    fprintf(f, "%s_1\t%d\t%d\t%s\n",  agp->frag, 
	    	agp->chromStart, 
		agp->chromEnd,
	        agp->strand);
	    }
	}
    carefulClose(&f);

    /* Make contig.fa file.  startOffset/endOffset are still chromosome
     * coordinates, so they index directly into the loaded sequence. */
    sprintf(path, "%s/%s.fa", contigDir, contig->name);
    faWrite(path, contig->name, seq->dna + contig->startOffset, 
    	contig->endOffset - contig->startOffset);

    /* Make contig/barge file. */
    sprintf(path, "%s/barge.%s", contigDir, ooVer);
    f = mustOpen(path, "w");
    fprintf(f, "Barge (Connected Clone) File ooGreedy Version %s\n", ooVer);
    fprintf(f, "\n");
    fprintf(f, "start  accession  size overlap maxClone maxOverlap\n");
    fprintf(f, "------------------------------------------------------------\n");
    for (agp = contig->agpList; agp != NULL; agp = agp->next)
        {
	char clone[128];
	/* Clone name is the fragment name after chopSuffix(). */
	strcpy(clone, agp->frag);
	chopSuffix(clone);
	
	/* NOTE(review): the column headed "size" is filled with
	 * agp->chromEnd (contig-relative end), not end minus start --
	 * confirm this is intended. */
	fprintf(f, "%d\t%s\t%d\t100\tn/a\t0\n", agp->chromStart, 
		clone, agp->chromEnd);
	}
    carefulClose(&f);

    /* Make contig/gold file: same fragments as the .agp, written from a
     * local copy with the name changed to "<frag>_1". */
    sprintf(path, "%s/gold.%s", contigDir, ooVer);
    f = mustOpen(path, "w");
    fragIx = 0;
    for (agp = contig->agpList; agp != NULL; agp = agp->next)
        {
	char fragName[128];
	struct agpFrag frag = *agp;
	sprintf(fragName, "%s_1", agp->frag);
	frag.frag = fragName;
	/* NOTE(review): this stores the digit '0' (zero) as the type;
	 * confirm it is not meant to be the letter 'O'. */
	frag.type[0] = '0';
	agpFragTabOut(&frag, f);
	}
    carefulClose(&f);
    }

/* Create lift subdirectory. */
printf("Creating lift files\n");
sprintf(liftDir, "%s/lift", finDir);
makeDir(liftDir);

/* Create lift/oOut.lst file (just a list of contigs). */
sprintf(path, "%s/oOut.lst", liftDir);
f = mustOpen(path, "w");
for (contig = contigList; contig != NULL; contig = contig->next)
    fprintf(f, "%s/%s.fa.out\n", contig->name, contig->name);
carefulClose(&f);

/* Create lift/ordered.lst file (just a list of contigs). */
sprintf(path, "%s/ordered.lst", liftDir);
f = mustOpen(path, "w");
for (contig = contigList; contig != NULL; contig = contig->next)
    fprintf(f, "%s\n", contig->name);
carefulClose(&f);

/* Create lift/ordered.lft file.  Each line holds: contig start on the
 * chromosome, "<skipChr(chrom)>/<contig>", contig size, chrom name (the root
 * part that splitPath() extracts from faName) and the full sequence size. */
sprintf(path, "%s/ordered.lft", liftDir);
f = mustOpen(path, "w");
splitPath(faName, NULL, chrom, NULL);
for (contig = contigList; contig != NULL; contig = contig->next)
    fprintf(f, "%d\t%s/%s\t%d\t%s\t%d\n", 
	contig->startOffset, skipChr(chrom), contig->name,  
	contig->endOffset - contig->startOffset,
	chrom, seq->size);
carefulClose(&f);
}