void splitAbout(char *inName, off_t approxSize, char *outRoot)
/* Copy the records of inName into a series of output files whose paths
 * are built by mkOutPath from outRoot and a running file index.  A new
 * file is started once the sequence sizes written to the current one
 * add up to approxSize or more, so a record is never divided between
 * two files. */
{
struct lineFile *lf = lineFileOpen(inName, TRUE);
struct dnaSeq seq;
char outPath[PATH_LEN];
FILE *out = NULL;
int digits = 2;
int outCount = 0;
off_t sizeInFile = approxSize;  /* Starts "full" so the first record opens a file. */

ZeroVar(&seq);
for (;;)
    {
    if (!faMixedSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
        break;
    if (sizeInFile >= approxSize)
        {
        /* Current file (if any) has reached its quota: roll over. */
        carefulClose(&out);
        sizeInFile = 0;
        mkOutPath(outPath, outRoot, digits, outCount);
        outCount += 1;
        verbose(2, "writing %s\n", outPath);
        out = mustOpen(outPath, "w");
        }
    faWriteNext(out, seq.name, seq.dna, seq.size);
    sizeInFile += seq.size;
    }
carefulClose(&out);
lineFileClose(&lf);
}
Example #2
0
void musAliAt(char *database, char *chrom, char *humanFa, char *mouseFa)
/* musAliAt - Produce .fa files where mouse alignments hit on chr22.
 * Selects the blatMouse rows whose tName is chrom and writes the query
 * sequence of each distinct qName once to mouseFa.  humanFa is accepted
 * but not used by this function. */
{
char query[256], **row;
struct sqlResult *sr;
struct sqlConnection *conn;
struct hash *musHash = newHash(10);
FILE *musOut = mustOpen(mouseFa, "w");

hSetDb(database);
conn = hAllocConn();
sqlSafef(query, sizeof query, "select * from blatMouse where tName = '%s'", chrom);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    struct psl *psl = pslLoad(row);
    /* Names are stored with a NULL value, so test for the presence of
     * the key itself; a value lookup would report every name as unseen
     * and the same sequence would be written once per alignment. */
    if (hashLookup(musHash, psl->qName) == NULL)
        {
        struct dnaSeq *musSeq = hExtSeq(psl->qName);
        hashAdd(musHash, psl->qName, NULL);
        faWriteNext(musOut, musSeq->name, musSeq->dna, musSeq->size);
        freeDnaSeq(&musSeq);
        }
    pslFree(&psl);
    }
/* Release everything acquired above; carefulClose also reports write
 * errors on the output file. */
sqlFreeResult(&sr);
hFreeConn(&conn);
freeHash(&musHash);
carefulClose(&musOut);
}
static void writeOneByGap(boolean oneFile, char *outRoot,
    int digits, int *pieceIx, FILE *f, char noPath[256], int pos, int thisSize,
    struct dnaSeq *seq, FILE *lift, int *writeCount, char *outPath)
/* Write thisSize bases of seq starting at pos as one fa record.  The
 * record is named noPath followed by *pieceIx zero-padded to digits.
 * When oneFile is FALSE the record goes to a new file whose path is built
 * by mkOutPath from outRoot, digits and *pieceIx, and that file is closed
 * again before returning; otherwise it is written to the caller's f, and
 * outPath is used only for the progress message.  If lift is not NULL a
 * tab-separated line (pos, record name, thisSize, seq->name, seq->size)
 * is written to it.  Increments *writeCount and *pieceIx. */
{
char numOut[512];
if (!oneFile)
    {
    char fileName[512];
    mkOutPath(fileName, outRoot, digits, *pieceIx);
    f = mustOpen(fileName, "w");
    verbose(2, "writing %s\n", fileName);
    }
else
    verbose(2, "writing %s\n", outPath);
/* The name is built from a caller-supplied string and a variable-width
 * number, so format it with a bounds check instead of a bare sprintf. */
safef(numOut, sizeof(numOut), "%s%0*d", noPath, digits, *pieceIx);
verbose(3,"#\twriting piece %s, at pos %d for size %d\n", numOut,pos,thisSize);
faWriteNext(f, numOut, seq->dna + pos, thisSize);
if (lift)
    fprintf(lift, "%d\t%s\t%d\t%s\t%d\n",
        pos, numOut, thisSize, seq->name, seq->size);
*writeCount += 1;
*pieceIx += 1;
if (!oneFile)
    carefulClose(&f);
}
Example #4
0
int filterByQual(char *faFileName, FILE *f, int minQual, int minQualRun, struct hash *uniqHash)
/* Load the sequence in faFileName together with its qualities from the
 * .qual file of the same directory and base name, and write the part
 * selected by trimQa to f.  A sequence whose name is already in uniqHash
 * is reported with a warning and not written; otherwise its name is
 * added to uniqHash.  Returns the size field of the loaded sequence
 * (the untrimmed size). */
{
char dir[256], name[128], ext[64];
char qaFileName[512];
struct qaSeq *qa;
int untrimmedSize;

splitPath(faFileName, dir, name, ext);
sprintf(qaFileName, "%s%s.qual", dir, name);
qa = qaMustReadBoth(qaFileName, faFileName);
if (!hashLookup(uniqHash, qa->name))
    {
    int start, size;
    hashAdd(uniqHash, qa->name, NULL);
    if (trimQa(qa, minQual, minQualRun, &start, &size))
        faWriteNext(f, qa->name, qa->dna + start, size);
    }
else
    warn("%s duplicated, ignoring all but first occurence", qa->name);
untrimmedSize = qa->size;
qaSeqFree(&qa);
return untrimmedSize;
}
Example #5
0
void agpToFaOne(struct agpFrag **pAgpList, char *agpFile, char *agpSeq,
                char *seqDir, int lastPos, FILE *f)
/* Given one sequence's worth of AGP in pAgpList, process it into FASTA
 * and write to f.  The list is reversed in place before use and freed
 * before returning.  lastPos is the number of bases to build; the
 * function aborts when it is zero.  The sequence starts out as all 'n'
 * and is filled in by the routine selected with the simpleMulti,
 * simpleMultiMixed or simple command-line options (gsFillInSequence
 * when none of them is set). */
{
DNA *dna = NULL;

slReverse(pAgpList);
if (lastPos == 0)
    errAbort("%s not found in %s\n", agpSeq, agpFile);
dna = needHugeMem(lastPos+1);
memset(dna, 'n', lastPos);
dna[lastPos] = 0;
if (optionExists("simpleMulti"))
    {
    simpleMultiFillInSequence(0, seqDir, *pAgpList, dna, lastPos);
    }
else if (optionExists("simpleMultiMixed"))
    {
    simpleMultiFillInSequence(1, seqDir, *pAgpList, dna, lastPos);
    }
else if (optionExists("simple"))
    {
    simpleFillInSequence(seqDir, *pAgpList, dna, lastPos);
    }
else
    {
    gsFillInSequence(seqDir, *pAgpList, dna, lastPos);
    }
verbose(2,"Writing %s (%d bases)\n", agpSeq, lastPos);
faWriteNext(f, agpSeq, dna, lastPos);
agpFragFreeList(pAgpList);
/* The buffer holds a whole sequence; release it so that processing many
 * sequences in one run does not accumulate one buffer per sequence. */
freeMem(dna);
}
void splitByNamePrefix(char *inName, char *outRoot, int preFixCount)
/* Split into chunks using prefix of sequence names.  Each record of
 * inName is appended to <outDir><prefix>.fa, where outDir is the directory
 * part of outRoot and prefix is the first preFixCount characters of the
 * record name.  Files are opened in append mode, so records with the
 * same prefix end up in the same file and any existing file of that
 * name is added to rather than replaced.  preFixCount must be smaller
 * than the 512-byte prefix buffer. */
{
struct dnaSeq seq;
struct lineFile *lf = lineFileOpen(inName, TRUE);
FILE *f = NULL;
char outDir[256], outFile[128], ext[64], preFix[512];
/* Room for the longest outDir plus the longest preFix plus ".fa", so
 * the path can never outgrow its buffer. */
char outPath[sizeof(outDir) + sizeof(preFix) + 4];
ZeroVar(&seq);

splitPath(outRoot, outDir, outFile, ext);
assert(preFixCount >= 0 && (size_t)preFixCount < sizeof(preFix));

while (faMixedSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
    {
    carefulClose(&f);
    /* strncpy does not terminate when the name is at least preFixCount
     * long, hence the explicit terminator on the next line. */
    strncpy(preFix, seq.name, preFixCount);
    preFix[preFixCount] = '\0';
    snprintf(outPath, sizeof(outPath), "%s%s.fa", outDir, preFix);
    verbose(2, "writing %s\n", outPath);
    f = mustOpen(outPath, "a");
    faWriteNext(f, seq.name, seq.dna, seq.size);
    }
carefulClose(&f);
lineFileClose(&lf);
}
void splitByRecord(char *inName, int splitCount, char *outRoot, off_t estSize)
/* Split the records of inName over a series of output files whose paths
 * are built by mkOutPath from outRoot and a running file index.  A new
 * file is started for the first record, and afterwards whenever the
 * running total of sequence sizes passes the end position that
 * calcNextEnd returns for the current file.  Records are never divided
 * between files. */
{
struct dnaSeq seq;
struct lineFile *lf = lineFileOpen(inName, TRUE);
int digits = digitsBaseTen(splitCount);
off_t nextEnd = 0;
off_t curPos = 0;
int fileCount = 0;
FILE *f = NULL;
char outPath[PATH_LEN];
ZeroVar(&seq);

while (faMixedSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
    {
    curPos += seq.size;
    /* A zero-size first record does not push curPos past the initial
     * nextEnd of 0, so also open a file whenever none is open yet
     * instead of handing a NULL FILE to faWriteNext. */
    if (f == NULL || curPos > nextEnd)
        {
        carefulClose(&f);
        mkOutPath(outPath, outRoot, digits, fileCount++);
        verbose(2, "writing %s\n", outPath);
        f = mustOpen(outPath, "w");
        nextEnd = calcNextEnd(fileCount, splitCount, estSize);
        }
    faWriteNext(f, seq.name, seq.dna, seq.size);
    }
carefulClose(&f);
lineFileClose(&lf);
}
Example #8
0
void faWrite(char *fileName, char *startLine, DNA *dna, int dnaSize)
/* Create fileName and write a single fa record to it, with startLine
 * passed to faWriteNext as the record name and dnaSize bases of dna as
 * the sequence.  Aborts if the file cannot be closed cleanly. */
{
FILE *out = mustOpen(fileName, "w");
int closeStatus;

faWriteNext(out, startLine, dna, dnaSize);
closeStatus = fclose(out);
if (closeStatus != 0)
    errnoAbort("fclose failed");
}
Example #9
0
void correctOne(struct dnaSeq *est, struct psl *psl, char *nibDir, 
   struct hash *nibHash, FILE *f)
/* Write est to f under its own name, with every block aligned by psl
 * replaced by the target sequence it aligns to.  Query bases lying
 * before, between or after the aligned blocks are copied from est
 * unchanged.  For a '-' strand alignment the target sequence is reverse
 * complemented and the block coordinates are reversed to match. */
{
struct dnaSeq *geno = readCachedNib(nibHash, nibDir, psl->tName, 
        psl->tStart, psl->tEnd - psl->tStart);
struct dyString *fixed = newDyString(est->size+20);
int querySize = psl->qSize;
int targetSize = psl->tSize;
int genoStart = psl->tStart;    /* Target coordinate matching geno->dna[0]. */
int queryDone = 0;              /* End of the query region emitted so far. */
struct mrnaBlock *blocks, *blk;

/* Upper case makes the substituted target bases easy to spot. */
toUpperN(geno->dna, geno->size);
blocks = mrnaBlockFromPsl(psl);
if (psl->strand[0] == '-')
    {
    /* Move sequence, offset and blocks over to the reverse strand. */
    reverseComplement(geno->dna, geno->size);
    genoStart = targetSize - psl->tEnd;
    blk = blocks;
    while (blk != NULL)
        {
        reverseIntRange(&blk->tStart, &blk->tEnd, targetSize);
        reverseIntRange(&blk->qStart, &blk->qEnd, querySize);
        blk = blk->next;
        }
    slReverse(&blocks);
    }

/* Assemble the corrected sequence block by block. */
blk = blocks;
while (blk != NULL)
    {
    int gap = blk->qStart - queryDone;
    if (gap > 0)
        dyStringAppendN(fixed, est->dna + queryDone, gap);
    dyStringAppendN(fixed, geno->dna + blk->tStart - genoStart,
        blk->tEnd - blk->tStart);
    queryDone = blk->qEnd;
    blk = blk->next;
    }
if (queryDone != querySize)
    dyStringAppendN(fixed, est->dna + queryDone, querySize - queryDone);

faWriteNext(f, est->name, fixed->string, fixed->stringSize);

slFreeList(&blocks);
freeDyString(&fixed);
freeDnaSeq(&geno);
}
void webOutFasta(struct dnaSeq *seq, char *db)
/* Print seq to stdout as a fasta record wrapped in <pre> tags, followed
 * by the link produced by outputBlatLink for db and two line breaks. */
{
fputs("<pre>\n", stdout);
faWriteNext(stdout, seq->name, seq->dna, seq->size);
fputs("</pre>\n", stdout);
outputBlatLink("Blat Sequence on new Draft", db, seq);
fputs("<br><br>", stdout);
}
Example #11
0
static void processRnaSeq(FILE *fh, struct sqlConnection *conn, struct refSeqVerInfo *rsvi)
/* Fetch the sequence for rsvi->acc through hGenBankGetMrnaC and write it
 * to fh as a fasta record under the name the sequence carries (which
 * already includes the version).  Aborts if no sequence is returned. */
{
struct dnaSeq *rna = hGenBankGetMrnaC(conn, rsvi->acc, NULL);

if (!rna)
    errAbort("failed to get %s from database", rsvi->acc);
faWriteNext(fh, rna->name, rna->dna, rna->size);
dnaSeqFree(&rna);
}
Example #12
0
void outputOne(struct twoBitFile *tbf, char *seqSpec, FILE *f, 
        int start, int end)
/* Read the fragment of seqSpec selected by start and end from tbf and
 * write it to f as a fasta record.  The sequence is upper-cased first
 * when the noMask setting is on. */
{
struct dnaSeq *frag = twoBitReadSeqFrag(tbf, seqSpec, start, end);

if (noMask)
    {
    toUpperN(frag->dna, frag->size);
    }
faWriteNext(f, frag->name, frag->dna, frag->size);
dnaSeqFree(&frag);
}
void outputProtein(struct cdsEvidence *cds, struct dnaSeq *txSeq, FILE *f)
/* Translate txSeq to protein guided by cds, and output to file.
 * Every CDS block is translated codon by codon; a codon that lookupCodon
 * maps to 0 becomes '*', or 'U' when txSeq is listed in selenocysteineHash
 * and isReallyStopCodon says the codon is not a true stop.  A single
 * trailing '*' is dropped.  Aborts when a block size is not a multiple
 * of 3 or when a '*' remains inside the protein.  The record is written
 * under cds->name. */
{
boolean selenocysteine = FALSE;
if (selenocysteineHash != NULL)
    {
    if (hashLookup(selenocysteineHash, txSeq->name))
        selenocysteine = TRUE;
    }
struct dyString *dy = dyStringNew(4*1024);
int blockIx;
for (blockIx=0; blockIx<cds->cdsCount; ++blockIx)
    {
    DNA *dna = txSeq->dna + cds->cdsStarts[blockIx];
    int rnaSize = cds->cdsSizes[blockIx];
    if (rnaSize%3 != 0)
        {
        errAbort("size of block (%d) not multiple of 3 in %s",
            rnaSize, cds->name);
        }
    int aaSize = rnaSize/3;
    int i;
    for (i=0; i<aaSize; ++i)
        {
        AA aa = lookupCodon(dna);
        if (aa == 0) 
            {
            aa = '*';
            if (selenocysteine)
                {
                if (!isReallyStopCodon(dna, TRUE))
                    aa = 'U';
                }
            }
        dyStringAppendC(dy, aa);
        dna += 3;
        }
    }
/* Only look at the last residue when there is one: with no blocks, or
 * only empty blocks, the index would be -1 and the read would land in
 * front of the string buffer. */
if (dy->stringSize > 0)
    {
    int lastCharIx = dy->stringSize-1;
    if (dy->string[lastCharIx] == '*')
        {
        dy->string[lastCharIx] = 0;
        dy->stringSize = lastCharIx;
        }
    }
char *prematureStop = strchr(dy->string, '*');
if (prematureStop != NULL)
    {
    errAbort("Stop codons in CDS at position %d for %s", 
        (int)(prematureStop - dy->string), cds->name);
    }
faWriteNext(f, cds->name, dy->string, dy->stringSize);
dyStringFree(&dy);
}
void writePeptide(FILE *outFa, char *acc, struct dnaSeq *dna, struct genbankCds *cds)
/* Translate dna starting at cds->start and write the resulting peptide
 * to outFa as a fasta record named acc.  The base at cds->end is
 * overwritten with a NUL for the duration of the translation and put
 * back afterwards, presumably so that dnaTranslateSome stops there. */
{
char *peptide = needMem(dna->size);   /* Generously sized for the peptide. */
char savedBase = dna->dna[cds->end];

dna->dna[cds->end] = '\0';
dnaTranslateSome(dna->dna + cds->start, peptide, dna->size);
dna->dna[cds->end] = savedBase;

faWriteNext(outFa, acc, peptide, strlen(peptide));
freeMem(peptide);
}
Example #15
0
static void writeFastas(struct gff3File *g3f, FILE *fh)
/* Write the sequences held in g3f->seqs to fh as fasta records, preceded
 * by a "##FASTA" line.  Writes nothing at all when there are no
 * sequences. */
{
struct dnaSeq *cur = g3f->seqs;

if (cur == NULL)
    return;
fputs("##FASTA\n", fh);
while (cur != NULL)
    {
    faWriteNext(fh, cur->name, cur->dna, cur->size);
    cur = cur->next;
    }
}
Example #16
0
void faWriteAll(char *fileName, bioSeq *seqList)
/* Create fileName and write every sequence of seqList to it as a fasta
 * record, in list order.  Aborts if the file cannot be closed cleanly. */
{
FILE *out = mustOpen(fileName, "w");
bioSeq *cur = seqList;
int closeStatus;

while (cur != NULL)
    {
    faWriteNext(out, cur->name, cur->dna, cur->size);
    cur = cur->next;
    }
closeStatus = fclose(out);
if (closeStatus != 0)
    errnoAbort("fclose failed");
}
void writeSeg(char *seqName, struct segment *seg, FILE *gtf, FILE *sub, FILE *trans)
/* Write out the contents of one segment.
 *   gtf   - receives, through cdsOut, every feature of type Init, Intr,
 *           Term or Sngl of every gene in seg.
 *   trans - if not NULL, receives the translation of each gene that had
 *           at least one such feature, as a fasta record.
 *   sub   - if not NULL, receives one tab-separated line for each
 *           feature in seg->suboptList.
 * Genes are named <seqName>.<gene id>. */
{
struct genScanGene *gene;
struct genScanFeature *gsf;

for (gene = seg->geneList; gene != NULL; gene = gene->next)
    {
    char geneName[128];
    boolean someCds = FALSE;
    /* seqName comes from the caller and has no fixed length, so format
     * with a bounds check rather than a bare sprintf. */
    safef(geneName, sizeof(geneName), "%s.%d", seqName, gene->id);
    for (gsf = gene->featureList; gsf != NULL; gsf = gsf->next)
        {
        /* All four coding feature types are written the same way. */
        if (sameString("Init", gsf->type) || sameString("Intr", gsf->type)
            || sameString("Term", gsf->type) || sameString("Sngl", gsf->type))
            {
            cdsOut(gtf, gsf, geneName, seqName);
            someCds = TRUE;
            }
        }
    /* someCds can only be set when the gene has features, so no separate
     * test of the feature list is needed. */
    if (trans != NULL && someCds)
        faWriteNext(trans, geneName, gene->translation, strlen(gene->translation));
    }

if (sub != NULL)
    {
    for (gsf = seg->suboptList; gsf != NULL; gsf = gsf->next)
        {
        fprintf(sub, "%s\t%d\t%d\t%s.%d\t%d\t%c\n",
            seqName, gsf->start, gsf->end, seqName, gsf->featId,
            round(1000*gsf->p), gsf->strand);
        }
    }
}
void createFastaFilesForBits(char *root, struct genomeBit *gbList, boolean addDummy)
/* Load all of the fasta records for the bits in the genome list into one
 * fasta file. Uses .nib files as they are much more compact and allow
 * random access.
 *   root     - passed to nibFileFromChrom to locate each chromosome's nib.
 *   gbList   - bits to extract; must not be empty.  Each record is named
 *              chrom:chromStart-chromEnd.
 *   addDummy - when set, a final ten-base all-n record named "garbage" is
 *              appended.
 * The output file name comes from fileNameFromGenomeBit, using the
 * file-level outputRoot rather than root. */
{
struct genomeBit *gb = NULL;
FILE *faOut = NULL;
char *faFile = NULL;
assert(gbList);
faFile = fileNameFromGenomeBit(outputRoot, ".fa", gbList);
faOut = mustOpen(faFile, "w");
for(gb = gbList; gb != NULL; gb = gb->next)
    {
    char buff[256];
    char *nibFile = NULL;
    struct dnaSeq *seq = NULL;
    snprintf(buff, sizeof(buff), "%s:%u-%u", gb->chrom, gb->chromStart, gb->chromEnd);
    nibFile = nibFileFromChrom(root, gb->chrom);
    seq = nibLoadPartMasked(NIB_MASK_MIXED, nibFile, gb->chromStart, gb->chromEnd-gb->chromStart);
    faWriteNext(faOut, buff, seq->dna, seq->size);
    dnaSeqFree(&seq);
    freez(&nibFile);
    }
/* Add a dummy fasta record so that avid will order and orient things for us.. */
if(addDummy)
    faWriteNext(faOut, "garbage", "nnnnnnnnnn", 10);
carefulClose(&faOut);
/* No repeat-mask file is written here: n's in the sequence are used as
 * repeat masking instead. */
freez(&faFile);
}
void randomEst(char *database, int count, char *output)
/* randomEst - Select random ESTs from database.
 * Collects the accessions of all mrna rows with type "EST" and direction
 * "3", then draws count of them at random (with rand()) and writes the
 * sequence of each accession drawn to output as a fasta record.  An
 * accession drawn more than once is written only the first time, so the
 * file may hold fewer than count records.  Aborts when the database has
 * no matching ESTs. */
{
struct sqlConnection *conn = sqlConnect(database);
struct sqlResult *sr;
char **row;
int i, elIx, okCount = 0;
struct slName *list = NULL, *el;
FILE *f = NULL;
char **array = NULL;
struct dnaSeq *seq;
struct hash *uniqHash = newHash(0);

hSetDb(database);
printf("Scanning database\n");
sr = sqlGetResult(conn, "select acc,type,direction from mrna");
while ((row = sqlNextRow(sr)) != NULL)
    {
    if (sameString(row[1], "EST") && sameString(row[2], "3"))
        {
        el = newSlName(row[0]);
        slAddHead(&list, el);
        ++okCount;
        }
    }
sqlFreeResult(&sr);
printf("Got %d 3' ESTs\n", okCount);
/* With nothing to choose from, rand()%okCount below would divide by
 * zero; stop with a clear message instead. */
if (okCount == 0)
    errAbort("No 3' ESTs found in %s", database);
AllocArray(array, okCount);
for (i=0, el = list; el != NULL; el = el->next, ++i)
    array[i] = el->name;

printf("Selecting %d to put into %s\n", count, output);
f = mustOpen(output, "w");
for (i=0; i<count; ++i)
    {
    char *name;
    elIx = rand()%okCount;
    name = array[elIx];
    if (!hashLookup(uniqHash, name))
        {
        hashAdd(uniqHash, name, NULL);
        seq = hRnaSeq(name);
        faWriteNext(f, seq->name, seq->dna, seq->size);
        freeDnaSeq(&seq);
        }
    }
/* Close the output (reporting any write error) and release what was
 * allocated above.  The array only borrows the names owned by list. */
carefulClose(&f);
freeMem(array);
slFreeList(&list);
freeHash(&uniqHash);
sqlDisconnect(&conn);
}
Example #20
0
void chromFeatureSeq(struct sqlConnection *conn, 
        char *database, char *chrom, char *trackSpec,
        FILE *bedFile, FILE *faFile,
        int *retItemCount, int *retBaseCount)
/* Write the features of trackSpec on one chromosome without merging
 * them, so that each feature keeps its own name.  For every feature a
 * bed line goes to bedFile (with score 0 and strand columns when the
 * strand is known) and the feature's sequence goes to faFile, reverse
 * complemented for '-' strand features; either file may be NULL to skip
 * that output.  Aborts for a '!' spec and for file (dotted) tracks.
 * Note that retItemCount and retBaseCount are not written here. */
{
struct featureBits *fbList = NULL, *fb;
char table[HDB_MAX_TABLE_STRING];
char track[512];
boolean hasBin;

if (trackSpec[0] == '!')
   errAbort("Sorry, '!' not available with fa output unless you use faMerge");
isolateTrackPartOfSpec(trackSpec, track);
if (strchr(track, '.') != NULL)
    errAbort("Sorry, only database (not file) tracks allowed with "
             "fa output unless you use faMerge");
/* Called with its return value (whether the table is split) ignored. */
(void) hFindSplitTable(database, chrom, track, table, &hasBin);
fbList = fbGetRangeQuery(database, trackSpec, chrom, 0, hChromSize(database, chrom),
                         where, TRUE, TRUE);
for (fb = fbList; fb != NULL; fb = fb->next)
    {
    struct dnaSeq *seq;

    if (bedFile != NULL)
        {
        fprintf(bedFile, "%s\t%d\t%d\t%s", 
            fb->chrom, fb->start, fb->end, fb->name);
        if (fb->strand != '?')
            fprintf(bedFile, "\t0\t%c", fb->strand);
        fprintf(bedFile, "\n");
        }
    if (faFile == NULL)
        continue;
    seq = hDnaFromSeq(database, chrom, fb->start, fb->end, dnaLower);
    if (fb->strand == '-')
        reverseComplement(seq->dna, seq->size);
    faWriteNext(faFile, fb->name, seq->dna, seq->size);
    freeDnaSeq(&seq);
    }
featureBitsFreeList(&fbList);
}
Example #21
0
static void processProtSeq(FILE *fh, struct sqlConnection *conn, struct refSeqVerInfo *rsvi, struct hash *doneProts)
/* Look up in refLink the protein accession linked to the mRNA rsvi->acc
 * and write that protein to fh as a fasta record.  A protein already
 * recorded in doneProts is not written again; one written here is added
 * to doneProts.  Nothing is written when the protein accession is
 * empty.  Aborts if the protein sequence cannot be fetched. */
{
char query[128];
char *protAcc;

sqlSafef(query, sizeof(query), "SELECT protAcc FROM refLink WHERE mrnaAcc = \"%s\"", rsvi->acc);
protAcc = sqlNeedQuickString(conn, query);
if (isNotEmpty(protAcc))
    {
    if (hashLookup(doneProts, protAcc) == NULL)
        {
        struct dnaSeq *pep = hGenBankGetPepC(conn, protAcc, NULL);
        if (!pep)
            errAbort("failed to get %s from database", protAcc);
        faWriteNext(fh, pep->name, pep->dna, pep->size);
        dnaSeqFree(&pep);
        hashAdd(doneProts, protAcc, NULL);
        }
    }
freeMem(protAcc);
}
Example #22
0
void printExons(struct genePred *gene, struct dnaSeq *seq, FILE *f)
/* Join the exons of gene into one sequence and write it to f as a fasta
 * record named gene->name.  Exon positions are taken relative to
 * gene->txStart when indexing into seq->dna, so seq is assumed to start
 * at txStart (not checked here). */
{
struct dnaSeq *joined;
int totalSize = 0;
int filled = 0;
int ix;

verbose(3, "exonCount = %d\n", gene->exonCount);

/* First pass: add up the exon sizes to know how much to allocate. */
for (ix = 0; ix < gene->exonCount; ++ix)
    {
    int exonSize = (gene->exonEnds[ix] - gene->txStart)
                 - (gene->exonStarts[ix] - gene->txStart);
    assert (exonSize > 0);
    totalSize += exonSize;
    }

AllocVar(joined);
joined->dna = needLargeMem(totalSize+1);
joined->size = totalSize;

/* Second pass: copy the exons one after another. */
for (ix = 0; ix < gene->exonCount; ++ix)
    {
    int relStart = gene->exonStarts[ix] - gene->txStart;
    int relEnd = gene->exonEnds[ix] - gene->txStart;
    int exonSize = relEnd - relStart;
    verbose(4, "size = %d\n", exonSize);
    memcpy(joined->dna + filled, seq->dna + relStart, exonSize);
    filled += exonSize;
    }

assert(filled == joined->size);
joined->dna[filled] = 0;
faWriteNext(f, gene->name, joined->dna, joined->size);
freeDnaSeq(&joined);
}
Example #23
0
void chopFaLines(char *inName, char *outName)
/* chopFaLines - Read in FA file with long lines and rewrite it with shorter lines.
 * Every record of inName is read whole and written to outName through
 * faWriteNext, under its original comment line with the first character
 * and surrounding white space removed. */
{
FILE *in = mustOpen(inName, "r");
FILE *out = mustOpen(outName, "w");
char *commentLine;
struct dnaSeq *seq;

while (faReadNext(in, NULL, TRUE, &commentLine, &seq))
    {
    /* The first character of the comment line (presumably the '>') is
     * skipped because faWriteNext is handed the bare record name. */
    commentLine = trimSpaces(commentLine+1);
    faWriteNext(out, commentLine, seq->dna, seq->size);
    freeDnaSeq(&seq);
    }
carefulClose(&in);
carefulClose(&out);
}
Example #24
0
void polyTrimSeq(struct dnaSeq *seq, FILE *fh)
/* Trim seq in place according to the trimPolyA and trimPolyT settings
 * and write the result to fh as a fasta record.  With trimPolyA the
 * number of bases reported by maskTailPolyA is cut off the end; with
 * trimPolyT the number reported by maskHeadPolyT is cut off the start.
 * On return seq->size is the trimmed size and seq->dna still points at
 * the start of the same buffer, now holding the trimmed sequence. */
{
if (trimPolyA)
    {
    int sz = maskTailPolyA(seq->dna, seq->size);
    seq->size -= sz;
    seq->dna[seq->size] = '\0';
    }
if (trimPolyT)
    {
    int sz = maskHeadPolyT(seq->dna, seq->size);
    if (sz > 0)
        {
        /* Slide the remaining bases to the front rather than advancing
         * seq->dna: the pointer has to keep addressing the start of
         * its buffer so that the owner of seq can still free it. */
        seq->size -= sz;
        memmove(seq->dna, seq->dna + sz, seq->size);
        seq->dna[seq->size] = '\0';
        }
    }

faWriteNext(fh, seq->name, seq->dna, seq->size);
}
Example #25
0
int main(int argc, char *argv[])
/* Process command line.  argv[1] is the output file and every following
 * argument an input fa file.  Each input sequence is written to the
 * output in pieces of at most 100000 bases, each piece named
 * <sequence name>.<offset of the piece>.  Progress is printed to stdout:
 * one dot for every piece written. */
{
    char *inName, *outName, **inNames;
    FILE *in, *out;
    int i, inCount;
    DNA *dna;
    int inSize, outSize;
    int dnaOff;
    char *seqName;
    struct dyString *subSeqName = newDyString(512);
    int maxSize = 100000;

    if (argc < 3)
        usage();
    outName = argv[1];
    inNames = &argv[2];
    inCount = argc-2;
    out = mustOpen(outName, "w");
    for (i=0; i<inCount; ++i)
    {
        inName = inNames[i];
        printf("processing %s", inName);
        in = mustOpen(inName, "r");
        while (faFastReadNext(in, &dna, &inSize, &seqName))
        {
            /* outSize is set inside the body before the loop increment
             * uses it to advance to the next piece. */
            for (dnaOff = 0; dnaOff < inSize; dnaOff += outSize)
            {
                printf(".");
                fflush(stdout);
                outSize = inSize - dnaOff;
                if (outSize > maxSize) outSize = maxSize;
                dyStringClear(subSeqName);
                dyStringPrintf(subSeqName, "%s.%d", seqName, dnaOff);
                faWriteNext(out, subSeqName->string, dna+dnaOff, outSize);
            }
        }
        fclose(in);
        printf("\n");
    }
    /* Close the output explicitly so that a failed write is reported
     * instead of being lost at process exit. */
    carefulClose(&out);
    freeDyString(&subSeqName);
    return 0;
}
Example #26
0
void scrambleFa(char *inName, char *outName)
/* scrambleFa - scramble the order of records in an fa file.
 * Reads all of inName into memory, then repeatedly picks one of the
 * remaining records with rand(), writes it to outName and removes it
 * from the list until none are left. */
{
struct dnaSeq *seqList, *seq;
int seqCount;
int seqIx;
FILE *out;

seqList = faReadAllDna(inName);
out = mustOpen(outName, "w");
seqCount = slCount(seqList);
while (seqCount > 0)
    {
    seqIx = rand()%seqCount;
    seq = slElementFromIx(seqList, seqIx);
    faWriteNext(out, seq->name, seq->dna, seq->size);
    slRemoveEl(&seqList, seq);
    /* Once off the list nothing else refers to the record, so free it
     * here. */
    freeDnaSeq(&seq);
    --seqCount;
    }
/* carefulClose reports a failed close, which a bare fclose would hide. */
carefulClose(&out);
}
void getAccMrna(char *acc, struct sqlConnection *conn, FILE *outFa)
/* Fetch the sequence for acc and write it to outFa.  An accession with
 * no sequence is reported on stderr and skipped.  The CDS is looked up
 * when the cdsUpper or peptides setting is on; with cdsUpper the CDS is
 * upper-cased in the sequence.  The record is written only when the CDS
 * lookup succeeded (or was not needed) or cdsUpperAll is on: as a
 * peptide when peptides is on, otherwise as the mRNA itself.  With
 * inclVer the record name is acc followed by '.' and its version. */
{
char versionedAcc[512];
struct genbankCds cds;
struct dnaSeq *mrna;
boolean haveCds = TRUE;
HGID seqId;
char *faText = hGetSeqAndId(conn, acc, &seqId);

if (faText == NULL)
    {
    fprintf(stderr, "%s\tsequence not in database\n", acc);
    return;
    }
mrna = faFromMemText(faText);

if (cdsUpper || peptides)
    haveCds = getCds(conn, acc, mrna->size, !cdsUpperAll, &cds);
if (haveCds && cdsUpper)
    upperCaseCds(mrna, &cds);

if (haveCds || cdsUpperAll)
    {
    if (inclVer)
        {
        int version = getVersion(conn, acc);
        safef(versionedAcc, sizeof(versionedAcc), "%s.%d", acc, version);
        acc = versionedAcc;
        }
    if (peptides)
        writePeptide(outFa, acc, mrna, &cds);
    else
        faWriteNext(outFa, acc, mrna->dna, mrna->size);
    }

dnaSeqFree(&mrna);
}
Example #28
0
static void processSeqsFromBed(struct twoBitFile *tbf, char *bedFileName, FILE *outFile)
/* Get sequences defined by beds.  Exclude introns.
 * Each bed of bedFileName is turned into a sequence by twoBitAndBedToSeq
 * and written to outFile.  With clBedPos the record is named
 * chrom:chromStart-chromEnd, otherwise it keeps the name of the
 * sequence.  With noMask the sequence is upper-cased first. */
{
struct bed *bed, *bedList = bedLoadAll(bedFileName);
for (bed = bedList; bed != NULL; bed = bed->next)
    {
    struct dnaSeq *seq = twoBitAndBedToSeq(tbf, bed);
    /* Declared at loop scope: seqName may point into this buffer and is
     * used further down, after the 'if' that fills it has ended. */
    char posName[1024];
    char *seqName = NULL;
    if (clBedPos) 
        {
        safef(posName, sizeof(posName), "%s:%d-%d", bed->chrom, bed->chromStart, bed->chromEnd);
        seqName = posName;
        }
    else
        seqName = seq->name;
    if (noMask)
        toUpperN(seq->dna, seq->size);
    faWriteNext(outFile, seqName, seq->dna, seq->size);
    dnaSeqFree(&seq);
    }
}
Example #29
0
void correctEst(char *oldFa, char *pslFile, char *nibDir, char *outFa)
/* correctEst - Correct ESTs by passing them through genome.
 * Every record of oldFa whose name has an alignment in pslFile is written
 * to outFa in corrected form by correctOne; records without an alignment
 * are copied to outFa unchanged. */
{
struct hash *pslHash = hashPsls(pslFile);
struct lineFile *lf = lineFileOpen(oldFa, FALSE);
FILE *f = mustOpen(outFa, "w");
static struct dnaSeq est;
struct psl *psl;
struct hash *nibHash = newHash(8);

while (faSpeedReadNext(lf, &est.dna, &est.size, &est.name))
    {
    if ((psl = hashFindVal(pslHash, est.name)) != NULL)
        {
        correctOne(&est, psl, nibDir, nibHash, f);
        }
    else
        {
        faWriteNext(f, est.name, est.dna, est.size);
        }
    }
/* Close the output explicitly so that a failed write is reported, and
 * release the input file as well. */
carefulClose(&f);
lineFileClose(&lf);
}
Example #30
0
void seqFromPsl(char *inPsl, char *inTwoBit, char *outFa)
/* seqFromPsl - Extract masked sequence from database corresponding to psl file.
 * For every alignment in inPsl the target range tStart..tEnd of tName is
 * read from inTwoBit, reverse complemented for '-' strand alignments,
 * and written to outFa under the header "qName (tName:tStart+1-tEnd)".
 * With the hardMask setting lower-case bases are turned into n first. */
{
struct twoBitFile *tbf = twoBitOpen(inTwoBit);
struct lineFile *lf = pslFileOpen(inPsl);
FILE *f = mustOpen(outFa, "w");
struct psl *psl;

while ((psl = pslNext(lf)) != NULL)
    {
    char faHead[512];
    struct dnaSeq *seq = twoBitReadSeqFrag(tbf, psl->tName,
        psl->tStart, psl->tEnd);
    if (psl->strand[0] == '-')
        reverseComplement(seq->dna, seq->size);
    safef(faHead, sizeof(faHead), "%s (%s:%d-%d)", 
        psl->qName, psl->tName, psl->tStart+1, psl->tEnd);
    if (hardMask)
        lowerToN(seq->dna, seq->size);
    faWriteNext(f, faHead, seq->dna, seq->size);
    /* Both the fragment and the alignment are allocated afresh for each
     * row; free them here so memory use does not grow with the size of
     * the psl file. */
    dnaSeqFree(&seq);
    pslFree(&psl);
    }
carefulClose(&f);
lineFileClose(&lf);
twoBitClose(&tbf);
}