void faCat(char *inFile, char *outFile, char *liftFile)
/* faCat - Concatenate the records of inFile into numbered fasta files.
 *
 * Every record read from inFile is appended to the current output file
 * "<outFile>.<N>.fa" followed by a run of gapSize 'N' characters.  For each
 * record one lift line is written to a temporary file; fixNewLength() is then
 * called with that temporary file, the lift file name "<liftFile>.<N>.lft"
 * and the final offset (presumably to fill in the total size of the new
 * sequence - see fixNewLength).  Once the running offset exceeds
 * maxOutputSize the current pair of files is finished and pair N+1 is
 * started.
 *
 * name, gapSize and maxOutputSize are not parameters; they are defined
 * outside this function. */
{
    char *tempFile = rTempName("/tmp", "lift", ".lft");
    struct lineFile *inLf = lineFileOpen(inFile, TRUE);
    FILE *outFh = NULL;
    FILE *tempFh = mustOpen(tempFile, "w");
    DNA *seq;
    int seqSize;
    char *seqHeader;
    long int offset = 0;
    char *gap = NULL;
    int i, fileIndex = 1;
    char nameNew[512];
    char outFileName[512];
    char liftFileName[512];
    char fastaHeader[512];

    /* Names for the first output/lift pair. */
    safef(nameNew,sizeof(nameNew), "%s.%d",name, fileIndex);
    safef(fastaHeader,sizeof(fastaHeader),">%s\n",nameNew);
    safef(outFileName, sizeof(outFileName), "%s.%d.fa",outFile, fileIndex);
    safef(liftFileName, sizeof(liftFileName), "%s.%d.lft",liftFile, fileIndex++);
    outFh = mustOpen(outFileName, "w");

    /* Build the spacer of gapSize N's that is written after every record. */
    gap = needMem(gapSize+1);
    for (i = 0 ; i < gapSize ; i++)
    {
        gap[i] = 'N';
    }
    gap[i] = '\0';

    mustWrite(outFh, fastaHeader, strlen(fastaHeader));
    while (faMixedSpeedReadNext(inLf, &seq, &seqSize, &seqHeader))
    {
        /* Lift line as written here: offset, combined-sequence name, 0,
         * record header, record size. */
        fprintf(tempFh, "%ld\t%s\t%d\t%s\t%d\n",offset, nameNew, 0, seqHeader,  seqSize);
        offset += (seqSize + gapSize);
        writeSeqWithBreaks(outFh, seq, seqSize, 50);
        writeSeqWithBreaks(outFh, gap, gapSize, 50);
        if (offset > maxOutputSize)
        {
            /* Current file is full: finish its lift file and start the
             * next numbered pair with a fresh temporary lift file. */
            carefulClose(&tempFh);
            carefulClose(&outFh);
            fixNewLength(tempFile, liftFileName, offset);
            tempFh = mustOpen(tempFile, "w");
            safef(nameNew,sizeof(nameNew), "%s.%d",name, fileIndex);
            safef(fastaHeader,sizeof(fastaHeader),">%s\n",nameNew);
            safef(liftFileName, sizeof(liftFileName), "%s.%d.lft",liftFile, fileIndex);
            safef(outFileName, sizeof(outFileName), "%s.%d.fa",outFile, fileIndex++);
            outFh = mustOpen(outFileName, "w");
            mustWrite(outFh, fastaHeader, strlen(fastaHeader));
            offset = 0;
        }
    }
    /* NOTE(review): if the last record triggered a roll-over above, this
     * finishes a pair that contains only the fasta header and offset 0. */
    carefulClose(&tempFh);
    fixNewLength(tempFile, liftFileName, offset);
    lineFileClose(&inLf);
    carefulClose(&outFh);
    unlink(tempFile);
    freeMem(gap);   /* the spacer was previously leaked */
}
static void makeDirFasta(char *regionsFile, char *hg18FastaFile, char *dir, int num) {
	FILE *fp, *sq;
	char buf[500], dirName[500], seqName[500], chr1[500], chr2[500];
	int b1, e1, b2, e2, i, len;
	char ori1, ori2;
	struct hash *seqHash = NULL;
	struct dnaSeq *seq1, *seq2;
	struct stat st;
	DNA *s1, *s2;

	seqHash = faReadAllIntoHash(hg18FastaFile, dnaUpper);
	if (stat(dir, &st) != 0)
		do_cmd("mkdir %s", dir);

	fp = mustOpen(regionsFile, "r");
	i = 0;
	while (fgets(buf, 500, fp)) {
		if (sscanf(buf, "%[^:]:%d-%d %[^:]:%d-%d [%c %c]", chr1, &b1, &e1, chr2, &b2, &e2, &ori1, &ori2) != 8)
			errAbort("error: %s", buf);
		++i;
		if (i != num) 
			continue;
		sprintf(dirName, "%s/R%d", dir, i);
		if (stat(dirName, &st) != 0)
			do_cmd("mkdir %s", dir);
		sprintf(seqName, "%s/ref.fa", dirName);
		sq = mustOpen(seqName, "w");
		fprintf(sq, ">%s:%d-%d+%s:%d-%d[%c%c]\n", chr1, b1, e1, chr2, b2, e2, ori1, ori2);
		seq1 = (struct dnaSeq *)hashFindVal(seqHash, chr1);
		assert(e1 <= seq1->size);
		len = e1 - b1 + 1;
		if (ori1 == '-') {
			s1 = cloneStringZExt(seq1->dna + b1 - 1, len, len+1);
			reverseComplement(s1, len);
			writeSeqWithBreaks(sq, s1, len, 80);
			freeMem(s1);
		}
		else
			writeSeqWithBreaks(sq, seq1->dna + b1 - 1, e1 - b1 + 1, 80);
		seq2 = (struct dnaSeq *)hashFindVal(seqHash, chr2);
		assert(e2 <= seq2->size);
		len = e2 - b2 + 1;
		if (ori2 == '-') {
			s2 = cloneStringZExt(seq2->dna + b2 - 1, len, len+1);
			reverseComplement(s2, len);
			writeSeqWithBreaks(sq, s2, len, 80);
			freeMem(s2);
		}
		else
			writeSeqWithBreaks(sq, seq2->dna + b2 - 1, e2 - b2 + 1, 80);
		fclose(sq);
	}
	fclose(fp);
	//FIXME: free space
} 
/* Example no. 3 */
/* 0 */
static void getSeqFromBlob(struct sqlConnection *conn,
        struct subjInfo *siList, char *tableName, char *xrefField)
/* Get sequence from blob field in table and print it as fasta.
 * For every subject in siList, selects the (id, seq) rows of tableName that
 * gisaidXref links to the subject through column xrefField, and prints each
 * as ">id:subjId" followed by the sequence at 60 letters per line.  The
 * output is wrapped in <TT><PRE>; a short message is printed when no row was
 * found for any subject. */
{
struct sqlResult *sr;
char **row;
char query[256];
struct subjInfo *si;
int seqCnt = 0;
hPrintf("<TT><PRE>");
for (si = siList; si != NULL; si = si->next)
    {
    char *subjId = si->fields[1];
    /* currently just 3 Thailand or 4 US */
    sqlSafef(query, sizeof(query),
        "select id, seq from %s s, gisaidXref g where g.subjId='%s' and g.%s=s.id", 
	tableName, subjId, xrefField);
    sr = sqlGetResult(conn, query);
    while ((row = sqlNextRow(sr)) != NULL)
        {
        char *id = row[0];
        char *seq = row[1];
        hPrintf(">%s", id);
        hPrintf(":%s\n", subjId);
        /* A NULL seq column would crash strlen(); print the header only. */
        if (seq != NULL)
            writeSeqWithBreaks(stdout, seq, strlen(seq), 60);
        hPrintf("\n");
	seqCnt++;
        }
    sqlFreeResult(&sr);
    }
if (seqCnt == 0) hPrintf("No sequence data available for subject(s) selected.");
/* Close in reverse order of opening so the tags nest properly. */
hPrintf("</PRE></TT>");
}
int main(int argc, char *argv[])
/* Generate the skinny sequence for one chromosome and save it, in fasta
 * format with 50 letters per line, as "<chromName>.skinny".
 * Expects exactly four arguments: database, chromosome name, an argument
 * handed on to getSkinnySeq(), and the name of the SNP table (which must
 * exist). */
{
    struct dnaSeq *seq = NULL;
    char outName[64];
    FILE *out;

    if (argc != 5)
        usage();

    /* Record the command line settings in the shared variables. */
    database = argv[1];
    hSetDb(database);
    chromName = argv[2];
    snpTable = argv[4];

    if (!hTableExists(snpTable))
        errAbort("no %s table\n", snpTable);

    /* Build the sequence and drop every '-' from it. */
    seq = getSkinnySeq(argv[3], chromName);
    stripChar(seq->dna, '-');

    /* Write one fasta record; the length is taken after stripping. */
    safef(outName, ArraySize(outName), "%s.skinny", chromName);
    out = mustOpen(outName, "w");
    fprintf(out, ">%s\n", chromName);
    writeSeqWithBreaks(out, seq->dna, strlen(seq->dna), 50);
    carefulClose(&out);

    return 0;
}
static void showMrnaFromGenePred(struct sqlConnection *conn, 
	char *geneId, char *geneName)
/* Get mRNA sequence for gene from gene prediction.
 * Looks up geneId at the current gene position (curGeneChrom, curGeneStart,
 * curGeneEnd) in the table named by the "knownGene" genome setting, converts
 * the prediction to a bed, fetches its sequence and prints it as one fasta
 * record (50 letters per line) inside <TT><PRE>.  Aborts when the gene is
 * not found at that position. */
{
char *table = genomeSetting("knownGene");
struct sqlResult *sr;
char **row;
char query[256];
boolean hasBin = hIsBinned(sqlGetDatabase(conn), table);

hPrintf("<TT><PRE>");
/* Build the query with sqlSafef, as the other query builders in this file
 * do, rather than pasting geneId into the SQL text with plain safef. */
sqlSafef(query, sizeof(query), 
    "select * from %s where name='%s'"
    " and chrom='%s' and txStart=%d and txEnd=%d", 
    table, geneId, curGeneChrom, curGeneStart, curGeneEnd);
sr = sqlGetResult(conn, query);
if ((row = sqlNextRow(sr)) != NULL)
    {
    /* Skip the leading bin column when the table has one. */
    struct genePred *gene = genePredLoad(row+hasBin);
    struct bed *bed = bedFromGenePred(gene);
    struct dnaSeq *seq = hSeqForBed(sqlGetDatabase(conn), bed);
    hPrintf(">%s (%s predicted mRNA)\n", geneId, geneName);
    writeSeqWithBreaks(stdout, seq->dna, seq->size, 50);
    dnaSeqFree(&seq);
    bedFree(&bed);
    genePredFree(&gene);
    }
else
    errAbort("Couldn't find %s at %s:%d-%d", geneId, 
    	curGeneChrom, curGeneStart, curGeneEnd);
sqlFreeResult(&sr);
/* Close in reverse order of opening so the tags nest properly. */
hPrintf("</PRE></TT>");
}
/* Example no. 6 */
/* 0 */
void faWriteNext(FILE *f, char *startLine, DNA *dna, int dnaSize)
/* Append one record to an open fa file: a ">startLine" header line (left
 * out when startLine is NULL) and then the sequence broken into lines of
 * 50 letters.  A record of size zero produces no output at all. */
{
if (dnaSize != 0)
    {
    if (startLine != NULL)
        fprintf(f, ">%s\n", startLine);
    writeSeqWithBreaks(f, dna, dnaSize, 50);
    }
}
void showSeqFromTable(struct sqlConnection *conn, char *geneId,
	char *geneName, char *table)
/* Show some sequence from given table.
 * Prints the seq column of the first row of table whose name equals geneId
 * as a fasta record (header with geneId, geneName and length; 60 letters per
 * line) inside <TT><PRE>.  Nothing but the tags is printed when no row
 * matches; a NULL seq is reported as length 0 with no sequence lines. */
{
char query[512];
struct sqlResult *sr;
char **row;
hPrintf("<TT><PRE>");

/* Build the query with sqlSafef, as the other query builders in this file
 * do, rather than pasting geneId into the SQL text with plain safef. */
sqlSafef(query, sizeof(query), 
    "select seq from %s where name = '%s'", table, geneId);
sr = sqlGetResult(conn, query);
if ((row = sqlNextRow(sr)) != NULL)
    {
    char *seq = row[0];
    int seqLen = (seq != NULL) ? (int)strlen(seq) : 0;
    hPrintf(">%s (%s) length=%d\n", geneId, geneName, seqLen);
    /* The header already tolerated a NULL seq; the write must as well. */
    if (seq != NULL)
        writeSeqWithBreaks(stdout, seq, seqLen, 60);
    }
sqlFreeResult(&sr);
hPrintf("</PRE></TT>");
}
/* Example no. 8 */
/* 0 */
/* File: seqOut.c   Project: bowhan/kent */
void doGenePredNongenomic(struct sqlConnection *conn, int typeIx)
/* Output the mRNA or protein sequences that belong to the selected genes.
 * RefGene tracks are passed to the dedicated RefGene routines (protein when
 * typeIx is 1, mRNA otherwise).  For any other track, word number typeIx of
 * the track's type line names a table with name and seq columns; every row
 * whose name belongs to a selected gene is printed as fasta. */
{
/* The whole genome is handled in one pass instead of chromosome by
 * chromosome; the gene prediction tracks served here are small enough
 * for that. */
char *typeWords[3];
int typeWordCount;
int fieldCount;
char *table;
struct lm *lm = lmInit(64*1024);
struct bed *bedList = cookedBedsOnRegions(conn, curTable, getRegions(),
	lm, &fieldCount);
struct bed *bed;

textOpen();

if (!isRefGeneTrack(curTable))
    {
    /* Generic track: find the sequence table in the type line. */
    char *dupType = cloneString(findTypeForTable(database, curTrack, curTable, ctLookupName));
    typeWordCount = chopLine(dupType, typeWords);
    if (typeWordCount <= typeIx)
	internalErr();
    table = typeWords[typeIx];
    if (!sqlTableExists(conn, table))
	{
	internalErr();
	}
    else
	{
	char query[256];
	char **row;
	struct sqlResult *sr;
	boolean gotResults = FALSE;
	struct hash *wanted = newHash(18);

	/* Remember the name of every item that passed the filters. */
	for (bed = bedList; bed != NULL; bed = bed->next)
	    hashAdd(wanted, bed->name, NULL);

	/* Walk the full sequence table and print the remembered ones. */
	sqlSafef(query, sizeof(query), "select name, seq from %s", table);
	sr = sqlGetResult(conn, query);
	while ((row = sqlNextRow(sr)) != NULL)
	    {
	    if (hashLookup(wanted, row[0]) == NULL)
		continue;
	    hPrintf(">%s\n", row[0]);
	    writeSeqWithBreaks(stdout, row[1], strlen(row[1]), 60);
	    gotResults = TRUE;
	    }
	sqlFreeResult(&sr);
	hashFree(&wanted);
	if (!gotResults)
	    hPrintf(NO_RESULTS);
	}
    freez(&dupType);
    }
else if (typeIx == 1)
    {
    /* Protein */
    doRefGeneProteinSequence(conn, bedList);
    }
else
    {
    doRefGeneMrnaSequence(conn, bedList);
    }
lmCleanup(&lm);
}
/* Example no. 9 */
/* 0 */
void faTrimRead(char *inFile, char *qualFile, char *outFile, char *liftFile)
/* faTrimRead - trim reads based on qual scores.
 *
 * For every record of inFile one quality record is read from qualFile.  The
 * read is upper-cased, then scanned from the front and from the back: bases
 * outside the first/last window accepted by checkWindow(), and bases inside
 * whose quality is below minScore, are masked - lower-cased when the lower
 * option is set, replaced by 'N' otherwise.  The record is written to
 * outFile (with qualities when showQual is set, restricted to the clipped
 * range when clip is set) and a "name clipStart clipEnd size" line is
 * written to liftFile.
 *
 * window, minScore, lower, clip, showQual and lineSize are not parameters;
 * they are defined outside this function. */
{
struct lineFile *lf = lineFileOpen(inFile, TRUE);
struct dnaSeq seq;
FILE *qf = mustOpen(qualFile, "r");
FILE *f = mustOpen(outFile, "w");
FILE *lift = mustOpen(liftFile, "w");
int seqCount = 0;
ZeroVar(&seq);

fprintf(lift,"## name \tclipStart\tclipEnd\tSize\n");
while (faSomeSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name, FALSE))
    {
    int i, j = 0;
    int mode = START;
    struct qual qual;
    int clipStart = 0, clipEnd = seq.size;
    seqCount += 1;
    qual.size = seq.size;
    qual.end = 0;
    assert(seq.size < MAXREADSIZE);
    qual.array = needMem((qual.size+1)*sizeof(int));
    if (qualReadAll(qf, TRUE, "name", FALSE, NULL, &qual))
        {
        for (i = 0 ; i<seq.size ; i++)
            seq.dna[i] = toupper(seq.dna[i]);
        /* Forward pass: find the first acceptable window, mask everything
         * before it, then mask individual low-quality bases after it.
         * NOTE(review): clipStart is overwritten by every checkWindow() call
         * while still in START mode, so it ends up negative when no window
         * is ever accepted; the same applies to clipEnd in the backward
         * pass.  Both are later used as offsets when clip is set - confirm. */
        for (i = 0 ; i<seq.size ; i++)
            {
            if (mode == START && ((clipStart = checkWindow(&qual, i,window)) >= 0))
                {
                /* set beginning of read to N's */
                for (j = 0 ; j < clipStart ; j++)
                    {
                    if (lower)
                        seq.dna[j] = tolower(seq.dna[j]);
                    else
                        seq.dna[j] = 'N';
                    }
                i = clipStart;
                mode = MIDDLE;
                }
            if (mode == MIDDLE)
                {
                assert(i < qual.size);
                assert(i < seq.size);
                if (qual.array[i] < minScore )
                    {
                    if (lower)
                        seq.dna[i] = tolower(seq.dna[i]);
                    else
                        seq.dna[i] = 'N';
                    /* Low-quality base right at the clip point: move the
                     * clip point past it. */
                    if (i == clipStart)
                        clipStart++;
                    }
                }
            }
        /* Backward pass: same idea, starting one window from the end. */
        mode = END;
        for (i = seq.size-window-1 ; i>=0 ; i--)
            {
            if (mode == END && ((clipEnd = checkWindow(&qual, i,window)) >= 0))
                {
                clipEnd += window+1 ; 
                assert(clipEnd <= seq.size);
                /* mask the tail of the read */
                for (j = clipEnd ; j < seq.size ; j++)
                    if (lower)
                        seq.dna[j] = tolower(seq.dna[j]) ;
                    else
                        seq.dna[j] = 'N';
                mode = MIDDLE;
                i = clipEnd+1;
                }
            else if (mode == MIDDLE)
                {
                if (qual.array[i] < minScore )
                    {
                    if (lower)
                        seq.dna[i] = tolower(seq.dna[i]) ;
                    else
                        seq.dna[i] = 'N';
                    if (i == clipEnd)
                        clipEnd--;
                    }
                }
            }
        }
    if (showQual)
        faWriteWithQualNext(f, seq.name, seq.dna, seq.size, &qual);
    else
        {
        if (seq.name != NULL)
            fprintf(f, ">%s\n", seq.name);
        if (clip)
            writeSeqWithBreaks(f, seq.dna+clipStart, clipEnd-clipStart+1, lineSize);
        else
            writeSeqWithBreaks(f, seq.dna, seq.size, lineSize);
        }
    fprintf(lift,"%s\t%d\t%d\t%d\n",seq.name, clipStart, clipEnd, seq.size);
    freez(&qual.array);
    assert(qual.array == NULL);
    ZeroVar(&seq);
    }
fclose(lift);
fclose(f);
/* The two input handles were previously never released. */
fclose(qf);
lineFileClose(&lf);
}