void faCat(char *inFile, char *outFile, char *liftFile) /* faCat - Filter out fa records that don't match expression. */ { char *tempFile = rTempName("/tmp", "lift", ".lft"); struct lineFile *inLf = lineFileOpen(inFile, TRUE); FILE *outFh = NULL; FILE *tempFh = mustOpen(tempFile, "w"); DNA *seq; int seqSize; char *seqHeader; long int offset = 0; char *gap = NULL; int i, fileIndex = 1; char nameNew[512]; char outFileName[512]; char liftFileName[512]; char fastaHeader[512]; safef(nameNew,sizeof(nameNew), "%s.%d",name, fileIndex); safef(fastaHeader,sizeof(fastaHeader),">%s\n",nameNew); safef(outFileName, sizeof(outFileName), "%s.%d.fa",outFile, fileIndex); safef(liftFileName, sizeof(liftFileName), "%s.%d.lft",liftFile, fileIndex++); outFh = mustOpen(outFileName, "w"); gap = needMem(gapSize+1); for (i = 0 ; i < gapSize ; i++) { gap[i] = 'N'; } gap[i] = '\0'; mustWrite(outFh, fastaHeader, strlen(fastaHeader)); while (faMixedSpeedReadNext(inLf, &seq, &seqSize, &seqHeader)) { // if (vOption ^ recMatches(seq, seqSize, seqHeader)) // faWriteNext(outFh, seqHeader, seq, seqSize); /* output lift record: offset oldName oldSize newName newSize */ fprintf(tempFh, "%ld\t%s\t%d\t%s\t%d\n",offset, nameNew, 0, seqHeader, seqSize); offset += (seqSize + gapSize); writeSeqWithBreaks(outFh, seq, seqSize, 50); writeSeqWithBreaks(outFh, gap, gapSize, 50); if (offset > maxOutputSize) { carefulClose(&tempFh); carefulClose(&outFh); fixNewLength(tempFile, liftFileName, offset); tempFh = mustOpen(tempFile, "w"); safef(nameNew,sizeof(nameNew), "%s.%d",name, fileIndex); safef(fastaHeader,sizeof(fastaHeader),">%s\n",nameNew); safef(liftFileName, sizeof(liftFileName), "%s.%d.lft",liftFile, fileIndex); safef(outFileName, sizeof(outFileName), "%s.%d.fa",outFile, fileIndex++); outFh = mustOpen(outFileName, "w"); mustWrite(outFh, fastaHeader, strlen(fastaHeader)); offset = 0; } } carefulClose(&tempFh); fixNewLength(tempFile, liftFileName, offset); lineFileClose(&inLf); carefulClose(&outFh); 
unlink(tempFile); }
static void makeDirFasta(char *regionsFile, char *hg18FastaFile, char *dir, int num) { FILE *fp, *sq; char buf[500], dirName[500], seqName[500], chr1[500], chr2[500]; int b1, e1, b2, e2, i, len; char ori1, ori2; struct hash *seqHash = NULL; struct dnaSeq *seq1, *seq2; struct stat st; DNA *s1, *s2; seqHash = faReadAllIntoHash(hg18FastaFile, dnaUpper); if (stat(dir, &st) != 0) do_cmd("mkdir %s", dir); fp = mustOpen(regionsFile, "r"); i = 0; while (fgets(buf, 500, fp)) { if (sscanf(buf, "%[^:]:%d-%d %[^:]:%d-%d [%c %c]", chr1, &b1, &e1, chr2, &b2, &e2, &ori1, &ori2) != 8) errAbort("error: %s", buf); ++i; if (i != num) continue; sprintf(dirName, "%s/R%d", dir, i); if (stat(dirName, &st) != 0) do_cmd("mkdir %s", dir); sprintf(seqName, "%s/ref.fa", dirName); sq = mustOpen(seqName, "w"); fprintf(sq, ">%s:%d-%d+%s:%d-%d[%c%c]\n", chr1, b1, e1, chr2, b2, e2, ori1, ori2); seq1 = (struct dnaSeq *)hashFindVal(seqHash, chr1); assert(e1 <= seq1->size); len = e1 - b1 + 1; if (ori1 == '-') { s1 = cloneStringZExt(seq1->dna + b1 - 1, len, len+1); reverseComplement(s1, len); writeSeqWithBreaks(sq, s1, len, 80); freeMem(s1); } else writeSeqWithBreaks(sq, seq1->dna + b1 - 1, e1 - b1 + 1, 80); seq2 = (struct dnaSeq *)hashFindVal(seqHash, chr2); assert(e2 <= seq2->size); len = e2 - b2 + 1; if (ori2 == '-') { s2 = cloneStringZExt(seq2->dna + b2 - 1, len, len+1); reverseComplement(s2, len); writeSeqWithBreaks(sq, s2, len, 80); freeMem(s2); } else writeSeqWithBreaks(sq, seq2->dna + b2 - 1, e2 - b2 + 1, 80); fclose(sq); } fclose(fp); //FIXME: free space }
static void getSeqFromBlob(struct sqlConnection *conn, struct subjInfo *siList,
	char *tableName, char *xrefField)
/* Get sequence from blob field in table and print it as fasta.
 *   conn      - open database connection.
 *   siList    - subjects to report; fields[1] of each is used as the subject id.
 *   tableName - table with 'id' and 'seq' columns.
 *   xrefField - column of gisaidXref that is joined to tableName.id.
 * Each record is printed as ">id:subjId" followed by the sequence at 60
 * bases per line.  If nothing was printed a short notice is shown instead. */
{
struct sqlResult *sr;
char **row;
char query[256];
struct subjInfo *si;
int seqCnt = 0;

hPrintf("<TT><PRE>");
for (si = siList; si != NULL; si = si->next)
    {
    char *subjId = si->fields[1];
    /* currently just 3 Thailand or 4 US */
    sqlSafef(query, sizeof(query),
	"select id, seq from %s s, gisaidXref g where g.subjId='%s' and g.%s=s.id",
	tableName, subjId, xrefField);
    sr = sqlGetResult(conn, query);
    while ((row = sqlNextRow(sr)) != NULL)
	{
	char *id = row[0];
	char *seq = row[1];
	/* A row without sequence data has nothing to print; skipping it also
	 * avoids calling strlen() on a NULL pointer. */
	if (seq == NULL)
	    continue;
	hPrintf(">%s", id);
	hPrintf(":%s\n", subjId);
	writeSeqWithBreaks(stdout, seq, strlen(seq), 60);
	hPrintf("\n");
	seqCnt++;
	}
    sqlFreeResult(&sr);
    }
if (seqCnt == 0)
    hPrintf("No sequence data available for subject(s) selected.");
/* Close in reverse order of opening so the tags nest properly. */
hPrintf("</PRE></TT>");
}
int main(int argc, char *argv[]) /* read snpTable, generate skinny sequence for chrom */ { char fileName[64]; FILE *f; struct dnaSeq *skinnySeq = NULL; if (argc != 5) usage(); database = argv[1]; hSetDb(database); chromName = argv[2]; snpTable = argv[4]; if (!hTableExists(snpTable)) errAbort("no %s table\n", snpTable); skinnySeq = getSkinnySeq(argv[3], chromName); stripChar(skinnySeq->dna, '-'); safef(fileName, ArraySize(fileName), "%s.skinny", chromName); f = mustOpen(fileName, "w"); // faWriteNext(f, chromName, skinnySeq->dna, strlen(skinnySeq->dna)); fprintf(f, ">%s\n", chromName); writeSeqWithBreaks(f, skinnySeq->dna, strlen(skinnySeq->dna), 50); carefulClose(&f); return 0; }
static void showMrnaFromGenePred(struct sqlConnection *conn, char *geneId, char *geneName) /* Get mRNA sequence for gene from gene prediction. */ { char *table = genomeSetting("knownGene"); struct sqlResult *sr; char **row; char query[256]; boolean hasBin = hIsBinned(sqlGetDatabase(conn), table); hPrintf("<TT><PRE>"); safef(query, sizeof(query), "select * from %s where name='%s'" " and chrom='%s' and txStart=%d and txEnd=%d", table, geneId, curGeneChrom, curGeneStart, curGeneEnd); sr = sqlGetResult(conn, query); if ((row = sqlNextRow(sr)) != NULL) { struct genePred *gene = genePredLoad(row+hasBin); struct bed *bed = bedFromGenePred(gene); struct dnaSeq *seq = hSeqForBed(sqlGetDatabase(conn), bed); hPrintf(">%s (%s predicted mRNA)\n", geneId, geneName); writeSeqWithBreaks(stdout, seq->dna, seq->size, 50); dnaSeqFree(&seq); bedFree(&bed); genePredFree(&gene); } else errAbort("Couldn't find %s at %s:%d-%d", geneId, curGeneChrom, curGeneStart, curGeneEnd); sqlFreeResult(&sr); hPrintf("</TT></PRE>"); }
void faWriteNext(FILE *f, char *startLine, DNA *dna, int dnaSize)
/* Append one record to an open fa file.  Nothing at all is written when
 * dnaSize is zero.  Otherwise a ">startLine" header is written, unless
 * startLine is NULL, followed by the bases at 50 per line. */
{
if (dnaSize != 0)
    {
    if (startLine != NULL)
	fprintf(f, ">%s\n", startLine);
    writeSeqWithBreaks(f, dna, dnaSize, 50);
    }
}
void showSeqFromTable(struct sqlConnection *conn, char *geneId, char *geneName, char *table)
/* Show some sequence from given table.
 *   conn     - open database connection.
 *   geneId   - value matched against the table's 'name' column.
 *   geneName - label shown in the fasta header.
 *   table    - table with 'name' and 'seq' columns.
 * Prints the first matching row as fasta at 60 characters per line.  A row
 * whose seq is NULL yields a header with length=0 and no sequence lines.
 * Nothing but the surrounding tags is printed when no row matches. */
{
char query[512];
struct sqlResult *sr;
char **row;

hPrintf("<TT><PRE>");
/* geneId ends up inside a quoted SQL literal, so build the query with
 * sqlSafef() rather than plain safef(). */
sqlSafef(query, sizeof(query),
    "select seq from %s where name = '%s'", table, geneId);
sr = sqlGetResult(conn, query);
if ((row = sqlNextRow(sr)) != NULL)
    {
    char *seq = row[0];
    int seqLen = (seq != NULL) ? (int)strlen(seq) : 0;
    hPrintf(">%s (%s) length=%d\n", geneId, geneName, seqLen);
    /* The header already allowed for a NULL seq; the body must too. */
    if (seq != NULL)
	writeSeqWithBreaks(stdout, seq, seqLen, 60);
    }
sqlFreeResult(&sr);
hPrintf("</PRE></TT>");
}
void doGenePredNongenomic(struct sqlConnection *conn, int typeIx) /* Get mrna or protein associated with selected genes. */ { /* Note this does do the whole genome at once rather than one * chromosome at a time, but that's ok because the gene prediction * tracks this serves are on the small side. */ char *typeWords[3]; char *table; struct lm *lm = lmInit(64*1024); int fieldCount; struct bed *bed, *bedList = cookedBedsOnRegions(conn, curTable, getRegions(), lm, &fieldCount); int typeWordCount; textOpen(); /* Figure out which table to use. */ if (isRefGeneTrack(curTable)) { if (typeIx == 1) /* Protein */ doRefGeneProteinSequence(conn, bedList); else doRefGeneMrnaSequence(conn, bedList); } else { char *dupType = cloneString(findTypeForTable(database, curTrack, curTable, ctLookupName)); typeWordCount = chopLine(dupType, typeWords); if (typeIx >= typeWordCount) internalErr(); table = typeWords[typeIx]; if (sqlTableExists(conn, table)) { struct sqlResult *sr; char **row; char query[256]; struct hash *hash = newHash(18); boolean gotResults = FALSE; /* Make hash of all id's passing filters. */ for (bed = bedList; bed != NULL; bed = bed->next) hashAdd(hash, bed->name, NULL); /* Scan through table, outputting ones that match. */ sqlSafef(query, sizeof(query), "select name, seq from %s", table); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { if (hashLookup(hash, row[0])) { hPrintf(">%s\n", row[0]); writeSeqWithBreaks(stdout, row[1], strlen(row[1]), 60); gotResults = TRUE; } } sqlFreeResult(&sr); hashFree(&hash); if (!gotResults) hPrintf(NO_RESULTS); } else { internalErr(); } freez(&dupType); } lmCleanup(&lm); }
void faTrimRead(char *inFile, char *qualFile, char *outFile, char *liftFile) /* faTrimRead - trim reads based on qual scores */ { struct lineFile *lf = lineFileOpen(inFile, TRUE); struct dnaSeq seq; FILE *qf = mustOpen(qualFile, "r"); FILE *f = mustOpen(outFile, "w"); FILE *lift = mustOpen(liftFile, "w"); int seqCount = 0; ZeroVar(&seq); fprintf(lift,"## name \tclipStart\tclipEnd\tSize\n"); while (faSomeSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name, FALSE)) { int i, j = 0; int mode = START; struct qual qual; int clipStart = 0, clipEnd = seq.size; seqCount += 1; qual.size = seq.size; qual.end = 0; assert(seq.size < MAXREADSIZE); qual.array = needMem((qual.size+1)*sizeof(int)); if (qualReadAll(qf, TRUE, "name", FALSE, NULL, &qual)) { for (i = 0 ; i<seq.size ; i++) seq.dna[i] = toupper(seq.dna[i]); for (i = 0 ; i<seq.size ; i++) { if (mode == START && ((clipStart = checkWindow(&qual, i,window)) >= 0)) { /* set beginning of read to N's */ for (j = 0 ; j < clipStart ; j++) { if (lower) seq.dna[j] = tolower(seq.dna[j]); else seq.dna[j] = 'N'; } i = clipStart; mode = MIDDLE; } if (mode == MIDDLE) { assert(i < qual.size); assert(i < seq.size); if (qual.array[i] < minScore ) { if (lower) seq.dna[i] = tolower(seq.dna[i]); else seq.dna[i] = 'N'; if (i == clipStart) clipStart++; } } } mode = END; for (i = seq.size-window-1 ; i>=0 ; i--) { //seq.dna[i] = toupper(seq.dna[i]) ; if (mode == END && ((clipEnd = checkWindow(&qual, i,window)) >= 0)) { clipEnd += window+1 ; assert(clipEnd <= seq.size); for (j = clipEnd ; j < seq.size ; j++) if (lower) seq.dna[j] = tolower(seq.dna[j]) ; else seq.dna[j] = 'N'; mode = MIDDLE; i = clipEnd+1; } else if (mode == MIDDLE) { if (qual.array[i] < minScore ) { if (lower) seq.dna[i] = tolower(seq.dna[i]) ; else seq.dna[i] = 'N'; if (i == clipEnd) clipEnd--; } } } } /* for (i=seq.size-1; i>=0; --i) { DNA b = seq.dna[i]; if (b == 'a' || b == 'A') ++aSize; else break; } if (aSize >= 4) { memset(seq.dna + seq.size - aSize, 'n', aSize); seq.size -= 
aSize; seq.dna[seq.size-aSize] = 0; } */ if (showQual) faWriteWithQualNext(f, seq.name, seq.dna, seq.size, &qual); else { //faWriteNext(f, seq.name, seq.dna+clipStart, clipEnd-clipStart+1); if (seq.name != NULL) fprintf(f, ">%s\n", seq.name); if (clip) writeSeqWithBreaks(f, seq.dna+clipStart, clipEnd-clipStart+1, lineSize); else writeSeqWithBreaks(f, seq.dna, seq.size, lineSize); } fprintf(lift,"%s\t%d\t%d\t%d\n",seq.name, clipStart, clipEnd, seq.size); freez(&qual.array); assert(qual.array == NULL); ZeroVar(&seq); } fclose(lift); fclose(f); }