boolean anyCdnaSeq(char *name, struct dnaSeq **retDna, struct wormCdnaInfo *retInfo) /* Get a single cDNA sequence. Optionally (if retInfo is non-null) get additional * info about the sequence. */ { static FILE *cdnaFa; static struct snof *cdnaSnof = NULL; long offset; char *faComment; char **pFaComment = (retInfo == NULL ? NULL : &faComment); if (cdnaSnof == NULL) { char buf[512]; cdnaSnof = snofMustOpen(cdnaName); sprintf(buf, "%s%s", cdnaName, ".fa"); cdnaFa = mustOpen(buf, "rb"); } if (!snofFindOffset(cdnaSnof, name, &offset)) return FALSE; fseek(cdnaFa, offset, SEEK_SET); if (!faReadNext(cdnaFa, name, TRUE, pFaComment, retDna)) return FALSE; if (retInfo != NULL) { /* Kludge - only look up info if format is more or less right. */ int fieldCount = countChars(faComment, '|'); if (fieldCount >= 8) wormFaCommentIntoInfo(faComment, retInfo); else zeroBytes(retInfo, sizeof(*retInfo)); } return TRUE; }
struct dnaSeq *readSeqFromFaPos(struct seqFilePos *sfp, FILE *f)
/* Read one sequence from an FA file, starting at the byte position
 * recorded in sfp->pos.  Aborts via errAbort (naming sfp->name and
 * sfp->file) when no record can be read there. */
{
    struct dnaSeq *seq;

    fseek(f, sfp->pos, SEEK_SET);
    if (faReadNext(f, "", TRUE, NULL, &seq))
        return seq;

    errAbort("Couldn't faReadNext on %s in %s\n", sfp->name, sfp->file);
    return seq;     /* Only reached if errAbort returns. */
}
struct dnaSeq *faReadOneDnaSeq(FILE *f, char *defaultName, boolean mustStartWithComment)
/* Read a single sequence from an FA file.  Assumes f is positioned at or
 * before the '>' that starts the sequence.  Thin wrapper around faReadNext
 * that discards the comment line; returns NULL when faReadNext reports
 * failure. */
{
    struct dnaSeq *seq = NULL;

    return faReadNext(f, defaultName, mustStartWithComment, NULL, &seq) ? seq : NULL;
}
boolean nextWormCdnaAndInfo(struct wormCdnaIterator *it, struct dnaSeq **retSeq, struct wormCdnaInfo *retInfo)
/* Fetch the next sequence from the iterator's FA file and parse its
 * comment line into retInfo.  Returns FALSE once faReadNext has nothing
 * more to give, TRUE otherwise. */
{
    char *commentLine;

    if (faReadNext(it->faFile, "unknown", TRUE, &commentLine, retSeq))
    {
        wormFaCommentIntoInfo(commentLine, retInfo);
        return TRUE;
    }
    return FALSE;
}
boolean wormCdnaSeq(char *name, struct dnaSeq **retDna, struct wormCdnaInfo *retInfo) /* Get a single worm cDNA sequence. Optionally (if retInfo is non-null) get additional * info about the sequence. */ { long offset; char *faComment; char **pFaComment = (retInfo == NULL ? NULL : &faComment); wormCdnaCache(); if (!snofFindOffset(cdnaSnof, name, &offset)) return FALSE; fseek(cdnaFa, offset, SEEK_SET); if (!faReadNext(cdnaFa, name, TRUE, pFaComment, retDna)) return FALSE; wormFaCommentIntoInfo(faComment, retInfo); return TRUE; }
void chopFaLines(char *inName, char *outName) /* chopFaLines - Read in FA file with long lines and rewrite it with shorter lines. */ { FILE *in = mustOpen(inName, "r"); FILE *out = mustOpen(outName, "w"); char *commentLine; struct dnaSeq *seq; while (faReadNext(in, NULL, TRUE, &commentLine, &seq)) { commentLine = trimSpaces(commentLine+1); uglyf(">%s\n", commentLine); mustWrite(uglyOut, seq->dna, 100); uglyf("\n"); uglyAbort("All for now"); faWriteNext(out, commentLine, seq->dna, seq->size); } }
static struct traceInfo* parseFastaRecord(FILE* fh, char* fastaName)
/* Read the next fasta record and create a traceInfo object from it.
 * The sequence id is parsed into the trace id and the comment line into
 * the template id; fastaName is passed along to both parsers.  The record
 * and its comment are freed before returning, so only the new traceInfo
 * survives.  Returns NULL when there are no more records. */
{
    char* headerComment;
    struct dnaSeq* record;
    struct traceInfo* info;

    if (faReadNext(fh, NULL, 0, &headerComment, &record))
    {
        AllocVar(info);
        info->ti = parseTraceId(record->name, fastaName);
        info->size = record->size;
        info->templateId = parseTemplateId(headerComment, fastaName);

        /* The raw record is no longer needed once its fields are copied out. */
        freeMem(headerComment);
        freeDnaSeq(&record);
        return info;
    }
    return NULL;    /* EOF */
}
boolean flyCdnaSeq(char *name, struct dnaSeq **retDna, struct wormCdnaInfo *retInfo) /* Get a single fly cDNA sequence. Optionally (if retInfo is non-null) get additional * info about the sequence. */ { long offset; char *faComment; char **pFaComment = (retInfo == NULL ? NULL : &faComment); static struct snof *cdnaSnof = NULL; static FILE *cdnaFa; if (cdnaSnof == NULL) cdnaSnof = snofMustOpen("c:/biodata/fly/cDna/allcdna"); if (cdnaFa == NULL) cdnaFa = mustOpen("c:/biodata/fly/cDna/allcdna.fa", "rb"); if (!snofFindOffset(cdnaSnof, name, &offset)) return FALSE; fseek(cdnaFa, offset, SEEK_SET); if (!faReadNext(cdnaFa, name, TRUE, pFaComment, retDna)) return FALSE; flyFaCommentIntoInfo(faComment, retInfo); return TRUE; }
int main(int argc, char *argv[]) { char *estName, *targetName, *oocName; FILE *estFile; struct dnaSeq *target; struct dnaSeq *est; struct patSpace *ps; struct patClump *clumpList, *clump; int estIx = 0; /* Check command line arguments and assign to local variables. */ if (argc != 4) usage(); estName = argv[1]; estFile = mustOpen(estName, "rb"); targetName = argv[2]; oocName = argv[3]; /* Read in target DNA from fasta files and check not too big. */ fprintf(stderr, "Reading %s\n", targetName); target = faReadAllDna(targetName); if (totalSequenceSize(target) > 8000000) { errAbort("Can only handle 8000000 bases of genomic sequence at once, %s has %d.", targetName, totalSequenceSize(target)); } /* Make a pattern space index structure. */ fprintf(stderr, "Making Pattern Space index\n"); ps = makePatSpace(&target, 1, oocName, 4, 32000); /* Loop through each EST in query list. */ printf("Searching for hits\n\n"); while (faReadNext(estFile, NULL, TRUE, NULL, &est)) { boolean isRc; /* Reverse complemented? */ if (++estIx % 5000 == 0) fprintf(stderr, "Processing EST %d\n", estIx); if (est->size > 20000) { warn("Very large EST sequence %s.\n" "Maybe you mixed up the EST and genomic parameters?", est->name); usage(); } for (isRc = 0; isRc <= 1; ++isRc) /* Search both strands. */ { if (isRc) reverseComplement(est->dna, est->size); clumpList = patSpaceFindOne(ps, est->dna, est->size); /* For each homology clump patSpace finds, do a fuzzyFinder * alignment of it and print the results. */ for (clump = clumpList; clump != NULL; clump = clump->next) { struct ffAli *ali, *a; boolean isRc; int score; struct dnaSeq *t = clump->seq; DNA *tStart = t->dna + clump->start; ali = ffFind(est->dna, est->dna+est->size, tStart, tStart + clump->size, ffCdna); if (ali != NULL) { score = ffScoreCdna(ali); printf("%s hits %s strand %c score %d\n", est->name, t->name, (isRc ? 
'+' : '-'), score); for (a = ali; a != NULL; a = a->right) { printf(" Q %4d - %4d\t T %4d -%4d\n", a->nStart - est->dna, a->nEnd - est->dna, a->hStart - t->dna, a->hEnd - t->dna); } printf("\n"); ffFreeAli(&ali); } else { printf("Couldn't align clump at %s %d-%d\n", t->name, clump->start, clump->start + clump->size); } } slFreeList(&clumpList); } freeDnaSeq(&est); } /* Clean up time. */ freePatSpace(&ps); freeSeqList(&target); return 0; }