int main(int argc, char *argv[]) { char *estName, *targetName, *oocName; FILE *estFile; struct dnaSeq *target; struct dnaSeq *est; struct patSpace *ps; struct patClump *clumpList, *clump; int estIx = 0; /* Check command line arguments and assign to local variables. */ if (argc != 4) usage(); estName = argv[1]; estFile = mustOpen(estName, "rb"); targetName = argv[2]; oocName = argv[3]; /* Read in target DNA from fasta files and check not too big. */ fprintf(stderr, "Reading %s\n", targetName); target = faReadAllDna(targetName); if (totalSequenceSize(target) > 8000000) { errAbort("Can only handle 8000000 bases of genomic sequence at once, %s has %d.", targetName, totalSequenceSize(target)); } /* Make a pattern space index structure. */ fprintf(stderr, "Making Pattern Space index\n"); ps = makePatSpace(&target, 1, oocName, 4, 32000); /* Loop through each EST in query list. */ printf("Searching for hits\n\n"); while (faReadNext(estFile, NULL, TRUE, NULL, &est)) { boolean isRc; /* Reverse complemented? */ if (++estIx % 5000 == 0) fprintf(stderr, "Processing EST %d\n", estIx); if (est->size > 20000) { warn("Very large EST sequence %s.\n" "Maybe you mixed up the EST and genomic parameters?", est->name); usage(); } for (isRc = 0; isRc <= 1; ++isRc) /* Search both strands. */ { if (isRc) reverseComplement(est->dna, est->size); clumpList = patSpaceFindOne(ps, est->dna, est->size); /* For each homology clump patSpace finds, do a fuzzyFinder * alignment of it and print the results. */ for (clump = clumpList; clump != NULL; clump = clump->next) { struct ffAli *ali, *a; boolean isRc; int score; struct dnaSeq *t = clump->seq; DNA *tStart = t->dna + clump->start; ali = ffFind(est->dna, est->dna+est->size, tStart, tStart + clump->size, ffCdna); if (ali != NULL) { score = ffScoreCdna(ali); printf("%s hits %s strand %c score %d\n", est->name, t->name, (isRc ? '+' : '-'), score); for (a = ali; a != NULL; a = a->right) { printf(" Q %4d - %4d\t T %4d -%4d\n", a->nStart - est->dna, a->nEnd - est->dna, a->hStart - t->dna, a->hEnd - t->dna); } printf("\n"); ffFreeAli(&ali); } else { printf("Couldn't align clump at %s %d-%d\n", t->name, clump->start, clump->start + clump->size); } } slFreeList(&clumpList); } freeDnaSeq(&est); } /* Clean up time. */ freePatSpace(&ps); freeSeqList(&target); return 0; }
int main(int argc, char *argv[]) { char *genoListName; char *cdnaListName; char *oocFileName; char *pairFileName; struct patSpace *patSpace; long startTime, endTime; char **genoList; int genoListSize; char *genoListBuf; char **cdnaList; int cdnaListSize; char *cdnaListBuf; char *genoName; int i; int estIx = 0; struct dnaSeq **seqListList = NULL, *seq; static char hitFileName[512], mergerFileName[512], okFileName[512]; char *outRoot; struct hash *pairHash; if (dumpMe) { bigHtmlFile = mustOpen("C:\\inetpub\\wwwroot\\test\\patAli.html", "w"); littleHtmlFile = mustOpen("C:\\inetpub\\wwwroot\\test\\patSpace.html", "w"); htmStart(bigHtmlFile, "PatSpace Alignments"); htmStart(littleHtmlFile, "PatSpace Index"); } if ((hostName = getenv("HOST")) == NULL) hostName = ""; if (argc != 6) usage(); pushWarnHandler(patSpaceWarnHandler); startTime = clock1000(); dnaUtilOpen(); makePolys(); genoListName = argv[1]; cdnaListName = argv[2]; oocFileName = argv[3]; pairFileName = argv[4]; outRoot = argv[5]; sprintf(hitFileName, "%s.hit", outRoot); sprintf(mergerFileName, "%s.glu", outRoot); sprintf(okFileName, "%s.ok", outRoot); readAllWords(genoListName, &genoList, &genoListSize, &genoListBuf); readAllWords(cdnaListName, &cdnaList, &cdnaListSize, &cdnaListBuf); pairHash = makePairHash(pairFileName); hitOut = mustOpen(hitFileName, "w"); mergerOut = mustOpen(mergerFileName, "w"); dumpOut = mustOpen("dump.out", "w"); seqListList = needMem(genoListSize*sizeof(seqListList[0]) ); fprintf(hitOut, "Pattern space 0.2 cDNA matcher\n"); fprintf(hitOut, "cDNA files: ", cdnaListSize); for (i=0; i<cdnaListSize; ++i) fprintf(hitOut, " %s", cdnaList[i]); fprintf(hitOut, "\n"); fprintf(hitOut, "%d genomic files\n", genoListSize); for (i=0; i<genoListSize; ++i) { genoName = genoList[i]; if (!startsWith("//", genoName) ) { seqListList[i] = seq = faReadAllDna(genoName); fprintf(hitOut, "%d els in %s ", slCount(seq), genoList[i]); for (; seq != NULL; seq = seq->next) fprintf(hitOut, "%d ", seq->size); fprintf(hitOut, "\n"); } } patSpace = makePatSpace(seqListList, genoListSize, oocFileName); for (i=0; i<cdnaListSize; ++i) { FILE *f; char *estFileName; DNA *dna; char *estName; int size; int c; int maxSizeForFuzzyFind = 20000; int dotCount = 0; estFileName = cdnaList[i]; if (startsWith("//", estFileName) ) continue; f = mustOpen(estFileName, "rb"); while ((c = fgetc(f)) != EOF) if (c == '>') break; printf("%s", cdnaList[i]); fflush(stdout); while (fastFaReadNext(f, &dna, &size, &estName)) { aliSeqName = estName; if (size < maxSizeForFuzzyFind) /* Some day need to fix this somehow... */ { struct hashEl *hel; struct cdnaAliList *calList = NULL; hel = hashLookup(pairHash, estName); if (hel != NULL) /* Do pair processing. */ { struct estPair *ep; struct seq *thisSeq, *otherSeq; ep = hel->val; if (hel->name == ep->name3) { thisSeq = &ep->seq3; otherSeq = &ep->seq5; } else { thisSeq = &ep->seq5; otherSeq = &ep->seq3; } if (otherSeq->dna == NULL) /* First in pair - need to save sequence. */ { thisSeq->size = size; thisSeq->dna = needMem(size); memcpy(thisSeq->dna, dna, size); } else /* Second in pair - do gluing and free partner. */ { char mergedName[64]; thisSeq->dna = dna; thisSeq->size = size; sprintf(mergedName, "%s_AND_%s", ep->name5, ep->name3); patSpaceFindOne(patSpace, ep->seq5.dna, ep->seq5.size, '+', '5', ep->name5, &calList); reverseComplement(ep->seq5.dna, ep->seq5.size); patSpaceFindOne(patSpace, ep->seq5.dna, ep->seq5.size, '-', '5', ep->name5, &calList); patSpaceFindOne(patSpace, ep->seq3.dna, ep->seq3.size, '+', '3', ep->name3, &calList); reverseComplement(ep->seq3.dna, ep->seq3.size); patSpaceFindOne(patSpace, ep->seq3.dna, ep->seq3.size, '-', '3', ep->name3, &calList); slReverse(&calList); writeMergers(calList, mergedName, genoList); freez(&otherSeq->dna); thisSeq->dna = NULL; thisSeq->size =otherSeq->size = 0; } } else { patSpaceFindOne(patSpace, dna, size, '+', '5', estName, &calList); reverseComplement(dna, size); patSpaceFindOne(patSpace, dna, size, '-', '5', estName, &calList); slReverse(&calList); writeMergers(calList, estName, genoList); } ++estIx; if ((estIx & 0xfff) == 0) { printf("."); ++dotCount; fflush(stdout); } } } printf("\n"); } aliSeqName = ""; printf("raw %4d ffSubmitted %3d ffAccepted %3d ffOkScore %3d ffSolidMatch %2d\n", grandTotalHits, ffSubmitted, ffAccepted, ffOkScore, ffSolidMatch); endTime = clock1000(); printf("Total time is %4.2f\n", 0.001*(endTime-startTime)); /* Write out file who's presense say's we succeeded */ { FILE *f = mustOpen(okFileName, "w"); fputs("ok", f); fclose(f); } if (dumpMe) { htmEnd(bigHtmlFile); htmEnd(littleHtmlFile); } return 0; }
void fakeFinContigs(char *agpName, char *faName, char *finDir, char *rootName, char *finFaDir, char *ooVer) /* fakeFinContigs - Fake up contigs for a finished chromosome. */ { struct contig *contigList = NULL, *contig = NULL; struct agpFrag *agp; struct lineFile *lf = lineFileOpen(agpName, TRUE); char *line, *words[16]; int lineSize, wordCount; int contigIx = 0; char liftDir[512], contigDir[512], path[512]; char chrom[128]; FILE *f; struct dnaSeq *seq; int fragIx; /* Build up contig list by scanning agp file. */ printf("Reading %s\n", lf->fileName); while (lineFileNext(lf, &line, &lineSize)) { if (line[0] == '#' || line[0] == 0) continue; wordCount = chopLine(line, words); if (wordCount < 5) errAbort("Expecting at least 5 words line %d of %s", lf->lineIx, lf->fileName); if (words[4][0] == 'N' || words[4][0] == 'U') { contig = NULL; continue; } lineFileExpectWords(lf, 9, wordCount); agp = agpFragLoad(words); // file is 1-based but agpFragLoad() now assumes 0-based: agp->chromStart -= 1; agp->fragStart -= 1; if (contig == NULL) { AllocVar(contig); sprintf(contig->name, "%s%d", rootName, ++contigIx); contig->startOffset = agp->chromStart; slAddHead(&contigList, contig); } else { if (contig->agpList != NULL && contig->agpList->chromEnd != agp->chromStart) errAbort("Start doesn't match previous end line %d of %s", lf->lineIx, lf->fileName); } if (agp->chromEnd - agp->chromStart != agp->fragEnd - agp->fragStart) errAbort("Chrom and frag size mismatch line %d of %s", lf->lineIx, lf->fileName); slAddHead(&contig->agpList, agp); contig->endOffset = agp->chromEnd; } slReverse(&contigList); for (contig = contigList; contig != NULL; contig = contig->next) slReverse(&contig->agpList); lineFileClose(&lf); /* Load up chromosome sequence and make sure it is in one piece. */ printf("Reading %s\n", faName); seq = faReadAllDna(faName); if (slCount(seq) != 1) errAbort("Got %d sequences in %s, can only handle one.", slCount(seq), faName); /* Fix up agp coordinates. Make a directory for each contig. Fill it with * .fa .agp barge.NN files for that contig. */ printf("Writing contig dirs\n"); for (contig = contigList; contig != NULL; contig = contig->next) { /* Make Contig dir. */ sprintf(contigDir, "%s/%s", finDir, contig->name); makeDir(contigDir); /* Make contig.agp file. */ sprintf(path, "%s/%s.agp", contigDir, contig->name); f = mustOpen(path, "w"); fragIx = 0; for (agp = contig->agpList; agp != NULL; agp = agp->next) { char buf[128]; sprintf(buf, "%s/%s", skipChr(agp->chrom), contig->name); freez(&agp->chrom); agp->chrom = cloneString(buf); agp->chromStart -= contig->startOffset; agp->chromEnd -= contig->startOffset; agp->ix = ++fragIx; agpFragTabOut(agp, f); } carefulClose(&f); /* Make ooGreedy.NN.gl file */ sprintf(path, "%s/%s.%s.gl", contigDir, "ooGreedy", ooVer); f = mustOpen(path, "w"); for (agp = contig->agpList; agp != NULL; agp = agp->next) { if (agp->type[0] != 'N' && agp->type[0] != 'U') { fprintf(f, "%s_1\t%d\t%d\t%s\n", agp->frag, agp->chromStart, agp->chromEnd, agp->strand); } } carefulClose(&f); /* Make contig.fa file. */ sprintf(path, "%s/%s.fa", contigDir, contig->name); faWrite(path, contig->name, seq->dna + contig->startOffset, contig->endOffset - contig->startOffset); /* Make contig/barge file. */ sprintf(path, "%s/barge.%s", contigDir, ooVer); f = mustOpen(path, "w"); fprintf(f, "Barge (Connected Clone) File ooGreedy Version %s\n", ooVer); fprintf(f, "\n"); fprintf(f, "start accession size overlap maxClone maxOverlap\n"); fprintf(f, "------------------------------------------------------------\n"); for (agp = contig->agpList; agp != NULL; agp = agp->next) { char clone[128]; strcpy(clone, agp->frag); chopSuffix(clone); fprintf(f, "%d\t%s\t%d\t100\tn/a\t0\n", agp->chromStart, clone, agp->chromEnd); } carefulClose(&f); /* Make contig/gold file. */ sprintf(path, "%s/gold.%s", contigDir, ooVer); f = mustOpen(path, "w"); fragIx = 0; for (agp = contig->agpList; agp != NULL; agp = agp->next) { char fragName[128]; struct agpFrag frag = *agp; sprintf(fragName, "%s_1", agp->frag); frag.frag = fragName; frag.type[0] = '0'; agpFragTabOut(&frag, f); } carefulClose(&f); } /* Create lift subdirectory. */ printf("Creating lift files\n"); sprintf(liftDir, "%s/lift", finDir); makeDir(liftDir); /* Create lift/oOut.lst file (just a list of contigs). */ sprintf(path, "%s/oOut.lst", liftDir); f = mustOpen(path, "w"); for (contig = contigList; contig != NULL; contig = contig->next) fprintf(f, "%s/%s.fa.out\n", contig->name, contig->name); carefulClose(&f); /* Create lift/ordered.lst file (just a list of contigs). */ sprintf(path, "%s/ordered.lst", liftDir); f = mustOpen(path, "w"); for (contig = contigList; contig != NULL; contig = contig->next) fprintf(f, "%s\n", contig->name); carefulClose(&f); /* Create lift/ordered.lft file. */ sprintf(path, "%s/ordered.lft", liftDir); f = mustOpen(path, "w"); splitPath(faName, NULL, chrom, NULL); for (contig = contigList; contig != NULL; contig = contig->next) fprintf(f, "%d\t%s/%s\t%d\t%s\t%d\n", contig->startOffset, skipChr(chrom), contig->name, contig->endOffset - contig->startOffset, chrom, seq->size); carefulClose(&f); }