void aliTrack(char *bacAcc, char *wholeName, char *partsName, struct memGfx *mg, int x, int y, FILE *mapFile, int trim, char *repeatMask) /* Write out one alignment track. */ { struct dnaSeq *whole, *partList, *part; bits16 contig; int maxBlockSize = 5000; int wholeSize; struct patSpace *ps; DNA *wholeDna; whole = faReadAllDna(wholeName); if (slCount(whole) > 1) warn("%d sequences in %s, only using first", slCount(whole), wholeName); wholeDna = whole->dna; wholeSize = whole->size; ps = makePatSpace(&whole, 1, oocFile, 5, 500); partList = faReadAllDna(partsName); printf("%d contigs in %s\n\n", slCount(partList), partsName); for (part = partList, contig = 0; part != NULL; part = part->next, ++contig) { DNA *dna = part->dna; int dnaSize = part->size; int start, size; int subIx = 0; char numText[12]; Color color = blockColors[contig%ArraySize(blockColors)]; sprintf(numText, "%d", contig+1); for (start = trim; start < dnaSize-trim; start += size) { struct ffAli *left, *right; boolean rc; int score; size = dnaSize - start-trim; if (size > maxBlockSize) size = maxBlockSize; if (!fastFind(dna+start, size, ps, &left, &rc, &score) ) { printf("Contig %d.%d:%d-%d of %d UNALIGNED\n", contig+1, subIx, start, start+size, dnaSize); } else { int x1, x2; int xo, w; double quality; int qStart, qSize, tStart,tSize; char qualityString[40]; right = left; while (right->right != NULL) right = right->right; qStart = left->nStart - dna; qSize = right->nEnd - left->nStart; if (rc) { int rcEnd = right->nEnd - (dna+start) - 1; qStart = reverseOffset(rcEnd, size) + start; } tStart = left->hStart - wholeDna; tSize = right->hEnd - left->hStart; quality = 100.0 * score / qSize; if (quality >= 25.0) sprintf(qualityString, "%4.1f%%", quality); else sprintf(qualityString, "<50%%"); printf("<A HREF=\"../cgi-bin/chkGlue.exe?bacAcc=%s&contig=%d&qStart=%d&qSize=%d&tStart=%d&tSize=%d&repeatMask=%s\">", bacAcc, contig, qStart, qSize, tStart, tSize, repeatMask); printf("Contig %d.%d:%d-%d %c of %d aligned %d-%d of %d aliSize %d quality %s</A>\n", contig+1, subIx, qStart, qStart+qSize, (rc ? '-' : '+'), dnaSize, tStart, tStart + tSize, wholeSize, qSize, qualityString); x1 = roundingScale(trackWidth, left->hStart - wholeDna, wholeSize); x2 = roundingScale(trackWidth, right->hEnd - wholeDna, wholeSize); xo = x1+x; w = x2-x1; mapWriteBox(mapFile, mtBlock, xo, y, w, trackHeight, bacAcc, contig, qStart, qSize, tStart, tSize); mgDrawBox(mg, xo, y, w, trackHeight, color); mgTextCentered(mg, xo, y, w, trackHeight, MG_WHITE, font, numText); ffFreeAli(&left); } ++subIx; } } freePatSpace(&ps); freeAllSeq(&whole); freeAllSeq(&partList); }
int main(int argc, char *argv[]) { char *genoListName; char *otherListName; char *oocFileName; char *typeName; char *outName; struct patSpace *patSpace; long startTime, endTime; char **genoList; int genoListSize; char *genoListBuf; char **otherList; int otherListSize; char *otherListBuf; char *genoName; int i; int blockCount = 0; struct dnaSeq **seqListList = NULL, *seq = NULL; char *outRoot; struct sqlConnection *conn; enum ffStringency stringency = ffCdna; int seedSize = 10; FILE *out; boolean noHead = FALSE; struct repeatTracker *rt; struct hash *repeatHash = newHash(10); hostName = getenv("HOST"); pushWarnHandler(warnHandler); startTime = clock1(); cgiSpoof(&argc, argv); minMatch = cgiOptionalInt("minMatch", minMatch); maxBad = cgiOptionalInt("maxBad", maxBad); minBases = cgiOptionalInt("minBases", minBases); dnaUtilOpen(); #ifdef DEBUG /* Hard wire command line input so don't have to type it in each * time run the stupid Gnu debugger. */ genoListName = "pFoo/geno.lst"; otherListName = "pFoo/bacend.lst"; typeName = "genomic"; oocFileName = "/d/biodata/human/10.ooc"; outName = "pFoo/pFoo.psl"; #else if (argc != 6 && argc != 7) usage(); genoListName = argv[1]; otherListName = argv[2]; typeName = argv[3]; oocFileName = argv[4]; if (sameWord(oocFileName, "none")) oocFileName = NULL; outName = argv[5]; if (argc == 7) { if (sameWord("noHead", argv[6])) noHead = TRUE; else usage(); } #endif if (sameWord(typeName, "mRNA") || sameWord(typeName, "cDNA")) { stringency = ffCdna; } else if (sameWord(typeName, "genomic")) { stringency = ffTight; } else if (sameWord(typeName, "g2g")) { stringency = ffTight; veryTight = TRUE; seedSize = 11; } else if (sameString(typeName, "asm")) { stringency = ffTight; avoidSelfSelf = TRUE; } else { warn("Unrecognized otherType %s\n", typeName); usage(); } readAllWordsOrFa(genoListName, &genoList, &genoListSize, &genoListBuf); filterMissingFiles(genoList, &genoListSize); if (genoListSize <= 0) errAbort("There are no files that exist in %s\n", genoListName); readAllWordsOrFa(otherListName, &otherList, &otherListSize, &otherListBuf); if (otherListSize <= 0) errAbort("There are no files that exist in %s\n", otherListName); filterMissingFiles(otherList, &otherListSize); out = mustOpen(outName, "w"); if (!noHead) pslWriteHead(out); AllocArray(seqListList, genoListSize); for (i=0; i<genoListSize; ++i) { genoName = genoList[i]; if (!startsWith("#", genoName) ) seqListList[i] = seq = faReadAllDna(genoName); for (;seq != NULL; seq = seq->next) { int size = seq->size; char *name = seq->name; struct hashEl *hel; AllocVar(rt); AllocArray(rt->repBytes, size); rt->seq = seq; if ((hel = hashLookup(repeatHash, name)) != NULL) errAbort("Duplicate %s in %s\n", name, genoName); hashAdd(repeatHash, name, rt); } storeMasked(repeatHash, genoName); } patSpace = makePatSpace(seqListList, genoListSize, seedSize, oocFileName, minMatch, 2000); endTime = clock1(); printf("Made index in %ld seconds\n", (endTime-startTime)); startTime = endTime; for (i=0; i<otherListSize; ++i) { FILE *f; char *otherName; int c; int dotCount = 0; struct dnaSeq otherSeq; ZeroVar(&otherSeq); otherName = otherList[i]; if (startsWith("#", otherName) ) continue; f = mustOpen(otherName, "r"); while ((c = fgetc(f)) != EOF) if (c == '>') break; printf("%s\n", otherName); fflush(stdout); while (faFastReadNext(f, &otherSeq.dna, &otherSeq.size, &otherSeq.name)) { aliSeqName = otherSeq.name; oneStrand(patSpace, repeatHash, &otherSeq, FALSE, stringency, out); reverseComplement(otherSeq.dna, otherSeq.size); oneStrand(patSpace, repeatHash, &otherSeq, TRUE, stringency, out); aliSeqName = NULL; } fclose(f); } freePatSpace(&patSpace); endTime = clock1(); printf("Alignment time is %ld sec\n", (endTime-startTime)); startTime = endTime; fclose(out); return 0; }
int main(int argc, char *argv[]) { char *estName, *targetName, *oocName; FILE *estFile; struct dnaSeq *target; struct dnaSeq *est; struct patSpace *ps; struct patClump *clumpList, *clump; int estIx = 0; /* Check command line arguments and assign to local variables. */ if (argc != 4) usage(); estName = argv[1]; estFile = mustOpen(estName, "rb"); targetName = argv[2]; oocName = argv[3]; /* Read in target DNA from fasta files and check not too big. */ fprintf(stderr, "Reading %s\n", targetName); target = faReadAllDna(targetName); if (totalSequenceSize(target) > 8000000) { errAbort("Can only handle 8000000 bases of genomic sequence at once, %s has %d.", targetName, totalSequenceSize(target)); } /* Make a pattern space index structure. */ fprintf(stderr, "Making Pattern Space index\n"); ps = makePatSpace(&target, 1, oocName, 4, 32000); /* Loop through each EST in query list. */ printf("Searching for hits\n\n"); while (faReadNext(estFile, NULL, TRUE, NULL, &est)) { boolean isRc; /* Reverse complemented? */ if (++estIx % 5000 == 0) fprintf(stderr, "Processing EST %d\n", estIx); if (est->size > 20000) { warn("Very large EST sequence %s.\n" "Maybe you mixed up the EST and genomic parameters?", est->name); usage(); } for (isRc = 0; isRc <= 1; ++isRc) /* Search both strands. */ { if (isRc) reverseComplement(est->dna, est->size); clumpList = patSpaceFindOne(ps, est->dna, est->size); /* For each homology clump patSpace finds, do a fuzzyFinder * alignment of it and print the results. */ for (clump = clumpList; clump != NULL; clump = clump->next) { struct ffAli *ali, *a; boolean isRc; int score; struct dnaSeq *t = clump->seq; DNA *tStart = t->dna + clump->start; ali = ffFind(est->dna, est->dna+est->size, tStart, tStart + clump->size, ffCdna); if (ali != NULL) { score = ffScoreCdna(ali); printf("%s hits %s strand %c score %d\n", est->name, t->name, (isRc ? '+' : '-'), score); for (a = ali; a != NULL; a = a->right) { printf(" Q %4d - %4d\t T %4d -%4d\n", a->nStart - est->dna, a->nEnd - est->dna, a->hStart - t->dna, a->hEnd - t->dna); } printf("\n"); ffFreeAli(&ali); } else { printf("Couldn't align clump at %s %d-%d\n", t->name, clump->start, clump->start + clump->size); } } slFreeList(&clumpList); } freeDnaSeq(&est); } /* Clean up time. */ freePatSpace(&ps); freeSeqList(&target); return 0; }