int main(int argc, char *argv[]) /* The program */ { struct psl *pslList = NULL, *psl; struct hash *queryHash, *targetHash; struct lineFile *vulg; aaSeq *querySeqs; struct dnaSeq *targetSeqs; if (argc != 5) usage(); /* Load up everything at beginning */ vulg = lineFileOpen(argv[1], TRUE); querySeqs = dnaLoadAll(argv[2]); targetSeqs = dnaLoadAll(argv[3]); queryHash = seqHash(querySeqs); targetHash = seqHash(targetSeqs); /* Main business */ pslList = vulgarToPsl(vulg, queryHash, targetHash); pslWriteAll(pslList, argv[4], FALSE); /* Free up everything */ freeDnaSeqList(&querySeqs); freeDnaSeqList(&targetSeqs); freeHash(&targetHash); freeHash(&queryHash); pslFreeList(&pslList); lineFileClose(&vulg); return 0; }
void alignNt(char *nt) /* Do alignments of draft bacs against one NT. */ { char indexFileName[512]; char ntFaName[512]; struct lineFile *indexLf; int lineSize; char *line; char *words[3]; int wordCount; struct patSpace *ps; struct dnaSeq *ntSeq; printf("<H1>Check Layout of %s</H1>\n", nt); printf("<PRE>"); sprintf(ntFaName, "%s/p%s.fa", faDir, nt); ntSeq = faReadAllDna(ntFaName); ps = makePatSpace(&ntSeq, 1, oocFile, 10, 500); sprintf(indexFileName, "%s/%s.index", indexDir, nt); uglyf("Checking out %s and %s\n", indexFileName, ntFaName); indexLf = lineFileOpen(indexFileName, TRUE); while (lineFileNext(indexLf, &line, &lineSize)) { wordCount = chopLine(line, words); if (wordCount > 0) { char bacFaName[512]; struct dnaSeq *contigList, *contig; char *bacAcc = words[0]; char *s = strrchr(bacAcc, '.'); if (s != NULL) *s = 0; uglyf("%s\n", bacAcc); sprintf(bacFaName, "%s/%s.fa", faDir, bacAcc); contigList = faReadAllDna(bacFaName); for (contig = contigList; contig != NULL; contig = contig->next) { boolean isRc; uglyf(" %s\n", contig->name); for (isRc = FALSE; isRc <= TRUE; isRc += 1) { struct ssBundle *bunList, *bun; bunList = ssFindBundles(ps, contig, contig->name, ffTight); for (bun = bunList; bun != NULL; bun = bun->next) { showBundle(bun, isRc); } ssBundleFreeList(&bunList); reverseComplement(contig->dna, contig->size); } } freeDnaSeqList(&contigList); } } lineFileClose(&indexLf); freeDnaSeqList(&ntSeq); }
void coordConvRepFree(struct coordConvRep **pEl)
/* Free an individual coordinate conversion report: its message, both
 * coordConv records, the three sequence lists, the three psl lists and
 * finally the report structure itself.  *pEl is set to NULL afterwards so
 * the caller is not left holding a dangling pointer.  A NULL *pEl is a
 * no-op. */
{
    struct coordConvRep *el = *pEl;

    if (el == NULL)
        return;

    freeMem(el->msg);
    coordConvFree(&el->to);
    coordConvFree(&el->from);
    freeDnaSeqList(&el->upSeq);
    freeDnaSeqList(&el->midSeq);
    freeDnaSeqList(&el->downSeq);
    pslFreeList(&el->upPsl);
    pslFreeList(&el->midPsl);
    pslFreeList(&el->downPsl);

    /* Previously only the members were released, so the structure leaked
     * on every call and *pEl kept pointing at a gutted record. */
    freeMem(el);
    *pEl = NULL;
}
static void getCloneDna(struct clone *clone, struct hash *fragHash) /* Read in clone DNA from file in format with one record per * clone contig. Make clone->dna so that it is same as * non-fragmented clone file. */ { struct dnaSeq *seqList = faReadAllDna(clone->faFile), *seq; int fragSize; clone->dna = needLargeMem(clone->size+1); clone->dna[clone->size] = 0; uglyf("GetCloneDna %s\n", clone->faFile); for (seq = seqList; seq != NULL; seq = seq->next) { struct frag *frag = hashFindVal(fragHash, seq->name); if (frag == NULL) errAbort("Couldn't find %s from %s in trans files", seq->name, clone->faFile); assert(frag->end <= clone->size); fragSize = frag->end - frag->start; assert(fragSize >= 0); if (fragSize != seq->size) errAbort("Size mismatch (%d vs %d) between trans and .ffa files on %s", fragSize, seq->size, frag->name); memcpy(clone->dna + frag->start, seq->dna, fragSize); } freeDnaSeqList(&seqList); }
int main(int argc, char *argv[]) /* Process command line. */ { optionHash(&argc, argv); if (argc != 9) usage(); fileCache = newDlList(); maxGap = optionInt("maxGap", maxGap); verboseSetLogFile("stdout"); ss = axtScoreSchemeDefault(); verbose(1,"Reading alignments from %s\n",argv[3]); mrnaHash = readPslToBinKeeper(argv[2], argv[3]); twoBitFile = twoBitOpen(argv[5]); //verbose(1,"Reading alignments from %s\n",argv[]); //pseudoHash = readPslToBinKeeper(argv[3], argv[]); //verbose(1,"Reading mRNA sequences from %s\n",argv[5]); //mrnaList = faReadAllMixed(argv[5]); //if (mrnaList == NULL) //errAbort("could not open %s\n",argv[5]); //faHash = newHash(0); //for (el = mrnaList; el != NULL ; el = el->next) //hashAdd(faHash, el->name, el); verbose(1,"Reading chains from %s\n",argv[6]); chainHash = readChainToBinKeeper(argv[2], argv[6]); outFile = fopen(argv[8],"w"); verbose(1,"Scoring %s\n",argv[1]); checkExp(argv[1], argv[7], argv[4]); fclose(outFile); freeDnaSeqList(&mrnaList); return(0); }
void findCutters(char *gcgFile, char *genome, char *outputFile) /* findCutters - Find REBASE restriction enzymes using their GCG file. */ { struct cutter *cutters = readGcg(gcgFile); struct dnaSeq *seqs = dnaLoadAll(genome); struct slName *whiteList = NULL; if (justThis) whiteList = newSlName(justThis); if (justThese) { struct slName *listFromJustThese = getWhiteListFromFile(); whiteList = slCat(whiteList, listFromJustThese); } if (justThese || justThis) cullCutters(&cutters, TRUE, whiteList, 0); if (countsOnly) findCounts(cutters, seqs, outputFile); else findBeds(cutters, seqs, outputFile); cutterFreeList(&cutters); freeDnaSeqList(&seqs); slNameFreeList(&whiteList); }
int checkOurContig(char *contigDir, struct contig *contig) /* Check files in contigDir. */ { char fileName[512]; struct dnaSeq *seq; int problemCount = 0; /* Check FA file for size. */ sprintf(fileName, "%s/%s.fa", contigDir, contig->name); uglyf("Checking %s %d\n", fileName, contig->size); if (!fileExists(fileName)) { printf("%s doesn't exist\n", fileName); return 1; } seq = faReadAllDna(fileName); if (seq == NULL) { printf("%s has no sequence\n", fileName); return 1; } if (slCount(seq) != 1) { ++problemCount; printf("%s has more than one sequence\n", fileName); } if (seq->size != contig->size) { ++problemCount; printf("%s is %d bases according to NCBI, but %d bases in %s", contig->name, contig->size, seq->size, fileName); } freeDnaSeqList(&seq); return problemCount; }
void writeOverlaps(FILE *f, struct seqOver *so) /* Write out info on overlapping part. */ { struct hash *fragHash = newHash(0); struct dnaSeq *qSeqList, *qSeq; char *queryClone = so->name; struct psl *psl; char qName[256]; char tName[256]; char faHead[512]; qSeqList = readHashDna(queryClone, fragHash); slSort(&so->pslList, pslCmpQuery); for (psl = so->pslList; psl != NULL; psl = psl->next) { qSeq = hashMustFindVal(fragHash, psl->qName); sprintf(qName, "%s.%d.%d.%d", psl->qName, psl->qStart, psl->qEnd, psl->qSize); sprintf(tName, "%s.%d.%d.%d", psl->tName, psl->tStart, psl->tEnd, psl->tSize); sprintf(faHead, "%s %s", qName, tName); faWriteNext(f, faHead, qSeq->dna + psl->qStart, psl->qEnd - psl->qStart); } freeHash(&fragHash); freeDnaSeqList(&qSeqList); }
void secondPass(char *inName, char *outName) /* Do second pass - pair HMM between homologous regions specified in * input. */ { struct lineFile *lf = lineFileOpen(inName, TRUE); char *line; int lineSize; char *words[16]; int wordCount; struct wabaCrude *wcList = NULL, *wc; char qFileName[512]; struct dnaSeq *qSeqList = NULL, *seq; struct hash *tFileHash = newHash(8); struct hash *qSeqHash = NULL; FILE *out = mustOpen(outName, "w"); FILE *dynFile; printf("Second pass (HMM) input %s output %s\n", inName, outName); /* Load up alignments from file and sort. */ while (lineFileNext(lf, &line, &lineSize)) { wordCount = chopLine(line, words); if (wordCount != 10) errAbort("line %d of %s doesn't look like a waba first pass file", lf->lineIx, lf->fileName); wc = wabaCrudeLoad(words); slAddHead(&wcList, wc); } lineFileClose(&lf); slSort(&wcList, wcCmpQposScore); /* Go through alignments one by one, loading DNA as need be. */ qFileName[0] = 0; for (wc = wcList; wc != NULL; wc = wc->next) { struct hashEl *hel; struct dnaSeq *tSeqList, *tSeq, *qSeq; int qSize; DNA *qStart; int tMaxSize = 5000; int tMin, tMax, tMid, tSize; int score; /* Get target sequence. */ hel = hashLookup(tFileHash, wc->tFile); if (hel == NULL) { printf("Loading %s\n", wc->tFile); tSeqList = faReadAllDna(wc->tFile); hel = hashAdd(tFileHash, wc->tFile, tSeqList); } else { tSeqList = hel->val; } tSeq = findSeq(tSeqList, wc->tSeq); /* Get query sequence. */ if (!sameString(qFileName, wc->qFile)) { strcpy(qFileName, wc->qFile); printf("Loading %s\n", wc->qFile); freeDnaSeqList(&qSeqList); qSeqList = faReadAllDna(wc->qFile); freeHash(&qSeqHash); qSeqHash = newHash(0); for (qSeq = qSeqList; qSeq != NULL; qSeq = qSeq->next) hashAddUnique(qSeqHash, qSeq->name, qSeq); } qSeq = hashMustFindVal(qSeqHash, wc->qSeq); /* Do fine alignment. 
*/ qSize = wc->qEnd - wc->qStart; qStart = qSeq->dna + wc->qStart; if (wc->strand < 0) reverseComplement(qStart, qSize); tMid = (wc->tStart + wc->tEnd)/2; tMin = tMid-tMaxSize/2; tMax = tMin + tMaxSize; if (tMin < 0) tMin = 0; if (tMax > tSeq->size) tMax = tSeq->size; printf("Aligning %s %s:%d-%d %c to %s.%s:%d-%d +\n", wc->qFile, qSeq->name, wc->qStart, wc->qEnd, (wc->strand < 0 ? '-' : '+'), wc->tFile, tSeq->name, tMin, tMax); fprintf(out, "Aligning %s %s:%d-%d %c to %s.%s:%d-%d +\n", wc->qFile, qSeq->name, wc->qStart, wc->qEnd, (wc->strand < 0 ? '-' : '+'), wc->tFile, tSeq->name, tMin, tMax); score = xenAlignSmall(qStart, qSize, tSeq->dna + tMin, tMax-tMin, out, FALSE); fprintf(out, "best score %d\n", score); if (wc->strand < 0) reverseComplement(qStart, qSize); } freeDnaSeqList(&qSeqList); hashTraverseVals(tFileHash, htvFreeSeq); wabaCrudeFreeList(&wcList); freeHash(&tFileHash); fclose(out); }
void htvFreeSeq(void *val)
/* Hash value callback (handed to hashTraverseVals): treat the value as
 * the head of a dnaSeq list and free that list. */
{
    struct dnaSeq *listHead = (struct dnaSeq *)val;

    freeDnaSeqList(&listHead);
}
void firstPass(char *aList, char *bList, char *outName) /* Do first pass - find areas of homology between a and b, * save to outName. */ { char *aNameBuf, **aNames; char *bNameBuf, **bNames; int aCount, bCount; struct nt4Seq **bNts, *bNt, *bNtList = NULL; int bNtCount; int i; FILE *out = mustOpen(outName, "w"); /* Read in fa file lists . */ readAllWordsOrFa(aList, &aNames, &aCount, &aNameBuf); readAllWordsOrFa(bList, &bNames, &bCount, &bNameBuf); /* Convert second list to nt4 (packed) format in memory. */ printf("Loading and packing dna in %s\n", bList); for (i=0; i<bCount; ++i) { char *bName = bNames[i]; struct dnaSeq *seqList, *seq; seqList = faReadAllDna(bName); for (seq = seqList; seq != NULL; seq = seq->next) { char uniqName[512]; sprintf(uniqName, "%s@%s", seq->name, bName); bNt = newNt4(seq->dna, seq->size, uniqName); slAddHead(&bNtList, bNt); } freeDnaSeqList(&seqList); } slReverse(&bNtList); bNtCount = slCount(bNtList); AllocArray(bNts, bNtCount); for (i=0, bNt=bNtList; i<bNtCount; ++i, bNt=bNt->next) bNts[i] = bNt; printf("Loaded %d contigs from %d files\n", bNtCount, bCount); /* Align elements of A list one at a time against B list. */ for (i=0; i<aCount; ++i) { char *aName = aNames[i]; struct dnaSeq *seqList, *seq; printf("Aligning %s against %s\n", aName, bList); seqList = faReadAllDna(aName); for (seq = seqList; seq != NULL; seq = seq->next) { doCrude(aName, seq, bNts, bNtCount, out); } printf("\n"); freeDnaSeqList(&seqList); } /* Cleanup time. */ for (i=0; i<bNtCount; ++i) freeNt4(&bNts[i]); freeMem(bNts); freeMem(aNames); freeMem(bNames); freeMem(aNameBuf); freeMem(bNameBuf); fclose(out); }
void blat(char *dbFile, char *queryFile, char *outName)
/* blat - Standalone BLAT fast sequence search command line tool.
 * dbFile and queryFile are each expanded into an array of file names with
 * gfClientFileArray; results are written to outName.  Besides its
 * parameters the function reads a number of settings that are not
 * declared here (tType, qType, makeOoc, repeats, mask, tileSize, ...)
 * and assigns databaseName, databaseSeqCount, databaseLetters and gvo.
 * If makeOoc is set, the function only builds that file and then exits
 * the process. */
{
char **dbFiles, **queryFiles;
int dbCount, queryCount;
struct dnaSeq *dbSeqList, *seq;
struct genoFind *gf;
boolean tIsProt = (tType == gftProt);
boolean qIsProt = (qType == gftProt);
boolean bothSimpleNuc = (tType == gftDna && (qType == gftDna || qType == gftRna));
boolean bothSimpleProt = (tIsProt && qIsProt);
FILE *f = mustOpen(outName, "w");
/* Status messages go to stdout, so they are suppressed when the output
 * stream itself is stdout. */
boolean showStatus = (f != stdout);

databaseName = dbFile;
gfClientFileArray(dbFile, &dbFiles, &dbCount);
if (makeOoc != NULL)
    {
    /* Ooc-building mode: no query is read and no search is run. */
    gfMakeOoc(makeOoc, dbFiles, dbCount, tileSize, repMatch, tType);
    if (showStatus)
        printf("Done making %s\n", makeOoc);
    exit(0);
    }
gfClientFileArray(queryFile, &queryFiles, &queryCount);
dbSeqList = gfClientSeqList(dbCount, dbFiles, tIsProt, tType == gftDnaX, repeats, minRepDivergence, showStatus);
databaseSeqCount = slCount(dbSeqList);
/* Accumulates onto whatever databaseLetters already holds.
 * NOTE(review): presumably zero on entry - confirm at its definition. */
for (seq = dbSeqList; seq != NULL; seq = seq->next)
    databaseLetters += seq->size;

gvo = gfOutputAny(outputFormat, minIdentity*10, qIsProt, tIsProt, noHead, databaseName, databaseSeqCount, databaseLetters, minIdentity, f);

if (bothSimpleNuc || bothSimpleProt)
    {
    struct hash *maskHash = NULL;

    /* Save away masking info for output. */
    if (repeats != NULL)
        {
        maskHash = newHash(0);
        for (seq = dbSeqList; seq != NULL; seq = seq->next)
            {
            Bits *maskedBits = maskFromUpperCaseSeq(seq);
            hashAdd(maskHash, seq->name, maskedBits);
            }
        }

    /* Handle masking and indexing.  If masking is off, we want the indexer
     * to see unmasked sequence, otherwise we want it to see masked.  However
     * after indexing we always want it unmasked, because things are always
     * unmasked for the extension phase. */
    if (mask == NULL && !bothSimpleProt)
        gfClientUnmask(dbSeqList);
    gf = gfIndexSeq(dbSeqList, minMatch, maxGap, tileSize, repMatch, ooc, tIsProt, oneOff, FALSE, stepSize);
    if (mask != NULL)
        gfClientUnmask(dbSeqList);

    searchOneIndex(queryCount, queryFiles, gf, outName, tIsProt, maskHash, f, showStatus);
    /* NOTE(review): only the hash is released here; whether the Bits
     * values stored in it are freed as well is not visible from this
     * function - confirm. */
    freeHash(&maskHash);
    }
else if (tType == gftDnaX && qType == gftProt)
    {
    /* Translated target, protein query. */
    bigBlat(dbSeqList, queryCount, queryFiles, outName, FALSE, TRUE, f, showStatus);
    }
else if (tType == gftDnaX && (qType == gftDnaX || qType == gftRnaX))
    {
    /* Translated target and translated query. */
    bigBlat(dbSeqList, queryCount, queryFiles, outName, TRUE, qType == gftDnaX, f, showStatus);
    }
else
    {
    errAbort("Unrecognized combination of target and query types\n");
    }
/* Finish the line of progress dots, if any were requested. */
if (dotEvery > 0)
    printf("\n");
/* NOTE(review): f, gf, gvo, dbFiles and queryFiles are not released in
 * this function; possibly left to process exit - confirm. */
freeDnaSeqList(&dbSeqList);
}
void gfAlignTransTrans(int *pConn, char *tSeqDir, struct dnaSeq *qSeq, boolean qIsRc, int minMatch, struct hash *tFileCache, struct gfOutput *out, boolean isRna)
/* Search indexed translated genome on server with a DNA sequence.  Translate
 * this sequence in three frames.  Load homologous bits of genome locally
 * and do detailed alignment.  Each alignment that is found is passed to
 * saveAlignments together with 'out'.
 *   pConn       - connection to the server; it is closed right after the
 *                 query and *pConn is set to -1
 *   tSeqDir, tFileCache - handed to loadHashT3Ranges for loading target DNA
 *   qSeq        - query sequence
 *   qIsRc       - passed through to saveAlignments
 *   minMatch    - passed to ssStitch and saveAlignments
 *   isRna       - selects ffCdna stringency instead of ffLoose */
{
struct gfClump *clumps[2][3][3], *clump;    /* indexed [tIsRc][qFrame][tFrame] */
char targetName[PATH_LEN];
int qFrame, tFrame, tIsRc;
struct gfSeqSource *ssList = NULL, *ss;
struct lm *lm = lmInit(0);
int tileSize;
struct gfRange *rangeList = NULL, *rl, *range;
struct trans3 *qTrans = trans3New(qSeq), *t3;
struct slRef *t3RefList = NULL, *t3Ref;
struct hash *t3Hash = NULL;
struct dnaSeq *tSeqList = NULL;
enum ffStringency stringency = (isRna ? ffCdna : ffLoose);

/* Query server for clumps. */
gfQuerySeqTransTrans(*pConn, qSeq, clumps, lm, &ssList, &tileSize);
close(*pConn);
*pConn = -1;

/* Both target strands are handled the same way, one after the other. */
for (tIsRc=0; tIsRc <= 1; ++tIsRc)
    {
    /* Figure out which ranges need to be loaded and load them. */
    for (qFrame = 0; qFrame < 3; ++qFrame)
        {
        for (tFrame = 0; tFrame < 3; ++tFrame)
            {
            rl = seqClumpToRangeList(clumps[tIsRc][qFrame][tFrame], tFrame);
            rangeList = slCat(rangeList, rl);
            }
        }
    /* NOTE(review): presumably converts amino acid coordinates to
     * nucleotide coordinates - confirm in rangeCoorTimes3. */
    rangeCoorTimes3(rangeList);
    slSort(&rangeList, gfRangeCmpTarget);
    rangeList = gfRangesBundle(rangeList, ffIntronMax);
    loadHashT3Ranges(rangeList, tSeqDir, tFileCache, qSeq->size/3, tIsRc, &t3Hash, &tSeqList, &t3RefList);

    /* The old range list was not very precise - it was just to get
     * the DNA loaded. */
    gfRangeFreeList(&rangeList);

    /* Patch up clump list and associated sequence source to refer
     * to bits of genome loaded into memory.  Create new range list
     * by extending hits in clumps. */
    for (qFrame = 0; qFrame < 3; ++qFrame)
        {
        for (tFrame = 0; tFrame < 3; ++tFrame)
            {
            for (clump = clumps[tIsRc][qFrame][tFrame]; clump != NULL; clump = clump->next)
                {
                /* This ss shadows the function-level ss, which is only
                 * used in the final cleanup. */
                struct gfSeqSource *ss = clump->target;
                struct gfRange *rangeSet = NULL;
                t3 = trans3Find(t3Hash, clumpTargetName(clump), clump->tStart*3, clump->tEnd*3);
                ss->seq = t3->trans[tFrame];
                ss->start = t3->start/3;
                ss->end = t3->end/3;
                clumpToHspRange(clump, qTrans->trans[qFrame], tileSize, tFrame, t3, &rangeSet, TRUE, FALSE);
                untranslateRangeList(rangeSet, qFrame, tFrame, NULL, t3, t3->start);
                /* Ranges of this clump go in front of those collected so
                 * far; the list is reversed and sorted once complete. */
                rangeList = slCat(rangeSet, rangeList);
                }
            }
        }
    slReverse(&rangeList);
    slSort(&rangeList, gfRangeCmpTarget);
    rangeList = gfRangesBundle(rangeList, ffIntronMax);

    /* Stitch and save one bundle per bundled range. */
    for (range = rangeList; range != NULL; range = range->next)
        {
        struct dnaSeq *targetSeq = range->tSeq;
        struct ssBundle *bun;
        AllocVar(bun);
        bun->qSeq = qSeq;
        bun->genoSeq = targetSeq;
        bun->ffList = gfRangesToFfItem(range->components, qSeq);
        ssStitch(bun, stringency, minMatch, ssAliCount);
        getTargetName(range->tName, out->includeTargetFile, targetName);
        t3 = range->t3;
        saveAlignments(targetName, t3->nibSize, t3->start, bun, NULL, qIsRc, tIsRc, stringency, minMatch, out);
        ssBundleFree(&bun);
        }

    /* Cleanup for this strand of database. */
    gfRangeFreeList(&rangeList);
    freeHash(&t3Hash);
    for (t3Ref = t3RefList; t3Ref != NULL; t3Ref = t3Ref->next)
        {
        struct trans3 *t3 = t3Ref->val;
        trans3Free(&t3);
        }
    slFreeList(&t3RefList);
    freeDnaSeqList(&tSeqList);
    }
/* Final cleanup.
 * NOTE(review): unlike gfAlignTrans, the clump lists are not passed to
 * gfClumpFreeList here.  Whether that leaks depends on how
 * gfQuerySeqTransTrans allocates them (they may live in lm) - confirm. */
trans3Free(&qTrans);
for (ss = ssList; ss != NULL; ss = ss->next)
    freeMem(ss->fileName);
slFreeList(&ssList);
lmCleanup(&lm);
}
void gfAlignTrans(int *pConn, char *tSeqDir, aaSeq *seq, int minMatch, struct hash *tFileCache, struct gfOutput *out)
/* Search indexed translated genome on server with an amino acid sequence.
 * Then load homologous bits of genome locally and do detailed alignment.
 * Each alignment that is found is passed to saveAlignments together with
 * 'out'.
 *   pConn       - connection to the server; it is closed right after the
 *                 query and *pConn is set to -1
 *   tSeqDir, tFileCache - handed to loadHashT3Ranges for loading target DNA
 *   seq         - amino acid query sequence
 *   minMatch    - passed to ssStitch and saveAlignments */
{
struct ssBundle *bun;
struct gfClump *clumps[2][3], *clump;    /* indexed [isRc][frame] */
struct gfRange *rangeList = NULL, *range, *rl;
struct dnaSeq *targetSeq, *tSeqList = NULL;
char targetName[PATH_LEN];
int tileSize;
int frame, isRc = 0;
struct hash *t3Hash = NULL;
struct slRef *t3RefList = NULL, *ref;
struct gfSeqSource *ssList = NULL, *ss;
struct trans3 *t3;
struct lm *lm = lmInit(0);

/* Get clumps from server. */
gfQuerySeqTrans(*pConn, seq, clumps, lm, &ssList, &tileSize);
close(*pConn);
*pConn = -1;

/* Both target strands are handled the same way, one after the other. */
for (isRc = 0; isRc <= 1; ++isRc)
    {
    /* Figure out which parts of sequence we need to load. */
    for (frame = 0; frame < 3; ++frame)
        {
        rl = seqClumpToRangeList(clumps[isRc][frame], frame);
        rangeList = slCat(rangeList, rl);
        }
    /* Convert from amino acid to nucleotide coordinates. */
    rangeCoorTimes3(rangeList);
    slSort(&rangeList, gfRangeCmpTarget);
    rangeList = gfRangesBundle(rangeList, ffIntronMax);
    loadHashT3Ranges(rangeList, tSeqDir, tFileCache, seq->size, isRc, &t3Hash, &tSeqList, &t3RefList);

    /* The old range list was not very precise - it was just to get
     * the DNA loaded. */
    gfRangeFreeList(&rangeList);

    /* Patch up clump list and associated sequence source to refer
     * to bits of genome loaded into memory.  Create new range list
     * by extending hits in clumps. */
    for (frame = 0; frame < 3; ++frame)
        {
        for (clump = clumps[isRc][frame]; clump != NULL; clump = clump->next)
            {
            /* This ss shadows the function-level ss, which is only used
             * in the final cleanup. */
            struct gfSeqSource *ss = clump->target;
            t3 = trans3Find(t3Hash, clumpTargetName(clump), clump->tStart*3, clump->tEnd*3);
            ss->seq = t3->trans[frame];
            ss->start = t3->start/3;
            ss->end = t3->end/3;
            clumpToHspRange(clump, seq, tileSize, frame, t3, &rangeList, TRUE, FALSE);
            }
        }
    slReverse(&rangeList);
    slSort(&rangeList, gfRangeCmpTarget);
    /* Bundling distance is ffIntronMax/3 here, against ffIntronMax in the
     * load step above.
     * NOTE(review): presumably because these ranges are still in amino
     * acid coordinates - confirm. */
    rangeList = gfRangesBundle(rangeList, ffIntronMax/3);

    /* Do detailed alignment of each of the clustered ranges. */
    for (range = rangeList; range != NULL; range = range->next)
        {
        targetSeq = range->tSeq;
        AllocVar(bun);
        bun->qSeq = seq;
        bun->genoSeq = targetSeq;
        bun->ffList = gfRangesToFfItem(range->components, seq);
        bun->isProt = TRUE;
        t3 = hashMustFindVal(t3Hash, range->tName);
        bun->t3List = t3;
        ssStitch(bun, ffCdna, minMatch, ssAliCount);
        getTargetName(range->tName, out->includeTargetFile, targetName);
        saveAlignments(targetName, t3->nibSize, 0, bun, t3Hash, FALSE, isRc, ffCdna, minMatch, out);
        ssBundleFree(&bun);
        }

    /* Cleanup for this strand of database. */
    gfRangeFreeList(&rangeList);
    freeHash(&t3Hash);
    for (ref = t3RefList; ref != NULL; ref = ref->next)
        {
        struct trans3 *t3 = ref->val;
        trans3Free(&t3);
        }
    slFreeList(&t3RefList);
    freeDnaSeqList(&tSeqList);
    }

/* Final cleanup: the clump lists of both strands and all frames, the file
 * names held by the sequence sources, the source list itself and the
 * local memory pool. */
for (isRc=0; isRc<=1; ++isRc)
    for (frame=0; frame<3; ++frame)
        gfClumpFreeList(&clumps[isRc][frame]);
for (ss = ssList; ss != NULL; ss = ss->next)
    freeMem(ss->fileName);
slFreeList(&ssList);
lmCleanup(&lm);
}