struct dnaSeq *twoBitAndBedToSeq(struct twoBitFile *tbf, struct bed *bed) /* Get sequence defined by bed. Exclude introns. */ { struct dnaSeq *seq; if (bed->blockCount <= 1) { seq = twoBitReadSeqFrag(tbf, bed->chrom, bed->chromStart, bed->chromEnd); freeMem(seq->name); seq->name = cloneString(bed->name); } else { int totalBlockSize = bedTotalBlockSize(bed); AllocVar(seq); seq->name = cloneString(bed->name); seq->dna = needMem(totalBlockSize+1); seq->size = totalBlockSize; int i; int seqOffset = 0; for (i=0; i<bed->blockCount; ++i) { int exonSize = bed->blockSizes[i]; int exonStart = bed->chromStart + bed->chromStarts[i]; struct dnaSeq *exon = twoBitReadSeqFrag(tbf, bed->chrom, exonStart, exonStart+exonSize); memcpy(seq->dna + seqOffset, exon->dna, exonSize); seqOffset += exonSize; dnaSeqFree(&exon); } } if (bed->strand[0] == '-') reverseComplement(seq->dna, seq->size); return seq; }
void readCachedSeqPart(char *seqName, int start, int size, boolean getMasked, struct hash *hash, struct dlList *fileCache, struct dnaSeq **retSeq, int *retOffset, boolean *retIsNib)
/* Read sequence, going through the file cache.  For 2bit and nib sources
 * only the requested start/size part is read: *retOffset is set to start
 * and *retIsNib to TRUE.  For fasta sources the sequence is read via
 * readSeqFromFaPos, *retOffset is 0 and *retIsNib is FALSE; asking for
 * masking on a fasta source is a fatal error. */
{
struct seqFilePos *pos = hashMustFindVal(hash, seqName);
FILE *cached = openFromCache(fileCache, pos);

if (pos->isTwoBit)
    {
    /* NOTE(review): the cached handle is reinterpreted as a twoBitFile
     * here - presumably openFromCache hands back the 2bit handle for
     * such entries; confirm against openFromCache. */
    struct twoBitFile *tbf = (struct twoBitFile *)cached;
    *retSeq = twoBitReadSeqFrag(tbf, seqName, start, start + size);
    *retOffset = start;
    *retIsNib = TRUE;
    return;
    }

if (pos->isNib)
    {
    int maskOptions = 0;
    if (getMasked)
        maskOptions = NIB_MASK_MIXED;
    *retSeq = nibLdPartMasked(maskOptions, pos->file, cached, pos->pos, start, size);
    *retOffset = start;
    *retIsNib = TRUE;
    return;
    }

/* Anything else is treated as fasta. */
if (getMasked)
    errAbort("masked sequences not supported with fasta files");
*retSeq = readSeqFromFaPos(pos, cached);
*retOffset = 0;
*retIsNib = FALSE;
}
void readCachedSeqPart(char *seqName, int start, int size, struct hash *hash, struct dlList *fileCache, struct dnaSeq **retSeq, int *retOffset, boolean *retIsPartial)
/* Read sequence, going through the file cache.  Nib and 2bit sources
 * return only the start/size part (mixed-case masked for nib), with
 * *retOffset = start and *retIsPartial = TRUE.  Fasta sources are read
 * via readSeqFromFaPos, with *retOffset = 0 and *retIsPartial = FALSE. */
{
struct seqFilePos *pos = hashMustFindVal(hash, seqName);
FILE *cached = openFromCache(fileCache, pos->file);
boolean gotPart = TRUE;
int offset = start;
struct dnaSeq *seq;

if (pos->isNib)
    seq = nibLdPartMasked(NIB_MASK_MIXED, pos->file, cached, pos->pos, start, size);
else if (pos->isTwoBit)
    seq = twoBitReadSeqFrag(pos->tbf, seqName, start, start+size);
else
    {
    seq = readSeqFromFaPos(pos, cached);
    offset = 0;
    gotPart = FALSE;
    }

*retSeq = seq;
*retOffset = offset;
*retIsPartial = gotPart;
}
void outputOne(struct twoBitFile *tbf, char *seqSpec, FILE *f, int start, int end)
/* Fetch the start..end part of seqSpec from tbf and write it to f as one
 * fasta record, using the name the fragment came back with.  When the
 * noMask flag is on, the bases are upper-cased before writing. */
{
struct dnaSeq *frag = twoBitReadSeqFrag(tbf, seqSpec, start, end);
char *bases = frag->dna;
int baseCount = frag->size;

if (noMask)
    toUpperN(bases, baseCount);
faWriteNext(f, frag->name, bases, baseCount);
dnaSeqFree(&frag);
}
struct dnaSeq *nibTwoCacheSeq(struct nibTwoCache *ntc, char *seqName)
/* Return the complete sequence called seqName, read either from the
 * cache's 2bit file or from the matching nib file.  Repeats come back in
 * lower case. */
{
struct nibInfo *info;

if (ntc->isTwoBit)
    {
    /* A 0,0 range asks twoBitReadSeqFrag for the entire sequence. */
    return twoBitReadSeqFrag(ntc->tbf, seqName, 0, 0);
    }

info = nibInfoFromCache(ntc->nibHash, ntc->pathName, seqName);
return nibLdPart(info->fileName, info->f, info->size, 0, info->size);
}
struct dnaSeq *dnaLoadSingle(char *fileName, int *retStart, int *retEnd, int *retParentSize) /* Return sequence if it's a nib file or 2bit part, NULL otherwise. */ { struct dnaSeq *seq = NULL; unsigned start = 0, end = 0; int parentSize = 0; if (nibIsFile(fileName)) { /* Save offset out of fileName for auto-lifting */ char filePath[PATH_LEN]; char name[PATH_LEN]; nibParseName(0, fileName, filePath, name, &start, &end); if (end != 0) /* It's just a range. */ { FILE *f; int size; nibOpenVerify(filePath, &f, &size); parentSize = size; } seq = nibLoadAllMasked(NIB_MASK_MIXED, fileName); if (end == 0) parentSize = end = seq->size; freez(&seq->name); seq->name = cloneString(name); } else if (twoBitIsRange(fileName)) { /* Save offset out of fileName for auto-lifting */ char *rangeSpec = cloneString(fileName); int start, end; char *file, *seqName; twoBitParseRange(rangeSpec, &file, &seqName, &start, &end); /* Load sequence. */ { struct twoBitFile *tbf = twoBitOpen(file); parentSize = twoBitSeqSize(tbf, seqName); seq = twoBitReadSeqFrag(tbf, seqName, start, end); twoBitClose(&tbf); } if (end == 0) end = seq->size; freez(&rangeSpec); } if (retStart != NULL) *retStart = start; if (retEnd != NULL) *retEnd = end; if (retParentSize != NULL) *retParentSize = parentSize; return seq; }
void doFetch(char *inputFileName, char *sequenceFileName, char *outputFileName) /* lookup sequence for each line */ { struct lineFile *lf = NULL; char *line; char *row[6]; int elementCount; struct twoBitFile *tbf; char *fileChrom = NULL; int start = 0; int end = 0; char *name = NULL; int score = 0; char *strand = NULL; struct dnaSeq *chunk = NULL; FILE *outputFileHandle = mustOpen(outputFileName, "w"); tbf = twoBitOpen(sequenceFileName); lf = lineFileOpen(inputFileName, TRUE); while (lineFileNext(lf, &line, NULL)) { elementCount = chopString(line, "\t", row, ArraySize(row)); if (elementCount != 6) continue; fileChrom = cloneString(row[0]); start = sqlUnsigned(row[1]); end = sqlUnsigned(row[2]); name = cloneString(row[3]); score = sqlUnsigned(row[4]); strand = cloneString(row[5]); if (start == end) continue; assert (end > start); chunk = twoBitReadSeqFrag(tbf, fileChrom, start, end); touppers(chunk->dna); if (sameString(strand, "-")) reverseComplement(chunk->dna, chunk->size); fprintf(outputFileHandle, "%s\t%d\t%d\t%s\t%d\t%s\t%s\n", fileChrom, start, end, name, score, strand, chunk->dna); dnaSeqFree(&chunk); } lineFileClose(&lf); carefulClose(&outputFileHandle); }
struct dnaSeq *nibTwoLoadOne(char *pathName, char *seqName)
/* Load all of seqName.  pathName is either a .2bit file or a directory
 * holding one <seqName>.nib per sequence.  Repeats are returned in
 * lower case. */
{
struct twoBitFile *tbf;
struct dnaSeq *loaded;
char nibPath[512];

if (!twoBitIsFile(pathName))
    {
    /* Directory of nibs: build the path to this sequence's nib. */
    safef(nibPath, sizeof(nibPath), "%s/%s.nib", pathName, seqName);
    return nibLoadAllMasked(NIB_MASK_MIXED, nibPath);
    }

tbf = twoBitOpen(pathName);
loaded = twoBitReadSeqFrag(tbf, seqName, 0, 0);
twoBitClose(&tbf);
return loaded;
}
struct dnaSeq *readFromCache(struct dlList *cache, char *dirName, char *seqName, int start, int size, int seqSize, boolean isTwoBit)
/* Read size bases beginning at start of seqName, using a cached open nib
 * or 2bit file.  Only the file is cached, not the sequence, so the caller
 * must dnaSeqFree the result.  For nib files seqSize (the size recorded
 * in the .lav file) must match the nib's size or this aborts; a request
 * running past the end of the nib is reported on stdout and then
 * attempted anyway. */
{
struct cachedSeqFile *entry = openFromCache(cache, dirName, seqName, isTwoBit);
int end = start + size;

if (isTwoBit)
    return twoBitReadSeqFrag(entry->tbf, seqName, start, end);

if (entry->size != seqSize)
    errAbort("%s/%s is %d bases in .lav file and %d in .nib file\n", dirName, seqName, seqSize, entry->size);
if (end > entry->size)
    {
    printf("%s/%s is %d bases in .lav file and %d in .nib file start %d size %d end %d\n", dirName, seqName, seqSize, entry->size, start, size, end);
    }
return nibLdPartMasked(NIB_MASK_MIXED, entry->fileName, entry->f, entry->size, start, size);
}
void seqFromPsl(char *inPsl, char *inTwoBit, char *outFa) /* seqFromPsl - Extract masked sequence from database corresponding to psl file. */ { struct twoBitFile *tbf = twoBitOpen(inTwoBit); struct lineFile *lf = pslFileOpen(inPsl); FILE *f = mustOpen(outFa, "w"); struct psl *psl; while ((psl = pslNext(lf)) != NULL) { char faHead[512]; struct dnaSeq *seq = twoBitReadSeqFrag(tbf, psl->tName, psl->tStart, psl->tEnd); if (psl->strand[0] == '-') reverseComplement(seq->dna, seq->size); safef(faHead, sizeof(faHead), "%s (%s:%d-%d)", psl->qName, psl->tName, psl->tStart+1, psl->tEnd); if (hardMask) lowerToN(seq->dna, seq->size); faWriteNext(f, faHead, seq->dna, seq->size); } carefulClose(&f); }
void loadIfNewSeq(char *seqPath, boolean isTwoBit, char *newName, char strand, char **pName, struct dnaSeq **pSeq, char *pStrand)
/* Make *pSeq hold sequence newName on the requested strand.
 * If *pName already equals newName nothing is loaded; the sequence in
 * *pSeq is just reverse-complemented in place when the strand differs
 * from *pStrand.  Otherwise the old *pSeq is freed and the sequence is
 * loaded from the 2bit file seqPath, or from seqPath/newName.nib, then
 * reverse-complemented if strand is '-'.  *pName and *pStrand are updated
 * to match; *pName is set to the newName pointer itself, not a copy. */
{
struct dnaSeq *seq;

if (sameString(newName, *pName))
    {
    /* Same sequence as last time: at most a strand flip is needed. */
    if (*pStrand == strand)
        return;
    seq = *pSeq;
    reverseComplement(seq->dna, seq->size);
    *pStrand = strand;
    return;
    }

freeDnaSeq(pSeq);
if (!isTwoBit)
    {
    char nibPath[512];
    snprintf(nibPath, sizeof(nibPath), "%s/%s.nib", seqPath, newName);
    seq = nibLoadAllMasked(NIB_MASK_MIXED, nibPath);
    *pSeq = seq;
    verbose(1, "Loaded %d bases in %s\n", seq->size, nibPath);
    }
else
    {
    struct twoBitFile *tbf = twoBitOpenCached(seqPath);
    seq = twoBitReadSeqFrag(tbf, newName, 0, 0);
    *pSeq = seq;
    verbose(1, "Loaded %d bases of %s from %s\n", seq->size, newName, seqPath);
    }
*pName = newName;
*pStrand = strand;
if (strand == '-')
    reverseComplement(seq->dna, seq->size);
}
void fillHoles(struct mafAli *mafList, struct subSpecies *speciesList, struct twoBitFile *twoBit) { int lastEnd = 100000000; struct mafAli *prevMaf = NULL, *maf, *nextMaf; struct subSpecies *species; /* for(species = speciesList; species; species = species->next) { blockStatus = &species->blockStatus; blockStatus->mc->rightStatus = MAF_NEW_STATUS; blockStatus->mc->rightLen = 0; } */ for(maf = mafList; maf ; prevMaf = maf, maf = nextMaf) { struct mafComp *mc = NULL, *masterMc, *lastMc = NULL; struct mafAli *newMaf = NULL; struct blockStatus *blockStatus; nextMaf = maf->next; masterMc=maf->components; if (masterMc->start > lastEnd) { struct subSpecies *species; for(species = speciesList; species; species = species->next) { mc = NULL; // printf("looking at %s\n",species->name); blockStatus = &species->blockStatus; if (blockStatus->mc) { // printf("should match at %s\n",blockStatus->mc->src); switch (blockStatus->mc->rightStatus) { case MAF_MISSING_STATUS: //printf("missing right\n"); case MAF_NEW_NESTED_STATUS: case MAF_MAYBE_NEW_NESTED_STATUS: case MAF_CONTIG_STATUS: case MAF_TANDEM_STATUS: case MAF_INSERT_STATUS: AllocVar(mc); mc->rightStatus = mc->leftStatus = blockStatus->mc->rightStatus; mc->rightLen = mc->leftLen = blockStatus->mc->rightLen; mc->src = blockStatus->mc->src; mc->srcSize = blockStatus->mc->srcSize; mc->strand = blockStatus->mc->strand; mc->start = blockStatus->mc->start + blockStatus->mc->size; if (lastMc == NULL) { struct mafComp *miniMasterMc = NULL; char *seqName; struct dnaSeq *seq; AllocVar(miniMasterMc); miniMasterMc->next = mc; miniMasterMc->strand = '+'; miniMasterMc->srcSize = masterMc->srcSize; miniMasterMc->src = masterMc->src; miniMasterMc->start = lastEnd; miniMasterMc->size = masterMc->start - lastEnd; if ((seqName = strchr(miniMasterMc->src, '.')) != NULL) seqName++; else seqName = miniMasterMc->src; // printf("hole filled from %d to %d\n",lastEnd, masterMc->start); seq = twoBitReadSeqFrag(twoBit, seqName, lastEnd, masterMc->start); 
miniMasterMc->text = seq->dna; AllocVar(newMaf); newMaf->textSize = maf->textSize; newMaf->components = miniMasterMc; newMaf->next = maf; if (prevMaf) prevMaf->next = newMaf; else mafList = newMaf; //masterMc = miniMasterMc; } else { lastMc->next = mc; } lastMc = mc; if (blockStatus->mc->rightStatus == MAF_MISSING_STATUS) { if (addN) { char buffer[256]; safef(buffer, sizeof(buffer), "%s.N",species->name); mc->src = cloneString(buffer); mc->start = 0; mc->srcSize = 200000; mc->size = masterMc->start - lastEnd; mc->text = needMem(mc->size + 1); memset(mc->text, 'N', mc->size); } } else { if (addDash) { mc->size = masterMc->size; mc->srcSize = blockStatus->mc->srcSize; mc->text = needMem(mc->size + 1); memset(mc->text, '-', mc->size); mc->text[mc->size] = 0; if (mc->size == 0) errAbort("bad dash add"); mc->size = 0; } } break; } } } } lastEnd = masterMc->start + masterMc->size; for(lastMc = masterMc; lastMc->next; lastMc = lastMc->next) ; for(species = speciesList; species; species = species->next) { blockStatus = &species->blockStatus; mc = NULL; if ((blockStatus->masterStart <= masterMc->start) && (blockStatus->masterEnd > masterMc->start) && ((mc = mafMayFindCompPrefix(maf, species->name,NULL)) == NULL)) { if (blockStatus->mc != NULL) { switch (blockStatus->mc->rightStatus) { case MAF_MISSING_STATUS: case MAF_CONTIG_STATUS: case MAF_TANDEM_STATUS: case MAF_INSERT_STATUS: case MAF_NEW_NESTED_STATUS: case MAF_MAYBE_NEW_NESTED_STATUS: AllocVar(mc); mc->rightStatus = mc->leftStatus = blockStatus->mc->rightStatus; if (mc->rightStatus == MAF_NEW_NESTED_STATUS) mc->rightStatus = MAF_INSERT_STATUS; if (mc->leftStatus == MAF_NEW_NESTED_STATUS) mc->leftStatus = MAF_INSERT_STATUS; mc->rightLen = mc->leftLen = blockStatus->mc->rightLen; mc->src = blockStatus->mc->src; mc->strand = blockStatus->mc->strand; mc->srcSize = blockStatus->mc->srcSize; mc->start = blockStatus->mc->start + blockStatus->mc->size ; lastMc->next = mc; lastMc = mc; if (addN && (blockStatus->mc->rightStatus 
== MAF_MISSING_STATUS)) { char buffer[256]; safef(buffer, sizeof(buffer), "%s.N",species->name); mc->src = cloneString(buffer); mc->start = 0; mc->srcSize = 200000; mc->size = maf->textSize; mc->text = needMem(mc->size + 1); memset(mc->text, 'N', mc->size); } else if (addDash) { mc->size = masterMc->size; mc->text = needMem(mc->size + 1); if (mc->size == 0) errAbort("bad dash add"); memset(mc->text, '-', mc->size); mc->text[mc->size] = 0; mc->size = 0; } break; default: break; } } } if (mc) { blockStatus->mc = mc; } } } }
void searchOneIndex(int fileCount, char *files[], struct genoFind *gf, char *outName, boolean isProt, struct hash *maskHash, FILE *outFile, boolean showStatus)
/* Search all sequences in all files against single genoFind index.
 * Each entry of files[] may be a nib file, a 2bit spec (whole file or a
 * list of sequences/ranges), or otherwise a fasta file.  Nib and 2bit
 * inputs are rejected when isProt is set.  Every sequence is passed to
 * searchOneMaskTrim, which accumulates the base and sequence totals.
 * outFile receives the output header first and is closed before
 * returning; with showStatus the totals are printed to stdout.
 * outName is not used here. */
{
int i;
char *fileName;
int count = 0;
long long totalSize = 0;

gfOutputHead(gvo, outFile);
for (i=0; i<fileCount; ++i)
    {
    fileName = files[i];
    if (nibIsFile(fileName))
        {
        struct dnaSeq *seq;

        if (isProt)
            errAbort("%s: Can't use .nib files with -prot or d=prot option\n", fileName);
        seq = nibLoadAllMasked(NIB_MASK_MIXED, fileName);
        /* Report hits against the file name rather than the nib's own
         * sequence name. */
        freez(&seq->name);
        seq->name = cloneString(fileName);
        searchOneMaskTrim(seq, isProt, gf, outFile, maskHash, &totalSize, &count);
        freeDnaSeq(&seq);
        }
    else if (twoBitIsSpec(fileName))
        {
        struct twoBitSpec *tbs = twoBitSpecNew(fileName);
        struct twoBitFile *tbf = twoBitOpen(tbs->fileName);

        if (isProt)
            errAbort("%s is a two bit file, which doesn't work for proteins.", fileName);
        if (tbs->seqs != NULL)
            {
            /* Spec lists particular sequences/ranges: search just those. */
            struct twoBitSeqSpec *ss = NULL;
            for (ss = tbs->seqs; ss != NULL; ss = ss->next)
                {
                struct dnaSeq *seq = twoBitReadSeqFrag(tbf, ss->name, ss->start, ss->end);
                searchOneMaskTrim(seq, isProt, gf, outFile, maskHash, &totalSize, &count);
                dnaSeqFree(&seq);
                }
            }
        else
            {
            /* No sequences named: search everything in the 2bit index. */
            struct twoBitIndex *index = NULL;
            for (index = tbf->indexList; index != NULL; index = index->next)
                {
                struct dnaSeq *seq = twoBitReadSeqFrag(tbf, index->name, 0, 0);
                searchOneMaskTrim(seq, isProt, gf, outFile, maskHash, &totalSize, &count);
                dnaSeqFree(&seq);
                }
            }
        twoBitClose(&tbf);
        /* The parsed spec was previously leaked once per 2bit input. */
        twoBitSpecFree(&tbs);
        }
    else
        {
        /* The fields of this static dnaSeq are filled in by
         * faMixedSpeedReadNext on every record and are not freed here. */
        static struct dnaSeq seq;
        struct lineFile *lf = lineFileOpen(fileName, TRUE);
        while (faMixedSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
            {
            searchOneMaskTrim(&seq, isProt, gf, outFile, maskHash, &totalSize, &count);
            }
        lineFileClose(&lf);
        }
    }
carefulClose(&outFile);
if (showStatus)
    printf("Searched %lld bases in %d sequences\n", totalSize, count);
}
static struct dnaSeq *dnaLoadNextFromStack(struct dnaLoad *dl) /* Load next piece of DNA from stack of files. Return NULL * when stack is empty. */ { struct dnaLoadStack *dls; struct dnaSeq *seq = NULL; while ((dls = dl->stack) != NULL) { if (dls->twoBit) { if (dls->tbi != NULL) { seq = twoBitReadSeqFrag(dls->twoBit, dls->tbi->name, 0, 0); dls->tbi = dls->tbi->next; return seq; } else { dl->stack = dls->next; dnaLoadStackFree(&dls); } } else if (dls->textIsFa) { DNA *dna; char *name; int size; if (faMixedSpeedReadNext(dls->textFile, &dna, &size, &name)) { AllocVar(seq); seq->dna = needLargeMem(size+1); memcpy((void *)seq->dna, (void *)dna, size); seq->dna[size] = 0; seq->size = size; seq->name = cloneString(name); dl->curStart = 0; dl->curEnd = size; dl->curSize = size; return seq; } else { dl->stack = dls->next; dnaLoadStackFree(&dls); } } else /* It's a file full of file names. */ { char *line; if (lineFileNextReal(dls->textFile, &line)) { line = trimSpaces(line); if ((seq = dnaLoadSingle(line, &dl->curStart, &dl->curEnd, &dl->curSize)) != NULL) return seq; else { struct dnaLoadStack *newDls; newDls = dnaLoadStackNew(line); slAddHead(&dl->stack, newDls); } } else { dl->stack = dls->next; dnaLoadStackFree(&dls); } } } dl->finished = TRUE; return NULL; }
void checkExp(char *bedFileName, char *tNibDir, char *nibList) { struct lineFile *bf = lineFileOpen(bedFileName , TRUE), *af = NULL; char *row[PSEUDOGENELINK_NUM_COLS] ; struct pseudoGeneLink *ps; char *tmpName[512], cmd[512]; struct axt *axtList = NULL, *axt, *mAxt = NULL; struct dnaSeq *qSeq = NULL, *tSeq = NULL, *seqList = NULL; struct nibInfo *qNib = NULL, *tNib = NULL; FILE *op; int ret; if (nibHash == NULL) nibHash = hashNew(0); while (lineFileNextRow(bf, row, ArraySize(row))) { struct misMatch *misMatchList = NULL; struct binKeeper *bk = NULL; struct binElement *el, *elist = NULL; struct psl *mPsl = NULL, *rPsl = NULL, *pPsl = NULL, *psl ; struct misMatch *mf = NULL; ps = pseudoGeneLinkLoad(row); tmpName[0] = cloneString(ps->name); chopByChar(tmpName[0], '.', tmpName, sizeof(tmpName)); verbose(2,"name %s %s:%d-%d\n", ps->name, ps->chrom, ps->chromStart,ps->chromEnd); /* get expressed retro from hash */ bk = hashFindVal(mrnaHash, ps->chrom); elist = binKeeperFindSorted(bk, ps->chromStart, ps->chromEnd ) ; for (el = elist; el != NULL ; el = el->next) { rPsl = el->val; verbose(2,"retroGene %s %s:%d-%d\n",rPsl->qName, ps->chrom, ps->chromStart,ps->chromEnd); } /* find mrnas that overlap parent gene */ bk = hashFindVal(mrnaHash, ps->gChrom); elist = binKeeperFindSorted(bk, ps->gStart , ps->gEnd ) ; for (el = elist; el != NULL ; el = el->next) { pPsl = el->val; verbose(2,"parent %s %s:%d %d,%d\n", pPsl->qName, pPsl->tName,pPsl->tStart, pPsl->match, pPsl->misMatch); } /* find self chain */ bk = hashFindVal(chainHash, ps->chrom); elist = binKeeperFind(bk, ps->chromStart , ps->chromEnd ) ; slSort(&elist, chainCmpScoreDesc); for (el = elist; el != NULL ; el = el->next) { struct chain *chain = el->val, *subChain, *retChainToFree, *retChainToFree2; int qs = chain->qStart; int qe = chain->qEnd; int id = chain->id; if (chain->qStrand == '-') { qs = chain->qSize - chain->qEnd; qe = chain->qSize - chain->qStart; } if (!sameString(chain->qName , ps->gChrom) || 
!positiveRangeIntersection(qs, qe, ps->gStart, ps->gEnd)) { verbose(2," wrong chain %s:%d-%d %s:%d-%d parent %s:%d-%d\n", chain->qName, qs, qe, chain->tName,chain->tStart,chain->tEnd, ps->gChrom,ps->gStart,ps->gEnd); continue; } verbose(2,"chain id %d %4.0f",chain->id, chain->score); chainSubsetOnT(chain, ps->chromStart+7, ps->chromEnd-7, &subChain, &retChainToFree); if (subChain != NULL) chain = subChain; chainSubsetOnQ(chain, ps->gStart, ps->gEnd, &subChain, &retChainToFree2); if (subChain != NULL) chain = subChain; if (chain->qStrand == '-') { qs = chain->qSize - chain->qEnd; qe = chain->qSize - chain->qStart; } verbose(2," %s:%d-%d %s:%d-%d ", chain->qName, qs, qe, chain->tName,chain->tStart,chain->tEnd); if (subChain != NULL) verbose(2,"subChain %s:%d-%d %s:%d-%d\n", subChain->qName, subChain->qStart, subChain->qEnd, subChain->tName,subChain->tStart,subChain->tEnd); qNib = nibInfoFromCache(nibHash, tNibDir, chain->qName); tNib = nibInfoFromCache(nibHash, tNibDir, chain->tName); tSeq = nibInfoLoadStrand(tNib, chain->tStart, chain->tEnd, '+'); qSeq = nibInfoLoadStrand(qNib, chain->qStart, chain->qEnd, chain->qStrand); axtList = chainToAxt(chain, qSeq, chain->qStart, tSeq, chain->tStart, maxGap, BIGNUM); verbose(2,"axt count %d misMatch cnt %d\n",slCount(axtList), slCount(misMatchList)); for (axt = axtList; axt != NULL ; axt = axt->next) { addMisMatch(&misMatchList, axt, chain->qSize); } verbose(2,"%d in mismatch list %s id %d \n",slCount(misMatchList), chain->qName, id); chainFree(&retChainToFree); chainFree(&retChainToFree2); break; } /* create axt of each expressed retroGene to parent gene */ /* get alignment for each mrna overlapping retroGene */ bk = hashFindVal(mrnaHash, ps->chrom); elist = binKeeperFindSorted(bk, ps->chromStart , ps->chromEnd ) ; { char queryName[512]; char axtName[512]; char pslName[512]; safef(queryName, sizeof(queryName), "/tmp/query.%s.fa", ps->chrom); safef(axtName, sizeof(axtName), "/tmp/tmp.%s.axt", ps->chrom); safef(pslName, 
sizeof(pslName), "/tmp/tmp.%s.psl", ps->chrom); op = fopen(pslName,"w"); for (el = elist ; el != NULL ; el = el->next) { psl = el->val; pslOutput(psl, op, '\t','\n'); qSeq = twoBitReadSeqFrag(twoBitFile, psl->qName, 0, 0); if (qSeq != NULL) slAddHead(&seqList, qSeq); else errAbort("seq %s not found \n", psl->qName); } fclose(op); faWriteAll(queryName, seqList); safef(cmd,sizeof(cmd),"pslPretty -long -axt %s %s %s %s",pslName , nibList, queryName, axtName); ret = system(cmd); if (ret != 0) errAbort("ret is %d %s\n",ret,cmd); verbose(2, "ret is %d %s\n",ret,cmd); af = lineFileOpen(axtName, TRUE); while ((axt = axtRead(af)) != NULL) slAddHead(&mAxt, axt); lineFileClose(&af); } slReverse(&mAxt); /* for each parent/retro pair, count bases matching retro and parent better */ for (el = elist; el != NULL ; el = el->next) { int i, scoreRetro=0, scoreParent=0, scoreNeither=0; struct dyString *parentMatch = newDyString(16*1024); struct dyString *retroMatch = newDyString(16*1024); mPsl = el->val; if (mAxt != NULL) { verbose(2,"mrna %s %s:%d %d,%d axt %s\n", mPsl->qName, mPsl->tName,mPsl->tStart, mPsl->match, mPsl->misMatch, mAxt->qName); assert(sameString(mPsl->qName, mAxt->qName)); for (i = 0 ; i< (mPsl->tEnd-mPsl->tStart) ; i++) { int j = mAxt->tStart - mPsl->tStart; verbose(5, "listLen = %d\n",slCount(&misMatchList)); if ((mf = matchFound(&misMatchList, (mPsl->tStart)+i)) != NULL) { if (toupper(mf->retroBase) == toupper(mAxt->qSym[j+i])) { verbose (3,"match retro[%d] %d %c == %c parent %c %d\n", i,mf->retroLoc, mf->retroBase, mAxt->qSym[j+i], mf->parentBase, mf->parentLoc); dyStringPrintf(retroMatch, "%d,", mf->retroLoc); scoreRetro++; } else if (toupper(mf->parentBase) == toupper(mAxt->qSym[j+i])) { verbose (3,"match parent[%d] %d %c == %c retro %c %d\n", i,mf->parentLoc, mf->parentBase, mAxt->qSym[j+i], mf->retroBase, mf->retroLoc); dyStringPrintf(parentMatch, "%d,", mf->parentLoc); scoreParent++; } else { verbose (3,"match neither[%d] %d %c != %c retro %c %d\n", 
i,mf->parentLoc, mf->parentBase, mAxt->tSym[j+i], mf->retroBase, mf->retroLoc); scoreNeither++; } } } verbose(2,"final score %s parent %d retro %d neither %d\n", mPsl->qName, scoreParent, scoreRetro, scoreNeither); fprintf(outFile,"%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%d\t%d\t%s\t%s\n", ps->chrom, ps->chromStart, ps->chromEnd, ps->name, ps->score, mPsl->tName, mPsl->tStart, mPsl->tEnd, mPsl->qName, scoreParent, scoreRetro, scoreNeither, parentMatch->string, retroMatch->string); mAxt = mAxt->next; } dyStringFree(&parentMatch); dyStringFree(&retroMatch); } } }
struct axt *pslToAxt(struct psl *psl, struct hash *qHash, char *tNibDir, struct dlList *fileCache) { static char *tName = NULL, *qName = NULL; static struct dnaSeq *tSeq = NULL; struct dyString *q = newDyString(16*1024); struct dyString *t = newDyString(16*1024); int blockIx; int qs, ts ; int lastQ = 0, lastT = 0, size; int qOffset = 0; int tOffset = 0; struct axt *axt = NULL; boolean qIsNib = FALSE; boolean tIsNib = FALSE; int cnt = 0; //struct dnaSeq *tSeq = NULL; struct nibInfo *tNib = NULL; struct dnaSeq *qSeq = twoBitReadSeqFrag(twoBitFile, psl->qName, 0, 0); // hGenBankGetMrna(psl->qName, NULL); /* freeDnaSeq(&qSeq); freez(&qName); assert(mrnaList != NULL); for (mrna = mrnaList; mrna != NULL ; mrna = mrna->next) { assert(mrna != NULL); cnt++; if (sameString(mrna->name, psl->qName)) { qSeq = cloneDnaSeq(mrna); assert(qSeq != NULL); break; } } */ if (qSeq == NULL) { warn("mrna sequence data not found %s, searched %d sequences\n",psl->qName,cnt); dyStringFree(&q); dyStringFree(&t); dnaSeqFree(&tSeq); dnaSeqFree(&qSeq); return NULL; } if (qSeq->size != psl->qSize) { warn("sequence %s aligned is different size %d from mrna.fa file %d \n",psl->qName,psl->qSize,qSeq->size); dyStringFree(&q); dyStringFree(&t); dnaSeqFree(&tSeq); dnaSeqFree(&qSeq); return NULL; } qName = cloneString(psl->qName); if (qIsNib && psl->strand[0] == '-') qOffset = psl->qSize - psl->qEnd; else qOffset = 0; verbose(5,"qString len = %d qOffset = %d\n",qSeq->size,qOffset); if (tName == NULL || !sameString(tName, psl->tName) || tIsNib) { freeDnaSeq(&tSeq); freez(&tName); tName = cloneString(psl->tName); tNib = nibInfoFromCache(nibHash, tNibDir, tName); assert(tNib !=NULL); tSeq = nibInfoLoadStrand(tNib, psl->tStart, psl->tEnd, '+'); assert(tSeq !=NULL); tOffset = psl->tStart; //readCachedSeqPart(tName, psl->tStart, psl->tEnd-psl->tStart, // tHash, fileCache, &tSeq, &tOffset, &tIsNib); } verbose(4,"strand t %s \n",psl->strand); if (tSeq != NULL) verbose(5,"tString len = %d tOffset = 
%d\n",tSeq->size,tOffset); else errAbort("tSeq is NULL\n"); if (psl->strand[0] == '-') reverseComplement(qSeq->dna, qSeq->size); //if (strlen(psl->strand) > 1 ) // if (psl->strand[1] == '-') // reverseComplement(tSeq->dna, tSeq->size); for (blockIx=0; blockIx < psl->blockCount; ++blockIx) { qs = psl->qStarts[blockIx] - qOffset; ts = psl->tStarts[blockIx] - tOffset; if (blockIx != 0) { int qGap, tGap, minGap; qGap = qs - lastQ; tGap = ts - lastT; minGap = min(qGap, tGap); if (minGap > 0) { writeGap(q, qGap, qSeq->dna + lastQ, t, tGap, tSeq->dna + lastT); } else if (qGap > 0) { writeInsert(q, t, qSeq->dna + lastQ, qGap); } else if (tGap > 0) { writeInsert(t, q, tSeq->dna + lastT, tGap); } } size = psl->blockSizes[blockIx]; assert(qSeq != NULL); dyStringAppendN(q, qSeq->dna + qs, size); lastQ = qs + size; dyStringAppendN(t, tSeq->dna + ts, size); lastT = ts + size; } if (strlen(q->string) != strlen(t->string)) warn("Symbol count(t) %d != %d inconsistent at t %s:%d and qName %s\n%s\n%s\n", (int)strlen(t->string), (int)strlen(q->string), psl->tName, psl->tStart, psl->qName, t->string, q->string); if (psl->strand[0] == '-') { reverseComplement(q->string, q->stringSize); reverseComplement(t->string, t->stringSize); } axt = axtCreate(q->string, t->string, min(q->stringSize,t->stringSize), psl); dyStringFree(&q); dyStringFree(&t); //dnaSeqFree(&tSeq); dnaSeqFree(&qSeq); if (qIsNib) freez(&qName); //if (tIsNib) // freez(&tName); return axt; }