void pslRecalcMatch(char *inName, char *targetName, char *queryName, char *outName) /* pslRecalcMatch - Recalculate match,mismatch,repMatch columns in psl file. * This can be useful if the psl went through pslMap, or if you've added * lower-case repeat masking after the fact. */ { struct nibTwoCache *tCache = nibTwoCacheNew(targetName); struct dnaSeq *qSeqList = dnaLoadAll(queryName); struct hash *qHash = dnaSeqHash(qSeqList); struct psl *psl; struct lineFile *lf = pslFileOpen(inName); FILE *f = mustOpen(outName, "w"); while ((psl = pslNext(lf)) != NULL) { int tSize; struct dnaSeq *tSeqPart = nibTwoCacheSeqPart(tCache, psl->tName, psl->tStart, psl->tEnd - psl->tStart, &tSize); struct dnaSeq *qSeq = hashMustFindVal(qHash, getQName(psl->qName)); recalcMatches(psl, tSeqPart, psl->tStart, qSeq); pslTabOut(psl, f); dnaSeqFree(&tSeqPart); } carefulClose(&f); lineFileClose(&lf); }
void pslCoverage(char *inLst, char *inPsl, double aliRatio, int trimSize, char *outName, char *misAsmName) /* Analyse inName and put best alignments for eacmRNA in estAliName. * Put repeat info in repName. */ { struct lineFile *in = pslFileOpen(inPsl); FILE *out = mustOpen(outName, "a"); FILE *misAsm = mustOpen(misAsmName, "w"); struct psl *pslList = NULL, *psl; char lastName[256]; int threshold = round((1.0 - (1.0 - aliRatio)*2)*1000); struct hash *probeHash; struct probe *probeList; readProbeList(inLst, &probeList, &probeHash); printf("Found %d probes in %s\n", slCount(probeList), inLst); printf("Processing %s percent ID %f%% threshold %d\n", inPsl, aliRatio*100, threshold); strcpy(lastName, ""); while ((psl = pslNext(in)) != NULL) { if (!sameString(lastName, psl->qName)) { doOneAcc(lastName, pslList, threshold, trimSize, probeHash, misAsm); finishList(&pslList); strcpy(lastName, psl->qName); } slAddHead(&pslList, psl); } doOneAcc(lastName, pslList, threshold, trimSize, probeHash, misAsm); finishList(&pslList); lineFileClose(&in); summarizeProbeList(probeList, trimSize, aliRatio, out, misAsm); fclose(out); }
void migratePsls(struct migrateAligns* migrate, unsigned pslFileType, struct gbEntryCnts* counts, FILE* outPslFh) /* Migrate selected PSL records */ { char inPsl[PATH_LEN]; struct lineFile* inPslLf; struct psl* psl; gbAlignedGetPath(migrate->prevSelect, gPslFileGzExt[pslFileType], NULL, inPsl); /* It's possible to end up here and not have a file if none of the sequences * aligned */ if (fileExists(inPsl)) { gbVerbEnter(2, "migrating %ss from %s", gPslFileExt[pslFileType], inPsl); inPslLf = gzLineFileOpen(inPsl); while ((psl = pslNext(inPslLf)) != NULL) { migratePsl(migrate, pslFileType, counts, psl, inPsl, outPslFh); pslFree(&psl); } gzLineFileClose(&inPslLf); gbVerbLeave(2, "migrating %ss from %s", gPslFileExt[pslFileType], inPsl); } }
void pslCopyInClones(char *listFile, char *partDir, char *outName)
/* Concatenate the psl files that getFileList(listFile, partDir) names into
 * outName, preceded by a single psl header, and report the totals. */
{
FILE *dest = mustOpen(outName, "w");
struct slName *files, *file;
int recordTotal = 0;
int fileTotal = 0;

pslWriteHead(dest);
files = getFileList(listFile, partDir);
for (file = files; file != NULL; file = file->next)
    {
    struct lineFile *src = pslFileOpen(file->name);
    struct psl *rec;

    fileTotal += 1;
    for (rec = pslNext(src); rec != NULL; rec = pslNext(src))
        {
        pslTabOut(rec, dest);
        pslFree(&rec);
        recordTotal += 1;
        }
    lineFileClose(&src);
    }
printf("%d psls in %d files written to %s\n", recordTotal, fileTotal, outName);
fclose(dest);
}
void pslIntronsOnly(char *inPslName, char *genoFile, char *outPslName) /* pslIntronsOnly - Filter psl files to only include those with introns. */ { struct lineFile *lf = NULL; FILE *outFile = NULL; struct hash *genoHash = loadGeno(genoFile); struct psl *psl; int count = 0, intronCount = 0; lf = pslFileOpen(inPslName); outFile = mustOpen(outPslName, "w"); while ((psl = pslNext(lf)) != NULL) { struct dnaSeq *geno = hashMustFindVal(genoHash, psl->tName); if (pslHasIntron(psl, geno, 0)) { ++intronCount; pslTabOut(psl, outFile); } pslFree(&psl); ++count; } carefulClose(&outFile); lineFileClose(&lf); printf("%d of %d in %s have introns\n", intronCount, count, inPslName); }
void pslToBed(char *pslFile, char *bedFile, struct hash *cdsHash, bool doPosName) /* pslToBed -- tranform a psl format file to a bed format file */ { struct lineFile *pslLf = pslFileOpen(pslFile); FILE *bedFh = mustOpen(bedFile, "w"); struct psl *psl; while ((psl = pslNext(pslLf)) != NULL) { struct bed *bed = bedFromPsl(psl); if (doPosName) { char *newName = needMem(512); safef(newName, 512, "%s:%d-%d", psl->qName, psl->qStart, psl->qEnd); freeMem(bed->name); bed->name = newName; } if (cdsHash) { struct cds *cds = hashFindVal(cdsHash, psl->qName); if (cds == NULL) bed->thickStart = bed->thickEnd = bed->chromStart; else setThick(psl, bed, cds); } bedTabOutN(bed, 12, bedFh); bedFree(&bed); pslFree(&psl); } carefulClose(&bedFh); lineFileClose(&pslLf); }
struct seqPair *readPslBlocks(char *fileName, struct hash *pairHash, FILE *f) /* Read in psl file and parse blocks into pairHash */ { struct seqPair *spList = NULL, *sp; struct lineFile *lf = pslFileOpenWithUniqueMeta(fileName, f); struct dyString *dy = newDyString(512); struct psl *psl; while ((psl = pslNext(lf)) != NULL) { dyStringClear(dy); dyStringPrintf(dy, "%s%s%s", psl->qName, psl->strand, psl->tName); sp = hashFindVal(pairHash, dy->string); if (sp == NULL) { AllocVar(sp); slAddHead(&spList, sp); hashAddSaveName(pairHash, dy->string, sp, &sp->name); sp->qName = cloneString(psl->qName); sp->tName = cloneString(psl->tName); sp->qStrand = psl->strand[0]; } addPslBlocks(&sp->blockList, psl); sp->axtCount += 1; pslFree(&psl); } lineFileClose(&lf); dyStringFree(&dy); return spList; }
static void pslMap(char* inPslFile, char *mapFile, char *outPslFile) /* project inPsl query through mapFile query to mapFile target */ { struct chromBins *mapAlns; struct psl* inPsl; struct lineFile* inPslLf = pslFileOpen(inPslFile); FILE *outPslFh, *mapInfoFh = NULL, *mappingPslFh = NULL; if (chainMapFile) mapAlns = loadMapChains(mapFile); else mapAlns = loadMapPsls(mapFile); outPslFh = mustOpen(outPslFile, "w"); if (mapInfoFile != NULL) { mapInfoFh = mustOpen(mapInfoFile, "w"); fputs(mapInfoHdr, mapInfoFh); } if (mappingPslFile != NULL) mappingPslFh = mustOpen(mappingPslFile, "w"); while ((inPsl = pslNext(inPslLf)) != NULL) { if (swapIn) pslSwap(inPsl, FALSE); mapQueryPsl(inPsl, mapAlns, outPslFh, mapInfoFh, mappingPslFh); pslFree(&inPsl); } carefulClose(&mappingPslFh); carefulClose(&mapInfoFh); carefulClose(&outPslFh); lineFileClose(&inPslLf); }
void sgName(char *database, char *protDb, char *refPsl, char *outAssoc) /* sgName - builds association table between knownPep and gene common name. */ { struct sqlConnection *conn = sqlConnect(database); //struct sqlConnection *conn2 = sqlConnect("swissProt"); char *words[1], **row; FILE *f = mustOpen(outAssoc, "w"); struct lineFile *pslLf = pslFileOpen(refPsl); int count = 0, found = 0; char query[256]; struct psl *psl; char *swiss = NULL; while ((psl = pslNext(pslLf)) != NULL) { fprintf(f,"%s\t%s\t%s:%d-%d\t",psl->qName, lookupName(conn,psl->qName), psl->tName, psl->tStart, psl->tEnd); fprintf(f,"%s\n", swiss = getSwiss(conn, psl->qName)); } /* while (lineFileRow(lf, words)) { fprintf(f,"%s\t%s\n",words[0], lookupName(conn,words[0])); //, getSwiss(conn, words[0])); } */ hFreeConn(&conn); }
void copyPslToTab(char *pslFile, char *tabFile) /* copy a single PSL to the tab file */ { struct psl *psl; struct lineFile *lf = pslFileOpen(pslFile); struct pipeline *pl = NULL; FILE *tabFh = NULL; if (noSort) tabFh = mustOpen(tabFile, "w"); else { if (pslCreateOpts & PSL_WITH_BIN) pl = pipelineOpen(outPipeBin, pipelineWrite, tabFile, NULL); else pl = pipelineOpen(outPipeNoBin, pipelineWrite, tabFile, NULL); tabFh = pipelineFile(pl); } while ((psl = pslNext(lf)) != NULL) { if (pslCreateOpts & PSL_WITH_BIN) fprintf(tabFh, "%u\t", hFindBin(psl->tStart, psl->tEnd)); pslTabOut(psl, tabFh); pslFree(&psl); } lineFileClose(&lf); if (noSort) carefulClose(&tabFh); else { pipelineWait(pl); pipelineFree(&pl); } }
static void pslAlignStats(char *pslFile, char *statsFile, char *querySizeFile) /* collect and output per-alignment stats */ { struct hash* querySizesTbl = (querySizeFile != NULL) ? querySizeCntLoad(querySizeFile) : NULL; struct lineFile *pslLf = pslFileOpen(pslFile); FILE *fh = mustOpen(statsFile, "w"); struct psl* psl; fputs(alnStatsHdr, fh); while ((psl = pslNext(pslLf)) != NULL) { fprintf(fh, alnStatsFmt, psl->qName, psl->qSize, psl->tName, psl->tStart, psl->tEnd, calcIdent(psl), calcQCover(psl), calcRepMatch(psl), calcTCover(psl)); if (querySizesTbl != NULL) querySizeCntGet(querySizesTbl, psl->qName, psl->qSize)->alnCnt++; pslFree(&psl); } lineFileClose(&pslLf); if (querySizesTbl != NULL) alignStatsOutputUnaligned(fh, querySizesTbl); carefulClose(&fh); }
void pslCut(char *cutList, char *inPsl, char *outPsl) /* pslCut - Remove a list of clones from psl file.. */ { struct hash *cutHash = newHash(0); struct lineFile *lf = pslFileOpen(inPsl); FILE *f = mustOpen(outPsl, "w"); struct psl *psl; char cloneName[128]; int total = 0, cut = 0; buildCutHash(cutList, cutHash); pslWriteHead(f); while ((psl = pslNext(lf)) != NULL) { fragToCloneName(psl->tName, cloneName); if (!hashLookup(cutHash, cloneName)) { pslTabOut(psl, f); } else ++cut; ++total; pslFree(&psl); } printf("Cut %d of %d\n", cut, total); }
void fillInPsls(char *pslName, struct hash *pairHash) /* Read in psl file and save overlaps between indicated pairs * in hash. */ { struct lineFile *lf = pslFileOpen(pslName); struct psl *psl; char *pairName; struct seqPair *pair; struct seqOver *so; boolean firstA; char queryClone[128], targetClone[128]; struct hashEl *hel; while ((psl = pslNext(lf)) != NULL) { fragToCloneName(psl->qName, queryClone); fragToCloneName(psl->tName, targetClone); pairName = makePairName(queryClone, targetClone, &firstA); if ((pair = hashFindVal(pairHash, pairName)) != NULL) { so = (firstA ? &pair->a : &pair->b); slAddHead(&so->pslList, psl); } else { pslFree(&psl); } } }
void pslxToFa(char *pslName, char *faName, char *liftTargetName, char *liftQueryName) /* pslxToFa - convert pslx to fasta file. */ { FILE *liftTarget = NULL; FILE *liftQuery = NULL; struct lineFile *in = pslFileOpen(pslName); FILE *out = mustOpen(faName, "w"); struct psl *psl; if (liftQueryName != NULL) liftQuery = mustOpen(liftQueryName, "w"); if (liftTargetName != NULL) liftTarget = mustOpen(liftTargetName, "w"); while ((psl = pslNext(in)) != NULL) { int ii=0; //int sumQuery = 0; if (liftQuery != NULL) { fprintf(liftQuery,"%d\t%s/%s_%d_%d\t%ld\t%s\t%d\n", psl->qStarts[0], "1", psl->qName,0,psl->blockCount, (long)strlen(psl->qSequence[0]), psl->qName, psl->qSize); //sumQuery += strlen(psl->qSequence[0]); } if (liftTarget != NULL) { if (psl->strand[1] == '-') fprintf(liftTarget,"%d\t%s/%s_%d_%d\t%ld\t%s\t%d\t%c\n", psl->tSize - psl->tStarts[0], &psl->tName[3], psl->qName,0,psl->blockCount, (long)3*strlen(psl->qSequence[0]), psl->tName, psl->tSize, psl->strand[1]); else fprintf(liftTarget,"%d\t%s/%s_%d_%d\t%ld\t%s\t%d\t%c\n", psl->tStarts[0], &psl->tName[3], psl->qName,0,psl->blockCount, (long)3*strlen(psl->qSequence[0]), psl->tName, psl->tSize, psl->strand[1]); } fprintf(out,">%s_%d_%d\n%s\n",psl->qName, 0, psl->blockCount, psl->qSequence[0]); for(ii=1; ii < psl->blockCount; ii++) { if (liftTarget != NULL) { if (psl->strand[1] == '-') fprintf(liftTarget,"%d\t%s/%s_%d_%d\t%ld\t%s\t%d\t%c\n", psl->tSize - psl->tStarts[ii], &psl->tName[3], psl->qName,ii,psl->blockCount, (long)3*strlen(psl->qSequence[ii]), psl->tName, psl->tSize, psl->strand[1]); else fprintf(liftTarget,"%d\t%s/%s_%d_%d\t%ld\t%s\t%d\t%c\n", psl->tStarts[ii], &psl->tName[3], psl->qName,ii,psl->blockCount, (long)3*strlen(psl->qSequence[ii]), psl->tName, psl->tSize, psl->strand[1]); } if (liftQuery != NULL) { fprintf(liftQuery,"%d\t%s/%s_%d_%d\t%ld\t%s\t%d\n", psl->qStarts[ii], "1", psl->qName,ii,psl->blockCount, (long)strlen(psl->qSequence[ii]), psl->qName, psl->qSize); //sumQuery += 
strlen(psl->qSequence[ii]); } fprintf(out,">%s_%d_%d\n%s\n",psl->qName, ii, psl->blockCount, psl->qSequence[ii]); } pslFree(&psl); } }
void pslMrnaCover(char *pslFile, char *faFile) /* pslMrnaCover - Make histogram of coverage percentage of mRNA in psl. */ { static int histogram[101]; int i; int qAli; struct hash *hash; struct rnaCover *rcList = NULL, *rc; struct lineFile *lf = pslFileOpen(pslFile); struct psl *psl; /* Build up list of all sequences. */ readFa(faFile, &rcList, &hash); /* Scan psls and see maximum amount each is aligned. */ while ((psl = pslNext(lf)) != NULL) { if (psl->qSize >= minSize) { if ((rc = hashFindVal(hash, psl->qName)) == NULL) errAbort("%s is in %s but not %s", psl->qName, pslFile, faFile); if (rc->qSize != psl->qSize) errAbort("%s is %d bytes in %s but %d in %s", psl->qName, rc->qSize, faFile, psl->qSize, pslFile); qAli = psl->match + psl->repMatch + psl->misMatch; if (qAli > rc->qMaxAli) rc->qMaxAli = qAli; } pslFree(&psl); } lineFileClose(&lf); /* Open file to keep track of non-aligners */ if (listZero != NULL) { FILE *f = mustOpen(listZero, "w"); for (rc = rcList; rc != NULL; rc = rc->next) { if (rc->qMaxAli == 0) fprintf(f, "%s\t%d\n", rc->name, rc->qSize); } } /* Talley up percentage aligning in histogram. */ for (rc = rcList; rc != NULL; rc = rc->next) { int histIx = roundingScale(100, rc->qMaxAli, rc->qSize); assert(histIx <= 100); histogram[histIx] += 1; } /* Print out histogram. */ for (i=0; i<=100; ++i) { printf("%3d%% %6d\n", i, histogram[i]); } }
static struct psl *pslInputNext(struct pslInput *pi)
/* Return the record held in pi->pending, clearing that slot, or read the
 * next record from pi->lf when nothing is pending.  Returns NULL at end of
 * file. */
{
struct psl *held = pi->pending;

if (held == NULL)
    return pslNext(pi->lf);
pi->pending = NULL;
return held;
}
void exonMap(char *query, char *target, char *output) /* exonMap - map exons using two psls. */ { struct lineFile *qlf = pslFileOpen(query); struct lineFile *tlf = pslFileOpen(target); struct psl *psl, *pslList, *newPslList, *pslRef; struct hash *pslHash = newHash(0); FILE *outF = mustOpen(output, "w"); while ((psl = pslNext(qlf)) != NULL) { pslList = hashFindVal(pslHash, psl->qName); if (pslList == NULL) hashAdd(pslHash, psl->qName, psl); else { psl->next = pslList->next; pslList->next = psl; } } while ((psl = pslNext(tlf)) != NULL) { struct psl *newPsl = NULL; pslList = hashFindVal(pslHash,psl->qName); // if (pslList == NULL) // errAbort("can't find %s in query file",psl->qName); for(pslRef = pslList; pslRef ; pslRef = pslRef->next ) { if (optionExists("exons")) mapBlocks(pslRef, psl, outPsl, (void *)outF); else { newPsl = NULL; mapBlocks(pslRef, psl, addPsl, &newPsl); pslTabOut(newPsl, outF); } } } }
int readAlignments(char *pairsPsl, struct hash *readHash, struct hash *fragHash) /* Read in alignments and process them into the read->aliList. * Returns number of alignments altogether. */ { struct lineFile *lf = pslFileOpen(pairsPsl); struct shortAli *ali; struct psl *psl; struct readInfo *rd; int aliCount = 0; int dotEvery = 20*1024; int dotty = dotEvery; int aliSize; printf("Reading and processing %s\n", pairsPsl); for (;;) { AllocVar(ali); /* Allocate this first to reduce memory fragmentation. */ if ((psl = pslNext(lf)) == NULL) { freeMem(ali); break; } if (filter(psl)) { rd = hashMustFindVal(readHash, psl->qName); aliSize = psl->match + psl->repMatch; aliSize /= 100; if (aliSize < 0) aliSize = 0; if (aliSize >= ArraySize(aliSizes)) aliSize = ArraySize(aliSizes)-1; aliSizes[aliSize] += 1; ali->tName = hashStoreName(fragHash, psl->tName); ali->tStart = psl->tStart; ali->tEnd = psl->tEnd; ali->tSize = psl->tSize; ali->strand = psl->strand[0]; slAddHead(&rd->aliList, ali); pslFree(&psl); ++aliCount; } else { pslFree(&psl); freeMem(ali); } if (--dotty <= 0) { dotty = dotEvery; printf("."); fflush(stdout); } } printf("\n"); return aliCount; }
void readPslFile (struct lineFile *pf, struct hash **hash)
/* Read the rest of psl file pf and store each record in *hash under the
 * key createKey builds from qName, tName, tStart and tEnd.  Only the first
 * record for a key is kept; later ones with the same key are freed.
 * The caller keeps ownership of pf. */
{
struct hash *pslHash = *hash;
struct psl *psl;

while ((psl = pslNext(pf)) != NULL)
    {
    /* NOTE(review): createKey is not visible here; if it allocates, key
     * should be freed after use. */
    char *key = createKey(psl->qName, psl->tName, psl->tStart, psl->tEnd);

    if (existsInHash(pslHash, key))
        pslFree(&psl);      /* duplicate: nothing else references it */
    else
        hashAdd(pslHash, key, psl);
    }
}
void pslPretty(char *pslName, char *targetList, char *queryList, char *prettyName, boolean axt, char *checkFileName)
/* pslPretty - Convert PSL to human readable output.
 *   pslName       - alignments to render
 *   targetList    - list of target sequence files
 *   queryList     - list of query sequence files
 *   prettyName    - output file
 *   axt           - passed on to prettyOne
 *   checkFileName - optional; receives the accumulated total_* counters
 * When the file-level dot is positive a '.' goes to stderr every dot
 * records. */
{
struct hash *fileHash = newHash(0);     /* No value. */
struct hash *tHash = newHash(20);       /* seqFilePos value. */
struct hash *qHash = newHash(20);       /* seqFilePos value. */
struct dlList *fileCache = newDlList();
struct lineFile *src = pslFileOpen(pslName);
FILE *dest = mustOpen(prettyName, "w");
FILE *checkFile = NULL;
struct psl *rec;
int dotsLeft = dot;

if (checkFileName != NULL)
    checkFile = mustOpen(checkFileName, "w");

hashFileList(targetList, fileHash, tHash);
hashFileList(queryList, fileHash, qHash);

for (rec = pslNext(src); rec != NULL; rec = pslNext(src))
    {
    /* The countdown only runs when progress dots are enabled. */
    if (dot > 0 && --dotsLeft <= 0)
        {
        fprintf(stderr,".");    /* stderr flushes itself */
        dotsLeft = dot;
        }
    prettyOne(rec, qHash, tHash, fileCache, dest, axt, checkFile);
    pslFree(&rec);
    }
if (dot > 0)
    fprintf(stderr,"\n");

if (checkFile != NULL)
    {
    fprintf(checkFile,"missLargeStart: %d\n", total_missLargeStart);
    fprintf(checkFile,"missSmallStart: %d\n", total_missSmallStart);
    fprintf(checkFile,"missLargeEnd: %d\n", total_missLargeEnd);
    fprintf(checkFile,"missSmallEnd: %d\n", total_missSmallEnd);
    fprintf(checkFile,"missLargeMiddle: %d\n", total_missLargeMiddle);
    fprintf(checkFile,"missSmallMiddle: %d\n", total_missSmallMiddle);
    fprintf(checkFile,"weirdSplice: %d\n", total_weirdSplice);
    fprintf(checkFile,"doubleGap: %d\n", total_doubleGap);
    fprintf(checkFile,"jumpBack: %d\n", total_jumpBack);
    fprintf(checkFile,"perfect: %d\n", total_rnaPerfect);
    fprintf(checkFile,"total: %d\n", total_rnaCount);
    }

lineFileClose(&src);
carefulClose(&dest);
carefulClose(&checkFile);
}
void pslToChain(char *pslIn, char *chainOut) /* pslToChain - Extract multiple psl records. */ { struct lineFile *lf = pslFileOpen(pslIn); int chainId = 1; int ii; FILE *f = mustOpen(chainOut, "w"); struct psl *psl; struct chain chain; while ((psl = pslNext(lf) ) != NULL) { if (psl->strand[1] == '-') { if (ignoreError) continue; errAbort("PSL record on line %d has '-' for target strand which is not allowed.", lf->lineIx); } chain.score = pslScore(psl); chain.id = chainId++; chain.tName = psl->tName; chain.tSize = psl->tSize; chain.tStart = psl->tStart; chain.tEnd = psl->tEnd; chain.qName = psl->qName; chain.qSize = psl->qSize; chain.qStrand = psl->strand[0]; if (psl->strand[0] == '-') { chain.qEnd = psl->qSize - psl->qStart; chain.qStart = psl->qSize - psl->qEnd; } else { chain.qStart = psl->qStart; chain.qEnd = psl->qEnd; } chainWriteHead(&chain,f); for(ii=0; ii < psl->blockCount; ii++) { fprintf(f, "%d", psl->blockSizes[ii]); if (ii < psl->blockCount - 1) fprintf(f, "\t%d\t%d", psl->tStarts[ii+1]-(psl->tStarts[ii] + psl->blockSizes[ii]), psl->qStarts[ii+1]-(psl->qStarts[ii] + psl->blockSizes[ii])); fprintf(f,"\n"); } pslFree(&psl); } }
void pslGlue(char *inNames[], int inCount, char *outName, char *glueName) /* Reduce a psl file to only the gluing components. */ { FILE *out; FILE *glue; struct psl *pslList = NULL, *psl, *nextPsl; int i; struct psl *localList = NULL; int glueCount = 0; int pslCount = 0; printf("Reading"); for (i=0; i<inCount; ++i) { char *inName = inNames[i]; struct lineFile *lf = pslFileOpen(inName); printf(" %s", inName); fflush(stdout); while ((psl = pslNext(lf)) != NULL) { slAddHead(&pslList, psl); ++pslCount; } lineFileClose(&lf); } printf("\n"); slSort(&pslList, pslCmpQuery); out = mustOpen(outName, "w"); glue = mustOpen(glueName, "w"); pslWriteHead(out); /* Chop this up into chunks that share the same query. */ for (psl = pslList; psl != NULL; psl = nextPsl) { nextPsl = psl->next; if (localList != NULL) { if (!sameString(localList->qName, psl->qName)) { glueCount += simpleOut(out, glue, &localList); localList = NULL; } } slAddHead(&localList, psl); } glueCount += simpleOut(out, glue, &localList); printf("Got %d gluing mRNAs out of %d psls in %d bundles %d ltot %d mtot\n", glueCount, pslCount, outCount, ltot, mtot); fclose(out); fclose(glue); }
void fixBlastTrack(char *query, char *target, char *outFile) { struct lineFile *qlf = pslFileOpen(query); struct lineFile *tlf = pslFileOpen(target); struct psl *psl, *queryPsl, *newPslList; struct hash *pslHash = newHash(0); FILE *outStream = mustOpen(outFile, "w"); while ((psl = pslNext(qlf)) != NULL) { queryPsl = hashFindVal(pslHash, psl->qName); if (queryPsl != NULL) errAbort("each qName in query psl file must be unique (%s)",psl->qName); hashAdd(pslHash, psl->qName, psl); } while ((psl = pslNext(tlf)) != NULL) { queryPsl = hashFindVal(pslHash, psl->qName); if (queryPsl == NULL) errAbort("can't find %s in query file",psl->qName); if ((queryPsl->qStarts[0] != 0) && (psl->qStarts[0] < queryPsl->blockSizes[0])) { int qStart, qEnd, tBlock; assert(queryPsl->qStart == queryPsl->qStarts[0]); qStart = 0;// queryPsl->qStarts[0]; qEnd = qStart + queryPsl->blockSizes[0]; psl->qStarts[0] += queryPsl->qStart; psl->qStart = psl->qStarts[0]; tBlock = 1; while((tBlock < psl->blockCount) && (psl->qStarts[tBlock] >= qStart) && (psl->qStarts[tBlock] < qEnd)) psl->qStarts[tBlock++] += queryPsl->qStart; psl->qEnd = psl->qStarts[psl->blockCount - 1] + psl->blockSizes[psl->blockCount - 1]; } pslTabOut(psl, outStream); } }
void pslGlueRna(char *listFile, char *partDir, char *pslName, char *gluName) /* Reduce a psl files for only the gluing mRNA/EST components. */ { FILE *pslOut; FILE *gluOut; struct psl *pslList = NULL, *psl, *nextPsl; struct psl *localList = NULL; int glueCount = 0; int pslCount = 0; struct slName *inList, *inEl; inList = getFileList(listFile, partDir); for (inEl = inList; inEl != NULL; inEl = inEl->next) { char *inName = inEl->name; struct lineFile *lf = pslFileOpen(inName); while ((psl = pslNext(lf)) != NULL) { slAddHead(&pslList, psl); ++pslCount; } lineFileClose(&lf); } slSort(&pslList, pslCmpQuery); pslOut = mustOpen(pslName, "w"); gluOut = mustOpen(gluName, "w"); pslWriteHead(pslOut); /* Chop this up into chunks that share the same query. */ for (psl = pslList; psl != NULL; psl = nextPsl) { nextPsl = psl->next; if (localList != NULL) { if (!sameString(localList->qName, psl->qName)) { glueCount += output(pslOut, gluOut, &localList); localList = NULL; } } slAddHead(&localList, psl); } glueCount += output(pslOut, gluOut, &localList); printf("Got %d gluing mRNAs out of %d psls in %d bundles %d ltot %d mtot to %s\n", glueCount, pslCount, outCount, ltot, mtot, gluName); fclose(pslOut); fclose(gluOut); }
static void pslToPslx(char *inPslFile, char *qSeqSpec, char *tSeqSpec, char *outPslFile)
/* pslToPslx - Convert from psl to pslx alignment format.
 * Each record of inPslFile is passed to writePslx together with sequence
 * readers built from qSeqSpec and tSeqSpec. */
{
struct lineFile *src = pslFileOpen(inPslFile);
struct seqReader *queryReader = seqReaderNew(qSeqSpec);
struct seqReader *targetReader = seqReaderNew(tSeqSpec);
FILE *dest = mustOpen(outPslFile, "w");
struct psl *rec;

for (rec = pslNext(src); rec != NULL; rec = pslNext(src))
    {
    writePslx(dest, queryReader, targetReader, rec);
    pslFree(&rec);
    }
lineFileClose(&src);
carefulClose(&dest);
}
void pslUnpile(char *inName, char *outName) /* pslUnpile - Removes huge piles of alignments from sorted * psl files (due to unmasked repeats presumably).. */ { FILE *f = mustOpen(outName, "w"); enum gfType qType, tType; struct lineFile *lf; struct psl *list = NULL, *psl, *el; pslxFileOpen(inName, &qType, &tType, &lf); if (!noHead) pslxWriteHead(f, qType, tType); for (;;) { psl = pslNext(lf); if (list != NULL && (psl == NULL || !pslOverlap(psl, list))) { if (list != NULL) { slReverse(&list); if (checkPile(list)) { for (el = list; el != NULL; el = el->next) { pslTabOut(el, f); } } else { for (el = list; el != NULL; el = el->next) { if (psl == NULL) pslTabOut(el, f); else if (psl->tEnd - psl->tStart > 4000) pslTabOut(el, f); } } pslFreeList(&list); } } if (psl == NULL) break; slAddHead(&list, psl); } lineFileClose(&lf); carefulClose(&f); }
void pslRcFile(char *inPslFile, char *outPslFile)
/* Apply pslRc to every record of inPslFile and write the results to
 * outPslFile. */
{
struct lineFile *src = pslFileOpen(inPslFile);
FILE *dest = mustOpen(outPslFile, "w");
struct psl *rec;

for (rec = pslNext(src); rec != NULL; rec = pslNext(src))
    {
    pslRc(rec);
    pslTabOut(rec, dest);
    pslFree(&rec);
    }
carefulClose(&dest);
lineFileClose(&src);
}
void fbOrPsl(Bits *acc, char *track, char *chrom, int chromSize)
/* Or into acc the bits of the track's psl file for chrom.  Only records
 * whose target name equals chrom contribute.  Returns quietly if the file
 * named by chromFileName does not exist. */
{
char path[512];
struct lineFile *src;
struct psl *rec;

chromFileName(track, chrom, path);
if (fileExists(path))
    {
    src = pslFileOpen(path);
    for (rec = pslNext(src); rec != NULL; rec = pslNext(src))
        {
        if (sameString(rec->tName, chrom))
            setPslBits(src, acc, rec, 0, chromSize);
        pslFree(&rec);
        }
    lineFileClose(&src);
    }
}
static struct hash *collectQueryStats(char *pslFile, char *querySizeFile)
/* Accumulate per-query statistics over every alignment in pslFile and
 * return the table holding them.  The table is preloaded from
 * querySizeFile when one is given, otherwise it starts out empty. */
{
struct hash *statsByQuery;
struct lineFile *src;
struct psl* rec;

if (querySizeFile != NULL)
    statsByQuery = sumStatsLoad(querySizeFile);
else
    statsByQuery = hashNew(queryHashPowTwo);

src = pslFileOpen(pslFile);
for (rec = pslNext(src); rec != NULL; rec = pslNext(src))
    {
    struct sumStats *stats = sumStatsGetForQuery(statsByQuery, rec->qName, rec->qSize);
    sumStatsAccumulateQuery(stats, rec);
    pslFree(&rec);
    }
lineFileClose(&src);
return statsByQuery;
}
void extractUsedPairs(struct hash *pairHash, char *inPslName, char *outPairName) /* Extract pairs that are used in inPsl to outPair. */ { struct hash *refHash = newHash(12); struct pairRef *refList = NULL, *ref; struct psl *psl; struct lineFile *lf; printf("Processing pairs from %s to %s\n", inPslName, outPairName); lf = pslFileOpen(inPslName); while ((psl = pslNext(lf)) != NULL) { char *name = psl->qName; struct hashEl *hel; struct pair *pair; if ((hel = hashLookup(pairHash, name)) != NULL) { pair = hel->val; if ((hel = hashLookup(refHash, name)) != NULL) { ref = hel->val; } else { AllocVar(ref); ref->pair = pair; slAddHead(&refList, ref); hashAdd(refHash, pair->a, ref); hashAdd(refHash, pair->b, ref); } if (sameString(name, pair->a)) ref->gotA = TRUE; else ref->gotB = TRUE; } pslFree(&psl); } slReverse(&refList); writePairs(outPairName, refList); slFreeList(&refList); freeHash(&refHash); }