void filterPsls() { struct psl *origPslList=NULL, *pslList=NULL, *psl=NULL; int startCount=0, stopCount=0; char buff[256]; origPslList = pslLoadAll(pslIn); /* some messages for the user */ startCount = slCount(origPslList); sprintf(buff, "Filtering %d psl using seqIdent=%g and basePct=%g\n", startCount, seqIdent, basePct); msg(buff); /* do our filtering */ pslList = filterBySeqIdentity(seqIdent, origPslList); pslFreeList(&origPslList); origPslList = filterByBasePct(basePct, pslList); /* let the user know we're done */ if(origPslList != NULL) { stopCount = slCount(origPslList); pslWriteAll(origPslList, pslOut, FALSE); pslFreeList(&origPslList); } pslFreeList(&origPslList); pslFreeList(&pslList); sprintf(buff, "After filtering %d of %d are left\n", stopCount, startCount); msg(buff); }
void protDat(char *protName, char *blatName, char *aliasFile, char *outName) { FILE *outFile = mustOpen(outName, "w"); struct hash *protHash = newHash(10); struct hash *blatHash = newHash(10); struct hash *aliasHash = newHash(10); struct psl *psls, *pslPtr, *protPsls, *blatPsl; struct lineFile *lf = lineFileOpen(aliasFile, TRUE); struct alias *alPtr; char buffer[1024]; char *words[3]; int numWords = optionExists("fb") ? 2 : 3; while (lineFileNextRow(lf, words, numWords)) { AllocVar(alPtr); alPtr->kgName = cloneString(words[1]); if (numWords == 3) alPtr->spName = cloneString(words[2]); hashAdd(aliasHash, cloneString(words[0]), alPtr); } protPsls = pslLoadAll(protName); pslPtr = psls = pslLoadAll(blatName); for(; pslPtr; pslPtr = pslPtr->next) hashAdd(blatHash, pslPtr->qName, pslPtr); for(pslPtr = protPsls; pslPtr; pslPtr = pslPtr->next) { if ((blatPsl = hashFindVal(blatHash, pslPtr->qName)) != NULL) { if ((alPtr = hashFindVal(aliasHash, pslPtr->qName)) != NULL) { if (numWords == 3) sprintf(buffer,"%s.%s:%d-%d.%s.%s",pslPtr->qName,blatPsl->tName, blatPsl->tStart, blatPsl->tEnd,alPtr->kgName, alPtr->spName); else sprintf(buffer,"%s.%s:%d-%d.%s",pslPtr->qName,blatPsl->tName, blatPsl->tStart, blatPsl->tEnd,alPtr->kgName); pslPtr->qName = buffer; pslTabOut(pslPtr, outFile); } } } }
void readInPslHash(struct hash *pslHash, char *file)
/* Load all psls from file and add each one to pslHash under its qName. */
{
struct psl *cur = pslLoadAll(file);

while (cur != NULL)
    {
    hashAdd(pslHash, cur->qName, cur);
    cur = cur->next;
    }
}
struct hash *hashPsls(char *fileName)
/* Load every psl in fileName, add each to a new hash under its qName,
 * report how many were loaded, and return the hash. */
{
struct psl *loaded = pslLoadAll(fileName);
struct hash *byQuery = newHash(20);
struct psl *cur = loaded;

while (cur != NULL)
    {
    hashAdd(byQuery, cur->qName, cur);
    cur = cur->next;
    }
uglyf("Loaded %d psls from %s\n", slCount(loaded), fileName);
return byQuery;
}
struct hash *hashPsls(char *pslFileName) { struct psl *pslList = NULL, *psl = NULL, *pslSubList = NULL, *pslNext = NULL; struct hash *pslHash = newHash(15); char *last = NULL; char key[128]; char *tmp = NULL; pslList = pslLoadAll(pslFileName); /* Fix psl names */ for(psl = pslList; psl != NULL; psl = psl->next) { tmp = strrchr(psl->qName, ';'); *tmp = '\0'; tmp = strstr(psl->qName,prefix); assert(tmp); /* checks if there are 2 occurrences of ":" in probe name as in full name */ /* if probe name is shortened to fit in the seq table, there is only 1 ":"*/ /* e.g. full: consensus:HG-U133A:212933_x_at; short:HG-U133A:212933_x_at;*/ if (countChars(psl->qName, *prefix) == 2) { tmp = strstr(tmp+1,prefix); assert(tmp); } tmp = tmp + strlen(prefix); safef(psl->qName, strlen(psl->qName), "%s", tmp); } /* Sort based on query name. */ slSort(&pslList, pslCmpQuery); /* For each psl, if it is has the same query name add it to the sublist. Otherwise store the sublist in the hash and start another. */ for(psl = pslList; psl != NULL; psl = pslNext) { pslNext = psl->next; if(last != NULL && differentWord(last, psl->qName)) { hashAddUnique(pslHash, last, pslSubList); pslSubList = NULL; } slAddTail(&pslSubList, psl); last = psl->qName; } /* Add the last sublist */ hashAddUnique(pslHash, last, pslSubList); return pslHash; }
struct hapRegions *hapRegionsNew(char *hapPslFile, FILE *hapRefMappedFh, FILE *hapRefCDnaFh)
/* Build a hapRegions object from the PSL alignments in hapPslFile (haplotype
 * pseudo-chromosomes aligned to the haplotype regions of the reference
 * chromosomes).  The two file handles are stored in the object as given. */
{
struct psl *pending = pslLoadAll(hapPslFile);
struct psl *cur;
struct hapRegions *hr;

AllocVar(hr);
hr->refMap = hashNew(12);
hr->hapMap = hashNew(12);
hr->hapRefMappedFh = hapRefMappedFh;
hr->hapRefCDnaFh = hapRefCDnaFh;

/* Hand each alignment over one at a time, detaching it from the list first. */
for (cur = slPopHead(&pending); cur != NULL; cur = slPopHead(&pending))
    addHapMapping(hr, cur);
return hr;
}
static struct hash* loadPslByQname(char* inPslFile)
/* Load the PSLs in inPslFile into a hash keyed by qName; each hash value is
 * the list of PSLs for that query.  Any PSL whose target strand is not '+'
 * is reverse-complemented first so later processing only sees positive
 * target strands. */
{
struct hash *byQName = hashNew(0);
struct psl *remaining = pslLoadAll(inPslFile);
struct psl *cur;

for (;;)
    {
    cur = slPopHead(&remaining);
    if (cur == NULL)
        break;
    if (pslTStrand(cur) != '+')
        pslRc(cur);
    /* The hash element's val slot doubles as the head of this query's list. */
    struct hashEl *slot = hashStore(byQName, cur->qName);
    struct psl **bucket = (struct psl **)&slot->val;
    slAddHead(bucket, cur);
    }
return byQName;
}
void doPsls(struct sqlConnection *conn, char *db, char *orthoDb, char *chrom, char *netTable, char *pslFileName, char *pslTableName, char *outBedName, char *selectedFileName, int *foundCount, int *notFoundCount) /* Map over psls. */ { FILE *bedOut = NULL; FILE *selectedOut = NULL; struct bed *bed = NULL; struct psl *psl=NULL, *pslList = NULL; /* Load psls. */ warn("Loading psls."); if(pslFileName) pslList=pslLoadAll(pslFileName); else pslList=loadPslFromTable(conn, pslTableName, chrom, 0, BIGNUM); /* Convert psls. */ warn("Converting psls."); assert(outBedName); bedOut = mustOpen(outBedName, "w"); if (selectedFileName != NULL) selectedOut = mustOpen(selectedFileName, "w"); for(psl = pslList; psl != NULL; psl = psl->next) { if(differentString(psl->tName, chrom)) continue; occassionalDot(); bed = orthoBedFromPsl(conn, db, orthoDb, netTable, psl); if(bed != NULL && bed->blockCount > 0) { (*foundCount)++; bedTabOutN(bed, 12, bedOut); if (selectedOut != NULL) pslTabOut(psl, selectedOut); } else (*notFoundCount)++; bedFree(&bed); } carefulClose(&selectedOut); carefulClose(&bedOut); }
void loadPslsFromFile(char *pslFile, char *chrom, struct sqlConnection *conn) /** Load the psls from the directed file (instead of the database. */ { struct psl *psl = NULL, *pslNext = NULL, *pslList = NULL; pslList = pslLoadAll(pslFile); for(psl = pslList; psl != NULL; psl = psl->next) { minPslStart = min(psl->tStart, minPslStart); maxPslEnd = max(psl->tEnd, maxPslEnd); } chromPslBin = binKeeperNew(minPslStart, maxPslEnd); agxSeenBin = binKeeperNew(minPslStart, maxPslEnd); for(psl = pslList; psl != NULL; psl = pslNext) { pslNext = psl->next; if(sameString(psl->tName, chrom)) binKeeperAdd(chromPslBin, psl->tStart, psl->tEnd, psl); else pslFree(&psl); } }
struct bed *createBedsFromPsls(char *pslFile, int expCount)
/* Return a list of beds, one per psl in pslFile and in the same order.  Each
 * bed's name is replaced by parseNameFromHgc(qName), its score and expCount
 * are set to 0, and its expIds/expScores arrays are allocated with room for
 * expCount entries.  The psls themselves are freed before returning. */
{
struct psl *pslList = pslLoadAll(pslFile);
struct psl *cur = pslList;
struct bed *beds = NULL;

while (cur != NULL)
    {
    struct bed *item = bedFromPsl(cur);

    /* Swap the name produced by bedFromPsl for the parsed one. */
    freez(&item->name);
    item->name = parseNameFromHgc(cur->qName);
    item->score = 0;
    item->expCount = 0;
    item->expIds = needMem(sizeof(int)*expCount);
    item->expScores = needMem(sizeof(float)*expCount);
    slAddHead(&beds, item);
    cur = cur->next;
    }

/* Beds were pushed on the head, so reverse to restore file order. */
slReverse(&beds);
pslFreeList(&pslList);
return beds;
}
struct psl* doDnaAlignment(struct dnaSeq *seq, char *db, char *blatHost, char *port, char *nibDir, struct hash *tFileCache) /* get the alignment from the blat host for this sequence */ { struct psl *pslList = NULL; int conn =0; struct tempName pslTn; FILE *f = NULL; struct gfOutput *gvo; if(seq == NULL || db == NULL) errAbort("coordConv::doDnaAlignment() - dnaSeq and/or db can't be NULL."); if(strlen(seq->dna) != seq->size) errAbort("coordConv::doDnaAlignment() - there seems to be something fishy about %s: the size doesn't equal the length", seq->name); /* if there are too many n's it can cause the blat server to hang */ if(strstr(seq->dna, "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn") ) return NULL; makeTempName(&pslTn,"ccR", ".psl"); f = mustOpen(pslTn.forCgi, "w"); gvo = gfOutputPsl(920, FALSE, FALSE, f, FALSE, FALSE); gfOutputHead(gvo, f); /* align to genome, both strands */ conn = gfConnect(blatHost, port); gfAlignStrand(&conn, nibDir, seq, FALSE, 20, tFileCache, gvo); reverseComplement(seq->dna, seq->size); conn = gfConnect(blatHost, port); gfAlignStrand(&conn, nibDir, seq, TRUE, 20 , tFileCache, gvo); gfOutputQuery(gvo, f); carefulClose(&f); pslList = pslLoadAll(pslTn.forCgi); remove(pslTn.forCgi); gfOutputFree(&gvo); return pslList; }
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile, char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile)
/* txInfoAssemble - Assemble information from various sources into txInfo table.
 *
 * Reads one 12-column bed row at a time from txBedFile and writes one txInfo
 * row per bed to outFile.  The other inputs are loaded up front into lookup
 * hashes:
 *   cdsEvFile        - cdsEvidence keyed by name (orf size, start/end complete)
 *   txCdsPredictFile - cdsEvidence keyed by name (cds score)
 *   altSpliceFile    - 6-column beds turned into a hash of keepers
 *   exceptionFile    - fills the selenocysteine and alt-start hashes; only
 *                      the selenocysteine hash is consulted in this function
 *   sizePolyAFile    - name -> polyA size
 *   pslFile          - psls keyed by qName (several psls may share a key)
 *   flipFile         - words naming accessions that were flipped
 * Aborts when a transcript that needs an alignment has no psl under its
 * source accession, or none that overlaps the bed. */
{
/* Build up hash of evidence keyed by transcript name. */
struct hash *cdsEvHash = hashNew(18);
struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile);
for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next)
    hashAddUnique(cdsEvHash, cdsEv->name, cdsEv);
verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile);

/* Build up hash of bestorf structures keyed by transcript name */
struct hash *predictHash = hashNew(18);
struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile);
for (predict = predictList; predict != NULL; predict = predict->next)
    hashAddUnique(predictHash, predict->name, predict);
verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile);

/* Build up structure for random access of retained introns */
struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6);
verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile);
struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList);

/* Read in exception info.  altStartHash is filled in here but not used
 * anywhere else in this function. */
struct hash *selenocysteineHash, *altStartHash;
genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash);

/* Read in polyA sizes */
struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile);
verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile);

/* Read in psls.  hashAdd (not hashAddUnique) is used, and the main loop
 * walks every entry stored under one qName. */
struct hash *pslHash = hashNew(20);
struct psl *psl, *pslList = pslLoadAll(pslFile);
for (psl = pslList; psl != NULL; psl = psl->next)
    hashAdd(pslHash, psl->qName, psl);
verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile);

/* Read in accessions that we flipped for better splice sites. */
struct hash *flipHash = hashWordsInFile(flipFile, 0);

/* Open primary gene input and output. */
struct lineFile *lf = lineFileOpen(txBedFile, TRUE);
FILE *f = mustOpen(outFile, "w");

/* Main loop - process each gene */
char *row[12];
while (lineFileRow(lf, row))
    {
    struct bed *bed = bedLoad12(row);
    verbose(3, "Processing %s\n", bed->name);

    /* Initialize info to zero */
    struct txInfo info;
    ZeroVar(&info);

    /* Figure out name, sourceAcc, and isRefSeq from bed->name.
     * info.name borrows bed->name, so info must be written out before the
     * bed is freed at the bottom of the loop. */
    info.name = bed->name;
    info.category = "n/a";
    if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL)
        {
        /* Rfam and tRNA names are used as the source accession unchanged. */
        info.sourceAcc = cloneString(bed->name);
        }
    else
        {
        info.sourceAcc = txAccFromTempName(bed->name);
        }
    info.isRefSeq = startsWith("NM_", info.sourceAcc);

    if (startsWith("antibody.", info.sourceAcc) || startsWith("CCDS", info.sourceAcc) || isRfam(info.sourceAcc) || stringIn("tRNA", info.sourceAcc) != NULL)
        {
        /* Fake up some things for antibody frag and CCDS that don't have alignments.
         * Rfam and tRNA entries take this branch as well. */
        info.sourceSize = bedTotalBlockSize(bed);
        info.aliCoverage = 1.0;
        info.aliIdRatio = 1.0;
        info. genoMapCount = 1;
        }
    else
        {
        /* Loop through all psl's associated with our RNA. Figure out
         * our overlap with each, and pick best one. */
        struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc);
        if (firstPslHel == NULL)
            errAbort("%s is not in %s", info.sourceAcc, pslFile);
        int mapCount = 0;
        /* This psl shadows the function-level psl declared above. */
        struct psl *psl, *bestPsl = NULL;
        int coverage, bestCoverage = 0;
        boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL);
        for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel))
            {
            psl = hel->val;
            mapCount += 1;
            coverage = pslBedOverlap(psl, bed);
            /* Strictly greater: ties keep the earlier psl, and a psl with
             * zero overlap can never become bestPsl. */
            if (coverage > bestCoverage)
                {
                bestCoverage = coverage;
                bestPsl = psl;
                }
            /* If we flipped it, try it on the opposite strand too. */
            if (isFlipped)
                {
                /* Toggle the strand in place, measure, then toggle it back
                 * so the shared psl is left as it was loaded. */
                psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
                coverage = pslBedOverlap(psl, bed);
                if (coverage > bestCoverage)
                    {
                    bestCoverage = coverage;
                    bestPsl = psl;
                    }
                psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
                }
            }
        if (bestPsl == NULL)
            errAbort("%s has no overlapping alignments with %s in %s", bed->name, info.sourceAcc, pslFile);

        /* Figure out and save alignment statistics.  The polyA size (0 when
         * the transcript is not in the hash) is subtracted from the query
         * size before it is used as the coverage denominator. */
        int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0);
        info.sourceSize = bestPsl->qSize - polyA;
        info.aliCoverage = (double)bestCoverage / info.sourceSize;
        info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/ (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch);
        info. genoMapCount = mapCount;
        }

    /* Get orf size and start/end complete from cdsEv.  Only done when the
     * bed has a non-empty thick region; cdsEv may legitimately be missing. */
    if (bed->thickStart < bed->thickEnd)
        {
        cdsEv = hashFindVal(cdsEvHash, bed->name);
        if (cdsEv != NULL)
            {
            info.orfSize = cdsEv->end - cdsEv->start;
            info.startComplete = cdsEv->startComplete;
            info.endComplete = cdsEv->endComplete;
            }
        }

    /* Get score from prediction. */
    predict = hashFindVal(predictHash, bed->name);
    if (predict != NULL)
        info.cdsScore = predict->score;

    /* Figure out nonsense-mediated-decay from bed itself. */
    info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed);

    /* Figure out if retained intron from bed and alt-splice keeper hash */
    info.retainedIntron = hasRetainedIntron(bed, altSpliceHash);
    info.strangeSplice = countStrangeSplices(bed, altSpliceHash);
    info.atacIntrons = countAtacIntrons(bed, altSpliceHash);
    info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash);

    /* Look up selenocysteine info. */
    info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL);

    /* Loop through bed looking for small gaps indicative of frame shift/stop.
     * A gap of 1 or 2 bases flags a genomic frame shift, a gap of 3 flags a
     * genomic stop, and any other gap size counts as a boundary between
     * exons. */
    int i, lastBlock = bed->blockCount-1;
    int exonCount = 1;
    for (i=0; i < lastBlock; ++i)
        {
        int gapStart = bed->chromStarts[i] + bed->blockSizes[i];
        int gapEnd = bed->chromStarts[i+1];
        int gapSize = gapEnd - gapStart;
        switch (gapSize)
            {
            case 1:
            case 2:
                info.genomicFrameShift = TRUE;
                break;
            case 3:
                info.genomicStop = TRUE;
                break;
            default:
                exonCount += 1;
                break;
            }
        }
    info.exonCount = exonCount;

    /* Write info, free bed. */
    txInfoTabOut(&info, f);
    bedFree(&bed);
    }

/* Clean up and go home.  Only the output file is closed here. */
carefulClose(&f);
}
void txGeneCdsMap(char *inBed, char *inInfo, char *inPicks, char *refPepToTxPsl, char *refToPepTab, char *chromSizes, char *cdsToRna, char *rnaToGenome) /* txGeneCdsMap - Create mapping between CDS region of gene and genome. */ { /* Load info into hash. */ struct hash *infoHash = hashNew(18); struct txInfo *info, *infoList = txInfoLoadAll(inInfo); for (info = infoList; info != NULL; info = info->next) hashAdd(infoHash, info->name, info); /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(inPicks, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); hashAdd(pickHash, pick->name, pick); } lineFileClose(&lf); /* Load refPep/tx alignments into hash keyed by tx. */ struct hash *refPslHash = hashNew(18); struct psl *psl, *pslList = pslLoadAll(refPepToTxPsl); for (psl = pslList; psl != NULL; psl = psl->next) hashAdd(refPslHash, psl->tName, psl); struct hash *refToPepHash = hashTwoColumnFile(refToPepTab); struct hash *chromSizeHash = hashNameIntFile(chromSizes); /* Load in bed. */ struct bed *bed, *bedList = bedLoadNAll(inBed, 12); /* Open output, and stream through bedList, writing output. 
*/ FILE *fCdsToRna = mustOpen(cdsToRna, "w"); FILE *fRnaToGenome = mustOpen(rnaToGenome, "w"); int refTotal = 0, refFound = 0; for (bed = bedList; bed != NULL; bed = bed->next) { if (bed->thickStart < bed->thickEnd) { char *chrom = bed->chrom; int chromSize = hashIntVal(chromSizeHash, chrom); info = hashMustFindVal(infoHash, bed->name); pick = hashMustFindVal(pickHash, bed->name); if (info->isRefSeq) { char *refAcc = txAccFromTempName(bed->name); if (!startsWith("NM_", refAcc)) errAbort("Don't think I did find that refSeq acc, got %s", refAcc); char *protAcc = hashMustFindVal(refToPepHash, refAcc); ++refTotal; if (findAndMapPsl(bed, protAcc, refPslHash, chromSize, fCdsToRna)) ++refFound; } else { fakeCdsToMrna(bed, fCdsToRna); } fakeRnaToGenome(bed, chromSize, fRnaToGenome); } } verbose(1, "Missed %d of %d refSeq protein mappings. A small number of RefSeqs just map\n" "to genome in the UTR.\n", refTotal - refFound, refTotal); carefulClose(&fCdsToRna); carefulClose(&fRnaToGenome); }
int main(int argc, char *argv[])
/* Parse --query/--target/--inputFile/--outputFile (-q/-t/-i/-o), write a psl
 * header to the output file, load the psls from the input file and pass them
 * to mapPSLs().  Returns 0 on success or after --help, 1 on a bad or missing
 * option or when the output file cannot be opened or closed. */
{
    /*
     * Arguments/options.  These point straight at optarg (storage owned by
     * argv), so argument length is not limited by a fixed-size buffer.
     */
    char *outputFile = NULL;
    char *inputFile = NULL;
    char *query = NULL;
    char *target = NULL;

    ///////////////////////////////////////////////////////////////////////////
    // (0) Parse the inputs handed by genomeCactus.py / setup stuff.
    ///////////////////////////////////////////////////////////////////////////

    while (1) {
        static struct option long_options[] = {
            { "query", required_argument, 0, 'q' },
            { "target", required_argument, 0, 't' },
            { "outputFile", required_argument, 0, 'o' },
            { "inputFile", required_argument, 0, 'i' },
            { "help", no_argument, 0, 'h' },
            { 0, 0, 0, 0 }
        };

        int option_index = 0;
        int key = getopt_long(argc, argv, "i:o:q:t:h", long_options, &option_index);
        if (key == -1) {
            break;
        }

        switch (key) {
            case 'i':
                inputFile = optarg;
                break;
            case 'o':
                outputFile = optarg;
                break;
            case 'q':
                query = optarg;
                break;
            case 't':
                target = optarg;
                break;
            case 'h':
                usage();
                return 0;
            default:
                usage();
                return 1;
        }
    }

    ///////////////////////////////////////////////////////////////////////////
    // (0) Check the inputs.
    ///////////////////////////////////////////////////////////////////////////

    /* All four options are used unconditionally below, so each is required. */
    if (inputFile == NULL || outputFile == NULL || query == NULL || target == NULL) {
        usage();
        return 1;
    }

    FILE *fileHandle = fopen(outputFile, "w");
    if (fileHandle == NULL) {
        perror(outputFile);
        return 1;
    }
    pslWriteHead(fileHandle);

    struct psl *pslList = pslLoadAll(inputFile);
    mapPSLs(pslList, fileHandle, query, target);

    /* fclose flushes buffered output; a failure here means data was lost. */
    if (fclose(fileHandle) != 0) {
        perror(outputFile);
        return 1;
    }
    return 0;
}