/* remove_self - copy a MAF file to stdout, dropping one orientation of each
 * pairwise block.
 *
 * Usage: remove_self maf-file
 *
 * A block is dropped when the first component's start is greater than the
 * second component's start; for a '-' strand second component the comparison
 * uses srcSize - start - size of that component.  Blocks with fewer than two
 * components have nothing to compare against and are written unchanged.
 *
 * Returns 1 on a usage error, 0 otherwise. */
int main(int argc, char** argv)
{
    struct mafFile* mf;
    struct mafAli* ali;
    struct mafComp* mc;

    if (argc != 2) {
        printf("remove_self maf-file\n");
        return 1;
    }

    init_scores70();
    mafWriteStart(stdout, 0);
    mf = mafOpen(argv[1], 0);

    while ((ali = mafNext(mf)) != NULL) {
        mc = ali->components;

        /* Guard against blocks without a second component: the tests below
         * dereference mc->next and would crash on such input. */
        if (mc == NULL || mc->next == NULL) {
            mafWrite(stdout, ali);
            continue;
        }

        if (mc->next->strand == '+' && mc->start > mc->next->start)
            continue;
        else if (mc->next->strand == '-' &&
                 mc->start > (mc->next->srcSize - mc->next->start - mc->next->size))
            continue;
        else
            mafWrite(stdout, ali);
    }

    mafFileFree(&mf);
    mafWriteEnd(stdout);
    return 0;
}
void mafMeFirst(char *inMaf, char *meFile, char *outMaf) /* mafMeFirst - Move component to top if it is one of the named ones. Useful * in conjunction with mafFrags when you don't want the one with the gene name * to be in the middle.. */ { struct hash *meHash = hashWordsInFile(meFile, 18); struct mafFile *mf = mafOpen(inMaf); FILE *f = mustOpen(outMaf, "w"); mafWriteStart(f, mf->scoring); struct mafAli *maf; while ((maf = mafNext(mf)) != NULL) { struct mafComp *comp = compInHash(maf, meHash); if (comp == NULL) errAbort("No components in %s in maf ending line %d of %s", meFile, mf->lf->lineIx, mf->lf->fileName); slRemoveEl(&maf->components, comp); slAddHead(&maf->components, comp); mafWrite(f, maf); mafAliFree(&maf); } mafWriteEnd(f); carefulClose(&f); }
int main (int argc, char **argv) { int nrow = DEFAULT_NROW, which_seq = 0, do_order = 0, misses_allowed = 0, output_mode = -1; float cutoff = DEFAULT_CUTOFF; char *strand = NULL, *id = NULL; struct mafFile *file = NULL; struct mafAli *ali = NULL; struct mafComp *comp = NULL; struct MOTIF *motif = NULL; struct MATCH *matches = NULL; id = ckalloc (STRSIZE); get_args (argc, argv, &output_mode, &file, &id, &motif, &do_order, &nrow, &cutoff, &misses_allowed); strand = ckalloc (sizeof (char) * (nrow + 1)); while (NULL != (ali = mafNext (file))) { for (comp = ali->components, which_seq = 0; comp; comp = comp->next, which_seq++) strand[which_seq] = comp->strand; strand[which_seq] = '\0'; /* skip blocks that don't have all seqs in them */ if (which_seq != nrow) { mafAliFree (&ali); continue; } /* forward strand */ get_matches (&matches, FORWARD, ali, nrow, motif, do_order, misses_allowed); /* reverse strand */ for (comp = ali->components; comp; comp = comp->next) do_revcomp((uchar *)comp->text, ali->textSize ); get_matches (&matches, REVERSE, ali, nrow, motif, do_order, misses_allowed); /* output matches */ if (matches) output_matches (matches, strand, id, nrow, motif); free_match_list (&matches); mafAliFree (&ali); } mafFileFree (&file); free (strand); free_motif_list (&motif); free (id); return 0; }
struct mafFile *mafReadAll(char *fileName)
/* Open fileName and load every alignment into mf->alignments, in the order
 * mafNext() returned them.  Returns the open mafFile. */
{
struct mafFile *mf = mafOpen(fileName);
struct mafAli *block;

for (block = mafNext(mf); block != NULL; block = mafNext(mf))
    slAddHead(&mf->alignments, block);

/* Head insertion built the list backwards; put it back in read order. */
slReverse(&mf->alignments);
return mf;
}
/* Build a malnSet from the blocks of a MAF file.  Each block is validated
 * with checkMafAli() and handed to addMafAli() together with
 * maxInputBlkWidth, defaultBranchLength and treelessRootGenome; the block is
 * freed afterwards.  The finished set is checked with malnSet_assert()
 * before it is returned. */
struct malnSet *malnSet_constructFromMaf(struct Genomes *genomes, char *mafFileName, int maxInputBlkWidth, double defaultBranchLength, struct Genome *treelessRootGenome) {
    struct malnSet *result = malnSet_construct(genomes, mafFileName);
    struct mafFile *in = mafOpen(mafFileName);
    struct mafAli *blk = mafNext(in);

    while (blk != NULL) {
        checkMafAli(blk);
        addMafAli(result, blk, maxInputBlkWidth, defaultBranchLength, treelessRootGenome);
        mafAliFree(&blk);
        blk = mafNext(in);
    }

    malnSet_assert(result);
    mafFileFree(&in);
    return result;
}
void mafStats(char *twoBitFile, char *mafDir, char *outFile) /* mafStats - Calculate basic stats on maf file including species-by-species * coverage and percent ID. */ { struct twoBitFile *tbf = twoBitOpen(twoBitFile); FILE *f = mustOpen(outFile, "w"); struct twoBitIndex *ix; long genomeSize = 0; struct hash *speciesHash = hashNew(0); struct speciesAcc *speciesList = NULL, *species; for (ix = tbf->indexList; ix != NULL; ix = ix->next) { unsigned chromSize = twoBitSeqSizeNoNs(tbf, ix->name); genomeSize += chromSize; char mafFileName[PATH_LEN]; safef(mafFileName, sizeof(mafFileName), "%s/%s.maf", mafDir, ix->name); struct mafFile *mf = mafMayOpen(mafFileName); verbose(1, "processing %s\n", ix->name); if (mf == NULL) { warn("%s doesn't exist", mafFileName); continue; } struct mafAli *maf; while ((maf = mafNext(mf)) != NULL) { struct mafComp *mc; for (mc = maf->components; mc != NULL; mc = mc->next) { if (mc->text != NULL) toUpperN(mc->text, maf->textSize); } addCounts(maf, speciesHash, &speciesList); mafAliFree(&maf); } mafFileFree(&mf); } slReverse(&speciesList); for (species = speciesList; species != NULL; species = species->next) { fprintf(f, "counts: %s\t%ld\t%ld\t%ld\n", species->name, species->covCount, species->aliCount, species->idCount); fprintf(f, "precents: %s\t%4.2f%%\t%4.2f%%\t%4.2f%%\n", species->name, 100.0 * species->covCount/genomeSize, 100.0 * species->aliCount/genomeSize, 100.0 * species->idCount/species->aliCount); } carefulClose(&f); }
void mafToPsl(char *querySrc, char *targetSrc, char *inName, char *outName)
/* mafToPsl - Convert maf to psl format.  Every alignment read from inName is
 * passed to mafAliToPsl() along with querySrc and targetSrc, writing to
 * outName. */
{
struct mafFile *in = mafOpen(inName);
FILE *out = mustOpen(outName, "w");
struct mafAli *block;

for (block = mafNext(in); block != NULL; block = mafNext(in))
    {
    mafAliToPsl(querySrc, targetSrc, block, out);
    mafAliFree(&block);
    }

carefulClose(&out);
mafFileFree(&in);
}
void mafToFa(char *inName, char *outName)
/* mafToFa - convert maf file to fasta.  Every alignment read from inName is
 * passed to mafAliToFa(), writing to outName. */
{
struct mafFile *in = mafOpen(inName);
FILE *out = mustOpen(outName, "w");
struct mafAli *block;

for (block = mafNext(in); block != NULL; block = mafNext(in))
    {
    mafAliToFa(block, out);
    mafAliFree(&block);
    }

carefulClose(&out);
mafFileFree(&in);
}
struct mafAli *mafLoadInRegion2(struct sqlConnection *conn, struct sqlConnection *conn2, char *table, char *chrom, int start, int end, char *file) /* Return list of alignments in region. */ { char **row; unsigned int extFileId = 0; struct mafAli *maf, *mafList = NULL; struct mafFile *mf = NULL; int rowOffset; if (file != NULL) mf = mafOpen(file); struct sqlResult *sr = hRangeQuery(conn, table, chrom, start, end, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { struct scoredRef ref; scoredRefStaticLoad(row + rowOffset, &ref); if ((file != NULL) && (ref.extFile != 0)) errAbort("expect extFile to be zero if file specified\n"); if ((file == NULL) && (ref.extFile == 0)) errAbort("expect extFile to be not zero or file specified\n"); if (ref.extFile != extFileId) { char *path = hExtFileNameC(conn2, "extFile", ref.extFile); mafFileFree(&mf); mf = mafOpen(path); extFileId = ref.extFile; } lineFileSeek(mf->lf, ref.offset, SEEK_SET); maf = mafNext(mf); if (maf == NULL) internalErr(); slAddHead(&mafList, maf); } sqlFreeResult(&sr); mafFileFree(&mf); slReverse(&mafList); /* hRangeQuery may return items out-of-order when bin is used in the query, * so sort here in order to avoid trouble at base-level view: */ slSort(&mafList, mafCmp); return mafList; }
struct mafAli *readMafs(struct mafFile *mf) { struct mafAli *maf; char buffer[2048]; char buffer2[2048]; struct strandHead *strandHead; struct mafAli *mafList = NULL; while((maf = mafNext(mf)) != NULL) { struct mafComp *mc, *masterMc = maf->components; char *species = buffer; char *chrom; strcpy(species, masterMc->src); chrom = strchr(species,'.'); if (chrom) *chrom++ = 0; if (masterSpecies == NULL) { masterSpecies = cloneString(species); masterChrom = cloneString(chrom); //printf("master %s %s\n",masterSpecies,masterChrom); } else { if (!sameString(masterSpecies, species)) errAbort("first species (%s) not master species (%s)\n",species,masterSpecies); } for(mc= masterMc->next; mc; mc = mc->next) { struct linkBlock *linkBlock; struct subSpecies *subSpecies = NULL; strcpy(species, mc->src); chrom = strchr(species,'.'); *chrom++ = 0; if ((subSpecies = hashFindVal(speciesHash, species)) == NULL) { //printf("new species %s\n",species); AllocVar(subSpecies); subSpecies->name = cloneString(species); subSpecies->hash = newHash(6); subSpecies->blockStatus.strand = '+'; subSpecies->blockStatus.masterStart = masterMc->start; slAddHead(&speciesList, subSpecies); hashAdd(speciesHash, species, subSpecies); } subSpecies->blockStatus.masterEnd = masterMc->start + masterMc->size ; sprintf(buffer2, "%s%c%s", masterChrom,mc->strand,chrom); if ((strandHead = hashFindVal(subSpecies->hash, buffer2)) == NULL) { //printf("new strand %s for species %s\n",buffer2, species); AllocVar(strandHead); hashAdd(subSpecies->hash, buffer2, strandHead); strandHead->name = cloneString(buffer2); strandHead->species = cloneString(species); strandHead->qName = cloneString(chrom); strandHead->qSize = mc->srcSize; strandHead->strand = mc->strand; slAddHead(&strandHeads, strandHead); } AllocVar(linkBlock); linkBlock->mc = mc; linkBlock->cb.qStart = mc->start; linkBlock->cb.qEnd = mc->start + mc->size; linkBlock->cb.tStart = masterMc->start; linkBlock->cb.tEnd = masterMc->start + masterMc->size; 
slAddHead(&strandHead->links, linkBlock); } slAddHead(&mafList, maf); } slReverse(&mafList); return mafList; }
void scanMaf(char *database, char *fileName, struct hash *chromHash, boolean covRestrict, int spCount)
/* Scan through maf file (which must be sorted by
 * chromosome) and fill in coverage histograms on
 * each chromosome.
 *
 * The first component of each alignment is the target.  Its chromosome is
 * the part of the source name after the first '.', or the whole source name
 * when there is no '.'.  Three per-base arrays are kept for the current
 * chromosome: cov (target covered by a block), align (gapless columns) and
 * id (columns where every row matches the target).  When the chromosome
 * changes, the arrays for the previous one are handed to closeChromCov(). */
{
struct mafFile *mf = mafOpen(fileName);
struct mafAli *ali = NULL;
struct mafComp *comp = NULL;
struct chromSizes *lastCs = NULL, *cs = NULL;
char *chrom = NULL;
int start = 0, end = 0, size = 0, j, k;
int idStart = 0, idEnd = 0, idSize = 0;
UBYTE *cov = NULL;
UBYTE *align = NULL;
UBYTE *id = NULL;
char *tPtr[MAXALIGN];
bool hit = FALSE;

while ((ali = mafNext(mf)) != NULL)
    {
    int cCount = slCount(ali->components);
    int i = 1;
    int nextStart, idNextStart;
    char *dot;

    comp = ali->components;
    tPtr[0] = comp->text;
    /* Test the strchr() result before offsetting it: NULL + 1 is never
     * NULL, so checking after the add cannot detect a missing '.'. */
    dot = strchr(comp->src,'.');
    chrom = (dot != NULL) ? dot + 1 : comp->src;
    start = comp->start;
    idStart = comp->start;
    nextStart = idNextStart = start;
    cs = hashMustFindVal(chromHash, chrom);
    if (cs != lastCs)
        {
        /* New chromosome: flush the previous one and allocate fresh arrays. */
        if (lastCs != NULL)
            closeChromCov(fileName, lastCs, &cov, &align, &id);
        AllocArray(cov, cs->totalSize);
        AllocArray(align, cs->totalSize);
        AllocArray(id, cs->totalSize);
        if (covRestrict)
            {
            restrictCov(cov, cs->totalSize, cs->restrictList);
            restrictCov(align, cs->totalSize, cs->restrictList);
            restrictCov(id, cs->totalSize, cs->restrictList);
            }
        restrictGaps(database, cov, cs->totalSize, chrom);
        restrictGaps(database, align, cs->totalSize, chrom);
        restrictGaps(database, id, cs->totalSize, chrom);
        cs->unrestrictedSize = calcUnrestrictedSize(cov, cs->totalSize);
        lastCs = cs;
        }
    /* don't count if few alignments than spCount */
    if ((ali->components->next == NULL) || (cCount < spCount))
        {
        mafAliFree(&ali);
        continue;
        }
    incNoOverflow(cov+start, comp->size);
    for (comp = ali->components->next; comp != NULL; comp = comp->next)
        {
        if (comp->size > 0) // do not process e lines
            {
            tPtr[i] = comp->text;
            i++;
            assert (i < MAXALIGN-1);
            }
        else
            --cCount;
        }
    size = 0;
    assert(cs != NULL);
    /* count gapless columns */
    for (j = 0 ; j<ali->textSize ; j++)
        {
        hit = TRUE;
        /* look for aligning bases in query seqs , abort if any is a gap */
        for (i = 1 ; i < cCount ; i++)
            {
            if (tPtr[i][j] == '-' || tPtr[0][j] == '-')
                {
                /* Close the current gapless run and restart at the next
                 * target position. */
                incNoOverflow(align+start, size);
                cs->totalDepth += size;
                start = nextStart;
                size = 0;
                hit = FALSE;
                break;
                }
            }
        if (hit)
            size++;
        /* target coordinate advances only on non-gap target columns */
        if (tPtr[0][j] != '-')
            nextStart++;
        }
    assert(cs!=NULL);
    end = start+size;
    if (end > cs->totalSize)
        {
        if (cs->name != NULL)
            errAbort("End %d past end %ld of %f\n", end, (long)cs->totalSize, ali->score);
        else
            {
            if (ali!=NULL)
                errAbort("End %d past end %ld %f\n", end, (long)cs->totalSize, ali->score );
            else
                errAbort("End %d past end %ld \n", end, (long)cs->totalSize);
            }
        }
    incNoOverflow(align+start, size-1);
    cs->totalDepth += size-1;
    /* count percent id */
    idSize = 0;
    assert(cs != NULL);
    for (k = 0 ; k<ali->textSize ; k++)
        {
        hit = TRUE;
        char tc = toupper(tPtr[0][k]);
        for (i = 1 ; i < cCount ; i++)
            {
            if (toupper(tPtr[i][k]) != tc || tc == '-' || tc == 'N')
                {
                incNoOverflow(id+idStart, idSize);
                idStart = idNextStart;
                idSize = 0;
                hit = FALSE;
                break;
                }
            }
        if (hit)
            idSize++;
        /* skip over gaps */
        if (tc != '-')
            idNextStart++;
        }
    assert(cs!=NULL);
    idEnd = idStart+idSize;
    if (idEnd > cs->totalSize)
        {
        if (cs->name != NULL)
            errAbort("End %d past end %ld of %f\n", idEnd, (long)cs->totalSize, ali->score);
        else
            {
            if (ali!=NULL)
                errAbort("End %d past end %ld %f\n", idEnd, (long)cs->totalSize, ali->score );
            else
                errAbort("End %d past end %ld \n", idEnd, (long)cs->totalSize);
            }
        }
    incNoOverflow(id+idStart, idSize-1);
    mafAliFree(&ali);
    }
closeChromCov(fileName, cs, &cov, &align, &id);
}
void extractMafs(char *file, FILE *f, struct hash *regionHash) /* extract MAFs in a file from regions specified in hash */ { char *chrom = NULL; struct bed *bed = NULL; struct mafFile *mf = mafOpen(file); struct mafAli *maf = NULL; struct mafComp *mc; char path[256]; verbose(1, "extracting from %s\n", file); maf = mafNext(mf); while (maf) { mc = maf->components; if (!chrom || differentString(chrom, chromFromSrc(mc->src))) chrom = cloneString(chromFromSrc(mc->src)); /* new chrom */ bed = (struct bed *)hashFindVal(regionHash, chrom); if (!bed) { /* no regions on this chrom -- skip to next chrom */ do mafAliFree(&maf); while (((maf = mafNext(mf)) != NULL) && sameString(chromFromSrc(maf->components->src), chrom)); continue; // start over with this maf } verbose(2, "region: %s:%d-%d\n", bed->chrom, bed->chromStart+1, bed->chromEnd); if (outDir) { if (f) endOutFile(f); safef(path, sizeof (path), "%s/%s.maf", dir, bed->name); f = startOutFile(path); } /* skip mafs before region, stopping if chrom changes */ while (maf && (mc = maf->components) && sameString(chrom, chromFromSrc(mc->src)) && (mc->start + mc->size) <= bed->chromStart) { mafAliFree(&maf); maf = mafNext(mf); } /* extract all mafs and pieces of mafs in region */ while (maf && (mc = maf->components) && sameString(chrom, chromFromSrc(mc->src)) && (bed->chromStart < mc->start + mc->size && bed->chromEnd > mc->start)) { int mafStart = mc->start; int mafEnd = mc->start + mc->size; struct mafAli *full = maf; if (mafStart < bed->chromStart || mafEnd > bed->chromEnd) { full = maf; maf = mafSubsetE(full, mc->src, bed->chromStart, bed->chromEnd, keepInitialGaps); mc = maf->components; } verbose(2, " %s:%d-%d\n", chrom, mc->start+1, mc->start + mc->size); mafWrite(f, maf); struct mafAli *nextMaf = (mafEnd > bed->chromEnd+1) ? 
mafSubset(full, mc->src, bed->chromEnd+1, mafEnd) : mafNext(mf); if (maf != full) mafAliFree(&maf); mafAliFree(&full); maf = nextMaf; } /* get next region */ hashRemove(regionHash, bed->chrom); if (bed->next) hashAdd(regionHash, bed->chrom, bed->next); } mafFileFree(&mf); }
void hgLoadMafSummary(char *db, char *table, char *fileName) /* hgLoadMafSummary - Load a summary table of pairs in a maf into a database. */ { long mafCount = 0, allMafCount = 0; struct mafComp *mcMaster = NULL; struct mafAli *maf; struct mafFile *mf = mafOpen(fileName); struct sqlConnection *conn; FILE *f = hgCreateTabFile(".", table); long componentCount = 0; struct hash *componentHash = newHash(0); if (!test) { conn = sqlConnect(database); mafSummaryTableCreate(conn, table, hGetMinIndexLength(db)); } verbose(1, "Indexing and tabulating %s\n", fileName); /* process mafs */ while ((maf = mafNext(mf)) != NULL) { mcMaster = mafMaster(maf, mf, fileName); allMafCount++; if (mcMaster->srcSize < minSeqSize) continue; while (mcMaster->size > maxSize) { /* break maf into maxSize pieces */ int end = mcMaster->start + maxSize; struct mafAli *subMaf = mafSubset(maf, mcMaster->src, mcMaster->start, end); verbose(3, "Splitting maf %s:%d len %d\n", mcMaster->src, mcMaster->start, mcMaster->size); componentCount += processMaf(subMaf, componentHash, f, mf, fileName); mafAliFree(&subMaf); subMaf = mafSubset(maf, mcMaster->src, end, end + (mcMaster->size - maxSize)); mafAliFree(&maf); maf = subMaf; mcMaster = mafMaster(maf, mf, fileName); } if (mcMaster->size != 0) { /* remainder of maf after splitting off maxSize submafs */ componentCount += processMaf(maf, componentHash, f, mf, fileName); } mafAliFree(&maf); mafCount++; } mafFileFree(&mf); flushSummaryBlocks(componentHash, f); verbose(1, "Created %ld summary blocks from %ld components and %ld mafs from %s\n", summaryCount, componentCount, allMafCount, fileName); if (test) return; verbose(1, "Loading into %s table %s...\n", database, table); hgLoadTabFile(conn, ".", table, &f); verbose(1, "Loading complete"); hgEndUpdate(&conn, "Add %ld maf summary blocks from %s\n", summaryCount, fileName); }