/* remove_self - copy a MAF file to stdout, leaving out alignments whose first
 * component starts after the second component's position.
 *
 * Usage: remove_self maf-file
 *
 * When the second component is on the '+' strand its start is compared
 * directly; when it is on the '-' strand the value compared is
 * srcSize - start - size of that component.
 *
 * Returns 1 when the argument count is wrong, 0 otherwise. */
int main(int argc, char **argv)
{
    struct mafFile *mf;
    struct mafAli *ali;
    struct mafComp *first;
    struct mafComp *second;

    if (argc != 2) {
        printf("remove_self maf-file\n");
        return 1;
    }

    init_scores70();
    mafWriteStart(stdout, 0);
    mf = mafOpen(argv[1], 0);

    /* NOTE(review): alignments returned by mafNext are never released in
     * this loop -- confirm whether callers are expected to free them. */
    while ((ali = mafNext(mf)) != NULL) {
        first = ali->components;
        second = (first != NULL) ? first->next : NULL;

        /* The comparison needs two components.  A block with fewer cannot
         * be tested, so it is written unchanged instead of dereferencing a
         * NULL pointer. */
        if (second == NULL) {
            mafWrite(stdout, ali);
            continue;
        }

        if (second->strand == '+' && first->start > second->start)
            continue;
        if (second->strand == '-'
            && first->start > (second->srcSize - second->start - second->size))
            continue;

        mafWrite(stdout, ali);
    }

    mafFileFree(&mf);
    mafWriteEnd(stdout);
    return 0;
}
void mafAddIRows(char *mafIn, char *twoBitIn, char *mafOut, char *nBedFile) /* mafAddIRows - Filter out maf files. */ { FILE *f = mustOpen(mafOut, "w"); struct twoBitFile *twoBit = twoBitOpen(twoBitIn); struct mafAli *mafList, *maf; struct mafFile *mf = mafOpen(mafIn); struct hash *bedHash = newHash(6); if (nBedFile != NULL) { struct lineFile *lf = lineFileOpen(nBedFile, TRUE); char *row[1]; while (lineFileRow(lf, row)) { addBed(row[0], bedHash); } lineFileClose(&lf); } speciesHash = newHash(6); mafList = readMafs(mf); mafWriteStart(f, mf->scoring); mafFileFree(&mf); chainStrands(strandHeads, bedHash); bridgeSpecies(mafList, speciesList); fillHoles(mafList, speciesList, twoBit); for(maf = mafList; maf ; maf = maf->next) mafWrite(f, maf); }
void mafsInRegion(char *regionFile, char *out, int mafCount, char *mafFiles[]) /* Extract MAFs in regions listed in regin file */ { int i = 0; struct hash *bedHash = NULL; FILE *f = NULL; struct mafFile *mf = NULL; verbose(1, "Extracting from %d files to %s\n", mafCount, out); bedHash = loadRegions(regionFile); /* get scoring scheme */ mf = mafOpen(mafFiles[0]); if (!mf) errAbort("can't open MAF file: %s\n", mafFiles[0]); scoring = cloneString(mf->scoring); mafFileFree(&mf); /* set up output dir */ if (outDir) { dir = out; makeDir(dir); } else f = startOutFile(out); for (i = 0; i < mafCount; i++) extractMafs(mafFiles[i], f, bedHash); if (!outDir) endOutFile(f); }
struct mafAli *mafLoadInRegion2(struct sqlConnection *conn, struct sqlConnection *conn2, char *table, char *chrom, int start, int end, char *file) /* Return list of alignments in region. */ { char **row; unsigned int extFileId = 0; struct mafAli *maf, *mafList = NULL; struct mafFile *mf = NULL; int rowOffset; if (file != NULL) mf = mafOpen(file); struct sqlResult *sr = hRangeQuery(conn, table, chrom, start, end, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { struct scoredRef ref; scoredRefStaticLoad(row + rowOffset, &ref); if ((file != NULL) && (ref.extFile != 0)) errAbort("expect extFile to be zero if file specified\n"); if ((file == NULL) && (ref.extFile == 0)) errAbort("expect extFile to be not zero or file specified\n"); if (ref.extFile != extFileId) { char *path = hExtFileNameC(conn2, "extFile", ref.extFile); mafFileFree(&mf); mf = mafOpen(path); extFileId = ref.extFile; } lineFileSeek(mf->lf, ref.offset, SEEK_SET); maf = mafNext(mf); if (maf == NULL) internalErr(); slAddHead(&mafList, maf); } sqlFreeResult(&sr); mafFileFree(&mf); slReverse(&mafList); /* hRangeQuery may return items out-of-order when bin is used in the query, * so sort here in order to avoid trouble at base-level view: */ slSort(&mafList, mafCmp); return mafList; }
int main (int argc, char **argv) { int nrow = DEFAULT_NROW, which_seq = 0, do_order = 0, misses_allowed = 0, output_mode = -1; float cutoff = DEFAULT_CUTOFF; char *strand = NULL, *id = NULL; struct mafFile *file = NULL; struct mafAli *ali = NULL; struct mafComp *comp = NULL; struct MOTIF *motif = NULL; struct MATCH *matches = NULL; id = ckalloc (STRSIZE); get_args (argc, argv, &output_mode, &file, &id, &motif, &do_order, &nrow, &cutoff, &misses_allowed); strand = ckalloc (sizeof (char) * (nrow + 1)); while (NULL != (ali = mafNext (file))) { for (comp = ali->components, which_seq = 0; comp; comp = comp->next, which_seq++) strand[which_seq] = comp->strand; strand[which_seq] = '\0'; /* skip blocks that don't have all seqs in them */ if (which_seq != nrow) { mafAliFree (&ali); continue; } /* forward strand */ get_matches (&matches, FORWARD, ali, nrow, motif, do_order, misses_allowed); /* reverse strand */ for (comp = ali->components; comp; comp = comp->next) do_revcomp((uchar *)comp->text, ali->textSize ); get_matches (&matches, REVERSE, ali, nrow, motif, do_order, misses_allowed); /* output matches */ if (matches) output_matches (matches, strand, id, nrow, motif); free_match_list (&matches); mafAliFree (&ali); } mafFileFree (&file); free (strand); free_motif_list (&motif); free (id); return 0; }
void reTraceFixMaf(char *mafFileName, char *qaFileName, char *newMafFileName) /* reTraceFixMaf - Add quality line and recompute chrom line in maf. */ { struct mafFile *mFile = mafReadAll(mafFileName); struct qaSeq *qaList = qaRead(qaFileName); struct hash *qaHash = makeQaHash(qaList); struct mafAli *ali; for (ali = mFile->alignments; ali != NULL; ali = ali->next) { if (!ali->components || !ali->components->next) errAbort("Something's up with the maf."); else { struct mafComp *secondSrc = ali->components->next; struct qaSeq *qas = hashMustFindVal(qaHash, secondSrc->src); int i, offset; int length = strlen(secondSrc->text); if (secondSrc->strand == '-') reverseBytes(qas->qa, qas->size); offset = secondSrc->start; AllocArray(secondSrc->quality, length+1); for (i = 0; i < length; i++) { if (secondSrc->text[i] == '-') secondSrc->quality[i] = '-'; else { int q = (int)qas->qa[offset++]; char c = 'F'; if ((q >= 0) && (q < 45)) { q = q / 5; c = '0' + q; } else if ((q >= 45) && (q < 98)) c = '9'; else if (q == 99) c = '0'; else c = 'F'; secondSrc->quality[i] = c; } } secondSrc->quality[length] = '\0'; if (secondSrc->strand == '-') reverseBytes(qas->qa, qas->size); } } mafWriteAll(mFile, newMafFileName); hashFree(&qaHash); qaSeqFreeList(&qaList); mafFileFree(&mFile); }
void mafFileFreeList(struct mafFile **pList)
/* Free every mafFile on the list and leave *pList set to NULL. */
{
struct mafFile *cur = *pList;

while (cur != NULL)
    {
    /* Fetch the successor before the node is handed to mafFileFree. */
    struct mafFile *following = cur->next;

    mafFileFree(&cur);
    cur = following;
    }
*pList = NULL;
}
/* Build a malnSet from the alignment blocks of a MAF file.
 *
 * Each block read from mafFileName is passed through checkMafAli, added to
 * the set with addMafAli and then freed.  maxInputBlkWidth,
 * defaultBranchLength and treelessRootGenome are forwarded unchanged to
 * addMafAli for every block.  malnSet_assert is run on the finished set
 * before it is returned. */
struct malnSet *malnSet_constructFromMaf(struct Genomes *genomes, char *mafFileName, int maxInputBlkWidth, double defaultBranchLength, struct Genome *treelessRootGenome)
{
    struct malnSet *result = malnSet_construct(genomes, mafFileName);
    struct mafFile *in = mafOpen(mafFileName);

    for (;;) {
        struct mafAli *block = mafNext(in);

        if (block == NULL)
            break;
        checkMafAli(block);
        addMafAli(result, block, maxInputBlkWidth, defaultBranchLength,
                  treelessRootGenome);
        mafAliFree(&block);
    }

    malnSet_assert(result);
    mafFileFree(&in);
    return result;
}
void mafStats(char *twoBitFile, char *mafDir, char *outFile) /* mafStats - Calculate basic stats on maf file including species-by-species * coverage and percent ID. */ { struct twoBitFile *tbf = twoBitOpen(twoBitFile); FILE *f = mustOpen(outFile, "w"); struct twoBitIndex *ix; long genomeSize = 0; struct hash *speciesHash = hashNew(0); struct speciesAcc *speciesList = NULL, *species; for (ix = tbf->indexList; ix != NULL; ix = ix->next) { unsigned chromSize = twoBitSeqSizeNoNs(tbf, ix->name); genomeSize += chromSize; char mafFileName[PATH_LEN]; safef(mafFileName, sizeof(mafFileName), "%s/%s.maf", mafDir, ix->name); struct mafFile *mf = mafMayOpen(mafFileName); verbose(1, "processing %s\n", ix->name); if (mf == NULL) { warn("%s doesn't exist", mafFileName); continue; } struct mafAli *maf; while ((maf = mafNext(mf)) != NULL) { struct mafComp *mc; for (mc = maf->components; mc != NULL; mc = mc->next) { if (mc->text != NULL) toUpperN(mc->text, maf->textSize); } addCounts(maf, speciesHash, &speciesList); mafAliFree(&maf); } mafFileFree(&mf); } slReverse(&speciesList); for (species = speciesList; species != NULL; species = species->next) { fprintf(f, "counts: %s\t%ld\t%ld\t%ld\n", species->name, species->covCount, species->aliCount, species->idCount); fprintf(f, "precents: %s\t%4.2f%%\t%4.2f%%\t%4.2f%%\n", species->name, 100.0 * species->covCount/genomeSize, 100.0 * species->aliCount/genomeSize, 100.0 * species->idCount/species->aliCount); } carefulClose(&f); }
void mafToPsl(char *querySrc, char *targetSrc, char *inName, char *outName)
/* mafToPsl - Convert maf to psl format.
 *
 * Every alignment read from inName is handed to mafAliToPsl together with
 * querySrc and targetSrc, with output going to outName. */
{
struct mafFile *in = mafOpen(inName);
FILE *out = mustOpen(outName, "w");
struct mafAli *ali = mafNext(in);

while (ali != NULL)
    {
    mafAliToPsl(querySrc, targetSrc, ali, out);
    mafAliFree(&ali);
    ali = mafNext(in);
    }

carefulClose(&out);
mafFileFree(&in);
}
void mafToFa(char *inName, char *outName)
/* mafToFa - convert maf file to fasta.
 *
 * Each alignment read from inName is written to outName by mafAliToFa and
 * freed before the next one is read. */
{
struct mafFile *in = mafOpen(inName);
FILE *out = mustOpen(outName, "w");

for (;;)
    {
    struct mafAli *ali = mafNext(in);

    if (ali == NULL)
        break;
    mafAliToFa(ali, out);
    mafAliFree(&ali);
    }

carefulClose(&out);
mafFileFree(&in);
}
void extractMafs(char *file, FILE *f, struct hash *regionHash) /* extract MAFs in a file from regions specified in hash */ { char *chrom = NULL; struct bed *bed = NULL; struct mafFile *mf = mafOpen(file); struct mafAli *maf = NULL; struct mafComp *mc; char path[256]; verbose(1, "extracting from %s\n", file); maf = mafNext(mf); while (maf) { mc = maf->components; if (!chrom || differentString(chrom, chromFromSrc(mc->src))) chrom = cloneString(chromFromSrc(mc->src)); /* new chrom */ bed = (struct bed *)hashFindVal(regionHash, chrom); if (!bed) { /* no regions on this chrom -- skip to next chrom */ do mafAliFree(&maf); while (((maf = mafNext(mf)) != NULL) && sameString(chromFromSrc(maf->components->src), chrom)); continue; // start over with this maf } verbose(2, "region: %s:%d-%d\n", bed->chrom, bed->chromStart+1, bed->chromEnd); if (outDir) { if (f) endOutFile(f); safef(path, sizeof (path), "%s/%s.maf", dir, bed->name); f = startOutFile(path); } /* skip mafs before region, stopping if chrom changes */ while (maf && (mc = maf->components) && sameString(chrom, chromFromSrc(mc->src)) && (mc->start + mc->size) <= bed->chromStart) { mafAliFree(&maf); maf = mafNext(mf); } /* extract all mafs and pieces of mafs in region */ while (maf && (mc = maf->components) && sameString(chrom, chromFromSrc(mc->src)) && (bed->chromStart < mc->start + mc->size && bed->chromEnd > mc->start)) { int mafStart = mc->start; int mafEnd = mc->start + mc->size; struct mafAli *full = maf; if (mafStart < bed->chromStart || mafEnd > bed->chromEnd) { full = maf; maf = mafSubsetE(full, mc->src, bed->chromStart, bed->chromEnd, keepInitialGaps); mc = maf->components; } verbose(2, " %s:%d-%d\n", chrom, mc->start+1, mc->start + mc->size); mafWrite(f, maf); struct mafAli *nextMaf = (mafEnd > bed->chromEnd+1) ? 
mafSubset(full, mc->src, bed->chromEnd+1, mafEnd) : mafNext(mf); if (maf != full) mafAliFree(&maf); mafAliFree(&full); maf = nextMaf; } /* get next region */ hashRemove(regionHash, bed->chrom); if (bed->next) hashAdd(regionHash, bed->chrom, bed->next); } mafFileFree(&mf); }
void hgLoadMafSummary(char *db, char *table, char *fileName) /* hgLoadMafSummary - Load a summary table of pairs in a maf into a database. */ { long mafCount = 0, allMafCount = 0; struct mafComp *mcMaster = NULL; struct mafAli *maf; struct mafFile *mf = mafOpen(fileName); struct sqlConnection *conn; FILE *f = hgCreateTabFile(".", table); long componentCount = 0; struct hash *componentHash = newHash(0); if (!test) { conn = sqlConnect(database); mafSummaryTableCreate(conn, table, hGetMinIndexLength(db)); } verbose(1, "Indexing and tabulating %s\n", fileName); /* process mafs */ while ((maf = mafNext(mf)) != NULL) { mcMaster = mafMaster(maf, mf, fileName); allMafCount++; if (mcMaster->srcSize < minSeqSize) continue; while (mcMaster->size > maxSize) { /* break maf into maxSize pieces */ int end = mcMaster->start + maxSize; struct mafAli *subMaf = mafSubset(maf, mcMaster->src, mcMaster->start, end); verbose(3, "Splitting maf %s:%d len %d\n", mcMaster->src, mcMaster->start, mcMaster->size); componentCount += processMaf(subMaf, componentHash, f, mf, fileName); mafAliFree(&subMaf); subMaf = mafSubset(maf, mcMaster->src, end, end + (mcMaster->size - maxSize)); mafAliFree(&maf); maf = subMaf; mcMaster = mafMaster(maf, mf, fileName); } if (mcMaster->size != 0) { /* remainder of maf after splitting off maxSize submafs */ componentCount += processMaf(maf, componentHash, f, mf, fileName); } mafAliFree(&maf); mafCount++; } mafFileFree(&mf); flushSummaryBlocks(componentHash, f); verbose(1, "Created %ld summary blocks from %ld components and %ld mafs from %s\n", summaryCount, componentCount, allMafCount, fileName); if (test) return; verbose(1, "Loading into %s table %s...\n", database, table); hgLoadTabFile(conn, ".", table, &f); verbose(1, "Loading complete"); hgEndUpdate(&conn, "Add %ld maf summary blocks from %s\n", summaryCount, fileName); }