/**
 * Build the command-line parser for this tool.
 *
 * Registers, in order: the positional argument "inFile", the options
 * "bottomAlignmentFile" and "topAlignmentFile" (both defaulting to the
 * literal two-character string of two double quotes), the positional
 * argument "genomeName", and the boolean flag "noMarkAncestors"
 * (default false).
 *
 * @return the configured parser.
 */
static CLParserPtr initParser()
{
  // NOTE(review): the meaning of the `true` passed to
  // hdf5CLParserInstance is not visible here -- confirm against its
  // declaration.
  CLParserPtr parser = hdf5CLParserInstance(true);

  // Positional: the file to operate on.
  parser->addArgument("inFile", "existing tree");

  // Optional alignment inputs.
  parser->addOption(
    "bottomAlignmentFile",
    "hal file containing an alignment of the genome and its children. "
    "Required for non-leaf genomes.",
    "\"\"");
  parser->addOption(
    "topAlignmentFile",
    "hal file containing an alignment of the genome, its parent, and "
    "its siblings. Required if the genome to be replaced is not the root.",
    "\"\"");

  // Positional: which genome gets replaced.
  parser->addArgument("genomeName", "name of genome to be replaced");

  parser->addOptionFlag(
    "noMarkAncestors",
    "don't mark ancestors for update",
    false);

  return parser;
}
static CLParserPtr initParser() { CLParserPtr optionsParser = hdf5CLParserInstance(); optionsParser->addArgument("halFile", "input hal file"); optionsParser->addArgument("srcGenome", "source genome name"); optionsParser->addArgument("srcBed", "path of input bed file. set as stdin " "to stream from standard input"); optionsParser->addArgument("tgtGenome", "target genome name"); optionsParser->addArgument("tgtBed", "path of output bed file. set as stdout" " to stream to standard output."); optionsParser->addOptionFlag("noDupes", "do not map between duplications in" " graph.", false); optionsParser->addOptionFlag("append", "append results to tgtBed", false); optionsParser->addOption("inBedVersion", "bed version of input file " "as integer between 3 and 9 or 12 reflecting " "the number of columns (see bed " "format specification for more details). Will " "be autodetected by default.", 0); optionsParser->addOption("outBedVersion", "bed version of output file " "as integer between 3 and 9 or 12 reflecting " "the number of columns (see bed " "format specification for more details). Will " "be same as input by default.", 0); optionsParser->addOption("coalescenceLimit", "coalescence limit genome:" " the genome at or above the MRCA of source" " and target at which we stop looking for" " homologies (default: MRCA)", ""); optionsParser->addOptionFlag("outPSL", "write output in PSL instead of " "bed format. overrides --outBedVersion when " "specified.", false); optionsParser->addOptionFlag("outPSLWithName", "write output as input BED name followed by PSL line instead of " "bed format. overrides --outBedVersion when " "specified.", false); optionsParser->addOptionFlag("keepExtra", "keep extra columns. these are " "columns in the input beyond the specified or " "detected bed version, and which are cut by " "default.", false); optionsParser->addOptionFlag("tab", "input is tab-separated. this allows" " column entries to contain spaces. 
if this" " flag is not set, both spaces and tabs are" " used to separate input columns.", false); optionsParser->setDescription("Map BED genome interval coordinates between " "two genomes."); return optionsParser; }
int main(int argc, char** argv) { CLParserPtr optionsParser = hdf5CLParserInstance(); optionsParser->setDescription("Rertrieve chain (pairwise alignment) " "information from a hal database.\n" "WARNING: THIS TOOL WAS NEVER FINISHED OR" " TESTED. USE AT OWN RISK. PLEASE " "CONSIDER halLiftover --outPSL INSTEAD."); optionsParser->addArgument("halFile", "path to hal file to analyze"); optionsParser->addArgument("genome", "(query) genome to process"); optionsParser->addOption("sequence", "sequence name in query genome (" "all sequences if not specified)", "\"\""); optionsParser->addOption("start", "start position in query genome", 0); optionsParser->addOption("length", "maximum length of chain to output.", 0); optionsParser->addOption("chainFile", "path for output file. stdout if not" " specified", "\"\""); optionsParser->addOption("maxGap", "maximum indel length to be considered a gap within" " a chain.", 20); string halPath; string chainPath; string genomeName; string sequenceName; hal_size_t start; hal_size_t length; hal_size_t maxGap; try { optionsParser->parseOptions(argc, argv); halPath = optionsParser->getArgument<string>("halFile"); genomeName = optionsParser->getArgument<string>("genome"); sequenceName = optionsParser->getOption<string>("sequence"); start = optionsParser->getOption<hal_size_t>("start"); length = optionsParser->getOption<hal_size_t>("length"); chainPath = optionsParser->getOption<string>("chainFile"); maxGap = optionsParser->getOption<hal_size_t>("maxGap"); } catch(exception& e) { cerr << e.what() << endl; optionsParser->printUsage(cerr); exit(1); } try { cerr << "WARNING: THIS TOOL WAS NEVER FINISHED OR TESTED. USE AT OWN RISK." << " PLEASE CONSIDER halLiftover --outPSL INSTEAD." 
<<endl; AlignmentConstPtr alignment = openHalAlignmentReadOnly(halPath, optionsParser); const Genome* genome = alignment->openGenome(genomeName); if (genome == NULL) { throw hal_exception(string("Genome not found: ") + genomeName); } hal_index_t endPosition = length > 0 ? start + length : genome->getSequenceLength(); const Sequence* sequence = NULL; if (sequenceName != "\"\"") { sequence = genome->getSequence(sequenceName); if (sequence == NULL) { throw hal_exception(string("Sequence not found: ") + sequenceName); } start += sequence->getStartPosition(); endPosition = length > 0 ? start + length : sequence->getSequenceLength(); } ofstream ofile; ostream& outStream = chainPath == "\"\"" ? cout : ofile; if (chainPath != "\"\"") { ofile.open(chainPath.c_str()); if (!ofile) { throw hal_exception(string("Error opening output file ") + chainPath); } } TopSegmentIteratorConstPtr top = genome->getTopSegmentIterator(); top->toSite(start, false); // do slicing here; GappedTopSegmentIteratorConstPtr gtop = genome->getGappedTopSegmentIterator(top->getArrayIndex(), maxGap); // need to review! Chain chain; chain._id = 0; while (gtop->getRightArrayIndex() < (hal_index_t)genome->getNumTopSegments() && gtop->getLeft()->getStartPosition() < endPosition) { if (gtop->hasParent() == true) { hal_offset_t leftOffset = 0; if ((hal_index_t)start > gtop->getStartPosition() && (hal_index_t)start < gtop->getEndPosition()) { leftOffset = start - gtop->getStartPosition() ; } hal_offset_t rightOffset = 0; if (endPosition - 1 > gtop->getStartPosition() && endPosition - 1 < gtop->getEndPosition()) { rightOffset = gtop->getEndPosition() + 1 - endPosition; } // need to do offsets for edge cases gtIteratorToChain(gtop, chain, leftOffset, rightOffset); outStream << chain; ++chain._id; } gtop->toRight(); } } catch(hal_exception& e) { cerr << "hal exception caught: " << e.what() << endl; return 1; } catch(exception& e) { cerr << "Exception caught: " << e.what() << endl; return 1; } return 0; }
int main(int argc, char** argv) { CLParserPtr optionsParser = hdf5CLParserInstance(); optionsParser->setDescription("Retrieve basic statistics from a hal database"); optionsParser->addArgument("halFile", "path to hal file to analyze"); optionsParser->addOptionFlag("genomes", "print only a list of genomes " "in alignment", false); optionsParser->addOption("sequences", "print list of sequences in given " "genome", "\"\""); optionsParser->addOption("sequenceStats", "print stats for each sequence in " "given genome", "\"\""); optionsParser->addOption("bedSequences", "print sequences of given genome " "in bed format", "\"\""); optionsParser->addOptionFlag("tree", "print only the NEWICK tree", false); optionsParser->addOptionFlag("branches", "print list of branches. " "Each branch is specified by the child genome", false); optionsParser->addOption("span", "print branches on path (or spanning tree) " "between comma " "separated list of genomes", "\"\""); optionsParser->addOption("spanRoot", "print genomes on path" "(or spanning tree) between comma " "separated list of genomes. Different from --span" "only in that the spanning tree root is also " "given", "\"\""); optionsParser->addOption("children", "print names of children of given " "genome", "\"\""); optionsParser->addOptionFlag("root", "print root genome name", false); optionsParser->addOption("parent", "print name of parent of given genome", "\"\""); optionsParser->addOption("branchLength", "print branch length between " "given genome and its parent in the tree", "\"\""); optionsParser->addOption("numSegments", "print numTopSegments " "numBottomSegments for given genome.", "\"\""); optionsParser->addOption("baseComp", "print base composition for given " "genome by sampling every step bases. Parameter " "value is of the form genome,step. Ex: " "--baseComp human,1000. 
The ouptut is of the form " "fraction_of_As fraction_of_Gs fraction_of_Cs " "fraction_of_Ts.", "\"\""); optionsParser->addOption("genomeMetaData", "print metadata for given genome, " "one entry per line, tab-seperated.", "\"\""); optionsParser->addOption("chromSizes", "print the name and length of each" " sequence in a given genome. This is a subset" " of the" " information returned by --sequenceStats but is" " useful because it is in the format used by" " wigToBigWig", "\"\""); optionsParser->addOption("percentID", "print % ID of a genome with all other genomes." "Only non-duplicated and unambiguous sites are" "considered", "\"\""); optionsParser->addOption("coverage", "print histogram of coverage of a genome with" " all genomes", "\"\""); optionsParser->addOption("topSegments", "print coordinates of all top segments of given" " genome in BED format.", "\"\""); optionsParser->addOption("bottomSegments", "print coordinates of all bottom segments of given" " genome in BED format.", "\"\""); optionsParser->addOptionFlag("allCoverage", "print histogram of coverage from all genomes to" " all genomes", false); string path; bool listGenomes; string sequencesFromGenome; string sequenceStatsFromGenome; string bedSequencesFromGenome; string spanGenomes; string spanRootGenomes; bool tree; bool branches; string childrenFromGenome; string parentFromGenome; bool printRoot; string nameForBL; string numSegmentsGenome; string baseCompPair; string genomeMetaData; string chromSizesFromGenome; string percentID; string coverage; string topSegments; string bottomSegments; bool allCoverage; try { optionsParser->parseOptions(argc, argv); path = optionsParser->getArgument<string>("halFile"); listGenomes = optionsParser->getFlag("genomes"); sequencesFromGenome = optionsParser->getOption<string>("sequences"); sequenceStatsFromGenome = optionsParser->getOption<string>("sequenceStats"); bedSequencesFromGenome = optionsParser->getOption<string>("bedSequences"); tree = 
optionsParser->getFlag("tree"); spanGenomes = optionsParser->getOption<string>("span"); spanRootGenomes = optionsParser->getOption<string>("spanRoot"); branches = optionsParser->getFlag("branches"); childrenFromGenome = optionsParser->getOption<string>("children"); parentFromGenome = optionsParser->getOption<string>("parent"); printRoot = optionsParser->getFlag("root"); nameForBL = optionsParser->getOption<string>("branchLength"); numSegmentsGenome = optionsParser->getOption<string>("numSegments"); baseCompPair = optionsParser->getOption<string>("baseComp"); genomeMetaData = optionsParser->getOption<string>("genomeMetaData"); chromSizesFromGenome = optionsParser->getOption<string>("chromSizes"); percentID = optionsParser->getOption<string>("percentID"); coverage = optionsParser->getOption<string>("coverage"); topSegments = optionsParser->getOption<string>("topSegments"); bottomSegments = optionsParser->getOption<string>("bottomSegments"); allCoverage = optionsParser->getFlag("allCoverage"); size_t optCount = listGenomes == true ? 
1 : 0; if (sequencesFromGenome != "\"\"") ++optCount; if (tree == true) ++optCount; if (sequenceStatsFromGenome != "\"\"") ++optCount; if (bedSequencesFromGenome != "\"\"") ++optCount; if (spanGenomes != "\"\"") ++optCount; if (spanRootGenomes != "\"\"") ++optCount; if (branches) ++ optCount; if (childrenFromGenome != "\"\"") ++optCount; if (parentFromGenome != "\"\"") ++optCount; if (printRoot) ++optCount; if (nameForBL != "\"\"") ++optCount; if (numSegmentsGenome != "\"\"") ++optCount; if (baseCompPair != "\"\"") ++optCount; if (genomeMetaData != "\"\"") ++optCount; if (chromSizesFromGenome != "\"\"") ++optCount; if (percentID != "\"\"") ++optCount; if (coverage != "\"\"") ++optCount; if (topSegments != "\"\"") ++optCount; if (bottomSegments != "\"\"") ++optCount; if (allCoverage) ++optCount; if (optCount > 1) { throw hal_exception("--genomes, --sequences, --tree, --span, --spanRoot, " "--branches, --sequenceStats, --children, --parent, " "--bedSequences, --root, --numSegments, --baseComp, " "--genomeMetaData, --chromSizes, --percentID, " "--coverage, --topSegments, --bottomSegments, " "--allCoverage " "and --branchLength options are exclusive"); } } catch(exception& e) { cerr << e.what() << endl; optionsParser->printUsage(cerr); exit(1); } try { AlignmentConstPtr alignment = openHalAlignmentReadOnly(path, optionsParser); if (listGenomes == true && alignment->getNumGenomes() > 0) { printGenomes(cout, alignment); } else if (sequencesFromGenome != "\"\"") { printSequences(cout, alignment, sequencesFromGenome); } else if (tree == true) { cout << alignment->getNewickTree() << endl; } else if (sequenceStatsFromGenome != "\"\"") { printSequenceStats(cout, alignment, sequenceStatsFromGenome); } else if (bedSequencesFromGenome != "\"\"") { printBedSequenceStats(cout, alignment, bedSequencesFromGenome); } else if (spanGenomes != "\"\"") { printBranchPath(cout, alignment, chopString(spanGenomes, ","), false); } else if (spanRootGenomes != "\"\"") { printBranchPath(cout, 
alignment, chopString(spanRootGenomes, ","), true); } else if (branches == true) { printBranches(cout, alignment); } else if (childrenFromGenome != "\"\"") { printChildren(cout, alignment, childrenFromGenome); } else if (parentFromGenome != "\"\"") { printParent(cout, alignment, parentFromGenome); } else if (printRoot == true) { printRootName(cout, alignment); } else if (nameForBL != "\"\"") { printBranchLength(cout, alignment, nameForBL); } else if (numSegmentsGenome != "\"\"") { printNumSegments(cout, alignment, numSegmentsGenome); } else if (baseCompPair != "\"\"") { printBaseComp(cout, alignment, baseCompPair); } else if (genomeMetaData != "\"\"") { printGenomeMetaData(cout, alignment, genomeMetaData); } else if (chromSizesFromGenome != "\"\"") { printChromSizes(cout, alignment, chromSizesFromGenome); } else if (percentID != "\"\"") { printPercentID(cout, alignment, percentID); } else if (coverage != "\"\"") { printCoverage(cout, alignment, coverage); } else if (topSegments != "\"\"") { printSegments(cout, alignment, topSegments, true); } else if (bottomSegments != "\"\"") { printSegments(cout, alignment, bottomSegments, false); } else if (allCoverage) { printAllCoverage(cout, alignment); } else { HalStats halStats(alignment); cout << endl << "hal v" << alignment->getVersion() << "\n" << halStats; } } catch(hal_exception& e) { cerr << "hal exception caught: " << e.what() << endl; return 1; } catch(exception& e) { cerr << "Exception caught: " << e.what() << endl; return 1; } return 0; }
int main(int argc, char** argv) { CLParserPtr optionsParser = hdf5CLParserInstance(); optionsParser->setDescription("Write masked intervals of genome into bed " "file"); optionsParser->addArgument("halFile", "path to hal file to analyze"); optionsParser->addArgument("genome", "name of genome to process"); optionsParser->addOption("maskFile", "path to bed file to write to", "stdout"); optionsParser->addOption("extend", "extend masked regions by given num. " "of bases.", 0); optionsParser->addOption("extendPct", "extend masked regions by percentage" " of their lengths", 0); string halPath; string genomeName; string bedPath; hal_size_t extend; double extendPct; try { optionsParser->parseOptions(argc, argv); halPath = optionsParser->getArgument<string>("halFile"); genomeName = optionsParser->getArgument<string>("genome"); bedPath = optionsParser->getOption<string>("maskFile"); extend = optionsParser->getOption<hal_size_t>("extend"); extendPct = optionsParser->getOption<double>("extendPct"); if (extend != 0 && extendPct != 0.) { throw hal_exception("--extend and --extendPct options are exclusive."); } } catch(exception& e) { cerr << e.what() << endl; optionsParser->printUsage(cerr); exit(1); } try { AlignmentConstPtr alignment = openHalAlignmentReadOnly(halPath, optionsParser); const Genome* genome = alignment->openGenome(genomeName); if (genome == NULL) { throw hal_exception(string("Genome ") + genomeName + " not found."); } ostream* bedStream = &cout; bool newBed = false; if (bedPath != "stdout") { bedStream = new ofstream(bedPath.c_str()); newBed = true; } if (!bedStream) { throw hal_exception(string("Error opening ") + bedPath + " for writing"); } MaskExtractor mask; mask.extract(alignment, genome, bedStream, extend, extendPct); if (newBed) { delete bedStream; } } catch(hal_exception& e) { cerr << "hal exception caught: " << e.what() << endl; return 1; } catch(exception& e) { cerr << "Exception caught: " << e.what() << endl; return 1; } return 0; }
int main(int argc, char** argv) { CLParserPtr optionsParser = hdf5CLParserInstance(); optionsParser->setDescription("Rertrieve basic statics from a hal database"); optionsParser->addArgument("halFile", "path to hal file to analyze"); optionsParser->addOptionFlag("genomes", "print only a list of genomes " "in alignment", false); optionsParser->addOption("sequences", "print list of sequences in given " "genome", "\"\""); optionsParser->addOption("sequenceStats", "print stats for each sequence in " "given genome", "\"\""); optionsParser->addOptionFlag("tree", "print only the NEWICK tree", false); optionsParser->addOptionFlag("branches", "print list of branches. " "Each branch is specified by the child genome", false); optionsParser->addOption("span", "print branches on path (or spanning tree) " "between comma " "separated list of genomes", "\"\""); optionsParser->addOption("spanRoot", "print genomes on path" "(or spanning tree) between comma " "separated list of genomes. Different from --path" "only in that the spanning tree root is also " "given", "\"\""); string path; bool listGenomes; string sequencesFromGenome; string sequenceStatsFromGenome; string spanGenomes; string spanRootGenomes; bool tree; bool branches; try { optionsParser->parseOptions(argc, argv); path = optionsParser->getArgument<string>("halFile"); listGenomes = optionsParser->getFlag("genomes"); sequencesFromGenome = optionsParser->getOption<string>("sequences"); sequenceStatsFromGenome = optionsParser->getOption<string>("sequenceStats"); tree = optionsParser->getFlag("tree"); spanGenomes = optionsParser->getOption<string>("span"); spanRootGenomes = optionsParser->getOption<string>("spanRoot"); branches = optionsParser->getFlag("branches"); size_t optCount = listGenomes == true ? 
1 : 0; if (sequencesFromGenome != "\"\"") ++optCount; if (tree == true) ++optCount; if (sequenceStatsFromGenome != "\"\"") ++optCount; if (spanGenomes != "\"\"") ++optCount; if (spanRootGenomes != "\"\"") ++optCount; if (branches) ++optCount; if (optCount > 1) { throw hal_exception("--genomes, --sequences, --tree, --span, " "--spanRoot, --branches " "and --sequenceStats " "options are mutually exclusive"); } } catch(exception& e) { cerr << e.what() << endl; optionsParser->printUsage(cerr); exit(1); } try { AlignmentConstPtr alignment = openHalAlignmentReadOnly(path, optionsParser); if (listGenomes == true && alignment->getNumGenomes() > 0) { printGenomes(cout, alignment); } else if (sequencesFromGenome != "\"\"") { printSequences(cout, alignment, sequencesFromGenome); } else if (tree == true) { cout << alignment->getNewickTree() << endl; } else if (sequenceStatsFromGenome != "\"\"") { printSequenceStats(cout, alignment, sequenceStatsFromGenome); } else if (spanGenomes != "\"\"") { printBranchPath(cout, alignment, chopString(spanGenomes, ","), false); } else if (spanRootGenomes != "\"\"") { printBranchPath(cout, alignment, chopString(spanRootGenomes, ","), true); } else if (branches == true) { printBranches(cout, alignment); } else { HalStats halStats(alignment); cout << endl << "hal v" << alignment->getVersion() << "\n" << halStats; } } catch(hal_exception& e) { cerr << "hal exception caught: " << e.what() << endl; return 1; } catch(exception& e) { cerr << "Exception caught: " << e.what() << endl; return 1; } return 0; }