Exemplo n.º 1
0
static CLParserPtr initParser()
{
  CLParserPtr optionsParser = hdf5CLParserInstance(true);
  optionsParser->addArgument("inFile", "existing tree");
  optionsParser->addOption("bottomAlignmentFile", "hal file containing an "
                           "alignment of the genome and its children. "
                           "Required for non-leaf genomes.", "\"\"");
  optionsParser->addOption("topAlignmentFile", "hal file containing an "
                           "alignment of the genome, its parent, and "
                           "its siblings. Required if the genome to be "
                           "replaced is not the root.", "\"\"");
  optionsParser->addArgument("genomeName", "name of genome to be replaced");
  optionsParser->addOptionFlag("noMarkAncestors", "don't mark ancestors for"
                               " update", false);
  return optionsParser;
}
Exemplo n.º 2
0
static CLParserPtr initParser()
{
  CLParserPtr optionsParser = hdf5CLParserInstance();
  optionsParser->addArgument("halFile", "input hal file");
  optionsParser->addArgument("srcGenome", "source genome name");
  optionsParser->addArgument("srcBed", "path of input bed file.  set as stdin "
                             "to stream from standard input");
  optionsParser->addArgument("tgtGenome", "target genome name");
  optionsParser->addArgument("tgtBed", "path of output bed file.  set as stdout"
                             " to stream to standard output.");
  optionsParser->addOptionFlag("noDupes", "do not map between duplications in"
                               " graph.", false);
  optionsParser->addOptionFlag("append", "append results to tgtBed", false);
  optionsParser->addOption("inBedVersion", "bed version of input file "
                           "as integer between 3 and 9 or 12 reflecting "
                           "the number of columns (see bed "
                           "format specification for more details). Will "
                           "be autodetected by default.", 0);
  optionsParser->addOption("outBedVersion", "bed version of output file "
                           "as integer between 3 and 9 or 12 reflecting "
                           "the number of columns (see bed "
                           "format specification for more details). Will "
                           "be same as input by default.", 0);
  optionsParser->addOption("coalescenceLimit", "coalescence limit genome:"
                           " the genome at or above the MRCA of source"
                           " and target at which we stop looking for"
                           " homologies (default: MRCA)",
                           "");
  optionsParser->addOptionFlag("outPSL", "write output in PSL instead of "
                               "bed format. overrides --outBedVersion when "
                               "specified.", false);
  optionsParser->addOptionFlag("outPSLWithName", "write output as input BED name followed by PSL line instead of "
                               "bed format. overrides --outBedVersion when "
                               "specified.", false);
  optionsParser->addOptionFlag("keepExtra", "keep extra columns. these are "
                               "columns in the input beyond the specified or "
                               "detected bed version, and which are cut by "
                               "default.", false);
  optionsParser->addOptionFlag("tab", "input is tab-separated. this allows"
                               " column entries to contain spaces.  if this"
                               " flag is not set, both spaces and tabs are"
                               " used to separate input columns.", false);
  optionsParser->setDescription("Map BED genome interval coordinates between "
                                "two genomes.");
  return optionsParser;
}
Exemplo n.º 3
0
int main(int argc, char** argv)
{
  CLParserPtr optionsParser = hdf5CLParserInstance();
  optionsParser->setDescription("Rertrieve chain (pairwise alignment) "
                                "information from a hal database.\n"
                                "WARNING: THIS TOOL WAS NEVER FINISHED OR"
                                " TESTED. USE AT OWN RISK. PLEASE "
                                "CONSIDER halLiftover --outPSL INSTEAD.");
  optionsParser->addArgument("halFile", "path to hal file to analyze");
  optionsParser->addArgument("genome", "(query) genome to process");
  optionsParser->addOption("sequence", "sequence name in query genome ("
                           "all sequences if not specified)", "\"\"");
  optionsParser->addOption("start", "start position in query genome", 0);
  optionsParser->addOption("length", "maximum length of chain to output.", 0);
  optionsParser->addOption("chainFile", "path for output file.  stdout if not"
                           " specified", "\"\"");
  optionsParser->addOption("maxGap", 
                           "maximum indel length to be considered a gap within"
                           " a chain.", 
                           20);
  

  string halPath;
  string chainPath;
  string genomeName;
  string sequenceName;
  hal_size_t start;
  hal_size_t length;
  hal_size_t maxGap;
  try
  {
    optionsParser->parseOptions(argc, argv);
    halPath = optionsParser->getArgument<string>("halFile");
    genomeName = optionsParser->getArgument<string>("genome");
    sequenceName = optionsParser->getOption<string>("sequence");
    start = optionsParser->getOption<hal_size_t>("start");
    length = optionsParser->getOption<hal_size_t>("length");
    chainPath = optionsParser->getOption<string>("chainFile");
    maxGap = optionsParser->getOption<hal_size_t>("maxGap");
  }
  catch(exception& e)
  {
    cerr << e.what() << endl;
    optionsParser->printUsage(cerr);
    exit(1);
  }
  try
  {
    cerr << "WARNING: THIS TOOL WAS NEVER FINISHED OR TESTED. USE AT OWN RISK."
         << " PLEASE CONSIDER halLiftover --outPSL INSTEAD." <<endl;  

    AlignmentConstPtr alignment = openHalAlignmentReadOnly(halPath,
                                                           optionsParser);
    
    
    const Genome* genome = alignment->openGenome(genomeName);
    if (genome == NULL)
    {
      throw hal_exception(string("Genome not found: ") + genomeName);
    }
    hal_index_t endPosition = 
       length > 0 ? start + length : genome->getSequenceLength();

    const Sequence* sequence = NULL;
    if (sequenceName != "\"\"")
    {
      sequence = genome->getSequence(sequenceName);
      if (sequence == NULL)
      {
        throw hal_exception(string("Sequence not found: ") + sequenceName);
      }
      start += sequence->getStartPosition();
      endPosition =  
         length > 0 ? start + length : sequence->getSequenceLength();
    }

    ofstream ofile;
    ostream& outStream = chainPath == "\"\"" ? cout : ofile;
    if (chainPath != "\"\"")
    {
      ofile.open(chainPath.c_str());
      if (!ofile)
      {
        throw hal_exception(string("Error opening output file ") + 
                            chainPath);
      }
    }

    TopSegmentIteratorConstPtr top = genome->getTopSegmentIterator();
    top->toSite(start, false);
    // do slicing here;
    
    GappedTopSegmentIteratorConstPtr gtop = 
       genome->getGappedTopSegmentIterator(top->getArrayIndex(), maxGap);

    // need to review!
    Chain chain;
    chain._id = 0;
    while (gtop->getRightArrayIndex() < 
           (hal_index_t)genome->getNumTopSegments() &&
           gtop->getLeft()->getStartPosition() < endPosition)
    {
      if (gtop->hasParent() == true)
      {
        hal_offset_t leftOffset = 0;
        if ((hal_index_t)start > gtop->getStartPosition() 
            && (hal_index_t)start < gtop->getEndPosition())
        {
          leftOffset = start - gtop->getStartPosition() ;
        }
        hal_offset_t rightOffset = 0;
        if (endPosition - 1 > gtop->getStartPosition() 
            && endPosition - 1 < gtop->getEndPosition())
        {
          rightOffset = gtop->getEndPosition() + 1 - endPosition;
        }
        // need to do offsets for edge cases
        gtIteratorToChain(gtop, chain, leftOffset, rightOffset);
        outStream << chain;
        ++chain._id;
      }
      gtop->toRight();
    }
  }
  catch(hal_exception& e)
  {
    cerr << "hal exception caught: " << e.what() << endl;
    return 1;
  }
  catch(exception& e)
  {
    cerr << "Exception caught: " << e.what() << endl;
    return 1;
  }
  
  return 0;
}
Exemplo n.º 4
0
int main(int argc, char** argv)
{
  CLParserPtr optionsParser = hdf5CLParserInstance();
  optionsParser->setDescription("Retrieve basic statistics from a hal database");
  optionsParser->addArgument("halFile", "path to hal file to analyze");
  optionsParser->addOptionFlag("genomes", "print only a list of genomes "
                               "in alignment", false);
  optionsParser->addOption("sequences", "print list of sequences in given "
                           "genome", "\"\"");
  optionsParser->addOption("sequenceStats", "print stats for each sequence in "
                           "given genome", "\"\"");
  optionsParser->addOption("bedSequences", "print sequences of given genome "
                           "in bed format",
                           "\"\"");
  optionsParser->addOptionFlag("tree", "print only the NEWICK tree", false);
  optionsParser->addOptionFlag("branches", "print list of branches. "
                               "Each branch is specified by the child genome", 
                               false);
  optionsParser->addOption("span", "print branches on path (or spanning tree) "
                           "between comma "
                           "separated list of genomes", "\"\"");
  optionsParser->addOption("spanRoot", "print genomes on path" 
                           "(or spanning tree) between comma "
                           "separated list of genomes.  Different from --span"
                           "only in that the spanning tree root is also "
                           "given", "\"\"");
  optionsParser->addOption("children", "print names of children of given "
                           "genome", "\"\"");
  optionsParser->addOptionFlag("root", "print root genome name", false);
  optionsParser->addOption("parent", "print name of parent of given genome",
                           "\"\"");
  optionsParser->addOption("branchLength", "print branch length between "
                           "given genome and its parent in the tree",
                           "\"\"");
  optionsParser->addOption("numSegments", "print numTopSegments "
                           "numBottomSegments for given genome.",
                           "\"\"");
  optionsParser->addOption("baseComp", "print base composition for given "
                           "genome by sampling every step bases. Parameter "
                           "value is of the form genome,step.  Ex: "
                           "--baseComp human,1000.  The ouptut is of the form "
                           "fraction_of_As fraction_of_Gs fraction_of_Cs "
                           "fraction_of_Ts.", 
                           "\"\"");
  optionsParser->addOption("genomeMetaData", "print metadata for given genome, "
                           "one entry per line, tab-seperated.", "\"\"");
  optionsParser->addOption("chromSizes", "print the name and length of each"
                           " sequence in a given genome.  This is a subset"
                           " of the"
                           " information returned by --sequenceStats but is"
                           " useful because it is in the format used by"
                           " wigToBigWig", 
                           "\"\"");
  optionsParser->addOption("percentID",
                           "print % ID of a genome with all other genomes."
                           "Only non-duplicated and unambiguous sites are"
                           "considered",
                           "\"\"");
  optionsParser->addOption("coverage",
                           "print histogram of coverage of a genome with"
                           " all genomes", "\"\"");
  optionsParser->addOption("topSegments",
                           "print coordinates of all top segments of given"
                           " genome in BED format.", "\"\"");
  optionsParser->addOption("bottomSegments",
                           "print coordinates of all bottom segments of given"
                           " genome in BED format.", "\"\"");
  optionsParser->addOptionFlag("allCoverage",
                               "print histogram of coverage from all genomes to"
                               " all genomes", false);


  string path;
  bool listGenomes;
  string sequencesFromGenome;
  string sequenceStatsFromGenome;
  string bedSequencesFromGenome;
  string spanGenomes;
  string spanRootGenomes;
  bool tree;
  bool branches;
  string childrenFromGenome;
  string parentFromGenome;
  bool printRoot;
  string nameForBL;
  string numSegmentsGenome;
  string baseCompPair;
  string genomeMetaData;
  string chromSizesFromGenome;
  string percentID;
  string coverage;
  string topSegments;
  string bottomSegments;
  bool allCoverage;
  try
  {
    optionsParser->parseOptions(argc, argv);
    path = optionsParser->getArgument<string>("halFile");
    listGenomes = optionsParser->getFlag("genomes");
    sequencesFromGenome = optionsParser->getOption<string>("sequences");
    sequenceStatsFromGenome = optionsParser->getOption<string>("sequenceStats");
    bedSequencesFromGenome = optionsParser->getOption<string>("bedSequences");
    tree = optionsParser->getFlag("tree");
    spanGenomes = optionsParser->getOption<string>("span");
    spanRootGenomes = optionsParser->getOption<string>("spanRoot");
    branches = optionsParser->getFlag("branches");
    childrenFromGenome = optionsParser->getOption<string>("children");
    parentFromGenome = optionsParser->getOption<string>("parent");
    printRoot = optionsParser->getFlag("root");
    nameForBL = optionsParser->getOption<string>("branchLength");
    numSegmentsGenome = optionsParser->getOption<string>("numSegments");
    baseCompPair = optionsParser->getOption<string>("baseComp");
    genomeMetaData = optionsParser->getOption<string>("genomeMetaData");
    chromSizesFromGenome = optionsParser->getOption<string>("chromSizes");
    percentID = optionsParser->getOption<string>("percentID");
    coverage = optionsParser->getOption<string>("coverage");
    topSegments = optionsParser->getOption<string>("topSegments");
    bottomSegments = optionsParser->getOption<string>("bottomSegments");
    allCoverage = optionsParser->getFlag("allCoverage");

    size_t optCount = listGenomes == true ? 1 : 0;
    if (sequencesFromGenome != "\"\"") ++optCount;
    if (tree == true) ++optCount;
    if (sequenceStatsFromGenome != "\"\"") ++optCount;
    if (bedSequencesFromGenome != "\"\"") ++optCount;
    if (spanGenomes != "\"\"") ++optCount;
    if (spanRootGenomes != "\"\"") ++optCount;
    if (branches) ++ optCount;
    if (childrenFromGenome != "\"\"") ++optCount;
    if (parentFromGenome != "\"\"") ++optCount;
    if (printRoot) ++optCount;
    if (nameForBL != "\"\"") ++optCount;
    if (numSegmentsGenome != "\"\"") ++optCount;
    if (baseCompPair != "\"\"") ++optCount;
    if (genomeMetaData != "\"\"") ++optCount;
    if (chromSizesFromGenome != "\"\"") ++optCount;
    if (percentID != "\"\"") ++optCount;
    if (coverage != "\"\"") ++optCount;
    if (topSegments != "\"\"") ++optCount;
    if (bottomSegments != "\"\"") ++optCount;
    if (allCoverage) ++optCount;
    if (optCount > 1)
    {
      throw hal_exception("--genomes, --sequences, --tree, --span, --spanRoot, "
                          "--branches, --sequenceStats, --children, --parent, "
                          "--bedSequences, --root, --numSegments, --baseComp, "
                          "--genomeMetaData, --chromSizes, --percentID, "
                          "--coverage,  --topSegments, --bottomSegments, "
                          "--allCoverage "
                          "and --branchLength options are exclusive");
    }
  }
  catch(exception& e)
  {
    cerr << e.what() << endl;
    optionsParser->printUsage(cerr);
    exit(1);
  }
  try
  {
    AlignmentConstPtr alignment = openHalAlignmentReadOnly(path, optionsParser);

    if (listGenomes == true && alignment->getNumGenomes() > 0)
    {
      printGenomes(cout, alignment);
    }
    else if (sequencesFromGenome != "\"\"")
    {
      printSequences(cout, alignment, sequencesFromGenome);
    }
    else if (tree == true)
    {
      cout << alignment->getNewickTree() << endl;
    }
    else if (sequenceStatsFromGenome != "\"\"")
    {
      printSequenceStats(cout, alignment, sequenceStatsFromGenome);
    }
    else if (bedSequencesFromGenome != "\"\"")
    {
      printBedSequenceStats(cout, alignment, bedSequencesFromGenome);
    }
    else if (spanGenomes !=  "\"\"")
    {
      printBranchPath(cout, alignment, chopString(spanGenomes, ","), false);
    }
    else if (spanRootGenomes !=  "\"\"")
    {
      printBranchPath(cout, alignment, chopString(spanRootGenomes, ","), true);
    }
    else if (branches == true)
    {
      printBranches(cout, alignment);
    }
    else if (childrenFromGenome != "\"\"")
    {
      printChildren(cout, alignment, childrenFromGenome);
    }
    else if (parentFromGenome != "\"\"")
    {
      printParent(cout, alignment, parentFromGenome);
    }
    else if (printRoot == true)
    {
      printRootName(cout, alignment);
    }
    else if (nameForBL != "\"\"")
    {
      printBranchLength(cout, alignment, nameForBL);
    }
    else if (numSegmentsGenome != "\"\"")
    {
      printNumSegments(cout, alignment, numSegmentsGenome);
    }
    else if (baseCompPair != "\"\"")
    {
      printBaseComp(cout, alignment, baseCompPair);
    }
    else if (genomeMetaData != "\"\"")
    {
      printGenomeMetaData(cout, alignment, genomeMetaData);
    }
    else if (chromSizesFromGenome != "\"\"")
    {
      printChromSizes(cout, alignment, chromSizesFromGenome);
    }
    else if (percentID != "\"\"")
    {
      printPercentID(cout, alignment, percentID);
    }
    else if (coverage != "\"\"") {
      printCoverage(cout, alignment, coverage);
    }
    else if (topSegments != "\"\"") {
      printSegments(cout, alignment, topSegments, true);
    }
    else if (bottomSegments != "\"\"") {
      printSegments(cout, alignment, bottomSegments, false);
    } else if (allCoverage) {
      printAllCoverage(cout, alignment);
    }
    else
    {
      HalStats halStats(alignment);
      cout << endl << "hal v" << alignment->getVersion() << "\n" << halStats;
    }
  }
  catch(hal_exception& e)
  {
    cerr << "hal exception caught: " << e.what() << endl;
    return 1;
  }
  catch(exception& e)
  {
    cerr << "Exception caught: " << e.what() << endl;
    return 1;
  }
  
  return 0;
}
Exemplo n.º 5
0
int main(int argc, char** argv)
{
  CLParserPtr optionsParser = hdf5CLParserInstance();
  optionsParser->setDescription("Write masked intervals of genome into bed "
                                "file");
  optionsParser->addArgument("halFile", "path to hal file to analyze");
  optionsParser->addArgument("genome", "name of genome to process");
  optionsParser->addOption("maskFile", "path to bed file to write to", 
                           "stdout");
  optionsParser->addOption("extend", "extend masked regions by given num. "
                           "of bases.", 0);
  optionsParser->addOption("extendPct", "extend masked regions by percentage"
                           " of their lengths", 0);

  string halPath;
  string genomeName;
  string bedPath;
  hal_size_t extend;
  double extendPct;
  try
  {
    optionsParser->parseOptions(argc, argv);
    halPath = optionsParser->getArgument<string>("halFile");
    genomeName = optionsParser->getArgument<string>("genome");
    bedPath = optionsParser->getOption<string>("maskFile");
    extend = optionsParser->getOption<hal_size_t>("extend");
    extendPct = optionsParser->getOption<double>("extendPct");

    if (extend != 0 && extendPct != 0.)
    {
      throw hal_exception("--extend and --extendPct options are exclusive.");
    }
  }
  catch(exception& e)
  {
    cerr << e.what() << endl;
    optionsParser->printUsage(cerr);
    exit(1);
  }
  try
  {
    AlignmentConstPtr alignment = openHalAlignmentReadOnly(halPath, 
                                                           optionsParser);

    const Genome* genome = alignment->openGenome(genomeName);
    if (genome == NULL)
    {
      throw hal_exception(string("Genome ") + genomeName + " not found.");
    }

    ostream* bedStream = &cout;
    bool newBed = false;
    if (bedPath != "stdout")
    {
      bedStream = new ofstream(bedPath.c_str());
      newBed = true;
    }
    if (!bedStream)
    {
      throw hal_exception(string("Error opening ") + bedPath + " for writing");
    }

    MaskExtractor mask;
    mask.extract(alignment, genome, bedStream, extend, extendPct);

    if (newBed)
    {
      delete bedStream;
    }
  }
  catch(hal_exception& e)
  {
    cerr << "hal exception caught: " << e.what() << endl;
    return 1;
  }
  catch(exception& e)
  {
    cerr << "Exception caught: " << e.what() << endl;
    return 1;
  }
  
  return 0;
}
Exemplo n.º 6
0
int main(int argc, char** argv)
{
  CLParserPtr optionsParser = hdf5CLParserInstance();
  optionsParser->setDescription("Rertrieve basic statics from a hal database");
  optionsParser->addArgument("halFile", "path to hal file to analyze");
  optionsParser->addOptionFlag("genomes", "print only a list of genomes "
                               "in alignment", false);
  optionsParser->addOption("sequences", "print list of sequences in given "
                           "genome", "\"\"");
  optionsParser->addOption("sequenceStats", "print stats for each sequence in "
                           "given genome", "\"\"");
  optionsParser->addOptionFlag("tree", "print only the NEWICK tree", false);
  optionsParser->addOptionFlag("branches", "print list of branches. "
                               "Each branch is specified by the child genome", 
                               false);
  optionsParser->addOption("span", "print branches on path (or spanning tree) "
                           "between comma "
                           "separated list of genomes", "\"\"");
  optionsParser->addOption("spanRoot", "print genomes on path" 
                           "(or spanning tree) between comma "
                           "separated list of genomes.  Different from --path"
                           "only in that the spanning tree root is also "
                           "given", "\"\"");

  string path;
  bool listGenomes;
  string sequencesFromGenome;
  string sequenceStatsFromGenome;
  string spanGenomes;
  string spanRootGenomes;
  bool tree;
  bool branches;
  try
  {
    optionsParser->parseOptions(argc, argv);
    path = optionsParser->getArgument<string>("halFile");
    listGenomes = optionsParser->getFlag("genomes");
    sequencesFromGenome = optionsParser->getOption<string>("sequences");
    sequenceStatsFromGenome = optionsParser->getOption<string>("sequenceStats");
    tree = optionsParser->getFlag("tree");
    spanGenomes = optionsParser->getOption<string>("span");
    spanRootGenomes = optionsParser->getOption<string>("spanRoot");
    branches = optionsParser->getFlag("branches");

    size_t optCount = listGenomes == true ? 1 : 0;
    if (sequencesFromGenome != "\"\"") ++optCount;
    if (tree == true) ++optCount;
    if (sequenceStatsFromGenome != "\"\"") ++optCount;
    if (spanGenomes != "\"\"") ++optCount;
    if (spanRootGenomes != "\"\"") ++optCount;
    if (branches) ++optCount;
    if (optCount > 1)
    {
      throw hal_exception("--genomes, --sequences, --tree, --span, "
                          "--spanRoot, --branches "
                          "and --sequenceStats " 
                          "options are mutually exclusive");
    }        
  }
  catch(exception& e)
  {
    cerr << e.what() << endl;
    optionsParser->printUsage(cerr);
    exit(1);
  }
  try
  {
    AlignmentConstPtr alignment = openHalAlignmentReadOnly(path, optionsParser);

    if (listGenomes == true && alignment->getNumGenomes() > 0)
    {
      printGenomes(cout, alignment);
    }
    else if (sequencesFromGenome != "\"\"")
    {
      printSequences(cout, alignment, sequencesFromGenome);
    }
    else if (tree == true)
    {
      cout << alignment->getNewickTree() << endl;
    }
    else if (sequenceStatsFromGenome != "\"\"")
    {
      printSequenceStats(cout, alignment, sequenceStatsFromGenome);
    }
    else if (spanGenomes !=  "\"\"")
    {
      printBranchPath(cout, alignment, chopString(spanGenomes, ","), false);
    }
    else if (spanRootGenomes !=  "\"\"")
    {
      printBranchPath(cout, alignment, chopString(spanRootGenomes, ","), true);
    }
    else if (branches == true)
    {
      printBranches(cout, alignment);
    }
    else
    {
      HalStats halStats(alignment);
      cout << endl << "hal v" << alignment->getVersion() << "\n" << halStats;
    }
  }
  catch(hal_exception& e)
  {
    cerr << "hal exception caught: " << e.what() << endl;
    return 1;
  }
  catch(exception& e)
  {
    cerr << "Exception caught: " << e.what() << endl;
    return 1;
  }
  
  return 0;
}