Ejemplo n.º 1
0
/**
 * Test callback: for every ordered pair of genomes found in the subtree
 * below the alignment's root (a genome is also paired with itself), build
 * the column array and the block array and compare them.  Pairs in which
 * either genome has a sequence length of zero are skipped.
 *
 * @param alignment alignment under test; nothing is done when it reports
 *                  zero genomes.
 */
void MappedSegmentColCompareTest::checkCallBack(AlignmentConstPtr alignment)
{
  // Nothing to compare in an empty alignment.
  if (alignment->getNumGenomes() == 0)
  {
    return;
  }

  validateAlignment(alignment);

  // Gather the genomes of the subtree rooted at the alignment's root genome.
  set<const Genome*> genomes;
  hal::getGenomesInSubTree(alignment->openGenome(alignment->getRootName()),
                           genomes);

  typedef set<const Genome*>::iterator GenomeIt;
  for (GenomeIt srcIt = genomes.begin(); srcIt != genomes.end(); ++srcIt)
  {
    for (GenomeIt tgtIt = genomes.begin(); tgtIt != genomes.end(); ++tgtIt)
    {
      const Genome* source = *srcIt;
      const Genome* target = *tgtIt;

      // Both genomes need a non-zero sequence length to be compared.
      const bool bothNonEmpty = source->getSequenceLength() > 0 &&
                                target->getSequenceLength() > 0;
      if (!bothNonEmpty)
      {
        continue;
      }

      // The helpers below read the pair from the _ref / _tgt members.
      _ref = source;
      _tgt = target;
      createColArray();
      createBlockArray();
      compareArrays();
    }
  }
}
Ejemplo n.º 2
0
/**
 * Sanity-check an alignment that was loaded for one level of detail.
 *
 * Throws when the alignment contains no genomes.  In builds where NDEBUG
 * is not defined, and only for the level whose minimum query length is 0,
 * it additionally opens one genome (the first leaf below the root, or the
 * root itself when there are no leaves) and throws unless that genome
 * contains a DNA array.
 *
 * @param minQuery  minimum query length associated with this alignment
 * @param path      path used in the error messages to identify the source
 * @param alignment alignment to check
 * @throws hal_exception when a check fails
 */
void LodManager::checkAlignment(hal_size_t minQuery,
                                const string& path,
                                AlignmentConstPtr alignment)
{
  if (alignment->getNumGenomes() == 0)
  {
    stringstream ss;
    ss << "No genomes found in base alignment specified in " << path;
    throw hal_exception(ss.str());
  }

#ifndef NDEBUG
  if (minQuery == 0)
  {
    vector<string> leafNames = alignment->getLeafNamesBelow(
      alignment->getRootName());
    string name = !leafNames.empty() ? leafNames[0] : alignment->getRootName();
    const Genome* genome = alignment->openGenome(name);

    // Read the flag before closing the genome so the pointer is not used
    // after closeGenome().
    bool seqFound = genome->containsDNAArray();
    alignment->closeGenome(genome);
    if (!seqFound)
    {
      stringstream ss;
      // The leading space keeps the path from running into "must".
      ss << "HAL file for highest level of detail (0) in " << path
         << " must contain DNA sequence information.";
      throw hal_exception(ss.str());
    }
  }
#endif
}
Ejemplo n.º 3
0
/**
 * Entry point of the BED liftover tool.
 *
 * Parses the command line, opens the HAL alignment read-only, opens the
 * source and target genomes, selects the input/output streams (the literal
 * names "stdin" and "stdout" select the standard streams) and runs
 * BlockLiftover::convert().
 *
 * @return 0 on success, 1 when an exception is caught during the run.  A
 *         command-line parsing error prints the usage and exits with 1.
 */
int main(int argc, char** argv)
{
  CLParserPtr optionsParser = initParser();

  string halPath;
  string srcGenomeName;
  string srcBedPath;
  string tgtGenomeName;
  string tgtBedPath;
  bool noDupes;
  bool append;
  int inBedVersion;
  int outBedVersion;
  bool keepExtra;
  bool outPSL;
  bool outPSLWithName;
  bool tab;
  try
  {
    optionsParser->parseOptions(argc, argv);
    halPath = optionsParser->getArgument<string>("halFile");
    srcGenomeName = optionsParser->getArgument<string>("srcGenome");
    srcBedPath =  optionsParser->getArgument<string>("srcBed");
    tgtGenomeName = optionsParser->getArgument<string>("tgtGenome");
    tgtBedPath =  optionsParser->getArgument<string>("tgtBed");
    noDupes = optionsParser->getFlag("noDupes");
    append = optionsParser->getFlag("append");
    inBedVersion = optionsParser->getOption<int>("inBedVersion");
    outBedVersion = optionsParser->getOption<int>("outBedVersion");
    keepExtra = optionsParser->getFlag("keepExtra");
    outPSL = optionsParser->getFlag("outPSL");
    outPSLWithName = optionsParser->getFlag("outPSLWithName");
    tab = optionsParser->getFlag("tab");
  }
  catch(exception& e)
  {
    cerr << e.what() << endl;
    optionsParser->printUsage(cerr);
    exit(1);
  }

  try
  {
    // --outPSLWithName implies --outPSL, and PSL output forces BED version 12.
    if (outPSLWithName == true)
    {
      outPSL = true;
    }
    if (outPSL == true)
    {
      outBedVersion = 12;
    }

    AlignmentConstPtr alignment = openHalAlignmentReadOnly(halPath, 
                                                           optionsParser);
    if (alignment->getNumGenomes() == 0)
    {
      throw hal_exception("hal alignment is empty");
    }

    const Genome* srcGenome = alignment->openGenome(srcGenomeName);
    if (srcGenome == NULL)
    {
      throw hal_exception(string("srcGenome, ") + srcGenomeName + 
                          ", not found in alignment");
    }
    const Genome* tgtGenome = alignment->openGenome(tgtGenomeName);
    if (tgtGenome == NULL)
    {
      throw hal_exception(string("tgtGenome, ") + tgtGenomeName + 
                          ", not found in alignment");
    }
    
    // Input: standard input when the path is literally "stdin".
    ifstream srcBed;
    istream* srcBedPtr;
    if (srcBedPath == "stdin")
    {
      srcBedPtr = &cin;
    }
    else
    {
      srcBed.open(srcBedPath.c_str());
      srcBedPtr = &srcBed;
      if (!srcBed)
      {
        throw hal_exception("Error opening srcBed, " + srcBedPath);
      }
    }
    
    // Output: standard output when the path is literally "stdout";
    // otherwise the file is opened in append mode if --append was given.
    ios_base::openmode mode = append ? ios::out | ios::app : ios_base::out;
    ofstream tgtBed;
    ostream* tgtBedPtr;
    if (tgtBedPath == "stdout")
    {
      tgtBedPtr = &cout;
    }
    else
    {      
      tgtBed.open(tgtBedPath.c_str(), mode);
      tgtBedPtr = &tgtBed;
      if (!tgtBed)
      {
        throw hal_exception("Error opening tgtBed, " + tgtBedPath);
      }
    }

    // The optional tab-separated locale is held by a stack object so that
    // it is released on every exit path, including when convert() throws
    // (a heap-allocated locale would leak in that case).  It is declared
    // before the liftover object and therefore outlives it.
    locale tabLocale;
    locale* inLocale = NULL;
    if (tab == true)
    {
      tabLocale = locale(cin.getloc(), new TabSepFacet(cin.getloc()));
      inLocale = &tabLocale;
      assert(std::isspace('\t', *inLocale) == true);
      assert(std::isspace(' ', *inLocale) == false);
    }
    
    BlockLiftover liftover;
    liftover.convert(alignment, srcGenome, srcBedPtr, tgtGenome, tgtBedPtr,
                     inBedVersion, outBedVersion, keepExtra, !noDupes,
                     outPSL, outPSLWithName, inLocale);
  }
  catch(hal_exception& e)
  {
    cerr << "hal exception caught: " << e.what() << endl;
    return 1;
  }
  catch(exception& e)
  {
    cerr << "Exception caught: " << e.what() << endl;
    return 1;
  }

  return 0;
}
Ejemplo n.º 4
0
/**
 * Entry point of the HAL statistics tool.
 *
 * Registers the command-line options, verifies that at most one of the
 * mutually exclusive query options was given, opens the HAL file read-only
 * and prints the requested information to standard output.  With no query
 * option the version and the overall HalStats summary are printed.
 *
 * String-valued options use the literal two-character string "" (two
 * double quotes) as their "not set" default, which is why the code
 * compares against "\"\"".
 *
 * @return 0 on success, 1 when an exception is caught while querying.  A
 *         command-line error prints the usage and exits with 1.
 */
int main(int argc, char** argv)
{
  CLParserPtr optionsParser = hdf5CLParserInstance();
  optionsParser->setDescription("Retrieve basic statistics from a hal database");
  optionsParser->addArgument("halFile", "path to hal file to analyze");
  optionsParser->addOptionFlag("genomes", "print only a list of genomes "
                               "in alignment", false);
  optionsParser->addOption("sequences", "print list of sequences in given "
                           "genome", "\"\"");
  optionsParser->addOption("sequenceStats", "print stats for each sequence in "
                           "given genome", "\"\"");
  optionsParser->addOption("bedSequences", "print sequences of given genome "
                           "in bed format",
                           "\"\"");
  optionsParser->addOptionFlag("tree", "print only the NEWICK tree", false);
  optionsParser->addOptionFlag("branches", "print list of branches. "
                               "Each branch is specified by the child genome", 
                               false);
  optionsParser->addOption("span", "print branches on path (or spanning tree) "
                           "between comma "
                           "separated list of genomes", "\"\"");
  // Adjacent literals are concatenated verbatim, so each fragment must
  // carry its own separating space.
  optionsParser->addOption("spanRoot", "print genomes on path "
                           "(or spanning tree) between comma "
                           "separated list of genomes.  Different from --span "
                           "only in that the spanning tree root is also "
                           "given", "\"\"");
  optionsParser->addOption("children", "print names of children of given "
                           "genome", "\"\"");
  optionsParser->addOptionFlag("root", "print root genome name", false);
  optionsParser->addOption("parent", "print name of parent of given genome",
                           "\"\"");
  optionsParser->addOption("branchLength", "print branch length between "
                           "given genome and its parent in the tree",
                           "\"\"");
  optionsParser->addOption("numSegments", "print numTopSegments "
                           "numBottomSegments for given genome.",
                           "\"\"");
  optionsParser->addOption("baseComp", "print base composition for given "
                           "genome by sampling every step bases. Parameter "
                           "value is of the form genome,step.  Ex: "
                           "--baseComp human,1000.  The output is of the form "
                           "fraction_of_As fraction_of_Gs fraction_of_Cs "
                           "fraction_of_Ts.", 
                           "\"\"");
  optionsParser->addOption("genomeMetaData", "print metadata for given genome, "
                           "one entry per line, tab-separated.", "\"\"");
  optionsParser->addOption("chromSizes", "print the name and length of each"
                           " sequence in a given genome.  This is a subset"
                           " of the"
                           " information returned by --sequenceStats but is"
                           " useful because it is in the format used by"
                           " wigToBigWig", 
                           "\"\"");
  optionsParser->addOption("percentID",
                           "print % ID of a genome with all other genomes. "
                           "Only non-duplicated and unambiguous sites are "
                           "considered",
                           "\"\"");
  optionsParser->addOption("coverage",
                           "print histogram of coverage of a genome with"
                           " all genomes", "\"\"");
  optionsParser->addOption("topSegments",
                           "print coordinates of all top segments of given"
                           " genome in BED format.", "\"\"");
  optionsParser->addOption("bottomSegments",
                           "print coordinates of all bottom segments of given"
                           " genome in BED format.", "\"\"");
  optionsParser->addOptionFlag("allCoverage",
                               "print histogram of coverage from all genomes to"
                               " all genomes", false);


  string path;
  bool listGenomes;
  string sequencesFromGenome;
  string sequenceStatsFromGenome;
  string bedSequencesFromGenome;
  string spanGenomes;
  string spanRootGenomes;
  bool tree;
  bool branches;
  string childrenFromGenome;
  string parentFromGenome;
  bool printRoot;
  string nameForBL;
  string numSegmentsGenome;
  string baseCompPair;
  string genomeMetaData;
  string chromSizesFromGenome;
  string percentID;
  string coverage;
  string topSegments;
  string bottomSegments;
  bool allCoverage;
  try
  {
    optionsParser->parseOptions(argc, argv);
    path = optionsParser->getArgument<string>("halFile");
    listGenomes = optionsParser->getFlag("genomes");
    sequencesFromGenome = optionsParser->getOption<string>("sequences");
    sequenceStatsFromGenome = optionsParser->getOption<string>("sequenceStats");
    bedSequencesFromGenome = optionsParser->getOption<string>("bedSequences");
    tree = optionsParser->getFlag("tree");
    spanGenomes = optionsParser->getOption<string>("span");
    spanRootGenomes = optionsParser->getOption<string>("spanRoot");
    branches = optionsParser->getFlag("branches");
    childrenFromGenome = optionsParser->getOption<string>("children");
    parentFromGenome = optionsParser->getOption<string>("parent");
    printRoot = optionsParser->getFlag("root");
    nameForBL = optionsParser->getOption<string>("branchLength");
    numSegmentsGenome = optionsParser->getOption<string>("numSegments");
    baseCompPair = optionsParser->getOption<string>("baseComp");
    genomeMetaData = optionsParser->getOption<string>("genomeMetaData");
    chromSizesFromGenome = optionsParser->getOption<string>("chromSizes");
    percentID = optionsParser->getOption<string>("percentID");
    coverage = optionsParser->getOption<string>("coverage");
    topSegments = optionsParser->getOption<string>("topSegments");
    bottomSegments = optionsParser->getOption<string>("bottomSegments");
    allCoverage = optionsParser->getFlag("allCoverage");

    // Count how many of the query options were supplied; more than one is
    // rejected below.
    size_t optCount = listGenomes == true ? 1 : 0;
    if (sequencesFromGenome != "\"\"") ++optCount;
    if (tree == true) ++optCount;
    if (sequenceStatsFromGenome != "\"\"") ++optCount;
    if (bedSequencesFromGenome != "\"\"") ++optCount;
    if (spanGenomes != "\"\"") ++optCount;
    if (spanRootGenomes != "\"\"") ++optCount;
    if (branches) ++optCount;
    if (childrenFromGenome != "\"\"") ++optCount;
    if (parentFromGenome != "\"\"") ++optCount;
    if (printRoot) ++optCount;
    if (nameForBL != "\"\"") ++optCount;
    if (numSegmentsGenome != "\"\"") ++optCount;
    if (baseCompPair != "\"\"") ++optCount;
    if (genomeMetaData != "\"\"") ++optCount;
    if (chromSizesFromGenome != "\"\"") ++optCount;
    if (percentID != "\"\"") ++optCount;
    if (coverage != "\"\"") ++optCount;
    if (topSegments != "\"\"") ++optCount;
    if (bottomSegments != "\"\"") ++optCount;
    if (allCoverage) ++optCount;
    if (optCount > 1)
    {
      throw hal_exception("--genomes, --sequences, --tree, --span, --spanRoot, "
                          "--branches, --sequenceStats, --children, --parent, "
                          "--bedSequences, --root, --numSegments, --baseComp, "
                          "--genomeMetaData, --chromSizes, --percentID, "
                          "--coverage,  --topSegments, --bottomSegments, "
                          "--allCoverage "
                          "and --branchLength options are exclusive");
    }
  }
  catch(exception& e)
  {
    cerr << e.what() << endl;
    optionsParser->printUsage(cerr);
    exit(1);
  }
  try
  {
    AlignmentConstPtr alignment = openHalAlignmentReadOnly(path, optionsParser);

    // NOTE(review): with --genomes on an alignment that has no genomes this
    // condition is false, so control falls through to the final else and
    // the full summary is printed -- confirm this is intended.
    if (listGenomes == true && alignment->getNumGenomes() > 0)
    {
      printGenomes(cout, alignment);
    }
    else if (sequencesFromGenome != "\"\"")
    {
      printSequences(cout, alignment, sequencesFromGenome);
    }
    else if (tree == true)
    {
      cout << alignment->getNewickTree() << endl;
    }
    else if (sequenceStatsFromGenome != "\"\"")
    {
      printSequenceStats(cout, alignment, sequenceStatsFromGenome);
    }
    else if (bedSequencesFromGenome != "\"\"")
    {
      printBedSequenceStats(cout, alignment, bedSequencesFromGenome);
    }
    else if (spanGenomes !=  "\"\"")
    {
      printBranchPath(cout, alignment, chopString(spanGenomes, ","), false);
    }
    else if (spanRootGenomes !=  "\"\"")
    {
      printBranchPath(cout, alignment, chopString(spanRootGenomes, ","), true);
    }
    else if (branches == true)
    {
      printBranches(cout, alignment);
    }
    else if (childrenFromGenome != "\"\"")
    {
      printChildren(cout, alignment, childrenFromGenome);
    }
    else if (parentFromGenome != "\"\"")
    {
      printParent(cout, alignment, parentFromGenome);
    }
    else if (printRoot == true)
    {
      printRootName(cout, alignment);
    }
    else if (nameForBL != "\"\"")
    {
      printBranchLength(cout, alignment, nameForBL);
    }
    else if (numSegmentsGenome != "\"\"")
    {
      printNumSegments(cout, alignment, numSegmentsGenome);
    }
    else if (baseCompPair != "\"\"")
    {
      printBaseComp(cout, alignment, baseCompPair);
    }
    else if (genomeMetaData != "\"\"")
    {
      printGenomeMetaData(cout, alignment, genomeMetaData);
    }
    else if (chromSizesFromGenome != "\"\"")
    {
      printChromSizes(cout, alignment, chromSizesFromGenome);
    }
    else if (percentID != "\"\"")
    {
      printPercentID(cout, alignment, percentID);
    }
    else if (coverage != "\"\"")
    {
      printCoverage(cout, alignment, coverage);
    }
    else if (topSegments != "\"\"")
    {
      printSegments(cout, alignment, topSegments, true);
    }
    else if (bottomSegments != "\"\"")
    {
      printSegments(cout, alignment, bottomSegments, false);
    }
    else if (allCoverage)
    {
      printAllCoverage(cout, alignment);
    }
    else
    {
      // No query option given: print the version and the overall summary.
      HalStats halStats(alignment);
      cout << endl << "hal v" << alignment->getVersion() << "\n" << halStats;
    }
  }
  catch(hal_exception& e)
  {
    cerr << "hal exception caught: " << e.what() << endl;
    return 1;
  }
  catch(exception& e)
  {
    cerr << "Exception caught: " << e.what() << endl;
    return 1;
  }
  
  return 0;
}
Ejemplo n.º 5
0
/**
 * Entry point of the HAL statistics tool.
 *
 * Registers the command-line options, verifies that at most one of the
 * mutually exclusive query options was given, opens the HAL file read-only
 * and prints the requested information to standard output.  With no query
 * option the version and the overall HalStats summary are printed.
 *
 * String-valued options use the literal two-character string "" (two
 * double quotes) as their "not set" default, which is why the code
 * compares against "\"\"".
 *
 * @return 0 on success, 1 when an exception is caught while querying.  A
 *         command-line error prints the usage and exits with 1.
 */
int main(int argc, char** argv)
{
  CLParserPtr optionsParser = hdf5CLParserInstance();
  optionsParser->setDescription("Retrieve basic statistics from a hal database");
  optionsParser->addArgument("halFile", "path to hal file to analyze");
  optionsParser->addOptionFlag("genomes", "print only a list of genomes "
                               "in alignment", false);
  optionsParser->addOption("sequences", "print list of sequences in given "
                           "genome", "\"\"");
  optionsParser->addOption("sequenceStats", "print stats for each sequence in "
                           "given genome", "\"\"");
  optionsParser->addOptionFlag("tree", "print only the NEWICK tree", false);
  optionsParser->addOptionFlag("branches", "print list of branches. "
                               "Each branch is specified by the child genome", 
                               false);
  optionsParser->addOption("span", "print branches on path (or spanning tree) "
                           "between comma "
                           "separated list of genomes", "\"\"");
  // Adjacent literals are concatenated verbatim, so each fragment must
  // carry its own separating space.  The sibling option is --span; no
  // --path option is registered.
  optionsParser->addOption("spanRoot", "print genomes on path "
                           "(or spanning tree) between comma "
                           "separated list of genomes.  Different from --span "
                           "only in that the spanning tree root is also "
                           "given", "\"\"");

  string path;
  bool listGenomes;
  string sequencesFromGenome;
  string sequenceStatsFromGenome;
  string spanGenomes;
  string spanRootGenomes;
  bool tree;
  bool branches;
  try
  {
    optionsParser->parseOptions(argc, argv);
    path = optionsParser->getArgument<string>("halFile");
    listGenomes = optionsParser->getFlag("genomes");
    sequencesFromGenome = optionsParser->getOption<string>("sequences");
    sequenceStatsFromGenome = optionsParser->getOption<string>("sequenceStats");
    tree = optionsParser->getFlag("tree");
    spanGenomes = optionsParser->getOption<string>("span");
    spanRootGenomes = optionsParser->getOption<string>("spanRoot");
    branches = optionsParser->getFlag("branches");

    // Count how many of the query options were supplied; more than one is
    // rejected below.
    size_t optCount = listGenomes == true ? 1 : 0;
    if (sequencesFromGenome != "\"\"") ++optCount;
    if (tree == true) ++optCount;
    if (sequenceStatsFromGenome != "\"\"") ++optCount;
    if (spanGenomes != "\"\"") ++optCount;
    if (spanRootGenomes != "\"\"") ++optCount;
    if (branches) ++optCount;
    if (optCount > 1)
    {
      throw hal_exception("--genomes, --sequences, --tree, --span, "
                          "--spanRoot, --branches "
                          "and --sequenceStats " 
                          "options are mutually exclusive");
    }        
  }
  catch(exception& e)
  {
    cerr << e.what() << endl;
    optionsParser->printUsage(cerr);
    exit(1);
  }
  try
  {
    AlignmentConstPtr alignment = openHalAlignmentReadOnly(path, optionsParser);

    // NOTE(review): with --genomes on an alignment that has no genomes this
    // condition is false, so control falls through to the final else and
    // the full summary is printed -- confirm this is intended.
    if (listGenomes == true && alignment->getNumGenomes() > 0)
    {
      printGenomes(cout, alignment);
    }
    else if (sequencesFromGenome != "\"\"")
    {
      printSequences(cout, alignment, sequencesFromGenome);
    }
    else if (tree == true)
    {
      cout << alignment->getNewickTree() << endl;
    }
    else if (sequenceStatsFromGenome != "\"\"")
    {
      printSequenceStats(cout, alignment, sequenceStatsFromGenome);
    }
    else if (spanGenomes !=  "\"\"")
    {
      printBranchPath(cout, alignment, chopString(spanGenomes, ","), false);
    }
    else if (spanRootGenomes !=  "\"\"")
    {
      printBranchPath(cout, alignment, chopString(spanRootGenomes, ","), true);
    }
    else if (branches == true)
    {
      printBranches(cout, alignment);
    }
    else
    {
      // No query option given: print the version and the overall summary.
      HalStats halStats(alignment);
      cout << endl << "hal v" << alignment->getVersion() << "\n" << halStats;
    }
  }
  catch(hal_exception& e)
  {
    cerr << "hal exception caught: " << e.what() << endl;
    return 1;
  }
  catch(exception& e)
  {
    cerr << "Exception caught: " << e.what() << endl;
    return 1;
  }
  
  return 0;
}