Ejemplo n.º 1
0
/**
 * Main variant-calling loop: walks the positions produced by the candidate
 * generator's parser and pushes every resulting variant onto the job server.
 *
 * @param outVCFFile     receives the VCF header when parameters->output is "vcf";
 *                       also handed to the job server with every pushed variant.
 * @param filterVCFFile  receives the same header; also handed to the job server.
 * @param consensusFile  passed to TrackConsensus once per parsed position.
 * @param global_context shared context used for setup, for every pushed job and
 *                       (via its DEBUG member) for the final KillMeNow call.
 * @param parameters     run parameters forwarded to every helper called here.
 */
void ThreadedVariantCaller(ofstream &outVCFFile, ofstream &filterVCFFile, ofstream &consensusFile, InputStructures &global_context, ExtendParameters *parameters) {

  VariantJobServer job_server;
  job_server.SetupJobServer(parameters);
  CandidateGenerationHelper candidates;
  candidates.SetupCandidateGeneration(global_context, parameters);

  // For VCF output, write the same header to both output streams and let the
  // parser's variantCallFile parse it as well.
  if (parameters->output == "vcf") {
    string vcf_header = getVCFHeader(parameters, candidates);
    outVCFFile << vcf_header << endl;
    filterVCFFile << vcf_header << endl;
    candidates.parser->variantCallFile.parseHeader(vcf_header);
  }

  // PURELY R&D branch, to be used only for diagnostic purposes: all targets are
  // removed and new targets spanning just the input variants are generated.
  if (parameters->program_flow.skipCandidateGeneration) {
    justProcessInputVCFCandidates(candidates, parameters);
  }

  // Set whenever the last generated haplotype was longer than one base; on the
  // next position any input variants recorded inside that span get processed.
  bool rescan_haplotype_span = false;
  // @TODO (carried over from the original author): why is this here if we're
  // just setting isHotSpot in variant???
  bool is_hot_spot = false;
  job_server.NewVariant(candidates.parser->variantCallFile);

  while (candidates.parser->getNextAlleles(candidates.samples, candidates.allowedAlleleTypes)) {

    TrackConsensus(consensusFile, candidates, parameters);

    // Check whether any hotspot/input candidate variants fell within the
    // previous haplotype's length of bases; each one is pushed as a hotspot job.
    if (rescan_haplotype_span && !candidates.parser->inputVariantsWithinHaploBases.empty()) {
      for (size_t idx = 0; idx < candidates.parser->inputVariantsWithinHaploBases.size(); ++idx) {
        fillInHotSpotVariant(candidates.parser, candidates.samples, job_server.variant,
                             candidates.parser->inputVariantsWithinHaploBases.at(idx));
        job_server.isHotSpot = true;
        job_server.PushCurVariantOntoJobs(outVCFFile, filterVCFFile, candidates.parser->variantCallFile, global_context, parameters);
      }

      rescan_haplotype_span = false;
      candidates.parser->inputVariantsWithinHaploBases.clear();
    }

    const bool is_candidate = generateCandidateVariant(candidates.parser, candidates.samples, job_server.variant,
                                                       is_hot_spot, parameters, candidates.allowedAlleleTypes);

    // Even if this position is not a valid candidate, the candidate alleles
    // might have spanned a hotspot position, so the flag is raised either way.
    if (candidates.parser->lastHaplotypeLength > 1) {
      rescan_haplotype_span = true;
    }

    if (!is_candidate) {
      continue; // skip the current position
    }

    job_server.isHotSpot = is_hot_spot;
    job_server.PushCurVariantOntoJobs(outVCFFile, filterVCFFile, candidates.parser->variantCallFile, global_context, parameters);
  }

  // NOTE(review): presumably shuts the job server's workers down -- confirm
  // against VariantJobServer::KillMeNow.
  job_server.KillMeNow(global_context.DEBUG);

}
Ejemplo n.º 2
0
int main(int argc, char** argv)
{
	// graphite::AlignmentManager< HTSLibAlignmentReader > tmp;
	unsigned long milliseconds_since_epoch = std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1);
	graphite::Params params;
	params.parseGSSW(argc, argv);
	if (params.showHelp() || !params.validateRequired())
	{
		params.printHelp();
		exit(0);
	}
	auto bamPaths = params.getBAMPaths();
	auto fastaPath = params.getFastaPath();
	auto vcfPaths = params.getInVCFPaths();
	auto outputDirectory = params.getOutputDirectory();
	auto paramRegionPtr = params.getRegion();
	auto swPercent = params.getPercent();
	auto threadCount = params.getThreadCount();
	auto matchValue = params.getMatchValue();
	auto misMatchValue = params.getMisMatchValue();
	auto gapOpenValue = params.getGapOpenValue();
	auto gapExtensionValue = params.getGapExtensionValue();
	auto excludeDuplicates = params.getExcludeDuplicates();
	auto graphSize = params.getGraphSize();
	graphite::FileType fileType = graphite::FileType::ASCII;

	graphite::ThreadPool::Instance()->setThreadCount(threadCount);

	std::vector< graphite::Region::SharedPtr > regionPtrs;
	if (paramRegionPtr == nullptr)
	{
		regionPtrs = graphite::VCFFileReader::GetAllRegionsInVCF(vcfPaths);
	}
	else
	{
		regionPtrs.emplace_back(paramRegionPtr);
	}

	uint32_t readLength = graphite::BamAlignmentManager::GetReadLength(bamPaths);
	graphite::SampleManager::SharedPtr sampleManagerPtr = std::make_shared< graphite::SampleManager >(graphite::BamAlignmentManager::GetSamplePtrs(bamPaths));

	std::unordered_map< std::string, graphite::IFileWriter::SharedPtr > vcfoutPaths;
	for (auto vcfPath : vcfPaths)
	{
		std::string path = vcfPath.substr(vcfPath.find_last_of("/") + 1);
		std::string filePath = outputDirectory + "/" + path;
		uint32_t counter = 1;
		while (graphite::IFile::fileExists(filePath, false))
		{
			std::string extension = vcfPath.substr(vcfPath.find_last_of(".") + 1);
			std::string fileNameWithoutExtension = path.substr(0, path.find_last_of("."));
			filePath = outputDirectory + "/" + fileNameWithoutExtension + "." + std::to_string(counter) + "." + extension;
			++counter;
		}
		// filePath += ".tmp";
		graphite::IFileWriter::SharedPtr fileWriterPtr;
		if (fileType == graphite::FileType::BGZF)
		{
			fileWriterPtr = std::make_shared< graphite::BGZFFileWriter >(filePath);
		}
		else
		{
			fileWriterPtr = std::make_shared< graphite::ASCIIFileWriter >(filePath);
		}
		fileWriterPtr->open();
		vcfoutPaths[vcfPath] = fileWriterPtr;
	}

	std::unordered_set< std::string > outputPaths;
	bool firstTime = true;

	for (uint32_t regionCount = 0; regionCount < regionPtrs.size(); ++regionCount)
	{
		auto alignmentReaderManagerPtr = std::make_shared< graphite::AlignmentReaderManager< graphite::BamAlignmentReader > >(bamPaths, threadCount); // this used to go above this loop but it caused issues with loading bam regions from out-of-order VCFs
		auto regionPtr = regionPtrs[regionCount];
		auto fastaReferencePtr = std::make_shared< graphite::FastaReference >(fastaPath, regionPtr);

		// load variants from vcf
		auto variantManagerPtr = std::make_shared< graphite::VCFManager >(vcfPaths, regionPtr, fastaReferencePtr, readLength);
		variantManagerPtr->asyncLoadVCFs(); // begin the process of loading the vcfs asynchronously

		variantManagerPtr->waitForVCFsToLoadAndProcess(); // wait for vcfs to load into memory

		// load bam alignments
		auto bamAlignmentManager = std::make_shared< graphite::BamAlignmentManager >(sampleManagerPtr, regionPtr, alignmentReaderManagerPtr, excludeDuplicates);
		bamAlignmentManager->loadAlignments(variantManagerPtr);
		// bamAlignmentManager->asyncLoadAlignments(variantManagerPtr, graphSize); // begin the process of loading the alignments asynchronously
		// bamAlignmentManager->waitForAlignmentsToLoad(); // wait for alignments to load into memory

		variantManagerPtr->releaseResources(); // releases the vcf file memory, we no longer need the file resources
		bamAlignmentManager->releaseResources(); // release the bam file into memory, we no longer need the file resources

		std::deque< std::shared_ptr< std::future< void > > > variantManagerFutureFunctions;
		for (auto& iter : variantManagerPtr->getVCFReadersAndVariantListsMap())
		{
			auto futureFunct = graphite::ThreadPool::Instance()->enqueue(std::bind(&graphite::IVariantList::processOverlappingAlleles, iter.second));
			variantManagerFutureFunctions.push_back(futureFunct);
		}
		while (!variantManagerFutureFunctions.empty())
		{
			variantManagerFutureFunctions.front()->wait();
			variantManagerFutureFunctions.pop_front();
		}

		milliseconds_since_epoch = std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1);

		// create an adjudicator for the graph
		auto gsswAdjudicator = std::make_shared< graphite::GSSWAdjudicator >(swPercent, matchValue, misMatchValue, gapOpenValue, gapExtensionValue);

		// the gsswGraphManager adjudicates on the variantManager's variants
		auto gsswGraphManager = std::make_shared< graphite::GraphManager >(fastaReferencePtr, variantManagerPtr, bamAlignmentManager, gsswAdjudicator);
		// auto gsswGraphManager = std::make_shared< graphite::GraphManager >(fastaReferencePtr, variantManagerPtr, alignmentManager, gsswAdjudicator);
		gsswGraphManager->buildGraphs(fastaReferencePtr->getRegion(), readLength);

		graphite::MappingManager::Instance()->evaluateAlignmentMappings(gsswAdjudicator);
		graphite::MappingManager::Instance()->clearRegisteredMappings();

		std::vector< std::shared_ptr< std::thread > > fileWriters;
		auto vcfPathsAndVariantListPtrsMap = variantManagerPtr->getVCFReadersAndVariantListsMap();
		std::deque< std::shared_ptr< std::future< void > > > vcfWriterFutureFunctions;
		for (auto& iter : vcfPathsAndVariantListPtrsMap)
		{
			auto vcfReaderPtr = iter.first;
			auto vcfPath = vcfReaderPtr->getFilePath();
			graphite::IFileWriter::SharedPtr fileWriter = vcfoutPaths[vcfPath];
			std::string currentVCFOutPath = fileWriter->getFilePath();
			auto variantListPtr = iter.second;
			auto vcfHeaderPtr = vcfReaderPtr->getVCFHeader();

			vcfHeaderPtr->registerActiveSample(sampleManagerPtr);
			if (firstTime)
			{
				outputPaths.emplace(currentVCFOutPath);
			}
			auto funct = std::bind(&graphite::VariantList::writeVariantList, variantListPtr, fileWriter, vcfHeaderPtr, firstTime);
			auto functFuture = graphite::ThreadPool::Instance()->enqueue(funct);
			vcfWriterFutureFunctions.push_back(functFuture);
		}

		while (!vcfWriterFutureFunctions.empty())
		{
			vcfWriterFutureFunctions.front()->wait();
			vcfWriterFutureFunctions.pop_front();
		}

		firstTime = false;
	}

	for (auto& iter : vcfoutPaths)
	{
		graphite::IFileWriter::SharedPtr fileWriter = iter.second;
		fileWriter->close();
	}

	// graphite::GSSWAdjudicator* adj_p;
	// std::cout << "adj counts: " << (uint32_t)adj_p->s_adj_count << " [total]" << std::endl;

	return 0;
}