/**
 * Evaluates a tree against a nucleotide alignment stored in a FASTA file.
 *
 * The alignment is read with the RNA alphabet, gap-only sites are removed and
 * remaining gaps are turned into unknown characters. The likelihood is computed
 * with a GTR model (frequencies set from the data) and an 8-class discrete
 * gamma rate distribution.
 *
 * @param tree          Tree in parenthesis format.
 * @param aln_filename  Path of the FASTA alignment.
 * @param optimize_bls  When true, branch lengths and rate-distribution
 *                      parameters are numerically optimized first.
 * @param tolerance     Tolerance handed to the optimizer.
 * @return Pair of (tree in parenthesis format as held by the likelihood
 *         object, minus the value reported by the likelihood object).
 */
pair<string,scalar_type> tree_LL_nucl(string tree,string aln_filename,bool optimize_bls,scalar_type tolerance)
{
  //const Alphabet* alphabet = new ProteicAlphabet();
  const Alphabet* alphabet = new RNA();

  Fasta Reader;
  //NexusIOSequence Reader;
  //Phylip * Reader=new Phylip(true,true,100,true,"\r");
  OrderedSequenceContainer* alignment = Reader.read(aln_filename, alphabet);
  VectorSiteContainer* sites = new VectorSiteContainer(*alignment);
  // The site container was built from a copy of the sequences, so the raw
  // alignment is no longer needed (it used to be leaked).
  delete alignment;

  SiteContainerTools::removeGapOnlySites(*sites);
  SiteContainerTools::changeGapsToUnknownCharacters(*sites);

  TreeTemplate<Node>* ttree1=TreeTemplateTools::parenthesisToTree(tree,false,"ID");

  SubstitutionModel* model = new GTR(&AlphabetTools::RNA_ALPHABET);
  model->setFreqFromData(*sites);
  DiscreteDistribution* rDist = new GammaDiscreteDistribution(8, 1, 1);

  DiscreteRatesAcrossSitesTreeLikelihood* tl1 =
    new RHomogeneousTreeLikelihood(*ttree1, *sites, model, rDist, true, false, false);
  tl1->initialize();

  if (optimize_bls)
    {
      //Newton..
      ParameterList parameters;
      parameters.addParameters( tl1->getBranchLengthsParameters());
      parameters.addParameters( tl1->getRateDistributionParameters());
      OptimizationTools::optimizeNumericalParameters(
        dynamic_cast<DiscreteRatesAcrossSitesTreeLikelihood*> (tl1),
        //tl1->getParameters(),
        parameters,
        0,
        1,
        tolerance,
        1000,
        0,
        0,
        false,
        0,
        OptimizationTools::OPTIMIZATION_NEWTON,
        //OptimizationTools::OPTIMIZATION_BRENT);
        OptimizationTools::OPTIMIZATION_BFGS);
    }

  // Here's your log likelihood value !
  scalar_type LL=- tl1->getValue();
  //tl1->getParameters().printParameters(cout);
  //cout << TreeTemplateTools::treeToParenthesis( tl1->getTree() ) <<endl;

  pair<string,scalar_type> return_pair;
  return_pair.first= TreeTemplateTools::treeToParenthesis( tl1->getTree() ) ;
  return_pair.second=LL;

  // Destroy the likelihood object first: it was handed raw pointers to the
  // model and the rate distribution, so it must not outlive them.
  delete tl1;
  delete model;
  delete rDist;
  // NOTE(review): the likelihood object receives the tree by reference and is
  // expected to keep its own copy (Bio++ behaviour) - the parsed tree is ours.
  delete ttree1;
  delete sites;
  delete alphabet;
  return return_pair;
}
int main(int argc, const char** argv) { usage(argc, argv); string read = argv[1]; string germline = argv[2]; Fasta Vgenes(germline+"V.fa", 2, "|"); Fasta Jgenes(germline+"J.fa", 2, "|"); Fasta interestingV = extractInterestingGenes(Vgenes, argv[3]); Fasta interestingJ = extractInterestingGenes(Jgenes, argv[4]); if (interestingV.size() == 0) { cerr << "No interesting V found" << endl; exit(2); } if (interestingJ.size() == 0) { cerr << "No interesting J found" << endl; exit(2); } AlignBox box_V("5", V_COLOR); AlignBox box_J("3", J_COLOR); if (read == "-") { // Read on stdin read = read_sequence(cin); } align_against_collection(read, interestingV, -1, false, false, false, &box_V, VDJ); align_against_collection(read, interestingJ, -1, false, true, false, &box_J, VDJ); // This should be handled directly into align_against_collection box_J.start = box_J.end ; box_J.del_left = box_J.del_right; box_J.end = read.size() - 1; int align_V_length = min(GENE_ALIGN, box_V.end - box_V.start + 1); int align_J_length = min(GENE_ALIGN, (int)read.size() - box_J.start + 1); int start_V = box_V.end - align_V_length + 1; int end_J = box_J.start + align_J_length - 1; cout << "read \t" << start_V << "\t" ; cout << V_COLOR << read.substr(start_V, align_V_length) << NO_COLOR << read.substr(box_V.end+1, (box_J.start - 1) - (box_V.end + 1) +1) << J_COLOR << read.substr(box_J.start, align_J_length) << NO_COLOR << "\t" << end_J << endl ; cout << box_V.refToString(start_V, end_J) << "\t" << box_V << endl ; cout << box_J.refToString(start_V, end_J) << "\t" << box_J << endl ; exit (0); }
/**
 * Reads a protein alignment from a FASTA file into a site container.
 *
 * @param filename  Path of the FASTA file.
 * @return Shared pointer to the site container built from the file.
 * @throws Exception if the file yields no sequences.
 */
shared_ptr<VectorSiteContainer> SiteContainerBuilder::read_fasta_protein_file(string filename)
{
    Fasta reader;
    SequenceContainer* alignment = reader.readSequences(filename, &AlphabetTools::PROTEIN_ALPHABET);

    shared_ptr<VectorSiteContainer> sequences;
    try {
        sequences.reset(new VectorSiteContainer(*alignment));
    }
    catch (...) {
        // Do not leak the raw container when the conversion fails.
        delete alignment;
        throw;
    }
    delete alignment;

    if (sequences->getNumberOfSequences() == 0) {
        // `sequences` releases the container itself during unwinding.
        throw Exception("The alignment is empty - did you specify the right file format?");
    }
    return sequences;
}
/**
 * Builds a new Fasta holding every entry of `repertoire` whose label contains
 * `name` as a substring, in the order they appear in `repertoire`.
 */
Fasta extractInterestingGenes(Fasta &repertoire, string name)
{
  Fasta selected;
  int total = repertoire.size();

  int idx = 0;
  while (idx < total)
    {
      const bool label_matches = repertoire.label(idx).find(name) != string::npos;
      if (label_matches)
        selected.add(repertoire.read(idx));
      ++idx;
    }

  return selected;
}
/**
 * Reads up to `num_seq` records from `input` into `sequences`, holding
 * `read_mutex` while the stream is consumed.
 *
 * Each destination slot is reset to a default Mask before its id, sequence
 * and quality are filled in. The process exits with status 5 when a read is
 * longer than MAX_READ_LENGTH.
 *
 * @return Number of records stored.
 */
int read_sequences(Auto_Unzip & input, int num_seq, Mask sequences[], Fasta::FASTQ_encoding format_type, bool gui_output)
{
	if (&input == NULL)
		return 0;

	Fasta record;
	record.set_FASTQ_type(format_type);

	int loaded = 0;
	{
		mutex::scoped_lock lock(read_mutex);
		istream & stream = input.filtered();

		for (; not input.eof() and loaded < num_seq; ++loaded) {
			stream >> record;
			if (record.length() > MAX_READ_LENGTH) {
				cerr << "Read " << record.get_id() << " too long. Max allowed read size is " << MAX_READ_LENGTH << endl;
				exit(5);
			}

			// Start from a clean slot, then copy the record in
			sequences[loaded] = Mask();
			Mask & slot = sequences[loaded];
			slot.set_id(record.get_id());
			slot.set_sequence(record.get_sequence());
			slot.set_quality(record.get_quality());
		}

		output_progress(input, gui_output);
	}
	return loaded;
}
/**
 * Builds the alternate-allele contig for a variant: up to `flank` reference
 * bases on each side of the variant around `var.alt`, plus the CIGAR that
 * describes the contig relative to the reference.
 *
 * @param var      Variant (position, reference allele, alternate allele).
 * @param ref      Reference used for sequence length and flank lookups.
 * @param flank    Requested flank length on each side.
 * @param seqname  Name of the reference sequence the variant lies on.
 */
VariantContig::VariantContig(
      RawVariant const& var
    , Fasta& ref
    , int flank
    , std::string const& seqname
    )
{
    uint64_t seqlen = ref.seqlen(seqname);

    // Left flank is shortened when the variant sits near the sequence start.
    uint64_t preflank_len = var.pos <= flank ? var.pos - 1 : flank;
    _start = std::max(1ul, var.pos - preflank_len);
    _stop = std::min(var.pos + var.ref.size() - 1 + flank, seqlen);

    uint64_t postflank_start = var.pos + var.ref.size();
    // Guard the subtraction: when the reference allele reaches past _stop the
    // unsigned difference used to wrap to a huge value, which then ended up in
    // the CIGAR even though no right-flank sequence was appended.
    uint64_t postflank_len = postflank_start <= _stop
        ? _stop - postflank_start + 1
        : 0;

    // build sequence
    if (preflank_len)
        _sequence = ref.sequence(seqname, _start, preflank_len); // left flank

    _sequence += var.alt;

    // A non-zero length implies postflank_start <= _stop <= seqlen.
    if (postflank_len)
        _sequence += ref.sequence(seqname, postflank_start, postflank_len); // right flank

    // build cigar
    if (preflank_len)
        _cigar.push_back(preflank_len, MATCH);

    if (var.ref.size() > var.alt.size()) {
        _cigar.push_back(var.alt.size(), MATCH);
        _cigar.push_back(var.ref.size() - var.alt.size(), DEL);
    }
    else if (var.ref.size() < var.alt.size()) {
        _cigar.push_back(var.ref.size(), MATCH);
        _cigar.push_back(var.alt.size() - var.ref.size(), INS);
    }
    else {
        _cigar.push_back(var.alt.size(), MATCH);
    }

    if (postflank_len)
        _cigar.push_back(postflank_len, MATCH);
}
/**
 * Writes the site container pointed to by `seqs` to `filename` using a
 * default-constructed Fasta writer.
 */
void Alignment::_write_fasta(shared_ptr<VectorSiteContainer> seqs, string filename)
{
    Fasta fasta_out;
    fasta_out.writeAlignment(filename, *seqs);
}
int write_node ( FILE *f, parus::Tree *node, int parent_node_number, int number_output_edge, map<int,Edge> &edges_list, Fasta &fasta, const char *path_to_muscle ) { Edge edge; int flag=0; int num_input_edges=0; int left_edge_number=0; int right_edge_number=0; char str[100]; int i; FILE *file_node_body=NULL; Fasta node_fasta; Sequence_record record; if(node==NULL) return 0; fprintf(f," <NODE_BEGIN>\n"); fprintf(f," number %d\n",node->number); fprintf(f," type 0\n"); fprintf(f," weight 1000\n"); fprintf(f," layer %d\n",node->antilayer); if(node->left!=NULL) { num_input_edges++; } if(node->right!=NULL) { num_input_edges++; } fprintf(f," num_input_edges %d\n",num_input_edges); fprintf(f," edges ( "); if(node->left!=NULL) { edge_number++; left_edge_number=edge_number; fprintf(f," %d ",edge_number); } if(node->right!=NULL) { edge_number++; right_edge_number=edge_number; fprintf(f," %d ",edge_number); } fprintf(f,")\n"); if(number_output_edge==0) { fprintf(f," num_output_edges 0\n"); fprintf(f," edges ( )\n"); } else { fprintf(f," num_output_edges 1\n"); fprintf(f," edges ( %d )\n",number_output_edge); edge.to=parent_node_number; edge.from=node->number; edges_list[number_output_edge]=edge; } fprintf(f," head \"\"\n"); if((node->left!=NULL)&&(node->right!=NULL)) { fprintf(f," body \"process_pair.cpp\"\n"); } if(node->num_names!=0) { fprintf(f," body \"generate_profile.cpp\"\n"); } fprintf(f," tail \"\"\n"); fprintf(f," <NODE_END>\n\n"); flag=write_node ( f, node->left, node->number, left_edge_number, edges_list, fasta, path_to_muscle ); if(flag) return -1; flag=write_node ( f, node->right, node->number, right_edge_number, edges_list, fasta, path_to_muscle ); if(flag) return -1; if((node->left!=NULL)&&(node->right!=NULL)) { file_node_body=fopen("root_graph_program.cpp","a"); if(file_node_body==NULL) { printf("Can't open file 'root_graph_program.cpp'\n"); return -1; } fprintf ( file_node_body, "pairs[%d].set_values( %d, %d );\n", node->number-1, node->left->number, 
node->right->number ); fclose(file_node_body); /* fprintf ( file_node_body, "system(\"%s -profile -in1 data_node_%d.fasta -in2 data_node_%d.fasta -out data_node_%d.fasta \");\n", path_to_muscle, node->left->number, node->right->number, node->number ); */ /* * This code commented because this problem * solved in make_align.sh script */ /* fprintf ( file_node_body, "system(\"rm data_node_%d.fasta data_node_%d.fasta\");\n", node->left->number, node->right->number ); */ } if(node->num_names!=0) { /* fprintf ( file_node_body, "system(\"%s -in fasta_node_%d.fasta -out data_node_%d.fasta \");\n", path_to_muscle, node->number, node->number ); */ /* * This code commented because this problem * solved in make_align.sh script */ /* fprintf ( file_node_body, "system(\"rm fasta_node_%d.fasta \");\n", node->number ); */ for(i=0;i<node->num_names;i++) { fasta.get(node->names[i],record); node_fasta.add(record); } sprintf(str,"fasta_node_%d.fasta",node->number); flag=node_fasta.write(str); if(flag) { printf("The node with number %d can't be written to file '%s'\n",node->number,str); return -1; } } //fclose(file_node_body); return 0; }
int main( int argc, char *argv[]) { // Options bool showHelp = false; string cutSeq = "AAGCTT"; string genomeFile; string bedFile = "stdout"; string faFile = "ends.fa"; CHRPOS readLen = 20; // Show help when has no options if(argc <= 1) { Help(); return 0; } // Parsing options for(int i = 1; i < argc; i++) { int parameterLength = (int)strlen(argv[i]); if((PARAMETER_CHECK("-h", 2, parameterLength)) || (PARAMETER_CHECK("--help", 5, parameterLength))) showHelp=true; else if((PARAMETER_CHECK("-g", 2, parameterLength)) || (PARAMETER_CHECK("--genome", 8, parameterLength))) { if ((++i) < argc) genomeFile = argv[i]; } else if((PARAMETER_CHECK("-c", 2, parameterLength)) || (PARAMETER_CHECK("--cut_seq", 9, parameterLength))) { if ((++i) < argc) cutSeq = argv[i]; } else if ((PARAMETER_CHECK("-b", 2, parameterLength)) || (PARAMETER_CHECK("--bed_output", 12, parameterLength))) { if ((++i) < argc) bedFile=argv[i]; } else if ((PARAMETER_CHECK("-f", 2, parameterLength)) || (PARAMETER_CHECK("--fa_output", 11, parameterLength))) { if ((++i) < argc) faFile=argv[i]; } else if ((PARAMETER_CHECK("-r", 2, parameterLength)) || (PARAMETER_CHECK("--read_len", 10, parameterLength))) { if ((++i) < argc) readLen = StringUtils::toValue<CHRPOS>(argv[i]); } else { cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; showHelp = true; } } // Show help if no proper auguments. if (showHelp) { Help(); return 0; } // Statistical variables map <string, int, less<string> > bedCount; map <string, CHRPOS, less<string> > bedSum; map <string, CHRPOS, less<string> > faSize; // Variables CHRPOS lindex,rindex; int siteLen=cutSeq.size(); bool flag; Fasta curFa; SeqReader fhfa(genomeFile); Writer bedOutput(bedFile); Writer faOutput(faFile); // open files fhfa.open(); bedOutput.open(); faOutput.open(); // Read the genome file. 
while (fhfa.getNext(curFa)) { // Statistics bedCount[curFa.id]=0; bedSum[curFa.id]=0; faSize[curFa.id]=curFa.length(); // Find next recognition site. lindex=rindex=0; flag=true; while (flag) { rindex=curFa.seq.find(cutSeq,lindex); if(rindex==CHRPOS(string::npos)) { rindex=curFa.seq.size(); flag=false; } (*(bedOutput.Printer())) << curFa.id << "\t" << lindex << "\t" << rindex << "\t" << (lindex+rindex)/2 << endl; if (rindex - lindex >= 2*readLen) { bedCount[curFa.id]++; bedSum[curFa.id]+=rindex-lindex; (*(faOutput.Printer())) << ">" << curFa.id << "_" << (lindex+rindex)/2 << "_L" << endl; (*(faOutput.Printer())) << curFa.seq.substr(lindex,readLen) << endl; (*(faOutput.Printer())) << ">" << curFa.id << "_" << (lindex+rindex)/2 << "_R" << endl; (*(faOutput.Printer())) << curFa.seq.substr(rindex-readLen,readLen) << endl; } lindex=rindex+siteLen; } } // close files fhfa.close(); bedOutput.close(); faOutput.close(); // print statistics into log file Writer log(cutSeq+".log"); log.open(); log.close(); return 0; }
/**
 * Evaluates a tree against a protein alignment stored in a FASTA file.
 *
 * Gaps are turned into unknown characters. The likelihood is computed with an
 * LG08 model (frequencies set from the data) and a 4-class discrete gamma
 * rate distribution.
 *
 * @param tree          Tree in parenthesis format.
 * @param aln_filename  Path of the FASTA alignment.
 * @param optimize_bls  When true, branch lengths and rate-distribution
 *                      parameters are numerically optimized first.
 * @param tolerance     Tolerance handed to the optimizer.
 * @return Minus the value reported by the likelihood object.
 */
scalar_type tree_LL(string tree,string aln_filename,bool optimize_bls,scalar_type tolerance)
{
  const Alphabet* alphabet = new ProteicAlphabet();

  Fasta Reader;
  //Phylip * Reader=new Phylip(true,true,100,true,"\r");
  OrderedSequenceContainer* alignment = Reader.read(aln_filename, alphabet);
  VectorSiteContainer* sites = new VectorSiteContainer(*alignment);
  // The site container was built from a copy of the sequences, so the raw
  // alignment is no longer needed (it used to be leaked).
  delete alignment;

  SiteContainerTools::changeGapsToUnknownCharacters(*sites);

  TreeTemplate<Node>* ttree1=TreeTemplateTools::parenthesisToTree(tree,false,"ID");
  //Newick newick1;
  //ttree1 = newick1.read(tree);

  SubstitutionModel* model = new LG08(&AlphabetTools::PROTEIN_ALPHABET, new FullProteinFrequenciesSet(&AlphabetTools::PROTEIN_ALPHABET), true);
  model->setFreqFromData(*sites);
  DiscreteDistribution* rDist = new GammaDiscreteDistribution(4, 1, 1);

  DiscreteRatesAcrossSitesTreeLikelihood* tl1 =
    new RHomogeneousTreeLikelihood(*ttree1, *sites, model, rDist, true, false, false);
  tl1->initialize();

  // An earlier, commented-out variant drove a PseudoNewtonOptimizer by hand
  // (tolerance 0.01, at most 1000 evaluations); OptimizationTools is used now.
  if (optimize_bls)
    {
      //Newton..
      ParameterList parameters;
      parameters.addParameters( tl1->getBranchLengthsParameters());
      parameters.addParameters( tl1->getRateDistributionParameters());
      OptimizationTools::optimizeNumericalParameters(
        dynamic_cast<DiscreteRatesAcrossSitesTreeLikelihood*> (tl1),
        //tl1->getParameters(),
        parameters,
        0,
        1,
        tolerance,
        1000,
        0,
        0,
        false,
        0,
        OptimizationTools::OPTIMIZATION_NEWTON,
        //OptimizationTools::OPTIMIZATION_BRENT);
        OptimizationTools::OPTIMIZATION_BFGS);
    }

  // Here's your log likelihood value !
  scalar_type LL=- tl1->getValue();

  // Destroy the likelihood object first: it was handed raw pointers to the
  // model and the rate distribution, so it must not outlive them.
  delete tl1;
  delete model;
  delete rDist;
  // NOTE(review): the likelihood object receives the tree by reference and is
  // expected to keep its own copy (Bio++ behaviour) - the parsed tree is ours.
  delete ttree1;
  delete sites;
  delete alphabet;
  return LL;
}
int read_sequences(Auto_Unzip & first, Auto_Unzip & second, int num_seq, Mask sequences[], Fasta::FASTQ_encoding format_type, bool gui_output) { if (&first == NULL or &second == NULL) return 0; Fasta read; read.set_FASTQ_type(format_type); int n_seq = 0; { mutex::scoped_lock lock(read_mutex); istream & first_in = first.filtered(); istream & second_in = second.filtered(); while (not first.eof() and not second.eof() and (n_seq + 1) < num_seq) { first_in >> read; if (read.length() > MAX_READ_LENGTH) { cerr << "Read " << read.get_id() << " too long. Max allowed read size is " << MAX_READ_LENGTH << endl; exit(5); } Mask & rf = sequences[n_seq]; rf.set_id(read.get_id()); rf.set_sequence(read.get_sequence()); rf.set_quality(read.get_quality()); n_seq++; second_in >> read; if (read.length() > MAX_READ_LENGTH) { cerr << "Read " << read.get_id() << " too long. Max allowed read size is " << MAX_READ_LENGTH << endl; exit(5); } Mask & rs = sequences[n_seq]; rs.set_id(read.get_id()); rs.set_sequence(read.get_sequence()); rs.set_quality(read.get_quality()); n_seq++; //CHECK!! if (rf.id.compare(0, rf.id.size() - 1, rs.id, 0, rs.id.size() - 1) != 0) { ERROR_CHANNEL << "wrong paired reads IDs: '" << rf.id << "' and '" << rs.id << '\'' << endl; exit(2); } } output_progress(first, gui_output); } return n_seq; }
void Module_DCREATE::compute_master(const Options & options) { string prefix_temp = options.output_file + string("_temp"); DEFAULT_CHANNEL << '[' << my_rank << "] reading input" << endl; // Read all the Fasta files and check for duplicate names vector<Fasta *> multi_fasta; set<string> names; pair<set<string>::iterator,bool> ret; bool all_ok = true; size_t sum = 0; for (vector<string>::const_iterator iter = options.input_files.begin(); iter != options.input_files.end(); iter++) { Auto_Unzip input(iter->c_str()); while (not input.eof()) { Fasta * temp = new Fasta(); input.filtered() >> *temp; sum += temp->length(); multi_fasta.push_back(temp); ret = names.insert(temp->get_id()); if (ret.second == false) { ERROR_CHANNEL << "Error: name \"" << temp->get_id() << "\" already exists!" << endl; all_ok = false; } } } if (not all_ok) { for (int node = 1; node < nprocs; node++) send_sequences_to_slave(node, 0, 0, string(), string()); return; } DEFAULT_CHANNEL << '[' << my_rank << "] sorting" << endl; // sort by length if (options.balancing) sort(multi_fasta.begin(), multi_fasta.end(), sort_reverse_function); DEFAULT_CHANNEL << '[' << my_rank << "] preparing header" << endl; // prepare file for header stringstream header_name; header_name << options.output_file << "_h.dht"; ofstream o(header_name.str().c_str()); for (vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++) o << (*iter)->get_id() << '\t' << (*iter)->get_sequence().size() << endl; o.close(); DEFAULT_CHANNEL << '[' << my_rank << "] preparing temporary files" << endl; // prepare sets and create temp files size_t bins = nprocs; size_t bin_length[bins]; ofstream outputs[bins]; for (size_t i = 0; i < bins; i++) { bin_length[i] = 0; stringstream filename; filename << prefix_temp << '_' << (i+1) << ".fasta"; temp_files.push_back(filename.str()); outputs[i].open(filename.str().c_str()); } DEFAULT_CHANNEL << '[' << my_rank << "] writing to files" << endl; // write to files for 
(vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++) { size_t min_pos = 0; size_t t_min = sum; for (size_t i = 0; i < bins; i++) if (bin_length[i] < t_min) { min_pos = i; t_min = bin_length[min_pos]; } bin_length[min_pos] += (*iter)->length(); outputs[min_pos] << **iter; } for (size_t i = 0; i < bins; i++) { outputs[i].close(); } DEFAULT_CHANNEL << '[' << my_rank << "] sending to slaves" << endl; // send to slaves for (size_t i = 1; i < bins; i++) { stringstream filename_input; stringstream filename_output; filename_input << prefix_temp << '_' << (i+1) << ".fasta"; filename_output << options.output_file << '_' << (i+1) << ".eht"; send_sequences_to_slave(i, options.k, options.blockLength, filename_input.str(), filename_output.str()); } DEFAULT_CHANNEL << '[' << my_rank << "] clearing memory" << endl; // clear memory for (vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++) delete *iter; DEFAULT_CHANNEL << '[' << my_rank << "] computing" << endl; // compute by master process stringstream filename_input; stringstream filename_output; filename_input << prefix_temp << "_1.fasta"; filename_output << options.output_file << "_1.eht"; compute_hash(options.k,options.blockLength,filename_input.str().c_str(), filename_output.str().c_str(),false); //TODO handle methyl_hash DEFAULT_CHANNEL << '[' << my_rank << "] finishing" << endl; stringstream filename_numberfile; filename_numberfile << options.output_file << "_n.dht"; ofstream nf(filename_numberfile.str().c_str()); if (!nf) { ERROR_CHANNEL << "I cannot open file " << filename_numberfile.str() << " for writing!" << endl; exit(6); } nf << nprocs << endl; }
void GenericIndividualSnpCall::PyroHMMsnp(Fasta &fastaObj, BamReader &bamObj, int chrID, int leftPosition, int rightPosition, GenericProbabilisticAlignment &probAligner, list<Allele>& allelesInBlock, VariantCallSetting& snpCallSettings, vector<GenericVariant> &variantResults) { VariantCallSetting settingForPyroHMMsnp = snpCallSettings; // allele pool vector<Allele> allelePool; for (list<Allele>::iterator allelesInBlockIter=allelesInBlock.begin(); allelesInBlockIter!=allelesInBlock.end(); allelesInBlockIter++) { allelePool.push_back(*allelesInBlockIter); } // add 10bp flanking segment at each side int windowLeftPosition = leftPosition - snpCallSettings.m_flankingSize; int windowRightPosition = rightPosition + snpCallSettings.m_flankingSize; // genome string genome; fastaObj.GetSequence(chrID, windowLeftPosition, windowRightPosition, genome); int globalDepth; double globalMapQual; int globalStrandPos; int globalStrandNeg; vector<PyroHMMsnp_Sequence_t> readsInWindow; // rewind BAM reader bamObj.Rewind(); // set BAM region bamObj.SetRegion(chrID, windowLeftPosition, chrID, windowRightPosition); // read alignment BamAlignment al; while (bamObj.GetNextAlignment(al)) { // skip if it is not a good alignment if (!GenericBamAlignmentTools::goodAlignment(al)) { continue; } // skip if it is not valid at length if (!GenericBamAlignmentTools::validReadLength(al, m_minReadLength)) { continue; } // skip if it is not valid at map quality if (!GenericBamAlignmentTools::validMapQuality(al, m_minMapQuality)) { continue; } // skip if it is not valid at alignment identity if (!GenericBamAlignmentTools::validReadIdentity(al, m_maxMismatchFrac)) { continue; } // global info globalDepth += 1; globalMapQual += al.MapQuality*al.MapQuality; if (al.IsReverseStrand()) globalStrandNeg += 1; else globalStrandPos += 1; // get local alignment string t_localRead, t_localGenome; Cigar t_cigar; BamMD t_md; int t_numMismatch, t_numInDel; GenericBamAlignmentTools::getLocalAlignment(al, 
windowLeftPosition, windowRightPosition-windowLeftPosition, t_localRead, t_localGenome, t_cigar, t_md, t_numMismatch, t_numInDel); if (t_localRead.empty() || t_localGenome.empty()) continue; // save into set PyroHMMsnp_Sequence_t t_seq; t_seq.t_ID = GenericBamAlignmentTools::getBamAlignmentID(al); t_seq.t_sequence = t_localRead; t_seq.t_cigar = t_cigar; t_seq.t_md = t_md; t_seq.t_numMismatch = t_numMismatch; t_seq.t_numInDel = t_numInDel; t_seq.t_mapQualScore = al.MapQuality; if (al.Position>windowLeftPosition) t_seq.t_startPositionShift = al.Position-windowLeftPosition; else t_seq.t_startPositionShift = 0; if (al.GetEndPosition()<windowRightPosition) t_seq.t_endPositionShift = windowRightPosition-al.GetEndPosition(); else t_seq.t_endPositionShift = 0; readsInWindow.push_back(t_seq); } int numData = readsInWindow.size(); // construct the consensus sequence graph GenericDagGraph consensusGraph; vector<string> consensusGraphReads; vector<Cigar> consensusGraphReadCigars; vector<int> consensusGraphReadStarts; // set of aligned reads to construct the graph for (int i=0; i<numData; ++i) { consensusGraphReads.push_back(readsInWindow[i].t_sequence); consensusGraphReadCigars.push_back(readsInWindow[i].t_cigar); consensusGraphReadStarts.push_back(readsInWindow[i].t_startPositionShift); } // build up the graph consensusGraph.buildDagGraph(genome, consensusGraphReads, consensusGraphReadCigars, consensusGraphReadStarts); consensusGraph.edgePruning(snpCallSettings.m_graphPruneLevel); // search topK paths, excluding reference vector<string> topRankConsensusGraphPaths; vector<list<Vertex>> topRankConsensusGraphPathVertexs; vector<double> topRankConsensusGraphPathWeights; consensusGraph.topRankPathsExcludeGenome(30, topRankConsensusGraphPaths, topRankConsensusGraphPathVertexs, topRankConsensusGraphPathWeights); // change vertex list to vertex set vector<set<Vertex>> topRankConsensusGraphPathVertexSet; for (int i=0; i<topRankConsensusGraphPathVertexs.size(); i++) { 
list<Vertex>::iterator vertexIter = topRankConsensusGraphPathVertexs[i].begin(); set<Vertex> vertexSet; for (; vertexIter!=topRankConsensusGraphPathVertexs[i].end(); vertexIter++) { vertexSet.insert(*vertexIter); } topRankConsensusGraphPathVertexSet.push_back(vertexSet); } // get variant vertices vector<int> allelePositions; vector<string> alleleChars; for (list<Allele>::iterator alleleIter=allelesInBlock.begin(); alleleIter!=allelesInBlock.end(); alleleIter++) { Allele allele = *alleleIter; allelePositions.push_back(allele.m_chrPosition-windowLeftPosition); alleleChars.push_back(allele.m_allele); } // map allele to graph vertex set<Vertex> variantVertexs; map<int,Vertex> mapAlleleToVertex; map<Vertex,int> mapVertexToAllele; for (int v=0; v<consensusGraph.m_numVertexs; v++) { if (consensusGraph.m_skip[v]) continue; if (!consensusGraph.m_isMismatch[v]) continue; int gp = consensusGraph.m_genomePosition[v] - 1; for (int j=0; j<allelePool.size(); j++) { int ap = allelePositions[j]; if (ap==gp) { if (alleleChars[j]==consensusGraph.m_labels[v]) { variantVertexs.insert(v); mapAlleleToVertex[j] = v; mapVertexToAllele[v] = j; } } } } // set up the haplotypes vector<string> haplotypes; vector<int> haplotypeToPathIndex; vector<set<Vertex>> haplotypeVariantVertexs; haplotypes.push_back(genome); haplotypeToPathIndex.push_back(-1); haplotypeVariantVertexs.push_back(set<Vertex>()); int kk = 0; for (int i=0; i<topRankConsensusGraphPaths.size(); i++) { if (kk>=snpCallSettings.m_topK) continue; bool hasVariantVertex = false; int deltaLength = (topRankConsensusGraphPaths[i].length()-genome.length()); deltaLength = abs(deltaLength); if (deltaLength>5) continue; set<Vertex> pathVertexs = topRankConsensusGraphPathVertexSet[i]; set<Vertex> pathVariantVertexs; for (set<Vertex>::iterator variantIter=variantVertexs.begin(); variantIter!=variantVertexs.end(); variantIter++) { if (pathVertexs.find(*variantIter)!=pathVertexs.end()) { hasVariantVertex = true; 
pathVariantVertexs.insert(*variantIter); } } int totalNumberVariantVertexInPath = 0; for (set<Vertex>::iterator vertexIter=pathVertexs.begin(); vertexIter!=pathVertexs.end(); vertexIter++) { int v = *vertexIter; if (consensusGraph.m_isMismatch[v]) { totalNumberVariantVertexInPath += 1; } } if (hasVariantVertex && totalNumberVariantVertexInPath<=pathVariantVertexs.size()) { haplotypes.push_back(topRankConsensusGraphPaths[i]); haplotypeToPathIndex.push_back(i); haplotypeVariantVertexs.push_back(pathVariantVertexs); kk++; } } int numHaplotypes = haplotypes.size(); // skip if there is no variant haplotype if (numHaplotypes==1) { return; } // compute haplotype data likelihood vector<vector<long double>> haplotypeDataLikelihoods(numHaplotypes); PyroHMMsnpHaplotypeDataLikelihood(probAligner, snpCallSettings.m_band, numHaplotypes, haplotypes, readsInWindow, haplotypeDataLikelihoods); // genotype vector<vector<int>> genotypes; set<set<int>> genotypeDiscovered; for (int i=0; i<numHaplotypes; i++) { vector<int> precedeHaplotypes; PyroHMMsnpGenotypeSet(snpCallSettings.m_ploidy, i, numHaplotypes, precedeHaplotypes, genotypes, genotypeDiscovered); } int numGenotypes = genotypes.size(); // genotype variant vertex vector<set<Vertex>> genotypeVariantVertexs; for (int i=0; i<numGenotypes; i++) { set<Vertex> variantVertexInGenotype; for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++) { int haplotype = genotypes[i][j]; set<Vertex> variantVertexInHaplotype = haplotypeVariantVertexs[haplotype]; variantVertexInGenotype.insert(variantVertexInHaplotype.begin(), variantVertexInHaplotype.end()); } genotypeVariantVertexs.push_back(variantVertexInGenotype); } // genotype priors vector<long double> genotypePriors(numGenotypes); PyroHMMsnpGenotypePrior(numGenotypes, genotypes, settingForPyroHMMsnp, genotypePriors); // genotype likelihoods vector<long double> genotypeLikelihoods(numGenotypes); PyroHMMsnpGenotypeLikelihood(numGenotypes, genotypes, readsInWindow.size(), haplotypeDataLikelihoods, 
snpCallSettings, genotypeLikelihoods); // genotype posteriors vector<long double> genotypePosteriors(numGenotypes); PyroHMMsnpGenotypePosterior(numGenotypes, genotypePriors, genotypeLikelihoods, genotypePosteriors); // search maximal genotype posterior long double maxGenotypePosterior = 0; int inferGenotype; for (int i=1; i<numGenotypes; i++) { if (maxGenotypePosterior<genotypePosteriors[i]) { maxGenotypePosterior = genotypePosteriors[i]; inferGenotype = i; } } // all variant vertexs in the inferred genotype set<Vertex> inferGenotypeVariantVertexs = genotypeVariantVertexs[inferGenotype]; // count haploid type of variant map<Vertex,vector<int>> inferGenotypeVariantHaploidType; set<Vertex>::iterator inferVariantIter = inferGenotypeVariantVertexs.begin(); for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); inferVariantIter++) { int v = *inferVariantIter; vector<int> variantHaploidType; for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++) { int haplotype = genotypes[inferGenotype][j]; set<Vertex> variantVertexInHaplotype = haplotypeVariantVertexs[haplotype]; if (variantVertexInHaplotype.find(v)==variantVertexInHaplotype.end()) { variantHaploidType.push_back(0); }else { variantHaploidType.push_back(1); } } inferGenotypeVariantHaploidType[v] = variantHaploidType; } // variant score map<Vertex,long double> inferGenotypeVariantScore; inferVariantIter = inferGenotypeVariantVertexs.begin(); for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); inferVariantIter++) { int v = *inferVariantIter; long double variantScore = 0; for (int i=0; i<numGenotypes; i++) { set<Vertex> variantVertexInGenotype = genotypeVariantVertexs[i]; if (variantVertexInGenotype.find(v)!=variantVertexInGenotype.end()) variantScore += genotypePosteriors[i]; } inferGenotypeVariantScore[v] = variantScore; } // save variant result inferVariantIter = inferGenotypeVariantVertexs.begin(); for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); inferVariantIter++) { GenericVariant result; int v = 
*inferVariantIter; int a = mapVertexToAllele[v]; int variantChrID; int variantChrPos; vector<int> haploidType = inferGenotypeVariantHaploidType[v]; for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++) { if (haploidType[j]==0) { int g = consensusGraph.m_genomePosition[v]; Allele allele; allele.m_allele = consensusGraph.m_labels[g]; result.m_alleles.push_back(allele); }else { Allele allele = allelePool[a]; result.m_alleles.push_back(allele); variantChrID = allele.m_chrID; variantChrPos = allele.m_chrPosition; } } result.m_chrID = variantChrID; result.m_chrPosition = variantChrPos; result.m_probScoreRef = genotypePosteriors[0]; result.m_probScoreVar = genotypePosteriors[inferGenotype]; result.m_variantType = VARIANT_SNP; long double variantScore = inferGenotypeVariantScore[v]; if (fabs(1-variantScore)<1e-300) result.m_quality = 3000; else if (variantScore<1e-300) result.m_quality = 0; else result.m_quality = -10*log10(1-variantScore); char refBase; fastaObj.GetBase(result.m_chrID, result.m_chrPosition, refBase); result.m_reference = refBase; for (int i=0; i<result.m_alleles.size(); i++) { if (result.m_alleles[i].m_allele==result.m_reference) result.m_haploidType.push_back(0); else result.m_haploidType.push_back(1); } // filter if (result.m_quality>=snpCallSettings.m_variantQualityFilter) variantResults.push_back(result); } }
int GenericIndividualSnpCall::call(Fasta &fastaObj, BamReader &bamObj, BamRegion &roi, GenericProbabilisticAlignment &probAligner, VariantCallSetting& snpCallSettings, vector<GenericVariant> &variantSet) { RefVector chromosomes = bamObj.GetReferenceData(); // set up genome blocks vector<int> BlockChrID, BlockLeftPos, BlockRightPos; int BlockNumber=setupGenomeBlock(chromosomes, roi, BlockChrID, BlockLeftPos, BlockRightPos); int numSNP = 0; // iterate throught blocks for (int i=0; i<BlockNumber; ++i) { if (m_verbosity>=1) { cout << "processing " << chromosomes[BlockChrID[i]].RefName << ":" << BlockLeftPos[i]+1 << "-" << BlockRightPos[i] << endl; } clock_t startTime = clock(); // genome string BlockGenome; fastaObj.GetSequence(BlockChrID[i], BlockLeftPos[i], BlockRightPos[i], BlockGenome); map<int,list<tuple<char,int,int,double>>> BlockBamData; AlleleSet BlockSnpAlleleCandidates; // profile SNP sites by the simple method simpleSnpCall(BlockGenome, bamObj, BlockChrID[i], BlockLeftPos[i], BlockRightPos[i], BlockSnpAlleleCandidates, BlockBamData); // merge SNP sites to SNP blocks vector<tuple<int,int,list<Allele>>> BlockSnpLoci; mergeSnpSitesToBlocks(BlockSnpAlleleCandidates, BlockSnpLoci); // iterate through Snp locus for (int j=0; j<BlockSnpLoci.size(); j++) { int BlockSnpLeftPos = get<0>(BlockSnpLoci[j]); int BlockSnpRightPos = get<1>(BlockSnpLoci[j]); // it is a SNP site if (BlockSnpRightPos==BlockSnpLeftPos+1) { simpleBayesianSnpCall(fastaObj, bamObj, BlockChrID[i], BlockSnpLeftPos, BlockSnpRightPos, get<2>(BlockSnpLoci[j]), BlockBamData[BlockSnpLeftPos], snpCallSettings, variantSet); }else if (BlockSnpRightPos==BlockSnpLeftPos+2) { for (int pos=BlockSnpLeftPos; pos<BlockSnpRightPos; pos++) { list<Allele> fAlleles = get<2>(BlockSnpLoci[j]); list<Allele> tAlleles; for (list<Allele>::iterator faIter=fAlleles.begin(); faIter!=fAlleles.end(); faIter++) { if (faIter->m_chrPosition==pos) tAlleles.emplace_back(*faIter); } if (!tAlleles.empty()) 
simpleBayesianSnpCall(fastaObj, bamObj, BlockChrID[i], pos, pos+1, tAlleles, BlockBamData[pos], snpCallSettings, variantSet); } } else // it is a MNP site { PyroHMMsnp(fastaObj, bamObj, BlockChrID[i], BlockSnpLeftPos, BlockSnpRightPos, probAligner, get<2>(BlockSnpLoci[j]), snpCallSettings, variantSet); } } clock_t endTime = clock(); if (m_verbosity>=1) { cout << "time elapsed " << ((endTime-startTime)/(double)CLOCKS_PER_SEC/60.) << " minutes"; cout << ", "; cout << "call " << variantSet.size()-numSNP << " SNPs" << endl; } numSNP = variantSet.size(); } return variantSet.size(); }
void buildBWT2 (const std::string& fileName, const std::string& prefixName) { /* read input fasta file */ std::ifstream in {fileName}; /* string to store the sense + reverse complementary of the genome seq */ std::string seq, seqRC {}; /* running accumulator recording the length of each chr */ INTTYPE tempLen {0}, accumulatedLength {0}; /* for concatenated seq */ std::map <INTTYPE, INTTYPE> NPosLen { }; /* file to store which regions has which chr*/ std::ofstream chrStartPos {prefixName + "chrStart"}; /* file to store the length of each chr */ std::ofstream chrLen {prefixName + "chrLen"}; /* read in each fasta and make two string */ while (in.good ()) { Fasta<std::vector> fa {in}; /* store start position of each chr */ chrStartPos << fa.getName () << '\t' << accumulatedLength << '\n'; /* get chr length */ tempLen = fa.getLengthNoN (); /* store chr length */ chrLen << fa.getName () << '\t' << tempLen << '\n'; /* update accumulated length */ accumulatedLength += tempLen; /* update NPosLen */ fa.updateNpos (NPosLen); seq += fa.getSeqNoN (); } chrStartPos.close (); chrLen.close (); /* resize to enough space for the reverse complemetary sequence and a $ sign */ seq.resize (seq.size () * 2 + 1); // TODO: resize does mallocating the extra space and also initialization, the later is not necessary auto iter = seq.begin (); std::advance (iter, (seq.size ()-1)/2); auto iter2 = iter; --iter2; do { switch (*iter2) { case 'A': *iter = 'T'; break; case 'T': *iter = 'A'; break; case 'G': *iter = 'C'; break; case 'C': *iter = 'G'; break; } ++iter; } while (iter2-- != seq.begin ()); *iter = '$'; /* writing NPosLen to file */ { boost::iostreams::filtering_ostream fos; fos.push (boost::iostreams::zlib_compressor()); fos.push (boost::iostreams::file_sink (prefixName + "NposLen.z")); boost::archive::binary_oarchive oa (fos); oa << NPosLen; } { ABSequence<std::string> x ( seq ); ABWT<ABSequence<std::string>> y (x, 512, 64, prefixName); } }
void align_against_collection(string &read, Fasta &rep, int forbidden_rep_id, bool reverse_ref, bool reverse_both, bool local, AlignBox *box, Cost segment_cost) { int best_score = MINUS_INF ; box->ref_nb = MINUS_INF ; int best_best_i = (int) string::npos ; int best_best_j = (int) string::npos ; int best_first_i = (int) string::npos ; int best_first_j = (int) string::npos ; vector<pair<int, int> > score_r; DynProg::DynProgMode dpMode = DynProg::LocalEndWithSomeDeletions; if (local==true) dpMode = DynProg::Local; // With reverse_ref, the read is reversed to prevent calling revcomp on each reference sequence string sequence_or_rc = revcomp(read, reverse_ref); for (int r = 0 ; r < rep.size() ; r++) { if (r == forbidden_rep_id) continue; DynProg dp = DynProg(sequence_or_rc, rep.sequence(r), dpMode, // DynProg::SemiGlobalTrans, segment_cost, // DNA reverse_both, reverse_both, rep.read(r).marked_pos); bool onlyBottomTriangle = !local ; int score = dp.compute(onlyBottomTriangle, BOTTOM_TRIANGLE_SHIFT); if (local==true){ dp.backtrack(); } if (score > best_score) { best_score = score ; best_best_i = dp.best_i ; best_best_j = dp.best_j ; best_first_i = dp.first_i ; best_first_j = dp.first_j ; box->ref_nb = r ; box->ref_label = rep.label(r) ; if (!local) dp.backtrack(); box->marked_pos = dp.marked_pos_i ; } score_r.push_back(make_pair(score, r)); // #define DEBUG_SEGMENT #ifdef DEBUG_SEGMENT cout << rep.label(r) << " " << score << " " << dp.best_i << endl ; #endif } sort(score_r.begin(),score_r.end(),comp_pair); box->ref = rep.sequence(box->ref_nb); box->del_right = reverse_both ? 
best_best_j : box->ref.size() - best_best_j - 1; box->del_left = best_first_j; box->start = best_first_i; box->score = score_r; #ifdef DEBUG_SEGMENT cout << "best: " << box->ref_label << " " << best_score ; cout << "del/del2/begin:" << (box->del_right) << "/" << (box->del_left) << "/" << (box->start) << endl; cout << endl; #endif if (reverse_ref) // Why -1 here and +1 in dynprog.cpp /// best_i = m - best_i + 1 ; best_best_i = read.length() - best_best_i - 1 ; box->end = best_best_i ; }