// read ps and ss from a given file and extends the ps to the given reference length assembly file_sse_pool::read(const std::string &__filename, assembly const &__reference) { // regex for a single data line of the sse pool format; groups will be used to construct sequences; // this regex will not match the begin ('bcl::assemble::SSEPool') and end lines ('END') of a sse pool file to // allow reading pdb files as well; // for the line specification, see: http://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html // and http://www.rcsb.org/pdb/static.do?p=file_formats/pdb/index.html; // all pdb lines have to be 80 char long, see http://deposit.rcsb.org/adit/docs/pdb_atom_format.html; // read_to_string_list() trims the whitespace at begin and end, so if the line length is less than 80 chars, // a line will be extended to that length; if the line has more whitespace than 80 chars, it will be ignored. // HELIX line groups explanation: // 1=HELIX; 2=Helix serial number; 3=Helix id; 4=Initial residue name; 5=Chain id; 6=Residue sequence number; // 7=Code for insertions of residues; 8=Terminal residue name; 9=Chain id; 10=Residue sequence number; // 11=Code for insertions of residues; 12=Helix type; 13=Comment; 14=Helix length. // SHEET line groups explanation: // 15=SHEET; 16=Strand number in current sheet; 17=Sheet id; 18=Number of strands in current sheet; // 19=Initial residue name; 20=Chain id; 21=Residue sequence number; 22=Code for insertions of residues; // 23=Terminal residue name; 24=Chain id; 25=Residue sequence number; 26=Code for insertions of residues; // 27=Strand sense with respect to previous; // there is additional information on a SHEET line that is not in explicit groups, but in a generic // placeholder at the end of the SHEET part of the regex. static boost::regex const regex_sse_pool_line( // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 "(HELIX) ([0-9 ]{3}) (...) (...) (.) ([0-9 ]{4})(.) (...) (.) ([0-9 ]{4})(.)([0-9 ]{2})(.{30}) ([0-9 ]{5})" ".*|" // match any space that go beyond the 76 chars specified in the regex, or match the SHEET line // 15 16 17 18 19 20 21 22 23 24 25 26 27 "(SHEET) ([0-9 ]{3}) (...)([0-9 ]{2}) (...) (.)([0-9 ]{4})(.) (...) (.)([0-9 ]{4})(.)([0-9 -]{2}).*|"); std::list<std::string> file_lines(tools::file::read_to_string_list(__filename)); DEBUG << "Read sse pool file content:\n" << boost::algorithm::join(file_lines, "\n"); std::map<char, chain_data> chains; // local variable to store the data read from the file DEBUG << "Parsing sse pool file"; for(auto &line : file_lines) { // process each line from the file // make all lines have length >=80 by filling up with spaces if length <80, to ensure the regex will match line.append(std::max(80 - line.length(), (size_t)0), ' '); boost::smatch match; // regex_match returns true if all chars of line were matched if(!boost::regex_match(line, match, regex_sse_pool_line)) { DEBUG << "No match: " << line; continue; } // if // match[0], same as match.str(), contains the whole string, see // http://www.boost.org/doc/libs/1_55_0/libs/regex/example/snippets/regex_iterator_example.cpp std::string complete_match(match[0]); boost::trim(complete_match); // remove newline at the end (the regex above matches an optional newline) DEBUG << "Match: " << complete_match; if(match[1].matched) { // if the 'HELIX' part matches, a helix line was found DEBUG << "Found HELIX line"; // get strings from all submatches, then check if contents are valid for conversion; // exception 1: lexical_cast on char should be fine, as the submatches are only single char strings; // exception 2: always try to convert residue sequence numbers, if there's no other way of knowing them // (if converting residue sequence numbers does not work, lexical_cast will throw an exception) std::string const helix_serial_number_string(boost::trim_copy(std::string(match[2]))); std::string const helix_id(boost::trim_copy(std::string(match[3]))); std::string const initial_residue_name(match[4]); char const initial_residue_chain_id(boost::lexical_cast<char>(match[5])); size_t const initial_residue_sequence_number( boost::lexical_cast<size_t>(boost::trim_copy(std::string(match[6])))); char const initial_residue_insertion_code(boost::lexical_cast<char>(match[7])); std::string const terminal_residue_name(match[8]); char const terminal_residue_chain_id(boost::lexical_cast<char>(match[9])); size_t const terminal_residue_sequence_number( boost::lexical_cast<size_t>(boost::trim_copy(std::string(match[10])))); char const terminal_residue_insertion_code(boost::lexical_cast<char>(match[11])); std::string const helix_type_string(boost::trim_copy(std::string(match[12]))); std::string const comment(match[13]); std::string const helix_length_string(boost::trim_copy(std::string(match[14]))); // print the submatches as strings before consistency checks and converting to size_t DEBUG << "Submatches: helix_serial_number='" << helix_serial_number_string << "' helix_id='" << helix_id << "' initial_residue='" << initial_residue_name << "|" << initial_residue_chain_id << "|" << initial_residue_sequence_number << "|" << initial_residue_insertion_code << "' terminal_residue='" << terminal_residue_name << "|" << terminal_residue_chain_id << "|" << terminal_residue_sequence_number << "|" << terminal_residue_insertion_code << "' helix_type='" << helix_type_string << "' comment='" << boost::trim_copy(comment) << "' helix_length='" << helix_length_string << "'"; // both chain ids should be identical if(initial_residue_chain_id != terminal_residue_chain_id) { DEBUG << "Ignoring line, initial_residue_chain_id='" << initial_residue_chain_id << "' differing from terminal_residue_chain_id='" << terminal_residue_chain_id << "'"; continue; } // if // add chain id if it isn't found in the map if(chains.find(initial_residue_chain_id) == chains.end()) { chains[initial_residue_chain_id] = chain_data(); } // if // convert strings to size_t and catch exceptions for ones which have a default or a way of calculating size_t helix_serial_number; try { helix_serial_number = boost::lexical_cast<size_t>(helix_serial_number_string); } // try catch(boost::bad_lexical_cast const &__e) { helix_serial_number = chains[initial_residue_chain_id].last_helix_serial_number + 1; DEBUG << "Could not convert helix_serial_number='" << helix_serial_number_string << "' to number, using default helix_serial_number='" << helix_serial_number << "'"; } // catch // helix serial number should be incremented by 1 from last helix serial number if(helix_serial_number != chains[initial_residue_chain_id].last_helix_serial_number + 1) { DEBUG << "Found helix_serial_number='" << helix_serial_number << "' differing from expected helix_serial_number='" << (chains[initial_residue_chain_id].last_helix_serial_number + 1) << "', using helix_serial_number from file."; } // if // update the helix counting variable chains[initial_residue_chain_id].last_helix_serial_number = helix_serial_number; // we could convert helix_type_string into size_t, but realistically no checks can be done b/c a pool // often comes from prediction and the helix_type is not known size_t helix_length; try { helix_length = boost::lexical_cast<size_t>(helix_length_string); } // try catch(boost::bad_lexical_cast const &__e) { // + 1, b/c the initial_residue_sequence_number is part of the helix helix_length = terminal_residue_sequence_number - initial_residue_sequence_number + 1; DEBUG << "Could not convert helix_length='" << helix_length_string << "' to number, using calculated helix_length='" << helix_length << "'"; } // catch // check helix_length is consist with residue sequence numbers if(helix_length != terminal_residue_sequence_number - initial_residue_sequence_number + 1) { size_t const expected_helix_length(terminal_residue_sequence_number - initial_residue_sequence_number + 1); DEBUG << "Found helix_length='" << helix_length << "' differing expected helix_length='" << expected_helix_length << "', ignoring helix_length."; helix_length = expected_helix_length; } // if // resize cc_sequence, fill with unknown cc, and set the correct cc for begin and end of the sse chains[initial_residue_chain_id].cc_sequence.resize( std::max(chains[initial_residue_chain_id].cc_sequence.size(), terminal_residue_sequence_number), cc('X')); chains[initial_residue_chain_id].cc_sequence[initial_residue_sequence_number - 1] = cc(initial_residue_name); chains[initial_residue_chain_id].cc_sequence[terminal_residue_sequence_number - 1] = cc(terminal_residue_name); // create and insert sequence_interval into the pool chains[initial_residue_chain_id].pool.insert(cchb_dssp_interval( initial_residue_sequence_number - 1, terminal_residue_sequence_number - 1, cchb_dssp('H'))); } // if else if(match[15].matched) { // if the 'SHEET' part matches, a sheet line was found DEBUG << "Found SHEET line"; // get strings from all submatches, then check if contents are valid for conversion; // exception 1: lexical_cast on char should be fine, as the submatches are only single char strings; // exception 2: always try to convert residue sequence numbers, if there's no other way of knowing them // (if converting residue sequence numbers does not work, lexical_cast will throw an exception) std::string const strand_number_in_sheet_string(boost::trim_copy(std::string(match[16]))); std::string const sheet_id(boost::trim_copy(std::string(match[17]))); std::string const number_strands_in_sheet_string(boost::trim_copy(std::string(match[18]))); std::string const initial_residue_name(match[19]); char const initial_residue_chain_id(boost::lexical_cast<char>(match[20])); size_t const initial_residue_sequence_number( boost::lexical_cast<size_t>(boost::trim_copy(std::string(match[21])))); char const initial_residue_insertion_code(boost::lexical_cast<char>(match[22])); std::string const terminal_residue_name(match[23]); char const terminal_residue_chain_id(boost::lexical_cast<char>(match[24])); size_t const terminal_residue_sequence_number( boost::lexical_cast<size_t>(boost::trim_copy(std::string(match[25])))); char const terminal_residue_insertion_code(boost::lexical_cast<char>(match[26])); std::string const strand_sense_string(boost::trim_copy(std::string(match[27]))); // print the submatches as strings before consistency checks and converting to size_t DEBUG << "Submatches: strand_number_in_sheet='" << strand_number_in_sheet_string << "' sheet_id='" << sheet_id << "' number_strands_in_sheet='" << number_strands_in_sheet_string << "' initial_residue='" << initial_residue_name << "|" << initial_residue_chain_id << "|" << initial_residue_sequence_number << "|" << initial_residue_insertion_code << "' terminal_residue='" << terminal_residue_name << "|" << terminal_residue_chain_id << "|" << terminal_residue_sequence_number << "|" << terminal_residue_insertion_code << "' strand_sense='" << strand_sense_string << "'"; // we could try to convert strand_number_in_sheet_string, number_strands_in_sheet_string, and // strand_sense_string into size_t, but no checks could be done realistically b/c the pool likely comes // from secondary structure prediction from sequence, and won't have this information. // both chain ids should be identical; no fix needed, chain id is always ignored. if(initial_residue_chain_id != terminal_residue_chain_id) { DEBUG << "Ignoring line, initial_residue_chain_id='" << initial_residue_chain_id << "' differing from terminal_residue_chain_id='" << terminal_residue_chain_id << "'"; continue; } // if // add chain id if it isn't found in the map if(chains.find(initial_residue_chain_id) == chains.end()) { chains[initial_residue_chain_id] = chain_data(); } // if // resize cc_sequence, fill with unknown cc, and set the correct cc for begin and end of the sse chains[initial_residue_chain_id].cc_sequence.resize( std::max(chains[initial_residue_chain_id].cc_sequence.size(), terminal_residue_sequence_number), cc('X')); chains[initial_residue_chain_id].cc_sequence[initial_residue_sequence_number - 1] = cc(initial_residue_name); chains[initial_residue_chain_id].cc_sequence[terminal_residue_sequence_number - 1] = cc(terminal_residue_name); // create and insert sequence_interval into the pool chains[initial_residue_chain_id].pool.insert(cchb_dssp_interval( initial_residue_sequence_number - 1, terminal_residue_sequence_number - 1, cchb_dssp('E'))); } // else if } // for assembly a; // final result size_t sequence_no(1); // start with 1 for(auto chain_pair : chains) { // extend the sequences to the reference length std::string chain_id_string(1, chain_pair.first); if(__reference.has_ensemble(chain_id_string)) { chain_pair.second.cc_sequence.resize( std::max(chain_pair.second.cc_sequence.size(), __reference.get_first_structure(chain_id_string).get_ss().get_sequence().size()), cc('X')); } // if // add to final result a.set(chain_id_string, structure(__filename + "/" + std::to_string(sequence_no), ">lcl|sequence", chain_pair.second.cc_sequence, ss(chain_pair.second.pool, chain_pair.second.cc_sequence.size()))); ++sequence_no; // increase sequence_no, b/c it's not done in the for loop header } // for return a; } // read()
//.x Skimming.C+("Input_2015D_v3_ONE.txt","BB_reduced.root") void Skimming( TString fileList, TString outName ){ //Doing the chain cout<<"Starting... "<<endl; TChain chain_data("cutFlowAnalyzer/Events"); TChain chain_databb("cutFlowAnalyzer/Events_orphan"); std::ifstream Myfile( fileList.Data() ); std::string Line; if( !Myfile ) std::cout<<"ERROR opening "<<fileList<<std::endl; while (std::getline(Myfile, Line)){ TString Line2(Line); if( Line2.Contains("root") ){ chain_data.Add(Line2.Data()); chain_databb.Add(Line2.Data()); } } cout<<"Chain done!"<<endl; //Variables needed bool orph_passOffLineSel_bb, orph_passOffLineSelPt_bb, orph_passOffLineSelPt1788_bb, orph_FiredTrig_bb, orph_FiredTrig_pt_bb, orph_FiredTrig_ptColl_bb; int containstrig2_bb, containstrig_bb; float orph_dimu_mass_bb, orph_dimu_isoTk_bb; chain_databb.SetBranchAddress("orph_passOffLineSel",&orph_passOffLineSel_bb); chain_databb.SetBranchAddress("orph_passOffLineSelPt",&orph_passOffLineSelPt_bb); chain_databb.SetBranchAddress("orph_passOffLineSelPt1788",&orph_passOffLineSelPt1788_bb); chain_databb.SetBranchAddress("orph_FiredTrig",&orph_FiredTrig_bb); chain_databb.SetBranchAddress("orph_FiredTrig_pt",&orph_FiredTrig_pt_bb); chain_databb.SetBranchAddress("orph_FiredTrig_ptColl",&orph_FiredTrig_ptColl_bb); chain_databb.SetBranchAddress("orph_FiredTrig",&orph_FiredTrig_bb); chain_databb.SetBranchAddress("orph_dimu_isoTk",&orph_dimu_isoTk_bb); chain_databb.SetBranchAddress("containstrig2",&containstrig2_bb); chain_databb.SetBranchAddress("orph_dimu_mass",&orph_dimu_mass_bb); chain_databb.SetBranchAddress("containstrig",&containstrig_bb); float massC_mu, massF_mu, isoC_1mm_mu, isoF_1mm_mu; chain_data.SetBranchAddress("massC",&massC_mu); chain_data.SetBranchAddress("massF",&massF_mu); chain_data.SetBranchAddress("isoC_1mm",&isoC_1mm_mu); chain_data.SetBranchAddress("isoF_1mm",&isoF_1mm_mu); //My file TFile *f = new TFile(outName.Data(),"RECREATE"); f->cd(); TTree Events("Events",""); float massC, massF, isoC_1mm, isoF_1mm; Events.Branch("massC",&massC,"massC/F"); Events.Branch("massF",&massF,"massF/F"); Events.Branch("isoC_1mm",&isoC_1mm,"isoC_1mm/F"); Events.Branch("isoF_1mm",&isoF_1mm,"isoF_1mm/F"); bool orph_passOffLineSel, orph_passOffLineSelPt, orph_passOffLineSelPt1788, orph_FiredTrig, orph_FiredTrig_pt, orph_FiredTrig_ptColl; int containstrig2, containstrig; float orph_dimu_mass, orph_dimu_isoTk; TTree Events_orphan("Events_orphan",""); Events_orphan.Branch("orph_passOffLineSel",&orph_passOffLineSel,"orph_passOffLineSel/O"); Events_orphan.Branch("orph_passOffLineSelPt",&orph_passOffLineSelPt,"orph_passOffLineSelPt/O"); Events_orphan.Branch("orph_passOffLineSelPt1788",&orph_passOffLineSelPt1788,"orph_passOffLineSelPt1788/O"); Events_orphan.Branch("orph_FiredTrig",&orph_FiredTrig,"orph_FiredTrig/O"); Events_orphan.Branch("orph_FiredTrig_pt",&orph_FiredTrig_pt,"orph_FiredTrig_pt/O"); Events_orphan.Branch("orph_FiredTrig_ptColl",&orph_FiredTrig_ptColl,"orph_FiredTrig_ptColl/O"); Events_orphan.Branch("containstrig2",&containstrig2,"containstrig2/I"); Events_orphan.Branch("containstrig",&containstrig,"containstrig/I"); Events_orphan.Branch("orph_dimu_isoTk",&orph_dimu_isoTk,"orph_dimu_isoTk/F"); Events_orphan.Branch("orph_dimu_mass",&orph_dimu_mass,"orph_dimu_mass/F"); //Loops cout<<"Now starting loop on bb."<<endl; Long64_t entries = chain_databb.GetEntries(); for(Long64_t i=0; i<entries; i++){ if(i%10000==0) cout<<"i = "<<i<<" / "<<entries<<endl; chain_databb.GetEntry(i); if(orph_dimu_mass>-999.){ orph_passOffLineSel = orph_passOffLineSel_bb; orph_passOffLineSelPt = orph_passOffLineSelPt_bb; orph_passOffLineSelPt1788 = orph_passOffLineSelPt1788_bb; orph_FiredTrig = orph_FiredTrig_bb; orph_FiredTrig_pt = orph_FiredTrig_pt_bb; orph_FiredTrig_ptColl = orph_FiredTrig_ptColl_bb; containstrig2 = containstrig2_bb; containstrig = containstrig_bb; orph_dimu_isoTk = orph_dimu_isoTk_bb; orph_dimu_mass = orph_dimu_mass_bb; Events_orphan.Fill(); } } cout<<"Now starting loop on mu."<<endl; entries = chain_data.GetEntries(); for(Long64_t i=0; i<entries; i++){ if(i%10000==0) cout<<"i = "<<i<<" / "<<entries<<endl; chain_data.GetEntry(i); if(massC_mu>-999. || massF_mu>-999.){ massC = massC_mu; massF = massF_mu; isoC_1mm = isoC_1mm_mu; isoF_1mm = isoF_1mm_mu; Events.Fill(); } } cout<<"CLosing files"<<endl; f->Write(); f->Close(); cout<<"The end."<<endl; }