/// Constructs a reader over `filename`, opening the underlying stream in
/// binary mode.
///
/// Both run counters start at zero. The stage is IOS_NONE until the stream
/// has been created and IOS_HEADER afterwards.
BWTReaderBinary::BWTReaderBinary(const std::string& filename)
    : m_stage(IOS_NONE),
      m_numRunsOnDisk(0),
      m_numRunsRead(0)
{
    // The stream is created in the constructor body, after the counters
    // have been initialised.
    m_pReader = createReader(filename, std::ios::binary);

    // Stream creation returned normally: advance to the header stage.
    m_stage = IOS_HEADER;
}
std::auto_ptr<ObservationReader> ObservationReader::createReader(const std::string& arg) { NUKLEI_TRACE_BEGIN(); std::auto_ptr<ObservationReader> reader; std::string errorsCat = std::string("Error in ObservationReader::createReader.") + "\nErrors at each format attempt were:"; try { reader = createReader(arg, Observation::COVIS3D); return reader; } catch (ObservationIOError &e) { errorsCat += "\n" + std::string(e.what()); } try { reader = createReader(arg, Observation::NUKLEI); return reader; } catch (ObservationIOError &e) { errorsCat += "\n" + std::string(e.what()); } try { reader = createReader(arg, Observation::IIS); return reader; } catch (ObservationIOError &e) { errorsCat += "\n" + std::string(e.what()); } try { reader = createReader(arg, Observation::OSUTXT); return reader; } catch (ObservationIOError &e) { errorsCat += "\n" + std::string(e.what()); } try { reader = createReader(arg, Observation::PCD); return reader; } catch (ObservationIOError &e) { errorsCat += "\n" + std::string(e.what()); } try { reader = createReader(arg, Observation::PLY); return reader; } catch (ObservationIOError &e) { errorsCat += "\n" + std::string(e.what()); } try { reader = createReader(arg, Observation::RIF); return reader; } catch (ObservationIOError &e) { errorsCat += "\n" + std::string(e.what()); } try { reader = createReader(arg, Observation::SERIAL); return reader; } catch (ObservationIOError &e) { errorsCat += "\n" + std::string(e.what()); } try { reader = createReader(arg, Observation::CRD); return reader; } catch (ObservationIOError &e) { errorsCat += "\n" + std::string(e.what()); } try { reader = createReader(arg, Observation::OFF); return reader; } catch (ObservationIOError &e) { errorsCat += "\n" + std::string(e.what()); } try { reader = createReader(arg, Observation::BUILTINVTK); return reader; } catch (ObservationIOError &e) { errorsCat += "\n" + std::string(e.what()); } try { reader = createReader(arg, Observation::TXT); return reader; } catch (ObservationIOError 
&e) { errorsCat += "\n" + std::string(e.what()); } throw ObservationIOError ("Error loading observations with automatic type detection. " "Maybe the filename `" + arg + "' is incorrect. " "Else please try again with a defined type."); return reader; NUKLEI_TRACE_END(); }
int DminCombineMain(int argc, char** argv) { parseDminCombineOptions(argc, argv); string line; // for reading the input files std::vector<std::istream*> dminstdErrFiles; std::vector<std::istream*> dminBBAAscoreFiles; for (int i = 0; i < opt::dminFiles.size(); i++) { std::istream* dminBBAAscoreFile; if (file_exists(opt::dminFiles[i] + "_combine.txt")) { dminBBAAscoreFile = createReader((opt::dminFiles[i] + "_combine.txt").c_str()); } else if(file_exists(opt::dminFiles[i] + "_combine.txt.gz")) { dminBBAAscoreFile = createReader((opt::dminFiles[i] + "_combine.txt.gz").c_str()); } else { std::cerr << "Can't fine the file: " << opt::dminFiles[i] + "_combine.txt" << " or " << opt::dminFiles[i] + "_combine.txt.gz" << std::endl; } dminBBAAscoreFiles.push_back(dminBBAAscoreFile); std::istream* dminstdErrFile; if (file_exists(opt::dminFiles[i] + "_combine_stderr.txt")) { dminstdErrFile = createReader((opt::dminFiles[i] + "_combine_stderr.txt").c_str()); } else if(file_exists(opt::dminFiles[i] + "_combine_stderr.txt.gz")) { dminstdErrFile = createReader((opt::dminFiles[i] + "_combine_stderr.txt.gz").c_str()); } else { std::cerr << "Can't fine the file: " << opt::dminFiles[i] + "_combine_stderr.txt" << " or " << opt::dminFiles[i] + "_combine_stderr.txt.gz" << std::endl; } dminstdErrFiles.push_back(dminstdErrFile); std::cerr << "Reading file " << opt::dminFiles[i] << std::endl; } // Now get the standard error values std::ofstream* outFileBBAA = new std::ofstream(opt::runName + "_BBAA.txt"); std::ofstream* outFileDmin = new std::ofstream(opt::runName + "_Dmin.txt"); std::vector<double> BBAA_local_Ds; std::vector<double> ABBA_local_Ds; std::vector<double> BABA_local_Ds; string s1; string s2; string s3; double BBAAtotal = 0; double ABBAtotal = 0; double BABAtotal = 0; bool allDone = false; int processedTriosNumber = 0; do { processedTriosNumber++; if (processedTriosNumber % 10000 == 0) { //durationOverall = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; std::cerr << 
"Processed " << processedTriosNumber << " trios" << std::endl; //std::cerr << "GettingCounts " << durationGettingCounts << " calculation " << durationCalculation << "secs" << std::endl; } if (opt::subsetStart != -1) { if (processedTriosNumber < opt::subsetStart) { for (int i = 0; i < dminBBAAscoreFiles.size(); i++) { getline(*dminBBAAscoreFiles[i], line); } for (int i = 0; i < dminstdErrFiles.size(); i++) { getline(*dminstdErrFiles[i], line); } continue; } if (processedTriosNumber > (opt::subsetStart+opt::subsetLength)) { std::cerr << "DONE" << std::endl; break; } } for (int i = 0; i < dminBBAAscoreFiles.size(); i++) { if (getline(*dminBBAAscoreFiles[i], line)) { std::vector<string> patternCounts = split(line, '\t'); assert(patternCounts.size() == 6); if (i == 0) { s1 = patternCounts[0]; s2 = patternCounts[1]; s3 = patternCounts[2]; } else { assert(s1 == patternCounts[0]); assert(s2 == patternCounts[1]); assert(s3 == patternCounts[2]); } double BBAA = stringToDouble(patternCounts[3]); double BABA = stringToDouble(patternCounts[4]); double ABBA = stringToDouble(patternCounts[5]); BBAAtotal += BBAA; ABBAtotal += ABBA; BABAtotal += BABA; } } //std::cerr << "ABBAtotal = " << ABBAtotal << std::endl; //std::cerr << "BABAtotal = " << BABAtotal << std::endl; double Dnum1 = ABBAtotal - BABAtotal; // assert(Dnum1 == Dnums[i][0]); double Dnum2 = ABBAtotal - BBAAtotal; // assert(Dnum2 == Dnums[i][1]); double Dnum3 = BBAAtotal - BABAtotal; // assert(Dnum3 == Dnums[i][2]); double Ddenom1 = ABBAtotal + BABAtotal; // assert(Ddenom1 == Ddenoms[i][0]); double Ddenom2 = ABBAtotal + BBAAtotal; // assert(Ddenom2 == Ddenoms[i][1]); double Ddenom3 = BBAAtotal + BABAtotal; // assert(Ddenom3 == Ddenoms[i][2]); //std::cerr << "Dnum1 = " << Dnum1 << std::endl; //std::cerr << "Ddenom1 = " << Ddenom1 << std::endl; double D1 = Dnum1/Ddenom1; double D2 = Dnum2/Ddenom2; double D3 = Dnum3/Ddenom3; //std::cerr << "D1 = " << D1 << std::endl; for (int i = 0; i < dminstdErrFiles.size(); i++) { if 
(getline(*dminstdErrFiles[i], line)) { std::vector<string> localDs = split(line, '\t'); //assert(localDs.size() == 3 || localDs.size() == 0); if (localDs.size() == 3) { std::vector<string> BBAA_D_strings = split(localDs[0], ','); std::vector<string> BABA_D_strings = split(localDs[1], ','); std::vector<string> ABBA_D_strings = split(localDs[2], ','); for (int j = 0; j < BBAA_D_strings.size(); j++) { //std::cerr << "BBAA_D_strings[j] = " << BBAA_D_strings[j] << std::endl; double thisBBAA_localD = stringToDouble(BBAA_D_strings[j]); if (!std::isnan(thisBBAA_localD)) BBAA_local_Ds.push_back(thisBBAA_localD); // std::cerr << "BABA_D_strings[j] = " << BABA_D_strings[j] << std::endl; double thisBABA_localD = stringToDouble(BABA_D_strings[j]); if (!std::isnan(thisBABA_localD)) BABA_local_Ds.push_back(thisBABA_localD); //std::cerr << "ABBA_D_strings[j] = " << ABBA_D_strings[j] << std::endl; double thisABBA_localD = stringToDouble(ABBA_D_strings[j]); if (!std::isnan(thisABBA_localD)) ABBA_local_Ds.push_back(thisABBA_localD); } } else { print_vector(localDs,std::cerr); } } } if (BBAA_local_Ds.size() == 0 || BABA_local_Ds.size() == 0 || ABBA_local_Ds.size() == 0) { // no info to estimate the standard error; probably all lines have been processed allDone = true; break; } // std::cerr << "D1 = " << D1 << std::endl; //print_vector(BBAA_local_Ds, std::cerr); double BBAAstdErr = jackknive_std_err(BBAA_local_Ds); //print_vector(BABA_local_Ds, std::cerr); double BABAstdErr = jackknive_std_err(BABA_local_Ds); //print_vector(ABBA_local_Ds, std::cerr); //std::cerr << "D1 = " << D1 << std::endl; double ABBAstdErr = jackknive_std_err(ABBA_local_Ds); //std::cerr << "D1 = " << D1 << std::endl; //std::cerr << "BBAAstdErr" << BBAAstdErr << std::endl; double D1_Z = fabs(D1)/BBAAstdErr; double D2_Z = fabs(D2)/BABAstdErr; double D3_Z = fabs(D3)/ABBAstdErr; //std::cerr << "D1_Z = " << D1_Z << std::endl; if (s1 == "Altcal" && s2 == "Altshe" && s3 == "Asplep") { std::cerr << "D1_Z = " << D1_Z << 
std::endl; std::cerr << "BBAAstdErr = " << BBAAstdErr << std::endl; print_vector(BBAA_local_Ds, std::cerr); std::cerr << "ABBAstdErr = " << ABBAstdErr << std::endl; std::cerr << "BABAstdErr = " << BABAstdErr << std::endl; std::cerr << "ABBAtotal = " << ABBAtotal << std::endl; std::cerr << "BABAtotal = " << BABAtotal << std::endl; std::cerr << "BBAAtotal = " << BBAAtotal << std::endl; } // Find which topology is in agreement with the counts of the BBAA, BABA, and ABBA patterns if (BBAAtotal >= BABAtotal && BBAAtotal >= ABBAtotal) { if (D1 >= 0) *outFileBBAA << s1 << "\t" << s2 << "\t" << s3; else *outFileBBAA << s2 << "\t" << s1 << "\t" << s3; *outFileBBAA << "\t" << fabs(D1) << "\t" << D1_Z << "\t"; *outFileBBAA << BBAAtotal << "\t" << BABAtotal << "\t" << ABBAtotal << std::endl; } else if (BABAtotal >= BBAAtotal && BABAtotal >= ABBAtotal) { if (D2 >= 0) *outFileBBAA << s1 << "\t" << s3 << "\t" << s2; else *outFileBBAA << s3 << "\t" << s1 << "\t" << s2; *outFileBBAA << "\t" << fabs(D2) << "\t" << D2_Z << "\t"; *outFileBBAA << BABAtotal << "\t" << BBAAtotal << "\t" << ABBAtotal << std::endl; } else if (ABBAtotal >= BBAAtotal && ABBAtotal >= BABAtotal) { if (D3 >= 0) *outFileBBAA << s3 << "\t" << s2 << "\t" << s1; else *outFileBBAA << s2 << "\t" << s3 << "\t" << s1; *outFileBBAA << "\t" << fabs(D3) << "\t" << D3_Z << "\t"; *outFileBBAA << ABBAtotal << "\t" << BABAtotal << "\t" << BBAAtotal << std::endl; } // Find Dmin: if (fabs(D1) <= fabs(D2) && fabs(D1) <= fabs(D3)) { // (P3 == S3) if (D1 >= 0) *outFileDmin << s1 << "\t" << s2 << "\t" << s3 << "\t" << D1 << "\t" << D1_Z << "\t" << std::endl; else *outFileDmin << s1 << "\t" << s2 << "\t" << s3 << "\t" << fabs(D1) << "\t" << D1_Z << "\t"<< std::endl; } else if (fabs(D2) <= fabs(D1) && fabs(D2) <= fabs(D3)) { // (P3 == S2) if (D2 >= 0) *outFileDmin << s1 << "\t" << s3 << "\t" << s2 << "\t" << D2 << "\t" << D2_Z << "\t"<< std::endl; else *outFileDmin << s3 << "\t" << s1 << "\t" << s2 << "\t" << fabs(D2) << "\t" << D2_Z 
<< "\t"<< std::endl; } else if (fabs(D3) <= fabs(D1) && fabs(D3) <= fabs(D2)) { // (P3 == S1) if (D3 >= 0) *outFileDmin << s3 << "\t" << s2 << "\t" << s1 << "\t" << D3 << "\t" << D3_Z << "\t"<< std::endl; else *outFileDmin << s2 << "\t" << s3 << "\t" << s1 << "\t" << fabs(D3) << "\t" << D3_Z << "\t" << std::endl;; } BBAA_local_Ds.clear(); ABBA_local_Ds.clear(); BABA_local_Ds.clear(); BBAAtotal = 0; ABBAtotal = 0; BABAtotal = 0; } while(!allDone); /* for (int i = 0; i != trios.size(); i++) { // // Get the standard error values: double D1stdErr = jackknive_std_err(regionDs[i][0]); double D2stdErr = jackknive_std_err(regionDs[i][1]); double D3stdErr = jackknive_std_err(regionDs[i][2]); // Get the D values //Dnums[i][0] = ABBAtotals[i] - BABAtotals[i]; Dnums[i][1] = ABBAtotals[i] - BBAAtotals[i]; Dnums[i][2] = BBAAtotals[i] - BABAtotals[i]; double Dnum1 = ABBAtotals[i] - BABAtotals[i]; // assert(Dnum1 == Dnums[i][0]); double Dnum2 = ABBAtotals[i] - BBAAtotals[i]; // assert(Dnum2 == Dnums[i][1]); double Dnum3 = BBAAtotals[i] - BABAtotals[i]; // assert(Dnum3 == Dnums[i][2]); // Ddenoms[i][0] = ABBAtotals[i] + BABAtotals[i]; Ddenoms[i][1] = ABBAtotals[i] + BBAAtotals[i]; Ddenoms[i][2] = BBAAtotals[i] + BABAtotals[i]; double Ddenom1 = ABBAtotals[i] + BABAtotals[i]; // assert(Ddenom1 == Ddenoms[i][0]); double Ddenom2 = ABBAtotals[i] + BBAAtotals[i]; // assert(Ddenom2 == Ddenoms[i][1]); double Ddenom3 = BBAAtotals[i] + BABAtotals[i]; // assert(Ddenom3 == Ddenoms[i][2]); double D1 = Dnum1/Ddenom1; double D2 = Dnum2/Ddenom2; double D3 = Dnum3/Ddenom3; // Get the Z-scores double D1_Z = abs(D1)/D1stdErr; double D2_Z = abs(D2)/D2stdErr; double D3_Z = abs(D3)/D3stdErr; // Find which topology is in agreement with the counts of the BBAA, BABA, and ABBA patterns if (BBAAtotals[i] >= BABAtotals[i] && BBAAtotals[i] >= ABBAtotals[i]) { if (D1 >= 0) *outFileBBAA << trios[i][0] << "\t" << trios[i][1] << "\t" << trios[i][2]; else *outFileBBAA << trios[i][1] << "\t" << trios[i][0] << "\t" 
<< trios[i][2]; *outFileBBAA << "\t" << abs(D1) << "\t" << D1_Z << "\t"; *outFileBBAA << BBAAtotals[i] << "\t" << BABAtotals[i] << "\t" << ABBAtotals[i] << std::endl; } else if (BABAtotals[i] >= BBAAtotals[i] && BABAtotals[i] >= ABBAtotals[i]) { if (D2 >= 0) *outFileBBAA << trios[i][0] << "\t" << trios[i][2] << "\t" << trios[i][1]; else *outFileBBAA << trios[i][2] << "\t" << trios[i][0] << "\t" << trios[i][1]; *outFileBBAA << "\t" << abs(D2) << "\t" << D2_Z << "\t"; *outFileBBAA << BABAtotals[i] << "\t" << BBAAtotals[i] << "\t" << ABBAtotals[i] << std::endl; } else if (ABBAtotals[i] >= BBAAtotals[i] && ABBAtotals[i] >= BABAtotals[i]) { if (D3 >= 0) *outFileBBAA << trios[i][2] << "\t" << trios[i][1] << "\t" << trios[i][0]; else *outFileBBAA << trios[i][1] << "\t" << trios[i][2] << "\t" << trios[i][0]; *outFileBBAA << "\t" << abs(D3) << "\t" << D3_Z << "\t"; *outFileBBAA << ABBAtotals[i] << "\t" << BABAtotals[i] << "\t" << BBAAtotals[i] << std::endl; } // Find Dmin: if (abs(D1) <= abs(D2) && abs(D1) <= abs(D3)) { // (P3 == S3) if (D1 >= 0) *outFileDmin << trios[i][0] << "\t" << trios[i][1] << "\t" << trios[i][2] << "\t" << D1 << "\t" << D1_Z << "\t" << std::endl; else *outFileDmin << trios[i][1] << "\t" << trios[i][0] << "\t" << trios[i][2] << "\t" << abs(D1) << "\t" << D1_Z << "\t"<< std::endl; // if (BBAAtotals[i] < BABAtotals[i] || BBAAtotals[i] < ABBAtotals[i]) // std::cerr << "\t" << "WARNING: Dmin tree different from DAF tree" << std::endl; } else if (abs(D2) <= abs(D1) && abs(D2) <= abs(D3)) { // (P3 == S2) if (D2 >= 0) *outFileDmin << trios[i][0] << "\t" << trios[i][2] << "\t" << trios[i][1] << "\t" << D2 << "\t" << D2_Z << "\t"<< std::endl; else *outFileDmin << trios[i][2] << "\t" << trios[i][0] << "\t" << trios[i][1] << "\t" << abs(D2) << "\t" << D2_Z << "\t"<< std::endl; // if (BABAtotals[i] < BBAAtotals[i] || BABAtotals[i] < ABBAtotals[i]) // std::cerr << "\t" << "WARNING: Dmin tree different from DAF tree" << std::endl; } else if (abs(D3) <= abs(D1) && 
abs(D3) <= abs(D2)) { // (P3 == S1) if (D3 >= 0) *outFileDmin << trios[i][2] << "\t" << trios[i][1] << "\t" << trios[i][0] << "\t" << D3 << "\t" << D3_Z << "\t"<< std::endl; else *outFileDmin << trios[i][1] << "\t" << trios[i][2] << "\t" << trios[i][0] << "\t" << abs(D3) << "\t" << D3_Z << "\t" << std::endl;; // if (ABBAtotals[i] < BBAAtotals[i] || ABBAtotals[i] < BABAtotals[i]) // std::cerr << "\t" << "WARNING: Dmin tree different from DAF tree" << std::endl; } // Output a simple file that can be used for combining multiple local runs: *outFileCombine << trios[i][0] << "\t" << trios[i][1] << "\t" << trios[i][2] << "\t" << BBAAtotals[i] << "\t" << BABAtotals[i] << "\t" << ABBAtotals[i] << std::endl; print_vector(regionDs[i][0], *outFileCombineStdErr, ',', false); *outFileCombineStdErr << "\t"; print_vector(regionDs[i][1], *outFileCombineStdErr, ',', false); *outFileCombineStdErr << "\t"; print_vector(regionDs[i][2], *outFileCombineStdErr, ',',false); *outFileCombineStdErr << std::endl; //std::cerr << trios[i][0] << "\t" << trios[i][1] << "\t" << trios[i][2] << "\t" << D1 << "\t" << D2 << "\t" << D3 << "\t" << BBAAtotals[i] << "\t" << BABAtotals[i] << "\t" << ABBAtotals[i] << std::endl; } */ return 0; }
std::string parseDupHits(const StringVector& hitsFilenames, const std::string& out_prefix) { // Load the suffix array index and the reverse suffix array index // Note these are not the full suffix arrays SuffixArray* pFwdSAI = new SuffixArray(opt::prefix + SAI_EXT); SuffixArray* pRevSAI = new SuffixArray(opt::prefix + RSAI_EXT); // Load the read table to look up the lengths of the reads and their ids. // When rmduping a set of reads, the ReadInfoTable can actually be larger than the // BWT if the names of the reads are very long. Previously, when two reads // are duplicated, the read with the lexographically lower read name was chosen // to be kept. To save memory here, we break ties using the index in the ReadInfoTable // instead. This allows us to avoid loading the read names. ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pFwdSAI->getNumStrings(), RIO_NUMERICID); std::string outFile = out_prefix + ".fa"; std::string dupFile = out_prefix + ".dups.fa"; std::ostream* pWriter = createWriter(outFile); std::ostream* pDupWriter = createWriter(dupFile); size_t substringRemoved = 0; size_t identicalRemoved = 0; size_t kept = 0; size_t buffer_size = SequenceProcessFramework::BUFFER_SIZE; // The reads must be output in their original ordering. // The hits are in the blocks of buffer_size items. We read // buffer_size items from the first hits file, then buffer_size // from the second and so on until all the hits have been processed. 
size_t num_files = hitsFilenames.size(); std::vector<std::istream*> reader_vec(num_files, 0); for(size_t i = 0; i < num_files; ++i) { std::cout << "Opening " << hitsFilenames[i] << "\n"; reader_vec[i] = createReader(hitsFilenames[i]); } bool done = false; size_t currReaderIdx = 0; size_t numRead = 0; size_t numReadersDone = 0; std::string line; while(!done) { // Parse a line from the current file bool valid = static_cast<bool>(getline(*reader_vec[currReaderIdx], line)); ++numRead; // Deal with switching the active reader and the end of files if(!valid || numRead == buffer_size) { // Switch the reader currReaderIdx = (currReaderIdx + 1) % num_files; numRead = 0; // Break once all the readers are invalid if(!valid) { ++numReadersDone; if(numReadersDone == num_files) { done = true; break; } } } // Parse the data if(valid) { std::string id; std::string sequence; std::string hitsStr; size_t readIdx; size_t numCopies; bool isSubstring; std::stringstream parser(line); parser >> id; parser >> sequence; getline(parser, hitsStr); OverlapVector ov; OverlapCommon::parseHitsString(hitsStr, pRIT, pRIT, pFwdSAI, pRevSAI, true, readIdx, numCopies, ov, isSubstring); bool isContained = false; if(isSubstring) { ++substringRemoved; isContained = true; } else { for(OverlapVector::iterator iter = ov.begin(); iter != ov.end(); ++iter) { if(iter->isContainment() && iter->getContainedIdx() == 0) { // This read is contained by some other read ++identicalRemoved; isContained = true; break; } } } SeqItem item = {id, sequence}; std::stringstream meta; meta << id << " NumDuplicates=" << numCopies; if(isContained) { // The read's index in the sequence data base // is needed when removing it from the FM-index. // In the output fasta, we set the reads ID to be the index // and record its old id in the fasta header. 
std::stringstream newID; newID << item.id << ",seqrank=" << readIdx; item.id = newID.str(); // Write some metadata with the fasta record item.write(*pDupWriter, meta.str()); } else { ++kept; // Write the read item.write(*pWriter, meta.str()); } } } for(size_t i = 0; i < num_files; ++i) { delete reader_vec[i]; unlink(hitsFilenames[i].c_str()); } printf("[%s] Removed %zu substring reads\n", PROGRAM_IDENT, substringRemoved); printf("[%s] Removed %zu identical reads\n", PROGRAM_IDENT, identicalRemoved); printf("[%s] Kept %zu reads\n", PROGRAM_IDENT, kept); // Delete allocated data delete pFwdSAI; delete pRevSAI; delete pRIT; delete pWriter; delete pDupWriter; return dupFile; }
void count_main(int argc, char **argv) { Timer *total_timer = new Timer("count_main"); parse_count_options(argc, argv); std::istream* probes_stream; probes_stream = (opt::probes_file == "-") ? &std::cin : createReader(opt::probes_file); /* Load probes */ ss_probes probes; int probe_length; std::cerr << ">> Loading probes " << std::endl; probes.set_empty_key("-"); load_probes(probes, probes_stream); std::cerr << ">> # of probes (RC included): " << probes.size() << std::endl; probe_length = probes.begin()->first.length(); /* Process reads */ std::string **buffer; /* request mem for buffer of reads */ buffer = (std::string **) calloc(sizeof(std::string *), BUFFER_SIZE); int n_rr = 0; // number of reads read int total = 0; // total number of reads processed SeqReader reader(opt::reads_file, SRF_NO_VALIDATION); while((n_rr = load_to_buffer(&reader, buffer)) != 0) { Timer *pr_timer = new Timer("Processing reads"); std::cerr << ">> Computing screen on " << n_rr << " reads" << std::endl; total += n_rr; pthread_t *tid; pthread_attr_t attr; thread_data *data; unsigned int j; // to iterate over num of threads pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); data = (thread_data*) calloc(opt::n_threads, sizeof(thread_data)); tid = (pthread_t*) calloc(opt::n_threads, sizeof(pthread_t)); /* create/init mutex */ pthread_mutex_t mutex; pthread_mutex_init(&mutex, NULL); for (j=0; j<opt::n_threads; ++j) { data[j].n_reads = n_rr; data[j].tid = j; data[j].buffer = buffer; data[j].probes = &probes; data[j].p_len = probe_length; data[j].mutex = &mutex; data[j].n_threads = opt::n_threads; pthread_create(&tid[j], &attr, worker, data+j); } /* join-start threads */ for (j=0; j<opt::n_threads; ++j) pthread_join(tid[j], 0); std::cerr << ">> Done computing " << n_rr << " reads" << std::endl; std::cerr << ">> Total # of reads processed so far: " << total << std::endl; free(data); free(tid); delete pr_timer; // force dump cpu time. 
} dump_results(probes); free_up_probes(probes); delete probes_stream; for (int j=0; j<BUFFER_SIZE; j++) delete buffer[j]; free(buffer); delete total_timer; }
// Creates a reader for the entity identified by sysId/pubId, resolving the
// system id against baseURI (or, when baseURI is null or empty, against the
// system id of the last external entity).
//
// The installed entity handler, if any, may first expand the system id and
// then supply the input source. When it supplies none, a URLInputSource is
// built if the id forms a non-relative URL; otherwise a LocalFileInputSource
// is used, unless standard URI conformance is enabled, in which case a
// MalformedURLException is thrown.
//
// srcToFill receives the input source that was used. Returns the new reader,
// tagged with the next reader number, or 0 if the delegated overload
// produced none.
XMLReader* ReaderMgr::createReader( const   XMLCh* const        baseURI
                                    , const XMLCh* const        sysId
                                    , const XMLCh* const        pubId
                                    , const bool                xmlDecl
                                    , const XMLReader::RefFrom  refFrom
                                    , const XMLReader::Types    type
                                    , const XMLReader::Sources  source
                                    ,       InputSource*&       srcToFill
                                    , const bool                calcSrcOfs)
{
    // Buffer that ends up holding the (possibly expanded) system id
    XMLBuffer expandedSysId(1023, fMemoryManager);

    // The entity handler gets the first chance to expand the system id;
    // without a handler, or when it declines, the id is used verbatim.
    const bool expandedByHandler =
        fEntityHandler && fEntityHandler->expandSystemId(sysId, expandedSysId);
    if (!expandedByHandler)
        expandedSysId.set(sysId);

    // Ask the entity resolver interface for an input source
    srcToFill = 0;
    if (fEntityHandler)
    {
        XMLResourceIdentifier resourceIdentifier(XMLResourceIdentifier::ExternalEntity,
                                                 expandedSysId.getRawBuffer(),
                                                 XMLUni::fgZeroLenString,
                                                 pubId,
                                                 baseURI);
        srcToFill = fEntityHandler->resolveEntity(&resourceIdentifier);
    }

    // The resolver supplied nothing, so build a source ourselves.
    if (!srcToFill)
    {
        LastExtEntityInfo lastInfo;
        getLastExtEntityInfo(lastInfo);

        XMLURL candidateUrl(fMemoryManager);
        const bool isAbsoluteUrl =
            candidateUrl.setURL((!baseURI || !*baseURI) ? lastInfo.systemId : baseURI,
                                expandedSysId.getRawBuffer(),
                                candidateUrl)
            && !candidateUrl.isRelative();

        if (isAbsoluteUrl)
        {
            if (fStandardUriConformant && candidateUrl.hasInvalidChar())
            {
                ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager);
            }
            srcToFill = new (fMemoryManager) URLInputSource(candidateUrl, fMemoryManager);
        }
        else if (fStandardUriConformant)
        {
            // Not a usable URL and local file names are not permitted.
            ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager);
        }
        else
        {
            // Not a usable URL: treat it as a local file name.
            srcToFill = new (fMemoryManager) LocalFileInputSource
            (
                lastInfo.systemId
                , expandedSysId.getRawBuffer()
                , fMemoryManager
            );
        }
    }

    // Guard the input source while the reader is being created
    Janitor<InputSource> sourceGuard(srcToFill);

    // Delegate to the overload that works on an input source.
    XMLReader* newReader = createReader
    (
        *srcToFill
        , xmlDecl
        , refFrom
        , type
        , source
        , calcSrcOfs
    );

    // The call returned, so the guard can let go of the source.
    sourceGuard.orphan();

    // No reader was produced: report that to the caller.
    if (!newReader)
        return 0;

    // Assign the next available reader number and hand the reader back.
    newReader->setReaderNum(fNextReaderNum++);
    return newReader;
}
// Creates a reader for the entity identified by sysId/pubId.
//
// The system id is first normalized by removing every 0xFFFF character, then
// optionally expanded by the entity handler, which is also asked to resolve
// the entity into an input source. When the handler supplies no source:
//   - 0 is returned if disableDefaultEntityResolution is set;
//   - otherwise a URLInputSource is built if the id, taken relative to the
//     last external entity's system id, forms a non-relative URL;
//   - otherwise a LocalFileInputSource is used, unless standard URI
//     conformance is enabled, in which case a MalformedURLException is thrown.
//
// srcToFill receives the input source that was used. Returns the new reader,
// tagged with the next reader number, or 0 if no reader was produced.
XMLReader* ReaderMgr::createReader( const   XMLCh* const        sysId
                                    , const XMLCh* const        pubId
                                    , const bool                xmlDecl
                                    , const XMLReader::RefFrom  refFrom
                                    , const XMLReader::Types    type
                                    , const XMLReader::Sources  source
                                    ,       InputSource*&       srcToFill
                                    , const bool                calcSrcOfs
                                    ,       XMLSize_t           lowWaterMark
                                    , const bool                disableDefaultEntityResolution)
{
    // Normalize sysId
    XMLBuffer normalizedSysId(1023, fMemoryManager);
    if (sysId)
        XMLString::removeChar(sysId, 0xFFFF, normalizedSysId);
    const XMLCh* normalizedURI = normalizedSysId.getRawBuffer();

    // Buffer that ends up holding the (possibly expanded) system id
    XMLBuffer expSysId(1023, fMemoryManager);

    // The entity handler gets the first chance to expand the system id;
    // without a handler, or when it declines, the normalized id is used.
    const bool expandedByHandler =
        fEntityHandler && fEntityHandler->expandSystemId(normalizedURI, expSysId);
    if (!expandedByHandler)
        expSysId.set(normalizedURI);

    // Ask the entity resolver interface for an input source
    srcToFill = 0;
    if (fEntityHandler)
    {
        LastExtEntityInfo lastInfo;
        getLastExtEntityInfo(lastInfo);
        XMLResourceIdentifier resourceIdentifier(XMLResourceIdentifier::ExternalEntity,
                                                 expSysId.getRawBuffer(),
                                                 XMLUni::fgZeroLenString,
                                                 pubId,
                                                 lastInfo.systemId,
                                                 this);
        srcToFill = fEntityHandler->resolveEntity(&resourceIdentifier);
    }

    // The resolver supplied nothing, so build a source ourselves.
    if (!srcToFill)
    {
        if (disableDefaultEntityResolution)
            return 0;

        LastExtEntityInfo lastInfo;
        getLastExtEntityInfo(lastInfo);

// Keep this #if 0 block as it was exposing a threading problem on AIX.
// Got rid of the problem by changing XMLURL to not throw malformedurl
// exceptions.
#if 0
        try
        {
            XMLURL urlTmp(lastInfo.systemId, expSysId.getRawBuffer(), fMemoryManager);
            if (urlTmp.isRelative())
            {
                ThrowXMLwithMemMgr
                (
                    MalformedURLException
                    , XMLExcepts::URL_NoProtocolPresent
                    , fMemoryManager
                );
            }
            else {
                if (fStandardUriConformant && urlTmp.hasInvalidChar())
                    ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager);
                srcToFill = new (fMemoryManager) URLInputSource(urlTmp, fMemoryManager);
            }
        }

        catch(const MalformedURLException& e)
        {
            // Its not a URL, so lets assume its a local file name if non-standard uri is allowed
            if (!fStandardUriConformant)
                srcToFill = new (fMemoryManager) LocalFileInputSource
                (
                    lastInfo.systemId
                    , expSysId.getRawBuffer()
                    , fMemoryManager
                );
            else
                throw e;
        }
#else
        XMLURL candidateUrl(fMemoryManager);
        const bool isAbsoluteUrl =
            candidateUrl.setURL(lastInfo.systemId, expSysId.getRawBuffer(), candidateUrl)
            && !candidateUrl.isRelative();

        if (isAbsoluteUrl)
        {
            if (fStandardUriConformant && candidateUrl.hasInvalidChar())
            {
                ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager);
            }
            srcToFill = new (fMemoryManager) URLInputSource(candidateUrl, fMemoryManager);
        }
        else if (fStandardUriConformant)
        {
            // Not a usable URL and local file names are not permitted.
            ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager);
        }
        else
        {
            // Not a usable URL: treat it as a local file name, using the
            // URI-normalized form of the expanded system id.
            XMLBuffer resolvedSysId(1023, fMemoryManager);
            XMLUri::normalizeURI(expSysId.getRawBuffer(), resolvedSysId);
            srcToFill = new (fMemoryManager) LocalFileInputSource
            (
                lastInfo.systemId
                , resolvedSysId.getRawBuffer()
                , fMemoryManager
            );
        }
#endif
    }

    // Guard the input source while the reader is being created
    Janitor<InputSource> sourceGuard(srcToFill);

    // Delegate to the overload that works on an input source.
    XMLReader* newReader = createReader
    (
        *srcToFill
        , xmlDecl
        , refFrom
        , type
        , source
        , calcSrcOfs
        , lowWaterMark
    );

    // The call returned, so the guard can let go of the source.
    sourceGuard.orphan();

    // No reader was produced: report that to the caller.
    if (!newReader)
        return 0;

    // Assign the next available reader number and hand the reader back.
    newReader->setReaderNum(fNextReaderNum++);
    return newReader;
}
// Builds a StringGraph from the ASQG file `filename`.
//
// Records must appear in the order header(s), vertices, edges; a record
// found out of order prints an error and terminates the process. Header tags
// configure the graph (minimum overlap, error rate, containment and
// transitive flags), vertex records become vertices, and edge records whose
// overlap length is at least `minOverlap` are turned into edges, with
// `allowContainments` and `maxEdges` forwarded to the edge-creation helper.
//
// After loading, the super-repeat, duplicate-edge and graph-stats visitors
// are run over the graph. Returns the newly allocated graph.
StringGraph* SGUtil::loadASQG(const std::string& filename, const unsigned int minOverlap,
                              bool allowContainments, size_t maxEdges)
{
    // Initialize graph
    StringGraph* pGraph = new StringGraph;

    std::istream* pReader = createReader(filename);

    // Parsing stage: 0 = header, 1 = vertices, 2 = edges
    int stage = 0;
    // Zero-based index of the record currently being parsed
    int line = 0;
    std::string recordLine;

    for(; getline(*pReader, recordLine); ++line)
    {
        const ASQG::RecordType recordType = ASQG::getRecordType(recordLine);

        if(recordType == ASQG::RT_HEADER)
        {
            if(stage != 0)
            {
                std::cerr << "Error: Unexpected header record found at line " << line << "\n";
                exit(EXIT_FAILURE);
            }

            ASQG::HeaderRecord headerRecord(recordLine);

            // Minimum overlap defaults to 0 when the tag is absent
            const SQG::IntTag& overlapTag = headerRecord.getOverlapTag();
            if(!overlapTag.isInitialized())
                pGraph->setMinOverlap(0);
            else
                pGraph->setMinOverlap(overlapTag.get());

            // The error rate is only set when the tag is present
            const SQG::FloatTag& errorRateTag = headerRecord.getErrorRateTag();
            if(errorRateTag.isInitialized())
                pGraph->setErrorRate(errorRateTag.get());

            // conservatively assume containments are present
            const SQG::IntTag& containmentTag = headerRecord.getContainmentTag();
            if(!containmentTag.isInitialized())
                pGraph->setContainmentFlag(true);
            else
                pGraph->setContainmentFlag(containmentTag.get());

            const SQG::IntTag& transitiveTag = headerRecord.getTransitiveTag();
            if(transitiveTag.isInitialized())
            {
                pGraph->setTransitiveFlag(transitiveTag.get());
            }
            else
            {
                std::cerr << "Warning: ASQG does not have transitive tag\n";
                pGraph->setTransitiveFlag(true);
            }
        }
        else if(recordType == ASQG::RT_VERTEX)
        {
            // progress the stage if we are done the header
            if(stage == 0)
                stage = 1;

            if(stage != 1)
            {
                std::cerr << "Error: Unexpected vertex record found at line " << line << "\n";
                exit(EXIT_FAILURE);
            }

            ASQG::VertexRecord vertexRecord(recordLine);
            const SQG::IntTag& substringTag = vertexRecord.getSubstringTag();

            Vertex* pVertex = new Vertex(vertexRecord.getID(), vertexRecord.getSeq());
            const bool isSubstring = substringTag.isInitialized() && substringTag.get() == 1;
            if(isSubstring)
            {
                // Vertex is a substring of some other vertex, mark it as contained
                pVertex->setContained(true);
                pGraph->setContainmentFlag(true);
            }
            pGraph->addVertex(pVertex);
        }
        else if(recordType == ASQG::RT_EDGE)
        {
            // The first edge record ends the vertex stage
            if(stage == 1)
                stage = 2;

            if(stage != 2)
            {
                std::cerr << "Error: Unexpected edge record found at line " << line << "\n";
                exit(EXIT_FAILURE);
            }

            ASQG::EdgeRecord edgeRecord(recordLine);
            const Overlap& overlap = edgeRecord.getOverlap();

            // Add the edge to the graph
            if(overlap.match.getMinOverlapLength() >= (int)minOverlap)
                SGAlgorithms::createEdgesFromOverlap(pGraph, overlap, allowContainments, maxEdges);
        }
    }

    // Completely delete the edges for all nodes that were marked as super-repetitive in the graph
    SGSuperRepeatVisitor superRepeatVisitor;
    pGraph->visit(superRepeatVisitor);

    // Remove any duplicate edges
    SGDuplicateVisitor duplicateVisitor;
    pGraph->visit(duplicateVisitor);

    SGGraphStatsVisitor statsVisitor;
    pGraph->visit(statsVisitor);

    // Identical/substring vertex removal is intentionally not performed here.

    delete pReader;
    return pGraph;
}