// Constructs a binary BWT reader for `filename`.
// The counters start at zero; the underlying stream is obtained from
// createReader() in binary mode, after which the stage is set to IOS_HEADER.
BWTReaderBinary::BWTReaderBinary(const std::string& filename)
    : m_stage(IOS_NONE),
      m_numRunsOnDisk(0),
      m_numRunsRead(0)
{
    // Open first: the stage only advances once the stream has been created.
    m_pReader = createReader(filename, std::ios::binary);
    m_stage = IOS_HEADER;
}
示例#2
0
  /**
   * Creates a reader for @p arg using automatic format detection.
   *
   * Every supported format is tried in a fixed order through the
   * two-argument createReader() overload. The first attempt that does not
   * throw ObservationIOError wins and its reader is returned immediately.
   *
   * @param arg  Forwarded unchanged to every format-specific attempt (the
   *             failure message refers to it as a filename).
   * @return The reader produced by the first successful attempt.
   * @throws ObservationIOError when every attempt fails. The message starts
   *         with the generic hint and is followed by the error reported by
   *         each individual format attempt.
   */
  std::auto_ptr<ObservationReader>
  ObservationReader::createReader(const std::string& arg)
  {
    NUKLEI_TRACE_BEGIN();
    std::auto_ptr<ObservationReader> reader;

    // Accumulates what() of each failed attempt; appended to the final error
    // so the caller can see why every format was rejected.
    std::string errorsCat = std::string("Error in ObservationReader::createReader.") +
      "\nErrors at each format attempt were:";

    try {
      reader = createReader(arg, Observation::COVIS3D);
      return reader;
    } catch (ObservationIOError &e) {
      errorsCat += "\n" + std::string(e.what());
    }

    try {
      reader = createReader(arg, Observation::NUKLEI);
      return reader;
    } catch (ObservationIOError &e) {
      errorsCat += "\n" + std::string(e.what());
    }

    try {
      reader = createReader(arg, Observation::IIS);
      return reader;
    } catch (ObservationIOError &e) {
      errorsCat += "\n" + std::string(e.what());
    }

    try {
      reader = createReader(arg, Observation::OSUTXT);
      return reader;
    } catch (ObservationIOError &e) {
      errorsCat += "\n" + std::string(e.what());
    }

    try {
      reader = createReader(arg, Observation::PCD);
      return reader;
    } catch (ObservationIOError &e) {
      errorsCat += "\n" + std::string(e.what());
    }

    try {
      reader = createReader(arg, Observation::PLY);
      return reader;
    } catch (ObservationIOError &e) {
      errorsCat += "\n" + std::string(e.what());
    }

    try {
      reader = createReader(arg, Observation::RIF);
      return reader;
    } catch (ObservationIOError &e) {
      errorsCat += "\n" + std::string(e.what());
    }

    try {
      reader = createReader(arg, Observation::SERIAL);
      return reader;
    } catch (ObservationIOError &e) {
      errorsCat += "\n" + std::string(e.what());
    }

    try {
      reader = createReader(arg, Observation::CRD);
      return reader;
    } catch (ObservationIOError &e) {
      errorsCat += "\n" + std::string(e.what());
    }

    try {
      reader = createReader(arg, Observation::OFF);
      return reader;
    } catch (ObservationIOError &e) {
      errorsCat += "\n" + std::string(e.what());
    }

    try {
      reader = createReader(arg, Observation::BUILTINVTK);
      return reader;
    } catch (ObservationIOError &e) {
      errorsCat += "\n" + std::string(e.what());
    }

    try {
      reader = createReader(arg, Observation::TXT);
      return reader;
    } catch (ObservationIOError &e) {
      errorsCat += "\n" + std::string(e.what());
    }

    // No format accepted the input: keep the original hint as the prefix and
    // attach the per-format diagnostics that were collected above.
    throw ObservationIOError
      ("Error loading observations with automatic type detection. "
       "Maybe the filename `" + arg + "' is incorrect. "
       "Else please try again with a defined type.\n" + errorsCat);
    // Unreachable; kept so every path syntactically ends in a return.
    return reader;
    NUKLEI_TRACE_END();
  }
示例#3
0
int DminCombineMain(int argc, char** argv) {
    parseDminCombineOptions(argc, argv);
    string line; // for reading the input files
    
    
    std::vector<std::istream*> dminstdErrFiles; std::vector<std::istream*> dminBBAAscoreFiles;
    for (int i = 0; i < opt::dminFiles.size(); i++) {
        std::istream* dminBBAAscoreFile;
        if (file_exists(opt::dminFiles[i] + "_combine.txt")) {
            dminBBAAscoreFile = createReader((opt::dminFiles[i] + "_combine.txt").c_str());
        } else if(file_exists(opt::dminFiles[i] + "_combine.txt.gz")) {
            dminBBAAscoreFile = createReader((opt::dminFiles[i] + "_combine.txt.gz").c_str());
        } else {
            std::cerr << "Can't fine the file: " << opt::dminFiles[i] + "_combine.txt" << " or " << opt::dminFiles[i] + "_combine.txt.gz" << std::endl;
        }
        dminBBAAscoreFiles.push_back(dminBBAAscoreFile);
        std::istream* dminstdErrFile;
        if (file_exists(opt::dminFiles[i] + "_combine_stderr.txt")) {
            dminstdErrFile = createReader((opt::dminFiles[i] + "_combine_stderr.txt").c_str());
        } else if(file_exists(opt::dminFiles[i] + "_combine_stderr.txt.gz")) {
            dminstdErrFile = createReader((opt::dminFiles[i] + "_combine_stderr.txt.gz").c_str());
        } else {
            std::cerr << "Can't fine the file: " << opt::dminFiles[i] + "_combine_stderr.txt" << " or " << opt::dminFiles[i] + "_combine_stderr.txt.gz" << std::endl;
        }
        dminstdErrFiles.push_back(dminstdErrFile);
        std::cerr << "Reading file " << opt::dminFiles[i] << std::endl;
    }
    
    
    // Now get the standard error values
    std::ofstream* outFileBBAA = new std::ofstream(opt::runName + "_BBAA.txt"); std::ofstream* outFileDmin = new std::ofstream(opt::runName + "_Dmin.txt");
    std::vector<double> BBAA_local_Ds; std::vector<double> ABBA_local_Ds; std::vector<double> BABA_local_Ds;
    string s1; string s2; string s3;
    double BBAAtotal = 0; double ABBAtotal = 0; double BABAtotal = 0;
    bool allDone = false;
    int processedTriosNumber = 0;
    do {
        processedTriosNumber++;
        if (processedTriosNumber % 10000 == 0) {
            //durationOverall = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
            std::cerr << "Processed " << processedTriosNumber << " trios" << std::endl;
            //std::cerr << "GettingCounts " << durationGettingCounts << " calculation " << durationCalculation << "secs" << std::endl;
        }
        if (opt::subsetStart != -1) {
            if (processedTriosNumber < opt::subsetStart) {
                for (int i = 0; i < dminBBAAscoreFiles.size(); i++) { getline(*dminBBAAscoreFiles[i], line); }
                for (int i = 0; i < dminstdErrFiles.size(); i++) { getline(*dminstdErrFiles[i], line); }
                continue;
            }
            if (processedTriosNumber > (opt::subsetStart+opt::subsetLength)) {
                std::cerr << "DONE" << std::endl; break;
            }
        }
        for (int i = 0; i < dminBBAAscoreFiles.size(); i++) {
            if (getline(*dminBBAAscoreFiles[i], line)) {
                std::vector<string> patternCounts = split(line, '\t');
                assert(patternCounts.size() == 6);
                if (i == 0) {
                    s1 = patternCounts[0];
                    s2 = patternCounts[1];
                    s3 = patternCounts[2];
                } else {
                    assert(s1 == patternCounts[0]); assert(s2 == patternCounts[1]); assert(s3 == patternCounts[2]);
                }
                double BBAA = stringToDouble(patternCounts[3]);
                double BABA = stringToDouble(patternCounts[4]);
                double ABBA = stringToDouble(patternCounts[5]);
                BBAAtotal += BBAA; ABBAtotal += ABBA; BABAtotal += BABA;
            }
        }
        //std::cerr << "ABBAtotal = " << ABBAtotal << std::endl;
        //std::cerr << "BABAtotal = " << BABAtotal << std::endl;
        double Dnum1 = ABBAtotal - BABAtotal; // assert(Dnum1 == Dnums[i][0]);
        double Dnum2 = ABBAtotal - BBAAtotal; // assert(Dnum2 == Dnums[i][1]);
        double Dnum3 = BBAAtotal - BABAtotal; // assert(Dnum3 == Dnums[i][2]);
        double Ddenom1 = ABBAtotal + BABAtotal; // assert(Ddenom1 == Ddenoms[i][0]);
        double Ddenom2 = ABBAtotal + BBAAtotal; // assert(Ddenom2 == Ddenoms[i][1]);
        double Ddenom3 = BBAAtotal + BABAtotal; // assert(Ddenom3 == Ddenoms[i][2]);
        //std::cerr << "Dnum1 = " << Dnum1 << std::endl;
        //std::cerr << "Ddenom1 = " << Ddenom1 << std::endl;
        double D1 = Dnum1/Ddenom1; double D2 = Dnum2/Ddenom2; double D3 = Dnum3/Ddenom3;
        //std::cerr << "D1 = " << D1 << std::endl;
        for (int i = 0; i < dminstdErrFiles.size(); i++) {
            if (getline(*dminstdErrFiles[i], line)) {
                std::vector<string> localDs = split(line, '\t');
                //assert(localDs.size() == 3 || localDs.size() == 0);
                if (localDs.size() == 3) {
                    std::vector<string> BBAA_D_strings = split(localDs[0], ',');
                    std::vector<string> BABA_D_strings = split(localDs[1], ',');
                    std::vector<string> ABBA_D_strings = split(localDs[2], ',');
                    for (int j = 0; j < BBAA_D_strings.size(); j++) {
                        //std::cerr << "BBAA_D_strings[j] = " << BBAA_D_strings[j] << std::endl;
                        double thisBBAA_localD = stringToDouble(BBAA_D_strings[j]);
                        if (!std::isnan(thisBBAA_localD)) BBAA_local_Ds.push_back(thisBBAA_localD);
                       // std::cerr << "BABA_D_strings[j] = " << BABA_D_strings[j] << std::endl;
                        double thisBABA_localD = stringToDouble(BABA_D_strings[j]);
                        if (!std::isnan(thisBABA_localD)) BABA_local_Ds.push_back(thisBABA_localD);
                        //std::cerr << "ABBA_D_strings[j] = " << ABBA_D_strings[j] << std::endl;
                        double thisABBA_localD = stringToDouble(ABBA_D_strings[j]);
                        if (!std::isnan(thisABBA_localD)) ABBA_local_Ds.push_back(thisABBA_localD);
                    }
                } else {
                    print_vector(localDs,std::cerr);
                }
            }
        }
        if (BBAA_local_Ds.size() == 0 || BABA_local_Ds.size() == 0 || ABBA_local_Ds.size() == 0) { // no info to estimate the standard error; probably all lines have been processed
            allDone = true; break;
        }
       // std::cerr << "D1 = " << D1 << std::endl;
        //print_vector(BBAA_local_Ds, std::cerr);
        double BBAAstdErr = jackknive_std_err(BBAA_local_Ds);
        //print_vector(BABA_local_Ds, std::cerr);
        double BABAstdErr = jackknive_std_err(BABA_local_Ds);
        //print_vector(ABBA_local_Ds, std::cerr);
        //std::cerr << "D1 = " << D1 << std::endl;
        double ABBAstdErr = jackknive_std_err(ABBA_local_Ds);
        //std::cerr << "D1 = " << D1 << std::endl;
        //std::cerr << "BBAAstdErr" << BBAAstdErr << std::endl;
        double D1_Z = fabs(D1)/BBAAstdErr; double D2_Z = fabs(D2)/BABAstdErr;
        double D3_Z = fabs(D3)/ABBAstdErr;
        //std::cerr << "D1_Z = " << D1_Z << std::endl;
        if (s1 == "Altcal" && s2 == "Altshe" && s3 == "Asplep") {
            std::cerr << "D1_Z = " << D1_Z << std::endl;
            std::cerr << "BBAAstdErr = " << BBAAstdErr << std::endl;
            print_vector(BBAA_local_Ds, std::cerr);
            std::cerr << "ABBAstdErr = " << ABBAstdErr << std::endl;
            std::cerr << "BABAstdErr = " << BABAstdErr << std::endl;
            std::cerr << "ABBAtotal = " << ABBAtotal << std::endl;
            std::cerr << "BABAtotal = " << BABAtotal << std::endl;
            std::cerr << "BBAAtotal = " << BBAAtotal << std::endl;
        }

        
        // Find which topology is in agreement with the counts of the BBAA, BABA, and ABBA patterns
        if (BBAAtotal >= BABAtotal && BBAAtotal >= ABBAtotal) {
            if (D1 >= 0)
                *outFileBBAA << s1 << "\t" << s2 << "\t" << s3;
            else
                *outFileBBAA << s2 << "\t" << s1 << "\t" << s3;
            *outFileBBAA << "\t" << fabs(D1) << "\t" << D1_Z << "\t";
            *outFileBBAA << BBAAtotal << "\t" << BABAtotal << "\t" << ABBAtotal << std::endl;
        } else if (BABAtotal >= BBAAtotal && BABAtotal >= ABBAtotal) {
            if (D2 >= 0)
                *outFileBBAA << s1 << "\t" << s3 << "\t" << s2;
            else
                *outFileBBAA << s3 << "\t" << s1 << "\t" << s2;
            *outFileBBAA << "\t" << fabs(D2) << "\t" << D2_Z << "\t";
            *outFileBBAA << BABAtotal << "\t" << BBAAtotal << "\t" << ABBAtotal << std::endl;
        } else if (ABBAtotal >= BBAAtotal && ABBAtotal >= BABAtotal) {
            if (D3 >= 0)
                *outFileBBAA << s3 << "\t" << s2 << "\t" << s1;
            else
                *outFileBBAA << s2 << "\t" << s3 << "\t" << s1;
            *outFileBBAA << "\t" << fabs(D3) << "\t" << D3_Z << "\t";
            *outFileBBAA << ABBAtotal << "\t" << BABAtotal << "\t" << BBAAtotal << std::endl;
        }
        
        // Find Dmin:
        if (fabs(D1) <= fabs(D2) && fabs(D1) <= fabs(D3)) { // (P3 == S3)
            if (D1 >= 0)
                *outFileDmin << s1 << "\t" << s2 << "\t" << s3 << "\t" << D1 << "\t" << D1_Z << "\t" << std::endl;
            else
                *outFileDmin << s1 << "\t" << s2 << "\t" << s3 << "\t" << fabs(D1) << "\t" << D1_Z << "\t"<< std::endl;
        } else if (fabs(D2) <= fabs(D1) && fabs(D2) <= fabs(D3)) { // (P3 == S2)
            if (D2 >= 0)
                *outFileDmin << s1 << "\t" << s3 << "\t" << s2 << "\t" << D2 << "\t" << D2_Z << "\t"<< std::endl;
            else
                *outFileDmin << s3 << "\t" << s1 << "\t" << s2 << "\t" << fabs(D2) << "\t" << D2_Z << "\t"<< std::endl;
        } else if (fabs(D3) <= fabs(D1) && fabs(D3) <= fabs(D2)) { // (P3 == S1)
            if (D3 >= 0)
                *outFileDmin << s3 << "\t" << s2 << "\t" << s1 << "\t" << D3 << "\t" << D3_Z << "\t"<< std::endl;
            else
                *outFileDmin << s2 << "\t" << s3 << "\t" << s1 << "\t" << fabs(D3) << "\t" << D3_Z << "\t" << std::endl;;
        }
        
        BBAA_local_Ds.clear(); ABBA_local_Ds.clear(); BABA_local_Ds.clear();
        BBAAtotal = 0; ABBAtotal = 0; BABAtotal = 0;
    } while(!allDone);
  
    
    
    /*
    for (int i = 0; i != trios.size(); i++) { //
        // Get the standard error values:
        double D1stdErr = jackknive_std_err(regionDs[i][0]); double D2stdErr = jackknive_std_err(regionDs[i][1]);
        double D3stdErr = jackknive_std_err(regionDs[i][2]);
        // Get the D values
        //Dnums[i][0] = ABBAtotals[i] - BABAtotals[i]; Dnums[i][1] = ABBAtotals[i] - BBAAtotals[i]; Dnums[i][2] = BBAAtotals[i] - BABAtotals[i];
        double Dnum1 = ABBAtotals[i] - BABAtotals[i]; // assert(Dnum1 == Dnums[i][0]);
        double Dnum2 = ABBAtotals[i] - BBAAtotals[i]; // assert(Dnum2 == Dnums[i][1]);
        double Dnum3 = BBAAtotals[i] - BABAtotals[i]; // assert(Dnum3 == Dnums[i][2]);
        // Ddenoms[i][0] = ABBAtotals[i] + BABAtotals[i]; Ddenoms[i][1] = ABBAtotals[i] + BBAAtotals[i]; Ddenoms[i][2] = BBAAtotals[i] + BABAtotals[i];
        double Ddenom1 = ABBAtotals[i] + BABAtotals[i]; // assert(Ddenom1 == Ddenoms[i][0]);
        double Ddenom2 = ABBAtotals[i] + BBAAtotals[i]; // assert(Ddenom2 == Ddenoms[i][1]);
        double Ddenom3 = BBAAtotals[i] + BABAtotals[i]; // assert(Ddenom3 == Ddenoms[i][2]);
        double D1 = Dnum1/Ddenom1; double D2 = Dnum2/Ddenom2; double D3 = Dnum3/Ddenom3;
        // Get the Z-scores
        double D1_Z = abs(D1)/D1stdErr; double D2_Z = abs(D2)/D2stdErr;
        double D3_Z = abs(D3)/D3stdErr;
        
        
        // Find which topology is in agreement with the counts of the BBAA, BABA, and ABBA patterns
        if (BBAAtotals[i] >= BABAtotals[i] && BBAAtotals[i] >= ABBAtotals[i]) {
            if (D1 >= 0)
                *outFileBBAA << trios[i][0] << "\t" << trios[i][1] << "\t" << trios[i][2];
            else
                *outFileBBAA << trios[i][1] << "\t" << trios[i][0] << "\t" << trios[i][2];
            *outFileBBAA << "\t" << abs(D1) << "\t" << D1_Z << "\t";
            *outFileBBAA << BBAAtotals[i] << "\t" << BABAtotals[i] << "\t" << ABBAtotals[i] << std::endl;
        } else if (BABAtotals[i] >= BBAAtotals[i] && BABAtotals[i] >= ABBAtotals[i]) {
            if (D2 >= 0)
                *outFileBBAA << trios[i][0] << "\t" << trios[i][2] << "\t" << trios[i][1];
            else
                *outFileBBAA << trios[i][2] << "\t" << trios[i][0] << "\t" << trios[i][1];
            *outFileBBAA << "\t" << abs(D2) << "\t" << D2_Z << "\t";
            *outFileBBAA << BABAtotals[i] << "\t" << BBAAtotals[i] << "\t" << ABBAtotals[i] << std::endl;
        } else if (ABBAtotals[i] >= BBAAtotals[i] && ABBAtotals[i] >= BABAtotals[i]) {
            if (D3 >= 0)
                *outFileBBAA << trios[i][2] << "\t" << trios[i][1] << "\t" << trios[i][0];
            else
                *outFileBBAA << trios[i][1] << "\t" << trios[i][2] << "\t" << trios[i][0];
            *outFileBBAA << "\t" << abs(D3) << "\t" << D3_Z << "\t";
            *outFileBBAA << ABBAtotals[i] << "\t" << BABAtotals[i] << "\t" << BBAAtotals[i] << std::endl;
        }
        
        // Find Dmin:
        if (abs(D1) <= abs(D2) && abs(D1) <= abs(D3)) { // (P3 == S3)
            if (D1 >= 0)
                *outFileDmin << trios[i][0] << "\t" << trios[i][1] << "\t" << trios[i][2] << "\t" << D1 << "\t" << D1_Z << "\t" << std::endl;
            else
                *outFileDmin << trios[i][1] << "\t" << trios[i][0] << "\t" << trios[i][2] << "\t" << abs(D1) << "\t" << D1_Z << "\t"<< std::endl;
            // if (BBAAtotals[i] < BABAtotals[i] || BBAAtotals[i] < ABBAtotals[i])
            //     std::cerr << "\t" << "WARNING: Dmin tree different from DAF tree" << std::endl;
        } else if (abs(D2) <= abs(D1) && abs(D2) <= abs(D3)) { // (P3 == S2)
            if (D2 >= 0)
                *outFileDmin << trios[i][0] << "\t" << trios[i][2] << "\t" << trios[i][1] << "\t" << D2 << "\t" << D2_Z << "\t"<< std::endl;
            else
                *outFileDmin << trios[i][2] << "\t" << trios[i][0] << "\t" << trios[i][1] << "\t" << abs(D2) << "\t" << D2_Z << "\t"<< std::endl;
            // if (BABAtotals[i] < BBAAtotals[i] || BABAtotals[i] < ABBAtotals[i])
            //     std::cerr << "\t" << "WARNING: Dmin tree different from DAF tree" << std::endl;
        } else if (abs(D3) <= abs(D1) && abs(D3) <= abs(D2)) { // (P3 == S1)
            if (D3 >= 0)
                *outFileDmin << trios[i][2] << "\t" << trios[i][1] << "\t" << trios[i][0] << "\t" << D3 << "\t" << D3_Z << "\t"<< std::endl;
            else
                *outFileDmin << trios[i][1] << "\t" << trios[i][2] << "\t" << trios[i][0] << "\t" << abs(D3) << "\t" << D3_Z << "\t" << std::endl;;
            // if (ABBAtotals[i] < BBAAtotals[i] || ABBAtotals[i] < BABAtotals[i])
            //     std::cerr << "\t" << "WARNING: Dmin tree different from DAF tree" << std::endl;
        }
        
        // Output a simple file that can be used for combining multiple local runs:
        *outFileCombine << trios[i][0] << "\t" << trios[i][1] << "\t" << trios[i][2] << "\t" << BBAAtotals[i] << "\t" << BABAtotals[i] << "\t" << ABBAtotals[i] << std::endl;
        print_vector(regionDs[i][0], *outFileCombineStdErr, ',', false); *outFileCombineStdErr << "\t"; print_vector(regionDs[i][1], *outFileCombineStdErr, ',', false); *outFileCombineStdErr << "\t";
        print_vector(regionDs[i][2], *outFileCombineStdErr, ',',false); *outFileCombineStdErr << std::endl;
        
        //std::cerr << trios[i][0] << "\t" << trios[i][1] << "\t" << trios[i][2] << "\t" << D1 << "\t" << D2 << "\t" << D3 << "\t" << BBAAtotals[i] << "\t" << BABAtotals[i] << "\t" << ABBAtotals[i] << std::endl;
    }
     */
    return 0;
    
}
示例#4
0
文件: rmdup.cpp 项目: Milt0n/sga
// Splits the reads described by the hits files into kept reads and removed duplicates.
//
// The hits files are consumed round-robin in blocks of SequenceProcessFramework::BUFFER_SIZE
// lines. Reads flagged as substrings, or contained in another read, are written to
// "<out_prefix>.dups.fa"; all other reads are written to "<out_prefix>.fa". Every hits file
// is unlinked once processing is finished.
//
// hitsFilenames: the hits files to read; an empty list produces two empty outputs.
// out_prefix:    prefix of the two output fasta files.
// Returns the name of the duplicates file.
std::string parseDupHits(const StringVector& hitsFilenames, const std::string& out_prefix)
{
    // Load the suffix array index and the reverse suffix array index
    // Note these are not the full suffix arrays
    SuffixArray* pFwdSAI = new SuffixArray(opt::prefix + SAI_EXT);
    SuffixArray* pRevSAI = new SuffixArray(opt::prefix + RSAI_EXT);

    // Load the read table to look up the lengths of the reads and their ids.
    // When rmduping a set of reads, the ReadInfoTable can actually be larger than the
    // BWT if the names of the reads are very long. Previously, when two reads
    // are duplicated, the read with the lexographically lower read name was chosen
    // to be kept. To save memory here, we break ties using the index in the ReadInfoTable
    // instead. This allows us to avoid loading the read names.
    ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pFwdSAI->getNumStrings(), RIO_NUMERICID);

    std::string outFile = out_prefix + ".fa";
    std::string dupFile = out_prefix + ".dups.fa";
    std::ostream* pWriter = createWriter(outFile);
    std::ostream* pDupWriter = createWriter(dupFile);

    size_t substringRemoved = 0;
    size_t identicalRemoved = 0;
    size_t kept = 0;
    size_t buffer_size = SequenceProcessFramework::BUFFER_SIZE;

    // The reads must be output in their original ordering.
    // The hits are in the blocks of buffer_size items. We read
    // buffer_size items from the first hits file, then buffer_size
    // from the second and so on until all the hits have been processed.
    size_t num_files = hitsFilenames.size();
    std::vector<std::istream*> reader_vec(num_files, 0);

    for(size_t i = 0; i < num_files; ++i)
    {
        std::cout << "Opening " << hitsFilenames[i] << "\n";
        reader_vec[i] = createReader(hitsFilenames[i]);
    }

    size_t currReaderIdx = 0;
    size_t numRead = 0;
    size_t numReadersDone = 0;
    std::string line;

    // Stop once a failed read has been seen num_files times. Testing the counter in the
    // loop condition also makes an empty file list a no-op instead of indexing an empty
    // vector and computing a modulo by zero.
    while(numReadersDone < num_files)
    {
        // Parse a line from the current file
        const bool valid = static_cast<bool>(getline(*reader_vec[currReaderIdx], line));
        ++numRead;

        // Switch the active reader at the end of a block or at the end of a file
        if(!valid || numRead == buffer_size)
        {
            currReaderIdx = (currReaderIdx + 1) % num_files;
            numRead = 0;
        }

        if(!valid)
        {
            ++numReadersDone;
            continue;
        }

        // Parse the data
        std::string id;
        std::string sequence;
        std::string hitsStr;
        // Passed by reference to parseHitsString; initialised so they never hold
        // indeterminate values.
        size_t readIdx = 0;
        size_t numCopies = 0;
        bool isSubstring = false;

        std::stringstream parser(line);
        parser >> id;
        parser >> sequence;
        getline(parser, hitsStr);

        OverlapVector ov;
        OverlapCommon::parseHitsString(hitsStr, pRIT, pRIT, pFwdSAI, pRevSAI, true, readIdx, numCopies, ov, isSubstring);

        bool isContained = false;
        if(isSubstring)
        {
            ++substringRemoved;
            isContained = true;
        }
        else
        {
            for(OverlapVector::iterator iter = ov.begin(); iter != ov.end(); ++iter)
            {
                if(iter->isContainment() && iter->getContainedIdx() == 0)
                {
                    // This read is contained by some other read
                    ++identicalRemoved;
                    isContained = true;
                    break;
                }
            }
        }

        SeqItem item = {id, sequence};
        std::stringstream meta;
        meta << id << " NumDuplicates=" << numCopies;

        if(isContained)
        {
            // The read's index in the sequence data base
            // is needed when removing it from the FM-index.
            // In the output fasta, we set the reads ID to be the index
            // and record its old id in the fasta header.
            std::stringstream newID;
            newID << item.id << ",seqrank=" << readIdx;
            item.id = newID.str();

            // Write some metadata with the fasta record
            item.write(*pDupWriter, meta.str());
        }
        else
        {
            ++kept;
            // Write the read
            item.write(*pWriter, meta.str());
        }
    }

    // Close every hits file and remove it from disk
    for(size_t i = 0; i < num_files; ++i)
    {
        delete reader_vec[i];
        unlink(hitsFilenames[i].c_str());
    }


    printf("[%s] Removed %zu substring reads\n", PROGRAM_IDENT, substringRemoved);
    printf("[%s] Removed %zu identical reads\n", PROGRAM_IDENT, identicalRemoved);
    printf("[%s] Kept %zu reads\n", PROGRAM_IDENT, kept);

    // Delete allocated data
    delete pFwdSAI;
    delete pRevSAI;
    delete pRIT;
    delete pWriter;
    delete pDupWriter;

    return dupFile;
}
示例#5
0
文件: count.cpp 项目: drio/egrl
/*
 * Entry point of the "count" command.
 *
 * Parses the command line, loads the probes (from opt::probes_file, or from
 * standard input when that option is "-"), then reads opt::reads_file in
 * batches. Each batch is handed to opt::n_threads worker threads that share
 * the probe table, the read buffer and one mutex. Results are dumped once all
 * batches have been processed.
 */
void count_main(int argc, char **argv)
{
  Timer *total_timer = new Timer("count_main");
  parse_count_options(argc, argv);

  /* "-" selects std::cin, which is not owned here and must never be deleted */
  const bool probes_from_stdin = (opt::probes_file == "-");
  std::istream* probes_stream;
  probes_stream = probes_from_stdin ?
                  &std::cin : createReader(opt::probes_file);

  /* Load probes */
  ss_probes probes;
  int probe_length;
  std::cerr << ">> Loading probes " << std::endl;
  probes.set_empty_key("-");
  load_probes(probes, probes_stream);
  std::cerr << ">> # of probes (RC included): " << probes.size() << std::endl;
  /* NOTE(review): assumes at least one probe was loaded - confirm */
  probe_length = probes.begin()->first.length();

  /* Process reads */
  std::string **buffer;
  /* request mem for buffer of reads: BUFFER_SIZE zeroed pointer slots */
  buffer = (std::string **) calloc(BUFFER_SIZE, sizeof(std::string *));
  int n_rr  = 0;  // number of reads read
  int total = 0;  // total number of reads processed
  SeqReader reader(opt::reads_file, SRF_NO_VALIDATION);
  while((n_rr = load_to_buffer(&reader, buffer)) != 0) {
    Timer *pr_timer = new Timer("Processing reads");
    std::cerr << ">> Computing screen on " << n_rr << " reads" << std::endl;
    total += n_rr;
    pthread_t *tid;
    pthread_attr_t attr;
    thread_data *data;
    unsigned int j; // to iterate over num of threads
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
    data = (thread_data*) calloc(opt::n_threads, sizeof(thread_data));
    tid  = (pthread_t*) calloc(opt::n_threads, sizeof(pthread_t));
    /* create/init mutex */
    pthread_mutex_t mutex;
    pthread_mutex_init(&mutex, NULL);
    for (j=0; j<opt::n_threads; ++j) {
      data[j].n_reads   = n_rr;
      data[j].tid       = j;
      data[j].buffer    = buffer;
      data[j].probes    = &probes;
      data[j].p_len     = probe_length;
      data[j].mutex     = &mutex;
      data[j].n_threads = opt::n_threads;
      pthread_create(&tid[j], &attr, worker, data+j);
    }
    /* join-start threads */
    for (j=0; j<opt::n_threads; ++j) pthread_join(tid[j], 0);
    /* every worker has been joined: release this batch's attr and mutex */
    pthread_attr_destroy(&attr);
    pthread_mutex_destroy(&mutex);
    std::cerr << ">> Done computing " << n_rr << " reads" << std::endl;
    std::cerr << ">> Total # of reads processed so far: " << total << std::endl;
    free(data); free(tid);
    delete pr_timer; // force dump cpu time.
  }

  dump_results(probes);
  free_up_probes(probes);
  /* only a stream obtained from createReader() may be deleted */
  if (!probes_from_stdin) delete probes_stream;
  for (int j=0; j<BUFFER_SIZE; j++) delete buffer[j];
  free(buffer);
  delete total_timer;
}
示例#6
0
//
//  Creates a reader for the external entity identified by sysId/pubId.
//
//  The system id is first offered to the entity handler (if any) for
//  expansion, then the handler is asked to resolve the entity to an input
//  source. When no source comes back, one is built here: a URL source when
//  the id resolves to an absolute URL, otherwise a local file source (only
//  when strict URI conformance is off). The source is stored in srcToFill
//  and passed to the InputSource-based createReader overload.
//
//  Returns the new reader, numbered with the next reader number, or zero
//  when the other overload returns zero. Throws MalformedURLException when
//  strict URI conformance is on and the id is not an acceptable URL.
//
XMLReader* ReaderMgr::createReader( const   XMLCh* const        baseURI
                                    , const XMLCh* const        sysId
                                    , const XMLCh* const        pubId
                                    , const bool                xmlDecl
                                    , const XMLReader::RefFrom  refFrom
                                    , const XMLReader::Types    type
                                    , const XMLReader::Sources  source
                                    ,       InputSource*&       srcToFill
                                    , const bool                calcSrcOfs)
{
    // Expanded system id: the handler's expansion if it supplies one, else
    // the id exactly as given.
    XMLBuffer expSysId(1023, fMemoryManager);
    if (!fEntityHandler || !fEntityHandler->expandSystemId(sysId, expSysId))
        expSysId.set(sysId);

    // Give the entity resolver the first chance to supply an input source
    srcToFill = 0;
    if (fEntityHandler)
    {
        XMLResourceIdentifier resourceIdentifier(XMLResourceIdentifier::ExternalEntity,
                            expSysId.getRawBuffer(), XMLUni::fgZeroLenString, pubId, baseURI);
        srcToFill = fEntityHandler->resolveEntity(&resourceIdentifier);
    }

    // The resolver produced nothing, so build a source ourselves
    if (!srcToFill)
    {
        LastExtEntityInfo lastInfo;
        getLastExtEntityInfo(lastInfo);

        // An absent or empty base URI falls back to the last external
        // entity's system id.
        const bool haveBaseURI = baseURI && *baseURI;

        XMLURL urlTmp(fMemoryManager);
        const bool notAbsoluteUrl =
            !urlTmp.setURL(haveBaseURI ? baseURI : lastInfo.systemId, expSysId.getRawBuffer(), urlTmp)
            || urlTmp.isRelative();

        if (notAbsoluteUrl)
        {
            // Not usable as a URL: treat it as a local file unless strict
            // URI conformance forbids that.
            if (fStandardUriConformant)
                ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager);

            srcToFill = new (fMemoryManager) LocalFileInputSource
            (
                lastInfo.systemId
                , expSysId.getRawBuffer()
                , fMemoryManager
            );
        }
        else
        {
            if (fStandardUriConformant && urlTmp.hasInvalidChar())
                ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager);

            srcToFill = new (fMemoryManager) URLInputSource(urlTmp, fMemoryManager);
        }
    }

    // Guard the input source with a janitor while the reader is created
    Janitor<InputSource> janSrc(srcToFill);

    // Delegate to the overload that works on an existing input source
    XMLReader* retVal = createReader(*srcToFill, xmlDecl, refFrom, type, source, calcSrcOfs);

    // The call returned, so the janitor lets go of the source again
    janSrc.orphan();

    // A reader that was actually created gets the next reader number; a
    // zero result is handed back unchanged.
    if (retVal)
        retVal->setReaderNum(fNextReaderNum++);
    return retVal;
}
示例#7
0
XMLReader* ReaderMgr::createReader( const   XMLCh* const        sysId
                                    , const XMLCh* const        pubId
                                    , const bool                xmlDecl
                                    , const XMLReader::RefFrom  refFrom
                                    , const XMLReader::Types    type
                                    , const XMLReader::Sources  source
                                    ,       InputSource*&       srcToFill
                                    , const bool                calcSrcOfs
                                    ,       XMLSize_t           lowWaterMark
                                    , const bool                disableDefaultEntityResolution)
{
    //Normalize sysId
    XMLBuffer normalizedSysId(1023, fMemoryManager);
    if(sysId)
        XMLString::removeChar(sysId, 0xFFFF, normalizedSysId);
    const XMLCh* normalizedURI = normalizedSysId.getRawBuffer();

    // Create a buffer for expanding the system id
    XMLBuffer expSysId(1023, fMemoryManager);

    //
    //  Allow the entity handler to expand the system id if they choose
    //  to do so.
    //
    if (fEntityHandler)
    {
        if (!fEntityHandler->expandSystemId(normalizedURI, expSysId))
            expSysId.set(normalizedURI);
    }
     else
    {
        expSysId.set(normalizedURI);
    }

    // Call the entity resolver interface to get an input source
    srcToFill = 0;
    if (fEntityHandler)
    {
        LastExtEntityInfo lastInfo;
        getLastExtEntityInfo(lastInfo);
        XMLResourceIdentifier resourceIdentifier(XMLResourceIdentifier::ExternalEntity,
                            expSysId.getRawBuffer(), XMLUni::fgZeroLenString, pubId, lastInfo.systemId,
                            this);
        srcToFill = fEntityHandler->resolveEntity(&resourceIdentifier);
    }

    //
    //  If they didn't create a source via the entity resolver, then we
    //  have to create one on our own.
    //
    if (!srcToFill)
    {
        if (disableDefaultEntityResolution)
            return 0;

        LastExtEntityInfo lastInfo;
        getLastExtEntityInfo(lastInfo);

// Keep this #if 0 block as it was exposing a threading problem on AIX.
// Got rid of the problem by changing XMLURL to not throw malformedurl
// exceptions.
#if 0
        try
        {
            XMLURL urlTmp(lastInfo.systemId, expSysId.getRawBuffer(), fMemoryManager);
            if (urlTmp.isRelative())
            {
                ThrowXMLwithMemMgr
                (
                    MalformedURLException
                    , XMLExcepts::URL_NoProtocolPresent
                    , fMemoryManager
                );
            }
            else {
                if (fStandardUriConformant && urlTmp.hasInvalidChar())
                    ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager);
                srcToFill = new (fMemoryManager) URLInputSource(urlTmp, fMemoryManager);
            }
        }

        catch(const MalformedURLException& e)
        {
            // Its not a URL, so lets assume its a local file name if non-standard uri is allowed
            if (!fStandardUriConformant)
                srcToFill = new (fMemoryManager) LocalFileInputSource
                (
                    lastInfo.systemId
                    , expSysId.getRawBuffer()
                    , fMemoryManager
                );
            else
                throw e;
        }
#else
        XMLURL urlTmp(fMemoryManager);
        if ((!urlTmp.setURL(lastInfo.systemId, expSysId.getRawBuffer(), urlTmp)) ||
            (urlTmp.isRelative()))
        {
            if (!fStandardUriConformant)
            {
                XMLBuffer resolvedSysId(1023, fMemoryManager);
                XMLUri::normalizeURI(expSysId.getRawBuffer(), resolvedSysId);

                srcToFill = new (fMemoryManager) LocalFileInputSource
                (
                    lastInfo.systemId
                    , resolvedSysId.getRawBuffer()
                    , fMemoryManager
                );
            }
            else
                ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager);
        }
        else
        {
            if (fStandardUriConformant && urlTmp.hasInvalidChar())
                ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager);
            srcToFill = new (fMemoryManager) URLInputSource(urlTmp, fMemoryManager);
        }
#endif
    }

    // Put a janitor on the input source
    Janitor<InputSource> janSrc(srcToFill);

    //
    //  Now call the other version with the input source that we have, and
    //  return the resulting reader.
    //
    XMLReader* retVal = createReader
    (
        *srcToFill
        , xmlDecl
        , refFrom
        , type
        , source
        , calcSrcOfs
        , lowWaterMark
    );

    // Either way, we can release the input source now
    janSrc.orphan();

    // If it failed for any reason, then return zero.
    if (!retVal)
        return 0;

    // Give this reader the next available reader number and return it
    retVal->setReaderNum(fNextReaderNum++);
    return retVal;
}
示例#8
0
// Build a StringGraph from the ASQG file at `filename`.
//
// Records must appear in the order header -> vertex -> edge. A header,
// vertex or edge record found out of order prints an error naming the
// offending (1-based) line to stderr and terminates the process.
//
// Parameters:
//   filename          - path passed to createReader() to open the input
//   minOverlap        - an edge record is only turned into graph edges when
//                       ovr.match.getMinOverlapLength() is >= this value
//   allowContainments - forwarded to SGAlgorithms::createEdgesFromOverlap
//   maxEdges          - forwarded to SGAlgorithms::createEdgesFromOverlap
//
// Returns a graph allocated with `new` and handed back as a raw pointer.
StringGraph* SGUtil::loadASQG(const std::string& filename, const unsigned int minOverlap, 
                              bool allowContainments, size_t maxEdges)
{
    // Parsing stages; the file is only allowed to move forward through them
    const int STAGE_HEADER = 0;
    const int STAGE_VERTEX = 1;
    const int STAGE_EDGE = 2;

    // Initialize graph
    StringGraph* pGraph = new StringGraph;

    std::istream* pReader = createReader(filename);

    int stage = STAGE_HEADER;
    int line = 0;
    std::string recordLine;
    while(getline(*pReader, recordLine))
    {
        // Count the line before processing it so that diagnostics are 1-based
        ++line;

        ASQG::RecordType rt = ASQG::getRecordType(recordLine);
        switch(rt)
        {
            case ASQG::RT_HEADER:
            {
                if(stage != STAGE_HEADER)
                {
                    std::cerr << "Error: Unexpected header record found at line " << line << "\n";
                    exit(EXIT_FAILURE);
                }

                ASQG::HeaderRecord headerRecord(recordLine);

                // Minimum overlap defaults to 0 when the tag is absent
                const SQG::IntTag& overlapTag = headerRecord.getOverlapTag();
                if(overlapTag.isInitialized())
                    pGraph->setMinOverlap(overlapTag.get());
                else
                    pGraph->setMinOverlap(0);

                // The error rate is only set when the header provides it
                const SQG::FloatTag& errorRateTag = headerRecord.getErrorRateTag();
                if(errorRateTag.isInitialized())
                    pGraph->setErrorRate(errorRateTag.get());

                const SQG::IntTag& containmentTag = headerRecord.getContainmentTag();
                if(containmentTag.isInitialized())
                    pGraph->setContainmentFlag(containmentTag.get());
                else
                    pGraph->setContainmentFlag(true); // conservatively assume containments are present

                // A missing transitive tag is reported and the flag is set to true
                const SQG::IntTag& transitiveTag = headerRecord.getTransitiveTag();
                if(!transitiveTag.isInitialized())
                {
                    std::cerr << "Warning: ASQG does not have transitive tag\n";
                    pGraph->setTransitiveFlag(true);
                }
                else
                {
                    pGraph->setTransitiveFlag(transitiveTag.get());
                }

                break;
            }
            case ASQG::RT_VERTEX:
            {
                // progress the stage if we are done the header
                if(stage == STAGE_HEADER)
                    stage = STAGE_VERTEX;

                if(stage != STAGE_VERTEX)
                {
                    std::cerr << "Error: Unexpected vertex record found at line " << line << "\n";
                    exit(EXIT_FAILURE);
                }

                ASQG::VertexRecord vertexRecord(recordLine);
                const SQG::IntTag& ssTag = vertexRecord.getSubstringTag();

                // Previously allocated through the graph's vertex allocator:
                // Vertex* pVertex = new(pGraph->getVertexAllocator()) Vertex(vertexRecord.getID(), vertexRecord.getSeq());
                Vertex* pVertex = new Vertex(vertexRecord.getID(), vertexRecord.getSeq());
                if(ssTag.isInitialized() && ssTag.get() == 1)
                {
                    // Vertex is a substring of some other vertex, mark it as contained
                    pVertex->setContained(true);
                    pGraph->setContainmentFlag(true);
                }
                pGraph->addVertex(pVertex);
                break;
            }
            case ASQG::RT_EDGE:
            {
                // The first edge record closes the vertex section
                if(stage == STAGE_VERTEX)
                    stage = STAGE_EDGE;

                if(stage != STAGE_EDGE)
                {
                    std::cerr << "Error: Unexpected edge record found at line " << line << "\n";
                    exit(EXIT_FAILURE);
                }

                ASQG::EdgeRecord edgeRecord(recordLine);
                const Overlap& ovr = edgeRecord.getOverlap();

                // Add the edge to the graph
                if(ovr.match.getMinOverlapLength() >= static_cast<int>(minOverlap))
                    SGAlgorithms::createEdgesFromOverlap(pGraph, ovr, allowContainments, maxEdges);
                break;
            }
            default:
                // Any other record type is skipped
                break;
        }
    }

    // Parsing is finished; release the input stream before post-processing
    delete pReader;

    // Completely delete the edges for all nodes that were marked as super-repetitive in the graph
    SGSuperRepeatVisitor superRepeatVisitor;
    pGraph->visit(superRepeatVisitor);

    // Remove any duplicate edges
    SGDuplicateVisitor dupVisit;
    pGraph->visit(dupVisit);

    SGGraphStatsVisitor statsVisit;
    pGraph->visit(statsVisit);

    // Remove identical vertices
    // This is much cheaper to do than remove via
    // SGContainRemove as no remodelling needs to occur
    // (currently disabled)
   /*
    SGIdenticalRemoveVisitor irv;
    pGraph->visit(irv);

    // Remove substring vertices
    while(pGraph->hasContainment())
    {
        SGContainRemoveVisitor crv;
        pGraph->visit(crv);
    }
*/
    return pGraph;
}