Exemple #1
0
// Concatenate every section held by this node onto the caller's sequence.
// Sections are appended in the order they are stored in m_vSections; `seq`
// is only appended to, so whatever it already contains is kept in front.
void Tree::Node::Flatten(Sequence& seq) const
{
	vector<Sequence>::const_iterator section = m_vSections.begin();
	while (section != m_vSections.end())
	{
		seq.Append(*section);
		++section;
	}
}
Exemple #2
0
bool WaveClip::Resample(int rate, ProgressDialog *progress)
{
   if (rate == mRate)
      return true; // Nothing to do

   double factor = (double)rate / (double)mRate;
   ::Resample* resample = new ::Resample(true, factor, factor);
   
   int bufsize = 65536;
   float* inBuffer = new float[bufsize];
   float* outBuffer = new float[bufsize];
   sampleCount pos = 0;
   bool error = false;
   int outGenerated = 0;
   sampleCount numSamples = mSequence->GetNumSamples();

   Sequence* newSequence =
      new Sequence(mSequence->GetDirManager(), mSequence->GetSampleFormat());
   
   /**
    * We want to keep going as long as we have something to feed the resampler
    * with OR as long as the resampler spews out samples (which could continue
    * for a few iterations after we stop feeding it)
    */
   while (pos < numSamples || outGenerated > 0)
   {
      int inLen = numSamples - pos;
      if (inLen > bufsize)
         inLen = bufsize;
         
      bool isLast = ((pos + inLen) == numSamples);
      
      if (!mSequence->Get((samplePtr)inBuffer, floatSample, pos, inLen))
      {
         error = true;
         break;
      }
      
      int inBufferUsed = 0;
      outGenerated = resample->Process(factor, inBuffer, inLen, isLast,
                                           &inBufferUsed, outBuffer, bufsize);
                                           
      pos += inBufferUsed;
      
      if (outGenerated < 0)
      {
         error = true;
         break;
      }

      if (!newSequence->Append((samplePtr)outBuffer, floatSample,
                               outGenerated))
      {
         error = true;
         break;
      }

      if (progress)
      {
         int updateResult = progress->Update(pos, numSamples);
         error = (updateResult != eProgressSuccess);
         if (error)
         {
            break;
         }
      }
   }
   
   delete[] inBuffer;
   delete[] outBuffer;
   delete resample;
   
   if (error)
   {
      delete newSequence;
   } else
   {
      delete mSequence;
      mSequence = newSequence;
      mRate = rate;

      // Invalidate wave display cache
      if (mWaveCache)
      {
         delete mWaveCache;
         mWaveCache = NULL;
      }
      mWaveCache = new WaveCache(1);
      // Invalidate the spectrum display cache
      if (mSpecCache)
         delete mSpecCache;
      mSpecCache = new SpecCache(1, 1, false);
   }

   return !error;
}
Exemple #3
0
int main(int argc, char *argv[])
{
    int min_contig = 100;
    double similar = 0.95;
    double complete_rate = 0.8;
    bool is_local = false;
    
    OptionsDescription desc;
    desc.AddOption("min_contig", "", min_contig, "minimum contigs");
    desc.AddOption("similar", "", similar, "similarity");
    desc.AddOption("complete_rate", "", complete_rate, "completeness");
    desc.AddOption("is_local", "", is_local, "local align");

    try
    {
        desc.Parse(argc, argv);
    }
    catch (exception &e)
    {
        cerr << e.what() << endl;
        cerr << "validate_contigs_blat - validate contigs by blat." << endl;
        cerr << "Usage: validate_contigs_blat ref.fa contigs.fa." << endl;
        cerr << "Allowed Options: " << endl;
        cerr << desc << endl;
        exit(1);
    }

    deque<Sequence> refs;
    deque<string> ref_names;
    ReadSequence(argv[1], refs, ref_names);

    deque<Sequence> contigs;
    deque<string> contig_names;
    ReadSequence(argv[2], contigs, contig_names);

    vector<int> is_found(refs.size());
    vector<vector<double> > flags(refs.size());
    map<string, int> dict;
    for (unsigned i = 0; i < refs.size(); ++i)
    {
        flags[i].resize(refs[i].size(), false);
        size_t index = ref_names[i].find(' ');
        if (index != string::npos)
            ref_names[i].resize(index);
        dict[ref_names[i]] = i;
    }

    int num_gaps = 0;
    for (unsigned i = 0; i < contigs.size(); ++i)
    {
        size_t index = contig_names[i].find(' ');
        if (index != string::npos)
            contig_names[i].resize(index);

        bool is_new_gap = true;
        for (unsigned j = 0; j < contigs[i].size(); ++j)
        {
            if (contigs[i][j] == 4)
            {
                if (is_new_gap)
                {
                    is_new_gap = false;
                    ++num_gaps;
                }
            }
            else
                is_new_gap = true;
        }
    }

    string blat_file = string(argv[2]) + ".blat";
    FILE *fblat = OpenFile(blat_file, "rb");

    map<string, int> valid_contigs;
    deque<int> valid_lengths;
    int64_t num_mismatch = 0;
    while (fgets(line, MaxLine, fblat) != NULL)
    {
        BlatRecord record;
        record.Parse(line);

        deque<BlatRecord> records;
        records.push_back(record);

        while (fgets(line, MaxLine, fblat) != NULL)
        {
            record.Parse(line);
            if (record.query_name == records.back().query_name)
                records.push_back(record);
            else
            {
                fseek(fblat, -strlen(line), SEEK_CUR);
                break;
            }
        }

        int index = 0;
        for (unsigned i = 0; i < records.size(); ++i)
        {
            if (records[i].match_count > similar * records[i].query_length
                    && records[i].match_count > similar * abs(record.ref_to - record.ref_from))
                records[index++] = records[i];
        }
        records.resize(index);

        for (unsigned i = 0; i < records.size(); ++i)
        {
            record = records[i];
            int ref_id = dict[record.ref_name];

            //if (record.match_count > similar * record.query_length && record.query_length >= min_contig
            if ((record.match_count > similar * record.query_length 
                        || (is_local && record.match_count > similar * abs(record.query_to - record.query_from)))
            //if (record.match_count > similar * abs(record.query_to - record.query_from)
                    && abs(record.query_to - record.query_from) >= min_contig
                    && record.match_count > similar * abs(record.ref_to - record.ref_from)
               )
            {
                //if (record.match_count >= similar * record.ref_length)
                if (record.match_count >= complete_rate * record.ref_length)
                    is_found[ref_id] = true;
    //            else
    //                continue;

                int not_used = 0;
                for (unsigned i = 0; i < record.blocks.size(); ++i)
                {
                    BlatBlock block = record.blocks[i];
                    for (unsigned j = block.ref_from; j < block.ref_from + block.size; ++j)
                    {
                        if (flags[ref_id][j] == false)
                        {
                            //flags[ref_id][j] = true;
                            not_used++;
                        }
                        
                        flags[ref_id][j] += 1.0 / records.size();
                    }
                }

                if (valid_contigs.find(record.query_name) == valid_contigs.end())
                {
                    valid_contigs[record.query_name] = record.mismatch_count;
                    valid_lengths.push_back(record.query_to - record.query_from);
                }
                else
                {
                    valid_contigs[record.query_name] = min(record.mismatch_count, (int64_t)valid_contigs[record.query_name]);

                    if (not_used > similar * record.query_length)
                        valid_lengths.push_back(record.query_to - record.query_from);
                }
            }
        }
    }

    for (map<string, int>::iterator p = valid_contigs.begin(); p != valid_contigs.end(); ++p)
    {
        num_mismatch += p->second;
    }

    long long count = 0;
    long long total = 0;
    for (unsigned k = 0; k < flags.size(); ++k)
    {
        for (unsigned i = 0; i < flags[k].size(); ++i)
        {
            if (flags[k][i])
                ++count;
            ++total;
        }
    }

    //valid_lengths.push_back(60000);
    sort(valid_lengths.begin(), valid_lengths.end());
    reverse(valid_lengths.begin(), valid_lengths.end());

    long long n50 = 0;
    long long sum = 0;
    long long n80 = 0;

    for (unsigned i = 0; i < valid_lengths.size(); ++i)
    {
        sum += valid_lengths[i];
        if (sum >= 0.5 * total && n50 == 0)
            n50 = valid_lengths[i];
        if (sum >= 0.8 * total && n80 == 0)
            n80 = valid_lengths[i];
    }
    cout << "total " << total << " " << sum << endl;

    long long maximum = 0;
    long long mean = 0;
    if (valid_lengths.size() > 0)
    {
        maximum = valid_lengths[0];
        mean = sum / valid_lengths.size();
    }

    long long sum_wrong = 0;
    long long num_wrong = 0;
    long long corret_contigs = 0;
    long long sum_corret = 0;
    int last_id = 0;
    int last_error = 0;
    deque<int> contig_flags(contigs.size(), false);
    FastaWriter error_writer(FormatString("%s.error.fa", argv[2]));
    for (unsigned i = 0; i < contigs.size(); ++i)
    {
        if ((int)contigs[i].size() < min_contig)
            continue;

        if (valid_contigs.find(contig_names[i]) == valid_contigs.end())
        {
            ++num_wrong;
            sum_wrong += contigs[i].size();
            error_writer.Write(contigs[i], contig_names[i]);
        }
        else
        {
            last_id = i;
            last_error = sum_wrong;

            ++corret_contigs;
            sum_corret += contigs[i].size();
            contig_flags[i] = true;
            //correct_writer.Write(contigs[i], contig_names[i]);
        }
    }

    printf("last id %d %d total contigs %d gaps %d\n", last_id, last_error, (int)(num_wrong + corret_contigs), num_gaps);
    printf("contigs: %lld N50: %lld coverage: %.2f%% max: %lld mean: %lld total: %lld/%lld N80: %lld\n",
            (long long)valid_contigs.size(), n50, count * 100.0 / total, maximum, mean, count, total, n80);
    printf("substitution error: %.4f%% wrong contigs: %lld %lld correct: %lld %lld %s\n", 
            num_mismatch * 100.0 /sum, num_wrong, sum_wrong, corret_contigs, sum_corret, argv[2]);

    deque<int> lengths;
    for (unsigned i = 0; i < refs.size(); ++i)
    {
        int last = 0;
        for (unsigned j = 0; j < refs[i].size(); ++j)
        {
            if (flags[i][j] == 0)
            {
                if (flags[i][last])
                {
                    lengths.push_back(j - last);
                    last = j;
                }
            }
            else
            {
                if (flags[i][last] == 0)
                    last = j;
            }
        }
    }
    sort(lengths.begin(), lengths.end());
    reverse(lengths.begin(), lengths.end());

    deque<Sequence> gaps;
    deque<bool> is_no_long_gaps(refs.size());
    for (unsigned i = 0; i < refs.size(); ++i)
    {
        deque<int> tmp;
        Sequence gap;

        if (flags[i][0])
            tmp.push_back(0);

        for (unsigned j = 0; j < refs[i].size(); ++j)
        {
            if (flags[i][j] == false)
            {
                gap.Append(refs[i][j]);
            }
            else
            {
                if (gap.size() > 0)
                {
                    gaps.push_back(gap);
                    tmp.push_back(gap.size());
                }
                gap.resize(0);
            }
        }

        if (gap.size() > 0)
        {
            gaps.push_back(gap);
            tmp.push_back(gap.size());
        }
        else
            tmp.push_back(0);

        is_no_long_gaps[i] = true;
        for (unsigned j = 1; j+1 < tmp.size(); ++j)
        {
            if (tmp[j] > 50)
                is_no_long_gaps[i] = false;
        }
    }
    WriteSequence(FormatString("%s.gap.fa", argv[2]), gaps, "gap");

    FastaWriter ref_writer(argv[1] + string(".found.fa"));
    FILE *ffcound_list = OpenFile(argv[1] + string(".found.fa.list"), "wb");
    int found = 0;
    int covered = 0;
    int total_contigs = 0;
    for (unsigned i = 0; i < refs.size(); ++i)
    {
        int count = 0;
        double total_hit = 0;
        for (unsigned j = 0; j < flags[i].size(); ++j)
        {
            if (flags[i][j])
                ++count;
            total_hit += flags[i][j];
        }

        if (count > complete_rate * refs[i].size() 
                //&& is_no_long_gaps[i]
                //&& 1.0 * total_hit / count > 2
                )
        {
            ++covered;
            ref_writer.Write(refs[i], ref_names[i]);
            fprintf(ffcound_list, "%s %.4f\n", ref_names[i].c_str(), 1.0 * total_hit / count);
        }
        if (is_found[i])
            ++found;
    }

    int64_t total_bases = 0;
    for (unsigned i = 0; i < contigs.size(); ++i)
    {
        if ((int)contigs[i].size() >= min_contig)
        {
            ++total_contigs;
            total_bases += contigs[i].size();
        }
    }

    cout << corret_contigs << " " << total_contigs << " " << total_bases << endl;
    printf("cover ref: %d %d\n", covered, (int)refs.size());
    printf("found ref: %d %d\n", found, (int)refs.size());
    printf("precision: %.2f%% %d %d\n", 100.0 * corret_contigs / total_contigs, (int)corret_contigs, total_contigs);

    FastaWriter correct_writer(FormatString("%s.correct.fa", argv[2]));
    for (unsigned i = 0; i < contigs.size(); i += 2)
    {
        if (contig_flags[i] || contig_flags[i+1])
        {
            correct_writer.Write(contigs[i], contig_names[i]);
            correct_writer.Write(contigs[i+1], contig_names[i+1]);
        }
    }
    
    return 0;
}