Exemple #1
0
/// Return the sequence range covered by this exon on the requested row.
///
/// @param row
///   0 selects the product row, 1 selects the genomic row.
/// @param always_as_nuc
///   Only consulted when the product position is a protein position:
///   if true the range is built from CProduct_pos::AsSeqPos(), otherwise
///   from the amino-acid index (GetProtpos().GetAmin()).
/// @return
///   TSeqRange constructed from the start and end positions of the row.
/// @throws CSeqalignException
///   eInvalidRowNumber if row is neither 0 nor 1;
///   eInvalidAlignment if the product start position is neither a
///   nucleotide nor a protein position.
TSeqRange CSpliced_exon::GetRowSeq_range(CSeq_align::TDim    row,
                                         bool always_as_nuc) const
{
    if (row != 0  &&  row != 1) {
        // The diagnostic names this function (it previously named
        // CreateRowSeq_interval(), which is a different method).
        NCBI_THROW(CSeqalignException, eInvalidRowNumber,
                   "CSpliced_exon::GetRowSeq_range() - "
                   "row number must be 0 or 1 for spliced-segs.");
    }
    if (row == 0) {
        // Debug-only check: start and end must use the same position type.
        _ASSERT(GetProduct_start().Which() == GetProduct_end().Which());
        switch ( GetProduct_start().Which() ) {
        case CProduct_pos::e_Nucpos:
            return TSeqRange(GetProduct_start().GetNucpos(),
                             GetProduct_end().GetNucpos());

        case CProduct_pos::e_Protpos:
            if (always_as_nuc) {
                return TSeqRange(GetProduct_start().AsSeqPos(),
                                 GetProduct_end().AsSeqPos());
            } else {
                return TSeqRange(GetProduct_start().GetProtpos().GetAmin(),
                                 GetProduct_end().GetProtpos().GetAmin());
            }

        default:
            // Reached when the start position is neither Nucpos nor Protpos.
            NCBI_THROW(CSeqalignException, eInvalidAlignment,
                       "CSpliced_exon::GetRowSeq_range() - "
                       "start/end product-pos types do not match.");
            break;
        }
    }

    // Genomic
    return TSeqRange(GetGenomic_start(), GetGenomic_end());
}
 /// Default constructor.
 /// Sets an 80-column line width, a default-constructed sequence range,
 /// eNa_strand_other, both boolean flags off and both algorithm ids to -1.
 CSeqFormatterConfig() {
     // Boolean switches start disabled
     m_UseCtrlA = false;
     m_TargetOnly = false;

     // Algorithm ids start at -1
     m_FmtAlgoId = -1;
     m_FiltAlgoId = -1;

     // Sequence selection and layout
     m_Strand = objects::eNa_strand_other;
     m_SeqRange = TSeqRange();
     m_LineWidth = 80;
 }
void
CRemoteBlastDbAdapter::GetSequenceBatch(const vector<int>& oids,
            const vector<TSeqRange>& ranges,
            vector< CRef<CSeq_data> >& sequence_data)
{
    _ASSERT( !ranges.empty() );
    _ASSERT(oids.size() == ranges.size());
    sequence_data.clear();

    vector<int> oids2fetch;
    vector<TSeqRange> ranges2fetch;
    for (vector<int>::size_type i = 0; i < oids.size(); i++) {
        CCachedSeqDataForRemote& cached_seqdata = m_Cache[oids[i]]; 
        _ASSERT(cached_seqdata.IsValid());
        // default is to fetch the entire sequence
        int begin = 0, end = cached_seqdata.GetLength();
        if (ranges[i] != TSeqRange::GetEmpty()) {   // get partial sequence
            begin = ranges[i].GetFrom();
            end = ranges[i].GetToOpen();
        }
        if ( !cached_seqdata.HasSequenceData(begin, end) ) {
            oids2fetch.push_back(oids[i]);
            ranges2fetch.push_back(TSeqRange(begin, end-1));
            if (ranges[i] != TSeqRange::GetEmpty()) {   // get partial sequence
                _ASSERT(ranges[i] == ranges2fetch.back());
            }
        }
    }

    x_FetchDataByBatch(oids2fetch, ranges2fetch);

    // Populate the return value
    sequence_data.reserve(oids.size());
    for (vector<int>::size_type i = 0; i < oids.size(); i++) {
        CCachedSeqDataForRemote& cached_seqdata = m_Cache[oids[i]]; 
        _ASSERT(cached_seqdata.IsValid());
        int begin = 0, end = cached_seqdata.GetLength();
        if (ranges[i] != TSeqRange::GetEmpty()) {
            begin = ranges[i].GetFrom();
            end = ranges[i].GetToOpen();
        }
        _ASSERT(cached_seqdata.HasSequenceData(begin, end));
        sequence_data.push_back(cached_seqdata.GetSeqDataChunk(begin, end));
    }
    _ASSERT(sequence_data.size() == oids.size());

#if _DEBUG
    for (vector<int>::size_type i = 0; i < sequence_data.size(); i++) {
        _ASSERT(sequence_data[i] != NULL);
    }
#endif
}
Exemple #4
0
/// Collect the insertion ranges of this exon on one row.
///
/// Walks the exon parts while tracking the current product and genomic
/// positions, and records the span of every insertion chunk that belongs
/// to the requested row.
///
/// @param row
///   0 collects product insertions, 1 collects genomic insertions.
/// @param seg
///   Enclosing spliced-seg; supplies a strand when the exon itself has none.
/// @param within_product_ranges
///   Product ranges used as a filter: product insertions are intersected
///   with it, and a genomic insertion is kept only if the current product
///   position intersects it.
/// @return
///   Collection of insertion ranges on the requested row.
CRangeCollection<TSeqPos> CSpliced_exon::GetRowSeq_insertions(
    CSeq_align::TDim    row,
    const CSpliced_seg& seg,
    const CRangeCollection<TSeqPos> &within_product_ranges) const
{
    // Index 0 refers to the product row, index 1 to the genomic row.
    ENa_strand row_strand[2];
    row_strand[0] = eNa_strand_unknown;
    row_strand[1] = eNa_strand_unknown;

    // The exon's own strand takes precedence over the one set on the segment
    if (IsSetProduct_strand()) {
        row_strand[0] = GetProduct_strand();
    } else if (seg.IsSetProduct_strand()) {
        row_strand[0] = seg.GetProduct_strand();
    }
    if (IsSetGenomic_strand()) {
        row_strand[1] = GetGenomic_strand();
    } else if (seg.IsSetGenomic_strand()) {
        row_strand[1] = seg.GetGenomic_strand();
    }

    const bool product_minus = (row_strand[0] == eNa_strand_minus);
    const bool genomic_minus = (row_strand[1] == eNa_strand_minus);

    // Direction in which each position advances while walking the parts
    int step[2];
    step[0] = product_minus ? -1 : 1;
    step[1] = genomic_minus ? -1 : 1;

    // Start at the high end of the range on minus strand, low end otherwise
    const TSeqRange product_span = GetRowSeq_range(0, true);
    const TSeqRange genomic_span = GetRowSeq_range(1, true);
    TSeqPos cursor[2];
    cursor[0] = product_minus ? product_span.GetTo() : product_span.GetFrom();
    cursor[1] = genomic_minus ? genomic_span.GetTo() : genomic_span.GetFrom();

    CRangeCollection<TSeqPos> insertions;
    if (IsSetParts()) {
        ITERATE (TParts, it, GetParts()) {
            const CSpliced_exon_chunk& chunk = **it;
            switch (chunk.Which()) {
            // Aligned chunks advance both rows by the same amount
            case CSpliced_exon_chunk::e_Match:
                cursor[0] += chunk.GetMatch() * step[0];
                cursor[1] += chunk.GetMatch() * step[1];
                break;

            case CSpliced_exon_chunk::e_Mismatch:
                cursor[0] += chunk.GetMismatch() * step[0];
                cursor[1] += chunk.GetMismatch() * step[1];
                break;

            case CSpliced_exon_chunk::e_Diag:
                cursor[0] += chunk.GetDiag() * step[0];
                cursor[1] += chunk.GetDiag() * step[1];
                break;

            // Product insertion: only the product position advances
            case CSpliced_exon_chunk::e_Product_ins:
                if (row == 0) {
                    if (product_minus) {
                        insertions += TSeqRange(cursor[0] - chunk.GetProduct_ins() + 1,
                                          cursor[0]);
                    } else {
                        insertions += TSeqRange(cursor[0],
                                          cursor[0] + chunk.GetProduct_ins() - 1);
                    }
                }
                cursor[0] += chunk.GetProduct_ins() * step[0];
                break;

            // Genomic insertion: only the genomic position advances
            case CSpliced_exon_chunk::e_Genomic_ins:
                /// Keep it only if the current product position lies within the requested ranges
                if (row == 1 && within_product_ranges.IntersectingWith(
                                    TSeqRange(cursor[0], cursor[0])))
                {
                    if (genomic_minus) {
                        insertions += TSeqRange(cursor[1] - chunk.GetGenomic_ins() + 1,
                                          cursor[1]);
                    } else {
                        insertions += TSeqRange(cursor[1],
                                          cursor[1] + chunk.GetGenomic_ins() - 1);
                    }
                }
                cursor[1] += chunk.GetGenomic_ins() * step[1];
                break;

            default:
                break;
            }
        }
    }

    // Product insertions are clipped to the requested product ranges
    if (row == 0) {
        insertions &= within_product_ranges;
    }
    return insertions;
}
Exemple #5
0
// To be moved to MapCompLen.cpp
//
// Load component lengths, and long runs of Ns, from a FASTA stream.
//
// For every record (a line starting with '>' followed by sequence lines):
//   - the first word of the header (minus one trailing '|') is acc_long,
//     and acc = ExtractAccession(acc_long);
//   - the total sequence length is registered via m_comp2len.AddCompLen()
//     for acc and, if different, for acc_long;
//   - runs of more than 10 consecutive 'N'/'n' are stored as 1-based
//     inclusive ranges in m_comp2range_coll[acc].
//
// Parameters:
//   istr     - stream to read the FASTA from
//   filename - used only in diagnostic messages
//
// Error handling: prints a message to cerr and calls exit(1) when sequence
// data precedes the first header, when a sequence line contains a
// non-alphabetic character, or when AddCompLen() reports a previously
// stored (non-zero) length. An input with no headers only produces a warning.
void CAgpValidateApplication::x_LoadLenFa(CNcbiIstream& istr, const string& filename)
{
  string line;
  string acc, acc_long;
  int line_num=0;
  int acc_count=0;

  // these are initialized only to suppress the warnings
  int header_line_num=0;
  int len=0;
  int prev_len=0;

  TRangeColl range_coll; // runs of Ns in the fasta of the current component
  TSeqPos mfa_firstMasked=0;  // 1-based position of the first N of the current run
  TSeqPos mfa_pos=0;          // 1-based position of the last character processed
  bool mfa_bMasked=false;     // is the last processed character an N?
  bool mfa_prevMasked=false;

  while( NcbiGetline(istr, line, "\r\n") ) {
    line_num++;
    //if(line.size()==0) continue;

    // Guard against empty lines before looking at the first character;
    // an empty line is handled as a (zero-length) sequence line below.
    if(!line.empty() && line[0]=='>') {
      if( acc.size() ) {
        // close off the previous acc

        // warn if acc could also be an accession
        OverrideLenIfAccession(acc, len);

        // NOTE(review): the second call overwrites prev_len from the first
        // one - confirm that this is intended.
        prev_len =  m_comp2len.AddCompLen(acc, len);
        if(acc_long!=acc) prev_len =  m_comp2len.AddCompLen(acc_long, len, false);
        if(prev_len) goto LengthRedefinedFa;

        if(mfa_bMasked) {
          // The run is still open: its last N is at mfa_pos itself, so the
          // run spans mfa_firstMasked..mfa_pos (unlike the in-loop case,
          // where mfa_pos is already one past the run).
          if(mfa_pos-mfa_firstMasked+1 > 10)
            range_coll += TSeqRange(mfa_firstMasked, mfa_pos);
        }
        if(!range_coll.empty()) {
          m_comp2range_coll[acc] = range_coll;
        }

        range_coll.clear();
        mfa_firstMasked=mfa_pos=0;
        mfa_bMasked=false;
        mfa_prevMasked=false;
      }

      // Get first word, trim final '|' (if any).
      SIZE_TYPE pos1=line.find(' ' , 1);
      SIZE_TYPE pos2=line.find('\t', 1);
      if(pos2<pos1) pos1 = pos2;
      if(pos1!=NPOS) {
        // convert the delimiter index into the length of the word after '>'
        pos1--;
        if(pos1>0 && line[pos1]=='|') pos1--;
      }

      acc_long=line.substr(1, pos1);
      acc=ExtractAccession( acc_long );
      len=0;
      header_line_num=line_num;
      acc_count++;
    }
    else {
      if(acc.size()==0) {
        cerr<< "ERROR - expecting >fasta_header at start of file " << filename << ", got:\n"
            << line.substr(0, 100) << "\n\n";
        exit(1);
      }

      for(SIZE_TYPE i=0; i<line.size(); i++ ) {
        // <cctype> functions require a value representable as unsigned char
        // (or EOF); passing a negative char is undefined behavior.
        const unsigned char ch = static_cast<unsigned char>(line[i]);
        if(!isalpha(ch)) {
          cerr<< "ERROR - non-alphabetic character in the FASTA:\n"
                 "  file " << filename << "\n  line " << line_num << "\n  column " << i+1 << "\n\n";
          exit(1);
        }

        mfa_pos++;
        mfa_bMasked = toupper(ch) == 'N';
        if(mfa_bMasked!=mfa_prevMasked) {
          if(mfa_bMasked) {
            // a run of Ns starts here
            mfa_firstMasked=mfa_pos;
          }
          else{
            // the run ended at the previous character (mfa_pos-1)
            if(mfa_pos-mfa_firstMasked > 10)
              range_coll += TSeqRange(mfa_firstMasked, mfa_pos-1);
          }
        }
        mfa_prevMasked=mfa_bMasked;

      }

      len+=line.size();

      /* to do: save runs of Ns as CRangeCollection<TSeqPos>
         later, will test component spans with:

         // returns iterator pointing to the TRange that has ToOpen > pos
          const_iterator  find(position_type pos)   const
          {
              PRangeLessPos<TRange, position_type> p;
              return lower_bound(begin(), end(), pos, p);
          }
      */
    }
  }

  if( acc.size() ) {
    // close off the last acc
    // NOTE(review): unlike the in-loop close-off above, OverrideLenIfAccession()
    // is not called for the last record - confirm whether that is intended.
    prev_len =  m_comp2len.AddCompLen(acc, len);
    if(acc_long!=acc) prev_len =  m_comp2len.AddCompLen(acc_long, len, false);
    if(prev_len) goto LengthRedefinedFa;

    if(mfa_bMasked) {
      // Run of Ns reaching the end of the sequence: it spans
      // mfa_firstMasked..mfa_pos inclusive.
      if(mfa_pos-mfa_firstMasked+1 > 10)
        range_coll += TSeqRange(mfa_firstMasked, mfa_pos);
    }
    if(!range_coll.empty()) {
      m_comp2range_coll[acc] = range_coll;
    }
  }
  if(acc_count==0) {
    cerr<< "WARNING - empty file " << filename << "\n";
  }
  return;

LengthRedefinedFa:
  cerr<< "ERROR - sequence length redefined from " << prev_len << " to " << len << "\n"
      << "  sequence id: " << acc_long << "\n"
      << "  File: " << filename << "\n"
      << "  Lines: "<< header_line_num << ".." << line_num << "\n\n";
  exit(1);
}
 /// Conversion to TSeqRange, built from the interval held in m_Interval.
 /// The range starts at m_Interval->GetFrom() and ends at
 /// m_Interval->GetTo() minus one.
 /// NOTE(review): the -1 presumably converts an exclusive end stored in
 /// m_Interval into the inclusive end TSeqRange expects - confirm against
 /// how m_Interval is populated.
 operator TSeqRange() const {
     return TSeqRange(m_Interval->GetFrom(), m_Interval->GetTo()-1);
 }
Exemple #7
0
    ///                 should the sequence data be fetched by this library?
    /// @param range Range restriction for all sequences (default means no
    ///                 restriction). To support the specification of a single
    ///                 coordinate (start or stop), use the SetRange() method;
    ///                 the missing coordinate will be set to its default value
    ///                 (e.g.: 0 for starting coordinate, sequence length for
    ///                 ending coordinate) [in]
    /// @param seqlen_thresh2guess sequence length threshold for molecule
    ///                 type guessing (see @ref kSeqLenThreshold2Guess) [in]
    /// @param local_id_counter counter used to create the CSeqidGenerator to
    ///                 create local identifiers for sequences read [in]
    CBlastInputSourceConfig(const SDataLoaderConfig& dlconfig,
                  objects::ENa_strand strand = objects::eNa_strand_other,
                  bool lowercase = false,
                  bool believe_defline = false,
                  TSeqRange range = TSeqRange(),
                  bool retrieve_seq_data = true,
                  int local_id_counter = 1,
                  unsigned int seqlen_thresh2guess = 
                    numeric_limits<unsigned int>::max());

    /// Destructor (empty body: performs no explicit cleanup of its own)
    ///
    ~CBlastInputSourceConfig() {}

    /// Store a new strand value in this configuration
    /// @param strand The strand value to store [in]
    ///
    void SetStrand(objects::ENa_strand strand)
    {
        m_Strand = strand;
    }

    /// Retrieve the current strand value