/// Return the range this exon covers on the requested alignment row.
///
/// @param row
///   Alignment row: 0 selects the product, 1 selects the genomic sequence.
/// @param always_as_nuc
///   Consulted only for row 0 when the product positions are protein
///   positions.  If true, the range is built from CProduct_pos::AsSeqPos()
///   of the start and end; otherwise it is built from the amino-acid
///   indices (GetProtpos().GetAmin()).
/// @return
///   Range built from the exon's start and end positions on that row.
/// @throw CSeqalignException
///   eInvalidRowNumber if row is neither 0 nor 1; eInvalidAlignment if the
///   product start position is neither a nucpos nor a protpos.
TSeqRange
CSpliced_exon::GetRowSeq_range(CSeq_align::TDim row,
                               bool always_as_nuc) const
{
    if (row != 0  &&  row != 1) {
        NCBI_THROW(CSeqalignException, eInvalidRowNumber,
                   "CSpliced_exon::GetRowSeq_range() - "
                   "row number must be 0 or 1 for spliced-segs.");
    }

    if (row == 0) {
        // Product: start and end are expected to use the same position type.
        _ASSERT(GetProduct_start().Which() == GetProduct_end().Which());

        switch ( GetProduct_start().Which() ) {
        case CProduct_pos::e_Nucpos:
            return TSeqRange(GetProduct_start().GetNucpos(),
                             GetProduct_end().GetNucpos());

        case CProduct_pos::e_Protpos:
            if (always_as_nuc) {
                return TSeqRange(GetProduct_start().AsSeqPos(),
                                 GetProduct_end().AsSeqPos());
            } else {
                return TSeqRange(GetProduct_start().GetProtpos().GetAmin(),
                                 GetProduct_end().GetProtpos().GetAmin());
            }

        default:
            // Reached only when the start position is neither nucpos nor
            // protpos, so report that rather than a start/end mismatch.
            NCBI_THROW(CSeqalignException, eInvalidAlignment,
                       "CSpliced_exon::GetRowSeq_range() - "
                       "product-pos type is not set or not supported.");
        }
    }

    // Genomic
    return TSeqRange(GetGenomic_start(), GetGenomic_end());
}
/// Default constructor CSeqFormatterConfig() { m_LineWidth = 80; m_SeqRange = TSeqRange(); m_Strand = objects::eNa_strand_other; m_TargetOnly = false; m_UseCtrlA = false; m_FiltAlgoId = -1; m_FmtAlgoId = -1; }
void CRemoteBlastDbAdapter::GetSequenceBatch(const vector<int>& oids, const vector<TSeqRange>& ranges, vector< CRef<CSeq_data> >& sequence_data) { _ASSERT( !ranges.empty() ); _ASSERT(oids.size() == ranges.size()); sequence_data.clear(); vector<int> oids2fetch; vector<TSeqRange> ranges2fetch; for (vector<int>::size_type i = 0; i < oids.size(); i++) { CCachedSeqDataForRemote& cached_seqdata = m_Cache[oids[i]]; _ASSERT(cached_seqdata.IsValid()); // default is to fetch the entire sequence int begin = 0, end = cached_seqdata.GetLength(); if (ranges[i] != TSeqRange::GetEmpty()) { // get partial sequence begin = ranges[i].GetFrom(); end = ranges[i].GetToOpen(); } if ( !cached_seqdata.HasSequenceData(begin, end) ) { oids2fetch.push_back(oids[i]); ranges2fetch.push_back(TSeqRange(begin, end-1)); if (ranges[i] != TSeqRange::GetEmpty()) { // get partial sequence _ASSERT(ranges[i] == ranges2fetch.back()); } } } x_FetchDataByBatch(oids2fetch, ranges2fetch); // Populate the return value sequence_data.reserve(oids.size()); for (vector<int>::size_type i = 0; i < oids.size(); i++) { CCachedSeqDataForRemote& cached_seqdata = m_Cache[oids[i]]; _ASSERT(cached_seqdata.IsValid()); int begin = 0, end = cached_seqdata.GetLength(); if (ranges[i] != TSeqRange::GetEmpty()) { begin = ranges[i].GetFrom(); end = ranges[i].GetToOpen(); } _ASSERT(cached_seqdata.HasSequenceData(begin, end)); sequence_data.push_back(cached_seqdata.GetSeqDataChunk(begin, end)); } _ASSERT(sequence_data.size() == oids.size()); #if _DEBUG for (vector<int>::size_type i = 0; i < sequence_data.size(); i++) { _ASSERT(sequence_data[i] != NULL); } #endif }
/// Collect the insertions of this exon on the requested row.
///
/// The exon parts (if set) are walked in order while the current position
/// on the product and on the genomic sequence is tracked; positions move
/// downwards on a minus strand and upwards otherwise.
///
/// @param row
///   0 collects product insertions, 1 collects genomic insertions.
/// @param seg
///   Enclosing spliced-seg; supplies a strand when the exon itself has none.
/// @param within_product_ranges
///   Product ranges of interest.  Product insertions are intersected with
///   it at the end; a genomic insertion is kept only if the product position
///   at which it occurs intersects it.
/// @return
///   Collection of inserted ranges on the requested row.
CRangeCollection<TSeqPos>
CSpliced_exon::GetRowSeq_insertions(
    CSeq_align::TDim row,
    const CSpliced_seg& seg,
    const CRangeCollection<TSeqPos> &within_product_ranges) const
{
    // The exon-level strand wins over the segment-level one.
    ENa_strand product_strand = eNa_strand_unknown;
    if (IsSetProduct_strand()) {
        product_strand = GetProduct_strand();
    } else if (seg.IsSetProduct_strand()) {
        product_strand = seg.GetProduct_strand();
    }

    ENa_strand genomic_strand = eNa_strand_unknown;
    if (IsSetGenomic_strand()) {
        genomic_strand = GetGenomic_strand();
    } else if (seg.IsSetGenomic_strand()) {
        genomic_strand = seg.GetGenomic_strand();
    }

    const bool product_is_minus = (product_strand == eNa_strand_minus);
    const bool genomic_is_minus = (genomic_strand == eNa_strand_minus);
    const int product_step = product_is_minus ? -1 : 1;
    const int genomic_step = genomic_is_minus ? -1 : 1;

    // Walking starts at the top of the range on minus, at the bottom otherwise.
    const TSeqRange product_range = GetRowSeq_range(0, true);
    TSeqPos product_pos = product_is_minus ? product_range.GetTo()
                                           : product_range.GetFrom();
    const TSeqRange genomic_range = GetRowSeq_range(1, true);
    TSeqPos genomic_pos = genomic_is_minus ? genomic_range.GetTo()
                                           : genomic_range.GetFrom();

    CRangeCollection<TSeqPos> insertions;

    if (IsSetParts()) {
        const TParts& parts = GetParts();
        for (TParts::const_iterator part_it = parts.begin();
             part_it != parts.end();  ++part_it) {
            const CSpliced_exon_chunk& chunk = **part_it;

            switch (chunk.Which()) {
            // Aligned chunks advance both rows by the same amount.
            case CSpliced_exon_chunk::e_Match:
                product_pos += chunk.GetMatch() * product_step;
                genomic_pos += chunk.GetMatch() * genomic_step;
                break;

            case CSpliced_exon_chunk::e_Mismatch:
                product_pos += chunk.GetMismatch() * product_step;
                genomic_pos += chunk.GetMismatch() * genomic_step;
                break;

            case CSpliced_exon_chunk::e_Diag:
                product_pos += chunk.GetDiag() * product_step;
                genomic_pos += chunk.GetDiag() * genomic_step;
                break;

            // Insertions advance only the row they belong to.
            case CSpliced_exon_chunk::e_Product_ins:
                if (row == 0) {
                    insertions += product_is_minus
                        ? TSeqRange(product_pos - chunk.GetProduct_ins() + 1,
                                    product_pos)
                        : TSeqRange(product_pos,
                                    product_pos + chunk.GetProduct_ins() - 1);
                }
                product_pos += chunk.GetProduct_ins() * product_step;
                break;

            case CSpliced_exon_chunk::e_Genomic_ins:
                /// Keep a genomic insertion only when the current product
                /// position lies within the requested product ranges
                if (row == 1  &&
                    within_product_ranges.IntersectingWith(
                        TSeqRange(product_pos, product_pos))) {
                    insertions += genomic_is_minus
                        ? TSeqRange(genomic_pos - chunk.GetGenomic_ins() + 1,
                                    genomic_pos)
                        : TSeqRange(genomic_pos,
                                    genomic_pos + chunk.GetGenomic_ins() - 1);
                }
                genomic_pos += chunk.GetGenomic_ins() * genomic_step;
                break;

            default:
                break;
            }
        }
    }

    // Product insertions are clipped to the requested product ranges.
    if (row == 0) {
        insertions &= within_product_ranges;
    }
    return insertions;
}
// To be moved to MapCompLen.cpp void CAgpValidateApplication::x_LoadLenFa(CNcbiIstream& istr, const string& filename) { string line; string acc, acc_long; int line_num=0; int acc_count=0; // these are initialized only to suppress the warnings int header_line_num=0; int len=0; int prev_len=0; TRangeColl range_coll; // runs of Ns in the fasta of the current component TSeqPos mfa_firstMasked=0; TSeqPos mfa_pos=0; bool mfa_bMasked=false; bool mfa_prevMasked=false; while( NcbiGetline(istr, line, "\r\n") ) { line_num++; //if(line.size()==0) continue; if(line[0]=='>') { if( acc.size() ) { // close off the previous acc // warn if acc could also be an accession OverrideLenIfAccession(acc, len); prev_len = m_comp2len.AddCompLen(acc, len); if(acc_long!=acc) prev_len = m_comp2len.AddCompLen(acc_long, len, false); if(prev_len) goto LengthRedefinedFa; if(mfa_bMasked) { if(mfa_pos-mfa_firstMasked > 10) range_coll += TSeqRange(mfa_firstMasked, mfa_pos-1); } if(!range_coll.empty()) { m_comp2range_coll[acc] = range_coll; } range_coll.clear(); mfa_firstMasked=mfa_pos=0; mfa_bMasked=false; mfa_prevMasked=false; } // Get first word, trim final '|' (if any). 
SIZE_TYPE pos1=line.find(' ' , 1); SIZE_TYPE pos2=line.find('\t', 1); if(pos2<pos1) pos1 = pos2; if(pos1!=NPOS) { pos1--; if(pos1>0 && line[pos1]=='|') pos1--; } acc_long=line.substr(1, pos1); acc=ExtractAccession( acc_long ); len=0; header_line_num=line_num; acc_count++; } else { if(acc.size()==0) { cerr<< "ERROR - expecting >fasta_header at start of file " << filename << ", got:\n" << line.substr(0, 100) << "\n\n"; exit(1); } for(SIZE_TYPE i=0; i<line.size(); i++ ) { if(!isalpha(line[i])) { cerr<< "ERROR - non-alphabetic character in the FASTA:\n" " file " << filename << "\n line " << line_num << "\n column " << i+1 << "\n\n"; exit(1); } mfa_pos++; mfa_bMasked = toupper(line[i]) == 'N'; if(mfa_bMasked!=mfa_prevMasked) { if(mfa_bMasked) { mfa_firstMasked=mfa_pos; } else{ if(mfa_pos-mfa_firstMasked > 10) range_coll += TSeqRange(mfa_firstMasked, mfa_pos-1); } } mfa_prevMasked=mfa_bMasked; } len+=line.size(); /* to do: save runs of Ns as CRangeCollection<TSeqPos> later, will test component spans with: // returns iterator pointing to the TRange that has ToOpen > pos const_iterator find(position_type pos) const { PRangeLessPos<TRange, position_type> p; return lower_bound(begin(), end(), pos, p); } */ } } if( acc.size() ) { // close off the last acc prev_len = m_comp2len.AddCompLen(acc, len); if(acc_long!=acc) prev_len = m_comp2len.AddCompLen(acc_long, len, false); if(prev_len) goto LengthRedefinedFa; if(mfa_bMasked) { if(mfa_pos-mfa_firstMasked > 10) range_coll += TSeqRange(mfa_firstMasked, mfa_pos-1); } if(!range_coll.empty()) { m_comp2range_coll[acc] = range_coll; } } if(acc_count==0) { cerr<< "WARNING - empty file " << filename << "\n"; } return; LengthRedefinedFa: cerr<< "ERROR - sequence length redefined from " << prev_len << " to " << len << "\n" << " sequence id: " << acc_long << "\n" << " File: " << filename << "\n" << " Lines: "<< header_line_num << ".." << line_num << "\n\n"; exit(1); }
/// Conversion to TSeqRange: the range starts at the interval's "from" and
/// ends at the interval's "to" minus one.
operator TSeqRange() const
{
    const TSeqPos range_from = m_Interval->GetFrom();
    const TSeqPos range_to = m_Interval->GetTo() - 1;
    return TSeqRange(range_from, range_to);
}
/// should the sequence data be fetched by this library? /// @param range Range restriction for all sequences (default means no /// restriction). To support the specification of a single /// coordinate (start or stop), use the SetRange() method, /// the missing coordinate will be set the default value /// (e.g.: 0 for starting coordinate, sequence length for /// ending coordinate) [in] /// @param seqlen_thresh2guess sequence length threshold for molecule /// type guessing (see @ref kSeqLenThreshold2Guess) [in] /// @param local_id_counter counter used to create the CSeqidGenerator to /// create local identifiers for sequences read [in] CBlastInputSourceConfig(const SDataLoaderConfig& dlconfig, objects::ENa_strand strand = objects::eNa_strand_other, bool lowercase = false, bool believe_defline = false, TSeqRange range = TSeqRange(), bool retrieve_seq_data = true, int local_id_counter = 1, unsigned int seqlen_thresh2guess = numeric_limits<unsigned int>::max()); /// Destructor /// ~CBlastInputSourceConfig() {} /// Set the strand to a specified value /// @param strand The strand value /// void SetStrand(objects::ENa_strand strand) { m_Strand = strand; } /// Retrieve the current strand value