Esempio n. 1
0
// Read the next line while the reader is in CRLF mode.
//
// With automatic EOL detection enabled, the work is delegated to
// x_AdvanceEOLSimple('\n', '\r') and the reader's EOL style is adjusted
// according to what that call reports.  With detection disabled, only a
// full CR+LF pair ends a line; a bare LF is kept as part of the line text.
//
// Returns the reader's (possibly updated) EOL style.
CStreamLineReader::EEOLStyle CStreamLineReader::x_AdvanceEOLCRLF(void)
{
    if ( !m_AutoEOL ) {
        NcbiGetline(*m_Stream, m_Line, '\n', &m_LastReadSize);

        // A chunk that does not end in CR was cut at a bare LF: put the LF
        // back into the text and keep appending chunks until one ends in CR
        // or the input runs out.
        string continuation;
        while ( !AtEOF()  &&  !NStr::EndsWith(m_Line, "\r") ) {
            SIZE_TYPE continuation_size;
            m_Line += '\n';
            NcbiGetline(*m_Stream, continuation, '\n', &continuation_size);
            m_Line += continuation;
            // + 1 accounts for the LF re-inserted above
            m_LastReadSize += continuation_size + 1;
        }

        // Drop the CR half of the terminating CRLF, if there is one.
        if (NStr::EndsWith(m_Line, "\r")) {
            m_Line.resize(m_Line.size() - 1);
        }
        return m_EOLStyle;
    }

    const EEOLStyle detected = x_AdvanceEOLSimple('\n', '\r');
    if (detected == eEOL_mixed) {
        // an embedded CR was found
        m_EOLStyle = eEOL_cr;
    } else if (detected != eEOL_crlf) {
        m_EOLStyle = eEOL_lf;
    }
    return m_EOLStyle;
}
Esempio n. 2
0
// Line-reading callback.
//
// user_data must point to a CNcbiIstream.  Returns 0 when that stream is
// already in a failed state; otherwise reads up to the next '\n', removes
// every '\r' from the text and returns a strdup()-allocated copy of it
// (to be released with free()).
static char * ALIGNMENT_CALLBACK s_ReadLine(void *user_data)
{
    CNcbiIstream *input = static_cast<CNcbiIstream *>(user_data);
    if (input->fail()) {
        return 0;
    }

    string text;
    NcbiGetline(*input, text, "\n");
    NStr::ReplaceInPlace (text, "\r", "");
    return strdup(text.c_str());
}
Esempio n. 3
0
// Advance to the next line of input.
//
// The line counter is always incremented.  If a line was previously
// "ungotten" (m_UngetLine), the flag is cleared and the current line is
// delivered again without touching the stream.  Otherwise the next line is
// read with the routine matching the current EOL style.
//
// Returns *this.
CStreamLineReader& CStreamLineReader::operator++(void)
{
    ++m_LineNumber;
    if ( m_UngetLine ) {
        m_UngetLine = false;
        return *this;
    }

    switch (m_EOLStyle) {
    case eEOL_unknown:
        x_AdvanceEOLUnknown();
        break;
    case eEOL_cr:
        x_AdvanceEOLSimple('\r', '\n');
        break;
    case eEOL_lf:
        x_AdvanceEOLSimple('\n', '\r');
        break;
    case eEOL_crlf:
        x_AdvanceEOLCRLF();
        break;
    case eEOL_mixed:
        // Record the read size here too, as every other branch does;
        // otherwise m_LastReadSize would keep the value from an earlier line.
        NcbiGetline(*m_Stream, m_Line, "\r\n", &m_LastReadSize);
        break;
    }
    return *this;
}
Esempio n. 4
0
// Read one line terminated by `eol`, watching for the alternative
// terminator `alt_eol` when automatic EOL detection is enabled.
//
// Returns the EOL style observed for this line:
//   - eEOL_mixed when alt_eol turned up inside the line (the text after it
//     is pushed back onto the stream and m_EOLStyle is switched to mixed),
//     or when the reader already was in mixed mode;
//   - eEOL_crlf  when the two terminators were adjacent;
//   - eEOL_cr / eEOL_lf otherwise, depending on `eol`.
CStreamLineReader::EEOLStyle CStreamLineReader::x_AdvanceEOLSimple(char eol,
                                                                   char alt_eol)
{
    NcbiGetline(*m_Stream, m_Line, eol, &m_LastReadSize);

    const EEOLStyle plain_style = (eol == '\r') ? eEOL_cr : eEOL_lf;
    if ( !m_AutoEOL ) {
        return plain_style;
    }

    const SIZE_TYPE alt_pos = m_Line.find(alt_eol);
    if (alt_pos != NPOS) {
        const SIZE_TYPE tail_start = alt_pos + 1;
        // A CR *immediately* preceding the LF terminator is quite all right;
        // anything else means the line really contains two lines.
        const bool ends_with_crlf =
            (eol == '\n')  &&  (tail_start == m_Line.size());
        if ( !ends_with_crlf ) {
            CStreamUtils::Pushback(*m_Stream, m_Line.data() + tail_start,
                                   m_Line.size() - tail_start);
            m_EOLStyle = eEOL_mixed;
        }
        m_Line.resize(alt_pos);
        m_LastReadSize = tail_start;
        if (m_EOLStyle == eEOL_mixed) {
            return m_EOLStyle;
        }
        return eEOL_crlf;
    }

    // Reading up to CR: swallow an LF that follows right away.
    if (eol == '\r'  &&
        CT_EQ_INT_TYPE(m_Stream->peek(), CT_TO_INT_TYPE(alt_eol))) {
        m_Stream->get();
        ++m_LastReadSize;
        return eEOL_crlf;
    }
    return plain_style;
}
Esempio n. 5
0
// Read the first line while the EOL convention is still undetermined and
// pick an initial EOL style from the character that terminated it.
//
// Only meaningful with automatic EOL detection enabled (asserted).
// Returns the resulting EOL style; it is left unchanged when the
// terminator was neither CR nor LF.
CStreamLineReader::EEOLStyle CStreamLineReader::x_AdvanceEOLUnknown(void)
{
    _ASSERT(m_AutoEOL);
    NcbiGetline(*m_Stream, m_Line, "\r\n", &m_LastReadSize);

    // Step back one character and read it again to see which terminator
    // ended the line.
    m_Stream->unget();
    const CT_INT_TYPE terminator = m_Stream->get();

    if (CT_EQ_INT_TYPE(terminator, CT_TO_INT_TYPE('\n'))) {
        // NcbiGetline does not tell us whether this was a bare LF or the
        // tail of a CRLF pair, and tellg() (which could settle the question)
        // is not available on every stream.  Assume CRLF for now and leave
        // the decision to x_AdvanceEOLCRLF, which reads the next line and
        // can switch to eEOL_lf when appropriate.
        //
        // Passing "\n\r" instead of "\r\n" and then looking for an LF right
        // after a terminating CR would have been an alternative, but it
        // would count a genuine(!) LF+CR sequence as a single line break.
        m_EOLStyle = eEOL_crlf;
    } else if (CT_EQ_INT_TYPE(terminator, CT_TO_INT_TYPE('\r'))) {
        m_EOLStyle = eEOL_cr;
    }
    return m_EOLStyle;
}
Esempio n. 6
0
int main(int argc, char* argv[])
{
    CAccPatternCounter pc;

    // Add accessions
    string s;
    while( NcbiGetline(cin, s, "\r\n") ) {
        if( s.size() ) {
            pc.AddName(s);
        }
    }

    // Print expanded patterns and counts, most frequent patterns first.
    // Runs of digits are replaced with ranges, or kept as is.
    CAccPatternCounter::TMapCountToString cnt_pat; // multimap<int,string>
    pc.GetSortedPatterns(cnt_pat);

    for(CAccPatternCounter::TMapCountToString::reverse_iterator
        it = cnt_pat.rbegin(); it != cnt_pat.rend(); ++it
    ) {
        // pattern <tab> count
        cout<< it->second << "\t" << it->first  << "\n";
    }
}
Esempio n. 7
0
// Normalizes the whitespace of an AGP stream and passes it on to
// CAgpRenumber, which writes to `out`.
//
// Each input line is cleaned up (leading spaces dropped, spaces turned into
// tabs, repeated tabs collapsed, upper-case gap type / linkage lowered,
// "#" comments copied verbatim) into an in-memory buffer.  The buffer is
// handed to CAgpRenumber::ReadStream() every MAX_BUF_LINES lines and once
// more at the end of input.  A summary of the corrections made is printed
// to cerr.
//
// Returns 0 on success, 1 if the renumberer reported an error (its message
// is printed to cerr).
int ProcessStream(istream &in, ostream& out)
{
  CAgpRenumber renum(out);

  string s;
  CNcbiOstrstream* buf=new CNcbiOstrstream();
  int buf_lines=0;
  int code=0;

  // for reporting
  bool had_space    =false;
  bool had_extra_tab=false;
  bool no_eol_at_eof=false;
  bool bad_case_gap =false;

  while( NcbiGetline(in, s, "\r\n") ) {
    // get rid of spaces except in or in front of EOL #comments
    char prev_ch=0;
    int tab_count=0;
    bool at_beg=true;

    char component_type=0;
    for(SIZE_TYPE i=0; i<s.size(); i++) {
      char ch=s[i];
      switch(ch) {
        case ' ':
          // leading spaces are dropped silently (prev_ch is left untouched)
          if(at_beg) continue;
          had_space=true;
          ch='\t';
          // fall through: from here on a space is handled as a tab
        case '\t':
          if(prev_ch!='\t') {
            tab_count++;
            *buf<<'\t';
            if(tab_count>8) {
              if( tab_count==9 && i<s.size()-1 && s[i+1]=='#' ) {
                // don't bark at the tab we keep (for aesthetic reasons)
                // in front of EOL comment in component lines
              }
              else if(!had_space){
                had_extra_tab=true;
              }
            }
          }
          else if(!had_space){
            // not necessarily a complete diagnosis, but at least true
            had_extra_tab=true;
          }

          break;
        case '#':
          // the rest of the line is a comment - copy it as is
          *buf << s.substr(i);
          goto EndFor;
        default:
          // 2010/09/14 lowercase gap type and linkage
          if(prev_ch=='\t' && tab_count==4) {
            // first character after the 4th tab
            component_type=ch;
          }
          if( (component_type=='N' || component_type=='U') &&
              (tab_count==6 || tab_count==7)
          ) {
            // tolower() is only defined for values representable as
            // unsigned char (or EOF), so do not pass a plain char.
            const char lower_ch =
              static_cast<char>( tolower(static_cast<unsigned char>(ch)) );
            if(lower_ch!=ch) {
              ch=lower_ch; bad_case_gap=true;
            }
          }

          if(tab_count>8) {
            // A fatal error - let CAgpRow catch it and complain
            *buf << '\t' << s.substr(i);
            goto EndFor;
          }
          at_beg=false;
          *buf << ch;
      }
      prev_ch=ch;
    }
    EndFor:

    *buf << '\n';
    if(++buf_lines>=MAX_BUF_LINES) {
      // buffer is full - feed it to the renumberer and start a fresh one
      buf_lines=0;

      s = CNcbiOstrstreamToString(*buf);
      CNcbiIstrstream is(s.data(), s.size());
      code=renum.ReadStream(is, CAgpReader::eFinalize_No);
      if(code) break;

      delete buf;
      buf=new CNcbiOstrstream();
    }

    // a line was obtained but the stream hit EOF while reading it
    if(in.eof()) no_eol_at_eof=true;
  }

  if(buf_lines) {
    // flush the lines still sitting in the buffer
    s = CNcbiOstrstreamToString(*buf);
    CNcbiIstrstream is(s.data(), s.size());
    code=renum.ReadStream(is, CAgpReader::eFinalize_No);
  }

  if(!code) code=renum.Finalize();
  if( code) {
    cerr << renum.GetErrorMessage()<<"\nRenumbering not completed because of errors.\n";
    delete buf; // release the buffer on the error path as well
    return 1;
  }

  if(had_space           ) cerr << "Spaces converted to tabs.\n";
  if(had_extra_tab       ) cerr << "Extra tabs removed.\n";
  if(renum.had_empty_line) cerr << "Empty line(s) removed.\n";
  if(renum.custom_err.had_missing_tab) cerr << "Missing tabs added at the ends of gap lines.\n";
  //if(renum.custom_err.bad_part_number) cerr << "Invalid part numbers corrected.\n";
  if(no_eol_at_eof       ) cerr << "Line break added at the end of file.\n";
  if(bad_case_gap        ) cerr << "Gap type/linkage converted to lower case.\n";
  if(renum.reordered_ln_ev) cerr << "Linkage evidence terms reordered.\n";
  if(renum.renum_objs    ) cerr << renum.renum_objs << " object(s) renumbered.\n";
  if(renum.no_renum_objs ) {
    if(renum.renum_objs)
      cerr << renum.no_renum_objs << " object(s) did not need renumbering.\n";
    else
      cerr << "All lines have proper object_beg, object_end, part_number.\n";
  }

  delete buf;
  return 0;
}
Esempio n. 8
0
// To be moved to MapCompLen.cpp
// Loads component lengths (and long runs of Ns) from a FASTA stream.
//
// For every ">header" record the first word of the header is taken as the
// component id; the total number of sequence characters is registered in
// m_comp2len under both the extracted accession and the full first word.
// Runs of more than 10 consecutive N/n characters are collected, as 1-based
// inclusive ranges, into m_comp2range_coll for that accession.
//
// istr     - FASTA input
// filename - used in diagnostics only
//
// Terminates the program with exit(1) when the file does not start with a
// header, contains a non-alphabetic sequence character, or redefines the
// length of a sequence.  Prints a warning if no record was found at all.
void CAgpValidateApplication::x_LoadLenFa(CNcbiIstream& istr, const string& filename)
{
  string line;
  string acc, acc_long;
  int line_num=0;
  int acc_count=0;

  // these are initialized only to suppress the warnings
  int header_line_num=0;
  int len=0;
  int prev_len=0;

  TRangeColl range_coll; // runs of Ns in the fasta of the current component
  TSeqPos mfa_firstMasked=0;  // 1-based position of the first N of the current run
  TSeqPos mfa_pos=0;          // 1-based position of the last character seen
  bool mfa_bMasked=false;
  bool mfa_prevMasked=false;

  while( NcbiGetline(istr, line, "\r\n") ) {
    line_num++;
    //if(line.size()==0) continue;

    if(!line.empty() && line[0]=='>') {
      if( acc.size() ) {
        // close off the previous acc

        // warn if acc could also be an accession
        OverrideLenIfAccession(acc, len);

        prev_len =  m_comp2len.AddCompLen(acc, len);
        if(acc_long!=acc) prev_len =  m_comp2len.AddCompLen(acc_long, len, false);
        if(prev_len) goto LengthRedefinedFa;

        if(mfa_bMasked) {
          // The run of Ns reaches the end of the sequence, so mfa_pos is
          // the last N itself (not the first non-N character as in the
          // loop below): the run covers mfa_firstMasked..mfa_pos.
          if(mfa_pos+1-mfa_firstMasked > 10)
            range_coll += TSeqRange(mfa_firstMasked, mfa_pos);
        }
        if(!range_coll.empty()) {
          m_comp2range_coll[acc] = range_coll;
        }

        range_coll.clear();
        mfa_firstMasked=mfa_pos=0;
        mfa_bMasked=false;
        mfa_prevMasked=false;
      }

      // Get first word, trim final '|' (if any).
      SIZE_TYPE pos1=line.find(' ' , 1);
      SIZE_TYPE pos2=line.find('\t', 1);
      if(pos2<pos1) pos1 = pos2;
      if(pos1!=NPOS) {
        // pos1 becomes the index of the last character of the word, which
        // equals the character count to take starting at index 1
        pos1--;
        if(pos1>0 && line[pos1]=='|') pos1--;
      }

      acc_long=line.substr(1, pos1);
      acc=ExtractAccession( acc_long );
      len=0;
      header_line_num=line_num;
      acc_count++;
    }
    else {
      if(acc.size()==0) {
        cerr<< "ERROR - expecting >fasta_header at start of file " << filename << ", got:\n"
            << line.substr(0, 100) << "\n\n";
        exit(1);
      }

      for(SIZE_TYPE i=0; i<line.size(); i++ ) {
        // <cctype> functions require a value representable as unsigned char
        const unsigned char uch = static_cast<unsigned char>(line[i]);
        if(!isalpha(uch)) {
          cerr<< "ERROR - non-alphabetic character in the FASTA:\n"
                 "  file " << filename << "\n  line " << line_num << "\n  column " << i+1 << "\n\n";
          exit(1);
        }

        mfa_pos++;
        mfa_bMasked = toupper(uch) == 'N';
        if(mfa_bMasked!=mfa_prevMasked) {
          if(mfa_bMasked) {
            // a run of Ns starts here
            mfa_firstMasked=mfa_pos;
          }
          else{
            // a run of Ns ended at the previous character
            if(mfa_pos-mfa_firstMasked > 10)
              range_coll += TSeqRange(mfa_firstMasked, mfa_pos-1);
          }
        }
        mfa_prevMasked=mfa_bMasked;

      }

      len+=line.size();

      /* to do: save runs of Ns as CRangeCollection<TSeqPos>
         later, will test component spans with:

         // returns iterator pointing to the TRange that has ToOpen > pos
          const_iterator  find(position_type pos)   const
          {
              PRangeLessPos<TRange, position_type> p;
              return lower_bound(begin(), end(), pos, p);
          }
      */
    }
  }

  if( acc.size() ) {
    // close off the last acc
    // NOTE(review): unlike the in-loop close-off above, OverrideLenIfAccession()
    // is not called for the last record - confirm whether that is intended.
    prev_len =  m_comp2len.AddCompLen(acc, len);
    if(acc_long!=acc) prev_len =  m_comp2len.AddCompLen(acc_long, len, false);
    if(prev_len) goto LengthRedefinedFa;

    if(mfa_bMasked) {
      // trailing run of Ns: mfa_pos is the last N itself (see above)
      if(mfa_pos+1-mfa_firstMasked > 10)
        range_coll += TSeqRange(mfa_firstMasked, mfa_pos);
    }
    if(!range_coll.empty()) {
      m_comp2range_coll[acc] = range_coll;
    }
  }
  if(acc_count==0) {
    cerr<< "WARNING - empty file " << filename << "\n";
  }
  return;

LengthRedefinedFa:
  cerr<< "ERROR - sequence length redefined from " << prev_len << " to " << len << "\n"
      << "  sequence id: " << acc_long << "\n"
      << "  File: " << filename << "\n"
      << "  Lines: "<< header_line_num << ".." << line_num << "\n\n";
  exit(1);
}
Esempio n. 9
0
// Validates one AGP stream.
//
// Without the VT_Acc validation type the whole stream is simply handed to
// m_reader.  Otherwise every line is parsed with CAgpRow and queued in
// m_AltValidator: component lines whose id is present in m_comp2len are
// checked against the known length and queued verbatim, other component
// lines are queued for batch lookup, and gap/error lines are queued
// verbatim when m_AltValidator has an output stream.
void CAgpValidateApplication::x_ValidateFile(
  CNcbiIstream& istr)
{
  if( 0==(m_ValidationType&VT_Acc) ) {
    // CAgpReader
    m_reader.SetVersion(m_agp_version);
    m_reader.ReadStream(istr); // , false
    return;
  }

  int line_num = 0;
  string  line;
  CRef<CAgpRow> agp_row( CAgpRow::New(pAgpErr.GetPointer(), m_agp_version));

  // Allow Unix, DOS, Mac EOL characters
  while( NcbiGetline(istr, line, "\r\n") ) {
    ++line_num;

    const int code=agp_row->FromString(line);
    if(code==-1) {
      continue; // skip a comment line
    }

    bool queued=false;
    bool comp2len_check_failed=false;

    // code!=0: the error message already reached the error handler
    const bool is_component = (code==0) && !agp_row->IsGap();
    if( is_component ) {
      if( m_comp2len.size() ) {
        TMapStrInt::iterator known = m_comp2len.find( agp_row->GetComponentId() );
        if( known!=m_comp2len.end() ) {
          comp2len_check_failed = !agp_row->CheckComponentEnd(known->second);
          // Skip regular genbank-based validation for this line;
          // will print it verbatim, same as gap or error line.
          m_AltValidator->QueueLine(line);
          queued=true;
        }
        // else: will try Entrez and ObjMan
      }
      if( !queued ) {
        // component line - queue for batch lookup
        m_AltValidator->QueueLine(line,
          agp_row->GetComponentId(), line_num, agp_row->component_end);
        queued=true;
      }
    }

    if( m_AltValidator->m_out && !queued ) {
      // error or gap line - queue for verbatim reprinting
      m_AltValidator->QueueLine(line);
    }

    // Process the batch now when something went wrong with this line, so
    // that error lines are printed in the correct order, or when the queue
    // has grown large enough.
    const bool flush_now =
      code!=0 || comp2len_check_failed ||
      m_AltValidator->QueueSize() >= 1000;
    if( flush_now ) {
      // set the messages collected so far aside while the queue is being
      // processed, then put them back
      AutoPtr<CNcbiOstrstream> saved_messages = pAgpErr->m_messages;
      pAgpErr->m_messages.reset(  new CNcbiOstrstream );

      // process a batch of preceding lines
      m_AltValidator->ProcessQueue();

      pAgpErr->m_messages = saved_messages;
    }

    pAgpErr->LineDone(line, line_num, code!=0 );
  }

  // whatever is still queued after the last line
  m_AltValidator->ProcessQueue();
}
Esempio n. 10
0
// Merges several OMSSA search files into one and writes the result.
//
// Input and output formats are selected with the ix/ib/it/ibz2 and
// ox/ob/ot/obz2 arguments.  The input files are taken either from the list
// file given with "i" (one file name per line) or, if that is empty, from
// the extra command-line arguments.  The first search is copied, every
// further one is appended.  The merged search (or only its first response
// when "sw" is set) is written to the file given with "o".
//
// Returns 1 if merging an input file raised a CException, 0 otherwise.
int COMSSAMerge::Run()
{

    try {

        CArgs args = GetArgs();

        CRef <COMSSASearch> MySearch(new COMSSASearch);

        ESerialDataFormat InFileType(eSerial_Xml), OutFileType(eSerial_Xml);

        bool obz2(false);  // output bzip2 compressed?
        bool ibz2(false);  // input bzip2 compressed?

        if(args["ox"]) OutFileType = eSerial_Xml;
        else if(args["ob"]) OutFileType = eSerial_AsnBinary;
        else if(args["ot"]) OutFileType = eSerial_AsnText;
        else if(args["obz2"]) {
            OutFileType = eSerial_Xml;
            obz2 = true;
        }
        else ERR_POST(Fatal << "output file type not given");

        if(args["ix"]) InFileType = eSerial_Xml;
        else if(args["ib"]) InFileType = eSerial_AsnBinary;
        else if(args["it"]) InFileType = eSerial_AsnText;
        else if(args["ibz2"]) {
            InFileType = eSerial_Xml;
            ibz2 = true;
        }
        else ERR_POST(Fatal << "input file type not given");


        // loop thru input files
        if ( args["i"].AsString() != "") {
            ifstream is(args["i"].AsString().c_str());
            bool Begin(true);
            if(!is)
                ERR_POST(Fatal << "unable to open input file list " << args["i"].AsString());

            // Drive the loop with the read itself rather than with eof():
            // a last file name that is not followed by a line break is read
            // successfully *and* sets eof, and must still be processed.
            string iFileName;
            while(NcbiGetline(is, iFileName, "\x0d\x0a")) {
                if(iFileName.empty()) continue;  // skip blank lines
                try {
                    CRef <COMSSASearch> InSearch(new COMSSASearch);
                    CSearchHelper::ReadCompleteSearch(iFileName, InFileType, ibz2, *InSearch);
//                    InSearch->ReadCompleteSearch(iFileName, InFileType, ibz2);
                    if(Begin) {
                        // copy
                        Begin = false;
                        MySearch->CopyCMSSearch(InSearch);
                    }
                    else {
                        // add
                        MySearch->AppendSearch(InSearch);
                    }
                }
                catch(CException& e) {
                    ERR_POST(Fatal << "exception: " << e.what());
                    return 1;
                }
            }
        }
        else if ( args.GetNExtra() ) {
            // extra arguments are indexed starting from 1
            for (size_t extra = 1;  extra <= args.GetNExtra();  extra++) {
                CRef <COMSSASearch> InSearch(new COMSSASearch);
                CSearchHelper::ReadCompleteSearch(args[extra].AsString(), InFileType, ibz2, *InSearch);
                //InSearch->ReadCompleteSearch(args[extra].AsString(), InFileType, ibz2);
                try {
                    if(extra == 1) {
                        // copy
                        MySearch->CopyCMSSearch(InSearch);
                    }
                    else {
                        // add
                        MySearch->AppendSearch(InSearch);
                    }
                }
                catch(CException& e) {
                    ERR_POST(Fatal << "exception: " << e.what());
                    return 1;
                }
            }
        }

        // write out the new search

        auto_ptr <CNcbiOfstream> raw_out;
        auto_ptr <CCompressionOStream> compress_out;
        auto_ptr <CObjectOStream> txt_out;

        if( obz2 ) {
            // serialize through a bzip2 compression stream layered over the file
            raw_out.reset(new CNcbiOfstream(args["o"].AsString().c_str()));
            compress_out.reset( new CCompressionOStream (*raw_out,
                                                         new CBZip2StreamCompressor(),
                                                         CCompressionStream::fOwnProcessor));
            txt_out.reset(CObjectOStream::Open(OutFileType, *compress_out));
        }
        else {
            txt_out.reset(CObjectOStream::Open(args["o"].AsString().c_str(), OutFileType));
        }


//        auto_ptr <CObjectOStream> txt_out(
//             CObjectOStream::Open(args["o"].AsString(), OutFileType));

        if(txt_out.get()) {
            SetUpOutputFile(txt_out.get(), OutFileType);
            if (args["sw"]) {
                // write the first response only
                txt_out->Write(ObjectInfo(*(*MySearch->SetResponse().begin())));
            }
            else {
                txt_out->Write(ObjectInfo(*MySearch));
            }
            txt_out->Flush();
            txt_out->Close();
        }


    } catch (NCBI_NS_STD::exception& e) {
        ERR_POST(Fatal << "Exception in COMSSAMerge::Run: " << e.what());
    }

    return 0;
}