/// Read the next line while the current working assumption is CRLF endings.
///
/// With automatic EOL detection enabled, the read is delegated to
/// x_AdvanceEOLSimple('\n', '\r') and m_EOLStyle is revised from its verdict.
/// Without it, LF-terminated chunks are concatenated (re-inserting the LF)
/// until a chunk ends in CR or the stream is exhausted; the trailing CR, if
/// any, is then stripped from m_Line.
///
/// @return the value of m_EOLStyle once the line has been read.
CStreamLineReader::EEOLStyle CStreamLineReader::x_AdvanceEOLCRLF(void)
{
    if ( !m_AutoEOL ) {
        string chunk;
        NcbiGetline(*m_Stream, m_Line, '\n', &m_LastReadSize);
        for (;;) {
            if (AtEOF()  ||  NStr::EndsWith(m_Line, "\r")) {
                break;
            }
            // The LF that ended the previous read was not preceded by CR,
            // so it belongs to the line: restore it and keep reading.
            SIZE_TYPE chunk_size;
            m_Line += '\n';
            NcbiGetline(*m_Stream, chunk, '\n', &chunk_size);
            m_Line += chunk;
            // presumably the extra 1 accounts for the restored LF
            m_LastReadSize += chunk_size + 1;
        }
        if (NStr::EndsWith(m_Line, "\r")) {
            m_Line.resize(m_Line.size() - 1);
        }
        return m_EOLStyle;
    }

    const EEOLStyle detected = x_AdvanceEOLSimple('\n', '\r');
    if (detected == eEOL_mixed) {
        // an embedded CR was found
        m_EOLStyle = eEOL_cr;
    } else if (detected != eEOL_crlf) {
        m_EOLStyle = eEOL_lf;
    }
    return m_EOLStyle;
}
/// Line-reading callback: fetch one LF-terminated line from an input stream.
///
/// @param user_data  pointer to the CNcbiIstream to read from.
/// @return 0 if the stream is already in a failed state; otherwise a
///         strdup()-allocated copy of the line with every CR removed.
static char * ALIGNMENT_CALLBACK s_ReadLine(void *user_data)
{
    CNcbiIstream& input = *static_cast<CNcbiIstream *>(user_data);
    if ( !input ) {
        return 0;
    }

    string text;
    NcbiGetline(input, text, "\n");
    // Drop carriage returns so CRLF input yields the same text as LF input.
    NStr::ReplaceInPlace(text, "\r", "");
    return strdup(text.c_str());
}
/// Advance the reader to the next line of the underlying stream.
///
/// The line counter is always incremented.  If a line was pushed back
/// (m_UngetLine is set) the flag is cleared and nothing is read, so the
/// current contents of m_Line are delivered again.  Otherwise the line is
/// read by the routine matching the currently known end-of-line style.
///
/// @return *this
CStreamLineReader& CStreamLineReader::operator++(void)
{
    ++m_LineNumber;
    if ( m_UngetLine ) {
        m_UngetLine = false;
        return *this;
    }
    switch (m_EOLStyle) {
    case eEOL_unknown:
        x_AdvanceEOLUnknown();
        break;
    case eEOL_cr:
        x_AdvanceEOLSimple('\r', '\n');
        break;
    case eEOL_lf:
        x_AdvanceEOLSimple('\n', '\r');
        break;
    case eEOL_crlf:
        x_AdvanceEOLCRLF();
        break;
    case eEOL_mixed:
        // Record the number of characters consumed here as well: all the
        // x_AdvanceEOL* helpers update m_LastReadSize on every read, and
        // leaving it stale in mixed mode would make it describe an earlier
        // line.
        NcbiGetline(*m_Stream, m_Line, "\r\n", &m_LastReadSize);
        break;
    }
    return *this;
}
/// Read one line terminated by @a eol, watching for the other EOL character.
///
/// @param eol      the terminator to read up to.
/// @param alt_eol  the alternative terminator to look out for when automatic
///                 EOL detection (m_AutoEOL) is enabled.
/// @return eEOL_mixed if @a alt_eol was found inside the line anywhere other
///         than immediately before a terminating LF (the rest of the line is
///         pushed back onto the stream and m_EOLStyle is set to eEOL_mixed);
///         eEOL_crlf if the line ended in a CR/LF pair; otherwise eEOL_cr or
///         eEOL_lf according to @a eol.
CStreamLineReader::EEOLStyle CStreamLineReader::x_AdvanceEOLSimple(char eol,
                                                                   char alt_eol)
{
    NcbiGetline(*m_Stream, m_Line, eol, &m_LastReadSize);

    if (m_AutoEOL) {
        const SIZE_TYPE alt_at = m_Line.find(alt_eol);
        if (alt_at != NPOS) {
            const SIZE_TYPE after_alt = alt_at + 1;
            // A CR sitting *immediately* before the terminating LF is fine;
            // anything else means the tail really belongs to later lines.
            if (eol != '\n'  ||  after_alt != m_Line.size()) {
                CStreamUtils::Pushback(*m_Stream,
                                       m_Line.data() + after_alt,
                                       m_Line.size() - after_alt);
                m_EOLStyle = eEOL_mixed;
            }
            m_Line.resize(alt_at);
            m_LastReadSize = after_alt;
            if (m_EOLStyle == eEOL_mixed) {
                return m_EOLStyle;
            }
            return eEOL_crlf;
        }
        if (eol == '\r'
            &&  CT_EQ_INT_TYPE(m_Stream->peek(), CT_TO_INT_TYPE(alt_eol))) {
            // The CR terminator is directly followed by the alternative
            // character: consume it as part of the same line break.
            m_Stream->get();
            ++m_LastReadSize;
            return eEOL_crlf;
        }
    }

    if (eol == '\r') {
        return eEOL_cr;
    }
    return eEOL_lf;
}
/// Read a line while the end-of-line style is still undetermined, and make
/// a first guess at the style from the character that terminated the line.
///
/// Only valid when automatic EOL detection is enabled (asserted).
///
/// @return the value of m_EOLStyle after the guess: eEOL_cr if the line ended
///         with CR, eEOL_crlf if it ended with LF, otherwise unchanged.
CStreamLineReader::EEOLStyle CStreamLineReader::x_AdvanceEOLUnknown(void)
{
    _ASSERT(m_AutoEOL);
    NcbiGetline(*m_Stream, m_Line, "\r\n", &m_LastReadSize);

    // Step back over the last character consumed and read it again to find
    // out which terminator ended the line.
    m_Stream->unget();
    const CT_INT_TYPE terminator = m_Stream->get();

    if (CT_EQ_INT_TYPE(terminator, CT_TO_INT_TYPE('\r'))) {
        m_EOLStyle = eEOL_cr;
    } else if (CT_EQ_INT_TYPE(terminator, CT_TO_INT_TYPE('\n'))) {
        // NcbiGetline does not report enough to choose between eEOL_lf and
        // eEOL_crlf, and tellg() -- which could settle the question -- is
        // not usable on every stream.  So assume CRLF for now and leave the
        // rest to x_AdvanceEOLCRLF: it reads the next line and is able to
        // switch to eEOL_lf when that turns out to be appropriate.
        //
        // The alternative of passing \n\r instead of \r\n, and probing for
        // an immediately following \n whenever the terminator is \r, was
        // rejected because it would miscount a genuine(!) \n\r sequence as
        // a single line break.
        m_EOLStyle = eEOL_crlf;
    }
    return m_EOLStyle;
}
int main(int argc, char* argv[]) { CAccPatternCounter pc; // Add accessions string s; while( NcbiGetline(cin, s, "\r\n") ) { if( s.size() ) { pc.AddName(s); } } // Print expanded patterns and counts, most frequent patterns first. // Runs of digits are replaced with ranges, or kept as is. CAccPatternCounter::TMapCountToString cnt_pat; // multimap<int,string> pc.GetSortedPatterns(cnt_pat); for(CAccPatternCounter::TMapCountToString::reverse_iterator it = cnt_pat.rbegin(); it != cnt_pat.rend(); ++it ) { // pattern <tab> count cout<< it->second << "\t" << it->first << "\n"; } }
int ProcessStream(istream &in, ostream& out) { CAgpRenumber renum(out); string s; CNcbiOstrstream* buf=new CNcbiOstrstream(); int buf_lines=0; int code=0; // for reporting bool had_space =false; bool had_extra_tab=false; bool no_eol_at_eof=false; bool bad_case_gap =false; while( NcbiGetline(in, s, "\r\n") ) { // get rid of spaces except in or in front of EOL #comments char prev_ch=0; int tab_count=0; bool at_beg=true; char component_type=0; for(SIZE_TYPE i=0; i<s.size(); i++) { char ch=s[i]; switch(ch) { case ' ': if(at_beg) continue; had_space=true; ch='\t'; case '\t': if(prev_ch!='\t') { tab_count++; *buf<<'\t'; if(tab_count>8) { if( tab_count==9 && i<s.size()-1 && s[i+1]=='#' ) { // don't bark at the tab we keep (for aesthetic reasons) // in front of EOL comment in component lines } else if(!had_space){ had_extra_tab=true; } } } else if(!had_space){ // not necessarily a complete diags, but at least true had_extra_tab=true; } break; case '#': *buf << s.substr(i); goto EndFor; default: // 2010/09/14 lowercase gap type and linkage if(prev_ch=='\t' && tab_count==4) { component_type=ch; } if( (component_type=='N' || component_type=='U') && (tab_count==6 || tab_count==7) && tolower(ch)!=ch ) { ch=tolower(ch); bad_case_gap=true; } if(tab_count>8) { // A fatal error - let CAgpRow catch it and complain *buf << '\t' << s.substr(i); goto EndFor; } at_beg=false; *buf << ch; } prev_ch=ch; } EndFor: *buf << '\n'; if(++buf_lines>=MAX_BUF_LINES) { buf_lines=0; s = CNcbiOstrstreamToString(*buf); CNcbiIstrstream is(s.data(), s.size()); code=renum.ReadStream(is, CAgpReader::eFinalize_No); if(code) break; delete buf; buf=new CNcbiOstrstream(); } if(in.eof()) no_eol_at_eof=true; } if(buf_lines) { s = CNcbiOstrstreamToString(*buf); CNcbiIstrstream is(s.data(), s.size()); code=renum.ReadStream(is, CAgpReader::eFinalize_No); } if(!code) code=renum.Finalize(); if( code) { cerr << renum.GetErrorMessage()<<"\nRenumbering not completed because of errors.\n"; return 1; } if(had_space ) cerr 
<< "Spaces converted to tabs.\n"; if(had_extra_tab ) cerr << "Extra tabs removed.\n"; if(renum.had_empty_line) cerr << "Empty line(s) removed.\n"; if(renum.custom_err.had_missing_tab) cerr << "Missing tabs added at the ends of gap lines.\n"; //if(renum.custom_err.bad_part_number) cerr << "Invalid part numbers corrected.\n"; if(no_eol_at_eof ) cerr << "Line break added at the end of file.\n"; if(bad_case_gap ) cerr << "Gap type/linkage converted to lower case.\n"; if(renum.reordered_ln_ev) cerr << "Linkage evidence terms reordered.\n"; if(renum.renum_objs ) cerr << renum.renum_objs << " object(s) renumbered.\n"; if(renum.no_renum_objs ) { if(renum.renum_objs) cerr << renum.no_renum_objs << " object(s) did not need renumbering.\n"; else cerr << "All lines have proper object_beg, object_end, part_number.\n"; } delete buf; return 0; }
// To be moved to MapCompLen.cpp
//
// Loads component lengths from FASTA-formatted text.
//
// For every '>' header line the first word (up to the first space or tab,
// minus one trailing '|') is taken as the long id (acc_long) and
// ExtractAccession() of it as the short id (acc).  The lengths of the
// following sequence lines are summed and registered in m_comp2len under acc
// and, if different, under acc_long.  Runs of 'N'/'n' are collected per
// component into m_comp2range_coll[acc].
//
// Parameters:
//   istr     - stream to read the FASTA text from (CR and LF both end a line)
//   filename - used only in diagnostic messages
//
// Terminates the process with exit(1) when data precedes the first header,
// when a sequence line contains a non-alphabetic character, or when
// AddCompLen() returns a non-zero previous length.  Prints a warning to cerr
// if no header was seen at all.
void CAgpValidateApplication::x_LoadLenFa(CNcbiIstream& istr, const string& filename)
{
  string line;
  string acc, acc_long;   // short and long id of the current component
  int line_num=0;
  int acc_count=0;        // number of header lines seen

  // these are initialized only to suppress the warnings
  int header_line_num=0;
  int len=0;
  int prev_len=0;

  TRangeColl range_coll; // runs of Ns in the fasta of the current component
  TSeqPos mfa_firstMasked=0;   // position (counted from 1) where the current run of Ns started
  TSeqPos mfa_pos=0;           // number of sequence characters read so far for this component
  bool mfa_bMasked=false;      // is the current character an N?
  bool mfa_prevMasked=false;   // was the previous character an N?

  while( NcbiGetline(istr, line, "\r\n") ) {
    line_num++;
    //if(line.size()==0) continue;

    // NOTE(review): for an empty line this reads line[0] at index size();
    // such a line then goes to the sequence branch below.
    if(line[0]=='>') {
      if( acc.size() ) {
        // close off the previous acc

        // warn if acc could also be an accession
        OverrideLenIfAccession(acc, len);

        prev_len =  m_comp2len.AddCompLen(acc, len);
        // NOTE(review): when acc_long differs from acc, the result of the
        // first AddCompLen() call is overwritten before it is checked --
        // confirm this is intended.
        if(acc_long!=acc) prev_len =  m_comp2len.AddCompLen(acc_long, len, false);
        if(prev_len) goto LengthRedefinedFa;

        // The sequence ended inside a run of Ns: record that run, too.
        // NOTE(review): here mfa_pos is the position of the last N, whereas
        // in the per-character loop below it is the first non-N after the
        // run; the same "mfa_pos-1" / "> 10" arithmetic therefore looks one
        // short in this case -- confirm.
        if(mfa_bMasked) {
          if(mfa_pos-mfa_firstMasked > 10)
            range_coll += TSeqRange(mfa_firstMasked, mfa_pos-1);
        }
        if(!range_coll.empty()) {
          m_comp2range_coll[acc] = range_coll;
        }

        // reset the N-run tracking for the next component
        range_coll.clear();
        mfa_firstMasked=mfa_pos=0;
        mfa_bMasked=false;
        mfa_prevMasked=false;
      }

      // Get first word, trim final '|' (if any).
      // (If the header contains neither a space nor a tab, pos1 stays NPOS
      // and the whole remainder of the line is taken, without trimming '|'.)
      SIZE_TYPE pos1=line.find(' ' , 1);
      SIZE_TYPE pos2=line.find('\t', 1);
      if(pos2<pos1) pos1 = pos2;
      if(pos1!=NPOS) {
        // turn the position of the whitespace into a character count for
        // substr(1, ...)
        pos1--;
        if(pos1>0 && line[pos1]=='|') pos1--;
      }

      acc_long=line.substr(1, pos1);
      acc=ExtractAccession( acc_long );
      len=0;
      header_line_num=line_num;
      acc_count++;
    }
    else {
      if(acc.size()==0) {
        cerr<< "ERROR - expecting >fasta_header at start of file " << filename << ", got:\n"
            << line.substr(0, 100) << "\n\n";
        exit(1);
      }

      for(SIZE_TYPE i=0; i<line.size(); i++ ) {
        if(!isalpha(line[i])) {
          cerr<< "ERROR - non-alphabetic character in the FASTA:\n"
                 " file " << filename << "\n line " << line_num << "\n column " << i+1 << "\n\n";
          exit(1);
        }

        mfa_pos++;
        mfa_bMasked = toupper(line[i]) == 'N';
        if(mfa_bMasked!=mfa_prevMasked) {
          if(mfa_bMasked) {
            // a run of Ns starts here
            mfa_firstMasked=mfa_pos;
          }
          else{
            // a run of Ns has just ended; keep it only if longer than 10
            if(mfa_pos-mfa_firstMasked > 10)
              range_coll += TSeqRange(mfa_firstMasked, mfa_pos-1);
          }
        }
        mfa_prevMasked=mfa_bMasked;
      }

      len+=line.size();

      /* to do: save runs of Ns as CRangeCollection<TSeqPos>
         later, will test component spans with:

         // returns iterator pointing to the TRange that has ToOpen > pos
         const_iterator find(position_type pos) const {
           PRangeLessPos<TRange, position_type> p;
           return lower_bound(begin(), end(), pos, p);
         }
      */
    }
  }

  if( acc.size() ) {
    // close off the last acc
    // NOTE(review): unlike the close-off inside the loop, this one does not
    // call OverrideLenIfAccession(acc, len) -- confirm whether that is
    // deliberate.
    prev_len =  m_comp2len.AddCompLen(acc, len);
    if(acc_long!=acc) prev_len =  m_comp2len.AddCompLen(acc_long, len, false);
    if(prev_len) goto LengthRedefinedFa;

    if(mfa_bMasked) {
      if(mfa_pos-mfa_firstMasked > 10)
        range_coll += TSeqRange(mfa_firstMasked, mfa_pos-1);
    }
    if(!range_coll.empty()) {
      m_comp2range_coll[acc] = range_coll;
    }
  }

  if(acc_count==0) {
    cerr<< "WARNING - empty file " << filename << "\n";
  }
  return;

  // Reached via goto when AddCompLen() returned a non-zero value for the
  // current component.
LengthRedefinedFa:
  cerr<< "ERROR - sequence length redefined from " << prev_len << " to " << len << "\n"
      << " sequence id: " << acc_long << "\n"
      << " File: " << filename << "\n"
      << " Lines: "<< header_line_num << ".." << line_num << "\n\n";
  exit(1);
}
// Validates the AGP text read from istr.
//
// When the VT_Acc bit of m_ValidationType is not set, the whole stream is
// handed to m_reader.  Otherwise every line is parsed with a CAgpRow and
// passed to m_AltValidator's queue; the queue is processed in batches, and
// immediately after any line that failed to parse or failed the component
// end check, so that messages come out in line order.
void CAgpValidateApplication::x_ValidateFile(CNcbiIstream& istr)
{
  const bool alt_validation = (m_ValidationType & VT_Acc) != 0;
  if( !alt_validation ) {
    // CAgpReader does all the work
    m_reader.SetVersion(m_agp_version);
    m_reader.ReadStream(istr); // , false
    return;
  }

  CRef<CAgpRow> row( CAgpRow::New(pAgpErr.GetPointer(), m_agp_version) );
  string text;
  int line_no = 0;

  // Unix, DOS and Mac EOL characters are all accepted
  while( NcbiGetline(istr, text, "\r\n") ) {
    ++line_no;

    const int rc = row->FromString(text);
    if( rc == -1 ) {
      continue; // a comment line - skip it
    }

    bool in_queue = false;
    bool comp_end_bad = false;

    // rc != 0: the error message has already reached the error handler
    if( rc == 0 && !row->IsGap() ) {
      if( m_comp2len.size() && !row->IsGap() ) {
        TMapStrInt::iterator known = m_comp2len.find( row->GetComponentId() );
        if( known != m_comp2len.end() ) {
          comp_end_bad = !row->CheckComponentEnd(known->second);
          // The regular genbank-based validation is skipped for this line;
          // it is printed verbatim, like a gap or an error line.
          m_AltValidator->QueueLine(text);
          in_queue = true;
        }
        // otherwise: Entrez and ObjMan will be tried
      }
      if( !in_queue ) {
        // a component line - queue it for the batch lookup
        m_AltValidator->QueueLine(text, row->GetComponentId(),
                                  line_no, row->component_end);
        in_queue = true;
      }
    }

    if( m_AltValidator->m_out && !in_queue ) {
      // an error or gap line - queue it for verbatim reprinting
      m_AltValidator->QueueLine(text);
    }

    // Errors force the batch to be processed right away so that error
    // lines are printed in the correct order; otherwise wait for 1000 lines.
    const bool flush_now =
      rc != 0 || comp_end_bad || m_AltValidator->QueueSize() >= 1000;
    if( flush_now ) {
      // Park the messages of the current line while the preceding lines
      // are processed, then put them back.
      AutoPtr<CNcbiOstrstream> saved_messages = pAgpErr->m_messages;
      pAgpErr->m_messages.reset( new CNcbiOstrstream );

      m_AltValidator->ProcessQueue();

      pAgpErr->m_messages = saved_messages;
    }

    pAgpErr->LineDone(text, line_no, rc != 0);
  }

  // whatever is still queued
  m_AltValidator->ProcessQueue();
}
int COMSSAMerge::Run() { try { CArgs args = GetArgs(); CRef <COMSSASearch> MySearch(new COMSSASearch); ESerialDataFormat InFileType(eSerial_Xml), OutFileType(eSerial_Xml); bool obz2(false); // output bzip2 compressed? bool ibz2(false); // input bzip2 compressed? if(args["ox"]) OutFileType = eSerial_Xml; else if(args["ob"]) OutFileType = eSerial_AsnBinary; else if(args["ot"]) OutFileType = eSerial_AsnText; else if(args["obz2"]) { OutFileType = eSerial_Xml; obz2 = true; } else ERR_POST(Fatal << "output file type not given"); if(args["ix"]) InFileType = eSerial_Xml; else if(args["ib"]) InFileType = eSerial_AsnBinary; else if(args["it"]) InFileType = eSerial_AsnText; else if(args["ibz2"]) { InFileType = eSerial_Xml; ibz2 = true; } else ERR_POST(Fatal << "input file type not given"); // loop thru input files if ( args["i"].AsString() != "") { ifstream is(args["i"].AsString().c_str()); bool Begin(true); if(!is) ERR_POST(Fatal << "unable to open input file list " << args["i"].AsString()); while(!is.eof()) { string iFileName; NcbiGetline(is, iFileName, "\x0d\x0a"); if(iFileName == "" || is.eof()) continue; try { CRef <COMSSASearch> InSearch(new COMSSASearch); CSearchHelper::ReadCompleteSearch(iFileName, InFileType, ibz2, *InSearch); // InSearch->ReadCompleteSearch(iFileName, InFileType, ibz2); if(Begin) { Begin = false; MySearch->CopyCMSSearch(InSearch); } else { // add MySearch->AppendSearch(InSearch); } } catch(CException& e) { ERR_POST(Fatal << "exception: " << e.what()); return 1; } } } else if ( args.GetNExtra() ) { for (size_t extra = 1; extra <= args.GetNExtra(); extra++) { CRef <COMSSASearch> InSearch(new COMSSASearch); CSearchHelper::ReadCompleteSearch(args[extra].AsString(), InFileType, ibz2, *InSearch); //InSearch->ReadCompleteSearch(args[extra].AsString(), InFileType, ibz2); try { if(extra == 1) { // copy MySearch->CopyCMSSearch(InSearch); } else { // add MySearch->AppendSearch(InSearch); } } catch(CException& e) { ERR_POST(Fatal << "exception: " << e.what()); 
return 1; } } } // write out the new search auto_ptr <CNcbiOfstream> raw_out; auto_ptr <CCompressionOStream> compress_out; auto_ptr <CObjectOStream> txt_out; if( obz2 ) { raw_out.reset(new CNcbiOfstream(args["o"].AsString().c_str())); compress_out.reset( new CCompressionOStream (*raw_out, new CBZip2StreamCompressor(), CCompressionStream::fOwnProcessor)); txt_out.reset(CObjectOStream::Open(OutFileType, *compress_out)); } else { txt_out.reset(CObjectOStream::Open(args["o"].AsString().c_str(), OutFileType)); } // auto_ptr <CObjectOStream> txt_out( // CObjectOStream::Open(args["o"].AsString(), OutFileType)); if(txt_out.get()) { SetUpOutputFile(txt_out.get(), OutFileType); if (args["sw"]) { txt_out->Write(ObjectInfo(*(*MySearch->SetResponse().begin()))); } else { txt_out->Write(ObjectInfo(*MySearch)); } txt_out->Flush(); txt_out->Close(); } } catch (NCBI_NS_STD::exception& e) { ERR_POST(Fatal << "Exception in COMSSAMerge::Run: " << e.what()); } return 0; }